vellum 0.2.11 → 0.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bun.lock +6 -2
- package/package.json +2 -2
- package/src/__tests__/call-orchestrator.test.ts +58 -0
- package/src/__tests__/config-schema.test.ts +278 -0
- package/src/__tests__/elevenlabs-client.test.ts +209 -0
- package/src/__tests__/gateway-only-enforcement.test.ts +9 -35
- package/src/__tests__/oauth2-gateway-transport.test.ts +14 -33
- package/src/__tests__/skills.test.ts +2 -2
- package/src/__tests__/trust-store.test.ts +1 -0
- package/src/__tests__/twilio-routes-twiml.test.ts +127 -0
- package/src/__tests__/twilio-routes.test.ts +78 -153
- package/src/__tests__/twitter-auth-handler.test.ts +1 -1
- package/src/calls/call-orchestrator.ts +3 -1
- package/src/calls/elevenlabs-client.ts +89 -0
- package/src/calls/elevenlabs-config.ts +29 -0
- package/src/calls/twilio-routes.ts +55 -6
- package/src/calls/voice-quality.ts +92 -0
- package/src/cli/main-screen.tsx +15 -117
- package/src/config/bundled-skills/macos-automation/SKILL.md +66 -0
- package/src/config/bundled-skills/phone-calls/SKILL.md +414 -0
- package/src/config/defaults.ts +18 -0
- package/src/config/schema.ts +110 -0
- package/src/config/system-prompt.ts +9 -59
- package/src/config/types.ts +2 -0
- package/src/daemon/lifecycle.ts +20 -7
- package/src/memory/db.ts +36 -0
- package/src/permissions/defaults.ts +11 -0
- package/src/runtime/routes/conversation-routes.ts +12 -5
- package/src/security/oauth2.ts +8 -8
- package/src/util/logger.ts +4 -4
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: "Phone Calls"
|
|
3
|
+
description: "Set up Twilio for outgoing phone calls and place AI-powered voice calls on behalf of the user"
|
|
4
|
+
user-invocable: true
|
|
5
|
+
metadata: {"vellum": {"emoji": "📞", "requires": {"config": ["calls.enabled"]}}}
|
|
6
|
+
includes: ["public-ingress"]
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
You are helping the user set up and make outgoing phone calls via Twilio. This skill covers the full lifecycle: Twilio account setup, credential storage, public ingress configuration, enabling the calls feature, placing calls, and monitoring live transcripts.
|
|
10
|
+
|
|
11
|
+
## Overview
|
|
12
|
+
|
|
13
|
+
The calling system uses Twilio's ConversationRelay to place outbound phone calls. Twilio works out of the box as the default voice provider. Optionally, you can enable ElevenLabs integration for higher-quality, more natural-sounding voices — but this is entirely optional.
|
|
14
|
+
|
|
15
|
+
When a call is placed:
|
|
16
|
+
|
|
17
|
+
1. The assistant initiates an outbound call via the Twilio REST API
|
|
18
|
+
2. Twilio connects to the gateway's voice webhook, which returns TwiML
|
|
19
|
+
3. Twilio opens a ConversationRelay WebSocket for real-time voice streaming
|
|
20
|
+
4. An LLM-driven orchestrator manages the conversation — receiving caller speech (transcribed by Deepgram), generating responses via Claude, and streaming text back for TTS playback
|
|
21
|
+
5. The transcript is relayed live to the user's conversation thread
|
|
22
|
+
|
|
23
|
+
Three voice quality modes are available:
|
|
24
|
+
- **`twilio_standard`** (default) — Standard Twilio TTS with Google voices. No extra setup required.
|
|
25
|
+
- **`twilio_elevenlabs_tts`** — Uses ElevenLabs voices through Twilio ConversationRelay for more natural speech.
|
|
26
|
+
- **`elevenlabs_agent`** — Full ElevenLabs conversational agent mode for the highest quality (requires ElevenLabs agent setup).
|
|
27
|
+
|
|
28
|
+
You can keep using Twilio only — no changes needed. Enabling ElevenLabs can improve naturalness and quality.
|
|
29
|
+
|
|
30
|
+
The user's assistant gets its own personal phone number through Twilio.
|
|
31
|
+
|
|
32
|
+
## Step 1: Check Current Configuration
|
|
33
|
+
|
|
34
|
+
First, check whether Twilio is already configured:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
vellum config get calls.enabled
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Also check for existing credentials:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
credential_store action=get service=credential:twilio:account_sid
|
|
44
|
+
credential_store action=get service=credential:twilio:auth_token
|
|
45
|
+
credential_store action=get service=credential:twilio:phone_number
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
If all three credentials exist and `calls.enabled` is `true`, skip to the **Making Calls** section. If credentials are partially configured, skip to whichever step is still needed.
|
|
49
|
+
|
|
50
|
+
## Step 2: Create a Twilio Account
|
|
51
|
+
|
|
52
|
+
If the user doesn't have a Twilio account yet, guide them through setup:
|
|
53
|
+
|
|
54
|
+
1. Tell the user: **"You'll need a Twilio account to make phone calls. Sign up at https://www.twilio.com/try-twilio — it's free to start and includes trial credit."**
|
|
55
|
+
2. Once they have an account, they need three pieces of information:
|
|
56
|
+
- **Account SID** — found on the Twilio Console dashboard at https://console.twilio.com
|
|
57
|
+
- **Auth Token** — found on the same dashboard (click "Show" to reveal it)
|
|
58
|
+
- **Phone Number** — a Twilio phone number capable of making voice calls
|
|
59
|
+
|
|
60
|
+
### Getting a Twilio Phone Number
|
|
61
|
+
|
|
62
|
+
If the user doesn't have a Twilio phone number yet:
|
|
63
|
+
|
|
64
|
+
1. Direct them to https://console.twilio.com/us1/develop/phone-numbers/manage/incoming
|
|
65
|
+
2. Click **"Buy a Number"**
|
|
66
|
+
3. Select a number with **Voice** capability enabled
|
|
67
|
+
4. For trial accounts, Twilio provides one free number automatically — check "Active Numbers" first
|
|
68
|
+
|
|
69
|
+
Tell the user: **"This will be your assistant's personal phone number — the number that shows up on caller ID when calls are placed."**
|
|
70
|
+
|
|
71
|
+
## Step 3: Store Twilio Credentials
|
|
72
|
+
|
|
73
|
+
Once the user provides their credentials, store them securely using the `credential_store` tool. Ask the user to paste each value, then store them one at a time:
|
|
74
|
+
|
|
75
|
+
**Account SID:**
|
|
76
|
+
```
|
|
77
|
+
credential_store action=set service=credential:twilio:account_sid value=<their_account_sid>
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Auth Token:**
|
|
81
|
+
```
|
|
82
|
+
credential_store action=set service=credential:twilio:auth_token value=<their_auth_token>
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**Phone Number** (must be in E.164 format, e.g. `+14155551234`):
|
|
86
|
+
```
|
|
87
|
+
credential_store action=set service=credential:twilio:phone_number value=<their_phone_number>
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
After storing, verify each credential was saved:
|
|
91
|
+
```
|
|
92
|
+
credential_store action=get service=credential:twilio:account_sid
|
|
93
|
+
credential_store action=get service=credential:twilio:auth_token
|
|
94
|
+
credential_store action=get service=credential:twilio:phone_number
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Important:** Credentials are stored in the OS keychain (macOS Keychain / Linux secret-service) or encrypted at rest. They are never logged or exposed in plaintext.
|
|
98
|
+
|
|
99
|
+
## Step 4: Set Up Public Ingress
|
|
100
|
+
|
|
101
|
+
Twilio needs a publicly reachable URL to send voice webhooks and establish the ConversationRelay WebSocket. The **public-ingress** skill handles this via ngrok.
|
|
102
|
+
|
|
103
|
+
Check if ingress is already configured:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
vellum config get ingress.publicBaseUrl
|
|
107
|
+
vellum config get ingress.enabled
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
If not configured, load and run the public-ingress skill:
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
skill_load skill=public-ingress
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Follow the public-ingress skill's instructions to set up the ngrok tunnel. Once complete, the gateway will be reachable at the configured `ingress.publicBaseUrl`.
|
|
117
|
+
|
|
118
|
+
**Twilio needs these webhook endpoints (handled automatically by the gateway):**
|
|
119
|
+
- Voice webhook: `{publicBaseUrl}/webhooks/twilio/voice`
|
|
120
|
+
- Status callback: `{publicBaseUrl}/webhooks/twilio/status`
|
|
121
|
+
- ConversationRelay WebSocket: `{publicBaseUrl}/webhooks/twilio/relay` (wss://)
|
|
122
|
+
|
|
123
|
+
No manual Twilio webhook configuration is needed — the assistant registers webhook URLs dynamically when placing each call.
|
|
124
|
+
|
|
125
|
+
## Step 5: Enable Calls
|
|
126
|
+
|
|
127
|
+
Enable the calls feature:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
vellum config set calls.enabled true
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Verify:
|
|
134
|
+
```bash
|
|
135
|
+
vellum config get calls.enabled
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Step 6: Verify Setup (Test Call)
|
|
139
|
+
|
|
140
|
+
Before making real calls, offer a quick verification:
|
|
141
|
+
|
|
142
|
+
1. Confirm credentials are stored: all three `credential:twilio:*` keys must be present
|
|
143
|
+
2. Confirm ingress is running: `ingress.publicBaseUrl` must be set and the tunnel active
|
|
144
|
+
3. Confirm calls are enabled: `calls.enabled` must be `true`
|
|
145
|
+
|
|
146
|
+
Suggest a test call to the user's own phone: **"Want to do a quick test call to your phone to make sure everything works?"**
|
|
147
|
+
|
|
148
|
+
If they agree, ask for their personal phone number and place a test call with a simple task like "Introduce yourself and confirm the call system is working."
|
|
149
|
+
|
|
150
|
+
## Optional: Higher Quality Voice with ElevenLabs
|
|
151
|
+
|
|
152
|
+
ElevenLabs integration is entirely optional. The standard Twilio-only setup works unchanged — this section is only relevant if you want to improve voice quality.
|
|
153
|
+
|
|
154
|
+
### Mode: `twilio_elevenlabs_tts`
|
|
155
|
+
|
|
156
|
+
Uses ElevenLabs voices through Twilio's ConversationRelay. Speech is more natural-sounding than the default Google TTS voices. No ElevenLabs API key is needed for this mode — just a voice ID.
|
|
157
|
+
|
|
158
|
+
**Setup:**
|
|
159
|
+
|
|
160
|
+
1. Browse ElevenLabs voices at https://elevenlabs.io/voice-library and pick a voice ID
|
|
161
|
+
2. Set the voice mode and voice ID:
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
vellum config set calls.voice.mode twilio_elevenlabs_tts
|
|
165
|
+
vellum config set calls.voice.elevenlabs.voiceId "<your-voice-id>"
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Mode: `elevenlabs_agent`
|
|
169
|
+
|
|
170
|
+
Full ElevenLabs conversational agent mode. This requires an ElevenLabs account with an agent configured on their platform.
|
|
171
|
+
|
|
172
|
+
**Setup:**
|
|
173
|
+
|
|
174
|
+
1. Store your ElevenLabs API key securely:
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
credential_store action=set service=credential:elevenlabs:api_key value=<your_api_key>
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
2. Set the voice mode and agent ID:
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
vellum config set calls.voice.mode elevenlabs_agent
|
|
184
|
+
vellum config set calls.voice.elevenlabs.agentId "<your-agent-id>"
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Fallback behavior
|
|
188
|
+
|
|
189
|
+
By default, `calls.voice.fallbackToStandardOnError` is `true`. This means if ElevenLabs is unavailable or misconfigured (e.g., missing voice ID, API errors), calls automatically fall back to standard Twilio TTS rather than failing. You can disable this if you want strict ElevenLabs-only behavior:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
vellum config set calls.voice.fallbackToStandardOnError false
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Reverting to standard Twilio
|
|
196
|
+
|
|
197
|
+
To go back to the default voice at any time:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
vellum config set calls.voice.mode twilio_standard
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## Making Calls
|
|
204
|
+
|
|
205
|
+
Use the `call_start` tool to place outbound calls. Every call requires:
|
|
206
|
+
- **phone_number**: The number to call in E.164 format (e.g. `+14155551234`)
|
|
207
|
+
- **task**: What the call should accomplish — this becomes the AI voice agent's objective
|
|
208
|
+
- **context** (optional): Additional background information for the conversation
|
|
209
|
+
|
|
210
|
+
### Example calls:
|
|
211
|
+
|
|
212
|
+
**Making a reservation:**
|
|
213
|
+
```
|
|
214
|
+
call_start phone_number="+14155551234" task="Make a dinner reservation for 2 people tonight at 7pm" context="The user's name is John Smith. Prefer a table by the window if available."
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
**Calling a business:**
|
|
218
|
+
```
|
|
219
|
+
call_start phone_number="+18005551234" task="Check if they have a specific product in stock" context="Looking for a 65-inch Samsung OLED TV, model QN65S95D. Ask about availability and price."
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**Following up on an appointment:**
|
|
223
|
+
```
|
|
224
|
+
call_start phone_number="+12125551234" task="Confirm the dentist appointment scheduled for next Tuesday at 2pm" context="The appointment is under the name Jane Doe, DOB 03/15/1990."
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Phone number format
|
|
228
|
+
|
|
229
|
+
Phone numbers MUST be in E.164 format: `+` followed by country code and number with no spaces, dashes, or parentheses.
|
|
230
|
+
- US/Canada: `+1XXXXXXXXXX` (e.g. `+14155551234`)
|
|
231
|
+
- UK: `+44XXXXXXXXXX` (e.g. `+442071234567`)
|
|
232
|
+
- International: `+{country_code}{number}`
|
|
233
|
+
|
|
234
|
+
If the user provides a number in a different format, convert it to E.164 before calling. If the country is ambiguous, ask.
|
|
235
|
+
|
|
236
|
+
### Trial account limitations
|
|
237
|
+
|
|
238
|
+
On Twilio trial accounts, outbound calls can ONLY be made to **verified numbers**. If a call fails with a "not verified" error:
|
|
239
|
+
1. Tell the user they need to verify the number at https://console.twilio.com/us1/develop/phone-numbers/manage/verified
|
|
240
|
+
2. Or upgrade to a paid Twilio account to call any number
|
|
241
|
+
|
|
242
|
+
## Live Call Monitoring
|
|
243
|
+
|
|
244
|
+
### Showing the live transcript
|
|
245
|
+
|
|
246
|
+
By default, always show the live transcript of the call as it happens. When a call is in progress:
|
|
247
|
+
|
|
248
|
+
1. After placing the call with `call_start`, immediately begin polling with `call_status` to track the call state
|
|
249
|
+
2. The system fires transcript notifications as the conversation unfolds — both caller speech and assistant responses appear in real time in the conversation thread
|
|
250
|
+
3. Present each transcript entry clearly as it arrives:
|
|
251
|
+
|
|
252
|
+
```
|
|
253
|
+
📞 Call in progress...
|
|
254
|
+
|
|
255
|
+
🗣️ Assistant: "Hi, I'm calling on behalf of John to make a dinner reservation for tonight."
|
|
256
|
+
👤 Caller: "Sure, what time would you like?"
|
|
257
|
+
🗣️ Assistant: "We'd like a table for two at 7pm, please."
|
|
258
|
+
👤 Caller: "Let me check... yes, we have availability at 7pm."
|
|
259
|
+
🗣️ Assistant: "Wonderful! The reservation would be under John Smith."
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
4. Continue monitoring until the call completes or fails
|
|
263
|
+
|
|
264
|
+
### Handling questions during a call
|
|
265
|
+
|
|
266
|
+
The AI voice agent may encounter situations where it needs input from the user. When this happens:
|
|
267
|
+
|
|
268
|
+
1. The call status changes to `waiting_on_user`
|
|
269
|
+
2. A **pending question** appears in `call_status` output
|
|
270
|
+
3. Present the question prominently to the user:
|
|
271
|
+
|
|
272
|
+
```
|
|
273
|
+
❓ The person on the call asked something the assistant needs your help with:
|
|
274
|
+
"Would you prefer the smoking or non-smoking section?"
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
4. The user can reply directly in the chat — their response is automatically routed to the live call via the call bridge
|
|
278
|
+
5. The AI voice agent receives the answer and continues the conversation naturally
|
|
279
|
+
|
|
280
|
+
**Important:** Respond to pending questions quickly. There is a consultation timeout (default: 2 minutes). If no answer is provided in time, the AI voice agent will move on.
|
|
281
|
+
|
|
282
|
+
### Call status values
|
|
283
|
+
|
|
284
|
+
- **initiated** — Call is being placed
|
|
285
|
+
- **ringing** — Phone is ringing on the other end
|
|
286
|
+
- **in_progress** — Call is connected, conversation is active
|
|
287
|
+
- **waiting_on_user** — AI agent needs input from the user (check pending question)
|
|
288
|
+
- **completed** — Call ended successfully
|
|
289
|
+
- **failed** — Call failed (check lastError for details)
|
|
290
|
+
- **cancelled** — Call was manually cancelled
|
|
291
|
+
|
|
292
|
+
### Ending a call early
|
|
293
|
+
|
|
294
|
+
Use `call_end` with the call session ID to terminate an active call:
|
|
295
|
+
```
|
|
296
|
+
call_end call_session_id="<session_id>" reason="User requested to end the call"
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
## Call Quality Tips
|
|
300
|
+
|
|
301
|
+
When crafting tasks for the AI voice agent, follow these guidelines for the best call experience:
|
|
302
|
+
|
|
303
|
+
### Writing good task descriptions
|
|
304
|
+
|
|
305
|
+
- **Be specific about the objective**: "Make a dinner reservation for 2 at 7pm tonight" is better than "Call the restaurant"
|
|
306
|
+
- **Include relevant context**: Names, account numbers, appointment details — anything the agent might need
|
|
307
|
+
- **Specify what information to collect**: "Ask about their return policy and store hours" tells the agent what to gather
|
|
308
|
+
- **Set clear completion criteria**: The agent knows to end the call when the task is fulfilled
|
|
309
|
+
|
|
310
|
+
### Providing context
|
|
311
|
+
|
|
312
|
+
The `context` field is powerful — use it to give the agent background that helps it sound natural:
|
|
313
|
+
|
|
314
|
+
- User's name and identifying details (for making appointments, verifying accounts)
|
|
315
|
+
- Preferences and constraints (dietary restrictions, budget limits, scheduling conflicts)
|
|
316
|
+
- Previous interaction history ("I called last week and spoke with Sarah about...")
|
|
317
|
+
- Special instructions ("If they put you on hold for more than 5 minutes, hang up and we'll try again later")
|
|
318
|
+
|
|
319
|
+
### Things the AI voice agent handles well
|
|
320
|
+
|
|
321
|
+
- Making reservations and appointments
|
|
322
|
+
- Checking business hours, availability, or pricing
|
|
323
|
+
- Confirming or rescheduling existing appointments
|
|
324
|
+
- Gathering information (store policies, product availability)
|
|
325
|
+
- Simple customer service interactions
|
|
326
|
+
- Leaving voicemails (it will speak the message if voicemail picks up)
|
|
327
|
+
|
|
328
|
+
### Things to be aware of
|
|
329
|
+
|
|
330
|
+
- Calls have a maximum duration (configurable via `calls.maxDurationSeconds`, default: 1 hour)
|
|
331
|
+
- The agent gives a 2-minute warning before the time limit
|
|
332
|
+
- Emergency numbers (911, 112, 999, etc.) are blocked and cannot be called
|
|
333
|
+
- The AI disclosure setting (`calls.disclosure.enabled`) controls whether the agent announces it's an AI at the start of the call
|
|
334
|
+
|
|
335
|
+
## Configuration Reference
|
|
336
|
+
|
|
337
|
+
All call-related settings can be managed via `vellum config`:
|
|
338
|
+
|
|
339
|
+
| Setting | Description | Default |
|
|
340
|
+
|---|---|---|
|
|
341
|
+
| `calls.enabled` | Master switch for the calling feature | `false` |
|
|
342
|
+
| `calls.provider` | Voice provider (currently only `twilio`) | `twilio` |
|
|
343
|
+
| `calls.maxDurationSeconds` | Maximum call length in seconds | `3600` (1 hour) |
|
|
344
|
+
| `calls.userConsultTimeoutSeconds` | How long to wait for user answers | `120` (2 min) |
|
|
345
|
+
| `calls.disclosure.enabled` | Whether the AI announces itself at call start | `true` |
|
|
346
|
+
| `calls.disclosure.text` | The disclosure message spoken at call start | `"I should let you know that I'm an AI assistant calling on behalf of my user."` |
|
|
347
|
+
| `calls.model` | Override LLM model for call orchestration | *(uses default model)* |
|
|
348
|
+
| `calls.voice.mode` | Voice quality mode (`twilio_standard`, `twilio_elevenlabs_tts`, `elevenlabs_agent`) | `twilio_standard` |
|
|
349
|
+
| `calls.voice.language` | Language code for TTS and transcription | `en-US` |
|
|
350
|
+
| `calls.voice.transcriptionProvider` | Speech-to-text provider (`Deepgram`, `Google`) | `Deepgram` |
|
|
351
|
+
| `calls.voice.fallbackToStandardOnError` | Auto-fallback to standard Twilio TTS on ElevenLabs errors | `true` |
|
|
352
|
+
| `calls.voice.elevenlabs.voiceId` | ElevenLabs voice ID (for `twilio_elevenlabs_tts` mode) | *(empty)* |
|
|
353
|
+
| `calls.voice.elevenlabs.agentId` | ElevenLabs agent ID (for `elevenlabs_agent` mode) | *(empty)* |
|
|
354
|
+
|
|
355
|
+
### Adjusting settings
|
|
356
|
+
|
|
357
|
+
```bash
|
|
358
|
+
# Increase max call duration to 2 hours
|
|
359
|
+
vellum config set calls.maxDurationSeconds 7200
|
|
360
|
+
|
|
361
|
+
# Disable AI disclosure (check local regulations first)
|
|
362
|
+
vellum config set calls.disclosure.enabled false
|
|
363
|
+
|
|
364
|
+
# Custom disclosure message
|
|
365
|
+
vellum config set calls.disclosure.text "Just so you know, this is an AI assistant calling for my user."
|
|
366
|
+
|
|
367
|
+
# Give more time for user consultation
|
|
368
|
+
vellum config set calls.userConsultTimeoutSeconds 300
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
## Troubleshooting
|
|
372
|
+
|
|
373
|
+
### "Twilio credentials not configured"
|
|
374
|
+
Run Step 3 to store your Account SID, Auth Token, and Phone Number via `credential_store`.
|
|
375
|
+
|
|
376
|
+
### "Calls feature is disabled"
|
|
377
|
+
Run `vellum config set calls.enabled true`.
|
|
378
|
+
|
|
379
|
+
### "No public base URL configured"
|
|
380
|
+
Run the **public-ingress** skill to set up ngrok and configure `ingress.publicBaseUrl`.
|
|
381
|
+
|
|
382
|
+
### Call fails immediately after initiating
|
|
383
|
+
- Check that the phone number is in E.164 format
|
|
384
|
+
- Verify Twilio credentials are correct (wrong auth token causes API errors)
|
|
385
|
+
- On trial accounts, ensure the destination number is verified
|
|
386
|
+
- Check that the ngrok tunnel is still running (`curl -s http://127.0.0.1:4040/api/tunnels`)
|
|
387
|
+
|
|
388
|
+
### Call connects but no audio / one-way audio
|
|
389
|
+
- The ConversationRelay WebSocket may not be connecting. Check that `ingress.publicBaseUrl` is correct and the tunnel is active
|
|
390
|
+
- Verify the gateway is running on `http://127.0.0.1:${GATEWAY_PORT:-7830}`
|
|
391
|
+
|
|
392
|
+
### "This phone number is not allowed to be called"
|
|
393
|
+
Emergency numbers (911, 112, 999, 000, 110, 119) are permanently blocked for safety.
|
|
394
|
+
|
|
395
|
+
### ngrok tunnel URL changed
|
|
396
|
+
If you restarted ngrok, the public URL has changed. Update it:
|
|
397
|
+
```bash
|
|
398
|
+
vellum config set ingress.publicBaseUrl "<new-url>"
|
|
399
|
+
```
|
|
400
|
+
Or re-run the public-ingress skill to auto-detect and save the new URL.
|
|
401
|
+
|
|
402
|
+
### Call drops after 30 seconds of silence
|
|
403
|
+
The system has a 30-second silence timeout. If nobody speaks for 30 seconds, the agent asks "Are you still there?" rather than silently dropping the call — this prompt is expected behavior, not a failure.
|
|
404
|
+
|
|
405
|
+
### Call quality didn't improve after enabling ElevenLabs
|
|
406
|
+
- Verify `calls.voice.mode` is set to `twilio_elevenlabs_tts` or `elevenlabs_agent` (not still `twilio_standard`)
|
|
407
|
+
- Check that `calls.voice.elevenlabs.voiceId` contains a valid ElevenLabs voice ID
|
|
408
|
+
- If mode is `elevenlabs_agent`, ensure `calls.voice.elevenlabs.agentId` is also set
|
|
409
|
+
|
|
410
|
+
### ElevenLabs mode falls back to standard
|
|
411
|
+
When `calls.voice.fallbackToStandardOnError` is `true` (the default), the system silently falls back to standard Twilio TTS if ElevenLabs encounters an error. Check:
|
|
412
|
+
- For `elevenlabs_agent` mode: verify the API key is stored (`credential_store action=get service=credential:elevenlabs:api_key`) and that `calls.voice.elevenlabs.agentId` is configured
|
|
413
|
+
- For `twilio_elevenlabs_tts` mode: verify `calls.voice.elevenlabs.voiceId` is set to a valid voice ID
|
|
414
|
+
- Review daemon logs for error messages related to ElevenLabs
|
package/src/config/defaults.ts
CHANGED
|
@@ -226,6 +226,24 @@ export const DEFAULT_CONFIG: AssistantConfig = {
|
|
|
226
226
|
safety: {
|
|
227
227
|
denyCategories: [],
|
|
228
228
|
},
|
|
229
|
+
voice: {
|
|
230
|
+
mode: 'twilio_standard' as const,
|
|
231
|
+
language: 'en-US',
|
|
232
|
+
transcriptionProvider: 'Deepgram' as const,
|
|
233
|
+
fallbackToStandardOnError: true,
|
|
234
|
+
elevenlabs: {
|
|
235
|
+
voiceId: '',
|
|
236
|
+
voiceModelId: 'turbo_v2_5',
|
|
237
|
+
stability: 0.5,
|
|
238
|
+
similarityBoost: 0.75,
|
|
239
|
+
style: 0.0,
|
|
240
|
+
useSpeakerBoost: true,
|
|
241
|
+
agentId: '',
|
|
242
|
+
apiBaseUrl: 'https://api.elevenlabs.io',
|
|
243
|
+
registerCallTimeoutMs: 5000,
|
|
244
|
+
},
|
|
245
|
+
},
|
|
246
|
+
model: undefined,
|
|
229
247
|
},
|
|
230
248
|
ingress: {
|
|
231
249
|
enabled: false,
|
package/src/config/schema.ts
CHANGED
|
@@ -9,6 +9,8 @@ const VALID_SANDBOX_BACKENDS = ['native', 'docker'] as const;
|
|
|
9
9
|
const VALID_DOCKER_NETWORKS = ['none', 'bridge'] as const;
|
|
10
10
|
const VALID_PERMISSIONS_MODES = ['legacy', 'strict'] as const;
|
|
11
11
|
const VALID_CALL_PROVIDERS = ['twilio'] as const;
|
|
12
|
+
const VALID_CALL_VOICE_MODES = ['twilio_standard', 'twilio_elevenlabs_tts', 'elevenlabs_agent'] as const;
|
|
13
|
+
const VALID_CALL_TRANSCRIPTION_PROVIDERS = ['Deepgram', 'Google'] as const;
|
|
12
14
|
|
|
13
15
|
export const TimeoutConfigSchema = z.object({
|
|
14
16
|
shellMaxTimeoutSec: z
|
|
@@ -885,6 +887,75 @@ export const CallsSafetyConfigSchema = z.object({
|
|
|
885
887
|
.default([]),
|
|
886
888
|
});
|
|
887
889
|
|
|
890
|
+
export const CallsElevenLabsConfigSchema = z.object({
|
|
891
|
+
voiceId: z
|
|
892
|
+
.string({ error: 'calls.voice.elevenlabs.voiceId must be a string' })
|
|
893
|
+
.default(''),
|
|
894
|
+
voiceModelId: z
|
|
895
|
+
.string({ error: 'calls.voice.elevenlabs.voiceModelId must be a string' })
|
|
896
|
+
.default('turbo_v2_5'),
|
|
897
|
+
stability: z
|
|
898
|
+
.number({ error: 'calls.voice.elevenlabs.stability must be a number' })
|
|
899
|
+
.min(0, 'calls.voice.elevenlabs.stability must be >= 0')
|
|
900
|
+
.max(1, 'calls.voice.elevenlabs.stability must be <= 1')
|
|
901
|
+
.default(0.5),
|
|
902
|
+
similarityBoost: z
|
|
903
|
+
.number({ error: 'calls.voice.elevenlabs.similarityBoost must be a number' })
|
|
904
|
+
.min(0, 'calls.voice.elevenlabs.similarityBoost must be >= 0')
|
|
905
|
+
.max(1, 'calls.voice.elevenlabs.similarityBoost must be <= 1')
|
|
906
|
+
.default(0.75),
|
|
907
|
+
style: z
|
|
908
|
+
.number({ error: 'calls.voice.elevenlabs.style must be a number' })
|
|
909
|
+
.min(0, 'calls.voice.elevenlabs.style must be >= 0')
|
|
910
|
+
.max(1, 'calls.voice.elevenlabs.style must be <= 1')
|
|
911
|
+
.default(0.0),
|
|
912
|
+
useSpeakerBoost: z
|
|
913
|
+
.boolean({ error: 'calls.voice.elevenlabs.useSpeakerBoost must be a boolean' })
|
|
914
|
+
.default(true),
|
|
915
|
+
agentId: z
|
|
916
|
+
.string({ error: 'calls.voice.elevenlabs.agentId must be a string' })
|
|
917
|
+
.default(''),
|
|
918
|
+
apiBaseUrl: z
|
|
919
|
+
.string({ error: 'calls.voice.elevenlabs.apiBaseUrl must be a string' })
|
|
920
|
+
.default('https://api.elevenlabs.io'),
|
|
921
|
+
registerCallTimeoutMs: z
|
|
922
|
+
.number({ error: 'calls.voice.elevenlabs.registerCallTimeoutMs must be a number' })
|
|
923
|
+
.int('calls.voice.elevenlabs.registerCallTimeoutMs must be an integer')
|
|
924
|
+
.min(1000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be >= 1000')
|
|
925
|
+
.max(15000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be <= 15000')
|
|
926
|
+
.default(5000),
|
|
927
|
+
});
|
|
928
|
+
|
|
929
|
+
export const CallsVoiceConfigSchema = z.object({
|
|
930
|
+
mode: z
|
|
931
|
+
.enum(VALID_CALL_VOICE_MODES, {
|
|
932
|
+
error: `calls.voice.mode must be one of: ${VALID_CALL_VOICE_MODES.join(', ')}`,
|
|
933
|
+
})
|
|
934
|
+
.default('twilio_standard'),
|
|
935
|
+
language: z
|
|
936
|
+
.string({ error: 'calls.voice.language must be a string' })
|
|
937
|
+
.default('en-US'),
|
|
938
|
+
transcriptionProvider: z
|
|
939
|
+
.enum(VALID_CALL_TRANSCRIPTION_PROVIDERS, {
|
|
940
|
+
error: `calls.voice.transcriptionProvider must be one of: ${VALID_CALL_TRANSCRIPTION_PROVIDERS.join(', ')}`,
|
|
941
|
+
})
|
|
942
|
+
.default('Deepgram'),
|
|
943
|
+
fallbackToStandardOnError: z
|
|
944
|
+
.boolean({ error: 'calls.voice.fallbackToStandardOnError must be a boolean' })
|
|
945
|
+
.default(true),
|
|
946
|
+
elevenlabs: CallsElevenLabsConfigSchema.default({
|
|
947
|
+
voiceId: '',
|
|
948
|
+
voiceModelId: 'turbo_v2_5',
|
|
949
|
+
stability: 0.5,
|
|
950
|
+
similarityBoost: 0.75,
|
|
951
|
+
style: 0.0,
|
|
952
|
+
useSpeakerBoost: true,
|
|
953
|
+
agentId: '',
|
|
954
|
+
apiBaseUrl: 'https://api.elevenlabs.io',
|
|
955
|
+
registerCallTimeoutMs: 5000,
|
|
956
|
+
}),
|
|
957
|
+
});
|
|
958
|
+
|
|
888
959
|
export const CallsConfigSchema = z.object({
|
|
889
960
|
enabled: z
|
|
890
961
|
.boolean({ error: 'calls.enabled must be a boolean' })
|
|
@@ -913,6 +984,26 @@ export const CallsConfigSchema = z.object({
|
|
|
913
984
|
safety: CallsSafetyConfigSchema.default({
|
|
914
985
|
denyCategories: [],
|
|
915
986
|
}),
|
|
987
|
+
voice: CallsVoiceConfigSchema.default({
|
|
988
|
+
mode: 'twilio_standard',
|
|
989
|
+
language: 'en-US',
|
|
990
|
+
transcriptionProvider: 'Deepgram',
|
|
991
|
+
fallbackToStandardOnError: true,
|
|
992
|
+
elevenlabs: {
|
|
993
|
+
voiceId: '',
|
|
994
|
+
voiceModelId: 'turbo_v2_5',
|
|
995
|
+
stability: 0.5,
|
|
996
|
+
similarityBoost: 0.75,
|
|
997
|
+
style: 0.0,
|
|
998
|
+
useSpeakerBoost: true,
|
|
999
|
+
agentId: '',
|
|
1000
|
+
apiBaseUrl: 'https://api.elevenlabs.io',
|
|
1001
|
+
registerCallTimeoutMs: 5000,
|
|
1002
|
+
},
|
|
1003
|
+
}),
|
|
1004
|
+
model: z
|
|
1005
|
+
.string({ error: 'calls.model must be a string' })
|
|
1006
|
+
.optional(),
|
|
916
1007
|
});
|
|
917
1008
|
|
|
918
1009
|
export const SkillsConfigSchema = z.object({
|
|
@@ -1178,6 +1269,23 @@ export const AssistantConfigSchema = z.object({
|
|
|
1178
1269
|
safety: {
|
|
1179
1270
|
denyCategories: [],
|
|
1180
1271
|
},
|
|
1272
|
+
voice: {
|
|
1273
|
+
mode: 'twilio_standard',
|
|
1274
|
+
language: 'en-US',
|
|
1275
|
+
transcriptionProvider: 'Deepgram',
|
|
1276
|
+
fallbackToStandardOnError: true,
|
|
1277
|
+
elevenlabs: {
|
|
1278
|
+
voiceId: '',
|
|
1279
|
+
voiceModelId: 'turbo_v2_5',
|
|
1280
|
+
stability: 0.5,
|
|
1281
|
+
similarityBoost: 0.75,
|
|
1282
|
+
style: 0.0,
|
|
1283
|
+
useSpeakerBoost: true,
|
|
1284
|
+
agentId: '',
|
|
1285
|
+
apiBaseUrl: 'https://api.elevenlabs.io',
|
|
1286
|
+
registerCallTimeoutMs: 5000,
|
|
1287
|
+
},
|
|
1288
|
+
},
|
|
1181
1289
|
}),
|
|
1182
1290
|
ingress: IngressConfigSchema.default({
|
|
1183
1291
|
enabled: false,
|
|
@@ -1243,4 +1351,6 @@ export type WorkspaceGitConfig = z.infer<typeof WorkspaceGitConfigSchema>;
|
|
|
1243
1351
|
export type CallsConfig = z.infer<typeof CallsConfigSchema>;
|
|
1244
1352
|
export type CallsDisclosureConfig = z.infer<typeof CallsDisclosureConfigSchema>;
|
|
1245
1353
|
export type CallsSafetyConfig = z.infer<typeof CallsSafetyConfigSchema>;
|
|
1354
|
+
export type CallsVoiceConfig = z.infer<typeof CallsVoiceConfigSchema>;
|
|
1355
|
+
export type CallsElevenLabsConfig = z.infer<typeof CallsElevenLabsConfigSchema>;
|
|
1246
1356
|
export type IngressConfig = z.infer<typeof IngressConfigSchema>;
|
|
@@ -410,63 +410,8 @@ function buildAccessPreferenceSection(): string {
|
|
|
410
410
|
'If yes to any of these, use that path instead of the browser.',
|
|
411
411
|
...(isMacOS() ? [
|
|
412
412
|
'',
|
|
413
|
-
'
|
|
414
|
-
'',
|
|
415
|
-
'When interacting with native macOS apps or performing system-level actions, prefer **osascript**',
|
|
416
|
-
'via host_bash over browser automation or computer-use.',
|
|
417
|
-
'',
|
|
418
|
-
'The following apps support AppleScript and should be automated via osascript:',
|
|
419
|
-
'',
|
|
420
|
-
'**Communication:** Messages, Mail, Microsoft Outlook, FaceTime',
|
|
421
|
-
'**Contacts & Calendar:** Contacts, Calendar, Reminders',
|
|
422
|
-
'**Notes & Writing:** Notes, TextEdit, Pages, BBEdit, CotEditor',
|
|
423
|
-
'**Files & Finder:** Finder, Path Finder',
|
|
424
|
-
'**Browsers:** Safari, Google Chrome',
|
|
425
|
-
'**Music & Media:** Music (iTunes), Spotify, VLC, Podcasts, TV',
|
|
426
|
-
'**Productivity:** OmniFocus, Things 3, OmniOutliner, OmniPlan, OmniGraffle',
|
|
427
|
-
'**Office:** Microsoft Word, Microsoft Excel, Numbers, Keynote',
|
|
428
|
-
'**Developer tools:** Xcode, Terminal, iTerm2, Script Editor',
|
|
429
|
-
'**System:** Finder, System Events (UI scripting for any app), System Settings',
|
|
430
|
-
'**Automation:** Keyboard Maestro, Alfred, Automator',
|
|
431
|
-
'**Creative:** Adobe Photoshop, Final Cut Pro',
|
|
432
|
-
'',
|
|
433
|
-
'For any other app, try osascript first — check scriptability with:',
|
|
434
|
-
'```bash',
|
|
435
|
-
'osascript -e \'tell application "AppName" to get name\'',
|
|
436
|
-
'```',
|
|
437
|
-
'',
|
|
438
|
-
'Common examples:',
|
|
439
|
-
'```bash',
|
|
440
|
-
'# Send an iMessage',
|
|
441
|
-
'osascript -e \'tell application "Messages" to send "Hello!" to buddy "user@example.com"\'',
|
|
442
|
-
'',
|
|
443
|
-
'# Look up a contact',
|
|
444
|
-
'osascript -e \'tell application "Contacts" to get {name, phones} of every person whose name contains "Marina"\'',
|
|
445
|
-
'',
|
|
446
|
-
'# Read upcoming calendar events',
|
|
447
|
-
'osascript -e \'tell application "Calendar" to get summary of every event of calendar "Home" whose start date > (current date)\'',
|
|
448
|
-
'',
|
|
449
|
-
'# Create a reminder',
|
|
450
|
-
'osascript -e \'tell application "Reminders" to make new reminder with properties {name:"Buy milk", due date:((current date) + 1 * hours)}\'',
|
|
451
|
-
'',
|
|
452
|
-
'# Send an email',
|
|
453
|
-
'osascript -e \'tell application "Mail" to send (make new outgoing message with properties {subject:"Hi", content:"Hello", visible:true})\'',
|
|
454
|
-
'',
|
|
455
|
-
'# Create a note',
|
|
456
|
-
'osascript -e \'tell application "Notes" to make new note at folder "Notes" with properties {body:"My note"}\'',
|
|
457
|
-
'',
|
|
458
|
-
'# Open a URL in Safari',
|
|
459
|
-
'osascript -e \'tell application "Safari" to open location "https://example.com"\'',
|
|
460
|
-
'',
|
|
461
|
-
'# Play/pause Music',
|
|
462
|
-
'osascript -e \'tell application "Music" to playpause\'',
|
|
463
|
-
'',
|
|
464
|
-
'# Display a system notification',
|
|
465
|
-
'osascript -e \'display notification "Done!" with title "Vellum"\'',
|
|
466
|
-
'```',
|
|
467
|
-
'',
|
|
468
|
-
'osascript (AppleScript/JXA) has direct, reliable access to macOS app APIs and system events.',
|
|
469
|
-
'Use it whenever the task involves a native macOS app or system-level interaction.',
|
|
413
|
+
'On macOS, also consider the `macos-automation` skill for interacting with native apps',
|
|
414
|
+
'(Messages, Contacts, Calendar, Mail, Reminders, Music, Finder, etc.) via osascript.',
|
|
470
415
|
] : []),
|
|
471
416
|
].join('\n');
|
|
472
417
|
}
|
|
@@ -702,8 +647,13 @@ function escapeXml(str: string): string {
|
|
|
702
647
|
}
|
|
703
648
|
|
|
704
649
|
function formatSkillsCatalog(skills: SkillSummary[]): string {
|
|
705
|
-
// Filter out skills with disableModelInvocation
|
|
706
|
-
const visible = skills.filter(s =>
|
|
650
|
+
// Filter out skills with disableModelInvocation or unsupported OS
|
|
651
|
+
const visible = skills.filter(s => {
|
|
652
|
+
if (s.disableModelInvocation) return false;
|
|
653
|
+
const os = s.metadata?.os;
|
|
654
|
+
if (os && os.length > 0 && !os.includes(process.platform)) return false;
|
|
655
|
+
return true;
|
|
656
|
+
});
|
|
707
657
|
if (visible.length === 0) return '';
|
|
708
658
|
|
|
709
659
|
const lines = ['<available_skills>'];
|