@vellumai/assistant 0.4.23 → 0.4.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bun.lock +3 -0
- package/package.json +2 -1
- package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +0 -15
- package/src/__tests__/assistant-events-sse-hardening.test.ts +9 -3
- package/src/__tests__/call-controller.test.ts +80 -0
- package/src/__tests__/config-schema.test.ts +38 -178
- package/src/__tests__/conversation-routes-guardian-reply.test.ts +4 -1
- package/src/__tests__/credential-security-invariants.test.ts +0 -2
- package/src/__tests__/guardian-verify-setup-skill-regression.test.ts +2 -2
- package/src/__tests__/ipc-snapshot.test.ts +0 -9
- package/src/__tests__/onboarding-template-contract.test.ts +10 -20
- package/src/__tests__/relay-server.test.ts +3 -3
- package/src/__tests__/runtime-events-sse-parity.test.ts +10 -0
- package/src/__tests__/runtime-events-sse.test.ts +7 -0
- package/src/__tests__/session-runtime-assembly.test.ts +34 -8
- package/src/__tests__/system-prompt.test.ts +7 -1
- package/src/__tests__/trusted-contact-approval-notifier.test.ts +12 -8
- package/src/__tests__/twilio-routes-twiml.test.ts +2 -2
- package/src/__tests__/twilio-routes.test.ts +2 -3
- package/src/__tests__/voice-quality.test.ts +21 -132
- package/src/calls/call-controller.ts +34 -29
- package/src/calls/relay-server.ts +11 -5
- package/src/calls/twilio-routes.ts +4 -38
- package/src/calls/voice-quality.ts +7 -63
- package/src/config/bundled-skills/guardian-verify-setup/SKILL.md +7 -10
- package/src/config/bundled-skills/messaging/SKILL.md +3 -5
- package/src/config/bundled-skills/phone-calls/SKILL.md +144 -83
- package/src/config/bundled-skills/sms-setup/SKILL.md +0 -20
- package/src/config/bundled-skills/twilio-setup/SKILL.md +9 -17
- package/src/config/bundled-skills/voice-setup/SKILL.md +36 -1
- package/src/config/bundled-skills/voice-setup/icon.svg +20 -0
- package/src/config/calls-schema.ts +3 -53
- package/src/config/elevenlabs-schema.ts +33 -0
- package/src/config/schema.ts +183 -137
- package/src/config/types.ts +0 -1
- package/src/daemon/handlers/browser.ts +1 -6
- package/src/daemon/ipc-contract/browser.ts +5 -14
- package/src/daemon/ipc-contract-inventory.json +0 -2
- package/src/daemon/session-agent-loop-handlers.ts +3 -0
- package/src/daemon/session-runtime-assembly.ts +9 -7
- package/src/mcp/client.ts +2 -1
- package/src/memory/conversation-crud.ts +339 -166
- package/src/runtime/auth/middleware.ts +87 -26
- package/src/runtime/routes/events-routes.ts +7 -0
- package/src/runtime/routes/inbound-message-handler.ts +3 -4
- package/src/schedule/scheduler.ts +159 -45
- package/src/security/secure-keys.ts +3 -3
- package/src/tools/browser/browser-manager.ts +72 -228
- package/src/tools/browser/browser-screencast.ts +0 -5
- package/src/tools/network/script-proxy/certs.ts +7 -237
- package/src/tools/network/script-proxy/connect-tunnel.ts +1 -82
- package/src/tools/network/script-proxy/http-forwarder.ts +2 -151
- package/src/tools/network/script-proxy/logging.ts +12 -196
- package/src/tools/network/script-proxy/mitm-handler.ts +2 -270
- package/src/tools/network/script-proxy/policy.ts +4 -152
- package/src/tools/network/script-proxy/router.ts +2 -60
- package/src/tools/network/script-proxy/server.ts +5 -137
- package/src/tools/network/script-proxy/types.ts +19 -125
- package/src/tools/system/voice-config.ts +23 -1
- package/src/util/logger.ts +4 -1
- package/src/__tests__/elevenlabs-config.test.ts +0 -95
- package/src/__tests__/twilio-routes-elevenlabs.test.ts +0 -407
- package/src/calls/elevenlabs-config.ts +0 -32
|
@@ -215,44 +215,36 @@ Confirm:
|
|
|
215
215
|
|
|
216
216
|
Tell the user: **"Twilio is configured. Your assistant's phone number is {phoneNumber}. This number is used for both voice calls and SMS messaging."**
|
|
217
217
|
|
|
218
|
-
## Step 5.5: Guardian Verification (
|
|
218
|
+
## Step 5.5: Guardian Verification (Voice)
|
|
219
219
|
|
|
220
|
-
Now link the user's phone number as the trusted
|
|
220
|
+
Now link the user's phone number as the trusted voice guardian. Tell the user: "Now let's verify your guardian identity for voice. This links your phone number so the assistant can verify inbound callers."
|
|
221
221
|
|
|
222
222
|
Load the **guardian-verify-setup** skill to handle the verification flow:
|
|
223
223
|
|
|
224
224
|
- Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
|
|
225
225
|
|
|
226
|
-
The guardian-verify-setup skill manages the full outbound verification flow
|
|
226
|
+
When invoking the skill, indicate the channel is `voice`. The guardian-verify-setup skill manages the full outbound verification flow, including:
|
|
227
227
|
|
|
228
228
|
- Collecting the user's phone number as the destination (accepts any common format -- the API normalizes to E.164)
|
|
229
|
-
- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start`
|
|
230
|
-
-
|
|
231
|
-
-
|
|
229
|
+
- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "voice"`
|
|
230
|
+
- Calling the phone number and providing a code for the user to enter via their phone's keypad
|
|
231
|
+
- Proactively polling for completion (voice auto-check) so the user gets instant confirmation
|
|
232
232
|
- Checking guardian status to confirm the binding was created
|
|
233
233
|
- Handling resend, cancel, and error cases
|
|
234
234
|
|
|
235
|
-
|
|
235
|
+
Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted voice guardian."_
|
|
236
236
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
After the guardian-verify-setup skill completes verification for a channel, load it again for the next channel if needed. Once all desired channels are verified (or the user skips), continue to Step 6.
|
|
237
|
+
After the guardian-verify-setup skill completes (or the user skips), continue to Step 6.
|
|
240
238
|
|
|
241
239
|
**Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed to Step 6 without blocking.
|
|
242
240
|
|
|
243
|
-
To re-check guardian status later
|
|
241
|
+
To re-check guardian status later:
|
|
244
242
|
|
|
245
243
|
```bash
|
|
246
|
-
# Check SMS guardian status
|
|
247
|
-
curl -s "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/status?channel=sms" \
|
|
248
|
-
-H "Authorization: Bearer $GATEWAY_AUTH_TOKEN"
|
|
249
|
-
# Check voice guardian status
|
|
250
244
|
curl -s "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/status?channel=voice" \
|
|
251
245
|
-H "Authorization: Bearer $GATEWAY_AUTH_TOKEN"
|
|
252
246
|
```
|
|
253
247
|
|
|
254
|
-
Check the status for whichever channel(s) the user actually verified (SMS, voice, or both). Report the guardian verification result per channel: **"Guardian identity — SMS: {verified | not configured}, Voice: {verified | not configured}."**
|
|
255
|
-
|
|
256
248
|
## Step 6: Enable Features
|
|
257
249
|
|
|
258
250
|
Now that Twilio is configured, the user can enable the features that depend on it:
|
|
@@ -9,7 +9,7 @@ You are helping the user set up and troubleshoot voice features (push-to-talk, w
|
|
|
9
9
|
|
|
10
10
|
## Available Tools
|
|
11
11
|
|
|
12
|
-
- `voice_config_update` — Change any voice setting (PTT key, wake word enabled/keyword/timeout)
|
|
12
|
+
- `voice_config_update` — Change any voice setting (PTT key, wake word enabled/keyword/timeout, TTS voice ID)
|
|
13
13
|
- `open_system_settings` — Open macOS System Settings to a specific privacy pane
|
|
14
14
|
- `navigate_settings_tab` — Open the Vellum settings panel to the Voice tab
|
|
15
15
|
- `credential_store` — Collect API keys securely (for ElevenLabs TTS)
|
|
@@ -66,6 +66,41 @@ Ask if they want high-quality text-to-speech voices via ElevenLabs (optional —
|
|
|
66
66
|
2. Use `credential_store` with `action: "prompt"`, `service: "elevenlabs"`, `field: "api_key"` to show a secure input dialog.
|
|
67
67
|
3. After the key is stored, confirm success.
|
|
68
68
|
|
|
69
|
+
#### Choose an ElevenLabs voice
|
|
70
|
+
|
|
71
|
+
After storing the API key, let the user pick their preferred voice. The shared config key `elevenlabs.voiceId` controls the voice for **both** in-app TTS and phone calls; it defaults to Rachel.
|
|
72
|
+
|
|
73
|
+
Check the current voice:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
vellum config get elevenlabs.voiceId
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Ask the user if they want to change their TTS voice. If yes, use `voice_config_update` with `setting: "tts_voice_id"` and the chosen voice ID. In one call, this both writes the value to the config file (`elevenlabs.voiceId`) and pushes it to the macOS app via IPC.
|
|
80
|
+
|
|
81
|
+
Common choices from the curated ElevenLabs list:
|
|
82
|
+
- **Rachel** (`21m00Tcm4TlvDq8ikWAM`) — Calm, warm, conversational (default)
|
|
83
|
+
- **Sarah** (`EXAVITQu4vr4xnSDxMaL`) — Soft, young, approachable
|
|
84
|
+
- **Charlotte** (`XB0fDUnXU5powFXDhCwa`) — Warm, Swedish-accented
|
|
85
|
+
- **Josh** (`TxGEqnHWrfWFTfGW9XjX`) — Deep, young, clear
|
|
86
|
+
- **Adam** (`pNInz6obpgDQGcFmaJgB`) — Deep, middle-aged, professional
|
|
87
|
+
|
|
88
|
+
If the user wants to browse more voices, they can search at https://elevenlabs.io/voice-library or use the ElevenLabs API with their key.
|
|
89
|
+
|
|
90
|
+
#### Sync with phone calls
|
|
91
|
+
|
|
92
|
+
After setting the voice, check whether phone calls are configured:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
vellum config get calls.enabled
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**If phone calls are enabled** (`calls.enabled` is `true`):
|
|
99
|
+
- Tell the user their phone calls will automatically use the same voice they just chose, since both in-app TTS and phone calls read from `elevenlabs.voiceId`.
|
|
100
|
+
|
|
101
|
+
**If phone calls are not yet configured** (`calls.enabled` is `false` or not set):
|
|
102
|
+
- Tell the user: "When you set up phone calls later, they'll automatically use the same voice for a consistent experience."
|
|
103
|
+
|
|
69
104
|
### 5. Verification
|
|
70
105
|
|
|
71
106
|
After setup is complete:
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
<svg viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
|
|
2
|
+
<rect x="7" y="1" width="2" height="2" fill="#4A90E2"/>
|
|
3
|
+
<rect x="6" y="3" width="4" height="1" fill="#4A90E2"/>
|
|
4
|
+
<rect x="5" y="4" width="6" height="1" fill="#7B68EE"/>
|
|
5
|
+
<rect x="4" y="5" width="8" height="1" fill="#7B68EE"/>
|
|
6
|
+
<rect x="4" y="6" width="8" height="1" fill="#4A90E2"/>
|
|
7
|
+
<rect x="5" y="7" width="6" height="1" fill="#4A90E2"/>
|
|
8
|
+
<rect x="6" y="8" width="4" height="1" fill="#7B68EE"/>
|
|
9
|
+
<rect x="7" y="9" width="2" height="1" fill="#7B68EE"/>
|
|
10
|
+
<rect x="3" y="10" width="2" height="4" fill="#50C878"/>
|
|
11
|
+
<rect x="4" y="10" width="1" height="1" fill="#50C878"/>
|
|
12
|
+
<rect x="5" y="11" width="1" height="1" fill="#50C878"/>
|
|
13
|
+
<rect x="11" y="10" width="2" height="4" fill="#E74C3C"/>
|
|
14
|
+
<rect x="10" y="10" width="1" height="1" fill="#E74C3C"/>
|
|
15
|
+
<rect x="9" y="11" width="1" height="1" fill="#E74C3C"/>
|
|
16
|
+
<rect x="6" y="11" width="4" height="1" fill="#FFD700"/>
|
|
17
|
+
<rect x="5" y="12" width="6" height="1" fill="#FFD700"/>
|
|
18
|
+
<rect x="4" y="13" width="8" height="1" fill="#FFD700"/>
|
|
19
|
+
<rect x="7" y="14" width="2" height="2" fill="#4A90E2"/>
|
|
20
|
+
</svg>
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
|
|
3
3
|
const VALID_CALL_PROVIDERS = ['twilio'] as const;
|
|
4
|
-
const VALID_CALL_VOICE_MODES = ['twilio_standard', 'twilio_elevenlabs_tts', 'elevenlabs_agent'] as const;
|
|
5
4
|
export const VALID_CALLER_IDENTITY_MODES = ['assistant_number', 'user_number'] as const;
|
|
6
5
|
const VALID_CALL_TRANSCRIPTION_PROVIDERS = ['Deepgram', 'Google'] as const;
|
|
7
6
|
|
|
@@ -20,51 +19,7 @@ export const CallsSafetyConfigSchema = z.object({
|
|
|
20
19
|
.default([]),
|
|
21
20
|
});
|
|
22
21
|
|
|
23
|
-
export const CallsElevenLabsConfigSchema = z.object({
|
|
24
|
-
voiceId: z
|
|
25
|
-
.string({ error: 'calls.voice.elevenlabs.voiceId must be a string' })
|
|
26
|
-
.default(''),
|
|
27
|
-
voiceModelId: z
|
|
28
|
-
.string({ error: 'calls.voice.elevenlabs.voiceModelId must be a string' })
|
|
29
|
-
.default(''),
|
|
30
|
-
speed: z
|
|
31
|
-
.number({ error: 'calls.voice.elevenlabs.speed must be a number' })
|
|
32
|
-
.min(0.7, 'calls.voice.elevenlabs.speed must be >= 0.7')
|
|
33
|
-
.max(1.2, 'calls.voice.elevenlabs.speed must be <= 1.2')
|
|
34
|
-
.default(1.0),
|
|
35
|
-
stability: z
|
|
36
|
-
.number({ error: 'calls.voice.elevenlabs.stability must be a number' })
|
|
37
|
-
.min(0, 'calls.voice.elevenlabs.stability must be >= 0')
|
|
38
|
-
.max(1, 'calls.voice.elevenlabs.stability must be <= 1')
|
|
39
|
-
.default(0.5),
|
|
40
|
-
similarityBoost: z
|
|
41
|
-
.number({ error: 'calls.voice.elevenlabs.similarityBoost must be a number' })
|
|
42
|
-
.min(0, 'calls.voice.elevenlabs.similarityBoost must be >= 0')
|
|
43
|
-
.max(1, 'calls.voice.elevenlabs.similarityBoost must be <= 1')
|
|
44
|
-
.default(0.75),
|
|
45
|
-
useSpeakerBoost: z
|
|
46
|
-
.boolean({ error: 'calls.voice.elevenlabs.useSpeakerBoost must be a boolean' })
|
|
47
|
-
.default(true),
|
|
48
|
-
agentId: z
|
|
49
|
-
.string({ error: 'calls.voice.elevenlabs.agentId must be a string' })
|
|
50
|
-
.default(''),
|
|
51
|
-
apiBaseUrl: z
|
|
52
|
-
.string({ error: 'calls.voice.elevenlabs.apiBaseUrl must be a string' })
|
|
53
|
-
.default('https://api.elevenlabs.io'),
|
|
54
|
-
registerCallTimeoutMs: z
|
|
55
|
-
.number({ error: 'calls.voice.elevenlabs.registerCallTimeoutMs must be a number' })
|
|
56
|
-
.int('calls.voice.elevenlabs.registerCallTimeoutMs must be an integer')
|
|
57
|
-
.min(1000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be >= 1000')
|
|
58
|
-
.max(15000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be <= 15000')
|
|
59
|
-
.default(5000),
|
|
60
|
-
});
|
|
61
|
-
|
|
62
22
|
export const CallsVoiceConfigSchema = z.object({
|
|
63
|
-
mode: z
|
|
64
|
-
.enum(VALID_CALL_VOICE_MODES, {
|
|
65
|
-
error: `calls.voice.mode must be one of: ${VALID_CALL_VOICE_MODES.join(', ')}`,
|
|
66
|
-
})
|
|
67
|
-
.default('twilio_standard'),
|
|
68
23
|
language: z
|
|
69
24
|
.string({ error: 'calls.voice.language must be a string' })
|
|
70
25
|
.default('en-US'),
|
|
@@ -73,10 +28,6 @@ export const CallsVoiceConfigSchema = z.object({
|
|
|
73
28
|
error: `calls.voice.transcriptionProvider must be one of: ${VALID_CALL_TRANSCRIPTION_PROVIDERS.join(', ')}`,
|
|
74
29
|
})
|
|
75
30
|
.default('Deepgram'),
|
|
76
|
-
fallbackToStandardOnError: z
|
|
77
|
-
.boolean({ error: 'calls.voice.fallbackToStandardOnError must be a boolean' })
|
|
78
|
-
.default(true),
|
|
79
|
-
elevenlabs: CallsElevenLabsConfigSchema.default(CallsElevenLabsConfigSchema.parse({})),
|
|
80
31
|
});
|
|
81
32
|
|
|
82
33
|
export const CallerIdentityConfigSchema = z.object({
|
|
@@ -142,7 +93,7 @@ export const CallsConfigSchema = z.object({
|
|
|
142
93
|
.int('calls.guardianWaitUpdateInitialIntervalMs must be an integer')
|
|
143
94
|
.min(1000, 'calls.guardianWaitUpdateInitialIntervalMs must be >= 1000')
|
|
144
95
|
.max(60_000, 'calls.guardianWaitUpdateInitialIntervalMs must be at most 60000')
|
|
145
|
-
.default(
|
|
96
|
+
.default(15_000),
|
|
146
97
|
guardianWaitUpdateInitialWindowMs: z
|
|
147
98
|
.number({ error: 'calls.guardianWaitUpdateInitialWindowMs must be a number' })
|
|
148
99
|
.int('calls.guardianWaitUpdateInitialWindowMs must be an integer')
|
|
@@ -154,13 +105,13 @@ export const CallsConfigSchema = z.object({
|
|
|
154
105
|
.int('calls.guardianWaitUpdateSteadyMinIntervalMs must be an integer')
|
|
155
106
|
.min(1000, 'calls.guardianWaitUpdateSteadyMinIntervalMs must be >= 1000')
|
|
156
107
|
.max(60_000, 'calls.guardianWaitUpdateSteadyMinIntervalMs must be at most 60000')
|
|
157
|
-
.default(
|
|
108
|
+
.default(20_000),
|
|
158
109
|
guardianWaitUpdateSteadyMaxIntervalMs: z
|
|
159
110
|
.number({ error: 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be a number' })
|
|
160
111
|
.int('calls.guardianWaitUpdateSteadyMaxIntervalMs must be an integer')
|
|
161
112
|
.min(1000, 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be >= 1000')
|
|
162
113
|
.max(60_000, 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be at most 60000')
|
|
163
|
-
.default(
|
|
114
|
+
.default(30_000),
|
|
164
115
|
disclosure: CallsDisclosureConfigSchema.default(CallsDisclosureConfigSchema.parse({})),
|
|
165
116
|
safety: CallsSafetyConfigSchema.default(CallsSafetyConfigSchema.parse({})),
|
|
166
117
|
voice: CallsVoiceConfigSchema.default(CallsVoiceConfigSchema.parse({})),
|
|
@@ -175,6 +126,5 @@ export type CallsConfig = z.infer<typeof CallsConfigSchema>;
|
|
|
175
126
|
export type CallsDisclosureConfig = z.infer<typeof CallsDisclosureConfigSchema>;
|
|
176
127
|
export type CallsSafetyConfig = z.infer<typeof CallsSafetyConfigSchema>;
|
|
177
128
|
export type CallsVoiceConfig = z.infer<typeof CallsVoiceConfigSchema>;
|
|
178
|
-
export type CallsElevenLabsConfig = z.infer<typeof CallsElevenLabsConfigSchema>;
|
|
179
129
|
export type CallerIdentityConfig = z.infer<typeof CallerIdentityConfigSchema>;
|
|
180
130
|
export type CallsVerificationConfig = z.infer<typeof CallsVerificationConfigSchema>;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
|
|
3
|
+
// Default ElevenLabs voice — "Rachel" (calm, warm, conversational).
|
|
4
|
+
// Used by both in-app TTS and phone calls (via Twilio ConversationRelay).
|
|
5
|
+
// Mirrored in: clients/macos/.../OpenAIVoiceService.swift (defaultVoiceId)
|
|
6
|
+
export const DEFAULT_ELEVENLABS_VOICE_ID = '21m00Tcm4TlvDq8ikWAM';
|
|
7
|
+
|
|
8
|
+
export const ElevenLabsConfigSchema = z.object({
|
|
9
|
+
voiceId: z
|
|
10
|
+
.string({ error: 'elevenlabs.voiceId must be a string' })
|
|
11
|
+
.min(1, 'elevenlabs.voiceId must not be empty')
|
|
12
|
+
.default(DEFAULT_ELEVENLABS_VOICE_ID),
|
|
13
|
+
voiceModelId: z
|
|
14
|
+
.string({ error: 'elevenlabs.voiceModelId must be a string' })
|
|
15
|
+
.default(''),
|
|
16
|
+
speed: z
|
|
17
|
+
.number({ error: 'elevenlabs.speed must be a number' })
|
|
18
|
+
.min(0.7, 'elevenlabs.speed must be >= 0.7')
|
|
19
|
+
.max(1.2, 'elevenlabs.speed must be <= 1.2')
|
|
20
|
+
.default(1.0),
|
|
21
|
+
stability: z
|
|
22
|
+
.number({ error: 'elevenlabs.stability must be a number' })
|
|
23
|
+
.min(0, 'elevenlabs.stability must be >= 0')
|
|
24
|
+
.max(1, 'elevenlabs.stability must be <= 1')
|
|
25
|
+
.default(0.5),
|
|
26
|
+
similarityBoost: z
|
|
27
|
+
.number({ error: 'elevenlabs.similarityBoost must be a number' })
|
|
28
|
+
.min(0, 'elevenlabs.similarityBoost must be >= 0')
|
|
29
|
+
.max(1, 'elevenlabs.similarityBoost must be <= 1')
|
|
30
|
+
.default(0.75),
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
export type ElevenLabsConfig = z.infer<typeof ElevenLabsConfigSchema>;
|