npm - @vellumai/assistant - Versions diffs - 0.4.22 → 0.4.25 - Mend

@vellumai/assistant 0.4.22 → 0.4.25

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

package/bun.lock +3 -0
package/package.json +2 -1
package/scripts/ipc/check-swift-decoder-drift.ts +55 -44
package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +0 -90
package/src/__tests__/assistant-events-sse-hardening.test.ts +9 -3
package/src/__tests__/config-schema.test.ts +38 -178
package/src/__tests__/conversation-routes-guardian-reply.test.ts +4 -1
package/src/__tests__/credential-security-invariants.test.ts +0 -2
package/src/__tests__/guardian-verify-setup-skill-regression.test.ts +2 -2
package/src/__tests__/headless-browser-interactions.test.ts +0 -4
package/src/__tests__/ipc-snapshot.test.ts +0 -63
package/src/__tests__/onboarding-template-contract.test.ts +10 -20
package/src/__tests__/relay-server.test.ts +3 -3
package/src/__tests__/resolve-guardian-trust-class.test.ts +61 -0
package/src/__tests__/runtime-events-sse-parity.test.ts +10 -0
package/src/__tests__/runtime-events-sse.test.ts +7 -0
package/src/__tests__/session-init.benchmark.test.ts +0 -4
package/src/__tests__/session-runtime-assembly.test.ts +34 -8
package/src/__tests__/system-prompt.test.ts +7 -1
package/src/__tests__/trusted-contact-approval-notifier.test.ts +12 -8
package/src/__tests__/twilio-routes-twiml.test.ts +2 -2
package/src/__tests__/twilio-routes.test.ts +2 -3
package/src/__tests__/voice-quality.test.ts +21 -132
package/src/calls/relay-server.ts +11 -5
package/src/calls/twilio-routes.ts +4 -38
package/src/calls/voice-quality.ts +7 -63
package/src/config/bundled-skills/guardian-verify-setup/SKILL.md +7 -10
package/src/config/bundled-skills/messaging/SKILL.md +3 -5
package/src/config/bundled-skills/phone-calls/SKILL.md +143 -82
package/src/config/bundled-skills/sms-setup/SKILL.md +0 -20
package/src/config/bundled-skills/twilio-setup/SKILL.md +9 -17
package/src/config/bundled-skills/voice-setup/SKILL.md +36 -1
package/src/config/bundled-skills/voice-setup/icon.svg +20 -0
package/src/config/calls-schema.ts +3 -53
package/src/config/elevenlabs-schema.ts +33 -0
package/src/config/schema.ts +183 -137
package/src/config/types.ts +0 -1
package/src/daemon/daemon-control.ts +3 -0
package/src/daemon/handlers/browser.ts +2 -53
package/src/daemon/ipc-contract/browser.ts +5 -84
package/src/daemon/ipc-contract/surfaces.ts +51 -48
package/src/daemon/ipc-contract-inventory.json +0 -9
package/src/daemon/session-agent-loop-handlers.ts +3 -0
package/src/daemon/session-agent-loop.ts +2 -1
package/src/daemon/session-runtime-assembly.ts +9 -7
package/src/daemon/session-tool-setup.ts +27 -13
package/src/mcp/client.ts +2 -1
package/src/memory/conversation-crud.ts +339 -166
package/src/memory/migrations/102-alter-table-columns.ts +254 -37
package/src/memory/schema.ts +1227 -1035
package/src/runtime/routes/events-routes.ts +7 -0
package/src/runtime/routes/inbound-message-handler.ts +3 -4
package/src/schedule/scheduler.ts +159 -45
package/src/security/secure-keys.ts +3 -3
package/src/tools/browser/browser-execution.ts +314 -331
package/src/tools/browser/browser-handoff.ts +11 -37
package/src/tools/browser/browser-manager.ts +203 -352
package/src/tools/browser/browser-screencast.ts +15 -76
package/src/tools/network/script-proxy/certs.ts +7 -237
package/src/tools/network/script-proxy/connect-tunnel.ts +1 -82
package/src/tools/network/script-proxy/http-forwarder.ts +2 -151
package/src/tools/network/script-proxy/logging.ts +12 -196
package/src/tools/network/script-proxy/mitm-handler.ts +2 -270
package/src/tools/network/script-proxy/policy.ts +4 -152
package/src/tools/network/script-proxy/router.ts +2 -60
package/src/tools/network/script-proxy/server.ts +5 -137
package/src/tools/network/script-proxy/types.ts +19 -125
package/src/tools/system/voice-config.ts +23 -1
package/src/util/logger.ts +4 -1
package/src/__tests__/elevenlabs-config.test.ts +0 -95
package/src/__tests__/twilio-routes-elevenlabs.test.ts +0 -407
package/src/calls/elevenlabs-config.ts +0 -32

package/src/config/bundled-skills/phone-calls/SKILL.md CHANGED Viewed

@@ -11,7 +11,7 @@ You are helping the user set up and manage phone calls via Twilio. This skill co
 ## Overview
-The calling system uses Twilio's ConversationRelay for both **outbound** and **inbound** voice calls. Twilio works out of the box as the default voice provider. Optionally, you can enable ElevenLabs integration for higher-quality, more natural-sounding voices — but this is entirely optional.
+The calling system uses Twilio's ConversationRelay for both **outbound** and **inbound** voice calls, with **ElevenLabs** providing the text-to-speech voice. After Twilio setup, the assistant configures ElevenLabs as the TTS provider and prompts the user to choose a voice from a curated list of supported options.
 ### Outbound calls
@@ -34,14 +34,6 @@ When someone dials the assistant's Twilio phone number:
 5. Once verified (or if no challenge is pending), the LLM orchestrator greets the caller in a receptionist style: "Hello, this is [user]'s assistant. How can I help you today?"
 6. The assistant converses naturally, using ASK_GUARDIAN to consult the user when needed, just like outbound calls.
-Three voice quality modes are available:
-- **`twilio_standard`** (default) — Fully supported. Standard Twilio TTS with Google voices. No extra setup required.
-- **`twilio_elevenlabs_tts`** — Fully supported. Uses ElevenLabs voices through Twilio ConversationRelay for more natural speech.
-- **`elevenlabs_agent`** — **Experimental/restricted.** Full ElevenLabs conversational agent mode. Consultation bridging (`waiting_on_user`) is not yet supported in this mode; the runtime guard blocks it before any ElevenLabs API calls are made. See the "Runtime behavior" section below for fallback and strict-fail details.
-You can keep using Twilio only — no changes needed. Enabling ElevenLabs can improve naturalness and quality.
 The user's assistant gets its own personal phone number through Twilio. All implicit calls (without an explicit mode) always use this assistant number. Optionally, users can call from their own phone number if it's authorized with the Twilio account — this must be explicitly requested per call via `caller_identity_mode="user_number"`.
 ## Step 1: Verify Twilio Setup
@@ -79,18 +71,105 @@ Verify:
 vellum config get calls.enabled
 ```
-## Step 3: Verify Setup (Test Call)
+## Step 3: Choose a Voice
+After enabling calls, let the user choose an ElevenLabs voice. Twilio has a native ElevenLabs integration — no separate ElevenLabs account or API key is needed.
+### Voice consistency with in-app TTS
+The shared config key `elevenlabs.voiceId` is the single source of truth for ElevenLabs voice identity. Both in-app TTS and phone calls read from it (defaulting to **Rachel** — `21m00Tcm4TlvDq8ikWAM`).
+Before presenting the voice list, check the current shared voice:
+```bash
+vellum config get elevenlabs.voiceId
+```
+**If a non-default voice is already set**, the user chose it during voice-setup or a previous session. Tell them:
+> "Your assistant currently uses [voice name] for both in-app chat and phone calls. I'll keep the same voice for calls. You can change it if you'd like."
+Skip the selection prompt unless the user wants to change.
+**If the default (Rachel) is set or no override exists**, present the curated voice list below and let them pick. When they choose, set the shared config so both in-app TTS and phone calls use it, as described under **Voice selection** below.
+### Voice selection
+Present the user with a list of supported ElevenLabs voices. These are pre-made voices with stable IDs that work with Twilio ConversationRelay out of the box.
+**Ask the user: "Which voice would you like your assistant to use on phone calls?"**
+Present these voices grouped by category:
+#### Female voices
+| Voice     | Style                          | Voice ID                       |
+| --------- | ------------------------------ | ------------------------------ |
+| Rachel    | Calm, warm, conversational     | `21m00Tcm4TlvDq8ikWAM`        |
+| Sarah     | Soft, young, approachable      | `EXAVITQu4vr4xnSDxMaL`        |
+| Charlotte | Warm, Swedish-accented         | `XB0fDUnXU5powFXDhCwa`        |
+| Alice     | Confident, British             | `Xb7hH8MSUJpSbSDYk0k2`        |
+| Matilda   | Warm, friendly, young          | `XrExE9yKIg1WjnnlVkGX`        |
+| Lily      | Warm, British                  | `pFZP5JQG7iQjIQuC4Bku`        |
+#### Male voices
+| Voice   | Style                            | Voice ID                       |
+| ------- | -------------------------------- | ------------------------------ |
+| Antoni  | Warm, well-rounded               | `ErXwobaYiN019PkySvjV`        |
+| Josh    | Deep, young, clear               | `TxGEqnHWrfWFTfGW9XjX`       |
+| Arnold  | Crisp, narrative                  | `VR6AewLTigWG4xSOukaG`        |
+| Adam    | Deep, middle-aged, professional  | `pNInz6obpgDQGcFmaJgB`        |
+| Bill    | Trustworthy, American            | `pqHfZKP75CvOlQylNhV4`        |
+| George  | Warm, British, distinguished     | `JBFqnCBsd6RMkjVDRZzb`        |
+| Daniel  | Authoritative, British           | `onwK4e9ZLuTAKqWW03F9`        |
+| Charlie | Casual, Australian               | `IKne3meq5aSn9XLyUdCD`        |
+| Liam    | Young, articulate                | `TX3LPaxmHKxFdv7VOQHJ`        |
+After the user picks a voice, use `voice_config_update` to set the shared voice ID. This writes to the config file (`elevenlabs.voiceId`) for phone calls **and** pushes to the macOS app via IPC (`ttsVoiceId`) for in-app TTS in one call:
+```
+voice_config_update setting="tts_voice_id" value="<selected-voice-id>"
+```
+**If the user wants a voice not on this list**, they can browse more voices at https://elevenlabs.io/voice-library and provide the voice ID manually.
+## Step 4: Verify Setup (Test Call)
 Before making real calls, offer a quick verification:
 1. Confirm credentials are stored: check the Twilio config endpoint for `hasCredentials: true` and `phoneNumber`
 2. Confirm ingress is running: `ingress.publicBaseUrl` must be set and the tunnel active
 3. Confirm calls are enabled: `calls.enabled` must be `true`
+4. Confirm voice is configured: `elevenlabs.voiceId` should be set
-Suggest a test call to the user's own phone: **"Want to do a quick test call to your phone to make sure everything works?"**
+Suggest a test call to the user's own phone: **"Want to do a quick test call to your phone to make sure everything works? This is a good way to hear how your chosen voice sounds."**
 If they agree, ask for their personal phone number and place a test call with a simple task like "Introduce yourself and confirm the call system is working."
+## Step 5: Verify Guardian Identity (Voice)
+Now link the user's phone number as the trusted voice guardian. Tell the user: "Now let's verify your guardian identity for voice. This links your phone number so the assistant can verify inbound callers."
+Load the **guardian-verify-setup** skill to handle the verification flow:
+- Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
+When invoking the skill, indicate the channel is `voice`. The guardian-verify-setup skill manages the full outbound verification flow, including:
+- Collecting the user's phone number as the destination
+- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "voice"`
+- Calling the phone number and providing a code for the user to enter via their phone's keypad
+- Proactively polling for completion (voice auto-check) so the user gets instant confirmation
+- Checking guardian status to confirm the binding was created
+- Handling resend, cancel, and error cases
+Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted voice guardian."_
+After the guardian-verify-setup skill completes (or the user skips), continue to the next sections.
+**Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed without blocking. Once verified, inbound callers can be prompted for voice verification before calls proceed (see the **Guardian voice verification for inbound calls** section below).
 ## Caller Identity
 All implicit calls (calls without an explicit `caller_identity_mode`) always use the assistant's Twilio phone number. This is the number that appears on the recipient's caller ID.
@@ -133,88 +212,83 @@ An optional verification step where the callee must enter a numeric code via the
 | `calls.verification.enabled`    | Enable DTMF callee verification           | `false` |
 | `calls.verification.codeLength` | Number of digits in the verification code | `6`     |
-## Optional: Higher Quality Voice with ElevenLabs
+## Advanced Voice Configuration
-ElevenLabs integration is entirely optional. The standard Twilio-only setup works unchanged — this section is only relevant if you want to improve voice quality.
+ElevenLabs is the TTS provider for all calls. This section covers advanced voice selection and tuning.
-### Mode: `twilio_elevenlabs_tts`
+### Changing the voice
-Uses ElevenLabs voices through Twilio's ConversationRelay. Speech is more natural-sounding than the default Google TTS voices.
+To switch to a different voice after initial setup, use `voice_config_update` to set the shared voice ID. This writes to the config file (`elevenlabs.voiceId`) for phone calls **and** pushes to the macOS app via IPC for in-app TTS:
-**Recommended user-friendly workflow (no technical IDs required):**
-1. Ask what kind of voice the user wants (examples: "warm", "professional", "playful", "calm", "deeper", "brighter")
-2. If the user doesn't care, keep `twilio_standard` (simplest path)
-3. If they want higher-quality voice, switch to `twilio_elevenlabs_tts` and choose a matching ElevenLabs voice on their behalf
+```
+voice_config_update setting="tts_voice_id" value="<new-voice-id>"
+```
-The user should not need to know what a `voiceId` is unless they explicitly want advanced/manual control.
+Browse more voices at https://elevenlabs.io/voice-library.
-**Manual/advanced setup (optional):**
+### Advanced voice selection with an ElevenLabs account
-```bash
-vellum config set calls.voice.mode twilio_elevenlabs_tts
-vellum config set calls.voice.elevenlabs.voiceId "<your-voice-id>"
-```
+Users who have an ElevenLabs account and API key (e.g., from the **voice-setup** skill) can go beyond the curated voice list. With an API key, they can:
-By default, the system sends a **bare** `voiceId` to Twilio ConversationRelay (no model/tuning suffix). This is the safest default across voice IDs.
+- **Browse the full ElevenLabs voice library programmatically** — the ElevenLabs API (`GET https://api.elevenlabs.io/v2/voices`) supports searching by name, category, language, and accent. This returns voice IDs, names, labels, and preview URLs.
+- **Use custom or cloned voices** — if the user has created a custom voice or voice clone in their ElevenLabs account, they can use its voice ID here. These voices are available in Twilio ConversationRelay just like pre-made voices.
+- **Preview voices before choosing** — each voice in the API response includes a `preview_url` with an audio sample.
-If you want to force Twilio's extended voice spec, you can optionally set a model ID:
+To check if the user has an API key stored:
 ```bash
-vellum config set calls.voice.elevenlabs.voiceModelId "flash_v2_5"
+credential_store action=get service=elevenlabs field=api_key
 ```
-When `voiceModelId` is set, the emitted voice string becomes:
-`voiceId-model-speed_stability_similarity`.
-### Mode: `elevenlabs_agent` (experimental/restricted)
+If they have a key and want to browse voices, fetch the voice list:
-Full ElevenLabs conversational agent mode. This requires an ElevenLabs account with an agent configured on their platform.
+```bash
+curl -s "https://api.elevenlabs.io/v2/voices?category=premade&page_size=50" \
+  -H "xi-api-key: <api_key_from_credential_store>" | python3 -m json.tool
+```
-**Restriction:** This mode is currently restricted because consultation bridging (`waiting_on_user`) is not yet supported. A runtime guard in `handleVoiceWebhook` blocks `elevenlabs_agent` before any ElevenLabs API calls are made.
+To search for a specific voice style:
-**Setup:**
+```bash
+curl -s "https://api.elevenlabs.io/v2/voices?search=warm+female&page_size=10" \
+  -H "xi-api-key: <api_key_from_credential_store>" | python3 -m json.tool
+```
-1. Store your ElevenLabs API key securely:
+After the user picks a voice, set the shared voice ID:
 ```
-credential_store action=store service=elevenlabs field=api_key value=<your_api_key>
+voice_config_update setting="tts_voice_id" value="<selected-voice-id>"
 ```
-2. Set the voice mode and agent ID:
+### Voice tuning parameters
-```bash
-vellum config set calls.voice.mode elevenlabs_agent
-vellum config set calls.voice.elevenlabs.agentId "<your-agent-id>"
-```
+Fine-tune how the selected voice sounds. These parameters apply to all phone calls:
-### Fallback behavior and `fallbackToStandardOnError`
+```bash
+# Playback speed (0.7 = slower, 1.0 = normal, 1.2 = faster)
+vellum config set elevenlabs.speed 1.0
-By default, `calls.voice.fallbackToStandardOnError` is `true`. This setting controls what happens when an ElevenLabs mode encounters errors or is restricted.
+# Stability (0.0 = more expressive/variable, 1.0 = more consistent/monotone)
+vellum config set elevenlabs.stability 0.5
-#### Invalid configuration (e.g., missing voiceId or agentId)
+# Similarity boost (0.0 = more creative, 1.0 = closer to original voice)
+vellum config set elevenlabs.similarityBoost 0.75
+```
-- **`true` (default):** The profile resolver silently falls back to `twilio_standard` mode and logs a warning. The call proceeds with standard Twilio TTS.
-- **`false`:** The voice webhook returns **HTTP 500** with the specific configuration error details (e.g., `"Voice quality configuration error: calls.voice.elevenlabs.voiceId is required..."`).
+Lower stability makes the voice more expressive but less predictable — good for conversational calls. Higher stability is better for scripted/formal calls.
-#### `elevenlabs_agent` mode guard (consultation bridging unsupported)
+### Voice model tuning
-- **`true` (default):** The `elevenlabs_agent` mode is silently downgraded to standard ConversationRelay TwiML with a warning log. The call proceeds normally with standard Twilio TTS. No ElevenLabs API calls are made.
-- **`false`:** The voice webhook returns **HTTP 501** with the message: `"elevenlabs_agent mode is restricted: consultation bridging (waiting_on_user) is not yet supported."`. No ElevenLabs API calls are made.
+By default, the system sends a **bare** `voiceId` to Twilio ConversationRelay (no model/tuning suffix). This is the safest default across voice IDs.
-You can disable fallback if you want strict ElevenLabs-only behavior:
+If you want to force Twilio's extended voice spec, you can optionally set a model ID:
 ```bash
-vellum config set calls.voice.fallbackToStandardOnError false
+vellum config set elevenlabs.voiceModelId "flash_v2_5"
 ```
-### Reverting to standard Twilio
-To go back to the default voice at any time:
-```bash
-vellum config set calls.voice.mode twilio_standard
-```
+When `voiceModelId` is set, the emitted voice string becomes:
+`voiceId-model-speed_stability_similarity`.
 ## Making Outbound Calls
@@ -477,16 +551,13 @@ All call-related settings can be managed via `vellum config`:
 | `calls.model`                               | Override LLM model for call orchestration                                                                  | _(uses default model)_                                                                                   |
 | `calls.callerIdentity.allowPerCallOverride` | Allow per-call caller identity selection                                                                   | `true`                                                                                                   |
 | `calls.callerIdentity.userNumber`           | E.164 phone number for user-number mode                                                                    | _(empty)_                                                                                                |
-| `calls.voice.mode`                          | Voice quality mode (`twilio_standard`, `twilio_elevenlabs_tts`, `elevenlabs_agent`)                        | `twilio_standard`                                                                                        |
 | `calls.voice.language`                      | Language code for TTS and transcription                                                                    | `en-US`                                                                                                  |
 | `calls.voice.transcriptionProvider`         | Speech-to-text provider (`Deepgram`, `Google`)                                                             | `Deepgram`                                                                                               |
-| `calls.voice.fallbackToStandardOnError`     | Auto-fallback to standard Twilio TTS on ElevenLabs errors                                                  | `true`                                                                                                   |
-| `calls.voice.elevenlabs.voiceId`            | Advanced/internal ElevenLabs voice identifier. Usually set by the assistant based on requested voice style | _(empty)_                                                                                                |
-| `calls.voice.elevenlabs.voiceModelId`       | Optional Twilio ConversationRelay model suffix. Leave empty to send bare `voiceId`                         | _(empty)_                                                                                                |
-| `calls.voice.elevenlabs.agentId`            | ElevenLabs agent ID (for `elevenlabs_agent` mode)                                                          | _(empty)_                                                                                                |
-| `calls.voice.elevenlabs.speed`              | Playback speed (`0.7` – `1.2`)                                                                             | `1.0`                                                                                                    |
-| `calls.voice.elevenlabs.stability`          | Voice stability (`0.0` – `1.0`)                                                                            | `0.5`                                                                                                    |
-| `calls.voice.elevenlabs.similarityBoost`    | Voice similarity boost (`0.0` – `1.0`)                                                                     | `0.75`                                                                                                   |
+| `elevenlabs.voiceId`                        | ElevenLabs voice ID used by both in-app TTS and phone calls. Set during setup from the curated voice list. Defaults to Rachel  | `21m00Tcm4TlvDq8ikWAM`                                                                                  |
+| `elevenlabs.voiceModelId`                   | Optional Twilio ConversationRelay model suffix. Leave empty to send bare `voiceId`                         | _(empty)_                                                                                                |
+| `elevenlabs.speed`                          | Playback speed (`0.7` – `1.2`)                                                                             | `1.0`                                                                                                    |
+| `elevenlabs.stability`                      | Voice stability (`0.0` – `1.0`)                                                                            | `0.5`                                                                                                    |
+| `elevenlabs.similarityBoost`                | Voice similarity boost (`0.0` – `1.0`)                                                                     | `0.75`                                                                                                   |
 ### Adjusting settings
@@ -560,25 +631,15 @@ Or re-run the public-ingress skill to auto-detect and save the new URL.
 The system has a 30-second silence timeout. If nobody speaks for 30 seconds, the agent will ask "Are you still there?" This is expected behavior.
-### Call quality didn't improve after enabling ElevenLabs
+### Call quality sounds off
-- Verify `calls.voice.mode` is set to `twilio_elevenlabs_tts` or `elevenlabs_agent` (not still `twilio_standard`)
+- Verify `elevenlabs.voiceId` is set to a valid ElevenLabs voice ID
 - Ask for the desired voice style again and try a different voice selection
-- If configuring manually: check that `calls.voice.elevenlabs.voiceId` contains a valid ElevenLabs voice ID
-- If mode is `elevenlabs_agent`, ensure `calls.voice.elevenlabs.agentId` is also set
 ### Twilio says "application error" right after answer
 - This often means ConversationRelay rejected voice configuration after TwiML fetch
-- Keep `calls.voice.elevenlabs.voiceModelId` empty first (bare `voiceId` mode)
+- Keep `elevenlabs.voiceModelId` empty first (bare `voiceId` mode)
 - If you set `voiceModelId`, try clearing it and retesting:
-  `vellum config set calls.voice.elevenlabs.voiceModelId ""`
-### ElevenLabs mode falls back to standard
-When `calls.voice.fallbackToStandardOnError` is `true` (the default), the system silently falls back to standard Twilio TTS if ElevenLabs encounters an error or restriction. Check:
+  `vellum config set elevenlabs.voiceModelId ""`
-- For `elevenlabs_agent` mode: this mode is currently restricted (consultation bridging not yet supported) and will always fall back to standard when fallback is enabled. If fallback is disabled, the voice webhook returns HTTP 501.
-- For `twilio_elevenlabs_tts` mode: verify `calls.voice.elevenlabs.voiceId` is set to a valid voice ID
-- For invalid configs (missing voiceId/agentId): if fallback is disabled, the voice webhook returns HTTP 500 with the config error
-- Review daemon logs for warning messages about fallback or guard activation

package/src/config/bundled-skills/sms-setup/SKILL.md CHANGED Viewed

@@ -144,26 +144,6 @@ After deletion, return to Step 3b to collect information and resubmit. Warn the
 **On failure:** Report the exact error message and guide the user through resolution.
-## Step 3.5: Guardian Verification (SMS)
-Now link the user's phone number as the trusted SMS guardian. Tell the user: "Now let's verify your guardian identity for SMS. This links your phone number as the trusted guardian for SMS messaging."
-Load the **guardian-verify-setup** skill to handle the verification flow:
-- Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
-When invoking the skill, indicate the channel is `sms`. The guardian-verify-setup skill manages the full outbound verification flow, including:
-- Collecting the user's phone number as the destination (accepts any common format -- the API normalizes to E.164)
-- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "sms"`
-- Sending a 6-digit code to the phone number that the user must reply with from the SMS channel
-- Checking guardian status to confirm the binding was created
-- Handling resend, cancel, and error cases
-Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted SMS guardian."_
-**Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed to Step 4 without blocking.
 ## Step 4: Test Send
 Run a test SMS to verify end-to-end delivery:

package/src/config/bundled-skills/twilio-setup/SKILL.md CHANGED Viewed

@@ -215,44 +215,36 @@ Confirm:
 Tell the user: **"Twilio is configured. Your assistant's phone number is {phoneNumber}. This number is used for both voice calls and SMS messaging."**
-## Step 5.5: Guardian Verification (SMS and Voice)
+## Step 5.5: Guardian Verification (Voice)
-Now link the user's phone number as the trusted guardian for SMS and/or voice channels. Tell the user: "Now let's verify your guardian identity. This links your phone number as the trusted guardian for messaging and calls."
+Now link the user's phone number as the trusted voice guardian. Tell the user: "Now let's verify your guardian identity for voice. This links your phone number so the assistant can verify inbound callers."
 Load the **guardian-verify-setup** skill to handle the verification flow:
 - Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
-The guardian-verify-setup skill manages the full outbound verification flow for **one channel at a time** (sms, voice, or telegram). Each invocation handles:
+When invoking the skill, indicate the channel is `voice`. The guardian-verify-setup skill manages the full outbound verification flow, including:
 - Collecting the user's phone number as the destination (accepts any common format -- the API normalizes to E.164)
-- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start`
-- For **SMS**: sending a 6-digit code to the phone number that the user must reply with from the SMS channel
-- For **voice**: calling the phone number and providing a code for the user to enter via their phone's keypad
+- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "voice"`
+- Calling the phone number and providing a code for the user to enter via their phone's keypad
+- Proactively polling for completion (voice auto-check) so the user gets instant confirmation
 - Checking guardian status to confirm the binding was created
 - Handling resend, cancel, and error cases
-**If the user wants to verify both SMS and voice**, load the skill twice -- once for SMS and once for voice. Each channel requires its own separate verification session.
+Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted voice guardian."_
-Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted guardian. We'll verify one channel at a time."_
-After the guardian-verify-setup skill completes verification for a channel, load it again for the next channel if needed. Once all desired channels are verified (or the user skips), continue to Step 6.
+After the guardian-verify-setup skill completes (or the user skips), continue to Step 6.
 **Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed to Step 6 without blocking.
-To re-check guardian status later, query the channel(s) that were verified:
+To re-check guardian status later:
 ```bash
-# Check SMS guardian status
-curl -s "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/status?channel=sms" \
-  -H "Authorization: Bearer $GATEWAY_AUTH_TOKEN"
-# Check voice guardian status
 curl -s "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/status?channel=voice" \
   -H "Authorization: Bearer $GATEWAY_AUTH_TOKEN"
 ```
-Check the status for whichever channel(s) the user actually verified (SMS, voice, or both). Report the guardian verification result per channel: **"Guardian identity — SMS: {verified | not configured}, Voice: {verified | not configured}."**
 ## Step 6: Enable Features
 Now that Twilio is configured, the user can enable the features that depend on it:

package/src/config/bundled-skills/voice-setup/SKILL.md CHANGED Viewed

@@ -9,7 +9,7 @@ You are helping the user set up and troubleshoot voice features (push-to-talk, w
 ## Available Tools
-- `voice_config_update` — Change any voice setting (PTT key, wake word enabled/keyword/timeout)
+- `voice_config_update` — Change any voice setting (PTT key, wake word enabled/keyword/timeout, TTS voice ID)
 - `open_system_settings` — Open macOS System Settings to a specific privacy pane
 - `navigate_settings_tab` — Open the Vellum settings panel to the Voice tab
 - `credential_store` — Collect API keys securely (for ElevenLabs TTS)
@@ -66,6 +66,41 @@ Ask if they want high-quality text-to-speech voices via ElevenLabs (optional —
 2. Use `credential_store` with `action: "prompt"`, `service: "elevenlabs"`, `field: "api_key"` to show a secure input dialog.
 3. After the key is stored, confirm success.
+#### Choose an ElevenLabs voice
+After storing the API key, let the user pick their preferred voice. The shared config key `elevenlabs.voiceId` controls the voice for **both** in-app TTS and phone calls (defaulting to Rachel).
+Check the current voice:
+```bash
+vellum config get elevenlabs.voiceId
+```
+Ask the user if they want to change their TTS voice. If yes, use `voice_config_update` with `setting: "tts_voice_id"` and the chosen voice ID. In one call, this writes the config file (`elevenlabs.voiceId`) and pushes the change to the macOS app via IPC.
+Common choices from the curated ElevenLabs list:
+- **Rachel** (`21m00Tcm4TlvDq8ikWAM`) — Calm, warm, conversational (default)
+- **Sarah** (`EXAVITQu4vr4xnSDxMaL`) — Soft, young, approachable
+- **Charlotte** (`XB0fDUnXU5powFXDhCwa`) — Warm, Swedish-accented
+- **Josh** (`TxGEqnHWrfWFTfGW9XjX`) — Deep, young, clear
+- **Adam** (`pNInz6obpgDQGcFmaJgB`) — Deep, middle-aged, professional
+If the user wants to browse more voices, they can search at https://elevenlabs.io/voice-library or use the ElevenLabs API with their key.
+#### Sync with phone calls
+After setting the voice, check whether phone calls are configured:
+```bash
+vellum config get calls.enabled
+```
+**If phone calls are enabled** (`calls.enabled` is `true`):
+- Tell the user their phone calls will automatically use the same voice they just chose, since both in-app TTS and phone calls read from `elevenlabs.voiceId`.
+**If phone calls are not yet configured** (`calls.enabled` is `false` or not set):
+- Tell the user: "When you set up phone calls later, they'll automatically use the same voice for a consistent experience."
 ### 5. Verification
 After setup is complete:

package/src/config/bundled-skills/voice-setup/icon.svg ADDED Viewed

@@ -0,0 +1,20 @@
+<svg viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
+  <rect x="7" y="1" width="2" height="2" fill="#4A90E2"/>
+  <rect x="6" y="3" width="4" height="1" fill="#4A90E2"/>
+  <rect x="5" y="4" width="6" height="1" fill="#7B68EE"/>
+  <rect x="4" y="5" width="8" height="1" fill="#7B68EE"/>
+  <rect x="4" y="6" width="8" height="1" fill="#4A90E2"/>
+  <rect x="5" y="7" width="6" height="1" fill="#4A90E2"/>
+  <rect x="6" y="8" width="4" height="1" fill="#7B68EE"/>
+  <rect x="7" y="9" width="2" height="1" fill="#7B68EE"/>
+  <rect x="3" y="10" width="2" height="4" fill="#50C878"/>
+  <rect x="4" y="10" width="1" height="1" fill="#50C878"/>
+  <rect x="5" y="11" width="1" height="1" fill="#50C878"/>
+  <rect x="11" y="10" width="2" height="4" fill="#E74C3C"/>
+  <rect x="10" y="10" width="1" height="1" fill="#E74C3C"/>
+  <rect x="9" y="11" width="1" height="1" fill="#E74C3C"/>
+  <rect x="6" y="11" width="4" height="1" fill="#FFD700"/>
+  <rect x="5" y="12" width="6" height="1" fill="#FFD700"/>
+  <rect x="4" y="13" width="8" height="1" fill="#FFD700"/>
+  <rect x="7" y="14" width="2" height="2" fill="#4A90E2"/>
+</svg>

package/src/config/calls-schema.ts CHANGED Viewed

@@ -1,7 +1,6 @@
 import { z } from 'zod';
 const VALID_CALL_PROVIDERS = ['twilio'] as const;
-const VALID_CALL_VOICE_MODES = ['twilio_standard', 'twilio_elevenlabs_tts', 'elevenlabs_agent'] as const;
 export const VALID_CALLER_IDENTITY_MODES = ['assistant_number', 'user_number'] as const;
 const VALID_CALL_TRANSCRIPTION_PROVIDERS = ['Deepgram', 'Google'] as const;
@@ -20,51 +19,7 @@ export const CallsSafetyConfigSchema = z.object({
     .default([]),
 });
-export const CallsElevenLabsConfigSchema = z.object({
-  voiceId: z
-    .string({ error: 'calls.voice.elevenlabs.voiceId must be a string' })
-    .default(''),
-  voiceModelId: z
-    .string({ error: 'calls.voice.elevenlabs.voiceModelId must be a string' })
-    .default(''),
-  speed: z
-    .number({ error: 'calls.voice.elevenlabs.speed must be a number' })
-    .min(0.7, 'calls.voice.elevenlabs.speed must be >= 0.7')
-    .max(1.2, 'calls.voice.elevenlabs.speed must be <= 1.2')
-    .default(1.0),
-  stability: z
-    .number({ error: 'calls.voice.elevenlabs.stability must be a number' })
-    .min(0, 'calls.voice.elevenlabs.stability must be >= 0')
-    .max(1, 'calls.voice.elevenlabs.stability must be <= 1')
-    .default(0.5),
-  similarityBoost: z
-    .number({ error: 'calls.voice.elevenlabs.similarityBoost must be a number' })
-    .min(0, 'calls.voice.elevenlabs.similarityBoost must be >= 0')
-    .max(1, 'calls.voice.elevenlabs.similarityBoost must be <= 1')
-    .default(0.75),
-  useSpeakerBoost: z
-    .boolean({ error: 'calls.voice.elevenlabs.useSpeakerBoost must be a boolean' })
-    .default(true),
-  agentId: z
-    .string({ error: 'calls.voice.elevenlabs.agentId must be a string' })
-    .default(''),
-  apiBaseUrl: z
-    .string({ error: 'calls.voice.elevenlabs.apiBaseUrl must be a string' })
-    .default('https://api.elevenlabs.io'),
-  registerCallTimeoutMs: z
-    .number({ error: 'calls.voice.elevenlabs.registerCallTimeoutMs must be a number' })
-    .int('calls.voice.elevenlabs.registerCallTimeoutMs must be an integer')
-    .min(1000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be >= 1000')
-    .max(15000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be <= 15000')
-    .default(5000),
-});
 export const CallsVoiceConfigSchema = z.object({
-  mode: z
-    .enum(VALID_CALL_VOICE_MODES, {
-      error: `calls.voice.mode must be one of: ${VALID_CALL_VOICE_MODES.join(', ')}`,
-    })
-    .default('twilio_standard'),
   language: z
     .string({ error: 'calls.voice.language must be a string' })
     .default('en-US'),
@@ -73,10 +28,6 @@ export const CallsVoiceConfigSchema = z.object({
       error: `calls.voice.transcriptionProvider must be one of: ${VALID_CALL_TRANSCRIPTION_PROVIDERS.join(', ')}`,
     })
     .default('Deepgram'),
-  fallbackToStandardOnError: z
-    .boolean({ error: 'calls.voice.fallbackToStandardOnError must be a boolean' })
-    .default(true),
-  elevenlabs: CallsElevenLabsConfigSchema.default(CallsElevenLabsConfigSchema.parse({})),
 });
 export const CallerIdentityConfigSchema = z.object({
@@ -142,7 +93,7 @@ export const CallsConfigSchema = z.object({
     .int('calls.guardianWaitUpdateInitialIntervalMs must be an integer')
     .min(1000, 'calls.guardianWaitUpdateInitialIntervalMs must be >= 1000')
     .max(60_000, 'calls.guardianWaitUpdateInitialIntervalMs must be at most 60000')
-    .default(5000),
+    .default(15_000),
   guardianWaitUpdateInitialWindowMs: z
     .number({ error: 'calls.guardianWaitUpdateInitialWindowMs must be a number' })
     .int('calls.guardianWaitUpdateInitialWindowMs must be an integer')
@@ -154,13 +105,13 @@ export const CallsConfigSchema = z.object({
     .int('calls.guardianWaitUpdateSteadyMinIntervalMs must be an integer')
     .min(1000, 'calls.guardianWaitUpdateSteadyMinIntervalMs must be >= 1000')
     .max(60_000, 'calls.guardianWaitUpdateSteadyMinIntervalMs must be at most 60000')
-    .default(7000),
+    .default(20_000),
   guardianWaitUpdateSteadyMaxIntervalMs: z
     .number({ error: 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be a number' })
     .int('calls.guardianWaitUpdateSteadyMaxIntervalMs must be an integer')
     .min(1000, 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be >= 1000')
     .max(60_000, 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be at most 60000')
-    .default(10_000),
+    .default(30_000),
   disclosure: CallsDisclosureConfigSchema.default(CallsDisclosureConfigSchema.parse({})),
   safety: CallsSafetyConfigSchema.default(CallsSafetyConfigSchema.parse({})),
   voice: CallsVoiceConfigSchema.default(CallsVoiceConfigSchema.parse({})),
@@ -175,6 +126,5 @@ export type CallsConfig = z.infer<typeof CallsConfigSchema>;
 export type CallsDisclosureConfig = z.infer<typeof CallsDisclosureConfigSchema>;
 export type CallsSafetyConfig = z.infer<typeof CallsSafetyConfigSchema>;
 export type CallsVoiceConfig = z.infer<typeof CallsVoiceConfigSchema>;
-export type CallsElevenLabsConfig = z.infer<typeof CallsElevenLabsConfigSchema>;
 export type CallerIdentityConfig = z.infer<typeof CallerIdentityConfigSchema>;
 export type CallsVerificationConfig = z.infer<typeof CallsVerificationConfigSchema>;

package/src/config/elevenlabs-schema.ts ADDED Viewed

@@ -0,0 +1,33 @@
+import { z } from 'zod';
+// Default ElevenLabs voice — "Rachel" (calm, warm, conversational).
+// Used by both in-app TTS and phone calls (via Twilio ConversationRelay).
+// Mirrored in: clients/macos/.../OpenAIVoiceService.swift (defaultVoiceId)
+export const DEFAULT_ELEVENLABS_VOICE_ID = '21m00Tcm4TlvDq8ikWAM';
+export const ElevenLabsConfigSchema = z.object({ // Zod schema for the `elevenlabs.*` config section; every field below carries a default.
+  voiceId: z
+    .string({ error: 'elevenlabs.voiceId must be a string' })
+    .min(1, 'elevenlabs.voiceId must not be empty') // an explicit '' is rejected; the default only applies when the key is absent
+    .default(DEFAULT_ELEVENLABS_VOICE_ID),
+  voiceModelId: z
+    .string({ error: 'elevenlabs.voiceModelId must be a string' })
+    .default(''), // no length check here, so the empty-string default is itself a valid value
+  speed: z
+    .number({ error: 'elevenlabs.speed must be a number' })
+    .min(0.7, 'elevenlabs.speed must be >= 0.7')
+    .max(1.2, 'elevenlabs.speed must be <= 1.2')
+    .default(1.0), // accepted range is 0.7 to 1.2 inclusive
+  stability: z
+    .number({ error: 'elevenlabs.stability must be a number' })
+    .min(0, 'elevenlabs.stability must be >= 0')
+    .max(1, 'elevenlabs.stability must be <= 1')
+    .default(0.5), // accepted range is 0 to 1 inclusive
+  similarityBoost: z
+    .number({ error: 'elevenlabs.similarityBoost must be a number' })
+    .min(0, 'elevenlabs.similarityBoost must be >= 0')
+    .max(1, 'elevenlabs.similarityBoost must be <= 1')
+    .default(0.75), // accepted range is 0 to 1 inclusive
+});
+export type ElevenLabsConfig = z.infer<typeof ElevenLabsConfigSchema>;