@jambonz/schema 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +974 -0
- package/callbacks/amd.schema.json +50 -0
- package/callbacks/base.schema.json +29 -0
- package/callbacks/call-status.schema.json +22 -0
- package/callbacks/conference-status.schema.json +24 -0
- package/callbacks/conference-wait.schema.json +11 -0
- package/callbacks/conference.schema.json +11 -0
- package/callbacks/dequeue.schema.json +19 -0
- package/callbacks/dial-dtmf.schema.json +18 -0
- package/callbacks/dial-hold.schema.json +22 -0
- package/callbacks/dial-refer.schema.json +28 -0
- package/callbacks/dial.schema.json +31 -0
- package/callbacks/enqueue-wait.schema.json +17 -0
- package/callbacks/enqueue.schema.json +27 -0
- package/callbacks/gather-partial.schema.json +54 -0
- package/callbacks/gather.schema.json +60 -0
- package/callbacks/listen.schema.json +21 -0
- package/callbacks/llm.schema.json +30 -0
- package/callbacks/message.schema.json +35 -0
- package/callbacks/pipeline-turn.schema.json +109 -0
- package/callbacks/play.schema.json +36 -0
- package/callbacks/session-new.schema.json +143 -0
- package/callbacks/session-reconnect.schema.json +9 -0
- package/callbacks/session-redirect.schema.json +38 -0
- package/callbacks/sip-refer-event.schema.json +20 -0
- package/callbacks/sip-refer.schema.json +22 -0
- package/callbacks/sip-request.schema.json +27 -0
- package/callbacks/transcribe-translation.schema.json +24 -0
- package/callbacks/transcribe.schema.json +46 -0
- package/callbacks/tts-streaming-event.schema.json +77 -0
- package/callbacks/verb-status.schema.json +57 -0
- package/components/actionHook.schema.json +36 -0
- package/components/actionHookDelayAction.schema.json +37 -0
- package/components/amd.schema.json +68 -0
- package/components/auth.schema.json +18 -0
- package/components/bidirectionalAudio.schema.json +22 -0
- package/components/fillerNoise.schema.json +25 -0
- package/components/llm-base.schema.json +94 -0
- package/components/recognizer-assemblyAiOptions.schema.json +66 -0
- package/components/recognizer-awsOptions.schema.json +52 -0
- package/components/recognizer-azureOptions.schema.json +32 -0
- package/components/recognizer-cobaltOptions.schema.json +34 -0
- package/components/recognizer-customOptions.schema.json +27 -0
- package/components/recognizer-deepgramOptions.schema.json +147 -0
- package/components/recognizer-elevenlabsOptions.schema.json +39 -0
- package/components/recognizer-gladiaOptions.schema.json +8 -0
- package/components/recognizer-googleOptions.schema.json +35 -0
- package/components/recognizer-houndifyOptions.schema.json +53 -0
- package/components/recognizer-ibmOptions.schema.json +54 -0
- package/components/recognizer-nuanceOptions.schema.json +150 -0
- package/components/recognizer-nvidiaOptions.schema.json +39 -0
- package/components/recognizer-openaiOptions.schema.json +59 -0
- package/components/recognizer-sonioxOptions.schema.json +46 -0
- package/components/recognizer-speechmaticsOptions.schema.json +100 -0
- package/components/recognizer-verbioOptions.schema.json +46 -0
- package/components/recognizer.schema.json +216 -0
- package/components/synthesizer.schema.json +82 -0
- package/components/target.schema.json +105 -0
- package/components/vad.schema.json +48 -0
- package/docs/components/recognizer.md +78 -0
- package/docs/components/synthesizer.md +27 -0
- package/docs/guides/session-commands.md +417 -0
- package/docs/verbs/conference.md +51 -0
- package/docs/verbs/deepgram_s2s.md +108 -0
- package/docs/verbs/dial.md +8 -0
- package/docs/verbs/listen.md +71 -0
- package/docs/verbs/pipeline.md +475 -0
- package/docs/verbs/stream.md +5 -0
- package/index.js +9 -0
- package/jambonz-app.schema.json +112 -0
- package/lib/normalize.js +72 -0
- package/lib/validator.js +137 -0
- package/package.json +39 -0
- package/verbs/alert.schema.json +34 -0
- package/verbs/answer.schema.json +22 -0
- package/verbs/conference.schema.json +107 -0
- package/verbs/config.schema.json +218 -0
- package/verbs/deepgram_s2s.schema.json +81 -0
- package/verbs/dequeue.schema.json +51 -0
- package/verbs/dial.schema.json +187 -0
- package/verbs/dialogflow.schema.json +148 -0
- package/verbs/dtmf.schema.json +49 -0
- package/verbs/dub.schema.json +103 -0
- package/verbs/elevenlabs_s2s.schema.json +81 -0
- package/verbs/enqueue.schema.json +53 -0
- package/verbs/gather.schema.json +188 -0
- package/verbs/google_s2s.schema.json +42 -0
- package/verbs/hangup.schema.json +36 -0
- package/verbs/leave.schema.json +22 -0
- package/verbs/listen.schema.json +127 -0
- package/verbs/llm.schema.json +44 -0
- package/verbs/message.schema.json +82 -0
- package/verbs/openai_s2s.schema.json +42 -0
- package/verbs/pause.schema.json +36 -0
- package/verbs/pipeline.schema.json +240 -0
- package/verbs/play.schema.json +96 -0
- package/verbs/redirect.schema.json +34 -0
- package/verbs/s2s.schema.json +39 -0
- package/verbs/say.schema.json +107 -0
- package/verbs/sip-decline.schema.json +58 -0
- package/verbs/sip-refer.schema.json +58 -0
- package/verbs/sip-request.schema.json +54 -0
- package/verbs/stream.schema.json +103 -0
- package/verbs/tag.schema.json +41 -0
- package/verbs/transcribe.schema.json +57 -0
- package/verbs/ultravox_s2s.schema.json +41 -0
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
# Session Commands
|
|
2
|
+
|
|
3
|
+
Session commands are async operations you can perform at any time during an active WebSocket call. They are not verbs — they don't go in the verb array and don't execute sequentially. Instead, they're SDK method calls that send commands over the WebSocket immediately.
|
|
4
|
+
|
|
5
|
+
## TTS Token Streaming
|
|
6
|
+
|
|
7
|
+
TTS token streaming lets your app pipe text to jambonz incrementally — as tokens arrive from an LLM — rather than waiting for the full response. jambonz buffers the text, breaks it into natural chunks (sentence boundaries), and streams each chunk to the TTS engine. The caller hears the first sentence while the LLM is still generating the rest.
|
|
8
|
+
|
|
9
|
+
**This is different from `autoStreamTts`**. `autoStreamTts` is a jambonz-internal optimization: when a `say` verb has its complete text, jambonz streams the *audio* playback as TTS chunks arrive rather than waiting for full synthesis. Token streaming is app-level: your app sends *text* tokens incrementally via `sendTtsTokens()` while the LLM is still generating.
|
|
10
|
+
|
|
11
|
+
### Setup
|
|
12
|
+
|
|
13
|
+
Enable TTS streaming via the `config` verb before sending tokens:
|
|
14
|
+
|
|
15
|
+
```typescript
|
|
16
|
+
session
|
|
17
|
+
.config({
|
|
18
|
+
ttsStream: { enable: true },
|
|
19
|
+
synthesizer: { vendor: 'elevenlabs', voice: 'EXAVITQu4vr4xnSDxMaL' }, // ElevenLabs requires voice ID, not name
|
|
20
|
+
})
|
|
21
|
+
.say({ text: 'Hi there, how can I help you?' })
|
|
22
|
+
.send();
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
The `ttsStream.enable: true` setting opens a persistent connection to the TTS engine. You can optionally specify a different synthesizer in `ttsStream` to use a different vendor/voice for streaming than the session default.
|
|
26
|
+
|
|
27
|
+
### Methods
|
|
28
|
+
|
|
29
|
+
```typescript
|
|
30
|
+
// Send text tokens as they arrive from the LLM
|
|
31
|
+
await session.sendTtsTokens(tokenText);
|
|
32
|
+
|
|
33
|
+
// Signal end of the current response — flushes any buffered text
|
|
34
|
+
session.flushTtsTokens();
|
|
35
|
+
|
|
36
|
+
// Discard all buffered tokens (e.g. on user interruption)
|
|
37
|
+
session.clearTtsTokens();
|
|
38
|
+
|
|
39
|
+
// Check if jambonz has signalled backpressure
|
|
40
|
+
if (session.isTtsPaused) { /* wait before sending more */ }
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
`sendTtsTokens()` returns a Promise that resolves when jambonz acknowledges receipt. If the buffer is full, jambonz returns a `full` status and the SDK automatically pauses until a `tts:stream_resumed` event arrives.
|
|
44
|
+
|
|
45
|
+
### Events
|
|
46
|
+
|
|
47
|
+
```typescript
|
|
48
|
+
session.on('tts:stream_open', () => { /* vendor connection established */ });
|
|
49
|
+
session.on('tts:stream_paused', () => { /* backpressure: stop sending tokens */ });
|
|
50
|
+
session.on('tts:stream_resumed', () => { /* backpressure released: resume sending */ });
|
|
51
|
+
session.on('tts:stream_closed', () => { /* TTS stream ended */ });
|
|
52
|
+
session.on('tts:user_interrupt', () => { /* user barged in — stop LLM, clear tokens */ });
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Tracking What Was Actually Spoken (`trackTtsPlayout`)
|
|
56
|
+
|
|
57
|
+
When `trackTtsPlayout: true` is set in the `config` verb, jambonz uses word-level alignment data from the TTS engine to track exactly which words were played out to the caller. This requires a TTS vendor that supports alignment (currently ElevenLabs).
|
|
58
|
+
|
|
59
|
+
When the caller interrupts mid-response, jambonz fires a `tts:streaming-event` with:
|
|
60
|
+
```json
|
|
61
|
+
{ "event_type": "tts_spoken", "text": "only the words actually heard", "bargein": true }
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
On normal completion (no interruption):
|
|
65
|
+
```json
|
|
66
|
+
{ "event_type": "tts_spoken", "text": "the full response text", "bargein": false }
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
This is critical for maintaining accurate conversation history with the LLM — record only what the caller actually heard, not the full generated response that may have been cut short.
|
|
70
|
+
|
|
71
|
+
See the `callback:tts-streaming-event` schema for the full event payload.
|
|
72
|
+
|
|
73
|
+
## Inject Commands
|
|
74
|
+
|
|
75
|
+
Inject commands modify an active call without replacing the verb stack. They execute immediately alongside whatever verb is currently running.
|
|
76
|
+
|
|
77
|
+
### Recording
|
|
78
|
+
|
|
79
|
+
```typescript
|
|
80
|
+
// Start recording
|
|
81
|
+
session.injectRecord('startCallRecording', { siprecServerURL: 'sip:recorder@example.com' });
|
|
82
|
+
|
|
83
|
+
// Stop recording
|
|
84
|
+
session.injectRecord('stopCallRecording');
|
|
85
|
+
|
|
86
|
+
// Pause/resume
|
|
87
|
+
session.injectRecord('pauseCallRecording');
|
|
88
|
+
session.injectRecord('resumeCallRecording');
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Mute / Unmute
|
|
92
|
+
|
|
93
|
+
```typescript
|
|
94
|
+
session.injectMute('mute');
|
|
95
|
+
session.injectMute('unmute');
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Whisper
|
|
99
|
+
|
|
100
|
+
Inject a verb to play to one party (e.g. a supervisor message to the agent):
|
|
101
|
+
|
|
102
|
+
```typescript
|
|
103
|
+
session.injectWhisper({ verb: 'say', text: 'You have 5 minutes remaining.' });
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### DTMF
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
session.injectDtmf('1');
|
|
110
|
+
session.injectDtmf('1234#');
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Audio Stream Control
|
|
114
|
+
|
|
115
|
+
Pause or resume an active `listen`/`stream` verb:
|
|
116
|
+
|
|
117
|
+
```typescript
|
|
118
|
+
session.injectListenStatus('pause');
|
|
119
|
+
session.injectListenStatus('resume');
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Tag (Metadata)
|
|
123
|
+
|
|
124
|
+
Attach arbitrary metadata to the call:
|
|
125
|
+
|
|
126
|
+
```typescript
|
|
127
|
+
session.injectTag({ supervisor: 'jane', priority: 'high' });
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Redirect
|
|
131
|
+
|
|
132
|
+
Transfer call control to a different webhook:
|
|
133
|
+
|
|
134
|
+
```typescript
|
|
135
|
+
session.injectCommand('redirect', { call_hook: '/new-flow' });
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Generic Command
|
|
139
|
+
|
|
140
|
+
For any command not covered by a specific method:
|
|
141
|
+
|
|
142
|
+
```typescript
|
|
143
|
+
session.injectCommand('commandName', { ...data });
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Pipeline Update
|
|
147
|
+
|
|
148
|
+
The `updatePipeline()` method sends mid-conversation updates to an active `pipeline` verb. Four operation types are supported:
|
|
149
|
+
|
|
150
|
+
### Update Instructions
|
|
151
|
+
|
|
152
|
+
Replace the LLM system prompt while the conversation is in progress:
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
session.updatePipeline({
|
|
156
|
+
type: 'update_instructions',
|
|
157
|
+
instructions: 'You are now a billing support agent. Help the caller with invoice questions.',
|
|
158
|
+
});
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Inject Context
|
|
162
|
+
|
|
163
|
+
Append messages to the LLM conversation history (e.g. CRM data retrieved after the call started):
|
|
164
|
+
|
|
165
|
+
```typescript
|
|
166
|
+
session.updatePipeline({
|
|
167
|
+
type: 'inject_context',
|
|
168
|
+
messages: [
|
|
169
|
+
{ role: 'system', content: 'Customer account #12345: Gold tier, 3 open tickets.' },
|
|
170
|
+
],
|
|
171
|
+
});
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Update Tools
|
|
175
|
+
|
|
176
|
+
Replace the tool set available to the LLM:
|
|
177
|
+
|
|
178
|
+
```typescript
|
|
179
|
+
session.updatePipeline({
|
|
180
|
+
type: 'update_tools',
|
|
181
|
+
tools: [
|
|
182
|
+
{
|
|
183
|
+
type: 'function',
|
|
184
|
+
function: {
|
|
185
|
+
name: 'transfer_call',
|
|
186
|
+
description: 'Transfer the caller to a specialist',
|
|
187
|
+
parameters: { type: 'object', properties: { department: { type: 'string' } } },
|
|
188
|
+
},
|
|
189
|
+
},
|
|
190
|
+
],
|
|
191
|
+
});
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Generate Reply
|
|
195
|
+
|
|
196
|
+
Prompt the LLM to generate a new response. If the pipeline is not idle, the request is queued and executes when the current turn completes. Use `interrupt: true` to cancel the current response and generate immediately.
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
// Simple prompt
|
|
200
|
+
session.updatePipeline({
|
|
201
|
+
type: 'generate_reply',
|
|
202
|
+
user_input: 'The customer just entered their account number: 12345',
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
// With one-shot instructions
|
|
206
|
+
session.updatePipeline({
|
|
207
|
+
type: 'generate_reply',
|
|
208
|
+
user_input: 'Customer is asking about refunds',
|
|
209
|
+
instructions: 'Be empathetic and offer a 20% discount before processing a refund.',
|
|
210
|
+
});
|
|
211
|
+
|
|
212
|
+
// Interrupt current response and generate a new one
|
|
213
|
+
session.updatePipeline({
|
|
214
|
+
type: 'generate_reply',
|
|
215
|
+
user_input: 'Urgent: supervisor override',
|
|
216
|
+
interrupt: true,
|
|
217
|
+
});
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## LLM Tool Output
|
|
221
|
+
|
|
222
|
+
When using the `pipeline` verb with a `toolHook`, tool call requests arrive as events. Return results with:
|
|
223
|
+
|
|
224
|
+
```typescript
|
|
225
|
+
session.on('/tool-hook', (evt: Record<string, any>) => {
|
|
226
|
+
const { tool_call_id, name, arguments: args } = evt;
|
|
227
|
+
|
|
228
|
+
// Process the tool call...
|
|
229
|
+
const result = { temperature: 72, unit: 'F' };
|
|
230
|
+
|
|
231
|
+
// Return the result to the LLM
|
|
232
|
+
session.toolOutput(tool_call_id, result).reply();
|
|
233
|
+
});
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
The result is stringified and fed back to the LLM as the tool response.
|
|
237
|
+
|
|
238
|
+
## Building a Cascaded Voice AI Agent
|
|
239
|
+
|
|
240
|
+
The **pipeline** verb is the simplest way to build a voice AI agent — jambonz manages everything. But when you need full control over the LLM interaction (custom tool handling, conversation history management, multiple LLM providers, etc.), build a **cascaded agent**: your app handles STT transcripts and LLM calls directly, piping responses back via TTS token streaming.
|
|
241
|
+
|
|
242
|
+
### Architecture
|
|
243
|
+
|
|
244
|
+
```
|
|
245
|
+
config (ttsStream + bargeIn) → say (greeting)
|
|
246
|
+
↓
|
|
247
|
+
bargeIn fires '/speech-detected' with transcript
|
|
248
|
+
↓
|
|
249
|
+
App calls LLM with streaming → tokens arrive
|
|
250
|
+
↓
|
|
251
|
+
sendTtsTokens() pipes each token to jambonz → TTS plays audio
|
|
252
|
+
↓
|
|
253
|
+
flushTtsTokens() when LLM finishes → caller hears full response
|
|
254
|
+
↓
|
|
255
|
+
User speaks again → bargeIn fires → repeat
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
The key mechanism is the `bargeIn` actionHook on the `config` verb. When enabled with `sticky: true`, it persists across all verbs. Whenever the caller speaks, the `/speech-detected` hook fires with the speech transcript — even while TTS is playing (which triggers an interruption). Your app then calls the LLM and streams the response back.
|
|
259
|
+
|
|
260
|
+
### When to Use Cascaded vs Pipeline
|
|
261
|
+
|
|
262
|
+
| | Pipeline verb | Cascaded agent |
|
|
263
|
+
|---|---|---|
|
|
264
|
+
| **STT/LLM/TTS** | jambonz orchestrates all three | App owns the LLM; jambonz handles STT and TTS |
|
|
265
|
+
| **Turn detection** | Built-in (Krisp or STT-native) | App manages via bargeIn actionHook |
|
|
266
|
+
| **Tool calls** | Via toolHook | App handles directly in LLM loop |
|
|
267
|
+
| **Conversation history** | jambonz manages internally | App manages — full control |
|
|
268
|
+
| **Complexity** | Low — one verb | Higher — app manages LLM streaming and history |
|
|
269
|
+
| **Use when** | Standard voice AI agent | Custom LLM logic, multiple providers, precise history control |
|
|
270
|
+
|
|
271
|
+
### Example (TypeScript)
|
|
272
|
+
|
|
273
|
+
```typescript
|
|
274
|
+
import Anthropic from '@anthropic-ai/sdk';
|
|
275
|
+
import http from 'http';
|
|
276
|
+
import { createEndpoint } from '@jambonz/sdk/websocket';
|
|
277
|
+
|
|
278
|
+
const server = http.createServer();
|
|
279
|
+
const makeService = createEndpoint({ server, port: 3000 });
|
|
280
|
+
const svc = makeService({ path: '/' });
|
|
281
|
+
|
|
282
|
+
svc.on('session:new', (session) => {
|
|
283
|
+
const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
|
|
284
|
+
const messages: Array<{ role: 'user' | 'assistant'; content: string }> = [];
|
|
285
|
+
let assistantResponse = '';
|
|
286
|
+
let userInterrupt = false;
|
|
287
|
+
|
|
288
|
+
session
|
|
289
|
+
.on('/speech-detected', async (evt: Record<string, any>) => {
|
|
290
|
+
const { speech } = evt;
|
|
291
|
+
session.reply(); // Acknowledge immediately
|
|
292
|
+
|
|
293
|
+
if (speech?.is_final) {
|
|
294
|
+
const { transcript } = speech.alternatives[0];
|
|
295
|
+
messages.push({ role: 'user', content: transcript });
|
|
296
|
+
userInterrupt = false;
|
|
297
|
+
|
|
298
|
+
const stream = await client.messages.create({
|
|
299
|
+
model: 'claude-sonnet-4-20250514',
|
|
300
|
+
max_tokens: 1024,
|
|
301
|
+
system: 'You are a helpful voice assistant. Keep answers concise.',
|
|
302
|
+
messages,
|
|
303
|
+
stream: true,
|
|
304
|
+
});
|
|
305
|
+
|
|
306
|
+
for await (const event of stream) {
|
|
307
|
+
if (userInterrupt) {
|
|
308
|
+
messages.push({ role: 'assistant', content: `${assistantResponse}...` });
|
|
309
|
+
assistantResponse = '';
|
|
310
|
+
break;
|
|
311
|
+
}
|
|
312
|
+
if ((event as any).delta?.text) {
|
|
313
|
+
const tokens = (event as any).delta.text;
|
|
314
|
+
assistantResponse += tokens;
|
|
315
|
+
session.sendTtsTokens(tokens).catch(console.error);
|
|
316
|
+
} else if (event.type === 'message_stop') {
|
|
317
|
+
session.flushTtsTokens();
|
|
318
|
+
messages.push({ role: 'assistant', content: assistantResponse });
|
|
319
|
+
assistantResponse = '';
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
})
|
|
324
|
+
.on('tts:user_interrupt', () => {
|
|
325
|
+
userInterrupt = true;
|
|
326
|
+
});
|
|
327
|
+
|
|
328
|
+
session
|
|
329
|
+
.config({
|
|
330
|
+
ttsStream: { enable: true },
|
|
331
|
+
bargeIn: {
|
|
332
|
+
enable: true,
|
|
333
|
+
sticky: true,
|
|
334
|
+
minBargeinWordCount: 1,
|
|
335
|
+
actionHook: '/speech-detected',
|
|
336
|
+
input: ['speech'],
|
|
337
|
+
},
|
|
338
|
+
})
|
|
339
|
+
.say({ text: 'Hi there, how can I help you today?' })
|
|
340
|
+
.send();
|
|
341
|
+
});
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
### Example (JavaScript)
|
|
345
|
+
|
|
346
|
+
```javascript
|
|
347
|
+
const Anthropic = require('@anthropic-ai/sdk');
|
|
348
|
+
const http = require('node:http');
|
|
349
|
+
const { createEndpoint } = require('@jambonz/sdk/websocket');
|
|
350
|
+
|
|
351
|
+
const server = http.createServer();
|
|
352
|
+
const makeService = createEndpoint({ server, port: 3000 });
|
|
353
|
+
const svc = makeService({ path: '/' });
|
|
354
|
+
|
|
355
|
+
svc.on('session:new', (session) => {
|
|
356
|
+
const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
|
|
357
|
+
const messages = [];
|
|
358
|
+
let assistantResponse = '';
|
|
359
|
+
let userInterrupt = false;
|
|
360
|
+
|
|
361
|
+
session
|
|
362
|
+
.on('/speech-detected', async (evt) => {
|
|
363
|
+
const { speech } = evt;
|
|
364
|
+
session.reply();
|
|
365
|
+
|
|
366
|
+
if (speech?.is_final) {
|
|
367
|
+
const { transcript } = speech.alternatives[0];
|
|
368
|
+
messages.push({ role: 'user', content: transcript });
|
|
369
|
+
userInterrupt = false;
|
|
370
|
+
|
|
371
|
+
const stream = await client.messages.create({
|
|
372
|
+
model: 'claude-sonnet-4-20250514',
|
|
373
|
+
max_tokens: 1024,
|
|
374
|
+
system: 'You are a helpful voice assistant. Keep answers concise.',
|
|
375
|
+
messages,
|
|
376
|
+
stream: true,
|
|
377
|
+
});
|
|
378
|
+
|
|
379
|
+
for await (const event of stream) {
|
|
380
|
+
if (userInterrupt) {
|
|
381
|
+
messages.push({ role: 'assistant', content: `${assistantResponse}...` });
|
|
382
|
+
assistantResponse = '';
|
|
383
|
+
break;
|
|
384
|
+
}
|
|
385
|
+
if (event.delta?.text) {
|
|
386
|
+
const tokens = event.delta.text;
|
|
387
|
+
assistantResponse += tokens;
|
|
388
|
+
session.sendTtsTokens(tokens).catch(console.error);
|
|
389
|
+
} else if (event.type === 'message_stop') {
|
|
390
|
+
session.flushTtsTokens();
|
|
391
|
+
messages.push({ role: 'assistant', content: assistantResponse });
|
|
392
|
+
assistantResponse = '';
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
})
|
|
397
|
+
.on('tts:user_interrupt', () => {
|
|
398
|
+
userInterrupt = true;
|
|
399
|
+
});
|
|
400
|
+
|
|
401
|
+
session
|
|
402
|
+
.config({
|
|
403
|
+
ttsStream: { enable: true },
|
|
404
|
+
bargeIn: {
|
|
405
|
+
enable: true,
|
|
406
|
+
sticky: true,
|
|
407
|
+
minBargeinWordCount: 1,
|
|
408
|
+
actionHook: '/speech-detected',
|
|
409
|
+
input: ['speech'],
|
|
410
|
+
},
|
|
411
|
+
})
|
|
412
|
+
.say({ text: 'Hi there, how can I help you today?' })
|
|
413
|
+
.send();
|
|
414
|
+
});
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
See the `examples/llm-streaming/` directory for the complete working example.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
## Conference auto-destroy behavior
|
|
2
|
+
|
|
3
|
+
A conference is created when the first participant with `startConferenceOnEnter: true` joins. It is destroyed when the last participant leaves — you do not need to explicitly clean it up.
|
|
4
|
+
|
|
5
|
+
If you want a specific participant (e.g. a moderator) to end the conference for everyone when they leave, set `endConferenceOnExit: true` on that participant only.
|
|
6
|
+
|
|
7
|
+
## PIN-based conferences
|
|
8
|
+
|
|
9
|
+
A common pattern is to collect a PIN via `gather` and use it as the conference name. This lets multiple independent conferences share one application:
|
|
10
|
+
|
|
11
|
+
```typescript
|
|
12
|
+
session.conference({
|
|
13
|
+
name: `pin-${digits}`,
|
|
14
|
+
beep: true,
|
|
15
|
+
startConferenceOnEnter: true,
|
|
16
|
+
actionHook: '/conference-done',
|
|
17
|
+
});
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Waiting room (hold music)
|
|
21
|
+
|
|
22
|
+
If `startConferenceOnEnter` is `false` for a participant, they wait silently until someone with `startConferenceOnEnter: true` joins. Use `waitHook` to play hold music while they wait:
|
|
23
|
+
|
|
24
|
+
```typescript
|
|
25
|
+
session.conference({
|
|
26
|
+
name: 'team-call',
|
|
27
|
+
startConferenceOnEnter: false,
|
|
28
|
+
waitHook: '/hold-music',
|
|
29
|
+
});
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
The `waitHook` handler should return `say` or `play` verbs. jambonz will loop them until the conference starts.
|
|
33
|
+
|
|
34
|
+
## statusHook events
|
|
35
|
+
|
|
36
|
+
Subscribe to events via `statusEvents` and receive them at `statusHook`. Available events: `join`, `leave`, `start-talking`, `stop-talking`.
|
|
37
|
+
|
|
38
|
+
The payload includes `event`, `conferenceSid` (the conference name), and `members` (current participant count). Your handler must reply — in WebSocket mode, call `session.reply()` even if you have no verbs to send.
|
|
39
|
+
|
|
40
|
+
## actionHook fires on exit
|
|
41
|
+
|
|
42
|
+
The `actionHook` fires when this participant leaves the conference (either by hanging up or being removed). Return verbs to continue the call, or `hangup` to end it:
|
|
43
|
+
|
|
44
|
+
```typescript
|
|
45
|
+
session.on('/conference-done', () => {
|
|
46
|
+
session
|
|
47
|
+
.say({ text: 'You have left the conference. Goodbye.' })
|
|
48
|
+
.hangup()
|
|
49
|
+
.reply();
|
|
50
|
+
});
|
|
51
|
+
```
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
## Deepgram Voice Agent configuration
|
|
2
|
+
|
|
3
|
+
See [Deepgram Voice Agent Settings documentation](https://developers.deepgram.com/docs/voice-agent-settings) for the full configuration reference.
|
|
4
|
+
|
|
5
|
+
The Deepgram Voice Agent API does **not** have its own LLM — it delegates to an external LLM provider (OpenAI, Anthropic, etc.) via a "think" provider. The `model` property on the verb is **not used**. Instead, all configuration goes inside `llmOptions.Settings`.
|
|
6
|
+
|
|
7
|
+
### Required structure
|
|
8
|
+
|
|
9
|
+
The `llmOptions` must contain a `Settings` object with this structure:
|
|
10
|
+
|
|
11
|
+
```json
|
|
12
|
+
{
|
|
13
|
+
"verb": "deepgram_s2s",
|
|
14
|
+
"auth": { "apiKey": "your-deepgram-api-key" },
|
|
15
|
+
"llmOptions": {
|
|
16
|
+
"Settings": {
|
|
17
|
+
"agent": {
|
|
18
|
+
"think": {
|
|
19
|
+
"provider": {
|
|
20
|
+
"type": "open_ai",
|
|
21
|
+
"model": "gpt-4o"
|
|
22
|
+
},
|
|
23
|
+
"prompt": "You are a helpful customer service agent."
|
|
24
|
+
},
|
|
25
|
+
"speak": {
|
|
26
|
+
"provider": {
|
|
27
|
+
"type": "deepgram",
|
|
28
|
+
"model": "aura-2-thalia-en"
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"actionHook": "/agent-complete"
|
|
35
|
+
}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Key differences from other s2s verbs
|
|
39
|
+
|
|
40
|
+
- **No top-level `model`** — the LLM model is inside `Settings.agent.think.provider.model`
|
|
41
|
+
- **No `messages` array** — the system prompt goes in `Settings.agent.think.prompt`
|
|
42
|
+
- **Think provider `type`** — use `"open_ai"` (with underscore), `"anthropic"`, `"groq"`, etc.
|
|
43
|
+
- **Speak provider** — controls the TTS voice. Use `"deepgram"` type with an Aura model name like `"aura-2-thalia-en"`
|
|
44
|
+
- **Do NOT confuse Deepgram TTS voice names (e.g. `aura-asteria-en`) with LLM model names** — they are completely different
|
|
45
|
+
|
|
46
|
+
### Common think provider types
|
|
47
|
+
|
|
48
|
+
| type | Example models |
|
|
49
|
+
|------|---------------|
|
|
50
|
+
| `open_ai` | `gpt-4o`, `gpt-4o-mini` |
|
|
51
|
+
| `anthropic` | `claude-sonnet-4-20250514` |
|
|
52
|
+
| `groq` | `llama-3.3-70b-versatile` |
|
|
53
|
+
|
|
54
|
+
### SDK example
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
session
|
|
58
|
+
.deepgram_s2s({
|
|
59
|
+
auth: { apiKey: deepgramApiKey },
|
|
60
|
+
llmOptions: {
|
|
61
|
+
Settings: {
|
|
62
|
+
agent: {
|
|
63
|
+
think: {
|
|
64
|
+
provider: { type: 'open_ai', model: 'gpt-4o' },
|
|
65
|
+
prompt: 'You are a helpful assistant.',
|
|
66
|
+
},
|
|
67
|
+
speak: {
|
|
68
|
+
provider: { type: 'deepgram', model: 'aura-2-thalia-en' },
|
|
69
|
+
},
|
|
70
|
+
},
|
|
71
|
+
},
|
|
72
|
+
},
|
|
73
|
+
actionHook: '/agent-complete',
|
|
74
|
+
})
|
|
75
|
+
.send();
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Tool/function support
|
|
79
|
+
|
|
80
|
+
Deepgram Voice Agent supports function calling. Define functions in `Settings.agent.think.functions`:
|
|
81
|
+
|
|
82
|
+
```json
|
|
83
|
+
{
|
|
84
|
+
"Settings": {
|
|
85
|
+
"agent": {
|
|
86
|
+
"think": {
|
|
87
|
+
"provider": { "type": "open_ai", "model": "gpt-4o" },
|
|
88
|
+
"prompt": "You help users check order status.",
|
|
89
|
+
"functions": [
|
|
90
|
+
{
|
|
91
|
+
"name": "check_order",
|
|
92
|
+
"description": "Look up an order by ID",
|
|
93
|
+
"parameters": {
|
|
94
|
+
"type": "object",
|
|
95
|
+
"properties": {
|
|
96
|
+
"order_id": { "type": "string" }
|
|
97
|
+
},
|
|
98
|
+
"required": ["order_id"]
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Function call results are returned via the `toolHook` webhook or WebSocket `llm:tool-call` event.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
## Generating ringback tones with tone_stream
|
|
2
|
+
|
|
3
|
+
The `dialMusic` property is optional. It can be used to generate audio towards the A party while we are outdialing the B party and have received a 180 Ringing from B. In that case, we may want to play a ringback tone or a message to the A party until we get an answer or a 183 with early media.
|
|
4
|
+
|
|
5
|
+
Besides an http(s) URL of an audio file, the value can also be a FreeSWITCH `tone_stream://` URI. Use the `L=` parameter to repeat the tone pattern (`L=-1` is not supported; use a finite count like `L=20`):
|
|
6
|
+
|
|
7
|
+
- **US ringback**: `tone_stream://L=20;%(2000,4000,440,480)`
|
|
8
|
+
- **UK ringback**: `tone_stream://L=20;%(400,200,400,450);%(400,2000,400,450)`
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
## Bidirectional audio modes
|
|
2
|
+
|
|
3
|
+
The listen verb supports two bidirectional audio modes, controlled by the `bidirectionalAudio` property:
|
|
4
|
+
|
|
5
|
+
**Non-streaming** (default) — send complete audio clips as base64 via `playAudio()`:
|
|
6
|
+
```typescript
|
|
7
|
+
session.listen({
|
|
8
|
+
url: '/audio',
|
|
9
|
+
actionHook: '/listen-done',
|
|
10
|
+
// bidirectionalAudio not needed — non-streaming is the default
|
|
11
|
+
});
|
|
12
|
+
```
|
|
13
|
+
Use `stream.playAudio(base64, { audioContentType: 'raw', sampleRate: 8000 })` to play audio back. Up to 10 clips can be queued. Each clip triggers a `playDone` event when finished.
|
|
14
|
+
|
|
15
|
+
**Streaming** — send raw L16 PCM binary frames via `sendAudio()`:
|
|
16
|
+
```typescript
|
|
17
|
+
session.listen({
|
|
18
|
+
url: '/audio',
|
|
19
|
+
actionHook: '/listen-done',
|
|
20
|
+
bidirectionalAudio: {
|
|
21
|
+
enabled: true,
|
|
22
|
+
streaming: true,
|
|
23
|
+
sampleRate: 8000,
|
|
24
|
+
},
|
|
25
|
+
});
|
|
26
|
+
```
|
|
27
|
+
Use `stream.sendAudio(pcmBuffer)` to stream audio continuously. This is required for real-time audio processing (e.g. AI voice agents, echo, audio manipulation).
|
|
28
|
+
|
|
29
|
+
## Marks (synchronization markers)
|
|
30
|
+
|
|
31
|
+
Marks let you track when streamed audio has been played out to the caller. They work **only in streaming mode** — you must set `bidirectionalAudio: { enabled: true, streaming: true }`.
|
|
32
|
+
|
|
33
|
+
The pattern: stream audio via `sendAudio()`, then call `sendMark(name)`. When the audio preceding the mark finishes playing, jambonz sends back a mark event with `event: 'playout'`.
|
|
34
|
+
|
|
35
|
+
```typescript
|
|
36
|
+
stream.sendAudio(pcmChunk1);
|
|
37
|
+
stream.sendMark('after-chunk-1'); // fires when pcmChunk1 finishes playing
|
|
38
|
+
|
|
39
|
+
stream.on('mark', (evt) => {
|
|
40
|
+
// evt.name = 'after-chunk-1'
|
|
41
|
+
// evt.event = 'playout' or 'cleared'
|
|
42
|
+
});
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Use `stream.clearMarks()` to cancel all pending marks — they fire with `event: 'cleared'` instead of `'playout'`.
|
|
46
|
+
|
|
47
|
+
**Common mistake**: marks silently fail without `bidirectionalAudio.streaming: true`. Without it there is no playout buffer, so marks are accepted but never fire.
|
|
48
|
+
|
|
49
|
+
## Relative URLs
|
|
50
|
+
|
|
51
|
+
The `url` property accepts relative paths. jambonz connects the audio WebSocket back to the same server:
|
|
52
|
+
|
|
53
|
+
```typescript
|
|
54
|
+
session.listen({ url: '/audio', actionHook: '/listen-done' });
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
This avoids hardcoding hostnames. Use `makeService.audio({ path: '/audio' })` to register the audio handler on the same endpoint.
|
|
58
|
+
|
|
59
|
+
## Path separation
|
|
60
|
+
|
|
61
|
+
The control WebSocket and audio WebSocket must use **different paths**. If both are registered on the same path (e.g. `/`), audio routes take priority and steal control connections.
|
|
62
|
+
|
|
63
|
+
```typescript
|
|
64
|
+
// Correct — separate paths
|
|
65
|
+
const svc = makeService({ path: '/' });
|
|
66
|
+
const audioSvc = makeService.audio({ path: '/audio' });
|
|
67
|
+
|
|
68
|
+
// Wrong — same path causes conflicts
|
|
69
|
+
const svc = makeService({ path: '/' });
|
|
70
|
+
const audioSvc = makeService.audio({ path: '/' }); // steals control connections
|
|
71
|
+
```
|