@jambonz/schema 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/AGENTS.md +974 -0
  2. package/callbacks/amd.schema.json +50 -0
  3. package/callbacks/base.schema.json +29 -0
  4. package/callbacks/call-status.schema.json +22 -0
  5. package/callbacks/conference-status.schema.json +24 -0
  6. package/callbacks/conference-wait.schema.json +11 -0
  7. package/callbacks/conference.schema.json +11 -0
  8. package/callbacks/dequeue.schema.json +19 -0
  9. package/callbacks/dial-dtmf.schema.json +18 -0
  10. package/callbacks/dial-hold.schema.json +22 -0
  11. package/callbacks/dial-refer.schema.json +28 -0
  12. package/callbacks/dial.schema.json +31 -0
  13. package/callbacks/enqueue-wait.schema.json +17 -0
  14. package/callbacks/enqueue.schema.json +27 -0
  15. package/callbacks/gather-partial.schema.json +54 -0
  16. package/callbacks/gather.schema.json +60 -0
  17. package/callbacks/listen.schema.json +21 -0
  18. package/callbacks/llm.schema.json +30 -0
  19. package/callbacks/message.schema.json +35 -0
  20. package/callbacks/pipeline-turn.schema.json +109 -0
  21. package/callbacks/play.schema.json +36 -0
  22. package/callbacks/session-new.schema.json +143 -0
  23. package/callbacks/session-reconnect.schema.json +9 -0
  24. package/callbacks/session-redirect.schema.json +38 -0
  25. package/callbacks/sip-refer-event.schema.json +20 -0
  26. package/callbacks/sip-refer.schema.json +22 -0
  27. package/callbacks/sip-request.schema.json +27 -0
  28. package/callbacks/transcribe-translation.schema.json +24 -0
  29. package/callbacks/transcribe.schema.json +46 -0
  30. package/callbacks/tts-streaming-event.schema.json +77 -0
  31. package/callbacks/verb-status.schema.json +57 -0
  32. package/components/actionHook.schema.json +36 -0
  33. package/components/actionHookDelayAction.schema.json +37 -0
  34. package/components/amd.schema.json +68 -0
  35. package/components/auth.schema.json +18 -0
  36. package/components/bidirectionalAudio.schema.json +22 -0
  37. package/components/fillerNoise.schema.json +25 -0
  38. package/components/llm-base.schema.json +94 -0
  39. package/components/recognizer-assemblyAiOptions.schema.json +66 -0
  40. package/components/recognizer-awsOptions.schema.json +52 -0
  41. package/components/recognizer-azureOptions.schema.json +32 -0
  42. package/components/recognizer-cobaltOptions.schema.json +34 -0
  43. package/components/recognizer-customOptions.schema.json +27 -0
  44. package/components/recognizer-deepgramOptions.schema.json +147 -0
  45. package/components/recognizer-elevenlabsOptions.schema.json +39 -0
  46. package/components/recognizer-gladiaOptions.schema.json +8 -0
  47. package/components/recognizer-googleOptions.schema.json +35 -0
  48. package/components/recognizer-houndifyOptions.schema.json +53 -0
  49. package/components/recognizer-ibmOptions.schema.json +54 -0
  50. package/components/recognizer-nuanceOptions.schema.json +150 -0
  51. package/components/recognizer-nvidiaOptions.schema.json +39 -0
  52. package/components/recognizer-openaiOptions.schema.json +59 -0
  53. package/components/recognizer-sonioxOptions.schema.json +46 -0
  54. package/components/recognizer-speechmaticsOptions.schema.json +100 -0
  55. package/components/recognizer-verbioOptions.schema.json +46 -0
  56. package/components/recognizer.schema.json +216 -0
  57. package/components/synthesizer.schema.json +82 -0
  58. package/components/target.schema.json +105 -0
  59. package/components/vad.schema.json +48 -0
  60. package/docs/components/recognizer.md +78 -0
  61. package/docs/components/synthesizer.md +27 -0
  62. package/docs/guides/session-commands.md +417 -0
  63. package/docs/verbs/conference.md +51 -0
  64. package/docs/verbs/deepgram_s2s.md +108 -0
  65. package/docs/verbs/dial.md +8 -0
  66. package/docs/verbs/listen.md +71 -0
  67. package/docs/verbs/pipeline.md +475 -0
  68. package/docs/verbs/stream.md +5 -0
  69. package/index.js +9 -0
  70. package/jambonz-app.schema.json +112 -0
  71. package/lib/normalize.js +72 -0
  72. package/lib/validator.js +137 -0
  73. package/package.json +39 -0
  74. package/verbs/alert.schema.json +34 -0
  75. package/verbs/answer.schema.json +22 -0
  76. package/verbs/conference.schema.json +107 -0
  77. package/verbs/config.schema.json +218 -0
  78. package/verbs/deepgram_s2s.schema.json +81 -0
  79. package/verbs/dequeue.schema.json +51 -0
  80. package/verbs/dial.schema.json +187 -0
  81. package/verbs/dialogflow.schema.json +148 -0
  82. package/verbs/dtmf.schema.json +49 -0
  83. package/verbs/dub.schema.json +103 -0
  84. package/verbs/elevenlabs_s2s.schema.json +81 -0
  85. package/verbs/enqueue.schema.json +53 -0
  86. package/verbs/gather.schema.json +188 -0
  87. package/verbs/google_s2s.schema.json +42 -0
  88. package/verbs/hangup.schema.json +36 -0
  89. package/verbs/leave.schema.json +22 -0
  90. package/verbs/listen.schema.json +127 -0
  91. package/verbs/llm.schema.json +44 -0
  92. package/verbs/message.schema.json +82 -0
  93. package/verbs/openai_s2s.schema.json +42 -0
  94. package/verbs/pause.schema.json +36 -0
  95. package/verbs/pipeline.schema.json +240 -0
  96. package/verbs/play.schema.json +96 -0
  97. package/verbs/redirect.schema.json +34 -0
  98. package/verbs/s2s.schema.json +39 -0
  99. package/verbs/say.schema.json +107 -0
  100. package/verbs/sip-decline.schema.json +58 -0
  101. package/verbs/sip-refer.schema.json +58 -0
  102. package/verbs/sip-request.schema.json +54 -0
  103. package/verbs/stream.schema.json +103 -0
  104. package/verbs/tag.schema.json +41 -0
  105. package/verbs/transcribe.schema.json +57 -0
  106. package/verbs/ultravox_s2s.schema.json +41 -0
@@ -0,0 +1,475 @@
1
+ ## Overview
2
+
3
+ The pipeline verb orchestrates a complete voice AI agent by wiring together three separate components — STT, LLM, and TTS — with integrated turn detection. Unlike the s2s verbs (where a single vendor handles everything), pipeline lets you mix and match: e.g. Deepgram for STT, Anthropic for the LLM, and Cartesia for TTS.
4
+
5
+ Pipeline manages the full conversational turn cycle:
6
+ 1. User speaks → STT produces a transcript
7
+ 2. Turn detection decides the user is done speaking
8
+ 3. Transcript is sent to the LLM
9
+ 4. LLM response tokens stream to TTS
10
+ 5. TTS audio plays back to the caller
11
+ 6. If the user barges in, TTS stops and a new turn begins
12
+
13
+ ## Turn detection
14
+
15
+ The `turnDetection` property controls how the pipeline decides the user has finished speaking.
16
+
17
+ **`"stt"` (default)** — Uses the STT vendor's native end-of-utterance signal. For most vendors this is silence-based. Some vendors have smarter built-in turn detection:
18
+ - **deepgramflux** — Acoustic + semantic turn detection (Deepgram's "Flux" model)
19
+ - **assemblyai** — Native turn-taking with `u3-rt-pro` model
20
+ - **speechmatics** — Built-in turn detection
21
+
22
+ These vendors always use their native detection regardless of the `turnDetection` setting.
23
+
24
+ **`"krisp"`** — Uses the Krisp acoustic end-of-turn model, which analyzes speech patterns rather than just silence. Good for natural conversation where users pause mid-thought. Can be tuned:
25
+
26
+ ```json
27
+ {
28
+ "turnDetection": {
29
+ "mode": "krisp",
30
+ "threshold": 0.3
31
+ }
32
+ }
33
+ ```
34
+
35
+ Lower threshold = faster turn transitions (more aggressive). Default is 0.5.
36
+
37
+ **IMPORTANT NOTE**: You must have a Krisp API key to use Krisp turn detection on a self-hosted jambonz system. Please contact us at support@jambonz.org if you need more details.
38
+
39
+ ## Early generation (speculative preflight)
40
+
41
+ Early generation speculatively sends the transcript to the LLM *before* end-of-turn is confirmed. If the transcript is unchanged when the turn does end, the buffered tokens are released immediately, so the caller does not have to wait for the LLM to be prompted at that point. If the user keeps talking and the transcript changes, the speculative response is discarded. This is a latency optimization with no correctness downside.
42
+
43
+ There are two ways early generation is triggered:
44
+
45
+ - **Krisp turn detection** — Set `earlyGeneration: true` to opt in. Krisp emits an early signal that triggers the speculative LLM prompt before final end-of-turn confirmation.
46
+ - **Deepgram Flux** — Early generation happens automatically. Flux emits a native `EagerEndOfTurn` event that triggers preflight regardless of the `earlyGeneration` setting.
47
+
48
+ For other STT vendors with native turn-taking (assemblyai, speechmatics), early generation is not available — they don't emit a preflight signal.
49
+
50
+ ## Noise isolation
51
+
52
+ The `noiseIsolation` property enables server-side noise cancellation on the call audio. By default it filters the inbound (caller) audio, improving STT accuracy in noisy environments. It can also be configured to filter outbound audio via the `direction` option. Two vendors are available:
53
+
54
+ - **`"krisp"`** — Krisp's proprietary noise cancellation. Requires a Krisp API key on self-hosted systems.
55
+ - **`"rnnoise"`** — Open-source RNNoise-based noise cancellation. No API key required.
56
+
57
+ Shorthand (default settings):
58
+
59
+ ```json
60
+ {
61
+ "noiseIsolation": "krisp"
62
+ }
63
+ ```
64
+
65
+ Detailed configuration:
66
+
67
+ ```json
68
+ {
69
+ "noiseIsolation": {
70
+ "mode": "krisp",
71
+ "level": 80,
72
+ "direction": "read"
73
+ }
74
+ }
75
+ ```
76
+
77
+ - `mode` — Vendor: `"krisp"` or `"rnnoise"`.
78
+ - `level` — Suppression level 0–100. Higher values are more aggressive. Default: 100.
79
+ - `direction` — `"read"` filters caller audio (default), `"write"` filters outbound audio.
80
+ - `model` — Optional model name override.
81
+
82
+ Noise isolation can also be enabled/disabled mid-call via the `config` verb, the REST LCC API, or a WebSocket inject command (`noiseIsolation:status`).
83
+
84
+ ## Barge-in
85
+
86
+ By default, users can interrupt the assistant while it's speaking. The `bargeIn` object controls this:
87
+
88
+ ```json
89
+ {
90
+ "bargeIn": {
91
+ "enable": true,
92
+ "minSpeechDuration": 0.5,
93
+ "sticky": false
94
+ }
95
+ }
96
+ ```
97
+
98
+ - `minSpeechDuration` — Seconds of speech required to confirm an interruption. Prevents brief noises from cutting off the assistant. Default: 0.5.
99
+ - `sticky` — If true, once the user interrupts, the assistant does not resume speaking the interrupted response.
100
+
101
+ ## eventHook events
102
+
103
+ The `eventHook` receives real-time events during the conversation. In WebSocket mode, listen for these with `session.on('/your-event-hook', handler)`.
104
+
105
+ | Event type | Description | Key fields |
106
+ |---|---|---|
107
+ | `user_transcript` | User speech recognized | `transcript` |
108
+ | `agent_response` | Assistant reply text | `response` |
109
+ | `user_interruption` | User barged in | — |
110
+ | `turn_end` | End-of-turn summary | `transcript`, `response`, `interrupted`, `latency` |
111
+
112
+ The `turn_end` event is the most useful for observability. It includes per-component latency metrics (STT, LLM, TTS) in milliseconds. See the `callback:pipeline-turn` schema for the full payload structure.
113
+
114
+ ## toolHook (function calling)
115
+
116
+ When the LLM requests a tool/function call, the pipeline sends a request to the `toolHook` with:
117
+
118
+ ```json
119
+ {
120
+ "tool_call_id": "call_abc123",
121
+ "name": "get_weather",
122
+ "arguments": { "city": "Portland" }
123
+ }
124
+ ```
125
+
126
+ The `arguments` field is already parsed (an object, not a JSON string).
127
+
128
+ **Webhook response**: Return the tool result in the HTTP response body as JSON. The result is stringified and fed back to the LLM.
129
+
130
+ **WebSocket**: The tool call arrives as an event on the hook path. Respond by calling `session.toolOutput(tool_call_id, result).reply()`.
131
+
132
+ ## MCP servers (external tools)
133
+
134
+ Instead of (or in addition to) defining tools inline via `llmOptions.tools` and handling them with `toolHook`, you can connect to external MCP servers. The pipeline connects to each server at startup via SSE transport, discovers available tools, and makes them available to the LLM alongside any inline tools.
135
+
136
+ ```json
137
+ {
138
+ "verb": "pipeline",
139
+ "mcpServers": [
140
+ {
141
+ "url": "https://livescoremcp.com/sse"
142
+ }
143
+ ],
144
+ "llm": {
145
+ "vendor": "openai",
146
+ "model": "gpt-4.1",
147
+ "llmOptions": {
148
+ "messages": [
149
+ { "role": "system", "content": "You are a sports assistant. Use available tools to look up live scores and fixtures when asked." }
150
+ ]
151
+ }
152
+ },
153
+ "stt": { "vendor": "deepgram", "language": "en-US" },
154
+ "tts": { "vendor": "cartesia", "voice": "sonic-english" }
155
+ }
156
+ ```
157
+
158
+ The [LiveScore MCP server](https://livescoremcp.com/) is a free, public MCP server that exposes tools for live football scores, fixtures, team stats, and player data. The pipeline discovers these tools automatically at startup — no need to define tool schemas in `llmOptions.tools`. A caller can simply ask "what football matches are on right now?" and the LLM will use the `get_live_scores` tool to fetch real-time data.
159
+
160
+ If an MCP server requires authentication, pass credentials in the `auth` property:
161
+
162
+ ```json
163
+ {
164
+ "mcpServers": [
165
+ {
166
+ "url": "https://mcp.example.com/sse",
167
+ "auth": {
168
+ "apiKey": "your-api-key-here"
169
+ }
170
+ }
171
+ ]
172
+ }
173
+ ```
174
+
175
+ **How tool dispatch works**: When the LLM requests a tool call, the pipeline checks MCP servers first. If the tool name matches one discovered from an MCP server, the call is dispatched there directly and the result is fed back to the LLM. If no MCP server provides the tool, it falls through to the `toolHook` webhook. You can use both MCP servers and `toolHook` together — MCP handles the tools it knows about, and `toolHook` handles the rest.
176
+
177
+ **TypeScript example** — a pipeline agent with the LiveScore MCP server:
178
+
179
+ ```typescript
180
+ session
181
+ .pipeline({
182
+ stt: { vendor: 'deepgram', language: 'en-US' },
183
+ tts: { vendor: 'cartesia', voice: 'sonic-english' },
184
+ llm: {
185
+ vendor: 'openai',
186
+ model: 'gpt-4.1',
187
+ llmOptions: {
188
+ messages: [
189
+ { role: 'system', content: 'You are a sports assistant. Use available tools to answer questions about football scores, fixtures, and teams.' },
190
+ ],
191
+ },
192
+ },
193
+ mcpServers: [
194
+ { url: 'https://livescoremcp.com/sse' },
195
+ // To use a server that requires auth:
196
+ // { url: 'https://mcp.example.com/sse', auth: { apiKey: 'your-key' } },
197
+ ],
198
+ turnDetection: 'krisp',
199
+ actionHook: '/pipeline-complete',
200
+ })
201
+ .send();
202
+ ```
203
+
204
+ ## Mid-conversation updates
205
+
206
+ The pipeline supports asynchronous updates while a conversation is in progress. These let you change the agent's behavior, inject new context, modify available tools, or trigger a new LLM response — without interrupting the current verb stack.
207
+
208
+ Updates can be sent via:
209
+ - **WebSocket**: `session.updatePipeline(data)` (sends a `pipeline:update` command)
210
+ - **REST API**: `client.calls.updatePipeline(callSid, data)` (sends `pipeline_update` in the PUT body)
211
+
212
+ ### update_instructions
213
+
214
+ Replace the LLM system prompt mid-conversation. Useful when the conversation transitions to a different topic or agent persona.
215
+
216
+ ```typescript
217
+ // WebSocket
218
+ session.updatePipeline({
219
+ type: 'update_instructions',
220
+ instructions: 'You are now a billing support agent. Help the caller with invoice questions.',
221
+ });
222
+
223
+ // REST
224
+ await client.calls.updatePipeline(callSid, {
225
+ type: 'update_instructions',
226
+ instructions: 'You are now a billing support agent. Help the caller with invoice questions.',
227
+ });
228
+ ```
229
+
230
+ ### inject_context
231
+
232
+ Append messages to the LLM conversation history. Useful for injecting CRM data, call notes, or other context retrieved after the call started.
233
+
234
+ ```typescript
235
+ session.updatePipeline({
236
+ type: 'inject_context',
237
+ messages: [
238
+ { role: 'system', content: 'Customer account #12345: Gold tier, 3 open tickets.' },
239
+ ],
240
+ });
241
+ ```
242
+
243
+ ### update_tools
244
+
245
+ Replace the tool set available to the LLM. The new tools take effect on the next LLM turn.
246
+
247
+ ```typescript
248
+ session.updatePipeline({
249
+ type: 'update_tools',
250
+ tools: [
251
+ {
252
+ type: 'function',
253
+ function: {
254
+ name: 'transfer_call',
255
+ description: 'Transfer the caller to a specialist',
256
+ parameters: { type: 'object', properties: { department: { type: 'string' } } },
257
+ },
258
+ },
259
+ ],
260
+ });
261
+ ```
262
+
263
+ ### generate_reply
264
+
265
+ Prompt the LLM to generate a new response. If the pipeline is currently idle, the prompt executes immediately. If the pipeline is busy (e.g. the assistant is speaking), the request is queued and executes when the current turn completes.
266
+
267
+ Use `interrupt: true` to cancel the current response and generate immediately — useful for supervisor overrides or urgent context changes.
268
+
269
+ ```typescript
270
+ // Simple prompt
271
+ session.updatePipeline({
272
+ type: 'generate_reply',
273
+ user_input: 'The customer just entered their account number: 12345',
274
+ });
275
+
276
+ // With one-shot instructions
277
+ session.updatePipeline({
278
+ type: 'generate_reply',
279
+ user_input: 'Customer is asking about refunds',
280
+ instructions: 'Be empathetic and offer a 20% discount before processing a refund.',
281
+ });
282
+
283
+ // Interrupt current response
284
+ session.updatePipeline({
285
+ type: 'generate_reply',
286
+ user_input: 'Urgent: supervisor override',
287
+ interrupt: true,
288
+ });
289
+ ```
290
+
291
+ ## LLM configuration
292
+
293
+ The `llm` property is the only required field. It configures the text-to-text LLM:
294
+
295
+ ```json
296
+ {
297
+ "llm": {
298
+ "vendor": "openai",
299
+ "model": "gpt-4.1",
300
+ "llmOptions": {
301
+ "messages": [
302
+ { "role": "system", "content": "You are a helpful voice assistant." }
303
+ ],
304
+ "tools": [
305
+ {
306
+ "type": "function",
307
+ "function": {
308
+ "name": "get_weather",
309
+ "description": "Get current weather for a city",
310
+ "parameters": {
311
+ "type": "object",
312
+ "properties": {
313
+ "city": { "type": "string" }
314
+ },
315
+ "required": ["city"]
316
+ }
317
+ }
318
+ }
319
+ ]
320
+ }
321
+ }
322
+ }
323
+ ```
324
+
325
+ For Anthropic models, use `"vendor": "anthropic"` and structure messages accordingly (Anthropic uses `"role": "user"` for the system-level instruction).
326
+
327
+ ## Greeting
328
+
329
+ By default (`greeting: true`), the pipeline prompts the LLM to generate an initial greeting before the user speaks. Set `greeting: false` if you want the agent to wait silently for the user to speak first.
330
+
331
+ ## Complete example (TypeScript)
332
+
333
+ A pipeline voice agent using Deepgram STT, OpenAI LLM, and Cartesia TTS with Krisp turn detection and early generation enabled. The example registers a single WebSocket endpoint at `/`:
334
+
335
+ ```typescript
336
+ import * as http from 'node:http';
337
+ import { createEndpoint, Session } from '@jambonz/sdk/websocket';
338
+
339
+ const envVars = {
340
+ OPENAI_MODEL: {
341
+ type: 'string' as const,
342
+ description: 'OpenAI model to use',
343
+ default: 'gpt-4.1-mini',
344
+ },
345
+ SYSTEM_PROMPT: {
346
+ type: 'string' as const,
347
+ description: 'System prompt for the voice agent',
348
+ uiHint: 'textarea' as const,
349
+ default: 'You are a helpful voice AI assistant. Your responses are concise and conversational.',
350
+ },
351
+ };
352
+
353
+ function handleSession(session: Session) {
354
+ const model = session.data.env_vars?.OPENAI_MODEL || 'gpt-4.1-mini';
355
+ const systemPrompt = session.data.env_vars?.SYSTEM_PROMPT || envVars.SYSTEM_PROMPT.default;
356
+
357
+ session.on('/pipeline-event', (evt: Record<string, unknown>) => {
358
+ if (evt.type === 'turn_end') {
359
+ const { transcript, response, interrupted, latency } = evt as Record<string, unknown>;
360
+ console.log('turn_end', JSON.stringify({ transcript, response, interrupted, latency }, null, 2));
361
+ }
362
+ });
363
+
364
+ session.on('/pipeline-complete', () => {
365
+ session.hangup().reply();
366
+ });
367
+
368
+ session
369
+ .pipeline({
370
+ stt: {
371
+ vendor: 'deepgram',
372
+ language: 'multi',
373
+ deepgramOptions: { model: 'nova-3-general' },
374
+ },
375
+ tts: {
376
+ vendor: 'cartesia',
377
+ voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
378
+ },
379
+ llm: {
380
+ vendor: 'openai',
381
+ model,
382
+ llmOptions: {
383
+ messages: [{ role: 'system', content: systemPrompt }],
384
+ },
385
+ },
386
+ turnDetection: 'krisp',
387
+ earlyGeneration: true,
388
+ bargeIn: { enable: true },
389
+ eventHook: '/pipeline-event',
390
+ actionHook: '/pipeline-complete',
391
+ })
392
+ .send();
393
+ }
394
+
395
+ const port = parseInt(process.env.PORT || '3000', 10);
396
+ const server = http.createServer();
397
+ const makeService = createEndpoint({ server, port, envVars });
398
+
399
+ const svc = makeService({ path: '/' });
400
+ svc.on('session:new', (session) => handleSession(session));
401
+ ```
402
+
403
+ ## Complete example (JavaScript)
404
+
405
+ The same agent in plain JavaScript:
406
+
407
+ ```javascript
408
+ const http = require('node:http');
409
+ const { createEndpoint } = require('@jambonz/sdk/websocket');
410
+
411
+ const envVars = {
412
+ OPENAI_MODEL: {
413
+ type: 'string',
414
+ description: 'OpenAI model to use',
415
+ default: 'gpt-4.1-mini',
416
+ },
417
+ SYSTEM_PROMPT: {
418
+ type: 'string',
419
+ description: 'System prompt for the voice agent',
420
+ uiHint: 'textarea',
421
+ default: 'You are a helpful voice AI assistant. Your responses are concise and conversational.',
422
+ },
423
+ };
424
+
425
+ function handleSession(session) {
426
+ const model = session.data.env_vars?.OPENAI_MODEL || 'gpt-4.1-mini';
427
+ const systemPrompt = session.data.env_vars?.SYSTEM_PROMPT || envVars.SYSTEM_PROMPT.default;
428
+
429
+ session.on('/pipeline-event', (evt) => {
430
+ if (evt.type === 'turn_end') {
431
+ const { transcript, response, interrupted, latency } = evt;
432
+ console.log('turn_end', JSON.stringify({ transcript, response, interrupted, latency }, null, 2));
433
+ }
434
+ });
435
+
436
+ session.on('/pipeline-complete', () => {
437
+ session.hangup().reply();
438
+ });
439
+
440
+ session
441
+ .pipeline({
442
+ stt: {
443
+ vendor: 'deepgram',
444
+ language: 'multi',
445
+ deepgramOptions: { model: 'nova-3-general' },
446
+ },
447
+ tts: {
448
+ vendor: 'cartesia',
449
+ voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
450
+ },
451
+ llm: {
452
+ vendor: 'openai',
453
+ model,
454
+ llmOptions: {
455
+ messages: [{ role: 'system', content: systemPrompt }],
456
+ },
457
+ },
458
+ turnDetection: 'krisp',
459
+ earlyGeneration: true,
460
+ bargeIn: { enable: true },
461
+ eventHook: '/pipeline-event',
462
+ actionHook: '/pipeline-complete',
463
+ })
464
+ .send();
465
+ }
466
+
467
+ const port = parseInt(process.env.PORT || '3000', 10);
468
+ const server = http.createServer();
469
+ const makeService = createEndpoint({ server, port, envVars });
470
+
471
+ const svc = makeService({ path: '/' });
472
+ svc.on('session:new', (session) => handleSession(session));
473
+
474
+ console.log(`jambonz voice agent listening on port ${port}`);
475
+ ```
@@ -0,0 +1,5 @@
1
+ ## stream is a synonym for listen
2
+
3
+ The `stream` verb is functionally identical to `listen` — they share the same implementation, the same properties, and the same audio WebSocket protocol. Use whichever name reads better in your application.
4
+
5
+ All tips for the `listen` verb apply equally to `stream`: bidirectional audio modes, marks, relative URLs, and path separation. See the listen verb usage guide for full details.
package/index.js ADDED
@@ -0,0 +1,9 @@
1
+ const {validate, validateVerb, validateApp} = require('./lib/validator');
2
+ const {normalizeJambones} = require('./lib/normalize');
3
+
4
+ module.exports = {
5
+ validate,
6
+ validateVerb,
7
+ validateApp,
8
+ normalizeJambones,
9
+ };
@@ -0,0 +1,112 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://jambonz.org/schema/jambonz-app",
4
+ "title": "jambonz Application",
5
+ "description": "A jambonz application is an array of verbs that are executed sequentially to control a phone call. Each verb performs an action: speaking text, playing audio, collecting input, dialing a number, connecting to an AI model, etc. When a webhook (actionHook) is invoked, it must return a new verb array to continue call processing.\n\nThe execution model is simple: verbs execute one after another, top to bottom. When a verb with an actionHook completes (e.g. gather collects input), the actionHook is called and its response replaces the remaining verb stack. If the verb array is exhausted without a hangup, the call is terminated.\n\nThere are two transport modes for delivering verb arrays to jambonz:\n- **Webhook**: Your HTTP server receives POST/GET requests with call data and returns JSON verb arrays in the response body.\n- **WebSocket**: Your server maintains a persistent websocket connection with jambonz and sends/receives verb arrays as JSON messages. Required for real-time features like LLM conversations.\n\nThe verb schemas and JSON structure are identical regardless of transport mode.",
6
+ "type": "array",
7
+ "items": {
8
+ "$ref": "#/$defs/Verb"
9
+ },
10
+ "minItems": 1,
11
+ "$defs": {
12
+ "Verb": {
13
+ "oneOf": [
14
+ { "$ref": "verbs/answer" },
15
+ { "$ref": "verbs/alert" },
16
+ { "$ref": "verbs/config" },
17
+ { "$ref": "verbs/say" },
18
+ { "$ref": "verbs/play" },
19
+ { "$ref": "verbs/gather" },
20
+ { "$ref": "verbs/dial" },
21
+ { "$ref": "verbs/listen" },
22
+ { "$ref": "verbs/stream" },
23
+ { "$ref": "verbs/llm" },
24
+ { "$ref": "verbs/s2s" },
25
+ { "$ref": "verbs/openai_s2s" },
26
+ { "$ref": "verbs/google_s2s" },
27
+ { "$ref": "verbs/elevenlabs_s2s" },
28
+ { "$ref": "verbs/deepgram_s2s" },
29
+ { "$ref": "verbs/ultravox_s2s" },
30
+ { "$ref": "verbs/dialogflow" },
31
+ { "$ref": "verbs/pipeline" },
32
+ { "$ref": "verbs/conference" },
33
+ { "$ref": "verbs/transcribe" },
34
+ { "$ref": "verbs/enqueue" },
35
+ { "$ref": "verbs/dequeue" },
36
+ { "$ref": "verbs/dtmf" },
37
+ { "$ref": "verbs/dub" },
38
+ { "$ref": "verbs/hangup" },
39
+ { "$ref": "verbs/leave" },
40
+ { "$ref": "verbs/message" },
41
+ { "$ref": "verbs/pause" },
42
+ { "$ref": "verbs/redirect" },
43
+ { "$ref": "verbs/tag" },
44
+ { "$ref": "verbs/sip:decline" },
45
+ { "$ref": "verbs/sip:request" },
46
+ { "$ref": "verbs/sip:refer" }
47
+ ],
48
+ "discriminator": {
49
+ "propertyName": "verb"
50
+ }
51
+ }
52
+ },
53
+ "examples": [
54
+ [
55
+ {
56
+ "verb": "config",
57
+ "synthesizer": { "vendor": "elevenlabs", "voice": "EXAVITQu4vr4xnSDxMaL", "language": "en-US" },
58
+ "recognizer": { "vendor": "deepgram", "language": "en-US" }
59
+ },
60
+ {
61
+ "verb": "say",
62
+ "text": "Hello! Welcome to Acme Corp. How can I help you today?"
63
+ },
64
+ {
65
+ "verb": "gather",
66
+ "input": ["speech"],
67
+ "actionHook": "/process-input",
68
+ "timeout": 15,
69
+ "say": { "text": "I'm listening." }
70
+ }
71
+ ],
72
+ [
73
+ {
74
+ "verb": "say",
75
+ "text": "Please hold while I connect you to an agent."
76
+ },
77
+ {
78
+ "verb": "dial",
79
+ "target": [{ "type": "phone", "number": "+15085551212" }],
80
+ "answerOnBridge": true,
81
+ "timeout": 30,
82
+ "actionHook": "/dial-complete"
83
+ },
84
+ {
85
+ "verb": "say",
86
+ "text": "Sorry, the agent is not available. Please try again later."
87
+ },
88
+ {
89
+ "verb": "hangup"
90
+ }
91
+ ],
92
+ [
93
+ {
94
+ "verb": "config",
95
+ "synthesizer": { "vendor": "cartesia", "voice": "sonic-english" },
96
+ "recognizer": { "vendor": "deepgram", "language": "en-US" }
97
+ },
98
+ {
99
+ "verb": "openai_s2s",
100
+ "model": "gpt-4o",
101
+ "llmOptions": {
102
+ "messages": [
103
+ { "role": "system", "content": "You are a helpful customer service agent for Acme Corp. Be concise and friendly." }
104
+ ],
105
+ "temperature": 0.7
106
+ },
107
+ "actionHook": "/llm-complete",
108
+ "toolHook": "/llm-tool"
109
+ }
110
+ ]
111
+ ]
112
+ }
@@ -0,0 +1,72 @@
1
+ const debug = require('debug')('jambonz:schema:normalize');
2
+
3
/**
 * Build the transform entry for a vendor-specific alias of the `llm` verb:
 * the alias is rewritten to `llm` and `vendor` is injected into the verb data.
 */
const llmAlias = (vendor) => ({verb: 'llm', properties: {vendor}});

/**
 * Alias table: verb name as written by the application -> canonical verb,
 * plus optional properties to merge into the verb data.
 */
const verbTransforms = new Map([
  ['stream', {verb: 'listen'}],
  ['s2s', {verb: 'llm'}],
  ['openai_s2s', llmAlias('openai')],
  ['microsoft_s2s', llmAlias('microsoft')],
  ['google_s2s', llmAlias('google')],
  ['elevenlabs_s2s', llmAlias('elevenlabs')],
  ['deepgram_s2s', llmAlias('deepgram')],
  ['voiceagent_s2s', llmAlias('voiceagent')],
  ['ultravox_s2s', llmAlias('ultravox')],
]);
18
+
19
/**
 * Resolve a verb alias to its canonical name.
 *
 * @param {string} name - Verb name as supplied by the application
 * @param {*} data - Verb payload
 * @returns {{name: string, data: *}} Canonical verb name and payload. When the
 *   alias carries injected properties they are merged underneath the payload,
 *   so keys already present in `data` win. Names without a transform are
 *   returned untouched.
 */
function applyVerbTransform(name, data) {
  const alias = verbTransforms.get(name);
  if (alias) {
    const {verb, properties} = alias;
    return {
      name: verb,
      data: properties ? {...properties, ...data} : data,
    };
  }
  return {name, data};
}
25
+
26
/**
 * Normalize a jambonz application payload into canonical form.
 *
 * Accepts both verb formats:
 *  - {verb: 'say', text: 'hello'}
 *  - {'say': {text: 'hello'}}
 *
 * Applies verb transforms (stream→listen, openai_s2s→llm, etc.)
 *
 * @param {object} logger - Logger instance with .info method; only used to
 *   report an entry that has no identifiable verb. A missing logger is
 *   tolerated so that the descriptive Error below is still what gets thrown.
 * @param {Array} obj - Array of verb objects
 * @returns {Array} Normalized array of {verbName: data} objects
 * @throws {Error} If `obj` is not an array, if an entry is null or not an
 *   object, or if an entry has no `verb` property and is not a single-key
 *   object.
 */
function normalizeJambones(logger, obj) {
  if (!Array.isArray(obj)) {
    throw new Error('malformed jambonz payload: must be array');
  }
  const document = [];
  for (const tdata of obj) {
    // typeof null === 'object', so null has to be rejected explicitly;
    // otherwise the `in` check below throws a bare TypeError.
    if (tdata === null || typeof tdata !== 'object') {
      throw new Error('malformed jambonz payload: must be array of objects');
    }

    let verbName;
    let verbData;
    if ('verb' in tdata) {
      // {verb: 'say', ...props}: everything except `verb` is the payload.
      // Rest destructuring copies own enumerable keys as plain data
      // properties, so a "__proto__" key cannot rewrite the prototype.
      const {verb, ...rest} = tdata;
      verbName = verb;
      verbData = rest;
    }
    else {
      // {'say': {...props}}: the single key is the verb name.
      const keys = Object.keys(tdata);
      if (keys.length !== 1) {
        if (logger && typeof logger.info === 'function') {
          logger.info(tdata, 'malformed jambonz payload: missing verb property');
        }
        throw new Error('malformed jambonz payload: missing verb property');
      }
      verbName = keys[0];
      verbData = tdata[verbName];
    }

    const {name, data} = applyVerbTransform(verbName, verbData);
    // Computed key always creates an own property, whatever the verb name is.
    document.push({[name]: data});
  }
  debug({document}, `normalizeJambones: returning document with ${document.length} tasks`);
  return document;
}
71
+
72
+ module.exports = {normalizeJambones, verbTransforms};