@jambonz/schema 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/callbacks/call-status.schema.json +6 -1
- package/commands/llm-tool-output.schema.json +36 -0
- package/components/recognizer-assemblyAiOptions.schema.json +3 -3
- package/docs/guides/bridged-call-patterns.md +261 -0
- package/docs/guides/session-commands.md +38 -0
- package/docs/verbs/dub.md +219 -0
- package/docs/verbs/transcribe.md +167 -0
- package/index.js +2 -1
- package/lib/validator.js +46 -1
- package/package.json +2 -1
- package/verbs/agent.schema.json +98 -2
- package/verbs/dub.schema.json +1 -1
- package/verbs/transcribe.schema.json +2 -1
|
@@ -2,12 +2,17 @@
|
|
|
2
2
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
3
|
"$id": "https://jambonz.org/schema/callbacks/call-status",
|
|
4
4
|
"title": "Call Status Webhook Payload",
|
|
5
|
-
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.",
|
|
5
|
+
"description": "Payload sent to the call status webhook URL whenever the call state changes (e.g. trying, in-progress, completed). The status webhook is configured at the application level in jambonz. Multiple status events are sent over the life of a call. The final event (completed or failed) includes additional fields like duration and termination cause.\n\n**Capturing B-leg call_sid:** When using the dial verb to bridge calls, status events are sent for both legs. The A-leg (original inbound call) has `direction: 'inbound'`. The B-leg (outbound dialed call) has `direction: 'outbound'`. To capture the B-leg's call_sid for later use (e.g., injecting commands to the B-leg), listen for status events where `direction === 'outbound'` and extract the `call_sid` field.",
|
|
6
6
|
"allOf": [
|
|
7
7
|
{ "$ref": "base" }
|
|
8
8
|
],
|
|
9
9
|
"type": "object",
|
|
10
10
|
"properties": {
|
|
11
|
+
"direction": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"enum": ["inbound", "outbound"],
|
|
14
|
+
"description": "Call direction. 'inbound' = A-leg (original incoming call to the application). 'outbound' = B-leg (call placed by the dial verb). Use this field to identify which leg generated the status event, especially when capturing the B-leg's call_sid for mid-call control."
|
|
15
|
+
},
|
|
11
16
|
"call_termination_by": {
|
|
12
17
|
"type": "string",
|
|
13
18
|
"enum": ["caller", "jambonz"],
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/commands/llm:tool-output",
|
|
4
|
+
"title": "LLM Tool Output Command",
|
|
5
|
+
"description": "Command sent from the application to feature-server over the session websocket, carrying the result of a tool invocation that the LLM initiated during an agent verb. Sent in response to an `agent:tool-call` event. The result is fed back into the LLM so it can continue generating using the tool output.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["type", "command", "tool_call_id", "data"],
|
|
8
|
+
"properties": {
|
|
9
|
+
"type": { "const": "command" },
|
|
10
|
+
"command": { "const": "llm:tool-output" },
|
|
11
|
+
"tool_call_id": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"minLength": 1,
|
|
14
|
+
"description": "The tool_call_id echoed from the originating `agent:tool-call` event."
|
|
15
|
+
},
|
|
16
|
+
"data": {
|
|
17
|
+
"type": "object",
|
|
18
|
+
"description": "Tool output payload. `{result: ...}` is the canonical shape — the wrapped value is what the LLM sees as the tool's return value. Other object shapes are accepted (they're JSON-stringified as-is into the tool_result content block on the wire) but `result` is preferred for clarity.",
|
|
19
|
+
"properties": {
|
|
20
|
+
"result": {}
|
|
21
|
+
},
|
|
22
|
+
"additionalProperties": true
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"additionalProperties": false,
|
|
26
|
+
"examples": [
|
|
27
|
+
{
|
|
28
|
+
"type": "command",
|
|
29
|
+
"command": "llm:tool-output",
|
|
30
|
+
"tool_call_id": "toolu_017NKXtgnD7fuo1KaTeSM1ok",
|
|
31
|
+
"data": {
|
|
32
|
+
"result": "The current temperature in Boston is 9.3°C with wind speed 17 km/h."
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
]
|
|
36
|
+
}
|
|
@@ -28,15 +28,15 @@
|
|
|
28
28
|
},
|
|
29
29
|
"minEndOfTurnSilenceWhenConfident": {
|
|
30
30
|
"type": "number",
|
|
31
|
-
"description": "Minimum silence duration (
|
|
31
|
+
"description": "Minimum silence duration (milliseconds) to trigger end-of-turn when confidence is met. Default: 400."
|
|
32
32
|
},
|
|
33
33
|
"maxTurnSilence": {
|
|
34
34
|
"type": "number",
|
|
35
|
-
"description": "Maximum silence duration (
|
|
35
|
+
"description": "Maximum silence duration (milliseconds) before forcing end-of-turn. Default: 1280."
|
|
36
36
|
},
|
|
37
37
|
"minTurnSilence": {
|
|
38
38
|
"type": "number",
|
|
39
|
-
"description": "Minimum silence duration (
|
|
39
|
+
"description": "Minimum silence duration (milliseconds) before allowing end-of-turn."
|
|
40
40
|
},
|
|
41
41
|
"keyterms": {
|
|
42
42
|
"type": "array",
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# Bridged Call Patterns
|
|
2
|
+
|
|
3
|
+
This guide covers common patterns for building applications that bridge two call legs (A-leg and B-leg) and need to interact with each party independently—such as real-time translation, call coaching, or call monitoring.
|
|
4
|
+
|
|
5
|
+
## Understanding A-leg and B-leg
|
|
6
|
+
|
|
7
|
+
When an inbound call arrives and your application uses the `dial` verb to connect the caller to another party:
|
|
8
|
+
|
|
9
|
+
- **A-leg**: The original inbound call (caller → jambonz)
|
|
10
|
+
- **B-leg**: The outbound call placed by the `dial` verb (jambonz → callee)
|
|
11
|
+
|
|
12
|
+
Each leg has its own `call_sid` identifier and can receive independent commands.
|
|
13
|
+
|
|
14
|
+
## Capturing the B-leg call_sid
|
|
15
|
+
|
|
16
|
+
Many patterns require knowing the B-leg's `call_sid` to inject commands to that leg. Capture it from `call:status` events:
|
|
17
|
+
|
|
18
|
+
```typescript
|
|
19
|
+
let dialCallSid: string;
|
|
20
|
+
|
|
21
|
+
session.on('call:status', (evt: Record<string, any>) => {
|
|
22
|
+
// B-leg events have direction === 'outbound'
|
|
23
|
+
if (evt.direction === 'outbound') {
|
|
24
|
+
dialCallSid = evt.call_sid;
|
|
25
|
+
console.log(`B-leg call_sid captured: ${dialCallSid}`);
|
|
26
|
+
}
|
|
27
|
+
});
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
**When it fires:** The `call:status` event with `direction: 'outbound'` fires when the dial verb initiates the B-leg call. You'll receive status updates for both legs throughout the call lifecycle.
|
|
31
|
+
|
|
32
|
+
## Setting Up Transcription on Both Legs
|
|
33
|
+
|
|
34
|
+
To transcribe both parties separately (e.g., for translation), configure transcription on each leg:
|
|
35
|
+
|
|
36
|
+
```typescript
|
|
37
|
+
session
|
|
38
|
+
// Transcribe A-leg (caller)
|
|
39
|
+
.transcribe({
|
|
40
|
+
transcriptionHook: '/transcription/caller',
|
|
41
|
+
channel: 1, // Near-end = caller's voice
|
|
42
|
+
recognizer: {
|
|
43
|
+
vendor: 'deepgram',
|
|
44
|
+
language: 'en-US',
|
|
45
|
+
deepgramOptions: { model: 'nova-2' }
|
|
46
|
+
}
|
|
47
|
+
})
|
|
48
|
+
// Bridge to B-leg with separate transcription
|
|
49
|
+
.dial({
|
|
50
|
+
target: [{ type: 'phone', number: '+15551234567' }],
|
|
51
|
+
transcribe: {
|
|
52
|
+
transcriptionHook: '/transcription/callee',
|
|
53
|
+
channel: 2, // Far-end from A-leg = callee's voice
|
|
54
|
+
recognizer: {
|
|
55
|
+
vendor: 'deepgram',
|
|
56
|
+
language: 'es-ES',
|
|
57
|
+
deepgramOptions: { model: 'nova-2' }
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
})
|
|
61
|
+
.send();
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Channel Values Explained
|
|
65
|
+
|
|
66
|
+
| Location | Channel | What it captures |
|
|
67
|
+
|----------|---------|------------------|
|
|
68
|
+
| A-leg transcribe | `1` (near-end) | Caller's voice |
|
|
69
|
+
| A-leg transcribe | `2` (far-end) | What caller hears (including B-leg) |
|
|
70
|
+
| dial.transcribe | `2` (far-end) | Callee's voice (B-leg inbound audio) |
|
|
71
|
+
| dial.transcribe | (omitted) | Both parties mixed |
|
|
72
|
+
|
|
73
|
+
### Identifying the Speaker in Transcription Events
|
|
74
|
+
|
|
75
|
+
When each leg has its own `transcriptionHook`, the hook that fires tells you who spoke:
|
|
76
|
+
|
|
77
|
+
```typescript
|
|
78
|
+
session.on('/transcription/caller', (evt: Record<string, any>) => {
|
|
79
|
+
if (evt.speech?.is_final) {
|
|
80
|
+
const transcript = evt.speech.alternatives[0].transcript;
|
|
81
|
+
handleCallerSpeech(transcript);
|
|
82
|
+
}
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
session.on('/transcription/callee', (evt: Record<string, any>) => {
|
|
86
|
+
if (evt.speech?.is_final) {
|
|
87
|
+
const transcript = evt.speech.alternatives[0].transcript;
|
|
88
|
+
handleCalleeSpeech(transcript);
|
|
89
|
+
}
|
|
90
|
+
});
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or with a single hook, use `call_sid` to differentiate:
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
session.on('/transcription', (evt: Record<string, any>) => {
|
|
97
|
+
if (!evt.speech?.is_final) return;
|
|
98
|
+
|
|
99
|
+
const transcript = evt.speech.alternatives[0].transcript;
|
|
100
|
+
const speaker = evt.call_sid === dialCallSid ? 'callee' : 'caller';
|
|
101
|
+
|
|
102
|
+
console.log(`${speaker}: ${transcript}`);
|
|
103
|
+
});
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Creating Dub Tracks for Audio Injection
|
|
107
|
+
|
|
108
|
+
To inject audio to each party (e.g., translated speech), create tracks on each leg:
|
|
109
|
+
|
|
110
|
+
```typescript
|
|
111
|
+
session
|
|
112
|
+
// Track on A-leg — caller hears it
|
|
113
|
+
.dub({ action: 'addTrack', track: 'caller-audio' })
|
|
114
|
+
.dial({
|
|
115
|
+
target: [{ type: 'phone', number: '+15551234567' }],
|
|
116
|
+
// Track on B-leg — callee hears it
|
|
117
|
+
dub: [
|
|
118
|
+
{ action: 'addTrack', track: 'callee-audio' }
|
|
119
|
+
]
|
|
120
|
+
})
|
|
121
|
+
.send();
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Track routing rule:** Tracks are heard by the party on whose call leg they exist.
|
|
125
|
+
|
|
126
|
+
## Injecting Commands to Specific Legs
|
|
127
|
+
|
|
128
|
+
### Default: Commands go to A-leg
|
|
129
|
+
|
|
130
|
+
```typescript
|
|
131
|
+
session.injectCommand('dub', {
|
|
132
|
+
action: 'sayOnTrack',
|
|
133
|
+
track: 'caller-audio',
|
|
134
|
+
say: 'This message is for the caller'
|
|
135
|
+
});
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Targeting B-leg: Pass call_sid as third argument
|
|
139
|
+
|
|
140
|
+
```typescript
|
|
141
|
+
session.injectCommand('dub', {
|
|
142
|
+
action: 'sayOnTrack',
|
|
143
|
+
track: 'callee-audio',
|
|
144
|
+
say: 'This message is for the callee'
|
|
145
|
+
}, dialCallSid); // Third argument routes to B-leg
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
**Important:** The `call_sid` is passed as a separate third argument, not inside the data object.
|
|
149
|
+
|
|
150
|
+
## Complete Example: Real-Time Translator
|
|
151
|
+
|
|
152
|
+
This example bridges an English caller with a Spanish-speaking callee, providing real-time translation in both directions.
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
import { createEndpoint, Session } from '@jambonz/sdk/websocket';
|
|
156
|
+
import http from 'http';
|
|
157
|
+
|
|
158
|
+
const server = http.createServer();
|
|
159
|
+
const makeService = createEndpoint({ server, port: 3000 });
|
|
160
|
+
const svc = makeService({ path: '/' });
|
|
161
|
+
|
|
162
|
+
svc.on('session:new', (session: Session) => {
|
|
163
|
+
let dialCallSid: string;
|
|
164
|
+
|
|
165
|
+
// Capture B-leg call_sid
|
|
166
|
+
session.on('call:status', (evt: Record<string, any>) => {
|
|
167
|
+
if (evt.direction === 'outbound') {
|
|
168
|
+
dialCallSid = evt.call_sid;
|
|
169
|
+
}
|
|
170
|
+
});
|
|
171
|
+
|
|
172
|
+
// Handle caller's speech (English → Spanish for callee)
|
|
173
|
+
session.on('/transcription/caller', async (evt: Record<string, any>) => {
|
|
174
|
+
session.reply();
|
|
175
|
+
if (!evt.speech?.is_final) return;
|
|
176
|
+
|
|
177
|
+
const transcript = evt.speech.alternatives[0].transcript;
|
|
178
|
+
const translated = await translateToSpanish(transcript);
|
|
179
|
+
|
|
180
|
+
// Inject to B-leg track (callee hears it)
|
|
181
|
+
session.injectCommand('dub', {
|
|
182
|
+
action: 'sayOnTrack',
|
|
183
|
+
track: 'callee-audio',
|
|
184
|
+
say: {
|
|
185
|
+
text: translated,
|
|
186
|
+
synthesizer: { vendor: 'elevenlabs', language: 'es-ES' }
|
|
187
|
+
}
|
|
188
|
+
}, dialCallSid);
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
// Handle callee's speech (Spanish → English for caller)
|
|
192
|
+
session.on('/transcription/callee', async (evt: Record<string, any>) => {
|
|
193
|
+
session.reply();
|
|
194
|
+
if (!evt.speech?.is_final) return;
|
|
195
|
+
|
|
196
|
+
const transcript = evt.speech.alternatives[0].transcript;
|
|
197
|
+
const translated = await translateToEnglish(transcript);
|
|
198
|
+
|
|
199
|
+
// Inject to A-leg track (caller hears it)
|
|
200
|
+
session.injectCommand('dub', {
|
|
201
|
+
action: 'sayOnTrack',
|
|
202
|
+
track: 'caller-audio',
|
|
203
|
+
say: {
|
|
204
|
+
text: translated,
|
|
205
|
+
synthesizer: { vendor: 'elevenlabs', language: 'en-US' }
|
|
206
|
+
}
|
|
207
|
+
});
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
// Set up the call
|
|
211
|
+
session
|
|
212
|
+
.dub({ action: 'addTrack', track: 'caller-audio' })
|
|
213
|
+
.transcribe({
|
|
214
|
+
transcriptionHook: '/transcription/caller',
|
|
215
|
+
channel: 1,
|
|
216
|
+
recognizer: { vendor: 'deepgram', language: 'en-US' }
|
|
217
|
+
})
|
|
218
|
+
.dial({
|
|
219
|
+
target: [{ type: 'phone', number: process.env.CALLEE_NUMBER! }],
|
|
220
|
+
dub: [
|
|
221
|
+
{ action: 'addTrack', track: 'callee-audio' }
|
|
222
|
+
],
|
|
223
|
+
transcribe: {
|
|
224
|
+
transcriptionHook: '/transcription/callee',
|
|
225
|
+
channel: 2,
|
|
226
|
+
recognizer: { vendor: 'deepgram', language: 'es-ES' }
|
|
227
|
+
}
|
|
228
|
+
})
|
|
229
|
+
.send();
|
|
230
|
+
});
|
|
231
|
+
|
|
232
|
+
async function translateToSpanish(text: string): Promise<string> {
|
|
233
|
+
// Implement translation logic
|
|
234
|
+
return text;
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
async function translateToEnglish(text: string): Promise<string> {
|
|
238
|
+
// Implement translation logic
|
|
239
|
+
return text;
|
|
240
|
+
}
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## Summary: Key Patterns
|
|
244
|
+
|
|
245
|
+
| Pattern | Implementation |
|
|
246
|
+
|---------|----------------|
|
|
247
|
+
| Capture B-leg call_sid | Listen for `call:status` where `direction === 'outbound'` |
|
|
248
|
+
| Transcribe caller | `transcribe` with `channel: 1` on A-leg |
|
|
249
|
+
| Transcribe callee | `transcribe` with `channel: 2` nested in `dial` |
|
|
250
|
+
| Audio track for caller | `dub` with `addTrack` on A-leg |
|
|
251
|
+
| Audio track for callee | `dub` with `addTrack` in `dial.dub` array |
|
|
252
|
+
| Inject to A-leg | `session.injectCommand(verb, data)` |
|
|
253
|
+
| Inject to B-leg | `session.injectCommand(verb, data, dialCallSid)` |
|
|
254
|
+
|
|
255
|
+
## See Also
|
|
256
|
+
|
|
257
|
+
- `docs/verbs/transcribe.md` - Transcribe verb usage guide
|
|
258
|
+
- `docs/verbs/dub.md` - Dub verb usage guide
|
|
259
|
+
- `docs/guides/session-commands.md` - Session commands reference
|
|
260
|
+
- `callback:call-status` - Call status webhook schema
|
|
261
|
+
- `verb:dial` - Dial verb schema
|
|
@@ -143,6 +143,44 @@ For any command not covered by a specific method:
|
|
|
143
143
|
session.injectCommand('commandName', { ...data });
|
|
144
144
|
```
|
|
145
145
|
|
|
146
|
+
### Targeting Specific Call Legs
|
|
147
|
+
|
|
148
|
+
When bridging calls with the `dial` verb, you may need to inject commands to a specific call leg (A-leg or B-leg). By default, `injectCommand` targets the current session's call (A-leg). To target the B-leg (or any other call), pass the target `call_sid` as the third argument.
|
|
149
|
+
|
|
150
|
+
**Capturing the B-leg call_sid:**
|
|
151
|
+
|
|
152
|
+
Listen for `call:status` events with `direction === 'outbound'` to capture the B-leg's call_sid:
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
let dialCallSid: string;
|
|
156
|
+
|
|
157
|
+
session.on('call:status', (evt: Record<string, any>) => {
|
|
158
|
+
if (evt.direction === 'outbound') {
|
|
159
|
+
dialCallSid = evt.call_sid;
|
|
160
|
+
}
|
|
161
|
+
});
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**Injecting commands to specific legs:**
|
|
165
|
+
|
|
166
|
+
```typescript
|
|
167
|
+
// Target A-leg (current session) — omit the third argument:
|
|
168
|
+
session.injectCommand('dub', {
|
|
169
|
+
action: 'sayOnTrack',
|
|
170
|
+
track: 'caller-track',
|
|
171
|
+
say: 'Message for the caller'
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
// Target B-leg — pass call_sid as third argument:
|
|
175
|
+
session.injectCommand('dub', {
|
|
176
|
+
action: 'sayOnTrack',
|
|
177
|
+
track: 'callee-track',
|
|
178
|
+
say: 'Message for the callee'
|
|
179
|
+
}, dialCallSid);
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
This pattern is essential for applications like real-time translation, where you need to inject translated speech to each party separately based on which leg's audio was transcribed.
|
|
183
|
+
|
|
146
184
|
## Agent Update
|
|
147
185
|
|
|
148
186
|
The `updateAgent()` method sends mid-conversation updates to an active `agent` verb. Four operation types are supported:
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
# Dub Verb Usage Guide
|
|
2
|
+
|
|
3
|
+
The `dub` verb manages auxiliary audio tracks that are mixed into the call audio. Tracks can play background music, coaching whispers, or inject synthesized speech—useful for real-time translation, agent assistance, or audio overlays.
|
|
4
|
+
|
|
5
|
+
## Track Routing: Who Hears What
|
|
6
|
+
|
|
7
|
+
**Critical concept:** Dub tracks are heard by the party on whose call leg they are created.
|
|
8
|
+
|
|
9
|
+
| Where track is created | Who hears it |
|
|
10
|
+
|------------------------|--------------|
|
|
11
|
+
| Main verb stack (A-leg) | Caller |
|
|
12
|
+
| Nested in dial verb's `dub` array | Callee |
|
|
13
|
+
|
|
14
|
+
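When using `injectCommand` to play/say on a track, the command targets the current session's call leg (A-leg) by default. To reach a track that was created on a different call leg (for example, one nested in the dial verb's `dub` array), pass that leg's `call_sid` as the third argument.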
|
|
15
|
+
|
|
16
|
+
## Basic Usage
|
|
17
|
+
|
|
18
|
+
### Creating Tracks
|
|
19
|
+
|
|
20
|
+
Create a track before playing audio on it:
|
|
21
|
+
|
|
22
|
+
```typescript
|
|
23
|
+
// Track on A-leg (caller hears it)
|
|
24
|
+
session
|
|
25
|
+
.dub({ action: 'addTrack', track: 'caller-audio' })
|
|
26
|
+
.dial({
|
|
27
|
+
target: [{ type: 'phone', number: '+15551234567' }],
|
|
28
|
+
dub: [
|
|
29
|
+
// Track on B-leg (callee hears it)
|
|
30
|
+
{ action: 'addTrack', track: 'callee-audio' }
|
|
31
|
+
]
|
|
32
|
+
})
|
|
33
|
+
.send();
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Playing Audio on Tracks
|
|
37
|
+
|
|
38
|
+
```typescript
|
|
39
|
+
// Play audio file
|
|
40
|
+
session
|
|
41
|
+
.dub({
|
|
42
|
+
action: 'playOnTrack',
|
|
43
|
+
track: 'bgm',
|
|
44
|
+
play: 'https://example.com/music.mp3',
|
|
45
|
+
loop: true,
|
|
46
|
+
gain: -15 // Reduce volume by 15dB
|
|
47
|
+
})
|
|
48
|
+
.send();
|
|
49
|
+
|
|
50
|
+
// Synthesize and play speech
|
|
51
|
+
session
|
|
52
|
+
.dub({
|
|
53
|
+
action: 'sayOnTrack',
|
|
54
|
+
track: 'coach',
|
|
55
|
+
say: 'Ask about their timeline'
|
|
56
|
+
})
|
|
57
|
+
.send();
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Controlling Tracks
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
// Silence a track (mute without removing)
|
|
64
|
+
session.dub({ action: 'silenceTrack', track: 'bgm' }).send();
|
|
65
|
+
|
|
66
|
+
// Remove a track entirely
|
|
67
|
+
session.dub({ action: 'removeTrack', track: 'bgm' }).send();
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Mid-Call Audio Injection with injectCommand
|
|
71
|
+
|
|
72
|
+
The most powerful use of dub tracks is injecting audio mid-call via `injectCommand`. This is how you implement real-time features like translation or coaching.
|
|
73
|
+
|
|
74
|
+
### Injecting to the Current Call Leg (A-leg)
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
session.injectCommand('dub', {
|
|
78
|
+
action: 'sayOnTrack',
|
|
79
|
+
track: 'caller-audio',
|
|
80
|
+
say: 'Translated text for the caller'
|
|
81
|
+
});
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Injecting to a Specific Call Leg (B-leg)
|
|
85
|
+
|
|
86
|
+
When the target track exists on a different call leg, pass the target `call_sid` as the third argument:
|
|
87
|
+
|
|
88
|
+
```typescript
|
|
89
|
+
// First, capture the B-leg call_sid from call:status events
|
|
90
|
+
let dialCallSid: string;
|
|
91
|
+
|
|
92
|
+
session.on('call:status', (evt: Record<string, any>) => {
|
|
93
|
+
if (evt.direction === 'outbound') {
|
|
94
|
+
dialCallSid = evt.call_sid;
|
|
95
|
+
}
|
|
96
|
+
});
|
|
97
|
+
|
|
98
|
+
// Then inject to the B-leg
|
|
99
|
+
session.injectCommand('dub', {
|
|
100
|
+
action: 'sayOnTrack',
|
|
101
|
+
track: 'callee-audio',
|
|
102
|
+
say: 'Translated text for the callee'
|
|
103
|
+
}, dialCallSid);
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Important:** The third argument is the `call_sid` of the call leg where the track exists, not part of the data object.
|
|
107
|
+
|
|
108
|
+
## Common Use Cases
|
|
109
|
+
|
|
110
|
+
### Real-Time Translation
|
|
111
|
+
|
|
112
|
+
Set up tracks for each party, then inject translated speech based on transcriptions:
|
|
113
|
+
|
|
114
|
+
```typescript
|
|
115
|
+
session
|
|
116
|
+
.dub({ action: 'addTrack', track: 'caller-translation' })
|
|
117
|
+
.dial({
|
|
118
|
+
target: [{ type: 'phone', number: '+15551234567' }],
|
|
119
|
+
dub: [
|
|
120
|
+
{ action: 'addTrack', track: 'callee-translation' }
|
|
121
|
+
],
|
|
122
|
+
transcribe: {
|
|
123
|
+
transcriptionHook: '/transcription',
|
|
124
|
+
recognizer: { vendor: 'deepgram' }
|
|
125
|
+
}
|
|
126
|
+
})
|
|
127
|
+
.send();
|
|
128
|
+
|
|
129
|
+
// When callee speaks, translate and play to caller:
|
|
130
|
+
session.injectCommand('dub', {
|
|
131
|
+
action: 'sayOnTrack',
|
|
132
|
+
track: 'caller-translation',
|
|
133
|
+
say: { text: translatedText, synthesizer: { language: 'en-US' } }
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
// When caller speaks, translate and play to callee:
|
|
137
|
+
session.injectCommand('dub', {
|
|
138
|
+
action: 'sayOnTrack',
|
|
139
|
+
track: 'callee-translation',
|
|
140
|
+
say: { text: translatedText, synthesizer: { language: 'es-ES' } }
|
|
141
|
+
}, dialCallSid);
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Agent Coaching / Whisper
|
|
145
|
+
|
|
146
|
+
Play prompts only the agent hears (A-leg is the agent):
|
|
147
|
+
|
|
148
|
+
```typescript
|
|
149
|
+
session
|
|
150
|
+
.dub({ action: 'addTrack', track: 'coach' })
|
|
151
|
+
.dial({
|
|
152
|
+
target: [{ type: 'phone', number: '+15551234567' }]
|
|
153
|
+
})
|
|
154
|
+
.send();
|
|
155
|
+
|
|
156
|
+
// Supervisor sends a coaching message:
|
|
157
|
+
session.injectCommand('dub', {
|
|
158
|
+
action: 'sayOnTrack',
|
|
159
|
+
track: 'coach',
|
|
160
|
+
say: 'Offer them a 20% discount'
|
|
161
|
+
});
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Background Music / Hold Music
|
|
165
|
+
|
|
166
|
+
```typescript
|
|
167
|
+
session
|
|
168
|
+
.dub({ action: 'addTrack', track: 'bgm' })
|
|
169
|
+
.dub({
|
|
170
|
+
action: 'playOnTrack',
|
|
171
|
+
track: 'bgm',
|
|
172
|
+
play: 'https://example.com/hold-music.mp3',
|
|
173
|
+
loop: true,
|
|
174
|
+
gain: -20
|
|
175
|
+
})
|
|
176
|
+
.send();
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Say Configuration Object
|
|
180
|
+
|
|
181
|
+
The `say` property can be a string or a configuration object for more control:
|
|
182
|
+
|
|
183
|
+
```typescript
|
|
184
|
+
session.injectCommand('dub', {
|
|
185
|
+
action: 'sayOnTrack',
|
|
186
|
+
track: 'translation',
|
|
187
|
+
say: {
|
|
188
|
+
text: 'Hello, how can I help you?',
|
|
189
|
+
synthesizer: {
|
|
190
|
+
vendor: 'elevenlabs',
|
|
191
|
+
voice: 'EXAVITQu4vr4xnSDxMaL',
|
|
192
|
+
language: 'en-US'
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
});
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Gain Control
|
|
199
|
+
|
|
200
|
+
Use the `gain` property to adjust track volume in dB:
|
|
201
|
+
|
|
202
|
+
- Negative values reduce volume (e.g., `-15` for quiet background music)
|
|
203
|
+
- Positive values increase volume (use carefully to avoid clipping)
|
|
204
|
+
- `0` is the default (no change)
|
|
205
|
+
|
|
206
|
+
```typescript
|
|
207
|
+
session.dub({
|
|
208
|
+
action: 'playOnTrack',
|
|
209
|
+
track: 'bgm',
|
|
210
|
+
play: 'https://example.com/music.mp3',
|
|
211
|
+
gain: -15
|
|
212
|
+
}).send();
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## See Also
|
|
216
|
+
|
|
217
|
+
- `verb:dub` - Full schema with all properties
|
|
218
|
+
- `docs/guides/session-commands.md` - injectCommand documentation
|
|
219
|
+
- `docs/guides/bridged-call-patterns.md` - Complete guide to bridged call scenarios
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# Transcribe Verb Usage Guide
|
|
2
|
+
|
|
3
|
+
The `transcribe` verb enables real-time speech-to-text on a call. Transcription runs as a background process—subsequent verbs execute immediately while transcription continues. Results are streamed to your `transcriptionHook` webhook as they are produced.
|
|
4
|
+
|
|
5
|
+
## Basic Usage
|
|
6
|
+
|
|
7
|
+
```typescript
|
|
8
|
+
session
|
|
9
|
+
.transcribe({
|
|
10
|
+
transcriptionHook: '/transcription',
|
|
11
|
+
recognizer: {
|
|
12
|
+
vendor: 'deepgram',
|
|
13
|
+
language: 'en-US',
|
|
14
|
+
deepgramOptions: {
|
|
15
|
+
model: 'nova-2',
|
|
16
|
+
smartFormatting: true
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
})
|
|
20
|
+
.dial({ target: [{ type: 'phone', number: '+15551234567' }] })
|
|
21
|
+
.send();
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Channel Isolation for Bridged Calls
|
|
25
|
+
|
|
26
|
+
When transcribing a bridged call (A-leg connected to B-leg via `dial`), you can isolate which party's audio to transcribe using the `channel` property:
|
|
27
|
+
|
|
28
|
+
| Channel | Description |
|
|
29
|
+
|---------|-------------|
|
|
30
|
+
| (omitted) | Both parties' audio, mixed |
|
|
31
|
+
| `1` | Near-end audio (local party—caller on A-leg, callee on B-leg) |
|
|
32
|
+
| `2` | Far-end audio (remote party) |
|
|
33
|
+
|
|
34
|
+
### Transcribing Both Legs Separately
|
|
35
|
+
|
|
36
|
+
To get separate transcriptions for each party in a bridged call:
|
|
37
|
+
|
|
38
|
+
```typescript
|
|
39
|
+
session
|
|
40
|
+
.transcribe({
|
|
41
|
+
transcriptionHook: '/transcription/caller',
|
|
42
|
+
channel: 1, // Caller's audio (A-leg near-end)
|
|
43
|
+
recognizer: { vendor: 'deepgram', language: 'en-US' }
|
|
44
|
+
})
|
|
45
|
+
.dial({
|
|
46
|
+
target: [{ type: 'phone', number: '+15551234567' }],
|
|
47
|
+
transcribe: {
|
|
48
|
+
transcriptionHook: '/transcription/callee',
|
|
49
|
+
channel: 2, // Callee's audio (B-leg far-end from A-leg perspective)
|
|
50
|
+
recognizer: { vendor: 'deepgram', language: 'es-ES' }
|
|
51
|
+
}
|
|
52
|
+
})
|
|
53
|
+
.send();
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**Important:** When `transcribe` is nested in the `dial` verb, channel 2 isolates the B-leg's inbound audio (what the callee is saying).
|
|
57
|
+
|
|
58
|
+
## Transcription Hook Payload
|
|
59
|
+
|
|
60
|
+
Your webhook receives transcription results with these key fields:
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
session.on('/transcription', (evt: Record<string, any>) => {
|
|
64
|
+
const {
|
|
65
|
+
call_sid, // Which call leg generated this transcript
|
|
66
|
+
speech, // Speech recognition results
|
|
67
|
+
is_final, // true for final results, false for interim
|
|
68
|
+
transcription_sid, // Unique ID for this transcription session
|
|
69
|
+
} = evt;
|
|
70
|
+
|
|
71
|
+
if (speech?.is_final) {
|
|
72
|
+
const { transcript, confidence } = speech.alternatives[0];
|
|
73
|
+
console.log(`[${call_sid}] ${transcript} (${confidence})`);
|
|
74
|
+
}
|
|
75
|
+
});
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Identifying Which Party Spoke
|
|
79
|
+
|
|
80
|
+
The `call_sid` in transcription events identifies which call leg generated the transcript. If you're tracking the B-leg's call_sid (captured from `call:status` events), you can identify the speaker:
|
|
81
|
+
|
|
82
|
+
```typescript
|
|
83
|
+
let dialCallSid: string;
|
|
84
|
+
|
|
85
|
+
session.on('call:status', (evt: Record<string, any>) => {
|
|
86
|
+
if (evt.direction === 'outbound') {
|
|
87
|
+
dialCallSid = evt.call_sid;
|
|
88
|
+
}
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
session.on('/transcription', (evt: Record<string, any>) => {
|
|
92
|
+
const speaker = evt.call_sid === dialCallSid ? 'callee' : 'caller';
|
|
93
|
+
console.log(`${speaker}: ${evt.speech?.alternatives[0]?.transcript}`);
|
|
94
|
+
});
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Nested Transcribe in Config vs Dial
|
|
98
|
+
|
|
99
|
+
You can enable transcription in two places:
|
|
100
|
+
|
|
101
|
+
### In the main verb stack (or config)
|
|
102
|
+
|
|
103
|
+
Transcribes the A-leg. Without `channel`, captures caller audio. With `channel: 2`, captures far-end (what caller hears).
|
|
104
|
+
|
|
105
|
+
```typescript
|
|
106
|
+
session
|
|
107
|
+
.config({
|
|
108
|
+
transcribe: {
|
|
109
|
+
enable: true,
|
|
110
|
+
transcriptionHook: '/transcription',
|
|
111
|
+
recognizer: { vendor: 'deepgram' }
|
|
112
|
+
}
|
|
113
|
+
})
|
|
114
|
+
.send();
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Nested in dial
|
|
118
|
+
|
|
119
|
+
Transcribes during the bridged call. Without `channel`, captures both parties mixed. With `channel: 2`, isolates B-leg audio.
|
|
120
|
+
|
|
121
|
+
```typescript
|
|
122
|
+
session
|
|
123
|
+
.dial({
|
|
124
|
+
target: [{ type: 'phone', number: '+15551234567' }],
|
|
125
|
+
transcribe: {
|
|
126
|
+
transcriptionHook: '/transcription',
|
|
127
|
+
channel: 2, // Just the callee's voice
|
|
128
|
+
recognizer: { vendor: 'deepgram', language: 'es-ES' }
|
|
129
|
+
}
|
|
130
|
+
})
|
|
131
|
+
.send();
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Interim vs Final Results
|
|
135
|
+
|
|
136
|
+
Most STT vendors provide both interim (partial) and final transcription results:
|
|
137
|
+
|
|
138
|
+
- **Interim results** (`is_final: false`): Real-time partial transcripts that update as speech continues. Useful for live displays.
|
|
139
|
+
- **Final results** (`is_final: true`): Complete utterance transcripts after the speaker pauses. Use these for processing/translation.
|
|
140
|
+
|
|
141
|
+
```typescript
|
|
142
|
+
session.on('/transcription', (evt: Record<string, any>) => {
|
|
143
|
+
if (evt.speech?.is_final) {
|
|
144
|
+
// Process complete utterance
|
|
145
|
+
processTranscript(evt.speech.alternatives[0].transcript);
|
|
146
|
+
}
|
|
147
|
+
// Optionally handle interim results for live display
|
|
148
|
+
});
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Enabling/Disabling Transcription
|
|
152
|
+
|
|
153
|
+
Use `enable: false` to stop background transcription:
|
|
154
|
+
|
|
155
|
+
```typescript
|
|
156
|
+
session
|
|
157
|
+
.config({
|
|
158
|
+
transcribe: { enable: false }
|
|
159
|
+
})
|
|
160
|
+
.send();
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## See Also
|
|
164
|
+
|
|
165
|
+
- `callback:transcribe` - Full transcription webhook payload schema
|
|
166
|
+
- `component:recognizer` - STT configuration options per vendor
|
|
167
|
+
- `docs/guides/bridged-call-patterns.md` - Complete guide to bridged call scenarios
|
package/index.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
const {validate, validateVerb, validateApp} = require('./lib/validator');
|
|
1
|
+
const {validate, validateVerb, validateApp, validateCommand} = require('./lib/validator');
|
|
2
2
|
const {normalizeJambones} = require('./lib/normalize');
|
|
3
3
|
|
|
4
4
|
module.exports = {
|
|
5
5
|
validate,
|
|
6
6
|
validateVerb,
|
|
7
7
|
validateApp,
|
|
8
|
+
validateCommand,
|
|
8
9
|
normalizeJambones,
|
|
9
10
|
};
|
package/lib/validator.js
CHANGED
|
@@ -54,6 +54,12 @@ function getAjv() {
|
|
|
54
54
|
_ajv.addSchema(schema);
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
/* register command schemas (app → server over the session websocket) */
|
|
58
|
+
for (const name of discoverSchemas('commands')) {
|
|
59
|
+
const schema = loadSchema(`commands/${name}.schema.json`);
|
|
60
|
+
_ajv.addSchema(schema);
|
|
61
|
+
}
|
|
62
|
+
|
|
57
63
|
/* compile the root app schema */
|
|
58
64
|
const appSchema = loadSchema('jambonz-app.schema.json');
|
|
59
65
|
_validateApp = _ajv.compile(appSchema);
|
|
@@ -103,6 +109,45 @@ function validateVerb(name, data, logger) {
|
|
|
103
109
|
}
|
|
104
110
|
}
|
|
105
111
|
|
|
112
|
+
/**
|
|
113
|
+
* Validate a command message sent from the app to feature-server over the
|
|
114
|
+
* session websocket.
|
|
115
|
+
*
|
|
116
|
+
* Commands are the inverse of callbacks: callbacks flow server → app; commands
|
|
117
|
+
* flow app → server. Currently defined commands:
|
|
118
|
+
* - llm:tool-output — response to an agent:tool-call event.
|
|
119
|
+
*
|
|
120
|
+
* @param {string} name - Command name (e.g. 'llm:tool-output').
|
|
121
|
+
* @param {object} message - The full command message including `type` and `command`.
|
|
122
|
+
* @param {object} logger - Logger instance.
|
|
123
|
+
* @throws {Error} If the command is unknown or validation fails.
|
|
124
|
+
*/
|
|
125
|
+
function validateCommand(name, message, logger) {
|
|
126
|
+
const ajv = getAjv();
|
|
127
|
+
const schemaId = `https://jambonz.org/schema/commands/${name}`;
|
|
128
|
+
const validate = ajv.getSchema(schemaId);
|
|
129
|
+
if (!validate) {
|
|
130
|
+
throw new Error(`invalid command: ${name}`);
|
|
131
|
+
}
|
|
132
|
+
const valid = validate(message);
|
|
133
|
+
if (!valid) {
|
|
134
|
+
const errors = validate.errors || [];
|
|
135
|
+
const details = errors.map((e) => {
|
|
136
|
+
const path = e.instancePath || '(root)';
|
|
137
|
+
let msg = `'${path}': ${e.message}`;
|
|
138
|
+
if (e.params) {
|
|
139
|
+
if (e.params.type) msg += ` (expected ${e.params.type})`;
|
|
140
|
+
if (e.params.allowedValues) msg += ` (allowed: ${e.params.allowedValues.join(', ')})`;
|
|
141
|
+
if (e.params.missingProperty) msg += ` (missing: ${e.params.missingProperty})`;
|
|
142
|
+
}
|
|
143
|
+
return msg;
|
|
144
|
+
}).join('; ');
|
|
145
|
+
const errMsg = `command '${name}' validation failed — ${details}. Schema: ${schemaId}`;
|
|
146
|
+
debug(errMsg);
|
|
147
|
+
throw new Error(errMsg);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
106
151
|
/**
|
|
107
152
|
* Validate a complete jambonz application (array of verbs).
|
|
108
153
|
*
|
|
@@ -144,4 +189,4 @@ function validateApp(app) {
|
|
|
144
189
|
};
|
|
145
190
|
}
|
|
146
191
|
|
|
147
|
-
module.exports = {validate, validateVerb, validateApp};
|
|
192
|
+
module.exports = {validate, validateVerb, validateApp, validateCommand};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jambonz/schema",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "JSON Schema definitions and validation for jambonz verb applications",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"scripts": {
|
|
@@ -32,6 +32,7 @@
|
|
|
32
32
|
"verbs/",
|
|
33
33
|
"components/",
|
|
34
34
|
"callbacks/",
|
|
35
|
+
"commands/",
|
|
35
36
|
"docs/",
|
|
36
37
|
"jambonz-app.schema.json",
|
|
37
38
|
"AGENTS.md"
|
package/verbs/agent.schema.json
CHANGED
|
@@ -86,8 +86,89 @@
|
|
|
86
86
|
},
|
|
87
87
|
"llm": {
|
|
88
88
|
"type": "object",
|
|
89
|
-
"description": "LLM configuration for the agent.
|
|
90
|
-
"
|
|
89
|
+
"description": "LLM configuration for the agent.",
|
|
90
|
+
"required": ["vendor", "model"],
|
|
91
|
+
"properties": {
|
|
92
|
+
"vendor": {
|
|
93
|
+
"type": "string",
|
|
94
|
+
"enum": [
|
|
95
|
+
"openai",
|
|
96
|
+
"anthropic",
|
|
97
|
+
"google",
|
|
98
|
+
"vertex-gemini",
|
|
99
|
+
"vertex-openai",
|
|
100
|
+
"bedrock",
|
|
101
|
+
"deepseek"
|
|
102
|
+
],
|
|
103
|
+
"description": "LLM vendor id. Must match a `@jambonz/llm` registered adapter."
|
|
104
|
+
},
|
|
105
|
+
"model": {
|
|
106
|
+
"type": "string",
|
|
107
|
+
"description": "Vendor-specific model id (e.g. 'gpt-4o', 'claude-sonnet-4-5-20250929')."
|
|
108
|
+
},
|
|
109
|
+
"label": {
|
|
110
|
+
"type": "string",
|
|
111
|
+
"description": "Optional label to disambiguate when the account has multiple credentials for the same vendor."
|
|
112
|
+
},
|
|
113
|
+
"auth": {
|
|
114
|
+
"type": "object",
|
|
115
|
+
"description": "Optional inline credentials. When omitted, feature-server looks up credentials by (vendor, label) from the database.",
|
|
116
|
+
"properties": {
|
|
117
|
+
"apiKey": { "type": "string" }
|
|
118
|
+
},
|
|
119
|
+
"additionalProperties": true
|
|
120
|
+
},
|
|
121
|
+
"connectOptions": {
|
|
122
|
+
"type": "object",
|
|
123
|
+
"description": "SDK-level client options.",
|
|
124
|
+
"properties": {
|
|
125
|
+
"timeout": { "type": "number", "minimum": 0 },
|
|
126
|
+
"maxRetries": { "type": "integer", "minimum": 0 },
|
|
127
|
+
"endpoint": { "type": "string" },
|
|
128
|
+
"baseURL": { "type": "string" }
|
|
129
|
+
},
|
|
130
|
+
"additionalProperties": false
|
|
131
|
+
},
|
|
132
|
+
"llmOptions": {
|
|
133
|
+
"type": "object",
|
|
134
|
+
"description": "Per-call LLM configuration.",
|
|
135
|
+
"properties": {
|
|
136
|
+
"systemPrompt": {
|
|
137
|
+
"type": "string",
|
|
138
|
+
"description": "System prompt for the model. Placed vendor-appropriately (top-level for Anthropic/Bedrock, config.systemInstruction for Gemini, role:'system' for OpenAI-compatibles)."
|
|
139
|
+
},
|
|
140
|
+
"messages": {
|
|
141
|
+
"type": "array",
|
|
142
|
+
"description": "Seed conversation history. A role:'system' entry is extracted into systemPrompt internally.",
|
|
143
|
+
"items": { "$ref": "#/$defs/llmMessage" }
|
|
144
|
+
},
|
|
145
|
+
"initialMessages": {
|
|
146
|
+
"type": "array",
|
|
147
|
+
"description": "Alias of 'messages' (historical).",
|
|
148
|
+
"items": { "$ref": "#/$defs/llmMessage" }
|
|
149
|
+
},
|
|
150
|
+
"maxTokens": {
|
|
151
|
+
"type": "integer",
|
|
152
|
+
"minimum": 1,
|
|
153
|
+
"description": "Maximum tokens the model may generate per turn."
|
|
154
|
+
},
|
|
155
|
+
"temperature": {
|
|
156
|
+
"type": "number",
|
|
157
|
+
"minimum": 0,
|
|
158
|
+
"description": "Sampling temperature."
|
|
159
|
+
},
|
|
160
|
+
"tools": {
|
|
161
|
+
"type": "array",
|
|
162
|
+
"description": "Tool / function definitions available to the model. The MCP-flat shape `{name, description, parameters}` is canonical; the OpenAI-wrapped form `{type:'function', function:{...}}` is also accepted.",
|
|
163
|
+
"items": {
|
|
164
|
+
"type": "object"
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
},
|
|
168
|
+
"additionalProperties": false
|
|
169
|
+
}
|
|
170
|
+
},
|
|
171
|
+
"additionalProperties": false
|
|
91
172
|
},
|
|
92
173
|
"actionHook": {
|
|
93
174
|
"$ref": "../components/actionHook",
|
|
@@ -177,6 +258,21 @@
|
|
|
177
258
|
"required": [
|
|
178
259
|
"llm"
|
|
179
260
|
],
|
|
261
|
+
"$defs": {
|
|
262
|
+
"llmMessage": {
|
|
263
|
+
"type": "object",
|
|
264
|
+
"description": "A conversation-history message. The library normalizes content to a string; adapters may carry vendor-native shapes internally.",
|
|
265
|
+
"required": ["role", "content"],
|
|
266
|
+
"properties": {
|
|
267
|
+
"role": {
|
|
268
|
+
"type": "string",
|
|
269
|
+
"enum": ["system", "user", "assistant", "tool"]
|
|
270
|
+
},
|
|
271
|
+
"content": {}
|
|
272
|
+
},
|
|
273
|
+
"additionalProperties": true
|
|
274
|
+
}
|
|
275
|
+
},
|
|
180
276
|
"examples": [
|
|
181
277
|
{
|
|
182
278
|
"verb": "agent",
|
package/verbs/dub.schema.json
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
"$id": "https://jambonz.org/schema/verbs/dub",
|
|
4
4
|
"minVersion": "0.9.6",
|
|
5
5
|
"title": "Dub",
|
|
6
|
-
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.",
|
|
6
|
+
"description": "Manages audio dubbing tracks on a call. Allows adding, removing, and controlling auxiliary audio tracks that are mixed into the call audio. Used for background music, coaching whispers, or injecting audio from external sources.\n\n**Track Routing:** Tracks are heard by the party on whose call leg they are created. A dub verb in the main verb stack (A-leg) creates tracks heard by the caller. A dub verb nested in the dial verb's `dub` array creates tracks heard by the callee. When using injectCommand to play/say on a track from a different call leg, pass the target call's `call_sid` as the third argument to `session.injectCommand()` to route the command to the correct leg.",
|
|
7
7
|
"type": "object",
|
|
8
8
|
"properties": {
|
|
9
9
|
"verb": {
|
package/verbs/transcribe.schema.json
CHANGED
|
@@ -37,7 +37,8 @@
|
|
|
37
37
|
},
|
|
38
38
|
"channel": {
|
|
39
39
|
"type": "number",
|
|
40
|
-
"
|
|
40
|
+
"enum": [1, 2],
|
|
41
|
+
"description": "Specific audio channel to transcribe. Channel 1 = near-end (local party's audio, i.e. caller on A-leg or callee on B-leg). Channel 2 = far-end (remote party's audio). When transcribe is nested in the dial verb, omitting channel captures both legs mixed; specifying channel: 2 isolates the B-leg's inbound audio."
|
|
41
42
|
}
|
|
42
43
|
},
|
|
43
44
|
"examples": [
|