@jambonz/mcp-schema-server 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +305 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +135 -0
- package/dist/index.js.map +1 -0
- package/package.json +47 -0
- package/schema/components/actionHook.schema.json +36 -0
- package/schema/components/actionHookDelayAction.schema.json +37 -0
- package/schema/components/auth.schema.json +18 -0
- package/schema/components/bidirectionalAudio.schema.json +22 -0
- package/schema/components/fillerNoise.schema.json +25 -0
- package/schema/components/recognizer.schema.json +280 -0
- package/schema/components/synthesizer.schema.json +82 -0
- package/schema/components/target.schema.json +105 -0
- package/schema/components/vad.schema.json +48 -0
- package/schema/jambonz-app.schema.json +106 -0
- package/schema/verbs/alert.schema.json +20 -0
- package/schema/verbs/answer.schema.json +12 -0
- package/schema/verbs/conference.schema.json +43 -0
- package/schema/verbs/config.schema.json +174 -0
- package/schema/verbs/dequeue.schema.json +36 -0
- package/schema/verbs/dial.schema.json +157 -0
- package/schema/verbs/dtmf.schema.json +27 -0
- package/schema/verbs/dub.schema.json +52 -0
- package/schema/verbs/enqueue.schema.json +38 -0
- package/schema/verbs/gather.schema.json +145 -0
- package/schema/verbs/hangup.schema.json +29 -0
- package/schema/verbs/leave.schema.json +12 -0
- package/schema/verbs/listen.schema.json +110 -0
- package/schema/verbs/llm.schema.json +131 -0
- package/schema/verbs/message.schema.json +30 -0
- package/schema/verbs/pause.schema.json +26 -0
- package/schema/verbs/pipeline.schema.json +61 -0
- package/schema/verbs/play.schema.json +69 -0
- package/schema/verbs/redirect.schema.json +23 -0
- package/schema/verbs/say.schema.json +84 -0
- package/schema/verbs/sip-decline.schema.json +31 -0
- package/schema/verbs/sip-refer.schema.json +41 -0
- package/schema/verbs/sip-request.schema.json +33 -0
- package/schema/verbs/stream.schema.json +30 -0
- package/schema/verbs/tag.schema.json +21 -0
- package/schema/verbs/transcribe.schema.json +44 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/components/recognizer",
|
|
4
|
+
"title": "Recognizer",
|
|
5
|
+
"description": "Configuration for speech-to-text recognition. Specifies the STT vendor, language, and vendor-specific options. Can be set at the session level via the 'config' verb or overridden per-verb (e.g. on 'gather').",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"vendor": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"description": "The STT vendor to use. Must match a vendor configured in the jambonz platform.",
|
|
11
|
+
"examples": ["google", "aws", "microsoft", "deepgram", "nuance", "ibm", "nvidia", "soniox", "cobalt", "assemblyai", "speechmatics", "openai", "houndify", "gladia", "elevenlabs", "verbio", "custom"]
|
|
12
|
+
},
|
|
13
|
+
"label": {
|
|
14
|
+
"type": "string",
|
|
15
|
+
"description": "An optional label identifying a specific credential set for this vendor. Used when multiple credentials are configured for the same vendor."
|
|
16
|
+
},
|
|
17
|
+
"language": {
|
|
18
|
+
"type": "string",
|
|
19
|
+
"description": "The language code for speech recognition, in BCP-47 format.",
|
|
20
|
+
"examples": ["en-US", "en-GB", "es-ES", "fr-FR"]
|
|
21
|
+
},
|
|
22
|
+
"fallbackVendor": {
|
|
23
|
+
"type": "string",
|
|
24
|
+
"description": "A backup STT vendor to use if the primary vendor fails or is unavailable."
|
|
25
|
+
},
|
|
26
|
+
"fallbackLabel": {
|
|
27
|
+
"type": "string",
|
|
28
|
+
"description": "Credential label for the fallback vendor."
|
|
29
|
+
},
|
|
30
|
+
"fallbackLanguage": {
|
|
31
|
+
"type": "string",
|
|
32
|
+
"description": "Language code to use with the fallback vendor."
|
|
33
|
+
},
|
|
34
|
+
"vad": {
|
|
35
|
+
"$ref": "vad",
|
|
36
|
+
"description": "Voice activity detection settings for this recognizer."
|
|
37
|
+
},
|
|
38
|
+
"hints": {
|
|
39
|
+
"type": "array",
|
|
40
|
+
"items": { "type": "string" },
|
|
41
|
+
"description": "An array of words or phrases that the recognizer should favor. Use this to improve accuracy for domain-specific terminology, product names, or proper nouns.",
|
|
42
|
+
"examples": [["jambonz", "drachtio", "SIP", "WebRTC"]]
|
|
43
|
+
},
|
|
44
|
+
"hintsBoost": {
|
|
45
|
+
"type": "number",
|
|
46
|
+
"description": "A boost factor for hint words. Higher values increase the likelihood of recognizing hinted words. Vendor-specific range."
|
|
47
|
+
},
|
|
48
|
+
"altLanguages": {
|
|
49
|
+
"type": "array",
|
|
50
|
+
"items": { "type": "string" },
|
|
51
|
+
"description": "Additional languages the recognizer should listen for simultaneously. Enables multilingual recognition.",
|
|
52
|
+
"examples": [["es-ES", "fr-FR"]]
|
|
53
|
+
},
|
|
54
|
+
"profanityFilter": {
|
|
55
|
+
"type": "boolean",
|
|
56
|
+
"description": "If true, the vendor will attempt to filter profanity from transcription results."
|
|
57
|
+
},
|
|
58
|
+
"interim": {
|
|
59
|
+
"type": "boolean",
|
|
60
|
+
"description": "If true, return interim (partial) transcription results as they become available, before the utterance is complete."
|
|
61
|
+
},
|
|
62
|
+
"singleUtterance": {
|
|
63
|
+
"type": "boolean",
|
|
64
|
+
"description": "If true, recognition stops after the first complete utterance is detected."
|
|
65
|
+
},
|
|
66
|
+
"dualChannel": {
|
|
67
|
+
"type": "boolean",
|
|
68
|
+
"description": "If true, send separate audio channels for each call leg (caller and callee) to the recognizer."
|
|
69
|
+
},
|
|
70
|
+
"separateRecognitionPerChannel": {
|
|
71
|
+
"type": "boolean",
|
|
72
|
+
"description": "If true, perform independent recognition on each audio channel. Requires dualChannel."
|
|
73
|
+
},
|
|
74
|
+
"punctuation": {
|
|
75
|
+
"type": "boolean",
|
|
76
|
+
"description": "If true, enable automatic punctuation in transcription results."
|
|
77
|
+
},
|
|
78
|
+
"enhancedModel": {
|
|
79
|
+
"type": "boolean",
|
|
80
|
+
"description": "If true, use an enhanced (premium) recognition model if available from the vendor."
|
|
81
|
+
},
|
|
82
|
+
"words": {
|
|
83
|
+
"type": "boolean",
|
|
84
|
+
"description": "If true, include word-level timing information in transcription results."
|
|
85
|
+
},
|
|
86
|
+
"diarization": {
|
|
87
|
+
"type": "boolean",
|
|
88
|
+
"description": "If true, enable speaker diarization to identify different speakers in the audio."
|
|
89
|
+
},
|
|
90
|
+
"diarizationMinSpeakers": {
|
|
91
|
+
"type": "number",
|
|
92
|
+
"description": "Minimum number of speakers expected. Used to guide the diarization algorithm."
|
|
93
|
+
},
|
|
94
|
+
"diarizationMaxSpeakers": {
|
|
95
|
+
"type": "number",
|
|
96
|
+
"description": "Maximum number of speakers expected. Used to guide the diarization algorithm."
|
|
97
|
+
},
|
|
98
|
+
"interactionType": {
|
|
99
|
+
"type": "string",
|
|
100
|
+
"description": "A hint to the recognizer about the type of interaction, which can improve accuracy.",
|
|
101
|
+
"enum": ["unspecified", "discussion", "presentation", "phone_call", "voicemail", "voice_search", "voice_command", "dictation"]
|
|
102
|
+
},
|
|
103
|
+
"naicsCode": {
|
|
104
|
+
"type": "number",
|
|
105
|
+
"description": "North American Industry Classification System code. Some vendors use this to improve domain-specific accuracy."
|
|
106
|
+
},
|
|
107
|
+
"identifyChannels": {
|
|
108
|
+
"type": "boolean",
|
|
109
|
+
"description": "If true, identify and label which channel each transcription segment came from."
|
|
110
|
+
},
|
|
111
|
+
"vocabularyName": {
|
|
112
|
+
"type": "string",
|
|
113
|
+
"description": "Name of a custom vocabulary resource configured at the vendor for improved recognition of specialized terms."
|
|
114
|
+
},
|
|
115
|
+
"vocabularyFilterName": {
|
|
116
|
+
"type": "string",
|
|
117
|
+
"description": "Name of a vocabulary filter configured at the vendor for masking or removing specific words."
|
|
118
|
+
},
|
|
119
|
+
"filterMethod": {
|
|
120
|
+
"type": "string",
|
|
121
|
+
"description": "How filtered vocabulary words should be handled in the transcript.",
|
|
122
|
+
"enum": ["remove", "mask", "tag"]
|
|
123
|
+
},
|
|
124
|
+
"model": {
|
|
125
|
+
"type": "string",
|
|
126
|
+
"description": "The specific recognition model to use. Model names are vendor-specific.",
|
|
127
|
+
"examples": ["latest_long", "phone_call", "nova-2", "chirp"]
|
|
128
|
+
},
|
|
129
|
+
"outputFormat": {
|
|
130
|
+
"type": "string",
|
|
131
|
+
"description": "The level of detail in recognition results.",
|
|
132
|
+
"enum": ["simple", "detailed"]
|
|
133
|
+
},
|
|
134
|
+
"profanityOption": {
|
|
135
|
+
"type": "string",
|
|
136
|
+
"description": "How profanity should be handled in results.",
|
|
137
|
+
"enum": ["masked", "removed", "raw"]
|
|
138
|
+
},
|
|
139
|
+
"requestSnr": {
|
|
140
|
+
"type": "boolean",
|
|
141
|
+
"description": "If true, request signal-to-noise ratio information in results."
|
|
142
|
+
},
|
|
143
|
+
"initialSpeechTimeoutMs": {
|
|
144
|
+
"type": "number",
|
|
145
|
+
"description": "Time in milliseconds to wait for initial speech before timing out.",
|
|
146
|
+
"examples": [5000]
|
|
147
|
+
},
|
|
148
|
+
"azureServiceEndpoint": {
|
|
149
|
+
"type": "string",
|
|
150
|
+
"description": "Custom Azure Speech Services endpoint URL. Only applies when vendor is 'microsoft'."
|
|
151
|
+
},
|
|
152
|
+
"azureSttEndpointId": {
|
|
153
|
+
"type": "string",
|
|
154
|
+
"description": "Azure custom speech endpoint ID for using a custom-trained model. Only applies when vendor is 'microsoft'."
|
|
155
|
+
},
|
|
156
|
+
"asrDtmfTerminationDigit": {
|
|
157
|
+
"type": "string",
|
|
158
|
+
"description": "A DTMF digit that terminates speech recognition when pressed.",
|
|
159
|
+
"examples": ["#"]
|
|
160
|
+
},
|
|
161
|
+
"asrTimeout": {
|
|
162
|
+
"type": "number",
|
|
163
|
+
"description": "Maximum time in seconds to wait for a complete recognition result."
|
|
164
|
+
},
|
|
165
|
+
"fastRecognitionTimeout": {
|
|
166
|
+
"type": "number",
|
|
167
|
+
"description": "Timeout in seconds for fast recognition mode. Shorter timeout for quick responses."
|
|
168
|
+
},
|
|
169
|
+
"minConfidence": {
|
|
170
|
+
"type": "number",
|
|
171
|
+
"description": "Minimum confidence score (0-1) required to accept a recognition result. Results below this threshold are discarded.",
|
|
172
|
+
"minimum": 0,
|
|
173
|
+
"maximum": 1
|
|
174
|
+
},
|
|
175
|
+
"deepgramOptions": {
|
|
176
|
+
"type": "object",
|
|
177
|
+
"description": "Deepgram-specific recognition options. Only applies when vendor is 'deepgram'. See Deepgram API documentation for available options.",
|
|
178
|
+
"additionalProperties": true
|
|
179
|
+
},
|
|
180
|
+
"googleOptions": {
|
|
181
|
+
"type": "object",
|
|
182
|
+
"description": "Google Speech-to-Text specific options. Only applies when vendor is 'google'.",
|
|
183
|
+
"additionalProperties": true
|
|
184
|
+
},
|
|
185
|
+
"awsOptions": {
|
|
186
|
+
"type": "object",
|
|
187
|
+
"description": "AWS Transcribe specific options. Only applies when vendor is 'aws'.",
|
|
188
|
+
"additionalProperties": true
|
|
189
|
+
},
|
|
190
|
+
"azureOptions": {
|
|
191
|
+
"type": "object",
|
|
192
|
+
"description": "Azure Speech Services specific options. Only applies when vendor is 'microsoft'.",
|
|
193
|
+
"additionalProperties": true
|
|
194
|
+
},
|
|
195
|
+
"nuanceOptions": {
|
|
196
|
+
"type": "object",
|
|
197
|
+
"description": "Nuance-specific recognition options. Only applies when vendor is 'nuance'.",
|
|
198
|
+
"additionalProperties": true
|
|
199
|
+
},
|
|
200
|
+
"ibmOptions": {
|
|
201
|
+
"type": "object",
|
|
202
|
+
"description": "IBM Watson Speech-to-Text specific options. Only applies when vendor is 'ibm'.",
|
|
203
|
+
"additionalProperties": true
|
|
204
|
+
},
|
|
205
|
+
"nvidiaOptions": {
|
|
206
|
+
"type": "object",
|
|
207
|
+
"description": "NVIDIA Riva specific options. Only applies when vendor is 'nvidia'.",
|
|
208
|
+
"additionalProperties": true
|
|
209
|
+
},
|
|
210
|
+
"sonioxOptions": {
|
|
211
|
+
"type": "object",
|
|
212
|
+
"description": "Soniox-specific recognition options. Only applies when vendor is 'soniox'.",
|
|
213
|
+
"additionalProperties": true
|
|
214
|
+
},
|
|
215
|
+
"cobaltOptions": {
|
|
216
|
+
"type": "object",
|
|
217
|
+
"description": "Cobalt-specific recognition options. Only applies when vendor is 'cobalt'.",
|
|
218
|
+
"additionalProperties": true
|
|
219
|
+
},
|
|
220
|
+
"assemblyAiOptions": {
|
|
221
|
+
"type": "object",
|
|
222
|
+
"description": "AssemblyAI-specific recognition options. Only applies when vendor is 'assemblyai'.",
|
|
223
|
+
"additionalProperties": true
|
|
224
|
+
},
|
|
225
|
+
"speechmaticsOptions": {
|
|
226
|
+
"type": "object",
|
|
227
|
+
"description": "Speechmatics-specific recognition options. Only applies when vendor is 'speechmatics'.",
|
|
228
|
+
"additionalProperties": true
|
|
229
|
+
},
|
|
230
|
+
"openaiOptions": {
|
|
231
|
+
"type": "object",
|
|
232
|
+
"description": "OpenAI Whisper/Realtime specific options. Only applies when vendor is 'openai'.",
|
|
233
|
+
"additionalProperties": true
|
|
234
|
+
},
|
|
235
|
+
"houndifyOptions": {
|
|
236
|
+
"type": "object",
|
|
237
|
+
"description": "Houndify-specific recognition options. Only applies when vendor is 'houndify'.",
|
|
238
|
+
"additionalProperties": true
|
|
239
|
+
},
|
|
240
|
+
"gladiaOptions": {
|
|
241
|
+
"type": "object",
|
|
242
|
+
"description": "Gladia-specific recognition options. Only applies when vendor is 'gladia'.",
|
|
243
|
+
"additionalProperties": true
|
|
244
|
+
},
|
|
245
|
+
"elevenlabsOptions": {
|
|
246
|
+
"type": "object",
|
|
247
|
+
"description": "ElevenLabs-specific recognition options. Only applies when vendor is 'elevenlabs'.",
|
|
248
|
+
"additionalProperties": true
|
|
249
|
+
},
|
|
250
|
+
"verbioOptions": {
|
|
251
|
+
"type": "object",
|
|
252
|
+
"description": "Verbio-specific recognition options. Only applies when vendor is 'verbio'.",
|
|
253
|
+
"additionalProperties": true
|
|
254
|
+
},
|
|
255
|
+
"customOptions": {
|
|
256
|
+
"type": "object",
|
|
257
|
+
"description": "Options for custom STT vendors. Only applies when vendor is 'custom'.",
|
|
258
|
+
"additionalProperties": true
|
|
259
|
+
}
|
|
260
|
+
},
|
|
261
|
+
"required": ["vendor"],
|
|
262
|
+
"examples": [
|
|
263
|
+
{
|
|
264
|
+
"vendor": "deepgram",
|
|
265
|
+
"language": "en-US",
|
|
266
|
+
"deepgramOptions": {
|
|
267
|
+
"model": "nova-2",
|
|
268
|
+
"smartFormatting": true,
|
|
269
|
+
"endpointing": 500
|
|
270
|
+
}
|
|
271
|
+
},
|
|
272
|
+
{
|
|
273
|
+
"vendor": "google",
|
|
274
|
+
"language": "en-US",
|
|
275
|
+
"hints": ["jambonz", "drachtio"],
|
|
276
|
+
"punctuation": true,
|
|
277
|
+
"enhancedModel": true
|
|
278
|
+
}
|
|
279
|
+
]
|
|
280
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/components/synthesizer",
|
|
4
|
+
"title": "Synthesizer",
|
|
5
|
+
"description": "Configuration for text-to-speech synthesis. Specifies the TTS vendor, voice, language, and vendor-specific options. Can be set at the session level via the 'config' verb or overridden per-verb (e.g. on 'say').",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"vendor": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"description": "The TTS vendor to use. Must match a vendor configured in the jambonz platform.",
|
|
11
|
+
"examples": ["google", "aws", "microsoft", "elevenlabs", "cartesia", "deepgram", "ibm", "nuance", "nvidia", "wellsaid", "whisper", "verbio", "custom"]
|
|
12
|
+
},
|
|
13
|
+
"label": {
|
|
14
|
+
"type": "string",
|
|
15
|
+
"description": "An optional label identifying a specific credential set for this vendor. Used when multiple credentials are configured for the same vendor on the jambonz platform."
|
|
16
|
+
},
|
|
17
|
+
"language": {
|
|
18
|
+
"type": "string",
|
|
19
|
+
"description": "The language code for speech synthesis, in BCP-47 format.",
|
|
20
|
+
"examples": ["en-US", "en-GB", "es-ES", "fr-FR", "de-DE"]
|
|
21
|
+
},
|
|
22
|
+
"voice": {
|
|
23
|
+
"oneOf": [
|
|
24
|
+
{ "type": "string" },
|
|
25
|
+
{ "type": "object", "additionalProperties": true }
|
|
26
|
+
],
|
|
27
|
+
"description": "The voice to use for synthesis. Typically a string voice name (e.g. 'en-US-Wavenet-D' for Google, 'Joanna' for AWS Polly). Some vendors accept an object for more complex voice configuration.",
|
|
28
|
+
"examples": ["en-US-Wavenet-D", "Joanna", "Rachel"]
|
|
29
|
+
},
|
|
30
|
+
"fallbackVendor": {
|
|
31
|
+
"type": "string",
|
|
32
|
+
"description": "A backup TTS vendor to use if the primary vendor fails or is unavailable."
|
|
33
|
+
},
|
|
34
|
+
"fallbackLabel": {
|
|
35
|
+
"type": "string",
|
|
36
|
+
"description": "Credential label for the fallback vendor."
|
|
37
|
+
},
|
|
38
|
+
"fallbackLanguage": {
|
|
39
|
+
"type": "string",
|
|
40
|
+
"description": "Language code to use with the fallback vendor."
|
|
41
|
+
},
|
|
42
|
+
"fallbackVoice": {
|
|
43
|
+
"oneOf": [
|
|
44
|
+
{ "type": "string" },
|
|
45
|
+
{ "type": "object", "additionalProperties": true }
|
|
46
|
+
],
|
|
47
|
+
"description": "Voice to use with the fallback vendor."
|
|
48
|
+
},
|
|
49
|
+
"engine": {
|
|
50
|
+
"type": "string",
|
|
51
|
+
"description": "The synthesis engine tier to use. Availability depends on the vendor.",
|
|
52
|
+
"enum": ["standard", "neural", "generative", "long-form"]
|
|
53
|
+
},
|
|
54
|
+
"gender": {
|
|
55
|
+
"type": "string",
|
|
56
|
+
"description": "Preferred voice gender. Used by some vendors (e.g. Google) when a specific voice is not specified.",
|
|
57
|
+
"enum": ["MALE", "FEMALE", "NEUTRAL"]
|
|
58
|
+
},
|
|
59
|
+
"options": {
|
|
60
|
+
"type": "object",
|
|
61
|
+
"description": "Vendor-specific options passed through to the TTS provider. The structure depends on the vendor being used.",
|
|
62
|
+
"additionalProperties": true
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"required": ["vendor"],
|
|
66
|
+
"examples": [
|
|
67
|
+
{
|
|
68
|
+
"vendor": "google",
|
|
69
|
+
"language": "en-US",
|
|
70
|
+
"voice": "en-US-Wavenet-D"
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"vendor": "elevenlabs",
|
|
74
|
+
"voice": "Rachel",
|
|
75
|
+
"options": {
|
|
76
|
+
"model_id": "eleven_turbo_v2",
|
|
77
|
+
"stability": 0.5,
|
|
78
|
+
"similarity_boost": 0.75
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
]
|
|
82
|
+
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/components/target",
|
|
4
|
+
"title": "Target",
|
|
5
|
+
"description": "A call target for the 'dial' verb. Specifies who or what to connect the call to: a phone number (PSTN), a SIP endpoint, a registered user, or a Microsoft Teams user.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"type": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"description": "The type of target to dial.",
|
|
11
|
+
"enum": ["phone", "sip", "user", "teams"]
|
|
12
|
+
},
|
|
13
|
+
"number": {
|
|
14
|
+
"type": "string",
|
|
15
|
+
"description": "The phone number to dial. Required when type is 'phone'. Use E.164 format.",
|
|
16
|
+
"examples": ["+15085551212"]
|
|
17
|
+
},
|
|
18
|
+
"sipUri": {
|
|
19
|
+
"type": "string",
|
|
20
|
+
"description": "The SIP URI to dial. Required when type is 'sip'.",
|
|
21
|
+
"examples": ["sip:alice@example.com"]
|
|
22
|
+
},
|
|
23
|
+
"name": {
|
|
24
|
+
"type": "string",
|
|
25
|
+
"description": "The registered user name to dial. Required when type is 'user'. Also used as the display name for SIP targets."
|
|
26
|
+
},
|
|
27
|
+
"tenant": {
|
|
28
|
+
"type": "string",
|
|
29
|
+
"description": "The Microsoft Teams tenant ID. Required when type is 'teams'."
|
|
30
|
+
},
|
|
31
|
+
"trunk": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"description": "The SIP trunk to use for the outbound call. When specified, overrides the default carrier routing."
|
|
34
|
+
},
|
|
35
|
+
"confirmHook": {
|
|
36
|
+
"oneOf": [
|
|
37
|
+
{ "type": "string", "format": "uri" },
|
|
38
|
+
{ "$ref": "actionHook" }
|
|
39
|
+
],
|
|
40
|
+
"description": "A webhook to invoke when the target answers, before connecting the call. Use this to screen calls, play a whisper prompt, or require the target to press a key to accept."
|
|
41
|
+
},
|
|
42
|
+
"method": {
|
|
43
|
+
"type": "string",
|
|
44
|
+
"description": "The HTTP method to use when invoking the confirmHook.",
|
|
45
|
+
"enum": ["GET", "POST"],
|
|
46
|
+
"default": "POST"
|
|
47
|
+
},
|
|
48
|
+
"headers": {
|
|
49
|
+
"type": "object",
|
|
50
|
+
"description": "Custom SIP headers to include on the outbound INVITE. Keys are header names, values are header values.",
|
|
51
|
+
"additionalProperties": { "type": "string" }
|
|
52
|
+
},
|
|
53
|
+
"from": {
|
|
54
|
+
"type": "object",
|
|
55
|
+
"description": "Override the From header on the outbound SIP INVITE.",
|
|
56
|
+
"properties": {
|
|
57
|
+
"user": {
|
|
58
|
+
"type": "string",
|
|
59
|
+
"description": "The user part of the SIP From URI."
|
|
60
|
+
},
|
|
61
|
+
"host": {
|
|
62
|
+
"type": "string",
|
|
63
|
+
"description": "The host part of the SIP From URI."
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"auth": {
|
|
68
|
+
"$ref": "auth",
|
|
69
|
+
"description": "SIP authentication credentials for the outbound call, if the far end requires digest auth."
|
|
70
|
+
},
|
|
71
|
+
"vmail": {
|
|
72
|
+
"type": "boolean",
|
|
73
|
+
"description": "If true, follow the call into voicemail if the target does not answer."
|
|
74
|
+
},
|
|
75
|
+
"overrideTo": {
|
|
76
|
+
"type": "string",
|
|
77
|
+
"description": "Override the Request-URI on the outbound SIP INVITE. Useful when the Request-URI needs to differ from the To header."
|
|
78
|
+
},
|
|
79
|
+
"proxy": {
|
|
80
|
+
"type": "string",
|
|
81
|
+
"description": "A SIP proxy to route the outbound call through, specified as a SIP URI.",
|
|
82
|
+
"examples": ["sip:proxy.example.com"]
|
|
83
|
+
}
|
|
84
|
+
},
|
|
85
|
+
"required": ["type"],
|
|
86
|
+
"examples": [
|
|
87
|
+
{
|
|
88
|
+
"type": "phone",
|
|
89
|
+
"number": "+15085551212"
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"type": "sip",
|
|
93
|
+
"sipUri": "sip:alice@example.com"
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"type": "user",
|
|
97
|
+
"name": "bob"
|
|
98
|
+
},
|
|
99
|
+
{
|
|
100
|
+
"type": "teams",
|
|
101
|
+
"number": "+15085551212",
|
|
102
|
+
"tenant": "a-b-c-d-e"
|
|
103
|
+
}
|
|
104
|
+
]
|
|
105
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/components/vad",
|
|
4
|
+
"title": "VAD",
|
|
5
|
+
"description": "Voice Activity Detection configuration. Controls how jambonz detects the presence or absence of speech on the audio channel. Used to determine speech start/end boundaries for recognition and barge-in.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"enable": {
|
|
9
|
+
"type": "boolean",
|
|
10
|
+
"description": "Whether to enable voice activity detection."
|
|
11
|
+
},
|
|
12
|
+
"voiceMs": {
|
|
13
|
+
"type": "number",
|
|
14
|
+
"description": "Duration of voice activity (in milliseconds) required before speech is considered to have started.",
|
|
15
|
+
"examples": [250]
|
|
16
|
+
},
|
|
17
|
+
"silenceMs": {
|
|
18
|
+
"type": "number",
|
|
19
|
+
"description": "Duration of silence (in milliseconds) required before speech is considered to have ended.",
|
|
20
|
+
"examples": [1000]
|
|
21
|
+
},
|
|
22
|
+
"strategy": {
|
|
23
|
+
"type": "string",
|
|
24
|
+
"description": "The VAD strategy to use."
|
|
25
|
+
},
|
|
26
|
+
"mode": {
|
|
27
|
+
"type": "number",
|
|
28
|
+
"description": "WebRTC VAD aggressiveness mode (0-3). Higher values are more aggressive at filtering non-speech. Only applies when vendor is 'webrtc'.",
|
|
29
|
+
"minimum": 0,
|
|
30
|
+
"maximum": 3
|
|
31
|
+
},
|
|
32
|
+
"vendor": {
|
|
33
|
+
"type": "string",
|
|
34
|
+
"description": "The VAD engine to use.",
|
|
35
|
+
"enum": ["webrtc", "silero"]
|
|
36
|
+
},
|
|
37
|
+
"threshold": {
|
|
38
|
+
"type": "number",
|
|
39
|
+
"description": "Speech detection confidence threshold for Silero VAD. Value between 0 and 1, where higher values require greater confidence. Only applies when vendor is 'silero'.",
|
|
40
|
+
"minimum": 0,
|
|
41
|
+
"maximum": 1
|
|
42
|
+
},
|
|
43
|
+
"speechPadMs": {
|
|
44
|
+
"type": "number",
|
|
45
|
+
"description": "Padding in milliseconds added before and after detected speech segments. Prevents clipping utterance boundaries. Only applies when vendor is 'silero'."
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/jambonz-app",
|
|
4
|
+
"title": "jambonz Application",
|
|
5
|
+
"description": "A jambonz application is an array of verbs that are executed sequentially to control a phone call. Each verb performs an action: speaking text, playing audio, collecting input, dialing a number, connecting to an AI model, etc. When a webhook (actionHook) is invoked, it must return a new verb array to continue call processing.\n\nThe execution model is simple: verbs execute one after another, top to bottom. When a verb with an actionHook completes (e.g. gather collects input), the actionHook is called and its response replaces the remaining verb stack. If the verb array is exhausted without a hangup, the call is terminated.\n\nThere are two transport modes for delivering verb arrays to jambonz:\n- **Webhook**: Your HTTP server receives POST/GET requests with call data and returns JSON verb arrays in the response body.\n- **WebSocket**: Your server maintains a persistent websocket connection with jambonz and sends/receives verb arrays as JSON messages. Required for real-time features like LLM conversations.\n\nThe verb schemas and JSON structure are identical regardless of transport mode.",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"$ref": "#/$defs/Verb"
|
|
9
|
+
},
|
|
10
|
+
"minItems": 1,
|
|
11
|
+
"$defs": {
|
|
12
|
+
"Verb": {
|
|
13
|
+
"oneOf": [
|
|
14
|
+
{ "$ref": "verbs/answer" },
|
|
15
|
+
{ "$ref": "verbs/alert" },
|
|
16
|
+
{ "$ref": "verbs/config" },
|
|
17
|
+
{ "$ref": "verbs/say" },
|
|
18
|
+
{ "$ref": "verbs/play" },
|
|
19
|
+
{ "$ref": "verbs/gather" },
|
|
20
|
+
{ "$ref": "verbs/dial" },
|
|
21
|
+
{ "$ref": "verbs/listen" },
|
|
22
|
+
{ "$ref": "verbs/stream" },
|
|
23
|
+
{ "$ref": "verbs/llm" },
|
|
24
|
+
{ "$ref": "verbs/pipeline" },
|
|
25
|
+
{ "$ref": "verbs/conference" },
|
|
26
|
+
{ "$ref": "verbs/transcribe" },
|
|
27
|
+
{ "$ref": "verbs/enqueue" },
|
|
28
|
+
{ "$ref": "verbs/dequeue" },
|
|
29
|
+
{ "$ref": "verbs/dtmf" },
|
|
30
|
+
{ "$ref": "verbs/dub" },
|
|
31
|
+
{ "$ref": "verbs/hangup" },
|
|
32
|
+
{ "$ref": "verbs/leave" },
|
|
33
|
+
{ "$ref": "verbs/message" },
|
|
34
|
+
{ "$ref": "verbs/pause" },
|
|
35
|
+
{ "$ref": "verbs/redirect" },
|
|
36
|
+
{ "$ref": "verbs/tag" },
|
|
37
|
+
{ "$ref": "verbs/sip-decline" },
|
|
38
|
+
{ "$ref": "verbs/sip-request" },
|
|
39
|
+
{ "$ref": "verbs/sip-refer" }
|
|
40
|
+
],
|
|
41
|
+
"discriminator": {
|
|
42
|
+
"propertyName": "verb"
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
"examples": [
|
|
47
|
+
[
|
|
48
|
+
{
|
|
49
|
+
"verb": "config",
|
|
50
|
+
"synthesizer": { "vendor": "elevenlabs", "voice": "Rachel", "language": "en-US" },
|
|
51
|
+
"recognizer": { "vendor": "deepgram", "language": "en-US" }
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"verb": "say",
|
|
55
|
+
"text": "Hello! Welcome to Acme Corp. How can I help you today?"
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"verb": "gather",
|
|
59
|
+
"input": ["speech"],
|
|
60
|
+
"actionHook": "/process-input",
|
|
61
|
+
"timeout": 15,
|
|
62
|
+
"say": { "text": "I'm listening." }
|
|
63
|
+
}
|
|
64
|
+
],
|
|
65
|
+
[
|
|
66
|
+
{
|
|
67
|
+
"verb": "say",
|
|
68
|
+
"text": "Please hold while I connect you to an agent."
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
"verb": "dial",
|
|
72
|
+
"target": [{ "type": "phone", "number": "+15085551212" }],
|
|
73
|
+
"answerOnBridge": true,
|
|
74
|
+
"timeout": 30,
|
|
75
|
+
"actionHook": "/dial-complete"
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
"verb": "say",
|
|
79
|
+
"text": "Sorry, the agent is not available. Please try again later."
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"verb": "hangup"
|
|
83
|
+
}
|
|
84
|
+
],
|
|
85
|
+
[
|
|
86
|
+
{
|
|
87
|
+
"verb": "config",
|
|
88
|
+
"synthesizer": { "vendor": "cartesia", "voice": "sonic-english" },
|
|
89
|
+
"recognizer": { "vendor": "deepgram", "language": "en-US" }
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"verb": "llm",
|
|
93
|
+
"vendor": "openai",
|
|
94
|
+
"model": "gpt-4o",
|
|
95
|
+
"llmOptions": {
|
|
96
|
+
"messages": [
|
|
97
|
+
{ "role": "system", "content": "You are a helpful customer service agent for Acme Corp. Be concise and friendly." }
|
|
98
|
+
],
|
|
99
|
+
"temperature": 0.7
|
|
100
|
+
},
|
|
101
|
+
"actionHook": "/llm-complete",
|
|
102
|
+
"toolHook": "/llm-tool"
|
|
103
|
+
}
|
|
104
|
+
]
|
|
105
|
+
]
|
|
106
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/verbs/alert",
|
|
4
|
+
"title": "Alert",
|
|
5
|
+
"description": "Sends a 180 Ringing provisional response with an Alert-Info header. Used to trigger a specific ring tone or alert behavior on the caller's device before the call is answered.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"verb": { "const": "alert" },
|
|
9
|
+
"id": { "type": "string", "description": "An optional unique identifier for this verb instance." },
|
|
10
|
+
"message": {
|
|
11
|
+
"type": "string",
|
|
12
|
+
"description": "The value to include in the Alert-Info header.",
|
|
13
|
+
"examples": ["info=alert-internal", "http://example.com/ringtone.wav"]
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"required": ["message"],
|
|
17
|
+
"examples": [
|
|
18
|
+
{ "verb": "alert", "message": "info=alert-internal" }
|
|
19
|
+
]
|
|
20
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://jambonz.org/schema/verbs/answer",
|
|
4
|
+
"title": "Answer",
|
|
5
|
+
"description": "Answers an incoming call (sends a 200 OK to the SIP INVITE). Most verbs implicitly answer the call, so this verb is only needed when you want to explicitly control when the call is answered — for example, to play early media before answering.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"verb": { "const": "answer" },
|
|
9
|
+
"id": { "type": "string", "description": "An optional unique identifier for this verb instance." }
|
|
10
|
+
},
|
|
11
|
+
"examples": [{ "verb": "answer" }]
|
|
12
|
+
}
|