@jambonz/schema 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/AGENTS.md +974 -0
  2. package/callbacks/amd.schema.json +50 -0
  3. package/callbacks/base.schema.json +29 -0
  4. package/callbacks/call-status.schema.json +22 -0
  5. package/callbacks/conference-status.schema.json +24 -0
  6. package/callbacks/conference-wait.schema.json +11 -0
  7. package/callbacks/conference.schema.json +11 -0
  8. package/callbacks/dequeue.schema.json +19 -0
  9. package/callbacks/dial-dtmf.schema.json +18 -0
  10. package/callbacks/dial-hold.schema.json +22 -0
  11. package/callbacks/dial-refer.schema.json +28 -0
  12. package/callbacks/dial.schema.json +31 -0
  13. package/callbacks/enqueue-wait.schema.json +17 -0
  14. package/callbacks/enqueue.schema.json +27 -0
  15. package/callbacks/gather-partial.schema.json +54 -0
  16. package/callbacks/gather.schema.json +60 -0
  17. package/callbacks/listen.schema.json +21 -0
  18. package/callbacks/llm.schema.json +30 -0
  19. package/callbacks/message.schema.json +35 -0
  20. package/callbacks/pipeline-turn.schema.json +109 -0
  21. package/callbacks/play.schema.json +36 -0
  22. package/callbacks/session-new.schema.json +143 -0
  23. package/callbacks/session-reconnect.schema.json +9 -0
  24. package/callbacks/session-redirect.schema.json +38 -0
  25. package/callbacks/sip-refer-event.schema.json +20 -0
  26. package/callbacks/sip-refer.schema.json +22 -0
  27. package/callbacks/sip-request.schema.json +27 -0
  28. package/callbacks/transcribe-translation.schema.json +24 -0
  29. package/callbacks/transcribe.schema.json +46 -0
  30. package/callbacks/tts-streaming-event.schema.json +77 -0
  31. package/callbacks/verb-status.schema.json +57 -0
  32. package/components/actionHook.schema.json +36 -0
  33. package/components/actionHookDelayAction.schema.json +37 -0
  34. package/components/amd.schema.json +68 -0
  35. package/components/auth.schema.json +18 -0
  36. package/components/bidirectionalAudio.schema.json +22 -0
  37. package/components/fillerNoise.schema.json +25 -0
  38. package/components/llm-base.schema.json +94 -0
  39. package/components/recognizer-assemblyAiOptions.schema.json +66 -0
  40. package/components/recognizer-awsOptions.schema.json +52 -0
  41. package/components/recognizer-azureOptions.schema.json +32 -0
  42. package/components/recognizer-cobaltOptions.schema.json +34 -0
  43. package/components/recognizer-customOptions.schema.json +27 -0
  44. package/components/recognizer-deepgramOptions.schema.json +147 -0
  45. package/components/recognizer-elevenlabsOptions.schema.json +39 -0
  46. package/components/recognizer-gladiaOptions.schema.json +8 -0
  47. package/components/recognizer-googleOptions.schema.json +35 -0
  48. package/components/recognizer-houndifyOptions.schema.json +53 -0
  49. package/components/recognizer-ibmOptions.schema.json +54 -0
  50. package/components/recognizer-nuanceOptions.schema.json +150 -0
  51. package/components/recognizer-nvidiaOptions.schema.json +39 -0
  52. package/components/recognizer-openaiOptions.schema.json +59 -0
  53. package/components/recognizer-sonioxOptions.schema.json +46 -0
  54. package/components/recognizer-speechmaticsOptions.schema.json +100 -0
  55. package/components/recognizer-verbioOptions.schema.json +46 -0
  56. package/components/recognizer.schema.json +216 -0
  57. package/components/synthesizer.schema.json +82 -0
  58. package/components/target.schema.json +105 -0
  59. package/components/vad.schema.json +48 -0
  60. package/docs/components/recognizer.md +78 -0
  61. package/docs/components/synthesizer.md +27 -0
  62. package/docs/guides/session-commands.md +417 -0
  63. package/docs/verbs/conference.md +51 -0
  64. package/docs/verbs/deepgram_s2s.md +108 -0
  65. package/docs/verbs/dial.md +8 -0
  66. package/docs/verbs/listen.md +71 -0
  67. package/docs/verbs/pipeline.md +475 -0
  68. package/docs/verbs/stream.md +5 -0
  69. package/index.js +9 -0
  70. package/jambonz-app.schema.json +112 -0
  71. package/lib/normalize.js +72 -0
  72. package/lib/validator.js +137 -0
  73. package/package.json +39 -0
  74. package/verbs/alert.schema.json +34 -0
  75. package/verbs/answer.schema.json +22 -0
  76. package/verbs/conference.schema.json +107 -0
  77. package/verbs/config.schema.json +218 -0
  78. package/verbs/deepgram_s2s.schema.json +81 -0
  79. package/verbs/dequeue.schema.json +51 -0
  80. package/verbs/dial.schema.json +187 -0
  81. package/verbs/dialogflow.schema.json +148 -0
  82. package/verbs/dtmf.schema.json +49 -0
  83. package/verbs/dub.schema.json +103 -0
  84. package/verbs/elevenlabs_s2s.schema.json +81 -0
  85. package/verbs/enqueue.schema.json +53 -0
  86. package/verbs/gather.schema.json +188 -0
  87. package/verbs/google_s2s.schema.json +42 -0
  88. package/verbs/hangup.schema.json +36 -0
  89. package/verbs/leave.schema.json +22 -0
  90. package/verbs/listen.schema.json +127 -0
  91. package/verbs/llm.schema.json +44 -0
  92. package/verbs/message.schema.json +82 -0
  93. package/verbs/openai_s2s.schema.json +42 -0
  94. package/verbs/pause.schema.json +36 -0
  95. package/verbs/pipeline.schema.json +240 -0
  96. package/verbs/play.schema.json +96 -0
  97. package/verbs/redirect.schema.json +34 -0
  98. package/verbs/s2s.schema.json +39 -0
  99. package/verbs/say.schema.json +107 -0
  100. package/verbs/sip-decline.schema.json +58 -0
  101. package/verbs/sip-refer.schema.json +58 -0
  102. package/verbs/sip-request.schema.json +54 -0
  103. package/verbs/stream.schema.json +103 -0
  104. package/verbs/tag.schema.json +41 -0
  105. package/verbs/transcribe.schema.json +57 -0
  106. package/verbs/ultravox_s2s.schema.json +41 -0
@@ -0,0 +1,216 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://jambonz.org/schema/components/recognizer",
4
+ "title": "Recognizer",
5
+ "description": "Configuration for speech-to-text recognition. Specifies the STT vendor, language, and vendor-specific options. Can be set at the session level via the 'config' verb or overridden per-verb (e.g. on 'gather').",
6
+ "type": "object",
7
+ "properties": {
8
+ "vendor": {
9
+ "type": "string",
10
+ "description": "The STT vendor to use. Must match a vendor configured in the jambonz platform.",
11
+ "examples": ["google", "aws", "microsoft", "deepgram", "nuance", "ibm", "nvidia", "soniox", "cobalt", "assemblyai", "speechmatics", "openai", "houndify", "gladia", "elevenlabs", "verbio", "custom"]
12
+ },
13
+ "label": {
14
+ "type": "string",
15
+ "description": "An optional label identifying a specific credential set for this vendor. Used when multiple credentials are configured for the same vendor."
16
+ },
17
+ "language": {
18
+ "type": "string",
19
+ "description": "The language code for speech recognition, in BCP-47 format.",
20
+ "examples": ["en-US", "en-GB", "es-ES", "fr-FR"]
21
+ },
22
+ "fallbackVendor": {
23
+ "type": "string",
24
+ "description": "A backup STT vendor to use if the primary vendor fails or is unavailable."
25
+ },
26
+ "fallbackLabel": {
27
+ "type": "string",
28
+ "description": "Credential label for the fallback vendor."
29
+ },
30
+ "fallbackLanguage": {
31
+ "type": "string",
32
+ "description": "Language code to use with the fallback vendor."
33
+ },
34
+ "vad": {
35
+ "$ref": "vad",
36
+ "description": "Voice activity detection settings for this recognizer."
37
+ },
38
+ "autogeneratePrompt": {
39
+ "type": "boolean",
40
+ "description": "If true, automatically generate a prompt for the STT vendor based on context (e.g. TTS voice, language). Supported by vendors that accept prompts for recognition guidance."
41
+ },
42
+ "hints": {
43
+ "type": "array",
44
+ "items": { "type": "string" },
45
+ "description": "An array of words or phrases that the recognizer should favor. Use this to improve accuracy for domain-specific terminology, product names, or proper nouns.",
46
+ "examples": [["jambonz", "drachtio", "SIP", "WebRTC"]]
47
+ },
48
+ "hintsBoost": {
49
+ "type": "number",
50
+ "description": "A boost factor for hint words. Higher values increase the likelihood of recognizing hinted words. Vendor-specific range."
51
+ },
52
+ "altLanguages": {
53
+ "type": "array",
54
+ "items": { "type": "string" },
55
+ "description": "Additional languages the recognizer should listen for simultaneously. Enables multilingual recognition.",
56
+ "examples": [["es-ES", "fr-FR"]]
57
+ },
58
+ "profanityFilter": {
59
+ "type": "boolean",
60
+ "description": "If true, the vendor will attempt to filter profanity from transcription results."
61
+ },
62
+ "interim": {
63
+ "type": "boolean",
64
+ "description": "If true, return interim (partial) transcription results as they become available, before the utterance is complete."
65
+ },
66
+ "singleUtterance": {
67
+ "type": "boolean",
68
+ "description": "If true, recognition stops after the first complete utterance is detected."
69
+ },
70
+ "dualChannel": {
71
+ "type": "boolean",
72
+ "description": "If true, send separate audio channels for each call leg (caller and callee) to the recognizer."
73
+ },
74
+ "separateRecognitionPerChannel": {
75
+ "type": "boolean",
76
+ "description": "If true, perform independent recognition on each audio channel. Requires dualChannel."
77
+ },
78
+ "punctuation": {
79
+ "type": "boolean",
80
+ "description": "If true, enable automatic punctuation in transcription results."
81
+ },
82
+ "enhancedModel": {
83
+ "type": "boolean",
84
+ "description": "If true, use an enhanced (premium) recognition model if available from the vendor."
85
+ },
86
+ "words": {
87
+ "type": "boolean",
88
+ "description": "If true, include word-level timing information in transcription results."
89
+ },
90
+ "diarization": {
91
+ "type": "boolean",
92
+ "description": "If true, enable speaker diarization to identify different speakers in the audio."
93
+ },
94
+ "diarizationMinSpeakers": {
95
+ "type": "number",
96
+ "description": "Minimum number of speakers expected. Used to guide the diarization algorithm."
97
+ },
98
+ "diarizationMaxSpeakers": {
99
+ "type": "number",
100
+ "description": "Maximum number of speakers expected. Used to guide the diarization algorithm."
101
+ },
102
+ "interactionType": {
103
+ "type": "string",
104
+ "description": "A hint to the recognizer about the type of interaction, which can improve accuracy.",
105
+ "enum": ["unspecified", "discussion", "presentation", "phone_call", "voicemail", "voice_search", "voice_command", "dictation"]
106
+ },
107
+ "naicsCode": {
108
+ "type": "number",
109
+ "description": "North American Industry Classification System code. Some vendors use this to improve domain-specific accuracy."
110
+ },
111
+ "identifyChannels": {
112
+ "type": "boolean",
113
+ "description": "If true, identify and label which channel each transcription segment came from."
114
+ },
115
+ "vocabularyName": {
116
+ "type": "string",
117
+ "description": "Name of a custom vocabulary resource configured at the vendor for improved recognition of specialized terms."
118
+ },
119
+ "vocabularyFilterName": {
120
+ "type": "string",
121
+ "description": "Name of a vocabulary filter configured at the vendor for masking or removing specific words."
122
+ },
123
+ "filterMethod": {
124
+ "type": "string",
125
+ "description": "How filtered vocabulary words should be handled in the transcript.",
126
+ "enum": ["remove", "mask", "tag"]
127
+ },
128
+ "model": {
129
+ "type": "string",
130
+ "description": "The specific recognition model to use. Model names are vendor-specific.",
131
+ "examples": ["latest_long", "phone_call", "nova-2", "chirp"]
132
+ },
133
+ "outputFormat": {
134
+ "type": "string",
135
+ "description": "The level of detail in recognition results.",
136
+ "enum": ["simple", "detailed"]
137
+ },
138
+ "profanityOption": {
139
+ "type": "string",
140
+ "description": "How profanity should be handled in results.",
141
+ "enum": ["masked", "removed", "raw"]
142
+ },
143
+ "requestSnr": {
144
+ "type": "boolean",
145
+ "description": "If true, request signal-to-noise ratio information in results."
146
+ },
147
+ "initialSpeechTimeoutMs": {
148
+ "type": "number",
149
+ "description": "Time in milliseconds to wait for initial speech before timing out.",
150
+ "examples": [5000]
151
+ },
152
+ "azureServiceEndpoint": {
153
+ "type": "string",
154
+ "description": "Custom Azure Speech Services endpoint URL. Only applies when vendor is 'microsoft'."
155
+ },
156
+ "azureSttEndpointId": {
157
+ "type": "string",
158
+ "description": "Azure custom speech endpoint ID for using a custom-trained model. Only applies when vendor is 'microsoft'."
159
+ },
160
+ "asrDtmfTerminationDigit": {
161
+ "type": "string",
162
+ "description": "A DTMF digit that terminates speech recognition when pressed.",
163
+ "examples": ["#"]
164
+ },
165
+ "asrTimeout": {
166
+ "type": "number",
167
+ "description": "Maximum time in seconds to wait for a complete recognition result."
168
+ },
169
+ "fastRecognitionTimeout": {
170
+ "type": "number",
171
+ "description": "Timeout in seconds for fast recognition mode. Shorter timeout for quick responses."
172
+ },
173
+ "minConfidence": {
174
+ "type": "number",
175
+ "description": "Minimum confidence score (0-1) required to accept a recognition result. Results below this threshold are discarded.",
176
+ "minimum": 0,
177
+ "maximum": 1
178
+ },
179
+ "deepgramOptions": { "$ref": "recognizer-deepgramOptions" },
180
+ "googleOptions": { "$ref": "recognizer-googleOptions" },
181
+ "awsOptions": { "$ref": "recognizer-awsOptions" },
182
+ "azureOptions": { "$ref": "recognizer-azureOptions" },
183
+ "nuanceOptions": { "$ref": "recognizer-nuanceOptions" },
184
+ "ibmOptions": { "$ref": "recognizer-ibmOptions" },
185
+ "nvidiaOptions": { "$ref": "recognizer-nvidiaOptions" },
186
+ "sonioxOptions": { "$ref": "recognizer-sonioxOptions" },
187
+ "cobaltOptions": { "$ref": "recognizer-cobaltOptions" },
188
+ "assemblyAiOptions": { "$ref": "recognizer-assemblyAiOptions" },
189
+ "speechmaticsOptions": { "$ref": "recognizer-speechmaticsOptions" },
190
+ "openaiOptions": { "$ref": "recognizer-openaiOptions" },
191
+ "houndifyOptions": { "$ref": "recognizer-houndifyOptions" },
192
+ "gladiaOptions": { "$ref": "recognizer-gladiaOptions" },
193
+ "elevenlabsOptions": { "$ref": "recognizer-elevenlabsOptions" },
194
+ "verbioOptions": { "$ref": "recognizer-verbioOptions" },
195
+ "customOptions": { "$ref": "recognizer-customOptions" }
196
+ },
197
+ "required": ["vendor"],
198
+ "examples": [
199
+ {
200
+ "vendor": "deepgram",
201
+ "language": "en-US",
202
+ "deepgramOptions": {
203
+ "model": "nova-2",
204
+ "smartFormatting": true,
205
+ "endpointing": 500
206
+ }
207
+ },
208
+ {
209
+ "vendor": "google",
210
+ "language": "en-US",
211
+ "hints": ["jambonz", "drachtio"],
212
+ "punctuation": true,
213
+ "enhancedModel": true
214
+ }
215
+ ]
216
+ }
@@ -0,0 +1,82 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://jambonz.org/schema/components/synthesizer",
4
+ "title": "Synthesizer",
5
+ "description": "Configuration for text-to-speech synthesis. Specifies the TTS vendor, voice, language, and vendor-specific options. Can be set at the session level via the 'config' verb or overridden per-verb (e.g. on 'say').",
6
+ "type": "object",
7
+ "properties": {
8
+ "vendor": {
9
+ "type": "string",
10
+ "description": "The TTS vendor to use. Must match a vendor configured in the jambonz platform.",
11
+ "examples": ["google", "aws", "microsoft", "elevenlabs", "cartesia", "deepgram", "ibm", "nuance", "nvidia", "wellsaid", "whisper", "verbio", "custom"]
12
+ },
13
+ "label": {
14
+ "type": "string",
15
+ "description": "An optional label identifying a specific credential set for this vendor. Used when multiple credentials are configured for the same vendor on the jambonz platform."
16
+ },
17
+ "language": {
18
+ "type": "string",
19
+ "description": "The language code for speech synthesis, in BCP-47 format.",
20
+ "examples": ["en-US", "en-GB", "es-ES", "fr-FR", "de-DE"]
21
+ },
22
+ "voice": {
23
+ "oneOf": [
24
+ { "type": "string" },
25
+ { "type": "object", "additionalProperties": true }
26
+ ],
27
+ "description": "The voice to use for synthesis. Format varies by vendor: Google uses voice names like 'en-US-Wavenet-D', AWS Polly uses names like 'Joanna', but ElevenLabs and Cartesia require voice IDs (alphanumeric strings like 'EXAVITQu4vr4xnSDxMaL'), not human-readable names. Some vendors accept an object for more complex voice configuration.",
28
+ "examples": ["en-US-Wavenet-D", "Joanna", "EXAVITQu4vr4xnSDxMaL"]
29
+ },
30
+ "fallbackVendor": {
31
+ "type": "string",
32
+ "description": "A backup TTS vendor to use if the primary vendor fails or is unavailable."
33
+ },
34
+ "fallbackLabel": {
35
+ "type": "string",
36
+ "description": "Credential label for the fallback vendor."
37
+ },
38
+ "fallbackLanguage": {
39
+ "type": "string",
40
+ "description": "Language code to use with the fallback vendor."
41
+ },
42
+ "fallbackVoice": {
43
+ "oneOf": [
44
+ { "type": "string" },
45
+ { "type": "object", "additionalProperties": true }
46
+ ],
47
+ "description": "Voice to use with the fallback vendor."
48
+ },
49
+ "engine": {
50
+ "type": "string",
51
+ "description": "The synthesis engine tier to use. Availability depends on the vendor.",
52
+ "enum": ["standard", "neural", "generative", "long-form"]
53
+ },
54
+ "gender": {
55
+ "type": "string",
56
+ "description": "Preferred voice gender. Used by some vendors (e.g. Google) when a specific voice is not specified.",
57
+ "enum": ["MALE", "FEMALE", "NEUTRAL"]
58
+ },
59
+ "options": {
60
+ "type": "object",
61
+ "description": "Vendor-specific options passed through to the TTS provider. The structure depends on the vendor being used.",
62
+ "additionalProperties": true
63
+ }
64
+ },
65
+ "required": ["vendor"],
66
+ "examples": [
67
+ {
68
+ "vendor": "google",
69
+ "language": "en-US",
70
+ "voice": "en-US-Wavenet-D"
71
+ },
72
+ {
73
+ "vendor": "elevenlabs",
74
+ "voice": "EXAVITQu4vr4xnSDxMaL",
75
+ "options": {
76
+ "model_id": "eleven_turbo_v2",
77
+ "stability": 0.5,
78
+ "similarity_boost": 0.75
79
+ }
80
+ }
81
+ ]
82
+ }
@@ -0,0 +1,105 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://jambonz.org/schema/components/target",
4
+ "title": "Target",
5
+ "description": "A call target for the 'dial' verb. Specifies who or what to connect the call to: a phone number (PSTN), a SIP endpoint, a registered user, or a Microsoft Teams user.",
6
+ "type": "object",
7
+ "properties": {
8
+ "type": {
9
+ "type": "string",
10
+ "description": "The type of target to dial.",
11
+ "enum": ["phone", "sip", "user", "teams"]
12
+ },
13
+ "number": {
14
+ "type": "string",
15
+ "description": "The phone number to dial. Required when type is 'phone'; also used to identify the user to dial when type is 'teams'. Use E.164 format.",
16
+ "examples": ["+15085551212"]
17
+ },
18
+ "sipUri": {
19
+ "type": "string",
20
+ "description": "The SIP URI to dial. Required when type is 'sip'.",
21
+ "examples": ["sip:alice@example.com"]
22
+ },
23
+ "name": {
24
+ "type": "string",
25
+ "description": "The registered user name to dial. Required when type is 'user'. Also used as the display name for SIP targets."
26
+ },
27
+ "tenant": {
28
+ "type": "string",
29
+ "description": "The Microsoft Teams tenant ID. Required when type is 'teams'."
30
+ },
31
+ "trunk": {
32
+ "type": "string",
33
+ "description": "The SIP trunk to use for the outbound call. When specified, overrides the default carrier routing."
34
+ },
35
+ "confirmHook": {
36
+ "oneOf": [
37
+ { "type": "string", "format": "uri" },
38
+ { "$ref": "actionHook" }
39
+ ],
40
+ "description": "A webhook to invoke when the target answers, before connecting the call. Use this to screen calls, play a whisper prompt, or require the target to press a key to accept."
41
+ },
42
+ "method": {
43
+ "type": "string",
44
+ "description": "The HTTP method to use when invoking the confirmHook.",
45
+ "enum": ["GET", "POST"],
46
+ "default": "POST"
47
+ },
48
+ "headers": {
49
+ "type": "object",
50
+ "description": "Custom SIP headers to include on the outbound INVITE. Keys are header names, values are header values.",
51
+ "additionalProperties": { "type": "string" }
52
+ },
53
+ "from": {
54
+ "type": "object",
55
+ "description": "Override the From header on the outbound SIP INVITE.",
56
+ "properties": {
57
+ "user": {
58
+ "type": "string",
59
+ "description": "The user part of the SIP From URI."
60
+ },
61
+ "host": {
62
+ "type": "string",
63
+ "description": "The host part of the SIP From URI."
64
+ }
65
+ }
66
+ },
67
+ "auth": {
68
+ "$ref": "auth",
69
+ "description": "SIP authentication credentials for the outbound call, if the far end requires digest auth."
70
+ },
71
+ "vmail": {
72
+ "type": "boolean",
73
+ "description": "If true, follow the call into voicemail if the target does not answer."
74
+ },
75
+ "overrideTo": {
76
+ "type": "string",
77
+ "description": "Override the Request-URI on the outbound SIP INVITE. Useful when the Request-URI needs to differ from the To header."
78
+ },
79
+ "proxy": {
80
+ "type": "string",
81
+ "description": "A SIP proxy to route the outbound call through, specified as a SIP URI.",
82
+ "examples": ["sip:proxy.example.com"]
83
+ }
84
+ },
85
+ "required": ["type"],
86
+ "examples": [
87
+ {
88
+ "type": "phone",
89
+ "number": "+15085551212"
90
+ },
91
+ {
92
+ "type": "sip",
93
+ "sipUri": "sip:alice@example.com"
94
+ },
95
+ {
96
+ "type": "user",
97
+ "name": "bob"
98
+ },
99
+ {
100
+ "type": "teams",
101
+ "number": "+15085551212",
102
+ "tenant": "00000000-0000-0000-0000-000000000000"
103
+ }
104
+ ]
105
+ }
@@ -0,0 +1,48 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://jambonz.org/schema/components/vad",
4
+ "title": "VAD",
5
+ "description": "Voice Activity Detection configuration. Controls how jambonz detects the presence or absence of speech on the audio channel. Used to determine speech start/end boundaries for recognition and barge-in.",
6
+ "type": "object",
7
+ "properties": {
8
+ "enable": {
9
+ "type": "boolean",
10
+ "description": "Whether to enable voice activity detection."
11
+ },
12
+ "voiceMs": {
13
+ "type": "number",
14
+ "description": "Duration of voice activity (in milliseconds) required before speech is considered to have started.",
15
+ "examples": [250]
16
+ },
17
+ "silenceMs": {
18
+ "type": "number",
19
+ "description": "Duration of silence (in milliseconds) required before speech is considered to have ended.",
20
+ "examples": [1000]
21
+ },
22
+ "strategy": {
23
+ "type": "string",
24
+ "description": "The VAD strategy to use."
25
+ },
26
+ "mode": {
27
+ "type": "number",
28
+ "description": "WebRTC VAD aggressiveness mode (0-3). Higher values are more aggressive at filtering non-speech. Only applies when vendor is 'webrtc'.",
29
+ "minimum": 0,
30
+ "maximum": 3
31
+ },
32
+ "vendor": {
33
+ "type": "string",
34
+ "description": "The VAD engine to use.",
35
+ "enum": ["webrtc", "silero"]
36
+ },
37
+ "threshold": {
38
+ "type": "number",
39
+ "description": "Speech detection confidence threshold for Silero VAD. Value between 0 and 1, where higher values require greater confidence. Only applies when vendor is 'silero'.",
40
+ "minimum": 0,
41
+ "maximum": 1
42
+ },
43
+ "speechPadMs": {
44
+ "type": "number",
45
+ "description": "Padding in milliseconds added before and after detected speech segments. Prevents clipping utterance boundaries. Only applies when vendor is 'silero'."
46
+ }
47
+ }
48
+ }
@@ -0,0 +1,78 @@
1
+ ## Vendor documentation links
2
+
3
+ Refer to vendor documentation for supported models and languages.
4
+
5
+ **Important**: jambonz requires real-time streaming STT. When choosing a model, ensure it supports real-time/streaming transcription. Models that only support batch transcription cannot be used.
6
+
7
+ ### Deepgram
8
+ - When using Deepgram, default to the latest Nova model (e.g. `nova-3`).
9
+ - [Models & Languages Overview](https://developers.deepgram.com/docs/models-languages-overview)
10
+
11
+ ### Google
12
+
13
+ - [Supported Languages](https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages)
14
+
15
+ ### Microsoft Azure
16
+
17
+ - [Language and Voice Support](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support)
18
+
19
+ ### AWS Transcribe
20
+
21
+ - [Supported Languages](https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html)
22
+
23
+ ### IBM Watson
24
+
25
+ - [Models and Languages](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng)
26
+
27
+ ### AssemblyAI
28
+
29
+ - [Supported Languages](https://www.assemblyai.com/docs/getting-started/supported-languages)
30
+
31
+ #### Prompting (Universal-3 Pro)
32
+
33
+ AssemblyAI's Universal-3 Pro streaming model supports a `prompt` parameter that guides transcription behavior around punctuation, disfluencies, formatting, and domain-specific terminology. The default prompt achieves strong turn detection accuracy out of the box — only customize if needed, and start by extending the default rather than replacing it.
34
+
35
+ - See [Prompting guide](https://www.assemblyai.com/docs/streaming/universal-3-pro/prompting)
36
+
37
+ jambonz has an `autogeneratePrompt` recognizer setting which, when used with AssemblyAI's Universal-3 Pro streaming model, automatically creates the prompt for a `gather` verb based on the text in its nested `say` property. See [this example](../../examples/assemblyai-autogenerate-prompt/) for details.
38
+
39
+ A `keyterms` array can boost recognition of specific names, brands, or technical terms. This can be updated mid-stream, making it useful for voice agent scenarios where context changes during the call.
40
+
41
+ - See [Keyterms guide](https://www.assemblyai.com/docs/streaming/keyterms-prompting)
42
+
43
+ ### OpenAI (Whisper)
44
+
45
+ - [Speech to Text Guide](https://platform.openai.com/docs/guides/speech-to-text)
46
+
47
+ ### Nvidia Riva
48
+
49
+ - [ASR Overview](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-overview.html)
50
+
51
+ ### Speechmatics
52
+
53
+ - [Transcription Languages](https://docs.speechmatics.com/speech-to-text/languages#transcription-languages)
54
+
55
+ #### Voice Agent (Preview)
56
+
57
+ Speechmatics offers a Voice Agent API (currently in preview) that provides low-latency conversational AI capabilities. When using the Voice Agent API, set the `host` and `profile` properties in `speechmaticsOptions`:
58
+
59
+ - `host` - the Speechmatics Voice Agent endpoint URL
60
+ - `profile` - one of `adaptive`, `agile`, `smart`, or `external`
61
+
62
+ See [Voice Agent API documentation](https://docs.speechmatics.com/private/voice-agent-api#introduction) for details.
63
+
64
+ ### Soniox
65
+
66
+ - [STT Models](https://soniox.com/docs/stt/models)
67
+
68
+ ### Verbio
69
+
70
+ - [Supported Languages](https://www.verbio.com/supported-languages)
71
+
72
+ ### Gladia
73
+
74
+ - [Supported Languages](https://docs.gladia.io/chapters/language/supported-languages)
75
+
76
+ ### Nuance
77
+
78
+ - [ASR gRPC API](https://docs.mix.nuance.com/asr-grpc/v1/) (Nuance is now part of Microsoft; Azure Speech Service is the successor)
@@ -0,0 +1,27 @@
1
+ ## Vendor documentation links
2
+
3
+ Refer to vendor documentation for supported models, voices, and vendor-specific options.
4
+
5
+ ### Deepgram
6
+
7
+ - [TTS Models](https://developers.deepgram.com/docs/tts-models)
8
+
9
+ ### Google
10
+
11
+ - [Supported Voices and Languages](https://cloud.google.com/text-to-speech/docs/voices)
12
+
13
+ ### Microsoft Azure
14
+
15
+ - [Language and Voice Support](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)
16
+
17
+ ### AWS Polly
18
+
19
+ - [Available Voices](https://docs.aws.amazon.com/polly/latest/dg/voicelist.html)
20
+
21
+ ### ElevenLabs
22
+
23
+ - [Models](https://elevenlabs.io/docs/overview/models)
24
+
25
+ ### Cartesia
26
+
27
+ - [TTS Models](https://docs.cartesia.ai/build-with-cartesia/tts-models)