@jambonz/schema 0.3.10 → 0.3.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,19 +11,91 @@
11
11
  },
12
12
  "model": {
13
13
  "type": "string",
14
- "description": "Soniox recognition model."
14
+ "description": "Soniox recognition model (e.g. 'stt-rt-v5')."
15
+ },
16
+ "languageHints": {
17
+ "type": "array",
18
+ "items": { "type": "string" },
19
+ "description": "Language hints for the v5 multilingual model, as ISO language codes (e.g. ['en','es']). Improves accuracy and speeds up language detection."
20
+ },
21
+ "enableLanguageIdentification": {
22
+ "type": "boolean",
23
+ "description": "Enable language identification (v5); each recognized token is tagged with its detected language."
24
+ },
25
+ "enableSpeakerDiarization": {
26
+ "type": "boolean",
27
+ "description": "Enable speaker diarization (v5); each recognized token is tagged with a speaker."
15
28
  },
16
29
  "endpointDetection": {
17
30
  "type": "boolean",
18
31
  "description": "Enable endpoint detection."
19
32
  },
33
+ "endpointSensitivity": {
34
+ "type": "number",
35
+ "minimum": -1,
36
+ "maximum": 1,
37
+ "description": "Endpoint detection sensitivity (v5), -1.0 to 1.0 (default 0.0). Higher values finalize endpoints faster; lower values are more conservative."
38
+ },
39
+ "maxEndpointDelayMs": {
40
+ "type": "integer",
41
+ "minimum": 500,
42
+ "maximum": 3000,
43
+ "description": "Maximum delay in milliseconds before an endpoint is forced (v5), 500-3000 (default 2000)."
44
+ },
45
+ "maxNonFinalTokensDurationMs": {
46
+ "type": "integer",
47
+ "minimum": 0,
48
+ "description": "Maximum duration in milliseconds that tokens may remain non-final before forced finalization (v5)."
49
+ },
20
50
  "profanityFilter": {
21
51
  "type": "boolean",
22
52
  "description": "Filter profanity from results."
23
53
  },
54
+ "context": {
55
+ "type": "object",
56
+ "additionalProperties": false,
57
+ "description": "Soniox v5 recognition context to improve accuracy (max ~8000 tokens). See https://soniox.com/docs/stt/concepts/context. When supplied, this takes precedence over the 'speechContext' and 'hints' shortcuts.",
58
+ "properties": {
59
+ "general": {
60
+ "type": "array",
61
+ "description": "Structured key/value metadata describing the domain, topic, participants, setting, etc.",
62
+ "items": {
63
+ "type": "object",
64
+ "additionalProperties": false,
65
+ "properties": {
66
+ "key": { "type": "string" },
67
+ "value": { "type": "string" }
68
+ },
69
+ "required": ["key", "value"]
70
+ }
71
+ },
72
+ "text": {
73
+ "type": "string",
74
+ "description": "Free-form background text (history of prior interactions, reference documents, meeting notes, summaries)."
75
+ },
76
+ "terms": {
77
+ "type": "array",
78
+ "items": { "type": "string" },
79
+ "description": "Domain-specific vocabulary to boost recognition."
80
+ },
81
+ "translation_terms": {
82
+ "type": "array",
83
+ "description": "Source/target term mappings; only used when translation is enabled.",
84
+ "items": {
85
+ "type": "object",
86
+ "additionalProperties": false,
87
+ "properties": {
88
+ "source": { "type": "string" },
89
+ "target": { "type": "string" }
90
+ },
91
+ "required": ["source", "target"]
92
+ }
93
+ }
94
+ }
95
+ },
24
96
  "speechContext": {
25
97
  "type": "string",
26
- "description": "Speech context for improved recognition."
98
+ "description": "Shortcut for context.text (free-text background). Ignored when the full 'context' object is supplied. The generic recognizer 'hints' are likewise sent as context.terms unless 'context' is supplied."
27
99
  },
28
100
  "clientRequestReference": {
29
101
  "type": "string",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jambonz/schema",
3
- "version": "0.3.10",
3
+ "version": "0.3.12",
4
4
  "description": "JSON Schema definitions and validation for jambonz verb applications",
5
5
  "main": "index.js",
6
6
  "scripts": {
@@ -86,10 +86,26 @@
86
86
  "description": "Allow the user to interrupt the assistant while it is speaking. Default: true.",
87
87
  "default": true
88
88
  },
89
+ "strategy": {
90
+ "type": "string",
91
+ "enum": ["vad", "interruptPrediction"],
92
+ "default": "vad",
93
+ "description": "How interruptions are detected. 'vad' (default): speech onset tentatively interrupts the assistant and is confirmed after minSpeechDuration of sustained speech, otherwise the assistant resumes. 'interruptPrediction': an ML model scores whether caller speech is a genuine interruption attempt vs backchannel (e.g. 'uh-huh'), so the assistant is never tentatively paused and backchannel does not cut it off; requires a provisioned vendor API key."
94
+ },
95
+ "vendor": {
96
+ "type": "string",
97
+ "description": "Detection vendor for strategy 'interruptPrediction'; defaults to 'krisp' (currently the only option). Not used with strategy 'vad'."
98
+ },
99
+ "threshold": {
100
+ "type": "number",
101
+ "minimum": 0,
102
+ "maximum": 1,
103
+ "description": "Probability threshold for interruptPrediction (0-1). Higher values require stronger evidence before interrupting the assistant. Default: 0.5."
104
+ },
89
105
  "minSpeechDuration": {
90
106
  "type": "number",
91
107
  "minimum": 0,
92
- "description": "Seconds of detected speech required before confirming an interruption. Prevents brief noises from cutting off the assistant. Default: 0.5",
108
+ "description": "Seconds of detected speech required before confirming an interruption. Prevents brief noises from cutting off the assistant. Applies to strategy 'vad' only. Default: 0.5.",
93
109
  "default": 0.5
94
110
  },
95
111
  "sticky": {
@@ -183,6 +199,11 @@
183
199
  "minimum": 0,
184
200
  "description": "Sampling temperature."
185
201
  },
202
+ "reasoningEffort": {
203
+ "type": "string",
204
+ "enum": ["minimal", "low", "medium", "high"],
205
+ "description": "Vendor-neutral thinking/reasoning effort. Mapped per-vendor by the LLM adapter (Gemini thinkingLevel, OpenAI reasoning_effort, Anthropic extended thinking); ignored by vendors without a native equivalent. 'minimal' minimizes thinking for the lowest time to first token (TTFT) on latency-sensitive voice turns."
206
+ },
186
207
  "tools": {
187
208
  "type": "array",
188
209
  "description": "Tool / function definitions available to the model. The MCP-flat shape `{name, description, parameters}` is canonical; the OpenAI-wrapped form `{type:'function', function:{...}}` is also accepted.",