npm - @jambonz/schema - Versions diffs - 0.3.10 → 0.3.12 - Mend

@jambonz/schema 0.3.10 → 0.3.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/components/recognizer-sonioxOptions.schema.json +74 -2
package/package.json +1 -1
package/verbs/agent.schema.json +22 -1

package/components/recognizer-sonioxOptions.schema.json CHANGED

@@ -11,19 +11,91 @@
     },
     "model": {
       "type": "string",
-      "description": "Soniox recognition model."
+      "description": "Soniox recognition model (e.g. 'stt-rt-v5')."
+    },
+    "languageHints": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "Language hints for the v5 multilingual model, as ISO language codes (e.g. ['en','es']). Improves accuracy and speeds up language detection."
+    },
+    "enableLanguageIdentification": {
+      "type": "boolean",
+      "description": "Enable language identification (v5); each recognized token is tagged with its detected language."
+    },
+    "enableSpeakerDiarization": {
+      "type": "boolean",
+      "description": "Enable speaker diarization (v5); each recognized token is tagged with a speaker."
     },
     "endpointDetection": {
       "type": "boolean",
       "description": "Enable endpoint detection."
     },
+    "endpointSensitivity": {
+      "type": "number",
+      "minimum": -1,
+      "maximum": 1,
+      "description": "Endpoint detection sensitivity (v5), -1.0 to 1.0 (default 0.0). Higher values finalize endpoints faster; lower values are more conservative."
+    },
+    "maxEndpointDelayMs": {
+      "type": "integer",
+      "minimum": 500,
+      "maximum": 3000,
+      "description": "Maximum delay in milliseconds before an endpoint is forced (v5), 500-3000 (default 2000)."
+    },
+    "maxNonFinalTokensDurationMs": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Maximum duration in milliseconds that tokens may remain non-final before forced finalization (v5)."
+    },
     "profanityFilter": {
       "type": "boolean",
       "description": "Filter profanity from results."
     },
+    "context": {
+      "type": "object",
+      "additionalProperties": false,
+      "description": "Soniox v5 recognition context to improve accuracy (max ~8000 tokens). See https://soniox.com/docs/stt/concepts/context. When supplied this takes precedence outright over the 'speechContext' and 'hints' shortcuts.",
+      "properties": {
+        "general": {
+          "type": "array",
+          "description": "Structured key/value metadata describing the domain, topic, participants, setting, etc.",
+          "items": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "key": { "type": "string" },
+              "value": { "type": "string" }
+            },
+            "required": ["key", "value"]
+          }
+        },
+        "text": {
+          "type": "string",
+          "description": "Free-form background text (history of prior interactions, reference documents, meeting notes, summaries)."
+        },
+        "terms": {
+          "type": "array",
+          "items": { "type": "string" },
+          "description": "Domain-specific vocabulary to boost recognition."
+        },
+        "translation_terms": {
+          "type": "array",
+          "description": "Source/target term mappings; only used when translation is enabled.",
+          "items": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+              "source": { "type": "string" },
+              "target": { "type": "string" }
+            },
+            "required": ["source", "target"]
+          }
+        }
+      }
+    },
     "speechContext": {
       "type": "string",
-      "description": "Speech context for improved recognition."
+      "description": "Shortcut for context.text (free-text background). Ignored when the full 'context' object is supplied. The generic recognizer 'hints' are likewise sent as context.terms unless 'context' is supplied."
     },
     "clientRequestReference": {
       "type": "string",

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@jambonz/schema",
-  "version": "0.3.10",
+  "version": "0.3.12",
   "description": "JSON Schema definitions and validation for jambonz verb applications",
   "main": "index.js",
   "scripts": {

package/verbs/agent.schema.json CHANGED

@@ -86,10 +86,26 @@
           "description": "Allow the user to interrupt the assistant while it is speaking. Default: true.",
           "default": true
         },
+        "strategy": {
+          "type": "string",
+          "enum": ["vad", "interruptPrediction"],
+          "default": "vad",
+          "description": "How interruptions are detected. 'vad' (default): speech onset tentatively interrupts the assistant and is confirmed after minSpeechDuration of sustained speech, otherwise the assistant resumes. 'interruptPrediction': an ML model scores whether caller speech is a genuine interruption attempt vs backchannel (e.g. 'uh-huh'), so the assistant is never tentatively paused and backchannel does not cut it off; requires a provisioned vendor API key."
+        },
+        "vendor": {
+          "type": "string",
+          "description": "Detection vendor for strategy 'interruptPrediction'; defaults to 'krisp' (currently the only option). Not used with strategy 'vad'."
+        },
+        "threshold": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "description": "Probability threshold for interruptPrediction (0-1). Higher values require stronger evidence before interrupting the assistant. Default: 0.5."
+        },
         "minSpeechDuration": {
           "type": "number",
           "minimum": 0,
-          "description": "Seconds of detected speech required before confirming an interruption. Prevents brief noises from cutting off the assistant. Default: 0.5",
+          "description": "Seconds of detected speech required before confirming an interruption. Prevents brief noises from cutting off the assistant. Applies to strategy 'vad' only. Default: 0.5",
           "default": 0.5
         },
         "sticky": {
@@ -183,6 +199,11 @@
               "minimum": 0,
               "description": "Sampling temperature."
             },
+            "reasoningEffort": {
+              "type": "string",
+              "enum": ["minimal", "low", "medium", "high"],
+              "description": "Vendor-neutral thinking/reasoning effort. Mapped per-vendor by the LLM adapter (Gemini thinkingLevel, OpenAI reasoning_effort, Anthropic extended thinking); ignored by vendors without a native equivalent. 'minimal' minimizes thinking for lowest TTFT on latency-sensitive voice turns."
+            },
             "tools": {
               "type": "array",
               "description": "Tool / function definitions available to the model. The MCP-flat shape `{name, description, parameters}` is canonical; the OpenAI-wrapped form `{type:'function', function:{...}}` is also accepted.",