@volley/recognition-client-sdk 0.1.621 → 0.1.670

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -12,11 +12,14 @@ declare enum RecognitionProvider {
12
12
  DEEPGRAM = "deepgram",
13
13
  ELEVENLABS = "elevenlabs",
14
14
  FIREWORKS = "fireworks",
15
+ GLADIA = "gladia",
15
16
  GOOGLE = "google",
16
17
  GEMINI_BATCH = "gemini-batch",
17
18
  OPENAI_BATCH = "openai-batch",
19
+ SELF_SERVE_VLLM = "self-serve-vllm",
18
20
  OPENAI_REALTIME = "openai-realtime",
19
21
  MISTRAL_VOXTRAL = "mistral-voxtral",
22
+ CARTESIA = "cartesia",
20
23
  DASHSCOPE = "dashscope",
21
24
  TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
22
25
  TEST_ASR_STREAMING = "test-asr-streaming"
@@ -67,6 +70,13 @@ declare enum FireworksModel {
67
70
  WHISPER_V3 = "whisper-v3",
68
71
  WHISPER_V3_TURBO = "whisper-v3-turbo"
69
72
  }
73
+ /**
74
+ * Gladia Solaria realtime transcription models
75
+ * @see https://docs.gladia.io/api-reference/v2/live/init
76
+ */
77
+ declare enum GladiaModel {
78
+ SOLARIA_1 = "solaria-1"
79
+ }
70
80
  /**
71
81
  * ElevenLabs Scribe models for speech-to-text
72
82
  * @see https://elevenlabs.io/blog/introducing-scribe-v2-realtime
@@ -74,8 +84,7 @@ declare enum FireworksModel {
74
84
  * @see https://elevenlabs.io/docs/api-reference/speech-to-text/convert
75
85
  */
76
86
  declare enum ElevenLabsModel {
77
- SCRIBE_V2_REALTIME = "scribe_v2_realtime",
78
- SCRIBE_V1 = "scribe_v1"
87
+ SCRIBE_V2_REALTIME = "scribe_v2_realtime"
79
88
  }
80
89
  /**
81
90
  * OpenAI Realtime API transcription models
@@ -94,6 +103,14 @@ declare enum OpenAIRealtimeModel {
94
103
  declare enum MistralVoxtralModel {
95
104
  VOXTRAL_MINI_REALTIME_2602 = "voxtral-mini-transcribe-realtime-2602"
96
105
  }
106
+ /**
107
+ * Cartesia Ink-Whisper Realtime transcription models
108
+ * @see https://docs.cartesia.ai/build-with-cartesia/stt-models
109
+ */
110
+ declare enum CartesiaModel {
111
+ INK_WHISPER = "ink-whisper",
112
+ INK_WHISPER_20250604 = "ink-whisper-2025-06-04"
113
+ }
97
114
  /**
98
115
  * DashScope Qwen-ASR Realtime transcription models
99
116
  * @see https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition
@@ -102,10 +119,17 @@ declare enum DashScopeModel {
102
119
  QWEN3_ASR_FLASH_REALTIME_2602 = "qwen3-asr-flash-realtime-2026-02-10",
103
120
  QWEN3_ASR_FLASH_REALTIME = "qwen3-asr-flash-realtime"
104
121
  }
122
+ /**
123
+ * Self-serve vLLM batch transcription models
124
+ * Backed by recognition-inference / RunPod `/transcribe`
125
+ */
126
+ declare enum SelfServeVllmModel {
127
+ QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
128
+ }
105
129
  /**
106
130
  * Type alias for any model from any provider
107
131
  */
108
- type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | DashScopeModel | string;
132
+ type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | GladiaModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | CartesiaModel | DashScopeModel | SelfServeVllmModel | string;
109
133
 
110
134
  /**
111
135
  * Audio encoding types
@@ -260,6 +284,7 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
260
284
  voiceStart: z.ZodOptional<z.ZodNumber>;
261
285
  voiceDuration: z.ZodOptional<z.ZodNumber>;
262
286
  voiceEnd: z.ZodOptional<z.ZodNumber>;
287
+ lastNonSilence: z.ZodOptional<z.ZodNumber>;
263
288
  startTimestamp: z.ZodOptional<z.ZodNumber>;
264
289
  endTimestamp: z.ZodOptional<z.ZodNumber>;
265
290
  receivedAtMs: z.ZodOptional<z.ZodNumber>;
@@ -278,6 +303,7 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
278
303
  voiceStart?: number | undefined;
279
304
  voiceDuration?: number | undefined;
280
305
  voiceEnd?: number | undefined;
306
+ lastNonSilence?: number | undefined;
281
307
  startTimestamp?: number | undefined;
282
308
  endTimestamp?: number | undefined;
283
309
  receivedAtMs?: number | undefined;
@@ -296,6 +322,7 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
296
322
  voiceStart?: number | undefined;
297
323
  voiceDuration?: number | undefined;
298
324
  voiceEnd?: number | undefined;
325
+ lastNonSilence?: number | undefined;
299
326
  startTimestamp?: number | undefined;
300
327
  endTimestamp?: number | undefined;
301
328
  receivedAtMs?: number | undefined;
@@ -366,6 +393,8 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
366
393
  rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
367
394
  costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
368
395
  apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
396
+ provider: z.ZodOptional<z.ZodString>;
397
+ model: z.ZodOptional<z.ZodString>;
369
398
  asrConfig: z.ZodOptional<z.ZodString>;
370
399
  rawAsrMetadata: z.ZodOptional<z.ZodString>;
371
400
  transcriptOutcome: z.ZodOptional<z.ZodNativeEnum<typeof TranscriptOutcomeType>>;
@@ -420,6 +449,8 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
420
449
  rawAudioTimeMs?: number | undefined;
421
450
  costInUSD?: number | undefined;
422
451
  apiType?: ASRApiType | undefined;
452
+ provider?: string | undefined;
453
+ model?: string | undefined;
423
454
  asrConfig?: string | undefined;
424
455
  rawAsrMetadata?: string | undefined;
425
456
  transcriptOutcome?: TranscriptOutcomeType | undefined;
@@ -450,6 +481,8 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
450
481
  rawAudioTimeMs?: number | undefined;
451
482
  costInUSD?: number | undefined;
452
483
  apiType?: ASRApiType | undefined;
484
+ provider?: string | undefined;
485
+ model?: string | undefined;
453
486
  asrConfig?: string | undefined;
454
487
  rawAsrMetadata?: string | undefined;
455
488
  transcriptOutcome?: TranscriptOutcomeType | undefined;