@volley/recognition-client-sdk 0.1.621 → 0.1.670

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3737,11 +3737,14 @@ var RecognitionProvider;
3737
3737
  RecognitionProvider2["DEEPGRAM"] = "deepgram";
3738
3738
  RecognitionProvider2["ELEVENLABS"] = "elevenlabs";
3739
3739
  RecognitionProvider2["FIREWORKS"] = "fireworks";
3740
+ RecognitionProvider2["GLADIA"] = "gladia";
3740
3741
  RecognitionProvider2["GOOGLE"] = "google";
3741
3742
  RecognitionProvider2["GEMINI_BATCH"] = "gemini-batch";
3742
3743
  RecognitionProvider2["OPENAI_BATCH"] = "openai-batch";
3744
+ RecognitionProvider2["SELF_SERVE_VLLM"] = "self-serve-vllm";
3743
3745
  RecognitionProvider2["OPENAI_REALTIME"] = "openai-realtime";
3744
3746
  RecognitionProvider2["MISTRAL_VOXTRAL"] = "mistral-voxtral";
3747
+ RecognitionProvider2["CARTESIA"] = "cartesia";
3745
3748
  RecognitionProvider2["DASHSCOPE"] = "dashscope";
3746
3749
  RecognitionProvider2["TEST_ASR_PROVIDER_QUOTA"] = "test-asr-provider-quota";
3747
3750
  RecognitionProvider2["TEST_ASR_STREAMING"] = "test-asr-streaming";
@@ -3783,10 +3786,13 @@ var FireworksModel;
3783
3786
  FireworksModel2["WHISPER_V3"] = "whisper-v3";
3784
3787
  FireworksModel2["WHISPER_V3_TURBO"] = "whisper-v3-turbo";
3785
3788
  })(FireworksModel || (FireworksModel = {}));
3789
+ var GladiaModel;
3790
+ (function(GladiaModel2) {
3791
+ GladiaModel2["SOLARIA_1"] = "solaria-1";
3792
+ })(GladiaModel || (GladiaModel = {}));
3786
3793
  var ElevenLabsModel;
3787
3794
  (function(ElevenLabsModel2) {
3788
3795
  ElevenLabsModel2["SCRIBE_V2_REALTIME"] = "scribe_v2_realtime";
3789
- ElevenLabsModel2["SCRIBE_V1"] = "scribe_v1";
3790
3796
  })(ElevenLabsModel || (ElevenLabsModel = {}));
3791
3797
  var OpenAIRealtimeModel;
3792
3798
  (function(OpenAIRealtimeModel2) {
@@ -3797,11 +3803,20 @@ var MistralVoxtralModel;
3797
3803
  (function(MistralVoxtralModel2) {
3798
3804
  MistralVoxtralModel2["VOXTRAL_MINI_REALTIME_2602"] = "voxtral-mini-transcribe-realtime-2602";
3799
3805
  })(MistralVoxtralModel || (MistralVoxtralModel = {}));
3806
+ var CartesiaModel;
3807
+ (function(CartesiaModel2) {
3808
+ CartesiaModel2["INK_WHISPER"] = "ink-whisper";
3809
+ CartesiaModel2["INK_WHISPER_20250604"] = "ink-whisper-2025-06-04";
3810
+ })(CartesiaModel || (CartesiaModel = {}));
3800
3811
  var DashScopeModel;
3801
3812
  (function(DashScopeModel2) {
3802
3813
  DashScopeModel2["QWEN3_ASR_FLASH_REALTIME_2602"] = "qwen3-asr-flash-realtime-2026-02-10";
3803
3814
  DashScopeModel2["QWEN3_ASR_FLASH_REALTIME"] = "qwen3-asr-flash-realtime";
3804
3815
  })(DashScopeModel || (DashScopeModel = {}));
3816
+ var SelfServeVllmModel;
3817
+ (function(SelfServeVllmModel2) {
3818
+ SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
3819
+ })(SelfServeVllmModel || (SelfServeVllmModel = {}));
3805
3820
 
3806
3821
  // ../../libs/types/dist/recognition-result-v1.types.js
3807
3822
  var RecognitionResultTypeV1;
@@ -3826,6 +3841,7 @@ var TranscriptionResultSchemaV1 = z.object({
3826
3841
  voiceStart: z.number().optional(),
3827
3842
  voiceDuration: z.number().optional(),
3828
3843
  voiceEnd: z.number().optional(),
3844
+ lastNonSilence: z.number().optional(),
3829
3845
  startTimestamp: z.number().optional(),
3830
3846
  endTimestamp: z.number().optional(),
3831
3847
  receivedAtMs: z.number().optional(),
@@ -3873,6 +3889,9 @@ var MetadataResultSchemaV1 = z.object({
3873
3889
  costInUSD: z.number().default(0).optional(),
3874
3890
  // ASR API Type
3875
3891
  apiType: z.nativeEnum(ASRApiType).optional(),
3892
+ // Provider identification
3893
+ provider: z.string().optional(),
3894
+ model: z.string().optional(),
3876
3895
  // ASR configuration as JSON string (no type validation)
3877
3896
  asrConfig: z.string().optional(),
3878
3897
  // Raw ASR metadata payload as provided by the provider (stringified if needed)
@@ -5248,7 +5267,7 @@ var MessageHandler = class {
5248
5267
  }
5249
5268
  if (msg.data && typeof msg.data !== "object") {
5250
5269
  if (this.callbacks.logger) {
5251
- this.callbacks.logger("error", "[RecogSDK] Received primitive msg.data from server", {
5270
+ this.callbacks.logger("warn", "[RecogSDK] Received primitive msg.data from server", {
5252
5271
  dataType: typeof msg.data,
5253
5272
  data: msg.data,
5254
5273
  fullMessage: msg
@@ -5529,7 +5548,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5529
5548
  const timeout = setTimeout(() => {
5530
5549
  if (settled) return;
5531
5550
  settled = true;
5532
- this.log("warn", "Connection timeout", { timeout: connectionTimeout, attempt });
5551
+ this.log("warn", `Connection timeout url=${this.config.url}`, { timeout: connectionTimeout, attempt });
5533
5552
  this.state = "failed" /* FAILED */;
5534
5553
  reject(new Error(`Connection timeout after ${connectionTimeout}ms`));
5535
5554
  }, connectionTimeout);
@@ -5551,7 +5570,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5551
5570
  if (settled) return;
5552
5571
  settled = true;
5553
5572
  clearTimeout(timeout);
5554
- this.log("warn", "Connection error", { error, attempt });
5573
+ this.log("warn", `Connection error url=${this.config.url}`, { error, attempt });
5555
5574
  this.state = "failed" /* FAILED */;
5556
5575
  reject(error);
5557
5576
  };
@@ -5566,14 +5585,14 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5566
5585
  lastError = error;
5567
5586
  if (attempt < maxAttempts) {
5568
5587
  const logLevel = attempt < 3 ? "info" : "warn";
5569
- this.log(logLevel, `Connection attempt ${attempt} failed, retrying after ${delayMs}ms`, {
5588
+ this.log(logLevel, `Connection attempt ${attempt} failed, retrying after ${delayMs}ms url=${this.config.url}`, {
5570
5589
  error: lastError.message,
5571
5590
  nextAttempt: attempt + 1
5572
5591
  });
5573
5592
  this.state = "initial" /* INITIAL */;
5574
5593
  await new Promise((resolve) => setTimeout(resolve, delayMs));
5575
5594
  } else {
5576
- this.log("warn", `All ${maxAttempts} connection attempts failed`, {
5595
+ this.log("warn", `All ${maxAttempts} connection attempts failed url=${this.config.url}`, {
5577
5596
  error: lastError.message
5578
5597
  });
5579
5598
  }
@@ -5596,7 +5615,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5596
5615
  blobToArrayBuffer(audioData).then((arrayBuffer) => {
5597
5616
  this.sendAudioInternal(arrayBuffer);
5598
5617
  }).catch((error) => {
5599
- this.log("error", "Failed to convert Blob to ArrayBuffer", error);
5618
+ this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
5600
5619
  });
5601
5620
  return;
5602
5621
  }
@@ -5636,7 +5655,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5636
5655
  */
5637
5656
  async stopRecording() {
5638
5657
  if (this.state !== "ready" /* READY */) {
5639
- this.log("warn", "stopRecording called but not in READY state", { state: this.state });
5658
+ this.log("info", "stopRecording called but not in READY state", { state: this.state });
5640
5659
  return;
5641
5660
  }
5642
5661
  this.log("debug", "Stopping recording");
@@ -5806,7 +5825,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5806
5825
  if (this.state === "stopping" /* STOPPING */) {
5807
5826
  this.state = "stopped" /* STOPPED */;
5808
5827
  } else if (this.state === "connected" /* CONNECTED */ || this.state === "ready" /* READY */ || this.state === "connecting" /* CONNECTING */) {
5809
- this.log("error", "[DIAGNOSTIC] Unexpected disconnection", {
5828
+ this.log("warn", "[DIAGNOSTIC] Unexpected disconnection", {
5810
5829
  code,
5811
5830
  codeDescription: closeCodeDescription,
5812
5831
  reason: reason || "(empty)",
@@ -5928,7 +5947,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5928
5947
  blobToArrayBuffer(audioData).then((arrayBuffer) => {
5929
5948
  this.sendPrefixAudioInternal(arrayBuffer);
5930
5949
  }).catch((error) => {
5931
- this.log("error", "Failed to convert Blob to ArrayBuffer for prefix audio", error);
5950
+ this.log("warn", "Failed to convert Blob to ArrayBuffer for prefix audio", error);
5932
5951
  });
5933
5952
  return;
5934
5953
  }
@@ -6187,6 +6206,11 @@ var RecognitionVGFStateSchema = z.object({
6187
6206
  finalTranscript: z.string().optional(),
6188
6207
  // Full finalized transcript for the utterance. Will not change.
6189
6208
  finalConfidence: z.number().optional(),
6209
+ // Voice timing (ms from stream start, prefix-adjusted)
6210
+ voiceEnd: z.number().optional(),
6211
+ // voice end time identified by ASR
6212
+ lastNonSilence: z.number().optional(),
6213
+ // last non-silence sample time from PCM analysis
6190
6214
  // Tracking-only metadata
6191
6215
  asrConfig: z.string().optional(),
6192
6216
  // Json format of the ASR config
@@ -6275,6 +6299,12 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
6275
6299
  newState.finalConfidence = result.finalTranscriptConfidence;
6276
6300
  }
6277
6301
  }
6302
+ if (result.voiceEnd !== void 0) {
6303
+ newState.voiceEnd = result.voiceEnd;
6304
+ }
6305
+ if (result.lastNonSilence !== void 0) {
6306
+ newState.lastNonSilence = result.lastNonSilence;
6307
+ }
6278
6308
  } else {
6279
6309
  newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
6280
6310
  newState.finalTranscript = result.finalTranscript || "";
@@ -6282,6 +6312,12 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
6282
6312
  newState.finalConfidence = result.finalTranscriptConfidence;
6283
6313
  }
6284
6314
  newState.finalTranscriptionTimestamp = (/* @__PURE__ */ new Date()).toISOString();
6315
+ if (result.voiceEnd !== void 0) {
6316
+ newState.voiceEnd = result.voiceEnd;
6317
+ }
6318
+ if (result.lastNonSilence !== void 0) {
6319
+ newState.lastNonSilence = result.lastNonSilence;
6320
+ }
6285
6321
  newState.pendingTranscript = "";
6286
6322
  newState.pendingConfidence = void 0;
6287
6323
  }
@@ -6317,7 +6353,9 @@ function resetRecognitionVGFState(currentState) {
6317
6353
  transcriptionStatus: TranscriptionStatus.NOT_STARTED,
6318
6354
  startRecordingStatus: RecordingStatus.READY,
6319
6355
  recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
6320
- finalTranscript: void 0
6356
+ finalTranscript: void 0,
6357
+ voiceEnd: void 0,
6358
+ lastNonSilence: void 0
6321
6359
  };
6322
6360
  }
6323
6361
  function generateUUID() {
@@ -6571,6 +6609,7 @@ function createSimplifiedVGFClient(config) {
6571
6609
  }
6572
6610
  export {
6573
6611
  AudioEncoding,
6612
+ CartesiaModel,
6574
6613
  ClientControlActionV1,
6575
6614
  ClientState,
6576
6615
  ConfigBuilder,
@@ -6584,10 +6623,12 @@ export {
6584
6623
  FinalTranscriptStability,
6585
6624
  FireworksModel,
6586
6625
  GeminiModel,
6626
+ GladiaModel,
6587
6627
  GoogleModel,
6588
6628
  Language,
6589
6629
  MistralVoxtralModel,
6590
6630
  OpenAIModel,
6631
+ OpenAIRealtimeModel,
6591
6632
  RECOGNITION_CONDUCTOR_BASES,
6592
6633
  RECOGNITION_SERVICE_BASES,
6593
6634
  RealTimeTwoWayWebSocketRecognitionClient,
@@ -6599,6 +6640,7 @@ export {
6599
6640
  RecordingStatus,
6600
6641
  STAGES,
6601
6642
  SampleRate,
6643
+ SelfServeVllmModel,
6602
6644
  SimplifiedVGFRecognitionClient,
6603
6645
  TimeoutError,
6604
6646
  TranscriptionStatus,