voice-router-dev 0.8.6 → 0.8.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8270,6 +8270,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
  id: String(speakerId),
  label: `Speaker ${speakerId}`
  })) : void 0;
+ const utterances = words.length > 0 ? buildUtterancesFromWords(words) : void 0;
  const transcriptionId = transcription.self?.split("/").pop() || "";
  return {
  success: true,
@@ -8283,6 +8284,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
  duration: transcriptionData.duration ? transcriptionData.duration / 1e7 : void 0,
  speakers,
  words: words.length > 0 ? words : void 0,
+ utterances: utterances && utterances.length > 0 ? utterances : void 0,
  createdAt: transcription.createdDateTime,
  completedAt: transcription.lastActionDateTime
  },
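
The two Azure hunks above derive an `utterances` array from the word list via `buildUtterancesFromWords` and attach it to the normalized result. That helper is not part of this diff; the sketch below shows one plausible grouping strategy (consecutive words sharing a speaker), with the `Word`/`Utterance` shapes inferred from the fields used here. The package's actual grouping rules may differ.

```typescript
// Illustrative only: not the package's buildUtterancesFromWords implementation.
interface Word { word: string; start: number; end: number; speaker?: string }
interface Utterance { text: string; start: number; end: number; speaker?: string; words: Word[] }

function groupWordsIntoUtterances(words: Word[]): Utterance[] {
  const utterances: Utterance[] = [];
  for (const w of words) {
    const last = utterances[utterances.length - 1];
    if (last && last.speaker === w.speaker) {
      // Same speaker keeps extending the current utterance.
      last.words.push(w);
      last.text += ` ${w.word}`;
      last.end = w.end;
    } else {
      // Speaker change (or first word) starts a new utterance.
      utterances.push({ text: w.word, start: w.start, end: w.end, speaker: w.speaker, words: [w] });
    }
  }
  return utterances;
}
```
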
@@ -8918,6 +8920,7 @@ function createOpenAIWhisperAdapter(config) {
 
  // src/adapters/speechmatics-adapter.ts
  var import_axios8 = __toESM(require("axios"));
+ var import_ws5 = __toESM(require("ws"));
 
  // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
  var NotificationConfigContentsItem = {
@@ -8967,8 +8970,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  super(...arguments);
  this.name = "speechmatics";
  this.capabilities = {
- streaming: false,
- // Batch only (streaming available via separate WebSocket API)
+ streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: false,
@@ -9103,13 +9105,16 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  jobConfig.fetch_data = {
  url: audio.url
  };
- requestBody = { config: JSON.stringify(jobConfig) };
- headers = { "Content-Type": "application/json" };
+ const formData = new FormData();
+ formData.append("config", JSON.stringify(jobConfig));
+ requestBody = formData;
+ headers = { "Content-Type": "multipart/form-data" };
  } else if (audio.type === "file") {
- requestBody = {
- config: JSON.stringify(jobConfig),
- data_file: audio.file
- };
+ const formData = new FormData();
+ formData.append("config", JSON.stringify(jobConfig));
+ const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
+ formData.append("data_file", audioBlob, audio.filename || "audio.wav");
+ requestBody = formData;
  headers = { "Content-Type": "multipart/form-data" };
  } else {
  return {
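
This hunk switches the Speechmatics batch submission to `multipart/form-data` for both URL-based (`fetch_data`) and file-based (`data_file`) jobs, serializing the job config into a `config` form field. A minimal sketch of the equivalent request, assuming the standard Speechmatics batch jobs endpoint; the host, path, and config shape are assumptions, only the `config` and `fetch_data`/`data_file` field names come from this diff.

```typescript
// Sketch of a multipart job submission like the one the adapter now builds.
async function submitSpeechmaticsJob(apiKey: string, audioUrl: string): Promise<string> {
  const jobConfig = {
    type: "transcription",
    transcription_config: { language: "en" },
    fetch_data: { url: audioUrl }, // for local audio, append a data_file part instead
  };
  const form = new FormData();
  form.append("config", JSON.stringify(jobConfig));
  const res = await fetch("https://asr.api.speechmatics.com/v2/jobs", { // endpoint assumed
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` }, // fetch sets the multipart boundary itself
    body: form,
  });
  const body = await res.json();
  return body.id as string; // job id to poll for results
}
```
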
@@ -9214,6 +9219,381 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  throw error;
  }
  }
+ /**
+ * Build WebSocket URL for real-time streaming
+ *
+ * Note: Real-time API uses a different host from the batch API:
+ * - Batch: {region}.asr.api.speechmatics.com
+ * - Real-time: {region}.rt.speechmatics.com
+ *
+ * @param region - Regional endpoint identifier
+ * @returns WebSocket URL for real-time API
+ */
+ getRegionalWsUrl(region) {
+ if (this.config?.wsBaseUrl) {
+ return this.config.wsBaseUrl;
+ }
+ const regionPrefix = region || "eu1";
+ return `wss://${regionPrefix}.rt.speechmatics.com/v2`;
+ }
+ /**
+ * Stream audio for real-time transcription via WebSocket
+ *
+ * Connects to Speechmatics' real-time API and sends audio chunks
+ * for transcription with results returned via callbacks.
+ *
+ * @param options - Streaming configuration options
+ * @param callbacks - Event callbacks for transcription results
+ * @returns Promise that resolves with a StreamingSession
+ *
+ * @example Basic streaming
+ * ```typescript
+ * const session = await adapter.transcribeStream({
+ * language: 'en',
+ * speechmaticsStreaming: {
+ * enablePartials: true,
+ * operatingPoint: 'enhanced'
+ * }
+ * }, {
+ * onTranscript: (event) => console.log(event.text),
+ * onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
+ * onError: (error) => console.error(error)
+ * });
+ *
+ * await session.sendAudio({ data: audioBuffer });
+ * await session.close();
+ * ```
+ */
+ async transcribeStream(options, callbacks) {
+ this.validateConfig();
+ const smOpts = options?.speechmaticsStreaming || {};
+ const region = smOpts.region || this.config?.region;
+ const wsUrl = this.getRegionalWsUrl(region);
+ const ws = new import_ws5.default(wsUrl, {
+ headers: {
+ Authorization: `Bearer ${this.config.apiKey}`
+ }
+ });
+ let sessionStatus = "connecting";
+ const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
+ let seqNo = 0;
+ let utteranceResults = [];
+ const sessionReady = new Promise((resolve, reject) => {
+ const timeout = setTimeout(() => {
+ reject(new Error("WebSocket connection timeout"));
+ }, 1e4);
+ let wsOpen = false;
+ ws.once("error", (error) => {
+ clearTimeout(timeout);
+ reject(error);
+ });
+ ws.once("open", () => {
+ wsOpen = true;
+ const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
+ const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
+ const startMsg = {
+ message: "StartRecognition",
+ audio_format: {
+ type: "raw",
+ encoding,
+ sample_rate: sampleRate
+ },
+ transcription_config: {
+ language: smOpts.language || options?.language || "en",
+ enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
+ }
+ };
+ const txConfig = startMsg.transcription_config;
+ if (smOpts.domain) txConfig.domain = smOpts.domain;
+ if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
+ if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
+ if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
+ if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
+ if (smOpts.diarization === "speaker" || options?.diarization) {
+ txConfig.diarization = "speaker";
+ if (smOpts.maxSpeakers) {
+ txConfig.speaker_diarization_config = {
+ max_speakers: smOpts.maxSpeakers
+ };
+ } else if (options?.speakersExpected) {
+ txConfig.speaker_diarization_config = {
+ max_speakers: options.speakersExpected
+ };
+ }
+ }
+ if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
+ txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
+ content: word
+ }));
+ } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
+ txConfig.additional_vocab = options.customVocabulary.map((word) => ({
+ content: word
+ }));
+ }
+ if (smOpts.conversationConfig) {
+ txConfig.conversation_config = {
+ end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
+ };
+ }
+ const startPayload = JSON.stringify(startMsg);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: "speechmatics",
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: startPayload,
+ messageType: "StartRecognition"
+ });
+ }
+ ws.send(startPayload);
+ });
+ const onMessage = (data) => {
+ const rawPayload = data.toString();
+ try {
+ const msg = JSON.parse(rawPayload);
+ if (msg.message === "RecognitionStarted") {
+ clearTimeout(timeout);
+ ws.removeListener("message", onMessage);
+ ws.emit("message", data);
+ resolve();
+ } else if (msg.message === "Error") {
+ clearTimeout(timeout);
+ ws.removeListener("message", onMessage);
+ reject(new Error(msg.reason || "Recognition failed to start"));
+ }
+ } catch {
+ }
+ };
+ ws.on("message", onMessage);
+ });
+ ws.on("message", (data) => {
+ const rawPayload = data.toString();
+ try {
+ const message = JSON.parse(rawPayload);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: "speechmatics",
+ direction: "incoming",
+ timestamp: Date.now(),
+ payload: rawPayload,
+ messageType: message.message
+ });
+ }
+ this.handleStreamingMessage(message, callbacks, utteranceResults);
+ } catch (error) {
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: "speechmatics",
+ direction: "incoming",
+ timestamp: Date.now(),
+ payload: rawPayload,
+ messageType: "parse_error"
+ });
+ }
+ callbacks?.onError?.({
+ code: "PARSE_ERROR",
+ message: "Failed to parse WebSocket message",
+ details: error
+ });
+ }
+ });
+ ws.on("error", (error) => {
+ callbacks?.onError?.({
+ code: "WEBSOCKET_ERROR",
+ message: error.message,
+ details: error
+ });
+ });
+ ws.on("close", (code, reason) => {
+ sessionStatus = "closed";
+ callbacks?.onClose?.(code, reason.toString());
+ });
+ await sessionReady;
+ sessionStatus = "open";
+ callbacks?.onOpen?.();
+ return {
+ id: sessionId,
+ provider: this.name,
+ createdAt: /* @__PURE__ */ new Date(),
+ getStatus: () => sessionStatus,
+ sendAudio: async (chunk) => {
+ if (sessionStatus !== "open") {
+ throw new Error(`Cannot send audio: session is ${sessionStatus}`);
+ }
+ if (ws.readyState !== import_ws5.default.OPEN) {
+ throw new Error("WebSocket is not open");
+ }
+ if (callbacks?.onRawMessage) {
+ const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
+ chunk.data.byteOffset,
+ chunk.data.byteOffset + chunk.data.byteLength
+ );
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: audioPayload,
+ messageType: "audio"
+ });
+ }
+ ws.send(chunk.data);
+ seqNo++;
+ if (chunk.isLast) {
+ const endMsg = JSON.stringify({
+ message: "EndOfStream",
+ last_seq_no: seqNo
+ });
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: endMsg,
+ messageType: "EndOfStream"
+ });
+ }
+ ws.send(endMsg);
+ }
+ },
+ close: async () => {
+ if (sessionStatus === "closed" || sessionStatus === "closing") {
+ return;
+ }
+ sessionStatus = "closing";
+ if (ws.readyState === import_ws5.default.OPEN) {
+ seqNo++;
+ ws.send(
+ JSON.stringify({
+ message: "EndOfStream",
+ last_seq_no: seqNo
+ })
+ );
+ }
+ return new Promise((resolve) => {
+ const timeout = setTimeout(() => {
+ ws.terminate();
+ sessionStatus = "closed";
+ resolve();
+ }, 5e3);
+ const onMsg = (data) => {
+ try {
+ const msg = JSON.parse(data.toString());
+ if (msg.message === "EndOfTranscript") {
+ ws.removeListener("message", onMsg);
+ clearTimeout(timeout);
+ ws.close();
+ }
+ } catch {
+ }
+ };
+ ws.on("message", onMsg);
+ ws.once("close", () => {
+ clearTimeout(timeout);
+ sessionStatus = "closed";
+ resolve();
+ });
+ });
+ }
+ };
+ }
+ /**
+ * Handle incoming Speechmatics real-time WebSocket messages
+ */
+ handleStreamingMessage(message, callbacks, utteranceResults) {
+ switch (message.message) {
+ case "RecognitionStarted": {
+ break;
+ }
+ case "AddPartialTranscript": {
+ const results = message.results || [];
+ const text = buildTextFromSpeechmaticsResults(results);
+ if (text) {
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text,
+ isFinal: false,
+ words: this.extractWordsFromResults(results),
+ data: message
+ });
+ }
+ break;
+ }
+ case "AddTranscript": {
+ const results = message.results || [];
+ const text = buildTextFromSpeechmaticsResults(results);
+ if (utteranceResults) {
+ utteranceResults.push(...results);
+ }
+ if (text) {
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text,
+ isFinal: true,
+ words: this.extractWordsFromResults(results),
+ data: message
+ });
+ }
+ break;
+ }
+ case "EndOfUtterance": {
+ if (utteranceResults && utteranceResults.length > 0) {
+ const text = buildTextFromSpeechmaticsResults(utteranceResults);
+ const words = this.extractWordsFromResults(utteranceResults);
+ const utterances = buildUtterancesFromWords(words);
+ if (utterances.length > 0) {
+ for (const utt of utterances) {
+ callbacks?.onUtterance?.(utt);
+ }
+ } else if (text) {
+ callbacks?.onUtterance?.({
+ text,
+ start: words.length > 0 ? words[0].start : 0,
+ end: words.length > 0 ? words[words.length - 1].end : 0,
+ words
+ });
+ }
+ utteranceResults.length = 0;
+ }
+ break;
+ }
+ case "AudioAdded": {
+ break;
+ }
+ case "EndOfTranscript": {
+ break;
+ }
+ case "Info":
+ case "Warning": {
+ callbacks?.onMetadata?.(message);
+ break;
+ }
+ case "Error": {
+ const errMsg = message;
+ callbacks?.onError?.({
+ code: errMsg.type || "SPEECHMATICS_ERROR",
+ message: errMsg.reason || "Unknown error",
+ details: message
+ });
+ break;
+ }
+ default: {
+ callbacks?.onMetadata?.(message);
+ break;
+ }
+ }
+ }
+ /**
+ * Extract unified Word[] from Speechmatics recognition results
+ */
+ extractWordsFromResults(results) {
+ return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
+ word: result.alternatives?.[0]?.content || "",
+ start: result.start_time,
+ end: result.end_time,
+ confidence: result.alternatives?.[0]?.confidence,
+ speaker: result.alternatives?.[0]?.speaker
+ }));
+ }
  /**
  * Normalize Speechmatics status to unified status
  * Uses generated JobDetailsStatus enum values
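
The block above is the bulk of the new functionality: a `transcribeStream` implementation over the Speechmatics real-time WebSocket protocol (StartRecognition, then AddPartialTranscript/AddTranscript, then EndOfStream and EndOfTranscript), plus message handling and word extraction. Below is a short consumer-side sketch built only on the surface shown in this hunk; the adapter instance, audio source, and chunk size are placeholders.

```typescript
import { readFileSync } from "node:fs";

// Drives the new streaming path: send raw PCM in chunks, log partial vs final text.
async function run(adapter: any /* SpeechmaticsAdapter instance */) {
  const session = await adapter.transcribeStream(
    { language: "en", sampleRate: 16000, encoding: "pcm_s16le", interimResults: true },
    {
      onTranscript: (e: { text: string; isFinal: boolean }) =>
        console.log(e.isFinal ? `final: ${e.text}` : `partial: ${e.text}`),
      onError: (err: unknown) => console.error("stream error", err),
      onClose: (code: number) => console.log("closed", code),
    }
  );

  const pcm = readFileSync("audio.raw"); // assumed 16 kHz, 16-bit mono PCM
  const chunkSize = 3200;                // ~100 ms of audio at 16 kHz / 16-bit
  for (let off = 0; off < pcm.length; off += chunkSize) {
    const isLast = off + chunkSize >= pcm.length;
    await session.sendAudio({ data: pcm.subarray(off, off + chunkSize), isLast });
  }
  await session.close(); // waits for EndOfTranscript, with a 5 s fallback per the hunk above
}
```
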
@@ -9432,26 +9812,13 @@ var SonioxAdapter = class extends BaseAdapter {
  } else if (audio.type === "file") {
  const formData = new FormData();
  const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
- formData.append("audio", audioBlob, audio.filename || "audio.wav");
- formData.append("model", requestBody.model);
- if (options?.language) {
- formData.append("language_hints", JSON.stringify([options.language]));
- }
- if (options?.diarization) {
- formData.append("enable_speaker_diarization", "true");
- }
- if (options?.languageDetection) {
- formData.append("enable_language_identification", "true");
- }
- if (options?.customVocabulary) {
- formData.append("context", JSON.stringify({ terms: options.customVocabulary }));
- }
- const response2 = await this.client.post("/speech/transcribe", formData, {
+ formData.append("file", audioBlob, audio.filename || "audio.wav");
+ const uploadResponse = await this.client.post("/files", formData, {
  headers: {
  "Content-Type": "multipart/form-data"
  }
  });
- return this.normalizeResponse(response2.data);
+ requestBody.file_id = uploadResponse.data.id;
  } else {
  return {
  success: false,
@@ -9476,8 +9843,9 @@ var SonioxAdapter = class extends BaseAdapter {
  terms: options.customVocabulary
  };
  }
- const response = await this.client.post("/speech/transcribe", requestBody);
- return this.normalizeResponse(response.data);
+ const response = await this.client.post("/transcriptions", requestBody);
+ const transcriptionId = response.data.id;
+ return await this.pollForCompletion(transcriptionId);
  } catch (error) {
  return this.createErrorResponse(error);
  }
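
With these changes the Soniox batch path is asynchronous: local files are first uploaded to `/files`, the returned id is passed as `file_id` when creating the job at `/transcriptions`, and the adapter then polls for completion. A rough sketch of that flow with plain `fetch`; the base URL, auth scheme, and model name are assumptions, while the paths and the `file_id` field come from the hunks above.

```typescript
const SONIOX_BASE = "https://api.soniox.com/v1"; // assumed base URL

async function createSonioxTranscription(apiKey: string, audio: Blob): Promise<string> {
  // Step 1: upload the audio file.
  const form = new FormData();
  form.append("file", audio, "audio.wav");
  const upload = await fetch(`${SONIOX_BASE}/files`, {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}` },
    body: form,
  }).then((r) => r.json());

  // Step 2: create the transcription job referencing the uploaded file.
  const job = await fetch(`${SONIOX_BASE}/transcriptions`, {
    method: "POST",
    headers: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json" },
    body: JSON.stringify({ model: "stt-async-preview", file_id: upload.id }), // model name assumed
  }).then((r) => r.json());

  return job.id; // poll this id until the job completes
}
```
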
@@ -9485,8 +9853,9 @@ var SonioxAdapter = class extends BaseAdapter {
  /**
  * Get transcription result by ID
  *
- * Soniox batch transcription is synchronous (returns immediately),
- * but this method can be used for consistency with other providers.
+ * Checks job status via GET /v1/transcriptions/{id}, then fetches
+ * the full transcript via GET /v1/transcriptions/{id}/transcript
+ * when completed.
  *
  * @param transcriptId - Transcript ID
  * @returns Transcription response
@@ -9494,8 +9863,39 @@ var SonioxAdapter = class extends BaseAdapter {
  async getTranscript(transcriptId) {
  this.validateConfig();
  try {
- const response = await this.client.get(`/speech/transcripts/${transcriptId}`);
- return this.normalizeResponse(response.data);
+ const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
+ const job = statusResponse.data;
+ if (job.status === "error") {
+ return {
+ success: false,
+ provider: this.name,
+ error: {
+ code: "TRANSCRIPTION_ERROR",
+ message: job.error_message || "Transcription failed"
+ }
+ };
+ }
+ if (job.status !== "completed") {
+ return {
+ success: true,
+ provider: this.name,
+ data: {
+ id: job.id,
+ text: "",
+ status: job.status
+ },
+ raw: job
+ };
+ }
+ const transcriptResponse = await this.client.get(
+ `/transcriptions/${transcriptId}/transcript`
+ );
+ return this.normalizeResponse({
+ ...transcriptResponse.data,
+ // Carry over job metadata
+ id: job.id,
+ audio_duration_ms: job.audio_duration_ms
+ });
  } catch (error) {
  return this.createErrorResponse(error);
  }
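
`pollForCompletion` is referenced above but not included in this diff. Based on the `getTranscript` behaviour shown here (error result, status-only payload while the job is running, full transcript once completed), a polling loop would look roughly like the sketch below; the interval, attempt cap, and the exact completed-status string are assumptions.

```typescript
// Minimal polling sketch built on the adapter's getTranscript as shown in this hunk.
async function pollTranscription(adapter: any, id: string, intervalMs = 2000, maxAttempts = 150) {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const result = await adapter.getTranscript(id);
    // Terminal cases: job error (success: false) or a completed transcript.
    if (!result.success || result.data?.status === "completed") {
      return result;
    }
    // Otherwise the job is still queued/processing; wait and try again.
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  throw new Error(`Transcription ${id} did not complete after ${maxAttempts} attempts`);
}
```
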
@@ -9515,50 +9915,51 @@ var SonioxAdapter = class extends BaseAdapter {
  const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
  const createdAt = /* @__PURE__ */ new Date();
  const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
- const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
- wsUrl.searchParams.set("api_key", this.config.apiKey);
- const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
- wsUrl.searchParams.set("model", modelId);
- if (options?.encoding) {
+ const wsUrl = `${wsBase}/transcribe-websocket`;
+ const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
+ const sonioxOpts = options?.sonioxStreaming;
+ const initMessage = {
+ api_key: this.config.apiKey,
+ model: modelId
+ };
+ if (sonioxOpts?.audioFormat) {
+ initMessage.audio_format = sonioxOpts.audioFormat;
+ } else if (options?.encoding) {
  const encodingMap = {
  linear16: "pcm_s16le",
  pcm: "pcm_s16le",
  mulaw: "mulaw",
  alaw: "alaw"
  };
- wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
+ initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
  }
- if (options?.sampleRate) {
- wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
+ if (sonioxOpts?.sampleRate || options?.sampleRate) {
+ initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
  }
- if (options?.channels) {
- wsUrl.searchParams.set("num_channels", options.channels.toString());
+ if (sonioxOpts?.numChannels || options?.channels) {
+ initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
  }
- const sonioxOpts = options?.sonioxStreaming;
  if (sonioxOpts) {
  if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
- wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
+ initMessage.language_hints = sonioxOpts.languageHints;
  }
  if (sonioxOpts.enableLanguageIdentification) {
- wsUrl.searchParams.set("enable_language_identification", "true");
+ initMessage.enable_language_identification = true;
  }
  if (sonioxOpts.enableEndpointDetection) {
- wsUrl.searchParams.set("enable_endpoint_detection", "true");
+ initMessage.enable_endpoint_detection = true;
  }
  if (sonioxOpts.enableSpeakerDiarization) {
- wsUrl.searchParams.set("enable_speaker_diarization", "true");
+ initMessage.enable_speaker_diarization = true;
  }
  if (sonioxOpts.context) {
- wsUrl.searchParams.set(
- "context",
- typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
- );
+ initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
  }
  if (sonioxOpts.translation) {
- wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
+ initMessage.translation = sonioxOpts.translation;
  }
  if (sonioxOpts.clientReferenceId) {
- wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
+ initMessage.client_reference_id = sonioxOpts.clientReferenceId;
  }
  }
  if (!sonioxOpts?.languageHints && options?.language) {
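
Configuration that used to be packed into the WebSocket URL's query string is now sent as a single JSON frame immediately after the socket opens. The field names below are taken from this hunk; the concrete values are illustrative only.

```typescript
// Shape of the first frame the Soniox adapter now sends on open, per the hunk above.
const initMessage = {
  api_key: "<SONIOX_API_KEY>",
  model: "stt-rt-v4",
  audio_format: "pcm_s16le",
  sample_rate: 16000,
  num_channels: 1,
  language_hints: ["en"],
  enable_speaker_diarization: true,
};
// In the adapter, ws.send(JSON.stringify(initMessage)) happens in onopen, before any audio bytes.
```
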
@@ -9567,24 +9968,33 @@ var SonioxAdapter = class extends BaseAdapter {
  `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
  );
  }
- wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
+ initMessage.language_hints = [options.language];
  }
  if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
- wsUrl.searchParams.set("enable_speaker_diarization", "true");
+ initMessage.enable_speaker_diarization = true;
  }
  if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
- wsUrl.searchParams.set("enable_language_identification", "true");
- }
- if (options?.interimResults !== false) {
+ initMessage.enable_language_identification = true;
  }
  let status = "connecting";
  let openedAt = null;
  let receivedData = false;
  const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
- const ws = new WebSocketImpl(wsUrl.toString());
+ const ws = new WebSocketImpl(wsUrl);
  ws.onopen = () => {
- status = "open";
  openedAt = Date.now();
+ const initPayload = JSON.stringify(initMessage);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: initPayload,
+ messageType: "init"
+ });
+ }
+ ws.send(initPayload);
+ status = "open";
  callbacks?.onOpen?.();
  };
  ws.onmessage = (event) => {
@@ -9663,10 +10073,10 @@ var SonioxAdapter = class extends BaseAdapter {
  ws.onclose = (event) => {
  status = "closed";
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
- const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
- if (isImmediateClose && event.code === 1e3) {
+ const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
+ if (isEarlyClose && event.code === 1e3) {
  const errorMessage = [
- "Soniox closed connection immediately after opening.",
+ "Soniox closed connection shortly after opening.",
  `Current config: region=${this.region}, model=${modelId}`,
  "Likely causes:",
  " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -9787,8 +10197,10 @@ var SonioxAdapter = class extends BaseAdapter {
  * Normalize Soniox response to unified format
  */
  normalizeResponse(response) {
- const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final).map((t) => t.text).join("") : "");
- const words = response.tokens ? response.tokens.filter((t) => t.is_final && t.start_ms !== void 0 && t.end_ms !== void 0).map((token) => ({
+ const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final !== false).map((t) => t.text).join("") : "");
+ const words = response.tokens ? response.tokens.filter(
+ (t) => t.is_final !== false && t.start_ms !== void 0 && t.end_ms !== void 0
+ ).map((token) => ({
  word: token.text,
  start: token.start_ms / 1e3,
  end: token.end_ms / 1e3,
@@ -9805,7 +10217,8 @@ var SonioxAdapter = class extends BaseAdapter {
  id,
  label: `Speaker ${id}`
  })) : void 0;
- const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens.filter((t) => t.is_final)) : [];
+ const tokens = response.tokens ? response.tokens.filter((t) => t.is_final !== false) : [];
+ const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
  const language = response.tokens?.find((t) => t.language)?.language;
  return {
  success: true,
@@ -9815,7 +10228,7 @@ var SonioxAdapter = class extends BaseAdapter {
  text,
  status: TranscriptionStatus.completed,
  language,
- duration: response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : response.audio_duration_ms ? response.audio_duration_ms / 1e3 : void 0,
+ duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
  speakers,
  words: words.length > 0 ? words : void 0,
  utterances: utterances.length > 0 ? utterances : void 0
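
In these normalization hunks, any token not explicitly marked `is_final: false` is now kept, so batch transcripts whose tokens omit the flag are no longer filtered out, and duration now prefers the job-level `audio_duration_ms` over `total_audio_proc_ms`. A tiny illustration of the new filter semantics, with the token shape assumed from the fields used above:

```typescript
// Tokens are dropped only when they are explicitly non-final.
type SonioxToken = { text: string; is_final?: boolean; start_ms?: number; end_ms?: number };

const finalTokens = (tokens: SonioxToken[]) => tokens.filter((t) => t.is_final !== false);

// A batch token without is_final is kept; a streaming interim token is not:
finalTokens([{ text: "hello" }, { text: "wor", is_final: false }]); // -> [{ text: "hello" }]
```
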
@@ -36682,7 +37095,7 @@ var AzureCapabilities = {
  deleteTranscript: true
  };
  var SpeechmaticsCapabilities = {
- streaming: false,
+ streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: false,