voice-router-dev 0.8.6 → 0.8.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -8041,6 +8041,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
  id: String(speakerId),
  label: `Speaker ${speakerId}`
  })) : void 0;
+ const utterances = words.length > 0 ? buildUtterancesFromWords(words) : void 0;
  const transcriptionId = transcription.self?.split("/").pop() || "";
  return {
  success: true,
@@ -8054,6 +8055,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
  duration: transcriptionData.duration ? transcriptionData.duration / 1e7 : void 0,
  speakers,
  words: words.length > 0 ? words : void 0,
+ utterances: utterances && utterances.length > 0 ? utterances : void 0,
  createdAt: transcription.createdDateTime,
  completedAt: transcription.lastActionDateTime
  },
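The Azure change above adds an `utterances` array, built from word timings via `buildUtterancesFromWords`, to normalized batch results. A minimal sketch of how a consumer might read it, assuming the `Word`/`Utterance` shapes implied by the diff (these interface definitions are illustrative, not the package's published types):

```typescript
// Hypothetical shapes inferred from the diff; the package's real types may differ.
interface Word { word: string; start: number; end: number; speaker?: string }
interface Utterance { text: string; start: number; end: number; speaker?: string; words?: Word[] }

interface AzureBatchData {
  text: string;
  words?: Word[];
  utterances?: Utterance[]; // present as of 0.8.8 when word timings are available
}

function summarizeBySpeaker(data: AzureBatchData): string[] {
  // Fall back to the flat transcript when no utterances were produced.
  if (!data.utterances) return [data.text];
  return data.utterances.map(
    (u) => `[${u.speaker ?? "?"} ${u.start.toFixed(1)}-${u.end.toFixed(1)}s] ${u.text}`
  );
}
```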
@@ -8689,6 +8691,7 @@ function createOpenAIWhisperAdapter(config) {

  // src/adapters/speechmatics-adapter.ts
  import axios8 from "axios";
+ import WebSocket6 from "ws";

  // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
  var NotificationConfigContentsItem = {
@@ -8738,8 +8741,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  super(...arguments);
  this.name = "speechmatics";
  this.capabilities = {
- streaming: false,
- // Batch only (streaming available via separate WebSocket API)
+ streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: false,
@@ -8874,13 +8876,16 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  jobConfig.fetch_data = {
  url: audio.url
  };
- requestBody = { config: JSON.stringify(jobConfig) };
- headers = { "Content-Type": "application/json" };
+ const formData = new FormData();
+ formData.append("config", JSON.stringify(jobConfig));
+ requestBody = formData;
+ headers = { "Content-Type": "multipart/form-data" };
  } else if (audio.type === "file") {
- requestBody = {
- config: JSON.stringify(jobConfig),
- data_file: audio.file
- };
+ const formData = new FormData();
+ formData.append("config", JSON.stringify(jobConfig));
+ const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
+ formData.append("data_file", audioBlob, audio.filename || "audio.wav");
+ requestBody = formData;
  headers = { "Content-Type": "multipart/form-data" };
  } else {
  return {
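Both Speechmatics batch paths above now submit jobs as multipart/form-data, with the job config serialized into a `config` part and direct file input attached as `data_file`. A rough fetch() equivalent, for illustration only; the endpoint host and path are assumptions, while the form field names come from the diff:

```typescript
// Rough fetch() equivalent of the multipart job submission built above.
// Endpoint host/path are assumptions for illustration; the "config" and
// "data_file" field names come from the diff.
async function submitSpeechmaticsJob(apiKey: string, file: Blob, jobConfig: object) {
  const form = new FormData();
  form.append("config", JSON.stringify(jobConfig));
  form.append("data_file", file, "audio.wav");
  const res = await fetch("https://eu2.asr.api.speechmatics.com/v2/jobs", {
    method: "POST",
    // Content-Type is left to fetch so the multipart boundary is set correctly.
    headers: { Authorization: `Bearer ${apiKey}` },
    body: form
  });
  return res.json();
}
```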
@@ -8985,6 +8990,381 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  throw error;
  }
  }
+ /**
+ * Build WebSocket URL for real-time streaming
+ *
+ * Note: Real-time API uses a different host from the batch API:
+ * - Batch: {region}.asr.api.speechmatics.com
+ * - Real-time: {region}.rt.speechmatics.com
+ *
+ * @param region - Regional endpoint identifier
+ * @returns WebSocket URL for real-time API
+ */
+ getRegionalWsUrl(region) {
+ if (this.config?.wsBaseUrl) {
+ return this.config.wsBaseUrl;
+ }
+ const regionPrefix = region || "eu1";
+ return `wss://${regionPrefix}.rt.speechmatics.com/v2`;
+ }
+ /**
+ * Stream audio for real-time transcription via WebSocket
+ *
+ * Connects to Speechmatics' real-time API and sends audio chunks
+ * for transcription with results returned via callbacks.
+ *
+ * @param options - Streaming configuration options
+ * @param callbacks - Event callbacks for transcription results
+ * @returns Promise that resolves with a StreamingSession
+ *
+ * @example Basic streaming
+ * ```typescript
+ * const session = await adapter.transcribeStream({
+ * language: 'en',
+ * speechmaticsStreaming: {
+ * enablePartials: true,
+ * operatingPoint: 'enhanced'
+ * }
+ * }, {
+ * onTranscript: (event) => console.log(event.text),
+ * onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
+ * onError: (error) => console.error(error)
+ * });
+ *
+ * await session.sendAudio({ data: audioBuffer });
+ * await session.close();
+ * ```
+ */
+ async transcribeStream(options, callbacks) {
+ this.validateConfig();
+ const smOpts = options?.speechmaticsStreaming || {};
+ const region = smOpts.region || this.config?.region;
+ const wsUrl = this.getRegionalWsUrl(region);
+ const ws = new WebSocket6(wsUrl, {
+ headers: {
+ Authorization: `Bearer ${this.config.apiKey}`
+ }
+ });
+ let sessionStatus = "connecting";
+ const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
+ let seqNo = 0;
+ let utteranceResults = [];
+ const sessionReady = new Promise((resolve, reject) => {
+ const timeout = setTimeout(() => {
+ reject(new Error("WebSocket connection timeout"));
+ }, 1e4);
+ let wsOpen = false;
+ ws.once("error", (error) => {
+ clearTimeout(timeout);
+ reject(error);
+ });
+ ws.once("open", () => {
+ wsOpen = true;
+ const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
+ const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
+ const startMsg = {
+ message: "StartRecognition",
+ audio_format: {
+ type: "raw",
+ encoding,
+ sample_rate: sampleRate
+ },
+ transcription_config: {
+ language: smOpts.language || options?.language || "en",
+ enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
+ }
+ };
+ const txConfig = startMsg.transcription_config;
+ if (smOpts.domain) txConfig.domain = smOpts.domain;
+ if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
+ if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
+ if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
+ if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
+ if (smOpts.diarization === "speaker" || options?.diarization) {
+ txConfig.diarization = "speaker";
+ if (smOpts.maxSpeakers) {
+ txConfig.speaker_diarization_config = {
+ max_speakers: smOpts.maxSpeakers
+ };
+ } else if (options?.speakersExpected) {
+ txConfig.speaker_diarization_config = {
+ max_speakers: options.speakersExpected
+ };
+ }
+ }
+ if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
+ txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
+ content: word
+ }));
+ } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
+ txConfig.additional_vocab = options.customVocabulary.map((word) => ({
+ content: word
+ }));
+ }
+ if (smOpts.conversationConfig) {
+ txConfig.conversation_config = {
+ end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
+ };
+ }
+ const startPayload = JSON.stringify(startMsg);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: "speechmatics",
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: startPayload,
+ messageType: "StartRecognition"
+ });
+ }
+ ws.send(startPayload);
+ });
+ const onMessage = (data) => {
+ const rawPayload = data.toString();
+ try {
+ const msg = JSON.parse(rawPayload);
+ if (msg.message === "RecognitionStarted") {
+ clearTimeout(timeout);
+ ws.removeListener("message", onMessage);
+ ws.emit("message", data);
+ resolve();
+ } else if (msg.message === "Error") {
+ clearTimeout(timeout);
+ ws.removeListener("message", onMessage);
+ reject(new Error(msg.reason || "Recognition failed to start"));
+ }
+ } catch {
+ }
+ };
+ ws.on("message", onMessage);
+ });
+ ws.on("message", (data) => {
+ const rawPayload = data.toString();
+ try {
+ const message = JSON.parse(rawPayload);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: "speechmatics",
+ direction: "incoming",
+ timestamp: Date.now(),
+ payload: rawPayload,
+ messageType: message.message
+ });
+ }
+ this.handleStreamingMessage(message, callbacks, utteranceResults);
+ } catch (error) {
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: "speechmatics",
+ direction: "incoming",
+ timestamp: Date.now(),
+ payload: rawPayload,
+ messageType: "parse_error"
+ });
+ }
+ callbacks?.onError?.({
+ code: "PARSE_ERROR",
+ message: "Failed to parse WebSocket message",
+ details: error
+ });
+ }
+ });
+ ws.on("error", (error) => {
+ callbacks?.onError?.({
+ code: "WEBSOCKET_ERROR",
+ message: error.message,
+ details: error
+ });
+ });
+ ws.on("close", (code, reason) => {
+ sessionStatus = "closed";
+ callbacks?.onClose?.(code, reason.toString());
+ });
+ await sessionReady;
+ sessionStatus = "open";
+ callbacks?.onOpen?.();
+ return {
+ id: sessionId,
+ provider: this.name,
+ createdAt: /* @__PURE__ */ new Date(),
+ getStatus: () => sessionStatus,
+ sendAudio: async (chunk) => {
+ if (sessionStatus !== "open") {
+ throw new Error(`Cannot send audio: session is ${sessionStatus}`);
+ }
+ if (ws.readyState !== WebSocket6.OPEN) {
+ throw new Error("WebSocket is not open");
+ }
+ if (callbacks?.onRawMessage) {
+ const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
+ chunk.data.byteOffset,
+ chunk.data.byteOffset + chunk.data.byteLength
+ );
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: audioPayload,
+ messageType: "audio"
+ });
+ }
+ ws.send(chunk.data);
+ seqNo++;
+ if (chunk.isLast) {
+ const endMsg = JSON.stringify({
+ message: "EndOfStream",
+ last_seq_no: seqNo
+ });
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: endMsg,
+ messageType: "EndOfStream"
+ });
+ }
+ ws.send(endMsg);
+ }
+ },
+ close: async () => {
+ if (sessionStatus === "closed" || sessionStatus === "closing") {
+ return;
+ }
+ sessionStatus = "closing";
+ if (ws.readyState === WebSocket6.OPEN) {
+ seqNo++;
+ ws.send(
+ JSON.stringify({
+ message: "EndOfStream",
+ last_seq_no: seqNo
+ })
+ );
+ }
+ return new Promise((resolve) => {
+ const timeout = setTimeout(() => {
+ ws.terminate();
+ sessionStatus = "closed";
+ resolve();
+ }, 5e3);
+ const onMsg = (data) => {
+ try {
+ const msg = JSON.parse(data.toString());
+ if (msg.message === "EndOfTranscript") {
+ ws.removeListener("message", onMsg);
+ clearTimeout(timeout);
+ ws.close();
+ }
+ } catch {
+ }
+ };
+ ws.on("message", onMsg);
+ ws.once("close", () => {
+ clearTimeout(timeout);
+ sessionStatus = "closed";
+ resolve();
+ });
+ });
+ }
+ };
+ }
+ /**
+ * Handle incoming Speechmatics real-time WebSocket messages
+ */
+ handleStreamingMessage(message, callbacks, utteranceResults) {
+ switch (message.message) {
+ case "RecognitionStarted": {
+ break;
+ }
+ case "AddPartialTranscript": {
+ const results = message.results || [];
+ const text = buildTextFromSpeechmaticsResults(results);
+ if (text) {
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text,
+ isFinal: false,
+ words: this.extractWordsFromResults(results),
+ data: message
+ });
+ }
+ break;
+ }
+ case "AddTranscript": {
+ const results = message.results || [];
+ const text = buildTextFromSpeechmaticsResults(results);
+ if (utteranceResults) {
+ utteranceResults.push(...results);
+ }
+ if (text) {
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text,
+ isFinal: true,
+ words: this.extractWordsFromResults(results),
+ data: message
+ });
+ }
+ break;
+ }
+ case "EndOfUtterance": {
+ if (utteranceResults && utteranceResults.length > 0) {
+ const text = buildTextFromSpeechmaticsResults(utteranceResults);
+ const words = this.extractWordsFromResults(utteranceResults);
+ const utterances = buildUtterancesFromWords(words);
+ if (utterances.length > 0) {
+ for (const utt of utterances) {
+ callbacks?.onUtterance?.(utt);
+ }
+ } else if (text) {
+ callbacks?.onUtterance?.({
+ text,
+ start: words.length > 0 ? words[0].start : 0,
+ end: words.length > 0 ? words[words.length - 1].end : 0,
+ words
+ });
+ }
+ utteranceResults.length = 0;
+ }
+ break;
+ }
+ case "AudioAdded": {
+ break;
+ }
+ case "EndOfTranscript": {
+ break;
+ }
+ case "Info":
+ case "Warning": {
+ callbacks?.onMetadata?.(message);
+ break;
+ }
+ case "Error": {
+ const errMsg = message;
+ callbacks?.onError?.({
+ code: errMsg.type || "SPEECHMATICS_ERROR",
+ message: errMsg.reason || "Unknown error",
+ details: message
+ });
+ break;
+ }
+ default: {
+ callbacks?.onMetadata?.(message);
+ break;
+ }
+ }
+ }
+ /**
+ * Extract unified Word[] from Speechmatics recognition results
+ */
+ extractWordsFromResults(results) {
+ return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
+ word: result.alternatives?.[0]?.content || "",
+ start: result.start_time,
+ end: result.end_time,
+ confidence: result.alternatives?.[0]?.confidence,
+ speaker: result.alternatives?.[0]?.speaker
+ }));
+ }
  /**
  * Normalize Speechmatics status to unified status
  * Uses generated JobDetailsStatus enum values
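The streaming support added above implements the Speechmatics real-time wire protocol: a StartRecognition JSON message, binary audio frames, an EndOfStream marker, and transcript messages coming back until EndOfTranscript. A wire-level sketch of that exchange with the ws package, using the same endpoint and message names as the diff; the region, API key handling, and audio source are placeholders:

```typescript
// Wire-level sketch of the handshake the new transcribeStream() performs.
// Message names (StartRecognition, AddTranscript, EndOfStream, EndOfTranscript)
// come from the diff; the region, API key, and audio source are placeholders.
import WebSocket from "ws";
import { createReadStream } from "node:fs";

const ws = new WebSocket("wss://eu1.rt.speechmatics.com/v2", {
  headers: { Authorization: `Bearer ${process.env.SPEECHMATICS_API_KEY}` }
});

let seq = 0;
ws.on("open", () => {
  ws.send(JSON.stringify({
    message: "StartRecognition",
    audio_format: { type: "raw", encoding: "pcm_s16le", sample_rate: 16000 },
    transcription_config: { language: "en", enable_partials: true }
  }));
});

ws.on("message", (raw) => {
  const msg = JSON.parse(raw.toString());
  if (msg.message === "RecognitionStarted") {
    // Stream raw PCM (s16le @ 16 kHz) as binary frames, then signal the end.
    const audio = createReadStream("meeting.raw");
    audio.on("data", (chunk) => { ws.send(chunk); seq++; });
    audio.on("end", () => ws.send(JSON.stringify({ message: "EndOfStream", last_seq_no: seq })));
  } else if (msg.message === "AddTranscript") {
    console.log(msg.results?.map((r: any) => r.alternatives?.[0]?.content).join(" "));
  } else if (msg.message === "EndOfTranscript") {
    ws.close();
  }
});
```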
@@ -9203,26 +9583,13 @@ var SonioxAdapter = class extends BaseAdapter {
  } else if (audio.type === "file") {
  const formData = new FormData();
  const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
- formData.append("audio", audioBlob, audio.filename || "audio.wav");
- formData.append("model", requestBody.model);
- if (options?.language) {
- formData.append("language_hints", JSON.stringify([options.language]));
- }
- if (options?.diarization) {
- formData.append("enable_speaker_diarization", "true");
- }
- if (options?.languageDetection) {
- formData.append("enable_language_identification", "true");
- }
- if (options?.customVocabulary) {
- formData.append("context", JSON.stringify({ terms: options.customVocabulary }));
- }
- const response2 = await this.client.post("/speech/transcribe", formData, {
+ formData.append("file", audioBlob, audio.filename || "audio.wav");
+ const uploadResponse = await this.client.post("/files", formData, {
  headers: {
  "Content-Type": "multipart/form-data"
  }
  });
- return this.normalizeResponse(response2.data);
+ requestBody.file_id = uploadResponse.data.id;
  } else {
  return {
  success: false,
@@ -9247,8 +9614,9 @@ var SonioxAdapter = class extends BaseAdapter {
  terms: options.customVocabulary
  };
  }
- const response = await this.client.post("/speech/transcribe", requestBody);
- return this.normalizeResponse(response.data);
+ const response = await this.client.post("/transcriptions", requestBody);
+ const transcriptionId = response.data.id;
+ return await this.pollForCompletion(transcriptionId);
  } catch (error) {
  return this.createErrorResponse(error);
  }
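Soniox batch transcription above moves from a single synchronous POST to an asynchronous flow: upload the audio to /files, create a job at /transcriptions referencing the returned file id, then poll. A standalone sketch of that flow with axios; the base URL is an assumption inferred from the /v1 paths in the getTranscript doc comment below, while the field names mirror the request body in the diff:

```typescript
// Standalone sketch of the new upload-then-transcribe flow with axios.
// The base URL is an assumption; the "file_id" and "model" fields mirror
// the request body built in the diff.
import axios from "axios";

async function startSonioxTranscription(apiKey: string, audio: Buffer, model: string) {
  const client = axios.create({
    baseURL: "https://api.soniox.com/v1",
    headers: { Authorization: `Bearer ${apiKey}` }
  });

  // 1. Upload the audio file.
  const form = new FormData();
  form.append("file", new Blob([audio], { type: "audio/wav" }), "audio.wav");
  const upload = await client.post("/files", form);

  // 2. Create the transcription job referencing the uploaded file, then poll its id.
  const job = await client.post("/transcriptions", { model, file_id: upload.data.id });
  return job.data.id as string;
}
```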
@@ -9256,8 +9624,9 @@ var SonioxAdapter = class extends BaseAdapter {
  /**
  * Get transcription result by ID
  *
- * Soniox batch transcription is synchronous (returns immediately),
- * but this method can be used for consistency with other providers.
+ * Checks job status via GET /v1/transcriptions/{id}, then fetches
+ * the full transcript via GET /v1/transcriptions/{id}/transcript
+ * when completed.
  *
  * @param transcriptId - Transcript ID
  * @returns Transcription response
@@ -9265,8 +9634,39 @@ var SonioxAdapter = class extends BaseAdapter {
  async getTranscript(transcriptId) {
  this.validateConfig();
  try {
- const response = await this.client.get(`/speech/transcripts/${transcriptId}`);
- return this.normalizeResponse(response.data);
+ const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
+ const job = statusResponse.data;
+ if (job.status === "error") {
+ return {
+ success: false,
+ provider: this.name,
+ error: {
+ code: "TRANSCRIPTION_ERROR",
+ message: job.error_message || "Transcription failed"
+ }
+ };
+ }
+ if (job.status !== "completed") {
+ return {
+ success: true,
+ provider: this.name,
+ data: {
+ id: job.id,
+ text: "",
+ status: job.status
+ },
+ raw: job
+ };
+ }
+ const transcriptResponse = await this.client.get(
+ `/transcriptions/${transcriptId}/transcript`
+ );
+ return this.normalizeResponse({
+ ...transcriptResponse.data,
+ // Carry over job metadata
+ id: job.id,
+ audio_duration_ms: job.audio_duration_ms
+ });
  } catch (error) {
  return this.createErrorResponse(error);
  }
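getTranscript now distinguishes error, in-progress, and completed jobs, which makes a simple client-side polling loop possible. The adapter's own pollForCompletion is not part of this diff, so the loop below is only an illustration of the pattern; the non-terminal status strings are assumptions:

```typescript
// Illustrative polling loop over the status/transcript flow added above.
// pollForCompletion() itself is not shown in this diff, so this is only a
// sketch; the non-terminal status strings ("queued", "processing") are assumptions.
async function waitForTranscript(
  adapter: { getTranscript(id: string): Promise<any> },
  id: string
) {
  for (let attempt = 0; attempt < 60; attempt++) {
    const result = await adapter.getTranscript(id);
    if (!result.success) throw new Error(result.error?.message ?? "Transcription failed");
    const status = result.data?.status;
    if (status !== "queued" && status !== "processing") {
      return result; // completed: text, words and utterances are populated
    }
    await new Promise((r) => setTimeout(r, 2000)); // wait 2s between polls
  }
  throw new Error("Timed out waiting for Soniox transcription");
}
```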
@@ -9286,50 +9686,51 @@ var SonioxAdapter = class extends BaseAdapter {
  const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
  const createdAt = /* @__PURE__ */ new Date();
  const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
- const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
- wsUrl.searchParams.set("api_key", this.config.apiKey);
- const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
- wsUrl.searchParams.set("model", modelId);
- if (options?.encoding) {
+ const wsUrl = `${wsBase}/transcribe-websocket`;
+ const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
+ const sonioxOpts = options?.sonioxStreaming;
+ const initMessage = {
+ api_key: this.config.apiKey,
+ model: modelId
+ };
+ if (sonioxOpts?.audioFormat) {
+ initMessage.audio_format = sonioxOpts.audioFormat;
+ } else if (options?.encoding) {
  const encodingMap = {
  linear16: "pcm_s16le",
  pcm: "pcm_s16le",
  mulaw: "mulaw",
  alaw: "alaw"
  };
- wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
+ initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
  }
- if (options?.sampleRate) {
- wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
+ if (sonioxOpts?.sampleRate || options?.sampleRate) {
+ initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
  }
- if (options?.channels) {
- wsUrl.searchParams.set("num_channels", options.channels.toString());
+ if (sonioxOpts?.numChannels || options?.channels) {
+ initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
  }
- const sonioxOpts = options?.sonioxStreaming;
  if (sonioxOpts) {
  if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
- wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
+ initMessage.language_hints = sonioxOpts.languageHints;
  }
  if (sonioxOpts.enableLanguageIdentification) {
- wsUrl.searchParams.set("enable_language_identification", "true");
+ initMessage.enable_language_identification = true;
  }
  if (sonioxOpts.enableEndpointDetection) {
- wsUrl.searchParams.set("enable_endpoint_detection", "true");
+ initMessage.enable_endpoint_detection = true;
  }
  if (sonioxOpts.enableSpeakerDiarization) {
- wsUrl.searchParams.set("enable_speaker_diarization", "true");
+ initMessage.enable_speaker_diarization = true;
  }
  if (sonioxOpts.context) {
- wsUrl.searchParams.set(
- "context",
- typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
- );
+ initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
  }
  if (sonioxOpts.translation) {
- wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
+ initMessage.translation = sonioxOpts.translation;
  }
  if (sonioxOpts.clientReferenceId) {
- wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
+ initMessage.client_reference_id = sonioxOpts.clientReferenceId;
  }
  }
  if (!sonioxOpts?.languageHints && options?.language) {
@@ -9338,24 +9739,33 @@ var SonioxAdapter = class extends BaseAdapter {
  `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
  );
  }
- wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
+ initMessage.language_hints = [options.language];
  }
  if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
- wsUrl.searchParams.set("enable_speaker_diarization", "true");
+ initMessage.enable_speaker_diarization = true;
  }
  if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
- wsUrl.searchParams.set("enable_language_identification", "true");
- }
- if (options?.interimResults !== false) {
+ initMessage.enable_language_identification = true;
  }
  let status = "connecting";
  let openedAt = null;
  let receivedData = false;
  const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
- const ws = new WebSocketImpl(wsUrl.toString());
+ const ws = new WebSocketImpl(wsUrl);
  ws.onopen = () => {
- status = "open";
  openedAt = Date.now();
+ const initPayload = JSON.stringify(initMessage);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: initPayload,
+ messageType: "init"
+ });
+ }
+ ws.send(initPayload);
+ status = "open";
  callbacks?.onOpen?.();
  };
  ws.onmessage = (event) => {
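The Soniox streaming rewrite above stops encoding the API key and options as URL query parameters and instead sends them as a JSON init message immediately after the socket opens. A sketch of that first frame; the field names come from the init message assembled in the diff, while the host, key, and option values are placeholders:

```typescript
// Sketch of the first frame now sent on the Soniox real-time socket.
// Field names come from the init message assembled above; the host, API key,
// and option values here are placeholders (the adapter derives the host from
// its region/baseUrl config).
import WebSocket from "ws";

const ws = new WebSocket("wss://stt-rt.soniox.com/transcribe-websocket");

ws.on("open", () => {
  // Configuration is no longer passed as URL query parameters; it is the
  // first JSON message on the socket, followed by binary audio frames.
  ws.send(JSON.stringify({
    api_key: process.env.SONIOX_API_KEY,
    model: "stt-rt-v4",
    audio_format: "pcm_s16le",
    sample_rate: 16000,
    num_channels: 1,
    enable_speaker_diarization: true,
    language_hints: ["en"]
  }));
});
```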
@@ -9434,10 +9844,10 @@ var SonioxAdapter = class extends BaseAdapter {
  ws.onclose = (event) => {
  status = "closed";
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
- const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
- if (isImmediateClose && event.code === 1e3) {
+ const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
+ if (isEarlyClose && event.code === 1e3) {
  const errorMessage = [
- "Soniox closed connection immediately after opening.",
+ "Soniox closed connection shortly after opening.",
  `Current config: region=${this.region}, model=${modelId}`,
  "Likely causes:",
  " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -9558,8 +9968,10 @@ var SonioxAdapter = class extends BaseAdapter {
  * Normalize Soniox response to unified format
  */
  normalizeResponse(response) {
- const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final).map((t) => t.text).join("") : "");
- const words = response.tokens ? response.tokens.filter((t) => t.is_final && t.start_ms !== void 0 && t.end_ms !== void 0).map((token) => ({
+ const text = response.text || (response.tokens ? response.tokens.filter((t) => t.is_final !== false).map((t) => t.text).join("") : "");
+ const words = response.tokens ? response.tokens.filter(
+ (t) => t.is_final !== false && t.start_ms !== void 0 && t.end_ms !== void 0
+ ).map((token) => ({
  word: token.text,
  start: token.start_ms / 1e3,
  end: token.end_ms / 1e3,
@@ -9576,7 +9988,8 @@ var SonioxAdapter = class extends BaseAdapter {
  id,
  label: `Speaker ${id}`
  })) : void 0;
- const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens.filter((t) => t.is_final)) : [];
+ const tokens = response.tokens ? response.tokens.filter((t) => t.is_final !== false) : [];
+ const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
  const language = response.tokens?.find((t) => t.language)?.language;
  return {
  success: true,
@@ -9586,7 +9999,7 @@ var SonioxAdapter = class extends BaseAdapter {
  text,
  status: TranscriptionStatus.completed,
  language,
- duration: response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
+ duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
  speakers,
  words: words.length > 0 ? words : void 0,
  utterances: utterances.length > 0 ? utterances : void 0
@@ -36453,7 +36866,7 @@ var AzureCapabilities = {
  deleteTranscript: true
  };
  var SpeechmaticsCapabilities = {
- streaming: false,
+ streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: false,