voice-router-dev 0.9.0 → 0.9.1

package/dist/index.js CHANGED
@@ -82,7 +82,7 @@ __export(src_exports, {
82
82
  DeepgramTTSSampleRate: () => DeepgramTTSSampleRate,
83
83
  DeepgramTopicMode: () => DeepgramTopicMode,
84
84
  DeepgramTranscriptionSchema: () => DeepgramTranscriptionSchema,
85
- DeepgramTypes: () => schema_exports4,
85
+ DeepgramTypes: () => schema_exports5,
86
86
  DeepgramZodSchemas: () => deepgramAPI_zod_exports,
87
87
  ElevenLabsAdapter: () => ElevenLabsAdapter,
88
88
  ElevenLabsCapabilities: () => ElevenLabsCapabilities,
@@ -119,7 +119,7 @@ __export(src_exports, {
119
119
  OpenAIResponseFormat: () => OpenAIResponseFormat,
120
120
  OpenAIStreamingTypes: () => streaming_types_exports,
121
121
  OpenAITranscriptionSchema: () => OpenAITranscriptionSchema,
122
- OpenAITypes: () => schema_exports5,
122
+ OpenAITypes: () => schema_exports6,
123
123
  OpenAIWhisperAdapter: () => OpenAIWhisperAdapter,
124
124
  OpenAIZodSchemas: () => openAIAudioRealtimeAPI_zod_exports,
125
125
  ProfanityFilterMode: () => ProfanityFilterMode,
@@ -148,7 +148,7 @@ __export(src_exports, {
148
148
  SonioxStreamingUpdateSchema: () => SonioxStreamingUpdateSchema,
149
149
  SonioxStreamingZodSchemas: () => streaming_types_zod_exports,
150
150
  SonioxTranscriptionSchema: () => SonioxTranscriptionSchema,
151
- SonioxTypes: () => schema_exports7,
151
+ SonioxTypes: () => schema_exports4,
152
152
  SpeakV1ContainerParameter: () => SpeakV1ContainerParameter,
153
153
  SpeakV1EncodingParameter: () => SpeakV1EncodingParameter,
154
154
  SpeakV1SampleRateParameter: () => SpeakV1SampleRateParameter,
@@ -163,7 +163,7 @@ __export(src_exports, {
163
163
  SpeechmaticsStreamingSchema: () => SpeechmaticsStreamingSchema,
164
164
  SpeechmaticsStreamingUpdateSchema: () => SpeechmaticsStreamingUpdateSchema,
165
165
  SpeechmaticsTranscriptionSchema: () => SpeechmaticsTranscriptionSchema,
166
- SpeechmaticsTypes: () => schema_exports6,
166
+ SpeechmaticsTypes: () => schema_exports7,
167
167
  SpeechmaticsZodSchemas: () => speechmaticsASRRESTAPI_zod_exports,
168
168
  StreamingProviders: () => StreamingProviders,
169
169
  StreamingSupportedBitDepthEnum: () => StreamingSupportedBitDepthEnum,
@@ -6064,23 +6064,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
6064
6064
  "AssemblyAI adapter currently only supports URL-based audio input. Use audio.type='url'"
6065
6065
  );
6066
6066
  }
6067
- const aaiOpts = { ...options?.assemblyai };
6068
- if ("speech_model" in aaiOpts && aaiOpts.speech_model != null) {
6069
- if (!aaiOpts.speech_models) {
6070
- aaiOpts.speech_models = [aaiOpts.speech_model];
6071
- }
6072
- delete aaiOpts.speech_model;
6067
+ const passthrough = options?.assemblyai;
6068
+ let speechModels;
6069
+ if (passthrough?.speech_model != null && !passthrough.speech_models) {
6070
+ speechModels = [passthrough.speech_model];
6071
+ } else if (passthrough?.speech_models) {
6072
+ speechModels = passthrough.speech_models;
6073
6073
  }
6074
+ const { speech_model: _deprecated, ...typedOpts } = passthrough ?? {};
6074
6075
  const request = {
6075
- ...aaiOpts,
6076
+ ...typedOpts,
6076
6077
  audio_url: audioUrl,
6077
6078
  // speech_models is required — default to universal-3-pro
6078
- speech_models: aaiOpts.speech_models ?? [
6079
- "universal-3-pro"
6080
- ],
6079
+ speech_models: speechModels ?? ["universal-3-pro"],
6081
6080
  // Enable punctuation and formatting by default
6082
- punctuate: aaiOpts.punctuate ?? true,
6083
- format_text: aaiOpts.format_text ?? true
6081
+ punctuate: typedOpts.punctuate ?? true,
6082
+ format_text: typedOpts.format_text ?? true
6084
6083
  };
6085
6084
  if (options) {
6086
6085
  if (options.model) {
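
Reviewer note on the hunk above: the deprecated singular `speech_model` passthrough is now mapped onto `speech_models` without mutating the caller's options object. A minimal sketch of the mapping, assuming only the passthrough field names visible in this hunk (the types are illustrative, not the AssemblyAI SDK's):

```typescript
// Sketch of the speech_model → speech_models migration, per the hunk above.
interface AssemblyAIPassthrough {
  speech_model?: string;     // deprecated singular form
  speech_models?: string[];  // current plural form
  punctuate?: boolean;
  format_text?: boolean;
  [key: string]: unknown;
}

function buildRequest(audioUrl: string, passthrough: AssemblyAIPassthrough = {}) {
  // Honor the deprecated key only when the plural form is absent.
  let speechModels: string[] | undefined;
  if (passthrough.speech_model != null && !passthrough.speech_models) {
    speechModels = [passthrough.speech_model];
  } else if (passthrough.speech_models) {
    speechModels = passthrough.speech_models;
  }
  // Strip the deprecated key so it never reaches the API.
  const { speech_model: _deprecated, ...typedOpts } = passthrough;
  return {
    ...typedOpts,
    audio_url: audioUrl,
    speech_models: speechModels ?? ["universal-3-pro"], // required field, per the hunk
    punctuate: typedOpts.punctuate ?? true,
    format_text: typedOpts.format_text ?? true,
  };
}

// buildRequest("https://example.com/a.wav", { speech_model: "universal-3-pro" })
// → { audio_url: "...", speech_models: ["universal-3-pro"], punctuate: true, format_text: true }
```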
@@ -6128,22 +6127,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
6128
6127
  normalizeResponse(response) {
6129
6128
  let status;
6130
6129
  switch (response.status) {
6131
- case TranscriptStatus.queued:
6130
+ case "queued":
6132
6131
  status = "queued";
6133
6132
  break;
6134
- case TranscriptStatus.processing:
6133
+ case "processing":
6135
6134
  status = "processing";
6136
6135
  break;
6137
- case TranscriptStatus.completed:
6136
+ case "completed":
6138
6137
  status = "completed";
6139
6138
  break;
6140
- case TranscriptStatus.error:
6139
+ case "error":
6141
6140
  status = "error";
6142
6141
  break;
6143
6142
  default:
6144
6143
  status = "queued";
6145
6144
  }
6146
- if (response.status === TranscriptStatus.error) {
6145
+ if (response.status === "error") {
6147
6146
  return {
6148
6147
  success: false,
6149
6148
  provider: this.name,
@@ -6795,8 +6794,10 @@ var DeepgramAdapter = class extends BaseAdapter {
6795
6794
  /**
6796
6795
  * Submit audio for transcription
6797
6796
  *
6798
- * Sends audio to Deepgram API for transcription. Deepgram processes
6799
- * synchronously and returns results immediately (no polling required).
6797
+ * Sends audio to Deepgram API for transcription. Deepgram normally processes
6798
+ * synchronously and returns results immediately. When `webhookUrl` is set,
6799
+ * Deepgram can instead return an async callback acknowledgment containing a
6800
+ * request ID.
6800
6801
  *
6801
6802
  * @param audio - Audio input (URL or file buffer)
6802
6803
  * @param options - Transcription options
@@ -6847,17 +6848,59 @@ var DeepgramAdapter = class extends BaseAdapter {
6847
6848
  { params }
6848
6849
  ).then((res) => res.data);
6849
6850
  } else if (audio.type === "file") {
6850
- response = await this.client.post("/listen", audio.file, {
6851
- params,
6852
- headers: {
6853
- "Content-Type": "audio/*"
6851
+ response = await this.client.post(
6852
+ "/listen",
6853
+ audio.file,
6854
+ {
6855
+ params,
6856
+ headers: {
6857
+ "Content-Type": "audio/*"
6858
+ }
6854
6859
  }
6855
- }).then((res) => res.data);
6860
+ ).then((res) => res.data);
6856
6861
  } else {
6857
6862
  throw new Error(
6858
6863
  "Deepgram adapter does not support stream type for pre-recorded transcription. Use transcribeStream() for real-time streaming."
6859
6864
  );
6860
6865
  }
6866
+ if (options?.webhookUrl) {
6867
+ const requestId = ("request_id" in response ? response.request_id : void 0) || ("metadata" in response ? response.metadata?.request_id : void 0);
6868
+ if (!requestId) {
6869
+ return {
6870
+ success: false,
6871
+ provider: this.name,
6872
+ error: {
6873
+ code: "MISSING_REQUEST_ID",
6874
+ message: "Deepgram callback mode did not return a request ID"
6875
+ },
6876
+ raw: response
6877
+ };
6878
+ }
6879
+ return {
6880
+ success: true,
6881
+ provider: this.name,
6882
+ data: {
6883
+ id: requestId,
6884
+ text: "",
6885
+ status: "queued"
6886
+ },
6887
+ tracking: {
6888
+ requestId
6889
+ },
6890
+ raw: response
6891
+ };
6892
+ }
6893
+ if (!("results" in response) || !("metadata" in response)) {
6894
+ return {
6895
+ success: false,
6896
+ provider: this.name,
6897
+ error: {
6898
+ code: "INVALID_RESPONSE",
6899
+ message: "Deepgram did not return a synchronous transcription payload"
6900
+ },
6901
+ raw: response
6902
+ };
6903
+ }
6861
6904
  return this.normalizeResponse(response);
6862
6905
  } catch (error) {
6863
6906
  return this.createErrorResponse(error);
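
A Deepgram `transcribe()` call now has two success shapes: a synchronous transcript, or a queued acknowledgment when `webhookUrl` is set. A hedged usage sketch (the adapter is typed loosely here; construction and the exact options type are as defined elsewhere in this package):

```typescript
// Usage sketch for the new callback path; the webhook URL is a placeholder.
async function submitWithCallback(adapter: any, audioUrl: string) {
  const result = await adapter.transcribe(
    { type: "url", url: audioUrl },
    { webhookUrl: "https://example.com/deepgram-callback" }
  );
  if (!result.success) {
    // New failure codes introduced in this hunk: MISSING_REQUEST_ID, INVALID_RESPONSE.
    console.error(result.error.code, result.error.message);
    return;
  }
  if (result.data.status === "queued") {
    // Callback mode: only an acknowledgment; the transcript arrives at the webhook.
    console.log("queued, request id:", result.tracking?.requestId);
  } else {
    console.log(result.data.text); // synchronous mode: transcript already normalized
  }
}
```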
@@ -7518,7 +7561,8 @@ var DeepgramAdapter = class extends BaseAdapter {
7518
7561
  break;
7519
7562
  }
7520
7563
  case "Metadata": {
7521
- callbacks?.onMetadata?.(message);
7564
+ const { type: _, ...metadata } = message;
7565
+ callbacks?.onMetadata?.(metadata);
7522
7566
  break;
7523
7567
  }
7524
7568
  case "Error": {
@@ -7954,10 +7998,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
7954
7998
  contentUrls: [audio.url],
7955
7999
  properties: this.buildTranscriptionProperties(options)
7956
8000
  };
7957
- const response = await transcriptionsCreate(
7958
- transcriptionRequest,
7959
- this.getAxiosConfig()
7960
- );
8001
+ const response = await transcriptionsCreate(transcriptionRequest, this.getAxiosConfig());
7961
8002
  const transcription = response.data;
7962
8003
  const transcriptId = transcription.self?.split("/").pop() || "";
7963
8004
  return await this.pollForCompletion(transcriptId);
@@ -8497,7 +8538,6 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
8497
8538
  const request = {
8498
8539
  ...options?.openai,
8499
8540
  file: audioData,
8500
- // Buffer/Blob both accepted at runtime; generated type expects Blob
8501
8541
  model
8502
8542
  };
8503
8543
  if (options?.language) {
@@ -8517,11 +8557,7 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
8517
8557
  request.response_format = OpenAIResponseFormat.json;
8518
8558
  }
8519
8559
  const response = await createTranscription(request, this.getAxiosConfig());
8520
- return this.normalizeResponse(
8521
- response.data,
8522
- model,
8523
- isDiarization
8524
- );
8560
+ return this.normalizeResponse(response.data, model, isDiarization);
8525
8561
  } catch (error) {
8526
8562
  return this.createErrorResponse(error);
8527
8563
  }
@@ -8928,7 +8964,6 @@ function createOpenAIWhisperAdapter(config) {
8928
8964
 
8929
8965
  // src/adapters/speechmatics-adapter.ts
8930
8966
  var import_axios8 = __toESM(require("axios"));
8931
- var import_ws5 = __toESM(require("ws"));
8932
8967
 
8933
8968
  // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
8934
8969
  var NotificationConfigContentsItem = {
@@ -8978,7 +9013,8 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
8978
9013
  super(...arguments);
8979
9014
  this.name = "speechmatics";
8980
9015
  this.capabilities = {
8981
- streaming: true,
9016
+ streaming: false,
9017
+ // Batch only (streaming available via separate WebSocket API)
8982
9018
  diarization: true,
8983
9019
  wordTimestamps: true,
8984
9020
  languageDetection: false,
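
With `streaming` now reported as `false`, downstream code should gate on the capability flag rather than assuming every adapter streams. A small sketch against the `capabilities` shape shown above:

```typescript
// Guard against batch-only adapters before attempting real-time use.
function assertStreaming(adapter: { name: string; capabilities: { streaming: boolean } }) {
  if (!adapter.capabilities.streaming) {
    throw new Error(
      `${adapter.name} is batch-only in this build; use transcribe(), ` +
      `or the provider's own real-time API for streaming`
    );
  }
}
```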
@@ -9113,16 +9149,13 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
9113
9149
  jobConfig.fetch_data = {
9114
9150
  url: audio.url
9115
9151
  };
9116
- const formData = new FormData();
9117
- formData.append("config", JSON.stringify(jobConfig));
9118
- requestBody = formData;
9119
- headers = { "Content-Type": "multipart/form-data" };
9152
+ requestBody = { config: JSON.stringify(jobConfig) };
9153
+ headers = { "Content-Type": "application/json" };
9120
9154
  } else if (audio.type === "file") {
9121
- const formData = new FormData();
9122
- formData.append("config", JSON.stringify(jobConfig));
9123
- const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
9124
- formData.append("data_file", audioBlob, audio.filename || "audio.wav");
9125
- requestBody = formData;
9155
+ requestBody = {
9156
+ config: JSON.stringify(jobConfig),
9157
+ data_file: audio.file
9158
+ };
9126
9159
  headers = { "Content-Type": "multipart/form-data" };
9127
9160
  } else {
9128
9161
  return {
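
The submit path now branches by input type: URL jobs post a JSON body with a pre-stringified `config`, while file jobs keep the multipart header but pass a plain object instead of constructing `FormData` (the HTTP client is expected to serialize it; with axios that conversion is automatic only in versions that support it, which is worth verifying). A sketch of the two shapes, assuming the field names in the hunk:

```typescript
// Sketch of the two request payloads after this change; endpoint and client omitted.
type SpeechmaticsJobConfig = { type: string; fetch_data?: { url: string } };

function buildSubmitPayload(
  jobConfig: SpeechmaticsJobConfig,
  audio: { type: "url"; url: string } | { type: "file"; file: Blob }
) {
  if (audio.type === "url") {
    // URL jobs: JSON body, config serialized up front.
    return {
      body: { config: JSON.stringify(jobConfig) },
      headers: { "Content-Type": "application/json" },
    };
  }
  // File jobs: plain object + multipart header; serialization to FormData
  // is left to the HTTP layer (a behavior assumption worth checking in review).
  return {
    body: { config: JSON.stringify(jobConfig), data_file: audio.file },
    headers: { "Content-Type": "multipart/form-data" },
  };
}
```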
@@ -9227,389 +9260,6 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
9227
9260
  throw error;
9228
9261
  }
9229
9262
  }
9230
- /**
9231
- * Build WebSocket URL for real-time streaming
9232
- *
9233
- * Note: Real-time API uses a different host from the batch API:
9234
- * - Batch: {region}.asr.api.speechmatics.com
9235
- * - Real-time: {region}.rt.speechmatics.com
9236
- *
9237
- * @param region - Regional endpoint identifier
9238
- * @returns WebSocket URL for real-time API
9239
- */
9240
- getRegionalWsUrl(region) {
9241
- if (this.config?.wsBaseUrl) {
9242
- return this.config.wsBaseUrl;
9243
- }
9244
- const rtRegionMap = {
9245
- eu1: "eu",
9246
- eu2: "eu",
9247
- us1: "us",
9248
- us2: "us",
9249
- au1: "eu"
9250
- // No AU RT endpoint — fall back to EU
9251
- };
9252
- const rtPrefix = rtRegionMap[region || ""] || "eu";
9253
- return `wss://${rtPrefix}.rt.speechmatics.com/v2`;
9254
- }
9255
- /**
9256
- * Stream audio for real-time transcription via WebSocket
9257
- *
9258
- * Connects to Speechmatics' real-time API and sends audio chunks
9259
- * for transcription with results returned via callbacks.
9260
- *
9261
- * @param options - Streaming configuration options
9262
- * @param callbacks - Event callbacks for transcription results
9263
- * @returns Promise that resolves with a StreamingSession
9264
- *
9265
- * @example Basic streaming
9266
- * ```typescript
9267
- * const session = await adapter.transcribeStream({
9268
- * language: 'en',
9269
- * speechmaticsStreaming: {
9270
- * enablePartials: true,
9271
- * operatingPoint: 'enhanced'
9272
- * }
9273
- * }, {
9274
- * onTranscript: (event) => console.log(event.text),
9275
- * onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
9276
- * onError: (error) => console.error(error)
9277
- * });
9278
- *
9279
- * await session.sendAudio({ data: audioBuffer });
9280
- * await session.close();
9281
- * ```
9282
- */
9283
- async transcribeStream(options, callbacks) {
9284
- this.validateConfig();
9285
- const smOpts = options?.speechmaticsStreaming || {};
9286
- const region = smOpts.region || this.config?.region;
9287
- const wsUrl = this.getRegionalWsUrl(region);
9288
- const ws = new import_ws5.default(wsUrl, {
9289
- headers: {
9290
- Authorization: `Bearer ${this.config.apiKey}`
9291
- }
9292
- });
9293
- let sessionStatus = "connecting";
9294
- const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
9295
- let seqNo = 0;
9296
- let utteranceResults = [];
9297
- const sessionReady = new Promise((resolve, reject) => {
9298
- const timeout = setTimeout(() => {
9299
- reject(new Error("WebSocket connection timeout"));
9300
- }, 1e4);
9301
- let wsOpen = false;
9302
- ws.once("error", (error) => {
9303
- clearTimeout(timeout);
9304
- reject(error);
9305
- });
9306
- ws.once("open", () => {
9307
- wsOpen = true;
9308
- const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
9309
- const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
9310
- const startMsg = {
9311
- message: "StartRecognition",
9312
- audio_format: {
9313
- type: "raw",
9314
- encoding,
9315
- sample_rate: sampleRate
9316
- },
9317
- transcription_config: {
9318
- language: smOpts.language || options?.language || "en",
9319
- enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
9320
- }
9321
- };
9322
- const txConfig = startMsg.transcription_config;
9323
- if (smOpts.domain) txConfig.domain = smOpts.domain;
9324
- if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
9325
- if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
9326
- if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
9327
- if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
9328
- if (smOpts.diarization === "speaker" || options?.diarization) {
9329
- txConfig.diarization = "speaker";
9330
- if (smOpts.maxSpeakers) {
9331
- txConfig.speaker_diarization_config = {
9332
- max_speakers: smOpts.maxSpeakers
9333
- };
9334
- } else if (options?.speakersExpected) {
9335
- txConfig.speaker_diarization_config = {
9336
- max_speakers: options.speakersExpected
9337
- };
9338
- }
9339
- }
9340
- if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
9341
- txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
9342
- content: word
9343
- }));
9344
- } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
9345
- txConfig.additional_vocab = options.customVocabulary.map((word) => ({
9346
- content: word
9347
- }));
9348
- }
9349
- if (smOpts.conversationConfig) {
9350
- txConfig.conversation_config = {
9351
- end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
9352
- };
9353
- }
9354
- const startPayload = JSON.stringify(startMsg);
9355
- if (callbacks?.onRawMessage) {
9356
- callbacks.onRawMessage({
9357
- provider: "speechmatics",
9358
- direction: "outgoing",
9359
- timestamp: Date.now(),
9360
- payload: startPayload,
9361
- messageType: "StartRecognition"
9362
- });
9363
- }
9364
- ws.send(startPayload);
9365
- });
9366
- const onMessage = (data) => {
9367
- const rawPayload = data.toString();
9368
- try {
9369
- const msg = JSON.parse(rawPayload);
9370
- if (msg.message === "RecognitionStarted") {
9371
- clearTimeout(timeout);
9372
- ws.removeListener("message", onMessage);
9373
- ws.emit("message", data);
9374
- resolve();
9375
- } else if (msg.message === "Error") {
9376
- clearTimeout(timeout);
9377
- ws.removeListener("message", onMessage);
9378
- reject(new Error(msg.reason || "Recognition failed to start"));
9379
- }
9380
- } catch {
9381
- }
9382
- };
9383
- ws.on("message", onMessage);
9384
- });
9385
- ws.on("message", (data) => {
9386
- const rawPayload = data.toString();
9387
- try {
9388
- const message = JSON.parse(rawPayload);
9389
- if (callbacks?.onRawMessage) {
9390
- callbacks.onRawMessage({
9391
- provider: "speechmatics",
9392
- direction: "incoming",
9393
- timestamp: Date.now(),
9394
- payload: rawPayload,
9395
- messageType: message.message
9396
- });
9397
- }
9398
- this.handleStreamingMessage(message, callbacks, utteranceResults);
9399
- } catch (error) {
9400
- if (callbacks?.onRawMessage) {
9401
- callbacks.onRawMessage({
9402
- provider: "speechmatics",
9403
- direction: "incoming",
9404
- timestamp: Date.now(),
9405
- payload: rawPayload,
9406
- messageType: "parse_error"
9407
- });
9408
- }
9409
- callbacks?.onError?.({
9410
- code: "PARSE_ERROR",
9411
- message: "Failed to parse WebSocket message",
9412
- details: error
9413
- });
9414
- }
9415
- });
9416
- ws.on("error", (error) => {
9417
- callbacks?.onError?.({
9418
- code: "WEBSOCKET_ERROR",
9419
- message: error.message,
9420
- details: error
9421
- });
9422
- });
9423
- ws.on("close", (code, reason) => {
9424
- sessionStatus = "closed";
9425
- callbacks?.onClose?.(code, reason.toString());
9426
- });
9427
- await sessionReady;
9428
- sessionStatus = "open";
9429
- callbacks?.onOpen?.();
9430
- return {
9431
- id: sessionId,
9432
- provider: this.name,
9433
- createdAt: /* @__PURE__ */ new Date(),
9434
- getStatus: () => sessionStatus,
9435
- sendAudio: async (chunk) => {
9436
- if (sessionStatus !== "open") {
9437
- throw new Error(`Cannot send audio: session is ${sessionStatus}`);
9438
- }
9439
- if (ws.readyState !== import_ws5.default.OPEN) {
9440
- throw new Error("WebSocket is not open");
9441
- }
9442
- if (callbacks?.onRawMessage) {
9443
- const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
9444
- chunk.data.byteOffset,
9445
- chunk.data.byteOffset + chunk.data.byteLength
9446
- );
9447
- callbacks.onRawMessage({
9448
- provider: this.name,
9449
- direction: "outgoing",
9450
- timestamp: Date.now(),
9451
- payload: audioPayload,
9452
- messageType: "audio"
9453
- });
9454
- }
9455
- ws.send(chunk.data);
9456
- seqNo++;
9457
- if (chunk.isLast) {
9458
- const endMsg = JSON.stringify({
9459
- message: "EndOfStream",
9460
- last_seq_no: seqNo
9461
- });
9462
- if (callbacks?.onRawMessage) {
9463
- callbacks.onRawMessage({
9464
- provider: this.name,
9465
- direction: "outgoing",
9466
- timestamp: Date.now(),
9467
- payload: endMsg,
9468
- messageType: "EndOfStream"
9469
- });
9470
- }
9471
- ws.send(endMsg);
9472
- }
9473
- },
9474
- close: async () => {
9475
- if (sessionStatus === "closed" || sessionStatus === "closing") {
9476
- return;
9477
- }
9478
- sessionStatus = "closing";
9479
- if (ws.readyState === import_ws5.default.OPEN) {
9480
- seqNo++;
9481
- ws.send(
9482
- JSON.stringify({
9483
- message: "EndOfStream",
9484
- last_seq_no: seqNo
9485
- })
9486
- );
9487
- }
9488
- return new Promise((resolve) => {
9489
- const timeout = setTimeout(() => {
9490
- ws.terminate();
9491
- sessionStatus = "closed";
9492
- resolve();
9493
- }, 5e3);
9494
- const onMsg = (data) => {
9495
- try {
9496
- const msg = JSON.parse(data.toString());
9497
- if (msg.message === "EndOfTranscript") {
9498
- ws.removeListener("message", onMsg);
9499
- clearTimeout(timeout);
9500
- ws.close();
9501
- }
9502
- } catch {
9503
- }
9504
- };
9505
- ws.on("message", onMsg);
9506
- ws.once("close", () => {
9507
- clearTimeout(timeout);
9508
- sessionStatus = "closed";
9509
- resolve();
9510
- });
9511
- });
9512
- }
9513
- };
9514
- }
9515
- /**
9516
- * Handle incoming Speechmatics real-time WebSocket messages
9517
- */
9518
- handleStreamingMessage(message, callbacks, utteranceResults) {
9519
- switch (message.message) {
9520
- case "RecognitionStarted": {
9521
- break;
9522
- }
9523
- case "AddPartialTranscript": {
9524
- const results = message.results || [];
9525
- const text = buildTextFromSpeechmaticsResults(results);
9526
- if (text) {
9527
- callbacks?.onTranscript?.({
9528
- type: "transcript",
9529
- text,
9530
- isFinal: false,
9531
- words: this.extractWordsFromResults(results),
9532
- data: message
9533
- });
9534
- }
9535
- break;
9536
- }
9537
- case "AddTranscript": {
9538
- const results = message.results || [];
9539
- const text = buildTextFromSpeechmaticsResults(results);
9540
- if (utteranceResults) {
9541
- utteranceResults.push(...results);
9542
- }
9543
- if (text) {
9544
- callbacks?.onTranscript?.({
9545
- type: "transcript",
9546
- text,
9547
- isFinal: true,
9548
- words: this.extractWordsFromResults(results),
9549
- data: message
9550
- });
9551
- }
9552
- break;
9553
- }
9554
- case "EndOfUtterance": {
9555
- if (utteranceResults && utteranceResults.length > 0) {
9556
- const text = buildTextFromSpeechmaticsResults(utteranceResults);
9557
- const words = this.extractWordsFromResults(utteranceResults);
9558
- const utterances = buildUtterancesFromWords(words);
9559
- if (utterances.length > 0) {
9560
- for (const utt of utterances) {
9561
- callbacks?.onUtterance?.(utt);
9562
- }
9563
- } else if (text) {
9564
- callbacks?.onUtterance?.({
9565
- text,
9566
- start: words.length > 0 ? words[0].start : 0,
9567
- end: words.length > 0 ? words[words.length - 1].end : 0,
9568
- words
9569
- });
9570
- }
9571
- utteranceResults.length = 0;
9572
- }
9573
- break;
9574
- }
9575
- case "AudioAdded": {
9576
- break;
9577
- }
9578
- case "EndOfTranscript": {
9579
- break;
9580
- }
9581
- case "Info":
9582
- case "Warning": {
9583
- callbacks?.onMetadata?.(message);
9584
- break;
9585
- }
9586
- case "Error": {
9587
- const errMsg = message;
9588
- callbacks?.onError?.({
9589
- code: errMsg.type || "SPEECHMATICS_ERROR",
9590
- message: errMsg.reason || "Unknown error",
9591
- details: message
9592
- });
9593
- break;
9594
- }
9595
- default: {
9596
- callbacks?.onMetadata?.(message);
9597
- break;
9598
- }
9599
- }
9600
- }
9601
- /**
9602
- * Extract unified Word[] from Speechmatics recognition results
9603
- */
9604
- extractWordsFromResults(results) {
9605
- return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
9606
- word: result.alternatives?.[0]?.content || "",
9607
- start: result.start_time,
9608
- end: result.end_time,
9609
- confidence: result.alternatives?.[0]?.confidence,
9610
- speaker: result.alternatives?.[0]?.speaker
9611
- }));
9612
- }
9613
9263
  /**
9614
9264
  * Normalize Speechmatics status to unified status
9615
9265
  * Uses generated JobDetailsStatus enum values
@@ -9678,9 +9328,6 @@ function createSpeechmaticsAdapter(config) {
9678
9328
  return adapter;
9679
9329
  }
9680
9330
 
9681
- // src/adapters/soniox-adapter.ts
9682
- var import_axios9 = __toESM(require("axios"));
9683
-
9684
9331
  // src/generated/soniox/schema/transcriptionStatus.ts
9685
9332
  var TranscriptionStatus = {
9686
9333
  queued: "queued",
@@ -9689,6 +9336,57 @@ var TranscriptionStatus = {
9689
9336
  error: "error"
9690
9337
  };
9691
9338
 
9339
+ // src/generated/soniox/api/sonioxPublicAPI.ts
9340
+ var import_axios9 = __toESM(require("axios"));
9341
+
9342
+ // src/generated/soniox/schema/index.ts
9343
+ var schema_exports4 = {};
9344
+ __export(schema_exports4, {
9345
+ TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
9346
+ TranscriptionMode: () => TranscriptionMode,
9347
+ TranscriptionStatus: () => TranscriptionStatus,
9348
+ TranslationConfigType: () => TranslationConfigType
9349
+ });
9350
+
9351
+ // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
9352
+ var TemporaryApiKeyUsageType = {
9353
+ transcribe_websocket: "transcribe_websocket"
9354
+ };
9355
+
9356
+ // src/generated/soniox/schema/transcriptionMode.ts
9357
+ var TranscriptionMode = {
9358
+ real_time: "real_time",
9359
+ async: "async"
9360
+ };
9361
+
9362
+ // src/generated/soniox/schema/translationConfigType.ts
9363
+ var TranslationConfigType = {
9364
+ one_way: "one_way",
9365
+ two_way: "two_way"
9366
+ };
9367
+
9368
+ // src/generated/soniox/api/sonioxPublicAPI.ts
9369
+ var uploadFile = (uploadFileBody2, options) => {
9370
+ const formData = new FormData();
9371
+ if (uploadFileBody2.client_reference_id !== void 0 && uploadFileBody2.client_reference_id !== null) {
9372
+ formData.append("client_reference_id", uploadFileBody2.client_reference_id);
9373
+ }
9374
+ formData.append("file", uploadFileBody2.file);
9375
+ return import_axios9.default.post("/v1/files", formData, options);
9376
+ };
9377
+ var createTranscription2 = (createTranscriptionPayload, options) => {
9378
+ return import_axios9.default.post("/v1/transcriptions", createTranscriptionPayload, options);
9379
+ };
9380
+ var getTranscription = (transcriptionId, options) => {
9381
+ return import_axios9.default.get(`/v1/transcriptions/${transcriptionId}`, options);
9382
+ };
9383
+ var getTranscriptionTranscript = (transcriptionId, options) => {
9384
+ return import_axios9.default.get(`/v1/transcriptions/${transcriptionId}/transcript`, options);
9385
+ };
9386
+ var getModels = (options) => {
9387
+ return import_axios9.default.get("/v1/models", options);
9388
+ };
9389
+
9692
9390
  // src/adapters/soniox-adapter.ts
9693
9391
  var SonioxAdapter = class extends BaseAdapter {
9694
9392
  constructor() {
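
The generated functions above call the module-level axios instance with `/v1/...` paths, so they only work when every call receives a config carrying `baseURL` and auth; that is exactly what the adapter's new `getAxiosConfig()` provides in the next hunk. A standalone sketch of such a config (the host and env var are placeholders; the adapter resolves a regional host at runtime):

```typescript
// Sketch of the per-call config the generated Soniox functions expect.
import type { AxiosRequestConfig } from "axios";

const sonioxConfig: AxiosRequestConfig = {
  baseURL: "https://api.soniox.com", // assumed host, no /v1 suffix (paths include it)
  headers: { Authorization: `Bearer ${process.env.SONIOX_API_KEY}` },
};

// Then, with the bundled functions above:
//   const created = await createTranscription2({ model, audio_url }, sonioxConfig);
//   const meta = await getTranscription(created.data.id, sonioxConfig);
//   if (meta.data.status === "completed") {
//     const { data } = await getTranscriptionTranscript(meta.data.id, sonioxConfig);
//     console.log(data.text);
//   }
```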
@@ -9743,11 +9441,17 @@ var SonioxAdapter = class extends BaseAdapter {
9743
9441
  }
9744
9442
  }
9745
9443
  /**
9746
- * Get the base URL for API requests
9444
+ * Get the base URL for API requests (no /v1 suffix — generated functions include /v1 in paths)
9747
9445
  */
9748
9446
  get baseUrl() {
9749
9447
  if (this.config?.baseUrl) return this.config.baseUrl;
9750
- return `https://${this.getRegionalHost()}/v1`;
9448
+ return `https://${this.getRegionalHost()}`;
9449
+ }
9450
+ /**
9451
+ * Build axios config with Soniox Bearer auth
9452
+ */
9453
+ getAxiosConfig() {
9454
+ return super.getAxiosConfig("Authorization", (key) => `Bearer ${key}`);
9751
9455
  }
9752
9456
  initialize(config) {
9753
9457
  super.initialize(config);
@@ -9757,15 +9461,6 @@ var SonioxAdapter = class extends BaseAdapter {
9757
9461
  if (config.model) {
9758
9462
  this.defaultModel = config.model;
9759
9463
  }
9760
- this.client = import_axios9.default.create({
9761
- baseURL: this.baseUrl,
9762
- timeout: config.timeout || 12e4,
9763
- headers: {
9764
- Authorization: `Bearer ${config.apiKey}`,
9765
- "Content-Type": "application/json",
9766
- ...config.headers
9767
- }
9768
- });
9769
9464
  }
9770
9465
  /**
9771
9466
  * Get current region
@@ -9795,23 +9490,12 @@ var SonioxAdapter = class extends BaseAdapter {
9795
9490
  */
9796
9491
  setRegion(region) {
9797
9492
  this.region = region;
9798
- if (this.config?.apiKey) {
9799
- this.client = import_axios9.default.create({
9800
- baseURL: this.baseUrl,
9801
- timeout: this.config.timeout || 12e4,
9802
- headers: {
9803
- Authorization: `Bearer ${this.config.apiKey}`,
9804
- "Content-Type": "application/json",
9805
- ...this.config.headers
9806
- }
9807
- });
9808
- }
9809
9493
  }
9810
9494
  /**
9811
9495
  * Submit audio for transcription
9812
9496
  *
9813
- * Soniox uses async batch processing. The transcribe method submits audio
9814
- * and waits for completion (or use getTranscript for polling).
9497
+ * Uses the async v1 API: createTranscription returns status `queued`;
9498
+ * the adapter then polls until completion (or returns immediately when a webhook is set).
9815
9499
  *
9816
9500
  * @param audio - Audio input (URL or file)
9817
9501
  * @param options - Transcription options
@@ -9820,21 +9504,44 @@ var SonioxAdapter = class extends BaseAdapter {
9820
9504
  async transcribe(audio, options) {
9821
9505
  this.validateConfig();
9822
9506
  try {
9823
- const requestBody = {
9824
- model: options?.model || this.defaultModel
9825
- };
9826
- if (audio.type === "url") {
9827
- requestBody.audio_url = audio.url;
9828
- } else if (audio.type === "file") {
9829
- const formData = new FormData();
9507
+ const sonioxOpts = options?.soniox;
9508
+ if (audio.type === "file") {
9830
9509
  const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
9831
- formData.append("file", audioBlob, audio.filename || "audio.wav");
9832
- const uploadResponse = await this.client.post("/files", formData, {
9833
- headers: {
9834
- "Content-Type": "multipart/form-data"
9835
- }
9836
- });
9837
- requestBody.file_id = uploadResponse.data.id;
9510
+ const uploadBody = { file: audioBlob };
9511
+ const fileResp = await uploadFile(uploadBody, this.getAxiosConfig());
9512
+ const payload = {
9513
+ ...sonioxOpts,
9514
+ model: options?.model || this.defaultModel,
9515
+ file_id: fileResp.data.id,
9516
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
9517
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
9518
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
9519
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
9520
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
9521
+ };
9522
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
9523
+ const meta = createResp.data;
9524
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
9525
+ return this.normalizeTranscription(meta);
9526
+ }
9527
+ return this.pollForCompletion(meta.id);
9528
+ } else if (audio.type === "url") {
9529
+ const payload = {
9530
+ ...sonioxOpts,
9531
+ model: options?.model || this.defaultModel,
9532
+ audio_url: audio.url,
9533
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
9534
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
9535
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
9536
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
9537
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
9538
+ };
9539
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
9540
+ const meta = createResp.data;
9541
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
9542
+ return this.normalizeTranscription(meta);
9543
+ }
9544
+ return this.pollForCompletion(meta.id);
9838
9545
  } else {
9839
9546
  return {
9840
9547
  success: false,
@@ -9845,38 +9552,6 @@ var SonioxAdapter = class extends BaseAdapter {
9845
9552
  }
9846
9553
  };
9847
9554
  }
9848
- if (options?.language) {
9849
- requestBody.language_hints = [options.language];
9850
- }
9851
- if (options?.diarization) {
9852
- requestBody.enable_speaker_diarization = true;
9853
- }
9854
- if (options?.languageDetection) {
9855
- requestBody.enable_language_identification = true;
9856
- }
9857
- if (options?.customVocabulary && options.customVocabulary.length > 0) {
9858
- requestBody.context = {
9859
- terms: options.customVocabulary
9860
- };
9861
- }
9862
- if (options?.webhookUrl) {
9863
- requestBody.webhook_url = options.webhookUrl;
9864
- }
9865
- const response = await this.client.post("/transcriptions", requestBody);
9866
- const transcriptionId = response.data.id;
9867
- if (options?.webhookUrl) {
9868
- return {
9869
- success: true,
9870
- provider: this.name,
9871
- data: {
9872
- id: transcriptionId,
9873
- text: "",
9874
- status: "queued"
9875
- },
9876
- raw: response.data
9877
- };
9878
- }
9879
- return await this.pollForCompletion(transcriptionId);
9880
9555
  } catch (error) {
9881
9556
  return this.createErrorResponse(error);
9882
9557
  }
@@ -9884,9 +9559,8 @@ var SonioxAdapter = class extends BaseAdapter {
9884
9559
  /**
9885
9560
  * Get transcription result by ID
9886
9561
  *
9887
- * Checks job status via GET /v1/transcriptions/{id}, then fetches
9888
- * the full transcript via GET /v1/transcriptions/{id}/transcript
9889
- * when completed.
9562
+ * Fetches transcription metadata and, if completed, the transcript text/tokens.
9563
+ * Used by pollForCompletion() for async polling.
9890
9564
  *
9891
9565
  * @param transcriptId - Transcript ID
9892
9566
  * @returns Transcription response
@@ -9894,39 +9568,20 @@ var SonioxAdapter = class extends BaseAdapter {
9894
9568
  async getTranscript(transcriptId) {
9895
9569
  this.validateConfig();
9896
9570
  try {
9897
- const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
9898
- const job = statusResponse.data;
9899
- if (job.status === "error") {
9900
- return {
9901
- success: false,
9902
- provider: this.name,
9903
- error: {
9904
- code: "TRANSCRIPTION_ERROR",
9905
- message: job.error_message || "Transcription failed"
9906
- }
9907
- };
9908
- }
9909
- if (job.status !== "completed") {
9910
- return {
9911
- success: true,
9912
- provider: this.name,
9913
- data: {
9914
- id: job.id,
9915
- text: "",
9916
- status: job.status
9917
- },
9918
- raw: job
9919
- };
9571
+ const metaResp = await getTranscription(transcriptId, this.getAxiosConfig());
9572
+ const meta = metaResp.data;
9573
+ if (meta.status === TranscriptionStatus.completed) {
9574
+ try {
9575
+ const transcriptResp = await getTranscriptionTranscript(
9576
+ transcriptId,
9577
+ this.getAxiosConfig()
9578
+ );
9579
+ return this.normalizeTranscription(meta, transcriptResp.data);
9580
+ } catch (transcriptError) {
9581
+ return this.createErrorResponse(transcriptError);
9582
+ }
9920
9583
  }
9921
- const transcriptResponse = await this.client.get(
9922
- `/transcriptions/${transcriptId}/transcript`
9923
- );
9924
- return this.normalizeResponse({
9925
- ...transcriptResponse.data,
9926
- // Carry over job metadata
9927
- id: job.id,
9928
- audio_duration_ms: job.audio_duration_ms
9929
- });
9584
+ return this.normalizeTranscription(meta);
9930
9585
  } catch (error) {
9931
9586
  return this.createErrorResponse(error);
9932
9587
  }
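
Callers that pass a webhook get a `queued` result back immediately and can poll `getTranscript()` themselves. A hedged polling sketch against the normalized response shape used throughout this file:

```typescript
// Poll until a terminal status; interval and loose typing are illustrative.
async function waitForTranscript(adapter: any, id: string, intervalMs = 2000) {
  for (;;) {
    const res = await adapter.getTranscript(id);
    if (!res.success) return res;                    // error status or request failure
    if (res.data.status === "completed") return res; // transcript text populated
    await new Promise((r) => setTimeout(r, intervalMs)); // still queued/processing
  }
}
```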
@@ -9946,51 +9601,50 @@ var SonioxAdapter = class extends BaseAdapter {
9946
9601
  const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
9947
9602
  const createdAt = /* @__PURE__ */ new Date();
9948
9603
  const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
9949
- const wsUrl = `${wsBase}/transcribe-websocket`;
9950
- const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
9951
- const sonioxOpts = options?.sonioxStreaming;
9952
- const initMessage = {
9953
- api_key: this.config.apiKey,
9954
- model: modelId
9955
- };
9956
- if (sonioxOpts?.audioFormat) {
9957
- initMessage.audio_format = sonioxOpts.audioFormat;
9958
- } else if (options?.encoding) {
9604
+ const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
9605
+ wsUrl.searchParams.set("api_key", this.config.apiKey);
9606
+ const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
9607
+ wsUrl.searchParams.set("model", modelId);
9608
+ if (options?.encoding) {
9959
9609
  const encodingMap = {
9960
9610
  linear16: "pcm_s16le",
9961
9611
  pcm: "pcm_s16le",
9962
9612
  mulaw: "mulaw",
9963
9613
  alaw: "alaw"
9964
9614
  };
9965
- initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
9615
+ wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
9966
9616
  }
9967
- if (sonioxOpts?.sampleRate || options?.sampleRate) {
9968
- initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
9617
+ if (options?.sampleRate) {
9618
+ wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
9969
9619
  }
9970
- if (sonioxOpts?.numChannels || options?.channels) {
9971
- initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
9620
+ if (options?.channels) {
9621
+ wsUrl.searchParams.set("num_channels", options.channels.toString());
9972
9622
  }
9623
+ const sonioxOpts = options?.sonioxStreaming;
9973
9624
  if (sonioxOpts) {
9974
9625
  if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
9975
- initMessage.language_hints = sonioxOpts.languageHints;
9626
+ wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
9976
9627
  }
9977
9628
  if (sonioxOpts.enableLanguageIdentification) {
9978
- initMessage.enable_language_identification = true;
9629
+ wsUrl.searchParams.set("enable_language_identification", "true");
9979
9630
  }
9980
9631
  if (sonioxOpts.enableEndpointDetection) {
9981
- initMessage.enable_endpoint_detection = true;
9632
+ wsUrl.searchParams.set("enable_endpoint_detection", "true");
9982
9633
  }
9983
9634
  if (sonioxOpts.enableSpeakerDiarization) {
9984
- initMessage.enable_speaker_diarization = true;
9635
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
9985
9636
  }
9986
9637
  if (sonioxOpts.context) {
9987
- initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
9638
+ wsUrl.searchParams.set(
9639
+ "context",
9640
+ typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
9641
+ );
9988
9642
  }
9989
9643
  if (sonioxOpts.translation) {
9990
- initMessage.translation = sonioxOpts.translation;
9644
+ wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
9991
9645
  }
9992
9646
  if (sonioxOpts.clientReferenceId) {
9993
- initMessage.client_reference_id = sonioxOpts.clientReferenceId;
9647
+ wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
9994
9648
  }
9995
9649
  }
9996
9650
  if (!sonioxOpts?.languageHints && options?.language) {
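
Configuration moves from a post-open init message into the WebSocket URL itself. One review-worthy side effect: the API key now rides in the query string, where intermediate proxies and access logs can capture it. A sketch of the resulting URL (the host is a placeholder; the adapter resolves it via `getRegionalWsHost()`):

```typescript
// Sketch of the query-string configuration; host and key are placeholders.
const wsUrl = new URL("wss://stt-rt.soniox.com/transcribe-websocket"); // assumed host
wsUrl.searchParams.set("api_key", process.env.SONIOX_API_KEY ?? "");
wsUrl.searchParams.set("model", "stt-rt-preview");
wsUrl.searchParams.set("audio_format", "pcm_s16le");
wsUrl.searchParams.set("sample_rate", "16000");
wsUrl.searchParams.set("num_channels", "1");
// Arrays and objects are JSON-encoded into single params, per the hunk:
wsUrl.searchParams.set("language_hints", JSON.stringify(["en", "de"]));
// const ws = new WebSocket(wsUrl.toString());
```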
@@ -9999,33 +9653,24 @@ var SonioxAdapter = class extends BaseAdapter {
9999
9653
  `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
10000
9654
  );
10001
9655
  }
10002
- initMessage.language_hints = [options.language];
9656
+ wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
10003
9657
  }
10004
9658
  if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
10005
- initMessage.enable_speaker_diarization = true;
9659
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
10006
9660
  }
10007
9661
  if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
10008
- initMessage.enable_language_identification = true;
9662
+ wsUrl.searchParams.set("enable_language_identification", "true");
9663
+ }
9664
+ if (options?.interimResults !== false) {
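+ // no-op branch: nothing is sent for interim results (Soniox appears to emit partial tokens by default)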
10009
9665
  }
10010
9666
  let status = "connecting";
10011
9667
  let openedAt = null;
10012
9668
  let receivedData = false;
10013
9669
  const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
10014
- const ws = new WebSocketImpl(wsUrl);
9670
+ const ws = new WebSocketImpl(wsUrl.toString());
10015
9671
  ws.onopen = () => {
10016
- openedAt = Date.now();
10017
- const initPayload = JSON.stringify(initMessage);
10018
- if (callbacks?.onRawMessage) {
10019
- callbacks.onRawMessage({
10020
- provider: this.name,
10021
- direction: "outgoing",
10022
- timestamp: Date.now(),
10023
- payload: initPayload,
10024
- messageType: "init"
10025
- });
10026
- }
10027
- ws.send(initPayload);
10028
9672
  status = "open";
9673
+ openedAt = Date.now();
10029
9674
  callbacks?.onOpen?.();
10030
9675
  };
10031
9676
  ws.onmessage = (event) => {
@@ -10034,7 +9679,8 @@ var SonioxAdapter = class extends BaseAdapter {
10034
9679
  let messageType;
10035
9680
  try {
10036
9681
  const data = JSON.parse(rawPayload);
10037
- if (data.error) {
9682
+ const errorMessage = data.error_message || data.error;
9683
+ if (errorMessage) {
10038
9684
  messageType = "error";
10039
9685
  } else if (data.finished) {
10040
9686
  messageType = "finished";
@@ -10050,10 +9696,10 @@ var SonioxAdapter = class extends BaseAdapter {
10050
9696
  messageType
10051
9697
  });
10052
9698
  }
10053
- if (data.error) {
9699
+ if (errorMessage) {
10054
9700
  callbacks?.onError?.({
10055
9701
  code: data.error_code?.toString() || "STREAM_ERROR",
10056
- message: data.error
9702
+ message: errorMessage
10057
9703
  });
10058
9704
  return;
10059
9705
  }
@@ -10067,7 +9713,7 @@ var SonioxAdapter = class extends BaseAdapter {
10067
9713
  start: token.start_ms ? token.start_ms / 1e3 : 0,
10068
9714
  end: token.end_ms ? token.end_ms / 1e3 : 0,
10069
9715
  confidence: token.confidence,
10070
- speaker: token.speaker
9716
+ speaker: token.speaker ?? void 0
10071
9717
  }));
10072
9718
  const text = data.text || data.tokens.map((t) => t.text).join("");
10073
9719
  const isFinal = data.tokens.every((t) => t.is_final);
@@ -10076,8 +9722,8 @@ var SonioxAdapter = class extends BaseAdapter {
10076
9722
  text,
10077
9723
  isFinal,
10078
9724
  words,
10079
- speaker: data.tokens[0]?.speaker,
10080
- language: data.tokens[0]?.language,
9725
+ speaker: data.tokens[0]?.speaker ?? void 0,
9726
+ language: data.tokens[0]?.language ?? void 0,
10081
9727
  confidence: data.tokens[0]?.confidence
10082
9728
  };
10083
9729
  callbacks?.onTranscript?.(event2);
@@ -10104,10 +9750,10 @@ var SonioxAdapter = class extends BaseAdapter {
10104
9750
  ws.onclose = (event) => {
10105
9751
  status = "closed";
10106
9752
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
10107
- const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
10108
- if (isEarlyClose && event.code === 1e3) {
9753
+ const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
9754
+ if (isImmediateClose && event.code === 1e3) {
10109
9755
  const errorMessage = [
10110
- "Soniox closed connection shortly after opening.",
9756
+ "Soniox closed connection immediately after opening.",
10111
9757
  `Current config: region=${this.region}, model=${modelId}`,
10112
9758
  "Likely causes:",
10113
9759
  " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -10193,7 +9839,7 @@ var SonioxAdapter = class extends BaseAdapter {
10193
9839
  async getModels() {
10194
9840
  this.validateConfig();
10195
9841
  try {
10196
- const response = await this.client.get("/models");
9842
+ const response = await getModels(this.getAxiosConfig());
10197
9843
  return response.data.models || [];
10198
9844
  } catch (error) {
10199
9845
  console.error("Failed to fetch Soniox models:", error);
@@ -10225,11 +9871,44 @@ var SonioxAdapter = class extends BaseAdapter {
10225
9871
  return buildUtterancesFromWords(words);
10226
9872
  }
10227
9873
  /**
10228
- * Normalize Soniox response to unified format
9874
+ * Normalize v1 API response to unified format
9875
+ *
9876
+ * @param meta - Transcription metadata from getTranscription/createTranscription
9877
+ * @param transcript - Transcript data (text/tokens), only present when status is completed
10229
9878
  */
10230
- normalizeResponse(response) {
10231
- const { text, tokens } = response;
10232
- const words = tokens.map((token) => ({
9879
+ normalizeTranscription(meta, transcript) {
9880
+ if (meta.status === TranscriptionStatus.error) {
9881
+ return {
9882
+ success: false,
9883
+ provider: this.name,
9884
+ data: {
9885
+ id: meta.id,
9886
+ text: "",
9887
+ status: "error"
9888
+ },
9889
+ error: {
9890
+ code: meta.error_type || "TRANSCRIPTION_ERROR",
9891
+ message: meta.error_message || "Transcription failed"
9892
+ },
9893
+ raw: { meta, transcript }
9894
+ };
9895
+ }
9896
+ if (!transcript) {
9897
+ return {
9898
+ success: true,
9899
+ provider: this.name,
9900
+ data: {
9901
+ id: meta.id,
9902
+ text: "",
9903
+ status: meta.status,
9904
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0
9905
+ },
9906
+ raw: { meta }
9907
+ };
9908
+ }
9909
+ const tokens = transcript.tokens || [];
9910
+ const text = transcript.text || tokens.map((t) => t.text).join("");
9911
+ const words = tokens.filter((t) => t.start_ms !== void 0 && t.end_ms !== void 0).map((token) => ({
10233
9912
  word: token.text,
10234
9913
  start: token.start_ms / 1e3,
10235
9914
  end: token.end_ms / 1e3,
@@ -10237,33 +9916,32 @@ var SonioxAdapter = class extends BaseAdapter {
10237
9916
  speaker: token.speaker ?? void 0
10238
9917
  }));
10239
9918
  const speakerSet = /* @__PURE__ */ new Set();
10240
- for (const token of tokens) {
10241
- if (token.speaker) speakerSet.add(token.speaker);
10242
- }
9919
+ tokens.forEach((t) => {
9920
+ if (t.speaker) speakerSet.add(String(t.speaker));
9921
+ });
10243
9922
  const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
10244
9923
  id,
10245
9924
  label: `Speaker ${id}`
10246
9925
  })) : void 0;
10247
- const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
9926
+ const utterances = this.buildUtterancesFromTokens(tokens);
10248
9927
  const language = tokens.find((t) => t.language)?.language ?? void 0;
10249
9928
  return {
10250
9929
  success: true,
10251
9930
  provider: this.name,
10252
9931
  data: {
10253
- id: response.id || `soniox_${Date.now()}`,
9932
+ id: meta.id,
10254
9933
  text,
10255
9934
  status: TranscriptionStatus.completed,
10256
9935
  language,
10257
- duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
9936
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0,
10258
9937
  speakers,
10259
9938
  words: words.length > 0 ? words : void 0,
10260
9939
  utterances: utterances.length > 0 ? utterances : void 0
10261
9940
  },
10262
9941
  tracking: {
10263
- requestId: response.id,
10264
- processingTimeMs: response.total_audio_proc_ms
9942
+ requestId: meta.id
10265
9943
  },
10266
- raw: response
9944
+ raw: { meta, transcript }
10267
9945
  };
10268
9946
  }
10269
9947
  };
@@ -10419,29 +10097,11 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10419
10097
  }
10420
10098
  }
10421
10099
  }
10422
- if (options?.webhookUrl) {
10423
- if (!formData.has("webhook")) {
10424
- formData.append("webhook", "true");
10425
- }
10426
- }
10427
10100
  const response = await this.client.post("/v1/speech-to-text", formData, {
10428
10101
  headers: {
10429
10102
  "Content-Type": "multipart/form-data"
10430
10103
  }
10431
10104
  });
10432
- if (options?.webhookUrl) {
10433
- const transcriptionId = response.data.transcription_id || response.data.id || `elevenlabs_${Date.now()}`;
10434
- return {
10435
- success: true,
10436
- provider: this.name,
10437
- data: {
10438
- id: transcriptionId,
10439
- text: "",
10440
- status: "queued"
10441
- },
10442
- raw: response.data
10443
- };
10444
- }
10445
10105
  return this.normalizeResponse(response.data);
10446
10106
  } catch (error) {
10447
10107
  return this.createErrorResponse(error);
@@ -10755,7 +10415,7 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10755
10415
  }
10756
10416
  }
10757
10417
  }
10758
- const transcriptionId = ("transcription_id" in response ? response.transcription_id : response.transcription_id) || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
10418
+ const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
10759
10419
  return {
10760
10420
  success: true,
10761
10421
  provider: this.name,
@@ -36673,12 +36333,10 @@ var createTemporaryApiKeyBody = import_zod10.z.object({
36673
36333
  var streaming_types_zod_exports = {};
36674
36334
  __export(streaming_types_zod_exports, {
36675
36335
  sonioxAudioFormatSchema: () => sonioxAudioFormatSchema,
36676
- sonioxAutoDetectedAudioFormatSchema: () => sonioxAutoDetectedAudioFormatSchema,
36677
36336
  sonioxContextGeneralItemSchema: () => sonioxContextGeneralItemSchema,
36678
36337
  sonioxContextSchema: () => sonioxContextSchema,
36679
36338
  sonioxErrorStatusSchema: () => sonioxErrorStatusSchema,
36680
36339
  sonioxOneWayTranslationSchema: () => sonioxOneWayTranslationSchema,
36681
- sonioxPcmAudioEncodingSchema: () => sonioxPcmAudioEncodingSchema,
36682
36340
  sonioxRealtimeModelSchema: () => sonioxRealtimeModelSchema,
36683
36341
  sonioxRecorderStateSchema: () => sonioxRecorderStateSchema,
36684
36342
  sonioxStreamingResponseSchema: () => sonioxStreamingResponseSchema,
@@ -36692,7 +36350,7 @@ __export(streaming_types_zod_exports, {
36692
36350
  streamingUpdateConfigParams: () => streamingUpdateConfigParams3
36693
36351
  });
36694
36352
  var import_zod11 = require("zod");
36695
- var sonioxAutoDetectedAudioFormatSchema = import_zod11.z.enum([
36353
+ var sonioxAudioFormatSchema = import_zod11.z.enum([
36696
36354
  "auto",
36697
36355
  "aac",
36698
36356
  "aiff",
@@ -36702,10 +36360,7 @@ var sonioxAutoDetectedAudioFormatSchema = import_zod11.z.enum([
36702
36360
  "mp3",
36703
36361
  "ogg",
36704
36362
  "wav",
36705
- "webm"
36706
- ]);
36707
- var sonioxPcmAudioEncodingSchema = import_zod11.z.enum([
36708
- // Signed PCM
36363
+ "webm",
36709
36364
  "pcm_s8",
36710
36365
  "pcm_s16le",
36711
36366
  "pcm_s16be",
@@ -36713,7 +36368,6 @@ var sonioxPcmAudioEncodingSchema = import_zod11.z.enum([
36713
36368
  "pcm_s24be",
36714
36369
  "pcm_s32le",
36715
36370
  "pcm_s32be",
36716
- // Unsigned PCM
36717
36371
  "pcm_u8",
36718
36372
  "pcm_u16le",
36719
36373
  "pcm_u16be",
@@ -36721,86 +36375,81 @@ var sonioxPcmAudioEncodingSchema = import_zod11.z.enum([
36721
36375
  "pcm_u24be",
36722
36376
  "pcm_u32le",
36723
36377
  "pcm_u32be",
36724
- // Float PCM
36725
36378
  "pcm_f32le",
36726
36379
  "pcm_f32be",
36727
36380
  "pcm_f64le",
36728
36381
  "pcm_f64be",
36729
- // Companded
36730
36382
  "mulaw",
36731
36383
  "alaw"
36732
36384
  ]);
36733
- var sonioxAudioFormatSchema = import_zod11.z.union([
36734
- sonioxAutoDetectedAudioFormatSchema,
36735
- sonioxPcmAudioEncodingSchema
36736
- ]);
36737
36385
  var sonioxOneWayTranslationSchema = import_zod11.z.object({
36738
36386
  type: import_zod11.z.literal("one_way"),
36739
- target_language: import_zod11.z.string().describe("Target language code for translation")
36387
+ target_language: import_zod11.z.string()
36740
36388
  });
36741
36389
  var sonioxTwoWayTranslationSchema = import_zod11.z.object({
36742
36390
  type: import_zod11.z.literal("two_way"),
36743
- language_a: import_zod11.z.string().describe("First language for bidirectional translation"),
36744
- language_b: import_zod11.z.string().describe("Second language for bidirectional translation")
36391
+ language_a: import_zod11.z.string(),
36392
+ language_b: import_zod11.z.string()
36745
36393
  });
36746
36394
  var sonioxTranslationConfigSchema = import_zod11.z.union([
36747
36395
  sonioxOneWayTranslationSchema,
36748
36396
  sonioxTwoWayTranslationSchema
36749
36397
  ]);
36750
36398
  var sonioxContextGeneralItemSchema = import_zod11.z.object({
36751
- key: import_zod11.z.string().describe("Context item key (e.g. 'Domain')"),
36752
- value: import_zod11.z.string().describe("Context item value (e.g. 'medicine')")
36399
+ key: import_zod11.z.string(),
36400
+ value: import_zod11.z.string()
36753
36401
  });
36754
36402
  var sonioxTranslationTermSchema = import_zod11.z.object({
36755
- source: import_zod11.z.string().describe("Source term"),
36756
- target: import_zod11.z.string().describe("Target term to translate to")
36403
+ source: import_zod11.z.string(),
36404
+ target: import_zod11.z.string()
36757
36405
  });
36758
36406
  var sonioxStructuredContextSchema = import_zod11.z.object({
36759
- general: import_zod11.z.array(sonioxContextGeneralItemSchema).optional().describe("General context items (key-value pairs)"),
36760
- text: import_zod11.z.string().optional().describe("Text context"),
36761
- terms: import_zod11.z.array(import_zod11.z.string()).optional().describe("Terms that might occur in speech"),
36762
- translation_terms: import_zod11.z.array(sonioxTranslationTermSchema).optional().describe("Hints how to translate specific terms (ignored if translation is not enabled)")
36407
+ general: import_zod11.z.array(sonioxContextGeneralItemSchema).optional(),
36408
+ text: import_zod11.z.string().optional(),
36409
+ terms: import_zod11.z.array(import_zod11.z.string()).optional(),
36410
+ translation_terms: import_zod11.z.array(sonioxTranslationTermSchema).optional()
36763
36411
  });
36764
36412
  var sonioxContextSchema = import_zod11.z.union([sonioxStructuredContextSchema, import_zod11.z.string()]);
36765
36413
  var sonioxRealtimeModelSchema = import_zod11.z.enum([
36414
+ "stt-rt-v4",
36766
36415
  "stt-rt-v3",
36767
36416
  "stt-rt-preview",
36768
36417
  "stt-rt-v3-preview",
36769
36418
  "stt-rt-preview-v2"
36770
36419
  ]);
36771
36420
  var streamingTranscriberParams3 = import_zod11.z.object({
36772
- model: sonioxRealtimeModelSchema.describe("Real-time model to use"),
36773
- audioFormat: sonioxAudioFormatSchema.optional().describe("Audio format specification. Use 'auto' for automatic detection"),
36774
- sampleRate: import_zod11.z.number().optional().describe("Sample rate in Hz (required for raw PCM formats)"),
36775
- numChannels: import_zod11.z.number().min(1).max(2).optional().describe("Number of audio channels (1 for mono, 2 for stereo) - required for raw PCM formats"),
36776
- languageHints: import_zod11.z.array(import_zod11.z.string()).optional().describe("Expected languages in the audio (ISO language codes)"),
36777
- context: sonioxContextSchema.optional().describe("Additional context to improve transcription accuracy"),
36778
- enableSpeakerDiarization: import_zod11.z.boolean().optional().describe("Enable speaker diarization - each token will include a speaker field"),
36779
- enableLanguageIdentification: import_zod11.z.boolean().optional().describe("Enable language identification - each token will include a language field"),
36780
- enableEndpointDetection: import_zod11.z.boolean().optional().describe("Enable endpoint detection to detect when a speaker has finished talking"),
36781
- translation: sonioxTranslationConfigSchema.optional().describe("Translation configuration"),
36782
- clientReferenceId: import_zod11.z.string().optional().describe("Optional tracking identifier (client-defined)")
36783
- });
36784
- var sonioxTranslationStatusSchema = import_zod11.z.enum(["none", "original", "translation"]);
36421
+ model: sonioxRealtimeModelSchema,
36422
+ audioFormat: sonioxAudioFormatSchema.optional(),
36423
+ sampleRate: import_zod11.z.number().optional(),
36424
+ numChannels: import_zod11.z.number().optional(),
36425
+ languageHints: import_zod11.z.array(import_zod11.z.string()).optional(),
36426
+ context: sonioxContextSchema.optional(),
36427
+ enableSpeakerDiarization: import_zod11.z.boolean().optional(),
36428
+ enableLanguageIdentification: import_zod11.z.boolean().optional(),
36429
+ enableEndpointDetection: import_zod11.z.boolean().optional(),
36430
+ translation: sonioxTranslationConfigSchema.optional(),
36431
+ clientReferenceId: import_zod11.z.string().optional()
36432
+ });
36433
+ var sonioxTranslationStatusSchema = import_zod11.z.enum(["original", "translation", "none"]);
36785
36434
  var sonioxTokenSchema = import_zod11.z.object({
36786
- text: import_zod11.z.string().describe("Token text content (subword, word, or space)"),
36787
- start_ms: import_zod11.z.number().optional().describe("Start time of the token in milliseconds"),
36788
- end_ms: import_zod11.z.number().optional().describe("End time of the token in milliseconds"),
36789
- confidence: import_zod11.z.number().min(0).max(1).optional().describe("Confidence score between 0.0 and 1.0"),
36790
- is_final: import_zod11.z.boolean().describe("Whether this token is final (confirmed) or provisional"),
36791
- speaker: import_zod11.z.string().optional().describe("Speaker identifier (only present when speaker diarization is enabled)"),
36792
- language: import_zod11.z.string().optional().describe("Detected language code (only present when language identification is enabled)"),
36793
- source_language: import_zod11.z.string().optional().describe("Original language code for translated tokens"),
36794
- translation_status: sonioxTranslationStatusSchema.optional().describe("Translation status: 'none', 'original', or 'translation'")
36435
+ text: import_zod11.z.string(),
36436
+ start_ms: import_zod11.z.number().optional(),
36437
+ end_ms: import_zod11.z.number().optional(),
36438
+ confidence: import_zod11.z.number(),
36439
+ is_final: import_zod11.z.boolean(),
36440
+ speaker: import_zod11.z.string().optional(),
36441
+ translation_status: sonioxTranslationStatusSchema.optional(),
36442
+ language: import_zod11.z.string().optional(),
36443
+ source_language: import_zod11.z.string().optional()
36795
36444
  });
36796
36445
  var sonioxStreamingResponseSchema = import_zod11.z.object({
36797
- text: import_zod11.z.string().optional().describe("Complete transcribed text"),
36798
- tokens: import_zod11.z.array(sonioxTokenSchema).describe("List of recognized tokens"),
36799
- final_audio_proc_ms: import_zod11.z.number().optional().describe("Milliseconds of audio processed into final tokens"),
36800
- total_audio_proc_ms: import_zod11.z.number().optional().describe("Milliseconds of audio processed (final + non-final)"),
36801
- finished: import_zod11.z.boolean().optional().describe("Whether the transcription is complete"),
36802
- error: import_zod11.z.string().optional().describe("Error message if an error occurred"),
36803
- error_code: import_zod11.z.number().optional().describe("Error code if an error occurred")
36446
+ text: import_zod11.z.string(),
36447
+ tokens: import_zod11.z.array(sonioxTokenSchema),
36448
+ final_audio_proc_ms: import_zod11.z.number(),
36449
+ total_audio_proc_ms: import_zod11.z.number(),
36450
+ finished: import_zod11.z.boolean().optional(),
36451
+ error_code: import_zod11.z.number().optional(),
36452
+ error_message: import_zod11.z.string().optional()
36804
36453
  });
36805
36454
  var sonioxRecorderStateSchema = import_zod11.z.enum([
36806
36455
  "Init",
@@ -37366,8 +37015,8 @@ var BatchOnlyProviders = AllProviders.filter(
37366
37015
  );
37367
37016
 
37368
37017
  // src/generated/deepgram/schema/index.ts
37369
- var schema_exports4 = {};
37370
- __export(schema_exports4, {
37018
+ var schema_exports5 = {};
37019
+ __export(schema_exports5, {
37371
37020
  V1ListenPostParametersCallbackMethod: () => V1ListenPostParametersCallbackMethod,
37372
37021
  V1ListenPostParametersCustomIntentMode: () => V1ListenPostParametersCustomIntentMode,
37373
37022
  V1ListenPostParametersCustomTopicMode: () => V1ListenPostParametersCustomTopicMode,
@@ -37622,8 +37271,8 @@ var V1SpeakPostParametersSampleRate = {
37622
37271
  };
37623
37272
 
37624
37273
  // src/generated/openai/schema/index.ts
37625
- var schema_exports5 = {};
37626
- __export(schema_exports5, {
37274
+ var schema_exports6 = {};
37275
+ __export(schema_exports6, {
37627
37276
  AudioResponseFormat: () => AudioResponseFormat,
37628
37277
  CreateSpeechRequestResponseFormat: () => CreateSpeechRequestResponseFormat,
37629
37278
  CreateSpeechRequestStreamFormat: () => CreateSpeechRequestStreamFormat,
@@ -37963,8 +37612,8 @@ var VoiceResourceObject = {
37963
37612
  };
37964
37613
 
37965
37614
  // src/generated/speechmatics/schema/index.ts
37966
- var schema_exports6 = {};
37967
- __export(schema_exports6, {
37615
+ var schema_exports7 = {};
37616
+ __export(schema_exports7, {
37968
37617
  AutoChaptersResultErrorType: () => AutoChaptersResultErrorType,
37969
37618
  ErrorResponseError: () => ErrorResponseError,
37970
37619
  GetJobsJobidAlignmentTags: () => GetJobsJobidAlignmentTags,
@@ -38153,32 +37802,6 @@ var WrittenFormRecognitionResultType = {
38153
37802
  word: "word"
38154
37803
  };
38155
37804
 
38156
- // src/generated/soniox/schema/index.ts
38157
- var schema_exports7 = {};
38158
- __export(schema_exports7, {
38159
- TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
38160
- TranscriptionMode: () => TranscriptionMode,
38161
- TranscriptionStatus: () => TranscriptionStatus,
38162
- TranslationConfigType: () => TranslationConfigType
38163
- });
38164
-
38165
- // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
38166
- var TemporaryApiKeyUsageType = {
38167
- transcribe_websocket: "transcribe_websocket"
38168
- };
38169
-
38170
- // src/generated/soniox/schema/transcriptionMode.ts
38171
- var TranscriptionMode = {
38172
- real_time: "real_time",
38173
- async: "async"
38174
- };
38175
-
38176
- // src/generated/soniox/schema/translationConfigType.ts
38177
- var TranslationConfigType = {
38178
- one_way: "one_way",
38179
- two_way: "two_way"
38180
- };
38181
-
38182
37805
  // src/generated/elevenlabs/schema/index.ts
38183
37806
  var schema_exports8 = {};
38184
37807
  __export(schema_exports8, {