voice-router-dev 0.9.0 → 0.9.1

This diff shows the content of publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
package/dist/index.mjs CHANGED
@@ -5835,23 +5835,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
  "AssemblyAI adapter currently only supports URL-based audio input. Use audio.type='url'"
  );
  }
- const aaiOpts = { ...options?.assemblyai };
- if ("speech_model" in aaiOpts && aaiOpts.speech_model != null) {
- if (!aaiOpts.speech_models) {
- aaiOpts.speech_models = [aaiOpts.speech_model];
- }
- delete aaiOpts.speech_model;
+ const passthrough = options?.assemblyai;
+ let speechModels;
+ if (passthrough?.speech_model != null && !passthrough.speech_models) {
+ speechModels = [passthrough.speech_model];
+ } else if (passthrough?.speech_models) {
+ speechModels = passthrough.speech_models;
  }
+ const { speech_model: _deprecated, ...typedOpts } = passthrough ?? {};
  const request = {
- ...aaiOpts,
+ ...typedOpts,
  audio_url: audioUrl,
  // speech_models is required — default to universal-3-pro
- speech_models: aaiOpts.speech_models ?? [
- "universal-3-pro"
- ],
+ speech_models: speechModels ?? ["universal-3-pro"],
  // Enable punctuation and formatting by default
- punctuate: aaiOpts.punctuate ?? true,
- format_text: aaiOpts.format_text ?? true
+ punctuate: typedOpts.punctuate ?? true,
+ format_text: typedOpts.format_text ?? true
  };
  if (options) {
  if (options.model) {
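This hunk replaces in-place mutation of the passthrough options with a typed split: the deprecated singular `speech_model` is folded into the required `speech_models` array, then dropped via destructuring. A minimal standalone sketch of that normalization, using a simplified stand-in for the package's option type:

```ts
// Sketch of the speech_model -> speech_models normalization above.
// AssemblyAIOptions here is a simplified stand-in, not the package's type.
interface AssemblyAIOptions {
  speech_model?: string | null; // deprecated singular form
  speech_models?: string[];
  punctuate?: boolean;
  format_text?: boolean;
}

function normalizeSpeechModels(passthrough?: AssemblyAIOptions) {
  let speechModels: string[] | undefined;
  if (passthrough?.speech_model != null && !passthrough.speech_models) {
    // Legacy callers passed one model; wrap it into the required array form.
    speechModels = [passthrough.speech_model];
  } else if (passthrough?.speech_models) {
    speechModels = passthrough.speech_models;
  }
  // Drop the deprecated key so it never reaches the request body.
  const { speech_model: _deprecated, ...typedOpts } = passthrough ?? ({} as AssemblyAIOptions);
  return { ...typedOpts, speech_models: speechModels ?? ["universal-3-pro"] };
}

// Deprecated and current call styles converge on the same request shape:
console.log(normalizeSpeechModels({ speech_model: "universal-3-pro" }));
console.log(normalizeSpeechModels({ speech_models: ["universal-3-pro"] }));
```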
@@ -5899,22 +5898,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
  normalizeResponse(response) {
  let status;
  switch (response.status) {
- case TranscriptStatus.queued:
+ case "queued":
  status = "queued";
  break;
- case TranscriptStatus.processing:
+ case "processing":
  status = "processing";
  break;
- case TranscriptStatus.completed:
+ case "completed":
  status = "completed";
  break;
- case TranscriptStatus.error:
+ case "error":
  status = "error";
  break;
  default:
  status = "queued";
  }
- if (response.status === TranscriptStatus.error) {
+ if (response.status === "error") {
  return {
  success: false,
  provider: this.name,
@@ -6566,8 +6565,10 @@ var DeepgramAdapter = class extends BaseAdapter {
  /**
  * Submit audio for transcription
  *
- * Sends audio to Deepgram API for transcription. Deepgram processes
- * synchronously and returns results immediately (no polling required).
+ * Sends audio to Deepgram API for transcription. Deepgram normally processes
+ * synchronously and returns results immediately. When `webhookUrl` is set,
+ * Deepgram can instead return an async callback acknowledgment containing a
+ * request ID.
  *
  * @param audio - Audio input (URL or file buffer)
  * @param options - Transcription options
@@ -6618,17 +6619,59 @@ var DeepgramAdapter = class extends BaseAdapter {
  { params }
  ).then((res) => res.data);
  } else if (audio.type === "file") {
- response = await this.client.post("/listen", audio.file, {
- params,
- headers: {
- "Content-Type": "audio/*"
+ response = await this.client.post(
+ "/listen",
+ audio.file,
+ {
+ params,
+ headers: {
+ "Content-Type": "audio/*"
+ }
  }
- }).then((res) => res.data);
+ ).then((res) => res.data);
  } else {
  throw new Error(
  "Deepgram adapter does not support stream type for pre-recorded transcription. Use transcribeStream() for real-time streaming."
  );
  }
+ if (options?.webhookUrl) {
+ const requestId = ("request_id" in response ? response.request_id : void 0) || ("metadata" in response ? response.metadata?.request_id : void 0);
+ if (!requestId) {
+ return {
+ success: false,
+ provider: this.name,
+ error: {
+ code: "MISSING_REQUEST_ID",
+ message: "Deepgram callback mode did not return a request ID"
+ },
+ raw: response
+ };
+ }
+ return {
+ success: true,
+ provider: this.name,
+ data: {
+ id: requestId,
+ text: "",
+ status: "queued"
+ },
+ tracking: {
+ requestId
+ },
+ raw: response
+ };
+ }
+ if (!("results" in response) || !("metadata" in response)) {
+ return {
+ success: false,
+ provider: this.name,
+ error: {
+ code: "INVALID_RESPONSE",
+ message: "Deepgram did not return a synchronous transcription payload"
+ },
+ raw: response
+ };
+ }
  return this.normalizeResponse(response);
  } catch (error) {
  return this.createErrorResponse(error);
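With this change, `transcribe()` has two success shapes: a synchronous transcript, or a queued acknowledgment when `webhookUrl` is set. A hedged sketch of consuming both, with a simplified stand-in for the package's result type:

```ts
// Sketch of consuming the two response shapes transcribe() can now return.
// TranscriptionResult is a simplified stand-in for the package's result
// type; `adapter` is assumed to be an initialized DeepgramAdapter.
interface TranscriptionResult {
  success: boolean;
  data?: { id: string; text: string; status: string };
  tracking?: { requestId: string };
  error?: { code: string; message: string };
}

async function run(adapter: {
  transcribe(
    audio: { type: "url"; url: string },
    options?: { webhookUrl?: string }
  ): Promise<TranscriptionResult>;
}) {
  // With webhookUrl set, Deepgram acknowledges with a request ID and the
  // transcript arrives later at the callback URL.
  const queued = await adapter.transcribe(
    { type: "url", url: "https://example.com/audio.wav" },
    { webhookUrl: "https://example.com/deepgram-callback" }
  );
  if (queued.success && queued.data?.status === "queued") {
    console.log("await webhook for:", queued.tracking?.requestId);
  }

  // Without webhookUrl, the same call resolves with the full transcript.
  const sync = await adapter.transcribe({
    type: "url",
    url: "https://example.com/audio.wav",
  });
  if (sync.success) console.log(sync.data?.text);
}
```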
@@ -7289,7 +7332,8 @@ var DeepgramAdapter = class extends BaseAdapter {
  break;
  }
  case "Metadata": {
- callbacks?.onMetadata?.(message);
+ const { type: _, ...metadata } = message;
+ callbacks?.onMetadata?.(metadata);
  break;
  }
  case "Error": {
@@ -7725,10 +7769,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
  contentUrls: [audio.url],
  properties: this.buildTranscriptionProperties(options)
  };
- const response = await transcriptionsCreate(
- transcriptionRequest,
- this.getAxiosConfig()
- );
+ const response = await transcriptionsCreate(transcriptionRequest, this.getAxiosConfig());
  const transcription = response.data;
  const transcriptId = transcription.self?.split("/").pop() || "";
  return await this.pollForCompletion(transcriptId);
@@ -8268,7 +8309,6 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
  const request = {
  ...options?.openai,
  file: audioData,
- // Buffer/Blob both accepted at runtime; generated type expects Blob
  model
  };
  if (options?.language) {
@@ -8288,11 +8328,7 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
  request.response_format = OpenAIResponseFormat.json;
  }
  const response = await createTranscription(request, this.getAxiosConfig());
- return this.normalizeResponse(
- response.data,
- model,
- isDiarization
- );
+ return this.normalizeResponse(response.data, model, isDiarization);
  } catch (error) {
  return this.createErrorResponse(error);
  }
@@ -8699,7 +8735,6 @@ function createOpenAIWhisperAdapter(config) {

  // src/adapters/speechmatics-adapter.ts
  import axios8 from "axios";
- import WebSocket6 from "ws";

  // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
  var NotificationConfigContentsItem = {
@@ -8749,7 +8784,8 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  super(...arguments);
  this.name = "speechmatics";
  this.capabilities = {
- streaming: true,
+ streaming: false,
+ // Batch only (streaming available via separate WebSocket API)
  diarization: true,
  wordTimestamps: true,
  languageDetection: false,
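Because Speechmatics now reports `streaming: false`, callers that route by capability will skip it for real-time work. A small sketch of that gating, with a minimal stand-in adapter shape:

```ts
// Sketch of gating on the capabilities flag before attempting streaming.
// The adapter shape is a minimal stand-in for the package's BaseAdapter.
interface Adapter {
  name: string;
  capabilities: { streaming: boolean; diarization: boolean };
}

function pickStreamingAdapter(adapters: Adapter[]): Adapter {
  const candidate = adapters.find((a) => a.capabilities.streaming);
  if (!candidate) {
    throw new Error("No configured adapter supports streaming");
  }
  return candidate; // speechmatics now reports streaming: false, so it is skipped
}
```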
@@ -8884,16 +8920,13 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  jobConfig.fetch_data = {
  url: audio.url
  };
- const formData = new FormData();
- formData.append("config", JSON.stringify(jobConfig));
- requestBody = formData;
- headers = { "Content-Type": "multipart/form-data" };
+ requestBody = { config: JSON.stringify(jobConfig) };
+ headers = { "Content-Type": "application/json" };
  } else if (audio.type === "file") {
- const formData = new FormData();
- formData.append("config", JSON.stringify(jobConfig));
- const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
- formData.append("data_file", audioBlob, audio.filename || "audio.wav");
- requestBody = formData;
+ requestBody = {
+ config: JSON.stringify(jobConfig),
+ data_file: audio.file
+ };
  headers = { "Content-Type": "multipart/form-data" };
  } else {
  return {
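The URL branch now sends the job config as a JSON body, while the file branch hands axios a plain object and appears to rely on axios v1's automatic multipart serialization when the Content-Type header requests it (an assumption about the intent, not confirmed by the diff). The two body shapes, sketched as standalone values:

```ts
// The two request-body shapes built above, as standalone values. For file
// input, axios v1 can serialize a plain object into multipart/form-data
// when the Content-Type header asks for it; for URL input the stringified
// config travels inside a JSON body.
const jobConfig = {
  type: "transcription",
  transcription_config: { language: "en" },
  fetch_data: { url: "https://example.com/audio.wav" },
};

// URL-based job: JSON body
const urlBody = { config: JSON.stringify(jobConfig) };
const urlHeaders = { "Content-Type": "application/json" };

// File-based job: multipart body (axios serializes the object per the header)
const fileBody = { config: JSON.stringify(jobConfig), data_file: new Blob(["…"]) };
const fileHeaders = { "Content-Type": "multipart/form-data" };

console.log(urlBody, urlHeaders, fileBody, fileHeaders);
```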
@@ -8998,389 +9031,6 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  throw error;
  }
  }
- /**
- * Build WebSocket URL for real-time streaming
- *
- * Note: Real-time API uses a different host from the batch API:
- * - Batch: {region}.asr.api.speechmatics.com
- * - Real-time: {region}.rt.speechmatics.com
- *
- * @param region - Regional endpoint identifier
- * @returns WebSocket URL for real-time API
- */
- getRegionalWsUrl(region) {
- if (this.config?.wsBaseUrl) {
- return this.config.wsBaseUrl;
- }
- const rtRegionMap = {
- eu1: "eu",
- eu2: "eu",
- us1: "us",
- us2: "us",
- au1: "eu"
- // No AU RT endpoint — fall back to EU
- };
- const rtPrefix = rtRegionMap[region || ""] || "eu";
- return `wss://${rtPrefix}.rt.speechmatics.com/v2`;
- }
- /**
- * Stream audio for real-time transcription via WebSocket
- *
- * Connects to Speechmatics' real-time API and sends audio chunks
- * for transcription with results returned via callbacks.
- *
- * @param options - Streaming configuration options
- * @param callbacks - Event callbacks for transcription results
- * @returns Promise that resolves with a StreamingSession
- *
- * @example Basic streaming
- * ```typescript
- * const session = await adapter.transcribeStream({
- * language: 'en',
- * speechmaticsStreaming: {
- * enablePartials: true,
- * operatingPoint: 'enhanced'
- * }
- * }, {
- * onTranscript: (event) => console.log(event.text),
- * onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
- * onError: (error) => console.error(error)
- * });
- *
- * await session.sendAudio({ data: audioBuffer });
- * await session.close();
- * ```
- */
- async transcribeStream(options, callbacks) {
- this.validateConfig();
- const smOpts = options?.speechmaticsStreaming || {};
- const region = smOpts.region || this.config?.region;
- const wsUrl = this.getRegionalWsUrl(region);
- const ws = new WebSocket6(wsUrl, {
- headers: {
- Authorization: `Bearer ${this.config.apiKey}`
- }
- });
- let sessionStatus = "connecting";
- const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
- let seqNo = 0;
- let utteranceResults = [];
- const sessionReady = new Promise((resolve, reject) => {
- const timeout = setTimeout(() => {
- reject(new Error("WebSocket connection timeout"));
- }, 1e4);
- let wsOpen = false;
- ws.once("error", (error) => {
- clearTimeout(timeout);
- reject(error);
- });
- ws.once("open", () => {
- wsOpen = true;
- const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
- const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
- const startMsg = {
- message: "StartRecognition",
- audio_format: {
- type: "raw",
- encoding,
- sample_rate: sampleRate
- },
- transcription_config: {
- language: smOpts.language || options?.language || "en",
- enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
- }
- };
- const txConfig = startMsg.transcription_config;
- if (smOpts.domain) txConfig.domain = smOpts.domain;
- if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
- if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
- if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
- if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
- if (smOpts.diarization === "speaker" || options?.diarization) {
- txConfig.diarization = "speaker";
- if (smOpts.maxSpeakers) {
- txConfig.speaker_diarization_config = {
- max_speakers: smOpts.maxSpeakers
- };
- } else if (options?.speakersExpected) {
- txConfig.speaker_diarization_config = {
- max_speakers: options.speakersExpected
- };
- }
- }
- if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
- txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
- content: word
- }));
- } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
- txConfig.additional_vocab = options.customVocabulary.map((word) => ({
- content: word
- }));
- }
- if (smOpts.conversationConfig) {
- txConfig.conversation_config = {
- end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
- };
- }
- const startPayload = JSON.stringify(startMsg);
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: "speechmatics",
- direction: "outgoing",
- timestamp: Date.now(),
- payload: startPayload,
- messageType: "StartRecognition"
- });
- }
- ws.send(startPayload);
- });
- const onMessage = (data) => {
- const rawPayload = data.toString();
- try {
- const msg = JSON.parse(rawPayload);
- if (msg.message === "RecognitionStarted") {
- clearTimeout(timeout);
- ws.removeListener("message", onMessage);
- ws.emit("message", data);
- resolve();
- } else if (msg.message === "Error") {
- clearTimeout(timeout);
- ws.removeListener("message", onMessage);
- reject(new Error(msg.reason || "Recognition failed to start"));
- }
- } catch {
- }
- };
- ws.on("message", onMessage);
- });
- ws.on("message", (data) => {
- const rawPayload = data.toString();
- try {
- const message = JSON.parse(rawPayload);
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: "speechmatics",
- direction: "incoming",
- timestamp: Date.now(),
- payload: rawPayload,
- messageType: message.message
- });
- }
- this.handleStreamingMessage(message, callbacks, utteranceResults);
- } catch (error) {
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: "speechmatics",
- direction: "incoming",
- timestamp: Date.now(),
- payload: rawPayload,
- messageType: "parse_error"
- });
- }
- callbacks?.onError?.({
- code: "PARSE_ERROR",
- message: "Failed to parse WebSocket message",
- details: error
- });
- }
- });
- ws.on("error", (error) => {
- callbacks?.onError?.({
- code: "WEBSOCKET_ERROR",
- message: error.message,
- details: error
- });
- });
- ws.on("close", (code, reason) => {
- sessionStatus = "closed";
- callbacks?.onClose?.(code, reason.toString());
- });
- await sessionReady;
- sessionStatus = "open";
- callbacks?.onOpen?.();
- return {
- id: sessionId,
- provider: this.name,
- createdAt: /* @__PURE__ */ new Date(),
- getStatus: () => sessionStatus,
- sendAudio: async (chunk) => {
- if (sessionStatus !== "open") {
- throw new Error(`Cannot send audio: session is ${sessionStatus}`);
- }
- if (ws.readyState !== WebSocket6.OPEN) {
- throw new Error("WebSocket is not open");
- }
- if (callbacks?.onRawMessage) {
- const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
- chunk.data.byteOffset,
- chunk.data.byteOffset + chunk.data.byteLength
- );
- callbacks.onRawMessage({
- provider: this.name,
- direction: "outgoing",
- timestamp: Date.now(),
- payload: audioPayload,
- messageType: "audio"
- });
- }
- ws.send(chunk.data);
- seqNo++;
- if (chunk.isLast) {
- const endMsg = JSON.stringify({
- message: "EndOfStream",
- last_seq_no: seqNo
- });
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: this.name,
- direction: "outgoing",
- timestamp: Date.now(),
- payload: endMsg,
- messageType: "EndOfStream"
- });
- }
- ws.send(endMsg);
- }
- },
- close: async () => {
- if (sessionStatus === "closed" || sessionStatus === "closing") {
- return;
- }
- sessionStatus = "closing";
- if (ws.readyState === WebSocket6.OPEN) {
- seqNo++;
- ws.send(
- JSON.stringify({
- message: "EndOfStream",
- last_seq_no: seqNo
- })
- );
- }
- return new Promise((resolve) => {
- const timeout = setTimeout(() => {
- ws.terminate();
- sessionStatus = "closed";
- resolve();
- }, 5e3);
- const onMsg = (data) => {
- try {
- const msg = JSON.parse(data.toString());
- if (msg.message === "EndOfTranscript") {
- ws.removeListener("message", onMsg);
- clearTimeout(timeout);
- ws.close();
- }
- } catch {
- }
- };
- ws.on("message", onMsg);
- ws.once("close", () => {
- clearTimeout(timeout);
- sessionStatus = "closed";
- resolve();
- });
- });
- }
- };
- }
- /**
- * Handle incoming Speechmatics real-time WebSocket messages
- */
- handleStreamingMessage(message, callbacks, utteranceResults) {
- switch (message.message) {
- case "RecognitionStarted": {
- break;
- }
- case "AddPartialTranscript": {
- const results = message.results || [];
- const text = buildTextFromSpeechmaticsResults(results);
- if (text) {
- callbacks?.onTranscript?.({
- type: "transcript",
- text,
- isFinal: false,
- words: this.extractWordsFromResults(results),
- data: message
- });
- }
- break;
- }
- case "AddTranscript": {
- const results = message.results || [];
- const text = buildTextFromSpeechmaticsResults(results);
- if (utteranceResults) {
- utteranceResults.push(...results);
- }
- if (text) {
- callbacks?.onTranscript?.({
- type: "transcript",
- text,
- isFinal: true,
- words: this.extractWordsFromResults(results),
- data: message
- });
- }
- break;
- }
- case "EndOfUtterance": {
- if (utteranceResults && utteranceResults.length > 0) {
- const text = buildTextFromSpeechmaticsResults(utteranceResults);
- const words = this.extractWordsFromResults(utteranceResults);
- const utterances = buildUtterancesFromWords(words);
- if (utterances.length > 0) {
- for (const utt of utterances) {
- callbacks?.onUtterance?.(utt);
- }
- } else if (text) {
- callbacks?.onUtterance?.({
- text,
- start: words.length > 0 ? words[0].start : 0,
- end: words.length > 0 ? words[words.length - 1].end : 0,
- words
- });
- }
- utteranceResults.length = 0;
- }
- break;
- }
- case "AudioAdded": {
- break;
- }
- case "EndOfTranscript": {
- break;
- }
- case "Info":
- case "Warning": {
- callbacks?.onMetadata?.(message);
- break;
- }
- case "Error": {
- const errMsg = message;
- callbacks?.onError?.({
- code: errMsg.type || "SPEECHMATICS_ERROR",
- message: errMsg.reason || "Unknown error",
- details: message
- });
- break;
- }
- default: {
- callbacks?.onMetadata?.(message);
- break;
- }
- }
- }
- /**
- * Extract unified Word[] from Speechmatics recognition results
- */
- extractWordsFromResults(results) {
- return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
- word: result.alternatives?.[0]?.content || "",
- start: result.start_time,
- end: result.end_time,
- confidence: result.alternatives?.[0]?.confidence,
- speaker: result.alternatives?.[0]?.speaker
- }));
- }
  /**
  * Normalize Speechmatics status to unified status
  * Uses generated JobDetailsStatus enum values
@@ -9449,9 +9099,6 @@ function createSpeechmaticsAdapter(config) {
  return adapter;
  }

- // src/adapters/soniox-adapter.ts
- import axios9 from "axios";
-
  // src/generated/soniox/schema/transcriptionStatus.ts
  var TranscriptionStatus = {
  queued: "queued",
@@ -9460,6 +9107,57 @@ var TranscriptionStatus = {
  error: "error"
  };

+ // src/generated/soniox/api/sonioxPublicAPI.ts
+ import axios9 from "axios";
+
+ // src/generated/soniox/schema/index.ts
+ var schema_exports4 = {};
+ __export(schema_exports4, {
+ TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
+ TranscriptionMode: () => TranscriptionMode,
+ TranscriptionStatus: () => TranscriptionStatus,
+ TranslationConfigType: () => TranslationConfigType
+ });
+
+ // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
+ var TemporaryApiKeyUsageType = {
+ transcribe_websocket: "transcribe_websocket"
+ };
+
+ // src/generated/soniox/schema/transcriptionMode.ts
+ var TranscriptionMode = {
+ real_time: "real_time",
+ async: "async"
+ };
+
+ // src/generated/soniox/schema/translationConfigType.ts
+ var TranslationConfigType = {
+ one_way: "one_way",
+ two_way: "two_way"
+ };
+
+ // src/generated/soniox/api/sonioxPublicAPI.ts
+ var uploadFile = (uploadFileBody2, options) => {
+ const formData = new FormData();
+ if (uploadFileBody2.client_reference_id !== void 0 && uploadFileBody2.client_reference_id !== null) {
+ formData.append("client_reference_id", uploadFileBody2.client_reference_id);
+ }
+ formData.append("file", uploadFileBody2.file);
+ return axios9.post("/v1/files", formData, options);
+ };
+ var createTranscription2 = (createTranscriptionPayload, options) => {
+ return axios9.post("/v1/transcriptions", createTranscriptionPayload, options);
+ };
+ var getTranscription = (transcriptionId, options) => {
+ return axios9.get(`/v1/transcriptions/${transcriptionId}`, options);
+ };
+ var getTranscriptionTranscript = (transcriptionId, options) => {
+ return axios9.get(`/v1/transcriptions/${transcriptionId}/transcript`, options);
+ };
+ var getModels = (options) => {
+ return axios9.get("/v1/models", options);
+ };
+
  // src/adapters/soniox-adapter.ts
  var SonioxAdapter = class extends BaseAdapter {
  constructor() {
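These generated functions compose into the upload-then-create flow the adapter uses below. A hedged sketch of the same flow with plain axios (the generated functions are internal to the bundle); the base URL, auth header, and model name are illustrative assumptions:

```ts
// Mirrors the generated client calls with plain axios. The host, Bearer
// header, and "stt-async-preview" model are assumptions for illustration.
import axios from "axios";
import { readFileSync } from "node:fs";

const cfg = {
  baseURL: "https://api.soniox.com", // assumed host; the adapter derives it per region
  headers: { Authorization: `Bearer ${process.env.SONIOX_API_KEY}` },
};

async function createFromFile(path: string) {
  // Upload first (POST /v1/files), then reference the returned file ID.
  const form = new FormData();
  form.append("file", new Blob([readFileSync(path)], { type: "audio/wav" }), "audio.wav");
  const { data: uploaded } = await axios.post("/v1/files", form, cfg);
  const { data: meta } = await axios.post(
    "/v1/transcriptions",
    { model: "stt-async-preview", file_id: uploaded.id },
    cfg
  );
  return meta.id as string; // job starts in status "queued"
}
```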
@@ -9514,11 +9212,17 @@ var SonioxAdapter = class extends BaseAdapter {
  }
  }
  /**
- * Get the base URL for API requests
+ * Get the base URL for API requests (no /v1 suffix — generated functions include /v1 in paths)
  */
  get baseUrl() {
  if (this.config?.baseUrl) return this.config.baseUrl;
- return `https://${this.getRegionalHost()}/v1`;
+ return `https://${this.getRegionalHost()}`;
+ }
+ /**
+ * Build axios config with Soniox Bearer auth
+ */
+ getAxiosConfig() {
+ return super.getAxiosConfig("Authorization", (key) => `Bearer ${key}`);
  }
  initialize(config) {
  super.initialize(config);
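The `/v1` suffix moves out of `baseUrl` because the generated functions above embed it in their request paths. A quick illustration of why keeping both would double the prefix (hosts are illustrative):

```ts
// With generated paths already starting with "/v1", a baseURL that also
// ends in "/v1" would double the prefix under axios-style URL joining.
const hostOnly = "https://api.soniox.com";  // new baseUrl (host is illustrative)
const withV1 = "https://api.soniox.com/v1"; // old baseUrl
const path = "/v1/models";                  // path carried by the generated function

console.log(new URL(path, hostOnly).href); // https://api.soniox.com/v1/models (correct)
console.log(withV1 + path);                // .../v1/v1/models (doubled prefix)
```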
@@ -9528,15 +9232,6 @@ var SonioxAdapter = class extends BaseAdapter {
  if (config.model) {
  this.defaultModel = config.model;
  }
- this.client = axios9.create({
- baseURL: this.baseUrl,
- timeout: config.timeout || 12e4,
- headers: {
- Authorization: `Bearer ${config.apiKey}`,
- "Content-Type": "application/json",
- ...config.headers
- }
- });
  }
  /**
  * Get current region
@@ -9566,23 +9261,12 @@ var SonioxAdapter = class extends BaseAdapter {
  */
  setRegion(region) {
  this.region = region;
- if (this.config?.apiKey) {
- this.client = axios9.create({
- baseURL: this.baseUrl,
- timeout: this.config.timeout || 12e4,
- headers: {
- Authorization: `Bearer ${this.config.apiKey}`,
- "Content-Type": "application/json",
- ...this.config.headers
- }
- });
- }
  }
  /**
  * Submit audio for transcription
  *
- * Soniox uses async batch processing. The transcribe method submits audio
- * and waits for completion (or use getTranscript for polling).
+ * Uses the async v1 API: createTranscription returns status `queued`,
+ * then polls until completed (or returns immediately if webhook is set).
  *
  * @param audio - Audio input (URL or file)
  * @param options - Transcription options
@@ -9591,21 +9275,44 @@ var SonioxAdapter = class extends BaseAdapter {
  async transcribe(audio, options) {
  this.validateConfig();
  try {
- const requestBody = {
- model: options?.model || this.defaultModel
- };
- if (audio.type === "url") {
- requestBody.audio_url = audio.url;
- } else if (audio.type === "file") {
- const formData = new FormData();
+ const sonioxOpts = options?.soniox;
+ if (audio.type === "file") {
  const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
- formData.append("file", audioBlob, audio.filename || "audio.wav");
- const uploadResponse = await this.client.post("/files", formData, {
- headers: {
- "Content-Type": "multipart/form-data"
- }
- });
- requestBody.file_id = uploadResponse.data.id;
+ const uploadBody = { file: audioBlob };
+ const fileResp = await uploadFile(uploadBody, this.getAxiosConfig());
+ const payload = {
+ ...sonioxOpts,
+ model: options?.model || this.defaultModel,
+ file_id: fileResp.data.id,
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
+ };
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
+ const meta = createResp.data;
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
+ return this.normalizeTranscription(meta);
+ }
+ return this.pollForCompletion(meta.id);
+ } else if (audio.type === "url") {
+ const payload = {
+ ...sonioxOpts,
+ model: options?.model || this.defaultModel,
+ audio_url: audio.url,
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
+ };
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
+ const meta = createResp.data;
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
+ return this.normalizeTranscription(meta);
+ }
+ return this.pollForCompletion(meta.id);
  } else {
  return {
  success: false,
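Both branches map the unified options onto the same Soniox payload fields. A usage sketch showing that mapping from the caller's side; the adapter and result shapes are minimal stand-ins:

```ts
// How the unified options land in the payload built above; the adapter
// instance and result type are assumed from the surrounding package.
async function submit(adapter: {
  transcribe(audio: object, options?: object): Promise<{ success: boolean; data?: { id: string; status: string } }>;
}) {
  const result = await adapter.transcribe(
    { type: "url", url: "https://example.com/call.mp3" },
    {
      language: "en",                // -> language_hints: ["en"]
      diarization: true,             // -> enable_speaker_diarization: true
      customVocabulary: ["Soniox"],  // -> context: { terms: ["Soniox"] }
      webhookUrl: "https://example.com/hook", // returns immediately with status "queued"
    }
  );
  if (result.success) console.log(result.data?.id, result.data?.status);
}
```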
@@ -9616,38 +9323,6 @@ var SonioxAdapter = class extends BaseAdapter {
  }
  };
  }
- if (options?.language) {
- requestBody.language_hints = [options.language];
- }
- if (options?.diarization) {
- requestBody.enable_speaker_diarization = true;
- }
- if (options?.languageDetection) {
- requestBody.enable_language_identification = true;
- }
- if (options?.customVocabulary && options.customVocabulary.length > 0) {
- requestBody.context = {
- terms: options.customVocabulary
- };
- }
- if (options?.webhookUrl) {
- requestBody.webhook_url = options.webhookUrl;
- }
- const response = await this.client.post("/transcriptions", requestBody);
- const transcriptionId = response.data.id;
- if (options?.webhookUrl) {
- return {
- success: true,
- provider: this.name,
- data: {
- id: transcriptionId,
- text: "",
- status: "queued"
- },
- raw: response.data
- };
- }
- return await this.pollForCompletion(transcriptionId);
  } catch (error) {
  return this.createErrorResponse(error);
  }
@@ -9655,9 +9330,8 @@ var SonioxAdapter = class extends BaseAdapter {
  /**
  * Get transcription result by ID
  *
- * Checks job status via GET /v1/transcriptions/{id}, then fetches
- * the full transcript via GET /v1/transcriptions/{id}/transcript
- * when completed.
+ * Fetches transcription metadata and, if completed, the transcript text/tokens.
+ * Used by pollForCompletion() for async polling.
  *
  * @param transcriptId - Transcript ID
  * @returns Transcription response
@@ -9665,39 +9339,20 @@ var SonioxAdapter = class extends BaseAdapter {
  async getTranscript(transcriptId) {
  this.validateConfig();
  try {
- const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
- const job = statusResponse.data;
- if (job.status === "error") {
- return {
- success: false,
- provider: this.name,
- error: {
- code: "TRANSCRIPTION_ERROR",
- message: job.error_message || "Transcription failed"
- }
- };
- }
- if (job.status !== "completed") {
- return {
- success: true,
- provider: this.name,
- data: {
- id: job.id,
- text: "",
- status: job.status
- },
- raw: job
- };
+ const metaResp = await getTranscription(transcriptId, this.getAxiosConfig());
+ const meta = metaResp.data;
+ if (meta.status === TranscriptionStatus.completed) {
+ try {
+ const transcriptResp = await getTranscriptionTranscript(
+ transcriptId,
+ this.getAxiosConfig()
+ );
+ return this.normalizeTranscription(meta, transcriptResp.data);
+ } catch (transcriptError) {
+ return this.createErrorResponse(transcriptError);
+ }
  }
- const transcriptResponse = await this.client.get(
- `/transcriptions/${transcriptId}/transcript`
- );
- return this.normalizeResponse({
- ...transcriptResponse.data,
- // Carry over job metadata
- id: job.id,
- audio_duration_ms: job.audio_duration_ms
- });
+ return this.normalizeTranscription(meta);
  } catch (error) {
  return this.createErrorResponse(error);
  }
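`getTranscript()` now returns a normalized result whose `data.status` reflects the job state, which makes caller-side polling straightforward. A hedged sketch against a minimal stand-in for the adapter:

```ts
// Caller-side polling over getTranscript(); the status strings follow the
// unified result shape used throughout this diff.
interface UnifiedResult {
  success: boolean;
  data?: { text: string; status: string };
  error?: { message: string };
}

async function waitForTranscript(
  adapter: { getTranscript(id: string): Promise<UnifiedResult> },
  id: string
): Promise<string> {
  for (;;) {
    const res = await adapter.getTranscript(id);
    if (!res.success) throw new Error(res.error?.message ?? "transcription failed");
    if (res.data?.status === "completed") return res.data.text;
    await new Promise((r) => setTimeout(r, 2000)); // still queued/processing
  }
}
```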
@@ -9717,51 +9372,50 @@ var SonioxAdapter = class extends BaseAdapter {
  const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
  const createdAt = /* @__PURE__ */ new Date();
  const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
- const wsUrl = `${wsBase}/transcribe-websocket`;
- const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
- const sonioxOpts = options?.sonioxStreaming;
- const initMessage = {
- api_key: this.config.apiKey,
- model: modelId
- };
- if (sonioxOpts?.audioFormat) {
- initMessage.audio_format = sonioxOpts.audioFormat;
- } else if (options?.encoding) {
+ const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
+ wsUrl.searchParams.set("api_key", this.config.apiKey);
+ const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
+ wsUrl.searchParams.set("model", modelId);
+ if (options?.encoding) {
  const encodingMap = {
  linear16: "pcm_s16le",
  pcm: "pcm_s16le",
  mulaw: "mulaw",
  alaw: "alaw"
  };
- initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
+ wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
  }
- if (sonioxOpts?.sampleRate || options?.sampleRate) {
- initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
+ if (options?.sampleRate) {
+ wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
  }
- if (sonioxOpts?.numChannels || options?.channels) {
- initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
+ if (options?.channels) {
+ wsUrl.searchParams.set("num_channels", options.channels.toString());
  }
+ const sonioxOpts = options?.sonioxStreaming;
  if (sonioxOpts) {
  if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
- initMessage.language_hints = sonioxOpts.languageHints;
+ wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
  }
  if (sonioxOpts.enableLanguageIdentification) {
- initMessage.enable_language_identification = true;
+ wsUrl.searchParams.set("enable_language_identification", "true");
  }
  if (sonioxOpts.enableEndpointDetection) {
- initMessage.enable_endpoint_detection = true;
+ wsUrl.searchParams.set("enable_endpoint_detection", "true");
  }
  if (sonioxOpts.enableSpeakerDiarization) {
- initMessage.enable_speaker_diarization = true;
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
  }
  if (sonioxOpts.context) {
- initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
+ wsUrl.searchParams.set(
+ "context",
+ typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
+ );
  }
  if (sonioxOpts.translation) {
- initMessage.translation = sonioxOpts.translation;
+ wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
  }
  if (sonioxOpts.clientReferenceId) {
- initMessage.client_reference_id = sonioxOpts.clientReferenceId;
+ wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
  }
  }
  if (!sonioxOpts?.languageHints && options?.language) {
@@ -9770,33 +9424,24 @@ var SonioxAdapter = class extends BaseAdapter {
  `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
  );
  }
- initMessage.language_hints = [options.language];
+ wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
  }
  if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
- initMessage.enable_speaker_diarization = true;
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
  }
  if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
- initMessage.enable_language_identification = true;
+ wsUrl.searchParams.set("enable_language_identification", "true");
+ }
+ if (options?.interimResults !== false) {
  }
  let status = "connecting";
  let openedAt = null;
  let receivedData = false;
  const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
- const ws = new WebSocketImpl(wsUrl);
+ const ws = new WebSocketImpl(wsUrl.toString());
  ws.onopen = () => {
- openedAt = Date.now();
- const initPayload = JSON.stringify(initMessage);
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: this.name,
- direction: "outgoing",
- timestamp: Date.now(),
- payload: initPayload,
- messageType: "init"
- });
- }
- ws.send(initPayload);
  status = "open";
+ openedAt = Date.now();
  callbacks?.onOpen?.();
  };
  ws.onmessage = (event) => {
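The JSON init message is gone: every session option now travels as a query parameter on the WebSocket URL, with array and object values JSON-encoded into single parameters. A standalone sketch of the handshake URL (host and model are illustrative):

```ts
// All session config now rides on the URL; nothing is sent before audio.
const wsUrl = new URL("wss://stt-rt.soniox.com/transcribe-websocket"); // illustrative host
wsUrl.searchParams.set("api_key", process.env.SONIOX_API_KEY ?? "");
wsUrl.searchParams.set("model", "stt-rt-preview");
wsUrl.searchParams.set("audio_format", "pcm_s16le");
wsUrl.searchParams.set("sample_rate", "16000");
wsUrl.searchParams.set("num_channels", "1");
// Array- and object-valued options are JSON-encoded into one parameter each:
wsUrl.searchParams.set("language_hints", JSON.stringify(["en", "de"]));
wsUrl.searchParams.set("enable_speaker_diarization", "true");
console.log(wsUrl.toString());
```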
@@ -9805,7 +9450,8 @@ var SonioxAdapter = class extends BaseAdapter {
  let messageType;
  try {
  const data = JSON.parse(rawPayload);
- if (data.error) {
+ const errorMessage = data.error_message || data.error;
+ if (errorMessage) {
  messageType = "error";
  } else if (data.finished) {
  messageType = "finished";
@@ -9821,10 +9467,10 @@ var SonioxAdapter = class extends BaseAdapter {
  messageType
  });
  }
- if (data.error) {
+ if (errorMessage) {
  callbacks?.onError?.({
  code: data.error_code?.toString() || "STREAM_ERROR",
- message: data.error
+ message: errorMessage
  });
  return;
  }
@@ -9838,7 +9484,7 @@ var SonioxAdapter = class extends BaseAdapter {
  start: token.start_ms ? token.start_ms / 1e3 : 0,
  end: token.end_ms ? token.end_ms / 1e3 : 0,
  confidence: token.confidence,
- speaker: token.speaker
+ speaker: token.speaker ?? void 0
  }));
  const text = data.text || data.tokens.map((t) => t.text).join("");
  const isFinal = data.tokens.every((t) => t.is_final);
@@ -9847,8 +9493,8 @@ var SonioxAdapter = class extends BaseAdapter {
  text,
  isFinal,
  words,
- speaker: data.tokens[0]?.speaker,
- language: data.tokens[0]?.language,
+ speaker: data.tokens[0]?.speaker ?? void 0,
+ language: data.tokens[0]?.language ?? void 0,
  confidence: data.tokens[0]?.confidence
  };
  callbacks?.onTranscript?.(event2);
@@ -9875,10 +9521,10 @@ var SonioxAdapter = class extends BaseAdapter {
  ws.onclose = (event) => {
  status = "closed";
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
- const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
- if (isEarlyClose && event.code === 1e3) {
+ const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
+ if (isImmediateClose && event.code === 1e3) {
  const errorMessage = [
- "Soniox closed connection shortly after opening.",
+ "Soniox closed connection immediately after opening.",
  `Current config: region=${this.region}, model=${modelId}`,
  "Likely causes:",
  " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -9964,7 +9610,7 @@ var SonioxAdapter = class extends BaseAdapter {
  async getModels() {
  this.validateConfig();
  try {
- const response = await this.client.get("/models");
+ const response = await getModels(this.getAxiosConfig());
  return response.data.models || [];
  } catch (error) {
  console.error("Failed to fetch Soniox models:", error);
@@ -9996,11 +9642,44 @@ var SonioxAdapter = class extends BaseAdapter {
  return buildUtterancesFromWords(words);
  }
  /**
- * Normalize Soniox response to unified format
+ * Normalize v1 API response to unified format
+ *
+ * @param meta - Transcription metadata from getTranscription/createTranscription
+ * @param transcript - Transcript data (text/tokens), only present when status is completed
  */
- normalizeResponse(response) {
- const { text, tokens } = response;
- const words = tokens.map((token) => ({
+ normalizeTranscription(meta, transcript) {
+ if (meta.status === TranscriptionStatus.error) {
+ return {
+ success: false,
+ provider: this.name,
+ data: {
+ id: meta.id,
+ text: "",
+ status: "error"
+ },
+ error: {
+ code: meta.error_type || "TRANSCRIPTION_ERROR",
+ message: meta.error_message || "Transcription failed"
+ },
+ raw: { meta, transcript }
+ };
+ }
+ if (!transcript) {
+ return {
+ success: true,
+ provider: this.name,
+ data: {
+ id: meta.id,
+ text: "",
+ status: meta.status,
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0
+ },
+ raw: { meta }
+ };
+ }
+ const tokens = transcript.tokens || [];
+ const text = transcript.text || tokens.map((t) => t.text).join("");
+ const words = tokens.filter((t) => t.start_ms !== void 0 && t.end_ms !== void 0).map((token) => ({
  word: token.text,
  start: token.start_ms / 1e3,
  end: token.end_ms / 1e3,
@@ -10008,33 +9687,32 @@ var SonioxAdapter = class extends BaseAdapter {
  speaker: token.speaker ?? void 0
  }));
  const speakerSet = /* @__PURE__ */ new Set();
- for (const token of tokens) {
- if (token.speaker) speakerSet.add(token.speaker);
- }
+ tokens.forEach((t) => {
+ if (t.speaker) speakerSet.add(String(t.speaker));
+ });
  const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
  id,
  label: `Speaker ${id}`
  })) : void 0;
- const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
+ const utterances = this.buildUtterancesFromTokens(tokens);
  const language = tokens.find((t) => t.language)?.language ?? void 0;
  return {
  success: true,
  provider: this.name,
  data: {
- id: response.id || `soniox_${Date.now()}`,
+ id: meta.id,
  text,
  status: TranscriptionStatus.completed,
  language,
- duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0,
  speakers,
  words: words.length > 0 ? words : void 0,
  utterances: utterances.length > 0 ? utterances : void 0
  },
  tracking: {
- requestId: response.id,
- processingTimeMs: response.total_audio_proc_ms
+ requestId: meta.id
  },
- raw: response
+ raw: { meta, transcript }
  };
  }
  };
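`normalizeTranscription()` produces three distinct shapes depending on job state: error, pending (meta only), and completed (meta plus transcript). Sketched as illustrative data, with field names taken from the diff and values invented:

```ts
// The three result shapes, as illustrative values (not real API output).
const pending = {
  success: true,
  provider: "soniox",
  data: { id: "tx_123", text: "", status: "processing", duration: undefined },
  raw: { meta: { id: "tx_123", status: "processing" } },
};
const failed = {
  success: false,
  provider: "soniox",
  data: { id: "tx_123", text: "", status: "error" },
  error: { code: "TRANSCRIPTION_ERROR", message: "Transcription failed" },
};
const completed = {
  success: true,
  provider: "soniox",
  data: { id: "tx_123", text: "hello world", status: "completed", duration: 2.5 },
  tracking: { requestId: "tx_123" },
};
for (const r of [pending, failed, completed]) console.log(r.success, r.data.status);
```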
@@ -10190,29 +9868,11 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  }
  }
  }
- if (options?.webhookUrl) {
- if (!formData.has("webhook")) {
- formData.append("webhook", "true");
- }
- }
  const response = await this.client.post("/v1/speech-to-text", formData, {
  headers: {
  "Content-Type": "multipart/form-data"
  }
  });
- if (options?.webhookUrl) {
- const transcriptionId = response.data.transcription_id || response.data.id || `elevenlabs_${Date.now()}`;
- return {
- success: true,
- provider: this.name,
- data: {
- id: transcriptionId,
- text: "",
- status: "queued"
- },
- raw: response.data
- };
- }
  return this.normalizeResponse(response.data);
  } catch (error) {
  return this.createErrorResponse(error);
@@ -10526,7 +10186,7 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  }
  }
  }
- const transcriptionId = ("transcription_id" in response ? response.transcription_id : response.transcription_id) || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
+ const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
  return {
  success: true,
  provider: this.name,
@@ -36444,12 +36104,10 @@ var createTemporaryApiKeyBody = zod10.object({
  var streaming_types_zod_exports = {};
  __export(streaming_types_zod_exports, {
  sonioxAudioFormatSchema: () => sonioxAudioFormatSchema,
- sonioxAutoDetectedAudioFormatSchema: () => sonioxAutoDetectedAudioFormatSchema,
  sonioxContextGeneralItemSchema: () => sonioxContextGeneralItemSchema,
  sonioxContextSchema: () => sonioxContextSchema,
  sonioxErrorStatusSchema: () => sonioxErrorStatusSchema,
  sonioxOneWayTranslationSchema: () => sonioxOneWayTranslationSchema,
- sonioxPcmAudioEncodingSchema: () => sonioxPcmAudioEncodingSchema,
  sonioxRealtimeModelSchema: () => sonioxRealtimeModelSchema,
  sonioxRecorderStateSchema: () => sonioxRecorderStateSchema,
  sonioxStreamingResponseSchema: () => sonioxStreamingResponseSchema,
@@ -36463,7 +36121,7 @@ __export(streaming_types_zod_exports, {
  streamingUpdateConfigParams: () => streamingUpdateConfigParams3
  });
  import { z as zod11 } from "zod";
- var sonioxAutoDetectedAudioFormatSchema = zod11.enum([
+ var sonioxAudioFormatSchema = zod11.enum([
  "auto",
  "aac",
  "aiff",
@@ -36473,10 +36131,7 @@ var sonioxAutoDetectedAudioFormatSchema = zod11.enum([
  "mp3",
  "ogg",
  "wav",
- "webm"
- ]);
- var sonioxPcmAudioEncodingSchema = zod11.enum([
- // Signed PCM
+ "webm",
  "pcm_s8",
  "pcm_s16le",
  "pcm_s16be",
@@ -36484,7 +36139,6 @@ var sonioxPcmAudioEncodingSchema = zod11.enum([
  "pcm_s24be",
  "pcm_s32le",
  "pcm_s32be",
- // Unsigned PCM
  "pcm_u8",
  "pcm_u16le",
  "pcm_u16be",
@@ -36492,86 +36146,81 @@ var sonioxPcmAudioEncodingSchema = zod11.enum([
  "pcm_u24be",
  "pcm_u32le",
  "pcm_u32be",
- // Float PCM
  "pcm_f32le",
  "pcm_f32be",
  "pcm_f64le",
  "pcm_f64be",
- // Companded
  "mulaw",
  "alaw"
  ]);
- var sonioxAudioFormatSchema = zod11.union([
- sonioxAutoDetectedAudioFormatSchema,
- sonioxPcmAudioEncodingSchema
- ]);
  var sonioxOneWayTranslationSchema = zod11.object({
  type: zod11.literal("one_way"),
- target_language: zod11.string().describe("Target language code for translation")
+ target_language: zod11.string()
  });
  var sonioxTwoWayTranslationSchema = zod11.object({
  type: zod11.literal("two_way"),
- language_a: zod11.string().describe("First language for bidirectional translation"),
- language_b: zod11.string().describe("Second language for bidirectional translation")
+ language_a: zod11.string(),
+ language_b: zod11.string()
  });
  var sonioxTranslationConfigSchema = zod11.union([
  sonioxOneWayTranslationSchema,
  sonioxTwoWayTranslationSchema
  ]);
  var sonioxContextGeneralItemSchema = zod11.object({
- key: zod11.string().describe("Context item key (e.g. 'Domain')"),
- value: zod11.string().describe("Context item value (e.g. 'medicine')")
+ key: zod11.string(),
+ value: zod11.string()
  });
  var sonioxTranslationTermSchema = zod11.object({
- source: zod11.string().describe("Source term"),
- target: zod11.string().describe("Target term to translate to")
+ source: zod11.string(),
+ target: zod11.string()
  });
  var sonioxStructuredContextSchema = zod11.object({
- general: zod11.array(sonioxContextGeneralItemSchema).optional().describe("General context items (key-value pairs)"),
- text: zod11.string().optional().describe("Text context"),
- terms: zod11.array(zod11.string()).optional().describe("Terms that might occur in speech"),
- translation_terms: zod11.array(sonioxTranslationTermSchema).optional().describe("Hints how to translate specific terms (ignored if translation is not enabled)")
+ general: zod11.array(sonioxContextGeneralItemSchema).optional(),
+ text: zod11.string().optional(),
+ terms: zod11.array(zod11.string()).optional(),
+ translation_terms: zod11.array(sonioxTranslationTermSchema).optional()
  });
  var sonioxContextSchema = zod11.union([sonioxStructuredContextSchema, zod11.string()]);
  var sonioxRealtimeModelSchema = zod11.enum([
+ "stt-rt-v4",
  "stt-rt-v3",
  "stt-rt-preview",
  "stt-rt-v3-preview",
  "stt-rt-preview-v2"
  ]);
  var streamingTranscriberParams3 = zod11.object({
- model: sonioxRealtimeModelSchema.describe("Real-time model to use"),
- audioFormat: sonioxAudioFormatSchema.optional().describe("Audio format specification. Use 'auto' for automatic detection"),
- sampleRate: zod11.number().optional().describe("Sample rate in Hz (required for raw PCM formats)"),
- numChannels: zod11.number().min(1).max(2).optional().describe("Number of audio channels (1 for mono, 2 for stereo) - required for raw PCM formats"),
- languageHints: zod11.array(zod11.string()).optional().describe("Expected languages in the audio (ISO language codes)"),
- context: sonioxContextSchema.optional().describe("Additional context to improve transcription accuracy"),
- enableSpeakerDiarization: zod11.boolean().optional().describe("Enable speaker diarization - each token will include a speaker field"),
- enableLanguageIdentification: zod11.boolean().optional().describe("Enable language identification - each token will include a language field"),
- enableEndpointDetection: zod11.boolean().optional().describe("Enable endpoint detection to detect when a speaker has finished talking"),
- translation: sonioxTranslationConfigSchema.optional().describe("Translation configuration"),
- clientReferenceId: zod11.string().optional().describe("Optional tracking identifier (client-defined)")
- });
- var sonioxTranslationStatusSchema = zod11.enum(["none", "original", "translation"]);
+ model: sonioxRealtimeModelSchema,
+ audioFormat: sonioxAudioFormatSchema.optional(),
+ sampleRate: zod11.number().optional(),
+ numChannels: zod11.number().optional(),
+ languageHints: zod11.array(zod11.string()).optional(),
+ context: sonioxContextSchema.optional(),
+ enableSpeakerDiarization: zod11.boolean().optional(),
+ enableLanguageIdentification: zod11.boolean().optional(),
+ enableEndpointDetection: zod11.boolean().optional(),
+ translation: sonioxTranslationConfigSchema.optional(),
+ clientReferenceId: zod11.string().optional()
+ });
+ var sonioxTranslationStatusSchema = zod11.enum(["original", "translation", "none"]);
  var sonioxTokenSchema = zod11.object({
- text: zod11.string().describe("Token text content (subword, word, or space)"),
- start_ms: zod11.number().optional().describe("Start time of the token in milliseconds"),
- end_ms: zod11.number().optional().describe("End time of the token in milliseconds"),
- confidence: zod11.number().min(0).max(1).optional().describe("Confidence score between 0.0 and 1.0"),
- is_final: zod11.boolean().describe("Whether this token is final (confirmed) or provisional"),
- speaker: zod11.string().optional().describe("Speaker identifier (only present when speaker diarization is enabled)"),
- language: zod11.string().optional().describe("Detected language code (only present when language identification is enabled)"),
- source_language: zod11.string().optional().describe("Original language code for translated tokens"),
- translation_status: sonioxTranslationStatusSchema.optional().describe("Translation status: 'none', 'original', or 'translation'")
+ text: zod11.string(),
+ start_ms: zod11.number().optional(),
+ end_ms: zod11.number().optional(),
+ confidence: zod11.number(),
+ is_final: zod11.boolean(),
+ speaker: zod11.string().optional(),
+ translation_status: sonioxTranslationStatusSchema.optional(),
+ language: zod11.string().optional(),
+ source_language: zod11.string().optional()
  });
  var sonioxStreamingResponseSchema = zod11.object({
- text: zod11.string().optional().describe("Complete transcribed text"),
- tokens: zod11.array(sonioxTokenSchema).describe("List of recognized tokens"),
- final_audio_proc_ms: zod11.number().optional().describe("Milliseconds of audio processed into final tokens"),
- total_audio_proc_ms: zod11.number().optional().describe("Milliseconds of audio processed (final + non-final)"),
- finished: zod11.boolean().optional().describe("Whether the transcription is complete"),
- error: zod11.string().optional().describe("Error message if an error occurred"),
- error_code: zod11.number().optional().describe("Error code if an error occurred")
+ text: zod11.string(),
+ tokens: zod11.array(sonioxTokenSchema),
+ final_audio_proc_ms: zod11.number(),
+ total_audio_proc_ms: zod11.number(),
+ finished: zod11.boolean().optional(),
+ error_code: zod11.number().optional(),
+ error_message: zod11.string().optional()
  });
  var sonioxRecorderStateSchema = zod11.enum([
  "Init",
@@ -37137,8 +36786,8 @@ var BatchOnlyProviders = AllProviders.filter(
  );

  // src/generated/deepgram/schema/index.ts
- var schema_exports4 = {};
- __export(schema_exports4, {
+ var schema_exports5 = {};
+ __export(schema_exports5, {
  V1ListenPostParametersCallbackMethod: () => V1ListenPostParametersCallbackMethod,
  V1ListenPostParametersCustomIntentMode: () => V1ListenPostParametersCustomIntentMode,
  V1ListenPostParametersCustomTopicMode: () => V1ListenPostParametersCustomTopicMode,
@@ -37393,8 +37042,8 @@ var V1SpeakPostParametersSampleRate = {
  };

  // src/generated/openai/schema/index.ts
- var schema_exports5 = {};
- __export(schema_exports5, {
+ var schema_exports6 = {};
+ __export(schema_exports6, {
  AudioResponseFormat: () => AudioResponseFormat,
  CreateSpeechRequestResponseFormat: () => CreateSpeechRequestResponseFormat,
  CreateSpeechRequestStreamFormat: () => CreateSpeechRequestStreamFormat,
@@ -37734,8 +37383,8 @@ var VoiceResourceObject = {
  };

  // src/generated/speechmatics/schema/index.ts
- var schema_exports6 = {};
- __export(schema_exports6, {
+ var schema_exports7 = {};
+ __export(schema_exports7, {
  AutoChaptersResultErrorType: () => AutoChaptersResultErrorType,
  ErrorResponseError: () => ErrorResponseError,
  GetJobsJobidAlignmentTags: () => GetJobsJobidAlignmentTags,
@@ -37924,32 +37573,6 @@ var WrittenFormRecognitionResultType = {
  word: "word"
  };

- // src/generated/soniox/schema/index.ts
- var schema_exports7 = {};
- __export(schema_exports7, {
- TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
- TranscriptionMode: () => TranscriptionMode,
- TranscriptionStatus: () => TranscriptionStatus,
- TranslationConfigType: () => TranslationConfigType
- });
-
- // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
- var TemporaryApiKeyUsageType = {
- transcribe_websocket: "transcribe_websocket"
- };
-
- // src/generated/soniox/schema/transcriptionMode.ts
- var TranscriptionMode = {
- real_time: "real_time",
- async: "async"
- };
-
- // src/generated/soniox/schema/translationConfigType.ts
- var TranslationConfigType = {
- one_way: "one_way",
- two_way: "two_way"
- };
-
  // src/generated/elevenlabs/schema/index.ts
  var schema_exports8 = {};
  __export(schema_exports8, {
@@ -39653,7 +39276,7 @@ export {
  DeepgramTTSSampleRate,
  DeepgramTopicMode,
  DeepgramTranscriptionSchema,
- schema_exports4 as DeepgramTypes,
+ schema_exports5 as DeepgramTypes,
  deepgramAPI_zod_exports as DeepgramZodSchemas,
  ElevenLabsAdapter,
  ElevenLabsCapabilities,
@@ -39690,7 +39313,7 @@ export {
  OpenAIResponseFormat,
  streaming_types_exports as OpenAIStreamingTypes,
  OpenAITranscriptionSchema,
- schema_exports5 as OpenAITypes,
+ schema_exports6 as OpenAITypes,
  OpenAIWhisperAdapter,
  openAIAudioRealtimeAPI_zod_exports as OpenAIZodSchemas,
  ProfanityFilterMode,
@@ -39719,7 +39342,7 @@ export {
  SonioxStreamingUpdateSchema,
  streaming_types_zod_exports as SonioxStreamingZodSchemas,
  SonioxTranscriptionSchema,
- schema_exports7 as SonioxTypes,
+ schema_exports4 as SonioxTypes,
  SpeakV1ContainerParameter,
  SpeakV1EncodingParameter,
  SpeakV1SampleRateParameter,
@@ -39734,7 +39357,7 @@ export {
  SpeechmaticsStreamingSchema,
  SpeechmaticsStreamingUpdateSchema,
  SpeechmaticsTranscriptionSchema,
- schema_exports6 as SpeechmaticsTypes,
+ schema_exports7 as SpeechmaticsTypes,
  speechmaticsASRRESTAPI_zod_exports as SpeechmaticsZodSchemas,
  StreamingProviders,
  StreamingSupportedBitDepthEnum,