voice-router-dev 0.9.0 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -5835,23 +5835,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
5835
5835
  "AssemblyAI adapter currently only supports URL-based audio input. Use audio.type='url'"
5836
5836
  );
5837
5837
  }
5838
- const aaiOpts = { ...options?.assemblyai };
5839
- if ("speech_model" in aaiOpts && aaiOpts.speech_model != null) {
5840
- if (!aaiOpts.speech_models) {
5841
- aaiOpts.speech_models = [aaiOpts.speech_model];
5842
- }
5843
- delete aaiOpts.speech_model;
5838
+ const passthrough = options?.assemblyai;
5839
+ let speechModels;
5840
+ if (passthrough?.speech_model != null && !passthrough.speech_models) {
5841
+ speechModels = [passthrough.speech_model];
5842
+ } else if (passthrough?.speech_models) {
5843
+ speechModels = passthrough.speech_models;
5844
5844
  }
5845
+ const { speech_model: _deprecated, ...typedOpts } = passthrough ?? {};
5845
5846
  const request = {
5846
- ...aaiOpts,
5847
+ ...typedOpts,
5847
5848
  audio_url: audioUrl,
5848
5849
  // speech_models is required — default to universal-3-pro
5849
- speech_models: aaiOpts.speech_models ?? [
5850
- "universal-3-pro"
5851
- ],
5850
+ speech_models: speechModels ?? ["universal-3-pro"],
5852
5851
  // Enable punctuation and formatting by default
5853
- punctuate: aaiOpts.punctuate ?? true,
5854
- format_text: aaiOpts.format_text ?? true
5852
+ punctuate: typedOpts.punctuate ?? true,
5853
+ format_text: typedOpts.format_text ?? true
5855
5854
  };
5856
5855
  if (options) {
5857
5856
  if (options.model) {
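The new code above folds the deprecated singular `speech_model` passthrough into the `speech_models` array before the request is assembled, so callers on either spelling produce the same payload. A minimal sketch of both call shapes, assuming `AssemblyAIAdapter` is exported from the package entry point and initialized with an API key like the other adapters in this bundle:

```typescript
import { AssemblyAIAdapter } from "voice-router-dev"; // assumed export name

const adapter = new AssemblyAIAdapter();
adapter.initialize({ apiKey: process.env.ASSEMBLYAI_API_KEY! });

// Note: this adapter currently only accepts URL audio (see the error text above).

// Old shape: deprecated singular field, rewritten internally into speech_models.
await adapter.transcribe(
  { type: "url", url: "https://example.com/audio.wav" },
  { assemblyai: { speech_model: "universal-3-pro" } }
);

// New shape: pass the array directly (omit it to get the universal-3-pro default).
await adapter.transcribe(
  { type: "url", url: "https://example.com/audio.wav" },
  { assemblyai: { speech_models: ["universal-3-pro"] } }
);
```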
@@ -5899,22 +5898,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
5899
5898
  normalizeResponse(response) {
5900
5899
  let status;
5901
5900
  switch (response.status) {
5902
- case TranscriptStatus.queued:
5901
+ case "queued":
5903
5902
  status = "queued";
5904
5903
  break;
5905
- case TranscriptStatus.processing:
5904
+ case "processing":
5906
5905
  status = "processing";
5907
5906
  break;
5908
- case TranscriptStatus.completed:
5907
+ case "completed":
5909
5908
  status = "completed";
5910
5909
  break;
5911
- case TranscriptStatus.error:
5910
+ case "error":
5912
5911
  status = "error";
5913
5912
  break;
5914
5913
  default:
5915
5914
  status = "queued";
5916
5915
  }
5917
- if (response.status === TranscriptStatus.error) {
5916
+ if (response.status === "error") {
5918
5917
  return {
5919
5918
  success: false,
5920
5919
  provider: this.name,
@@ -6566,8 +6565,14 @@ var DeepgramAdapter = class extends BaseAdapter {
6566
6565
  /**
6567
6566
  * Submit audio for transcription
6568
6567
  *
6569
- * Sends audio to Deepgram API for transcription. Deepgram processes
6570
- * synchronously and returns results immediately (no polling required).
6568
+ * Sends audio to Deepgram API for transcription. Deepgram normally processes
6569
+ * synchronously and returns results immediately.
6570
+ *
6571
+ * **Callback mode:** When `webhookUrl` is set, Deepgram returns immediately
6572
+ * with a `request_id` (status `"queued"`). The full transcript is POSTed to
6573
+ * the webhook URL — this is the primary delivery mechanism. `getTranscript()`
6574
+ * can attempt to retrieve the result later via request history, but that
6575
+ * endpoint is best-effort and not a guaranteed durable store.
6571
6576
  *
6572
6577
  * @param audio - Audio input (URL or file buffer)
6573
6578
  * @param options - Transcription options
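A minimal sketch of the callback path described above, using the same constructor and `initialize()` shape shown in the example previously attached to `getTranscript()`; the webhook endpoint URL is illustrative:

```typescript
const adapter = new DeepgramAdapter();
adapter.initialize({
  apiKey: process.env.DEEPGRAM_API_KEY!,
  projectId: process.env.DEEPGRAM_PROJECT_ID, // needed later for getTranscript()
});

const submitted = await adapter.transcribe(
  { type: "url", url: "https://example.com/call.wav" },
  { webhookUrl: "https://example.com/hooks/deepgram" }
);

if (submitted.success) {
  // In callback mode the text is empty and status is "queued"; the full
  // transcript is delivered to the webhook URL instead.
  console.log(submitted.data?.status, submitted.tracking?.requestId);
}
```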
@@ -6618,47 +6623,81 @@ var DeepgramAdapter = class extends BaseAdapter {
6618
6623
  { params }
6619
6624
  ).then((res) => res.data);
6620
6625
  } else if (audio.type === "file") {
6621
- response = await this.client.post("/listen", audio.file, {
6622
- params,
6623
- headers: {
6624
- "Content-Type": "audio/*"
6626
+ response = await this.client.post(
6627
+ "/listen",
6628
+ audio.file,
6629
+ {
6630
+ params,
6631
+ headers: {
6632
+ "Content-Type": "audio/*"
6633
+ }
6625
6634
  }
6626
- }).then((res) => res.data);
6635
+ ).then((res) => res.data);
6627
6636
  } else {
6628
6637
  throw new Error(
6629
6638
  "Deepgram adapter does not support stream type for pre-recorded transcription. Use transcribeStream() for real-time streaming."
6630
6639
  );
6631
6640
  }
6641
+ if (options?.webhookUrl) {
6642
+ const requestId = ("request_id" in response ? response.request_id : void 0) || ("metadata" in response ? response.metadata?.request_id : void 0);
6643
+ if (!requestId) {
6644
+ return {
6645
+ success: false,
6646
+ provider: this.name,
6647
+ error: {
6648
+ code: "MISSING_REQUEST_ID",
6649
+ message: "Deepgram callback mode did not return a request ID"
6650
+ },
6651
+ raw: response
6652
+ };
6653
+ }
6654
+ return {
6655
+ success: true,
6656
+ provider: this.name,
6657
+ data: {
6658
+ id: requestId,
6659
+ text: "",
6660
+ status: "queued"
6661
+ },
6662
+ tracking: {
6663
+ requestId
6664
+ },
6665
+ raw: response
6666
+ };
6667
+ }
6668
+ if (!("results" in response) || !("metadata" in response)) {
6669
+ return {
6670
+ success: false,
6671
+ provider: this.name,
6672
+ error: {
6673
+ code: "INVALID_RESPONSE",
6674
+ message: "Deepgram did not return a synchronous transcription payload"
6675
+ },
6676
+ raw: response
6677
+ };
6678
+ }
6632
6679
  return this.normalizeResponse(response);
6633
6680
  } catch (error) {
6634
6681
  return this.createErrorResponse(error);
6635
6682
  }
6636
6683
  }
6637
6684
  /**
6638
- * Get transcription result by ID
6685
+ * Get transcription result by ID (best-effort)
6639
6686
  *
6640
- * Retrieves a previous transcription from Deepgram's request history.
6641
- *
6642
- * Unlike the list endpoint, getting a single request DOES include the full
6643
- * transcript response. Requires `projectId` to be set during initialization.
6687
+ * Retrieves a previous transcription from Deepgram's request history API.
6688
+ * Requires `projectId` to be set during initialization.
6644
6689
  *
6645
- * @param transcriptId - Request ID from a previous transcription
6646
- * @returns Full transcript response including text, words, and metadata
6690
+ * **Important:** Deepgram's request history is best-effort. Requests may
6691
+ * expire or be unavailable depending on your plan and retention settings.
6692
+ * This is NOT a durable transcript store — for reliable retrieval, use
6693
+ * callback mode (`webhookUrl`) and persist the webhook payload yourself.
6647
6694
  *
6648
- * @example Get a transcript by request ID
6649
- * ```typescript
6650
- * const adapter = new DeepgramAdapter()
6651
- * adapter.initialize({
6652
- * apiKey: process.env.DEEPGRAM_API_KEY,
6653
- * projectId: process.env.DEEPGRAM_PROJECT_ID
6654
- * })
6695
+ * The response field on the request history entry is cast to
6696
+ * `ListenV1Response` — this appears to work in practice but is not
6697
+ * explicitly documented by Deepgram as a guaranteed contract.
6655
6698
  *
6656
- * const result = await adapter.getTranscript('abc123-request-id')
6657
- * if (result.success) {
6658
- * console.log(result.data?.text)
6659
- * console.log(result.data?.words)
6660
- * }
6661
- * ```
6699
+ * @param transcriptId - Request ID from a previous transcription
6700
+ * @returns Transcript response if still available in request history
6662
6701
  *
6663
6702
  * @see https://developers.deepgram.com/reference/get-request
6664
6703
  */
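Because the request-history endpoint is best-effort, the docstring above recommends persisting the callback payload yourself. One possible sketch of that, assuming an Express server and a `saveTranscript` helper of your own (neither ships with this package), and assuming the callback body mirrors the synchronous response shape including `metadata.request_id`:

```typescript
import express from "express";

// Hypothetical durable store: replace with your own database or object storage call.
declare function saveTranscript(id: string | undefined, payload: unknown): Promise<void>;

const app = express();

app.post("/hooks/deepgram", express.json({ limit: "10mb" }), async (req, res) => {
  const requestId = req.body?.metadata?.request_id as string | undefined;
  await saveTranscript(requestId, req.body); // persist the full payload before acknowledging
  res.sendStatus(200);
});

app.listen(3000);
```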
@@ -7289,7 +7328,8 @@ var DeepgramAdapter = class extends BaseAdapter {
7289
7328
  break;
7290
7329
  }
7291
7330
  case "Metadata": {
7292
- callbacks?.onMetadata?.(message);
7331
+ const { type: _, ...metadata } = message;
7332
+ callbacks?.onMetadata?.(metadata);
7293
7333
  break;
7294
7334
  }
7295
7335
  case "Error": {
@@ -7725,10 +7765,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
7725
7765
  contentUrls: [audio.url],
7726
7766
  properties: this.buildTranscriptionProperties(options)
7727
7767
  };
7728
- const response = await transcriptionsCreate(
7729
- transcriptionRequest,
7730
- this.getAxiosConfig()
7731
- );
7768
+ const response = await transcriptionsCreate(transcriptionRequest, this.getAxiosConfig());
7732
7769
  const transcription = response.data;
7733
7770
  const transcriptId = transcription.self?.split("/").pop() || "";
7734
7771
  return await this.pollForCompletion(transcriptId);
@@ -8268,7 +8305,6 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
8268
8305
  const request = {
8269
8306
  ...options?.openai,
8270
8307
  file: audioData,
8271
- // Buffer/Blob both accepted at runtime; generated type expects Blob
8272
8308
  model
8273
8309
  };
8274
8310
  if (options?.language) {
@@ -8288,11 +8324,7 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
8288
8324
  request.response_format = OpenAIResponseFormat.json;
8289
8325
  }
8290
8326
  const response = await createTranscription(request, this.getAxiosConfig());
8291
- return this.normalizeResponse(
8292
- response.data,
8293
- model,
8294
- isDiarization
8295
- );
8327
+ return this.normalizeResponse(response.data, model, isDiarization);
8296
8328
  } catch (error) {
8297
8329
  return this.createErrorResponse(error);
8298
8330
  }
@@ -8699,7 +8731,6 @@ function createOpenAIWhisperAdapter(config) {
8699
8731
 
8700
8732
  // src/adapters/speechmatics-adapter.ts
8701
8733
  import axios8 from "axios";
8702
- import WebSocket6 from "ws";
8703
8734
 
8704
8735
  // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
8705
8736
  var NotificationConfigContentsItem = {
@@ -8884,16 +8915,13 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
8884
8915
  jobConfig.fetch_data = {
8885
8916
  url: audio.url
8886
8917
  };
8887
- const formData = new FormData();
8888
- formData.append("config", JSON.stringify(jobConfig));
8889
- requestBody = formData;
8890
- headers = { "Content-Type": "multipart/form-data" };
8918
+ requestBody = { config: JSON.stringify(jobConfig) };
8919
+ headers = { "Content-Type": "application/json" };
8891
8920
  } else if (audio.type === "file") {
8892
- const formData = new FormData();
8893
- formData.append("config", JSON.stringify(jobConfig));
8894
- const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
8895
- formData.append("data_file", audioBlob, audio.filename || "audio.wav");
8896
- requestBody = formData;
8921
+ requestBody = {
8922
+ config: JSON.stringify(jobConfig),
8923
+ data_file: audio.file
8924
+ };
8897
8925
  headers = { "Content-Type": "multipart/form-data" };
8898
8926
  } else {
8899
8927
  return {
@@ -8999,216 +9027,224 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
8999
9027
  }
9000
9028
  }
9001
9029
  /**
9002
- * Build WebSocket URL for real-time streaming
9030
+ * Get the regional WebSocket host for real-time streaming
9003
9031
  *
9004
- * Note: Real-time API uses a different host from the batch API:
9005
- * - Batch: {region}.asr.api.speechmatics.com
9006
- * - Real-time: {region}.rt.speechmatics.com
9007
- *
9008
- * @param region - Regional endpoint identifier
9009
- * @returns WebSocket URL for real-time API
9032
+ * Speechmatics RT uses a different host pattern: {region}.rt.speechmatics.com
9010
9033
  */
9011
- getRegionalWsUrl(region) {
9012
- if (this.config?.wsBaseUrl) {
9013
- return this.config.wsBaseUrl;
9014
- }
9015
- const rtRegionMap = {
9016
- eu1: "eu",
9017
- eu2: "eu",
9018
- us1: "us",
9019
- us2: "us",
9020
- au1: "eu"
9021
- // No AU RT endpoint — fall back to EU
9022
- };
9023
- const rtPrefix = rtRegionMap[region || ""] || "eu";
9024
- return `wss://${rtPrefix}.rt.speechmatics.com/v2`;
9034
+ getRegionalWsHost(region) {
9035
+ const regionPrefix = region || "eu1";
9036
+ return `${regionPrefix}.rt.speechmatics.com`;
9025
9037
  }
9026
9038
  /**
9027
- * Stream audio for real-time transcription via WebSocket
9028
- *
9029
- * Connects to Speechmatics' real-time API and sends audio chunks
9030
- * for transcription with results returned via callbacks.
9039
+ * Stream audio for real-time transcription
9031
9040
  *
9032
- * @param options - Streaming configuration options
9033
- * @param callbacks - Event callbacks for transcription results
9034
- * @returns Promise that resolves with a StreamingSession
9041
+ * Creates a WebSocket connection to the Speechmatics Real-Time API.
9042
+ * Protocol: send StartRecognition config, then AddAudio binary frames,
9043
+ * receive AddPartialTranscript/AddTranscript/EndOfUtterance messages.
9035
9044
  *
9036
- * @example Basic streaming
9037
- * ```typescript
9038
- * const session = await adapter.transcribeStream({
9039
- * language: 'en',
9040
- * speechmaticsStreaming: {
9041
- * enablePartials: true,
9042
- * operatingPoint: 'enhanced'
9043
- * }
9044
- * }, {
9045
- * onTranscript: (event) => console.log(event.text),
9046
- * onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
9047
- * onError: (error) => console.error(error)
9048
- * });
9045
+ * @param options - Streaming configuration
9046
+ * @param callbacks - Event callbacks
9047
+ * @returns StreamingSession for sending audio and closing
9049
9048
  *
9050
- * await session.sendAudio({ data: audioBuffer });
9051
- * await session.close();
9052
- * ```
9049
+ * @see https://docs.speechmatics.com/rt-api-ref
9053
9050
  */
9054
9051
  async transcribeStream(options, callbacks) {
9055
9052
  this.validateConfig();
9056
- const smOpts = options?.speechmaticsStreaming || {};
9057
- const region = smOpts.region || this.config?.region;
9058
- const wsUrl = this.getRegionalWsUrl(region);
9059
- const ws = new WebSocket6(wsUrl, {
9060
- headers: {
9061
- Authorization: `Bearer ${this.config.apiKey}`
9062
- }
9063
- });
9064
- let sessionStatus = "connecting";
9065
- const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
9066
- let seqNo = 0;
9067
- let utteranceResults = [];
9068
- const sessionReady = new Promise((resolve, reject) => {
9069
- const timeout = setTimeout(() => {
9070
- reject(new Error("WebSocket connection timeout"));
9071
- }, 1e4);
9072
- let wsOpen = false;
9073
- ws.once("error", (error) => {
9074
- clearTimeout(timeout);
9075
- reject(error);
9076
- });
9077
- ws.once("open", () => {
9078
- wsOpen = true;
9079
- const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
9080
- const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
9081
- const startMsg = {
9082
- message: "StartRecognition",
9083
- audio_format: {
9084
- type: "raw",
9085
- encoding,
9086
- sample_rate: sampleRate
9087
- },
9088
- transcription_config: {
9089
- language: smOpts.language || options?.language || "en",
9090
- enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
9091
- }
9092
- };
9093
- const txConfig = startMsg.transcription_config;
9094
- if (smOpts.domain) txConfig.domain = smOpts.domain;
9095
- if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
9096
- if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
9097
- if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
9098
- if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
9099
- if (smOpts.diarization === "speaker" || options?.diarization) {
9100
- txConfig.diarization = "speaker";
9101
- if (smOpts.maxSpeakers) {
9102
- txConfig.speaker_diarization_config = {
9103
- max_speakers: smOpts.maxSpeakers
9104
- };
9105
- } else if (options?.speakersExpected) {
9106
- txConfig.speaker_diarization_config = {
9107
- max_speakers: options.speakersExpected
9108
- };
9109
- }
9110
- }
9111
- if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
9112
- txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
9113
- content: word
9114
- }));
9115
- } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
9116
- txConfig.additional_vocab = options.customVocabulary.map((word) => ({
9117
- content: word
9118
- }));
9119
- }
9120
- if (smOpts.conversationConfig) {
9121
- txConfig.conversation_config = {
9122
- end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
9123
- };
9124
- }
9125
- const startPayload = JSON.stringify(startMsg);
9126
- if (callbacks?.onRawMessage) {
9127
- callbacks.onRawMessage({
9128
- provider: "speechmatics",
9129
- direction: "outgoing",
9130
- timestamp: Date.now(),
9131
- payload: startPayload,
9132
- messageType: "StartRecognition"
9133
- });
9053
+ const sessionId = `speechmatics_${Date.now()}_${Math.random().toString(36).substring(7)}`;
9054
+ const createdAt = /* @__PURE__ */ new Date();
9055
+ const smOpts = options?.speechmaticsStreaming;
9056
+ const region = smOpts?.region || this.config?.region;
9057
+ const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost(region)}`);
9058
+ const wsUrl = `${wsBase}/v2`;
9059
+ let status = "connecting";
9060
+ let recognitionStarted = false;
9061
+ const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
9062
+ const ws = new WebSocketImpl(wsUrl);
9063
+ const language = smOpts?.language || options?.language || "en";
9064
+ const transcriptionConfig = {
9065
+ language,
9066
+ enable_entities: smOpts?.enableEntities ?? options?.entityDetection ?? false,
9067
+ enable_partials: smOpts?.enablePartials ?? options?.interimResults !== false,
9068
+ operating_point: smOpts?.operatingPoint || OperatingPoint.enhanced,
9069
+ ...smOpts?.maxDelay !== void 0 && { max_delay: smOpts.maxDelay },
9070
+ ...smOpts?.maxDelayMode && {
9071
+ max_delay_mode: smOpts.maxDelayMode
9072
+ },
9073
+ ...smOpts?.domain && { domain: smOpts.domain },
9074
+ ...(options?.diarization || smOpts?.diarization === TranscriptionConfigDiarization.speaker) && {
9075
+ diarization: TranscriptionConfigDiarization.speaker,
9076
+ ...smOpts?.maxSpeakers !== void 0 && {
9077
+ speaker_diarization_config: { max_speakers: smOpts.maxSpeakers }
9134
9078
  }
9135
- ws.send(startPayload);
9136
- });
9137
- const onMessage = (data) => {
9138
- const rawPayload = data.toString();
9139
- try {
9140
- const msg = JSON.parse(rawPayload);
9141
- if (msg.message === "RecognitionStarted") {
9142
- clearTimeout(timeout);
9143
- ws.removeListener("message", onMessage);
9144
- ws.emit("message", data);
9145
- resolve();
9146
- } else if (msg.message === "Error") {
9147
- clearTimeout(timeout);
9148
- ws.removeListener("message", onMessage);
9149
- reject(new Error(msg.reason || "Recognition failed to start"));
9150
- }
9151
- } catch {
9079
+ },
9080
+ ...(options?.customVocabulary?.length || smOpts?.additionalVocab?.length) && {
9081
+ additional_vocab: (smOpts?.additionalVocab || options?.customVocabulary || []).map(
9082
+ (term) => ({ content: term })
9083
+ )
9084
+ }
9085
+ };
9086
+ const startRecognition = {
9087
+ message: "StartRecognition",
9088
+ audio_format: {
9089
+ type: "raw",
9090
+ encoding: smOpts?.encoding || "pcm_s16le",
9091
+ sample_rate: smOpts?.sampleRate || options?.sampleRate || 16e3
9092
+ },
9093
+ transcription_config: transcriptionConfig,
9094
+ ...smOpts?.conversationConfig && {
9095
+ conversation_config: {
9096
+ end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
9152
9097
  }
9153
- };
9154
- ws.on("message", onMessage);
9155
- });
9156
- ws.on("message", (data) => {
9157
- const rawPayload = data.toString();
9098
+ }
9099
+ };
9100
+ ws.onopen = () => {
9101
+ status = "open";
9102
+ const msg = JSON.stringify(startRecognition);
9103
+ if (callbacks?.onRawMessage) {
9104
+ callbacks.onRawMessage({
9105
+ provider: this.name,
9106
+ direction: "outgoing",
9107
+ timestamp: Date.now(),
9108
+ payload: msg,
9109
+ messageType: "StartRecognition"
9110
+ });
9111
+ }
9112
+ ws.send(msg);
9113
+ };
9114
+ ws.onmessage = (event) => {
9115
+ const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
9158
9116
  try {
9159
- const message = JSON.parse(rawPayload);
9117
+ const data = JSON.parse(rawPayload);
9118
+ const messageType = data.message;
9160
9119
  if (callbacks?.onRawMessage) {
9161
9120
  callbacks.onRawMessage({
9162
- provider: "speechmatics",
9121
+ provider: this.name,
9163
9122
  direction: "incoming",
9164
9123
  timestamp: Date.now(),
9165
9124
  payload: rawPayload,
9166
- messageType: message.message
9125
+ messageType
9167
9126
  });
9168
9127
  }
9169
- this.handleStreamingMessage(message, callbacks, utteranceResults);
9170
- } catch (error) {
9171
- if (callbacks?.onRawMessage) {
9172
- callbacks.onRawMessage({
9173
- provider: "speechmatics",
9174
- direction: "incoming",
9175
- timestamp: Date.now(),
9176
- payload: rawPayload,
9177
- messageType: "parse_error"
9178
- });
9128
+ switch (messageType) {
9129
+ case "RecognitionStarted": {
9130
+ recognitionStarted = true;
9131
+ callbacks?.onOpen?.();
9132
+ callbacks?.onMetadata?.({
9133
+ id: data.id,
9134
+ languagePackInfo: data.language_pack_info
9135
+ });
9136
+ break;
9137
+ }
9138
+ case "AddPartialTranscript": {
9139
+ const partial = data;
9140
+ const words = this.resultsToWords(partial.results);
9141
+ callbacks?.onTranscript?.({
9142
+ type: "transcript",
9143
+ text: partial.metadata.transcript,
9144
+ isFinal: false,
9145
+ words,
9146
+ speaker: words[0]?.speaker,
9147
+ confidence: partial.results[0]?.alternatives?.[0]?.confidence,
9148
+ channel: partial.channel ? parseInt(partial.channel) : void 0
9149
+ });
9150
+ break;
9151
+ }
9152
+ case "AddTranscript": {
9153
+ const final = data;
9154
+ const words = this.resultsToWords(final.results);
9155
+ callbacks?.onTranscript?.({
9156
+ type: "transcript",
9157
+ text: final.metadata.transcript,
9158
+ isFinal: true,
9159
+ words,
9160
+ speaker: words[0]?.speaker,
9161
+ confidence: final.results[0]?.alternatives?.[0]?.confidence,
9162
+ channel: final.channel ? parseInt(final.channel) : void 0
9163
+ });
9164
+ if (options?.diarization || smOpts?.diarization === "speaker") {
9165
+ const utterances = buildUtterancesFromWords(words);
9166
+ for (const utterance of utterances) {
9167
+ callbacks?.onUtterance?.(utterance);
9168
+ }
9169
+ }
9170
+ break;
9171
+ }
9172
+ case "EndOfUtterance": {
9173
+ break;
9174
+ }
9175
+ case "EndOfTranscript": {
9176
+ callbacks?.onClose?.(1e3, "Transcription complete");
9177
+ break;
9178
+ }
9179
+ case "Error": {
9180
+ const err = data;
9181
+ callbacks?.onError?.({
9182
+ code: err.type || "SPEECHMATICS_ERROR",
9183
+ message: err.reason || "Unknown error"
9184
+ });
9185
+ break;
9186
+ }
9187
+ case "Warning": {
9188
+ const warn = data;
9189
+ callbacks?.onMetadata?.({
9190
+ warning: warn.type,
9191
+ reason: warn.reason
9192
+ });
9193
+ break;
9194
+ }
9195
+ case "Info": {
9196
+ callbacks?.onMetadata?.(data);
9197
+ break;
9198
+ }
9199
+ case "AudioAdded":
9200
+ case "ChannelAudioAdded":
9201
+ break;
9202
+ default:
9203
+ callbacks?.onMetadata?.(data);
9204
+ break;
9179
9205
  }
9206
+ } catch (error) {
9180
9207
  callbacks?.onError?.({
9181
9208
  code: "PARSE_ERROR",
9182
- message: "Failed to parse WebSocket message",
9183
- details: error
9209
+ message: `Failed to parse message: ${error}`
9184
9210
  });
9185
9211
  }
9186
- });
9187
- ws.on("error", (error) => {
9212
+ };
9213
+ ws.onerror = () => {
9188
9214
  callbacks?.onError?.({
9189
9215
  code: "WEBSOCKET_ERROR",
9190
- message: error.message,
9191
- details: error
9216
+ message: "WebSocket error occurred"
9192
9217
  });
9218
+ };
9219
+ ws.onclose = (event) => {
9220
+ status = "closed";
9221
+ callbacks?.onClose?.(event.code, event.reason);
9222
+ };
9223
+ await new Promise((resolve, reject) => {
9224
+ const timeout = setTimeout(() => {
9225
+ reject(new Error("WebSocket connection timeout"));
9226
+ }, 1e4);
9227
+ const checkReady = () => {
9228
+ if (recognitionStarted) {
9229
+ clearTimeout(timeout);
9230
+ resolve();
9231
+ } else if (status === "closed") {
9232
+ clearTimeout(timeout);
9233
+ reject(new Error("WebSocket connection failed"));
9234
+ } else {
9235
+ setTimeout(checkReady, 100);
9236
+ }
9237
+ };
9238
+ checkReady();
9193
9239
  });
9194
- ws.on("close", (code, reason) => {
9195
- sessionStatus = "closed";
9196
- callbacks?.onClose?.(code, reason.toString());
9197
- });
9198
- await sessionReady;
9199
- sessionStatus = "open";
9200
- callbacks?.onOpen?.();
9201
9240
  return {
9202
9241
  id: sessionId,
9203
9242
  provider: this.name,
9204
- createdAt: /* @__PURE__ */ new Date(),
9205
- getStatus: () => sessionStatus,
9243
+ createdAt,
9244
+ getStatus: () => status,
9206
9245
  sendAudio: async (chunk) => {
9207
- if (sessionStatus !== "open") {
9208
- throw new Error(`Cannot send audio: session is ${sessionStatus}`);
9209
- }
9210
- if (ws.readyState !== WebSocket6.OPEN) {
9211
- throw new Error("WebSocket is not open");
9246
+ if (status !== "open") {
9247
+ throw new Error("Session is not open");
9212
9248
  }
9213
9249
  if (callbacks?.onRawMessage) {
9214
9250
  const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
@@ -9224,12 +9260,11 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
9224
9260
  });
9225
9261
  }
9226
9262
  ws.send(chunk.data);
9227
- seqNo++;
9228
- if (chunk.isLast) {
9229
- const endMsg = JSON.stringify({
9230
- message: "EndOfStream",
9231
- last_seq_no: seqNo
9232
- });
9263
+ },
9264
+ close: async () => {
9265
+ if (status === "open") {
9266
+ status = "closing";
9267
+ const endMsg = JSON.stringify({ message: "EndOfStream", last_seq_no: 0 });
9233
9268
  if (callbacks?.onRawMessage) {
9234
9269
  callbacks.onRawMessage({
9235
9270
  provider: this.name,
@@ -9241,144 +9276,19 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
9241
9276
  }
9242
9277
  ws.send(endMsg);
9243
9278
  }
9244
- },
9245
- close: async () => {
9246
- if (sessionStatus === "closed" || sessionStatus === "closing") {
9247
- return;
9248
- }
9249
- sessionStatus = "closing";
9250
- if (ws.readyState === WebSocket6.OPEN) {
9251
- seqNo++;
9252
- ws.send(
9253
- JSON.stringify({
9254
- message: "EndOfStream",
9255
- last_seq_no: seqNo
9256
- })
9257
- );
9258
- }
9259
- return new Promise((resolve) => {
9260
- const timeout = setTimeout(() => {
9261
- ws.terminate();
9262
- sessionStatus = "closed";
9263
- resolve();
9264
- }, 5e3);
9265
- const onMsg = (data) => {
9266
- try {
9267
- const msg = JSON.parse(data.toString());
9268
- if (msg.message === "EndOfTranscript") {
9269
- ws.removeListener("message", onMsg);
9270
- clearTimeout(timeout);
9271
- ws.close();
9272
- }
9273
- } catch {
9274
- }
9275
- };
9276
- ws.on("message", onMsg);
9277
- ws.once("close", () => {
9278
- clearTimeout(timeout);
9279
- sessionStatus = "closed";
9280
- resolve();
9281
- });
9282
- });
9283
9279
  }
9284
9280
  };
9285
9281
  }
9286
9282
  /**
9287
- * Handle incoming Speechmatics real-time WebSocket messages
9288
- */
9289
- handleStreamingMessage(message, callbacks, utteranceResults) {
9290
- switch (message.message) {
9291
- case "RecognitionStarted": {
9292
- break;
9293
- }
9294
- case "AddPartialTranscript": {
9295
- const results = message.results || [];
9296
- const text = buildTextFromSpeechmaticsResults(results);
9297
- if (text) {
9298
- callbacks?.onTranscript?.({
9299
- type: "transcript",
9300
- text,
9301
- isFinal: false,
9302
- words: this.extractWordsFromResults(results),
9303
- data: message
9304
- });
9305
- }
9306
- break;
9307
- }
9308
- case "AddTranscript": {
9309
- const results = message.results || [];
9310
- const text = buildTextFromSpeechmaticsResults(results);
9311
- if (utteranceResults) {
9312
- utteranceResults.push(...results);
9313
- }
9314
- if (text) {
9315
- callbacks?.onTranscript?.({
9316
- type: "transcript",
9317
- text,
9318
- isFinal: true,
9319
- words: this.extractWordsFromResults(results),
9320
- data: message
9321
- });
9322
- }
9323
- break;
9324
- }
9325
- case "EndOfUtterance": {
9326
- if (utteranceResults && utteranceResults.length > 0) {
9327
- const text = buildTextFromSpeechmaticsResults(utteranceResults);
9328
- const words = this.extractWordsFromResults(utteranceResults);
9329
- const utterances = buildUtterancesFromWords(words);
9330
- if (utterances.length > 0) {
9331
- for (const utt of utterances) {
9332
- callbacks?.onUtterance?.(utt);
9333
- }
9334
- } else if (text) {
9335
- callbacks?.onUtterance?.({
9336
- text,
9337
- start: words.length > 0 ? words[0].start : 0,
9338
- end: words.length > 0 ? words[words.length - 1].end : 0,
9339
- words
9340
- });
9341
- }
9342
- utteranceResults.length = 0;
9343
- }
9344
- break;
9345
- }
9346
- case "AudioAdded": {
9347
- break;
9348
- }
9349
- case "EndOfTranscript": {
9350
- break;
9351
- }
9352
- case "Info":
9353
- case "Warning": {
9354
- callbacks?.onMetadata?.(message);
9355
- break;
9356
- }
9357
- case "Error": {
9358
- const errMsg = message;
9359
- callbacks?.onError?.({
9360
- code: errMsg.type || "SPEECHMATICS_ERROR",
9361
- message: errMsg.reason || "Unknown error",
9362
- details: message
9363
- });
9364
- break;
9365
- }
9366
- default: {
9367
- callbacks?.onMetadata?.(message);
9368
- break;
9369
- }
9370
- }
9371
- }
9372
- /**
9373
- * Extract unified Word[] from Speechmatics recognition results
9283
+ * Convert Speechmatics RecognitionResult[] to unified Word[]
9374
9284
  */
9375
- extractWordsFromResults(results) {
9376
- return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
9377
- word: result.alternatives?.[0]?.content || "",
9378
- start: result.start_time,
9379
- end: result.end_time,
9380
- confidence: result.alternatives?.[0]?.confidence,
9381
- speaker: result.alternatives?.[0]?.speaker
9285
+ resultsToWords(results) {
9286
+ return results.filter((r) => r.type === "word").map((r) => ({
9287
+ word: r.alternatives?.[0]?.content || "",
9288
+ start: r.start_time,
9289
+ end: r.end_time,
9290
+ confidence: r.alternatives?.[0]?.confidence,
9291
+ speaker: r.alternatives?.[0]?.speaker
9382
9292
  }));
9383
9293
  }
9384
9294
  /**
@@ -9449,9 +9359,6 @@ function createSpeechmaticsAdapter(config) {
9449
9359
  return adapter;
9450
9360
  }
9451
9361
 
9452
- // src/adapters/soniox-adapter.ts
9453
- import axios9 from "axios";
9454
-
9455
9362
  // src/generated/soniox/schema/transcriptionStatus.ts
9456
9363
  var TranscriptionStatus = {
9457
9364
  queued: "queued",
@@ -9460,6 +9367,57 @@ var TranscriptionStatus = {
9460
9367
  error: "error"
9461
9368
  };
9462
9369
 
9370
+ // src/generated/soniox/api/sonioxPublicAPI.ts
9371
+ import axios9 from "axios";
9372
+
9373
+ // src/generated/soniox/schema/index.ts
9374
+ var schema_exports4 = {};
9375
+ __export(schema_exports4, {
9376
+ TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
9377
+ TranscriptionMode: () => TranscriptionMode,
9378
+ TranscriptionStatus: () => TranscriptionStatus,
9379
+ TranslationConfigType: () => TranslationConfigType
9380
+ });
9381
+
9382
+ // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
9383
+ var TemporaryApiKeyUsageType = {
9384
+ transcribe_websocket: "transcribe_websocket"
9385
+ };
9386
+
9387
+ // src/generated/soniox/schema/transcriptionMode.ts
9388
+ var TranscriptionMode = {
9389
+ real_time: "real_time",
9390
+ async: "async"
9391
+ };
9392
+
9393
+ // src/generated/soniox/schema/translationConfigType.ts
9394
+ var TranslationConfigType = {
9395
+ one_way: "one_way",
9396
+ two_way: "two_way"
9397
+ };
9398
+
9399
+ // src/generated/soniox/api/sonioxPublicAPI.ts
9400
+ var uploadFile = (uploadFileBody2, options) => {
9401
+ const formData = new FormData();
9402
+ if (uploadFileBody2.client_reference_id !== void 0 && uploadFileBody2.client_reference_id !== null) {
9403
+ formData.append("client_reference_id", uploadFileBody2.client_reference_id);
9404
+ }
9405
+ formData.append("file", uploadFileBody2.file);
9406
+ return axios9.post("/v1/files", formData, options);
9407
+ };
9408
+ var createTranscription2 = (createTranscriptionPayload, options) => {
9409
+ return axios9.post("/v1/transcriptions", createTranscriptionPayload, options);
9410
+ };
9411
+ var getTranscription = (transcriptionId, options) => {
9412
+ return axios9.get(`/v1/transcriptions/${transcriptionId}`, options);
9413
+ };
9414
+ var getTranscriptionTranscript = (transcriptionId, options) => {
9415
+ return axios9.get(`/v1/transcriptions/${transcriptionId}/transcript`, options);
9416
+ };
9417
+ var getModels = (options) => {
9418
+ return axios9.get("/v1/models", options);
9419
+ };
9420
+
9463
9421
  // src/adapters/soniox-adapter.ts
9464
9422
  var SonioxAdapter = class extends BaseAdapter {
9465
9423
  constructor() {
@@ -9514,11 +9472,17 @@ var SonioxAdapter = class extends BaseAdapter {
9514
9472
  }
9515
9473
  }
9516
9474
  /**
9517
- * Get the base URL for API requests
9475
+ * Get the base URL for API requests (no /v1 suffix — generated functions include /v1 in paths)
9518
9476
  */
9519
9477
  get baseUrl() {
9520
9478
  if (this.config?.baseUrl) return this.config.baseUrl;
9521
- return `https://${this.getRegionalHost()}/v1`;
9479
+ return `https://${this.getRegionalHost()}`;
9480
+ }
9481
+ /**
9482
+ * Build axios config with Soniox Bearer auth
9483
+ */
9484
+ getAxiosConfig() {
9485
+ return super.getAxiosConfig("Authorization", (key) => `Bearer ${key}`);
9522
9486
  }
9523
9487
  initialize(config) {
9524
9488
  super.initialize(config);
@@ -9528,15 +9492,6 @@ var SonioxAdapter = class extends BaseAdapter {
9528
9492
  if (config.model) {
9529
9493
  this.defaultModel = config.model;
9530
9494
  }
9531
- this.client = axios9.create({
9532
- baseURL: this.baseUrl,
9533
- timeout: config.timeout || 12e4,
9534
- headers: {
9535
- Authorization: `Bearer ${config.apiKey}`,
9536
- "Content-Type": "application/json",
9537
- ...config.headers
9538
- }
9539
- });
9540
9495
  }
9541
9496
  /**
9542
9497
  * Get current region
@@ -9566,23 +9521,12 @@ var SonioxAdapter = class extends BaseAdapter {
9566
9521
  */
9567
9522
  setRegion(region) {
9568
9523
  this.region = region;
9569
- if (this.config?.apiKey) {
9570
- this.client = axios9.create({
9571
- baseURL: this.baseUrl,
9572
- timeout: this.config.timeout || 12e4,
9573
- headers: {
9574
- Authorization: `Bearer ${this.config.apiKey}`,
9575
- "Content-Type": "application/json",
9576
- ...this.config.headers
9577
- }
9578
- });
9579
- }
9580
9524
  }
9581
9525
  /**
9582
9526
  * Submit audio for transcription
9583
9527
  *
9584
- * Soniox uses async batch processing. The transcribe method submits audio
9585
- * and waits for completion (or use getTranscript for polling).
9528
+ * Uses the async v1 API: createTranscription returns status `queued`,
9529
+ * then the adapter polls until completion (or returns immediately if a webhook is set).
9586
9530
  *
9587
9531
  * @param audio - Audio input (URL or file)
9588
9532
  * @param options - Transcription options
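A minimal sketch of the two paths described above (internal polling versus webhook), assuming `SonioxAdapter` is exported from the package entry point; URLs are illustrative:

```typescript
const adapter = new SonioxAdapter();
adapter.initialize({ apiKey: process.env.SONIOX_API_KEY! });

// No webhook: the adapter polls internally and resolves with the finished result.
const done = await adapter.transcribe({ type: "url", url: "https://example.com/a.wav" });
if (done.success) console.log(done.data?.text);

// Webhook set: the call returns immediately with status "queued".
const queued = await adapter.transcribe(
  { type: "url", url: "https://example.com/a.wav" },
  { webhookUrl: "https://example.com/hooks/soniox" }
);
console.log(queued.data?.id, queued.data?.status); // keep the id to correlate later
```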
@@ -9591,21 +9535,44 @@ var SonioxAdapter = class extends BaseAdapter {
9591
9535
  async transcribe(audio, options) {
9592
9536
  this.validateConfig();
9593
9537
  try {
9594
- const requestBody = {
9595
- model: options?.model || this.defaultModel
9596
- };
9597
- if (audio.type === "url") {
9598
- requestBody.audio_url = audio.url;
9599
- } else if (audio.type === "file") {
9600
- const formData = new FormData();
9538
+ const sonioxOpts = options?.soniox;
9539
+ if (audio.type === "file") {
9601
9540
  const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
9602
- formData.append("file", audioBlob, audio.filename || "audio.wav");
9603
- const uploadResponse = await this.client.post("/files", formData, {
9604
- headers: {
9605
- "Content-Type": "multipart/form-data"
9606
- }
9607
- });
9608
- requestBody.file_id = uploadResponse.data.id;
9541
+ const uploadBody = { file: audioBlob };
9542
+ const fileResp = await uploadFile(uploadBody, this.getAxiosConfig());
9543
+ const payload = {
9544
+ ...sonioxOpts,
9545
+ model: options?.model || this.defaultModel,
9546
+ file_id: fileResp.data.id,
9547
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
9548
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
9549
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
9550
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
9551
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
9552
+ };
9553
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
9554
+ const meta = createResp.data;
9555
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
9556
+ return this.normalizeTranscription(meta);
9557
+ }
9558
+ return this.pollForCompletion(meta.id);
9559
+ } else if (audio.type === "url") {
9560
+ const payload = {
9561
+ ...sonioxOpts,
9562
+ model: options?.model || this.defaultModel,
9563
+ audio_url: audio.url,
9564
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
9565
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
9566
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
9567
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
9568
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
9569
+ };
9570
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
9571
+ const meta = createResp.data;
9572
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
9573
+ return this.normalizeTranscription(meta);
9574
+ }
9575
+ return this.pollForCompletion(meta.id);
9609
9576
  } else {
9610
9577
  return {
9611
9578
  success: false,
@@ -9616,38 +9583,6 @@ var SonioxAdapter = class extends BaseAdapter {
9616
9583
  }
9617
9584
  };
9618
9585
  }
9619
- if (options?.language) {
9620
- requestBody.language_hints = [options.language];
9621
- }
9622
- if (options?.diarization) {
9623
- requestBody.enable_speaker_diarization = true;
9624
- }
9625
- if (options?.languageDetection) {
9626
- requestBody.enable_language_identification = true;
9627
- }
9628
- if (options?.customVocabulary && options.customVocabulary.length > 0) {
9629
- requestBody.context = {
9630
- terms: options.customVocabulary
9631
- };
9632
- }
9633
- if (options?.webhookUrl) {
9634
- requestBody.webhook_url = options.webhookUrl;
9635
- }
9636
- const response = await this.client.post("/transcriptions", requestBody);
9637
- const transcriptionId = response.data.id;
9638
- if (options?.webhookUrl) {
9639
- return {
9640
- success: true,
9641
- provider: this.name,
9642
- data: {
9643
- id: transcriptionId,
9644
- text: "",
9645
- status: "queued"
9646
- },
9647
- raw: response.data
9648
- };
9649
- }
9650
- return await this.pollForCompletion(transcriptionId);
9651
9586
  } catch (error) {
9652
9587
  return this.createErrorResponse(error);
9653
9588
  }
@@ -9655,9 +9590,8 @@ var SonioxAdapter = class extends BaseAdapter {
9655
9590
  /**
9656
9591
  * Get transcription result by ID
9657
9592
  *
9658
- * Checks job status via GET /v1/transcriptions/{id}, then fetches
9659
- * the full transcript via GET /v1/transcriptions/{id}/transcript
9660
- * when completed.
9593
+ * Fetches transcription metadata and, if completed, the transcript text/tokens.
9594
+ * Used by pollForCompletion() for async polling.
9661
9595
  *
9662
9596
  * @param transcriptId - Transcript ID
9663
9597
  * @returns Transcription response
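Where you keep the transcription ID yourself (for example from the webhook path above), a hand-rolled polling loop over `getTranscript()` might look like the following; the interval and attempt cap are illustration values, not package defaults:

```typescript
async function waitForSoniox(adapter: SonioxAdapter, id: string) {
  for (let attempt = 0; attempt < 60; attempt++) {
    const result = await adapter.getTranscript(id);
    if (!result.success) return result;                      // error status surfaced as failure
    if (result.data?.status === "completed") return result;  // text/tokens are present
    await new Promise((r) => setTimeout(r, 5000));           // still queued or processing
  }
  throw new Error(`Soniox transcription ${id} did not complete in time`);
}
```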
@@ -9665,39 +9599,20 @@ var SonioxAdapter = class extends BaseAdapter {
9665
9599
  async getTranscript(transcriptId) {
9666
9600
  this.validateConfig();
9667
9601
  try {
9668
- const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
9669
- const job = statusResponse.data;
9670
- if (job.status === "error") {
9671
- return {
9672
- success: false,
9673
- provider: this.name,
9674
- error: {
9675
- code: "TRANSCRIPTION_ERROR",
9676
- message: job.error_message || "Transcription failed"
9677
- }
9678
- };
9679
- }
9680
- if (job.status !== "completed") {
9681
- return {
9682
- success: true,
9683
- provider: this.name,
9684
- data: {
9685
- id: job.id,
9686
- text: "",
9687
- status: job.status
9688
- },
9689
- raw: job
9690
- };
9602
+ const metaResp = await getTranscription(transcriptId, this.getAxiosConfig());
9603
+ const meta = metaResp.data;
9604
+ if (meta.status === TranscriptionStatus.completed) {
9605
+ try {
9606
+ const transcriptResp = await getTranscriptionTranscript(
9607
+ transcriptId,
9608
+ this.getAxiosConfig()
9609
+ );
9610
+ return this.normalizeTranscription(meta, transcriptResp.data);
9611
+ } catch (transcriptError) {
9612
+ return this.createErrorResponse(transcriptError);
9613
+ }
9691
9614
  }
9692
- const transcriptResponse = await this.client.get(
9693
- `/transcriptions/${transcriptId}/transcript`
9694
- );
9695
- return this.normalizeResponse({
9696
- ...transcriptResponse.data,
9697
- // Carry over job metadata
9698
- id: job.id,
9699
- audio_duration_ms: job.audio_duration_ms
9700
- });
9615
+ return this.normalizeTranscription(meta);
9701
9616
  } catch (error) {
9702
9617
  return this.createErrorResponse(error);
9703
9618
  }
@@ -9717,51 +9632,50 @@ var SonioxAdapter = class extends BaseAdapter {
9717
9632
  const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
9718
9633
  const createdAt = /* @__PURE__ */ new Date();
9719
9634
  const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
9720
- const wsUrl = `${wsBase}/transcribe-websocket`;
9721
- const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
9722
- const sonioxOpts = options?.sonioxStreaming;
9723
- const initMessage = {
9724
- api_key: this.config.apiKey,
9725
- model: modelId
9726
- };
9727
- if (sonioxOpts?.audioFormat) {
9728
- initMessage.audio_format = sonioxOpts.audioFormat;
9729
- } else if (options?.encoding) {
9635
+ const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
9636
+ wsUrl.searchParams.set("api_key", this.config.apiKey);
9637
+ const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
9638
+ wsUrl.searchParams.set("model", modelId);
9639
+ if (options?.encoding) {
9730
9640
  const encodingMap = {
9731
9641
  linear16: "pcm_s16le",
9732
9642
  pcm: "pcm_s16le",
9733
9643
  mulaw: "mulaw",
9734
9644
  alaw: "alaw"
9735
9645
  };
9736
- initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
9646
+ wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
9737
9647
  }
9738
- if (sonioxOpts?.sampleRate || options?.sampleRate) {
9739
- initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
9648
+ if (options?.sampleRate) {
9649
+ wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
9740
9650
  }
9741
- if (sonioxOpts?.numChannels || options?.channels) {
9742
- initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
9651
+ if (options?.channels) {
9652
+ wsUrl.searchParams.set("num_channels", options.channels.toString());
9743
9653
  }
9654
+ const sonioxOpts = options?.sonioxStreaming;
9744
9655
  if (sonioxOpts) {
9745
9656
  if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
9746
- initMessage.language_hints = sonioxOpts.languageHints;
9657
+ wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
9747
9658
  }
9748
9659
  if (sonioxOpts.enableLanguageIdentification) {
9749
- initMessage.enable_language_identification = true;
9660
+ wsUrl.searchParams.set("enable_language_identification", "true");
9750
9661
  }
9751
9662
  if (sonioxOpts.enableEndpointDetection) {
9752
- initMessage.enable_endpoint_detection = true;
9663
+ wsUrl.searchParams.set("enable_endpoint_detection", "true");
9753
9664
  }
9754
9665
  if (sonioxOpts.enableSpeakerDiarization) {
9755
- initMessage.enable_speaker_diarization = true;
9666
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
9756
9667
  }
9757
9668
  if (sonioxOpts.context) {
9758
- initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
9669
+ wsUrl.searchParams.set(
9670
+ "context",
9671
+ typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
9672
+ );
9759
9673
  }
9760
9674
  if (sonioxOpts.translation) {
9761
- initMessage.translation = sonioxOpts.translation;
9675
+ wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
9762
9676
  }
9763
9677
  if (sonioxOpts.clientReferenceId) {
9764
- initMessage.client_reference_id = sonioxOpts.clientReferenceId;
9678
+ wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
9765
9679
  }
9766
9680
  }
9767
9681
  if (!sonioxOpts?.languageHints && options?.language) {
@@ -9770,33 +9684,24 @@ var SonioxAdapter = class extends BaseAdapter {
9770
9684
  `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
9771
9685
  );
9772
9686
  }
9773
- initMessage.language_hints = [options.language];
9687
+ wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
9774
9688
  }
9775
9689
  if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
9776
- initMessage.enable_speaker_diarization = true;
9690
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
9777
9691
  }
9778
9692
  if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
9779
- initMessage.enable_language_identification = true;
9693
+ wsUrl.searchParams.set("enable_language_identification", "true");
9694
+ }
9695
+ if (options?.interimResults !== false) {
9780
9696
  }
9781
9697
  let status = "connecting";
9782
9698
  let openedAt = null;
9783
9699
  let receivedData = false;
9784
9700
  const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
9785
- const ws = new WebSocketImpl(wsUrl);
9701
+ const ws = new WebSocketImpl(wsUrl.toString());
9786
9702
  ws.onopen = () => {
9787
- openedAt = Date.now();
9788
- const initPayload = JSON.stringify(initMessage);
9789
- if (callbacks?.onRawMessage) {
9790
- callbacks.onRawMessage({
9791
- provider: this.name,
9792
- direction: "outgoing",
9793
- timestamp: Date.now(),
9794
- payload: initPayload,
9795
- messageType: "init"
9796
- });
9797
- }
9798
- ws.send(initPayload);
9799
9703
  status = "open";
9704
+ openedAt = Date.now();
9800
9705
  callbacks?.onOpen?.();
9801
9706
  };
9802
9707
  ws.onmessage = (event) => {
@@ -9805,7 +9710,8 @@ var SonioxAdapter = class extends BaseAdapter {
9805
9710
  let messageType;
9806
9711
  try {
9807
9712
  const data = JSON.parse(rawPayload);
9808
- if (data.error) {
9713
+ const errorMessage = data.error_message;
9714
+ if (errorMessage) {
9809
9715
  messageType = "error";
9810
9716
  } else if (data.finished) {
9811
9717
  messageType = "finished";
@@ -9821,10 +9727,10 @@ var SonioxAdapter = class extends BaseAdapter {
9821
9727
  messageType
9822
9728
  });
9823
9729
  }
9824
- if (data.error) {
9730
+ if (errorMessage) {
9825
9731
  callbacks?.onError?.({
9826
9732
  code: data.error_code?.toString() || "STREAM_ERROR",
9827
- message: data.error
9733
+ message: errorMessage
9828
9734
  });
9829
9735
  return;
9830
9736
  }
@@ -9838,7 +9744,7 @@ var SonioxAdapter = class extends BaseAdapter {
9838
9744
  start: token.start_ms ? token.start_ms / 1e3 : 0,
9839
9745
  end: token.end_ms ? token.end_ms / 1e3 : 0,
9840
9746
  confidence: token.confidence,
9841
- speaker: token.speaker
9747
+ speaker: token.speaker ?? void 0
9842
9748
  }));
9843
9749
  const text = data.text || data.tokens.map((t) => t.text).join("");
9844
9750
  const isFinal = data.tokens.every((t) => t.is_final);
@@ -9847,8 +9753,8 @@ var SonioxAdapter = class extends BaseAdapter {
9847
9753
  text,
9848
9754
  isFinal,
9849
9755
  words,
9850
- speaker: data.tokens[0]?.speaker,
9851
- language: data.tokens[0]?.language,
9756
+ speaker: data.tokens[0]?.speaker ?? void 0,
9757
+ language: data.tokens[0]?.language ?? void 0,
9852
9758
  confidence: data.tokens[0]?.confidence
9853
9759
  };
9854
9760
  callbacks?.onTranscript?.(event2);
@@ -9875,10 +9781,10 @@ var SonioxAdapter = class extends BaseAdapter {
9875
9781
  ws.onclose = (event) => {
9876
9782
  status = "closed";
9877
9783
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
9878
- const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
9879
- if (isEarlyClose && event.code === 1e3) {
9784
+ const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
9785
+ if (isImmediateClose && event.code === 1e3) {
9880
9786
  const errorMessage = [
9881
- "Soniox closed connection shortly after opening.",
9787
+ "Soniox closed connection immediately after opening.",
9882
9788
  `Current config: region=${this.region}, model=${modelId}`,
9883
9789
  "Likely causes:",
9884
9790
  " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -9964,7 +9870,7 @@ var SonioxAdapter = class extends BaseAdapter {
9964
9870
  async getModels() {
9965
9871
  this.validateConfig();
9966
9872
  try {
9967
- const response = await this.client.get("/models");
9873
+ const response = await getModels(this.getAxiosConfig());
9968
9874
  return response.data.models || [];
9969
9875
  } catch (error) {
9970
9876
  console.error("Failed to fetch Soniox models:", error);
@@ -9996,11 +9902,44 @@ var SonioxAdapter = class extends BaseAdapter {
9996
9902
  return buildUtterancesFromWords(words);
9997
9903
  }
9998
9904
  /**
9999
- * Normalize Soniox response to unified format
9905
+ * Normalize v1 API response to unified format
9906
+ *
9907
+ * @param meta - Transcription metadata from getTranscription/createTranscription
9908
+ * @param transcript - Transcript data (text/tokens), only present when status is completed
10000
9909
  */
10001
- normalizeResponse(response) {
10002
- const { text, tokens } = response;
10003
- const words = tokens.map((token) => ({
9910
+ normalizeTranscription(meta, transcript) {
9911
+ if (meta.status === TranscriptionStatus.error) {
9912
+ return {
9913
+ success: false,
9914
+ provider: this.name,
9915
+ data: {
9916
+ id: meta.id,
9917
+ text: "",
9918
+ status: "error"
9919
+ },
9920
+ error: {
9921
+ code: meta.error_type || "TRANSCRIPTION_ERROR",
9922
+ message: meta.error_message || "Transcription failed"
9923
+ },
9924
+ raw: { meta, transcript }
9925
+ };
9926
+ }
9927
+ if (!transcript) {
9928
+ return {
9929
+ success: true,
9930
+ provider: this.name,
9931
+ data: {
9932
+ id: meta.id,
9933
+ text: "",
9934
+ status: meta.status,
9935
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0
9936
+ },
9937
+ raw: { meta }
9938
+ };
9939
+ }
9940
+ const tokens = transcript.tokens || [];
9941
+ const text = transcript.text || tokens.map((t) => t.text).join("");
9942
+ const words = tokens.filter((t) => t.start_ms !== void 0 && t.end_ms !== void 0).map((token) => ({
10004
9943
  word: token.text,
10005
9944
  start: token.start_ms / 1e3,
10006
9945
  end: token.end_ms / 1e3,
@@ -10008,33 +9947,32 @@ var SonioxAdapter = class extends BaseAdapter {
10008
9947
  speaker: token.speaker ?? void 0
10009
9948
  }));
10010
9949
  const speakerSet = /* @__PURE__ */ new Set();
10011
- for (const token of tokens) {
10012
- if (token.speaker) speakerSet.add(token.speaker);
10013
- }
9950
+ tokens.forEach((t) => {
9951
+ if (t.speaker) speakerSet.add(String(t.speaker));
9952
+ });
10014
9953
  const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
10015
9954
  id,
10016
9955
  label: `Speaker ${id}`
10017
9956
  })) : void 0;
10018
- const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
9957
+ const utterances = this.buildUtterancesFromTokens(tokens);
10019
9958
  const language = tokens.find((t) => t.language)?.language ?? void 0;
10020
9959
  return {
10021
9960
  success: true,
10022
9961
  provider: this.name,
10023
9962
  data: {
10024
- id: response.id || `soniox_${Date.now()}`,
9963
+ id: meta.id,
10025
9964
  text,
10026
9965
  status: TranscriptionStatus.completed,
10027
9966
  language,
10028
- duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
9967
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0,
10029
9968
  speakers,
10030
9969
  words: words.length > 0 ? words : void 0,
10031
9970
  utterances: utterances.length > 0 ? utterances : void 0
10032
9971
  },
10033
9972
  tracking: {
10034
- requestId: response.id,
10035
- processingTimeMs: response.total_audio_proc_ms
9973
+ requestId: meta.id
10036
9974
  },
10037
- raw: response
9975
+ raw: { meta, transcript }
10038
9976
  };
10039
9977
  }
10040
9978
  };
@@ -10131,7 +10069,15 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10131
10069
  /**
10132
10070
  * Submit audio for transcription
10133
10071
  *
10134
- * ElevenLabs batch is synchronous - the API returns the result directly.
10072
+ * ElevenLabs batch is normally synchronous: the API returns results directly.
10073
+ *
10074
+ * **Webhook mode:** When `webhookUrl` is set (or `elevenlabs.webhook` is true),
10075
+ * the request is processed asynchronously. ElevenLabs returns a 202 with a
10076
+ * `request_id` and delivers results to a webhook configured in the ElevenLabs
10077
+ * dashboard. The unified `webhookUrl` acts as an intent flag to enable async
10078
+ * mode — the actual delivery destination must be pre-configured in your
10079
+ * ElevenLabs dashboard. Use `elevenlabs.webhook_id` to target a specific
10080
+ * webhook endpoint.
10135
10081
  */
10136
10082
  async transcribe(audio, options) {
10137
10083
  this.validateConfig();
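A sketch of the async path described in the docstring above, assuming `ElevenLabsAdapter` is exported from the package entry point and that a webhook destination is already configured in the ElevenLabs dashboard; the `webhook_id` value is hypothetical:

```typescript
import { readFile } from "node:fs/promises";

const adapter = new ElevenLabsAdapter();
adapter.initialize({ apiKey: process.env.ELEVENLABS_API_KEY! });

const ack = await adapter.transcribe(
  { type: "file", file: await readFile("meeting.mp3"), filename: "meeting.mp3" },
  {
    webhookUrl: "https://example.com/hooks/elevenlabs", // intent flag only; not sent as a URL
    elevenlabs: { webhook_id: "wh_example123" },        // hypothetical webhook ID
  }
);

if (ack.success) {
  console.log(ack.data?.status);        // "queued"
  console.log(ack.tracking?.requestId); // correlate with the webhook delivery
}
```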
@@ -10154,6 +10100,11 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10154
10100
  }
10155
10101
  };
10156
10102
  }
10103
+ const elevenlabsOpts = options?.elevenlabs;
10104
+ const useWebhook = options?.webhookUrl || elevenlabsOpts?.webhook;
10105
+ if (useWebhook) {
10106
+ formData.append("webhook", "true");
10107
+ }
10157
10108
  if (options?.language) {
10158
10109
  formData.append("language_code", options.language);
10159
10110
  }
@@ -10172,7 +10123,6 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10172
10123
  if (options?.entityDetection) {
10173
10124
  formData.append("entity_detection", "all");
10174
10125
  }
10175
- const elevenlabsOpts = options?.elevenlabs;
10176
10126
  if (elevenlabsOpts) {
10177
10127
  for (const [key, value] of Object.entries(elevenlabsOpts)) {
10178
10128
  if (value === void 0 || value === null) continue;
@@ -10190,26 +10140,24 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10190
10140
  }
10191
10141
  }
10192
10142
  }
10193
- if (options?.webhookUrl) {
10194
- if (!formData.has("webhook")) {
10195
- formData.append("webhook", "true");
10196
- }
10197
- }
10198
10143
  const response = await this.client.post("/v1/speech-to-text", formData, {
10199
10144
  headers: {
10200
10145
  "Content-Type": "multipart/form-data"
10201
10146
  }
10202
10147
  });
10203
- if (options?.webhookUrl) {
10204
- const transcriptionId = response.data.transcription_id || response.data.id || `elevenlabs_${Date.now()}`;
10148
+ if (useWebhook) {
10149
+ const ack = response.data;
10205
10150
  return {
10206
10151
  success: true,
10207
10152
  provider: this.name,
10208
10153
  data: {
10209
- id: transcriptionId,
10154
+ id: ack.request_id || ack.transcription_id || `elevenlabs_${Date.now()}`,
10210
10155
  text: "",
10211
10156
  status: "queued"
10212
10157
  },
10158
+ tracking: {
10159
+ requestId: ack.request_id
10160
+ },
10213
10161
  raw: response.data
10214
10162
  };
10215
10163
  }
@@ -10305,20 +10253,9 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10305
10253
  ws.onmessage = (event) => {
10306
10254
  receivedData = true;
10307
10255
  const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
10308
- let messageType;
10309
10256
  try {
10310
10257
  const data = JSON.parse(rawPayload);
10311
- if (data.error) {
10312
- messageType = "error";
10313
- } else if (data.message_type === "session_started") {
10314
- messageType = "session_started";
10315
- } else if (data.message_type === "partial_transcript") {
10316
- messageType = "partial_transcript";
10317
- } else if (data.message_type === "committed_transcript") {
10318
- messageType = "committed_transcript";
10319
- } else if (data.message_type === "committed_transcript_with_timestamps") {
10320
- messageType = "committed_transcript_with_timestamps";
10321
- }
10258
+ const messageType = "error" in data ? "error" : data.message_type;
10322
10259
  if (callbacks?.onRawMessage) {
10323
10260
  callbacks.onRawMessage({
10324
10261
  provider: this.name,
@@ -10328,50 +10265,62 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10328
10265
  messageType
10329
10266
  });
10330
10267
  }
10331
- if (data.error) {
10268
+ if ("error" in data) {
10332
10269
  callbacks?.onError?.({
10333
- code: data.error_code?.toString() || "STREAM_ERROR",
10270
+ code: data.message_type || "STREAM_ERROR",
10334
10271
  message: data.error
10335
10272
  });
10336
10273
  return;
10337
10274
  }
10338
- if (data.message_type === "session_started") {
10339
- return;
10340
- }
10341
- if (data.message_type === "partial_transcript") {
10342
- const streamEvent = {
10343
- type: "transcript",
10344
- text: data.text || "",
10345
- isFinal: false,
10346
- confidence: void 0,
10347
- language: data.language_code
10348
- };
10349
- callbacks?.onTranscript?.(streamEvent);
10350
- return;
10351
- }
10352
- if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
10353
- const words = data.words ? data.words.map((w) => ({
10354
- word: w.text || "",
10355
- start: w.start || 0,
10356
- end: w.end || 0,
10357
- confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
10358
- speaker: w.speaker_id
10359
- })) : [];
10360
- const streamEvent = {
10361
- type: "transcript",
10362
- text: data.text || "",
10363
- isFinal: true,
10364
- words: words.length > 0 ? words : void 0,
10365
- speaker: words[0]?.speaker,
10366
- language: data.language_code,
10367
- confidence: void 0
10368
- };
10369
- callbacks?.onTranscript?.(streamEvent);
10370
- if (options?.diarization && words.length > 0) {
10371
- const utterances = buildUtterancesFromWords(words);
10372
- for (const utterance of utterances) {
10373
- callbacks?.onUtterance?.(utterance);
10275
+ switch (data.message_type) {
10276
+ case "session_started":
10277
+ break;
10278
+ case "partial_transcript": {
10279
+ const streamEvent = {
10280
+ type: "transcript",
10281
+ text: data.text || "",
10282
+ isFinal: false,
10283
+ confidence: void 0
10284
+ };
10285
+ callbacks?.onTranscript?.(streamEvent);
10286
+ break;
10287
+ }
10288
+ case "committed_transcript": {
10289
+ const streamEvent = {
10290
+ type: "transcript",
10291
+ text: data.text || "",
10292
+ isFinal: true,
10293
+ confidence: void 0
10294
+ };
10295
+ callbacks?.onTranscript?.(streamEvent);
10296
+ break;
10297
+ }
10298
+ case "committed_transcript_with_timestamps": {
10299
+ const tsData = data;
10300
+ const words = tsData.words ? tsData.words.map((w) => ({
10301
+ word: w.text || "",
10302
+ start: w.start || 0,
10303
+ end: w.end || 0,
10304
+ confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
10305
+ speaker: w.speaker_id
10306
+ })) : [];
10307
+ const streamEvent = {
10308
+ type: "transcript",
10309
+ text: tsData.text || "",
10310
+ isFinal: true,
10311
+ words: words.length > 0 ? words : void 0,
10312
+ speaker: words[0]?.speaker,
10313
+ language: tsData.language_code,
10314
+ confidence: void 0
10315
+ };
10316
+ callbacks?.onTranscript?.(streamEvent);
10317
+ if (options?.diarization && words.length > 0) {
10318
+ const utterances = buildUtterancesFromWords(words);
10319
+ for (const utterance of utterances) {
10320
+ callbacks?.onUtterance?.(utterance);
10321
+ }
10374
10322
  }
10323
+ break;
10375
10324
  }
10376
10325
  }
10377
10326
  } catch (error) {
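The switch above branches on `message_type`. A type-level sketch of the message shapes it assumes, derived only from the fields read in this hunk (not official ElevenLabs SDK types):

type ElevenLabsStreamMessage =
  | { message_type: "session_started" }
  | { message_type: "partial_transcript"; text?: string }
  | { message_type: "committed_transcript"; text?: string }
  | {
      message_type: "committed_transcript_with_timestamps";
      text?: string;
      language_code?: string;
      words?: Array<{
        text?: string;
        start?: number;
        end?: number;
        logprob?: number;      // mapped to confidence via Math.exp(logprob)
        speaker_id?: string;
      }>;
    }
  | { error: string; message_type?: string };  // an "error" key takes priority over message_type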
@@ -10526,7 +10475,7 @@ var ElevenLabsAdapter = class extends BaseAdapter {
10526
10475
  }
10527
10476
  }
10528
10477
  }
10529
- const transcriptionId = ("transcription_id" in response ? response.transcription_id : response.transcription_id) || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
10478
+ const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
10530
10479
  return {
10531
10480
  success: true,
10532
10481
  provider: this.name,
@@ -36444,12 +36393,10 @@ var createTemporaryApiKeyBody = zod10.object({
36444
36393
  var streaming_types_zod_exports = {};
36445
36394
  __export(streaming_types_zod_exports, {
36446
36395
  sonioxAudioFormatSchema: () => sonioxAudioFormatSchema,
36447
- sonioxAutoDetectedAudioFormatSchema: () => sonioxAutoDetectedAudioFormatSchema,
36448
36396
  sonioxContextGeneralItemSchema: () => sonioxContextGeneralItemSchema,
36449
36397
  sonioxContextSchema: () => sonioxContextSchema,
36450
36398
  sonioxErrorStatusSchema: () => sonioxErrorStatusSchema,
36451
36399
  sonioxOneWayTranslationSchema: () => sonioxOneWayTranslationSchema,
36452
- sonioxPcmAudioEncodingSchema: () => sonioxPcmAudioEncodingSchema,
36453
36400
  sonioxRealtimeModelSchema: () => sonioxRealtimeModelSchema,
36454
36401
  sonioxRecorderStateSchema: () => sonioxRecorderStateSchema,
36455
36402
  sonioxStreamingResponseSchema: () => sonioxStreamingResponseSchema,
@@ -36463,7 +36410,7 @@ __export(streaming_types_zod_exports, {
36463
36410
  streamingUpdateConfigParams: () => streamingUpdateConfigParams3
36464
36411
  });
36465
36412
  import { z as zod11 } from "zod";
36466
- var sonioxAutoDetectedAudioFormatSchema = zod11.enum([
36413
+ var sonioxAudioFormatSchema = zod11.enum([
36467
36414
  "auto",
36468
36415
  "aac",
36469
36416
  "aiff",
@@ -36473,10 +36420,7 @@ var sonioxAutoDetectedAudioFormatSchema = zod11.enum([
36473
36420
  "mp3",
36474
36421
  "ogg",
36475
36422
  "wav",
36476
- "webm"
36477
- ]);
36478
- var sonioxPcmAudioEncodingSchema = zod11.enum([
36479
- // Signed PCM
36423
+ "webm",
36480
36424
  "pcm_s8",
36481
36425
  "pcm_s16le",
36482
36426
  "pcm_s16be",
@@ -36484,7 +36428,6 @@ var sonioxPcmAudioEncodingSchema = zod11.enum([
36484
36428
  "pcm_s24be",
36485
36429
  "pcm_s32le",
36486
36430
  "pcm_s32be",
36487
- // Unsigned PCM
36488
36431
  "pcm_u8",
36489
36432
  "pcm_u16le",
36490
36433
  "pcm_u16be",
@@ -36492,86 +36435,81 @@ var sonioxPcmAudioEncodingSchema = zod11.enum([
36492
36435
  "pcm_u24be",
36493
36436
  "pcm_u32le",
36494
36437
  "pcm_u32be",
36495
- // Float PCM
36496
36438
  "pcm_f32le",
36497
36439
  "pcm_f32be",
36498
36440
  "pcm_f64le",
36499
36441
  "pcm_f64be",
36500
- // Companded
36501
36442
  "mulaw",
36502
36443
  "alaw"
36503
36444
  ]);
36504
- var sonioxAudioFormatSchema = zod11.union([
36505
- sonioxAutoDetectedAudioFormatSchema,
36506
- sonioxPcmAudioEncodingSchema
36507
- ]);
36508
36445
  var sonioxOneWayTranslationSchema = zod11.object({
36509
36446
  type: zod11.literal("one_way"),
36510
- target_language: zod11.string().describe("Target language code for translation")
36447
+ target_language: zod11.string()
36511
36448
  });
36512
36449
  var sonioxTwoWayTranslationSchema = zod11.object({
36513
36450
  type: zod11.literal("two_way"),
36514
- language_a: zod11.string().describe("First language for bidirectional translation"),
36515
- language_b: zod11.string().describe("Second language for bidirectional translation")
36451
+ language_a: zod11.string(),
36452
+ language_b: zod11.string()
36516
36453
  });
36517
36454
  var sonioxTranslationConfigSchema = zod11.union([
36518
36455
  sonioxOneWayTranslationSchema,
36519
36456
  sonioxTwoWayTranslationSchema
36520
36457
  ]);
36521
36458
  var sonioxContextGeneralItemSchema = zod11.object({
36522
- key: zod11.string().describe("Context item key (e.g. 'Domain')"),
36523
- value: zod11.string().describe("Context item value (e.g. 'medicine')")
36459
+ key: zod11.string(),
36460
+ value: zod11.string()
36524
36461
  });
36525
36462
  var sonioxTranslationTermSchema = zod11.object({
36526
- source: zod11.string().describe("Source term"),
36527
- target: zod11.string().describe("Target term to translate to")
36463
+ source: zod11.string(),
36464
+ target: zod11.string()
36528
36465
  });
36529
36466
  var sonioxStructuredContextSchema = zod11.object({
36530
- general: zod11.array(sonioxContextGeneralItemSchema).optional().describe("General context items (key-value pairs)"),
36531
- text: zod11.string().optional().describe("Text context"),
36532
- terms: zod11.array(zod11.string()).optional().describe("Terms that might occur in speech"),
36533
- translation_terms: zod11.array(sonioxTranslationTermSchema).optional().describe("Hints how to translate specific terms (ignored if translation is not enabled)")
36467
+ general: zod11.array(sonioxContextGeneralItemSchema).optional(),
36468
+ text: zod11.string().optional(),
36469
+ terms: zod11.array(zod11.string()).optional(),
36470
+ translation_terms: zod11.array(sonioxTranslationTermSchema).optional()
36534
36471
  });
36535
36472
  var sonioxContextSchema = zod11.union([sonioxStructuredContextSchema, zod11.string()]);
36536
36473
  var sonioxRealtimeModelSchema = zod11.enum([
36474
+ "stt-rt-v4",
36537
36475
  "stt-rt-v3",
36538
36476
  "stt-rt-preview",
36539
36477
  "stt-rt-v3-preview",
36540
36478
  "stt-rt-preview-v2"
36541
36479
  ]);
36542
36480
  var streamingTranscriberParams3 = zod11.object({
36543
- model: sonioxRealtimeModelSchema.describe("Real-time model to use"),
36544
- audioFormat: sonioxAudioFormatSchema.optional().describe("Audio format specification. Use 'auto' for automatic detection"),
36545
- sampleRate: zod11.number().optional().describe("Sample rate in Hz (required for raw PCM formats)"),
36546
- numChannels: zod11.number().min(1).max(2).optional().describe("Number of audio channels (1 for mono, 2 for stereo) - required for raw PCM formats"),
36547
- languageHints: zod11.array(zod11.string()).optional().describe("Expected languages in the audio (ISO language codes)"),
36548
- context: sonioxContextSchema.optional().describe("Additional context to improve transcription accuracy"),
36549
- enableSpeakerDiarization: zod11.boolean().optional().describe("Enable speaker diarization - each token will include a speaker field"),
36550
- enableLanguageIdentification: zod11.boolean().optional().describe("Enable language identification - each token will include a language field"),
36551
- enableEndpointDetection: zod11.boolean().optional().describe("Enable endpoint detection to detect when a speaker has finished talking"),
36552
- translation: sonioxTranslationConfigSchema.optional().describe("Translation configuration"),
36553
- clientReferenceId: zod11.string().optional().describe("Optional tracking identifier (client-defined)")
36554
- });
36555
- var sonioxTranslationStatusSchema = zod11.enum(["none", "original", "translation"]);
36481
+ model: sonioxRealtimeModelSchema,
36482
+ audioFormat: sonioxAudioFormatSchema.optional(),
36483
+ sampleRate: zod11.number().optional(),
36484
+ numChannels: zod11.number().optional(),
36485
+ languageHints: zod11.array(zod11.string()).optional(),
36486
+ context: sonioxContextSchema.optional(),
36487
+ enableSpeakerDiarization: zod11.boolean().optional(),
36488
+ enableLanguageIdentification: zod11.boolean().optional(),
36489
+ enableEndpointDetection: zod11.boolean().optional(),
36490
+ translation: sonioxTranslationConfigSchema.optional(),
36491
+ clientReferenceId: zod11.string().optional()
36492
+ });
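A hedged example of a params object that should satisfy the schema above. Whether this schema is re-exported as `streamingTranscriberParams` on `SonioxStreamingZodSchemas` is an assumption based on the export pattern earlier in this module; the values themselves are illustrative.

const sonioxParams = {
  model: "stt-rt-v4",
  audioFormat: "pcm_s16le",      // raw PCM, so sample rate and channel count are supplied
  sampleRate: 16000,
  numChannels: 1,
  languageHints: ["en", "de"],
  enableSpeakerDiarization: true,
  clientReferenceId: "call-42"   // hypothetical client-side tracking id
};
// streamingTranscriberParams3.parse(sonioxParams) validates it against the schema above.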
36493
+ var sonioxTranslationStatusSchema = zod11.enum(["original", "translation", "none"]);
36556
36494
  var sonioxTokenSchema = zod11.object({
36557
- text: zod11.string().describe("Token text content (subword, word, or space)"),
36558
- start_ms: zod11.number().optional().describe("Start time of the token in milliseconds"),
36559
- end_ms: zod11.number().optional().describe("End time of the token in milliseconds"),
36560
- confidence: zod11.number().min(0).max(1).optional().describe("Confidence score between 0.0 and 1.0"),
36561
- is_final: zod11.boolean().describe("Whether this token is final (confirmed) or provisional"),
36562
- speaker: zod11.string().optional().describe("Speaker identifier (only present when speaker diarization is enabled)"),
36563
- language: zod11.string().optional().describe("Detected language code (only present when language identification is enabled)"),
36564
- source_language: zod11.string().optional().describe("Original language code for translated tokens"),
36565
- translation_status: sonioxTranslationStatusSchema.optional().describe("Translation status: 'none', 'original', or 'translation'")
36495
+ text: zod11.string(),
36496
+ start_ms: zod11.number().optional(),
36497
+ end_ms: zod11.number().optional(),
36498
+ confidence: zod11.number(),
36499
+ is_final: zod11.boolean(),
36500
+ speaker: zod11.string().optional(),
36501
+ translation_status: sonioxTranslationStatusSchema.optional(),
36502
+ language: zod11.string().optional(),
36503
+ source_language: zod11.string().optional()
36566
36504
  });
36567
36505
  var sonioxStreamingResponseSchema = zod11.object({
36568
- text: zod11.string().optional().describe("Complete transcribed text"),
36569
- tokens: zod11.array(sonioxTokenSchema).describe("List of recognized tokens"),
36570
- final_audio_proc_ms: zod11.number().optional().describe("Milliseconds of audio processed into final tokens"),
36571
- total_audio_proc_ms: zod11.number().optional().describe("Milliseconds of audio processed (final + non-final)"),
36572
- finished: zod11.boolean().optional().describe("Whether the transcription is complete"),
36573
- error: zod11.string().optional().describe("Error message if an error occurred"),
36574
- error_code: zod11.number().optional().describe("Error code if an error occurred")
36506
+ text: zod11.string(),
36507
+ tokens: zod11.array(sonioxTokenSchema),
36508
+ final_audio_proc_ms: zod11.number(),
36509
+ total_audio_proc_ms: zod11.number(),
36510
+ finished: zod11.boolean().optional(),
36511
+ error_code: zod11.number().optional(),
36512
+ error_message: zod11.string().optional()
36575
36513
  });
36576
36514
  var sonioxRecorderStateSchema = zod11.enum([
36577
36515
  "Init",
@@ -37137,8 +37075,8 @@ var BatchOnlyProviders = AllProviders.filter(
37137
37075
  );
37138
37076
 
37139
37077
  // src/generated/deepgram/schema/index.ts
37140
- var schema_exports4 = {};
37141
- __export(schema_exports4, {
37078
+ var schema_exports5 = {};
37079
+ __export(schema_exports5, {
37142
37080
  V1ListenPostParametersCallbackMethod: () => V1ListenPostParametersCallbackMethod,
37143
37081
  V1ListenPostParametersCustomIntentMode: () => V1ListenPostParametersCustomIntentMode,
37144
37082
  V1ListenPostParametersCustomTopicMode: () => V1ListenPostParametersCustomTopicMode,
@@ -37393,8 +37331,8 @@ var V1SpeakPostParametersSampleRate = {
37393
37331
  };
37394
37332
 
37395
37333
  // src/generated/openai/schema/index.ts
37396
- var schema_exports5 = {};
37397
- __export(schema_exports5, {
37334
+ var schema_exports6 = {};
37335
+ __export(schema_exports6, {
37398
37336
  AudioResponseFormat: () => AudioResponseFormat,
37399
37337
  CreateSpeechRequestResponseFormat: () => CreateSpeechRequestResponseFormat,
37400
37338
  CreateSpeechRequestStreamFormat: () => CreateSpeechRequestStreamFormat,
@@ -37734,8 +37672,8 @@ var VoiceResourceObject = {
37734
37672
  };
37735
37673
 
37736
37674
  // src/generated/speechmatics/schema/index.ts
37737
- var schema_exports6 = {};
37738
- __export(schema_exports6, {
37675
+ var schema_exports7 = {};
37676
+ __export(schema_exports7, {
37739
37677
  AutoChaptersResultErrorType: () => AutoChaptersResultErrorType,
37740
37678
  ErrorResponseError: () => ErrorResponseError,
37741
37679
  GetJobsJobidAlignmentTags: () => GetJobsJobidAlignmentTags,
@@ -37924,32 +37862,6 @@ var WrittenFormRecognitionResultType = {
37924
37862
  word: "word"
37925
37863
  };
37926
37864
 
37927
- // src/generated/soniox/schema/index.ts
37928
- var schema_exports7 = {};
37929
- __export(schema_exports7, {
37930
- TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
37931
- TranscriptionMode: () => TranscriptionMode,
37932
- TranscriptionStatus: () => TranscriptionStatus,
37933
- TranslationConfigType: () => TranslationConfigType
37934
- });
37935
-
37936
- // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
37937
- var TemporaryApiKeyUsageType = {
37938
- transcribe_websocket: "transcribe_websocket"
37939
- };
37940
-
37941
- // src/generated/soniox/schema/transcriptionMode.ts
37942
- var TranscriptionMode = {
37943
- real_time: "real_time",
37944
- async: "async"
37945
- };
37946
-
37947
- // src/generated/soniox/schema/translationConfigType.ts
37948
- var TranslationConfigType = {
37949
- one_way: "one_way",
37950
- two_way: "two_way"
37951
- };
37952
-
37953
37865
  // src/generated/elevenlabs/schema/index.ts
37954
37866
  var schema_exports8 = {};
37955
37867
  __export(schema_exports8, {
@@ -39653,7 +39565,7 @@ export {
39653
39565
  DeepgramTTSSampleRate,
39654
39566
  DeepgramTopicMode,
39655
39567
  DeepgramTranscriptionSchema,
39656
- schema_exports4 as DeepgramTypes,
39568
+ schema_exports5 as DeepgramTypes,
39657
39569
  deepgramAPI_zod_exports as DeepgramZodSchemas,
39658
39570
  ElevenLabsAdapter,
39659
39571
  ElevenLabsCapabilities,
@@ -39690,7 +39602,7 @@ export {
39690
39602
  OpenAIResponseFormat,
39691
39603
  streaming_types_exports as OpenAIStreamingTypes,
39692
39604
  OpenAITranscriptionSchema,
39693
- schema_exports5 as OpenAITypes,
39605
+ schema_exports6 as OpenAITypes,
39694
39606
  OpenAIWhisperAdapter,
39695
39607
  openAIAudioRealtimeAPI_zod_exports as OpenAIZodSchemas,
39696
39608
  ProfanityFilterMode,
@@ -39719,7 +39631,7 @@ export {
39719
39631
  SonioxStreamingUpdateSchema,
39720
39632
  streaming_types_zod_exports as SonioxStreamingZodSchemas,
39721
39633
  SonioxTranscriptionSchema,
39722
- schema_exports7 as SonioxTypes,
39634
+ schema_exports4 as SonioxTypes,
39723
39635
  SpeakV1ContainerParameter,
39724
39636
  SpeakV1EncodingParameter,
39725
39637
  SpeakV1SampleRateParameter,
@@ -39734,7 +39646,7 @@ export {
39734
39646
  SpeechmaticsStreamingSchema,
39735
39647
  SpeechmaticsStreamingUpdateSchema,
39736
39648
  SpeechmaticsTranscriptionSchema,
39737
- schema_exports6 as SpeechmaticsTypes,
39649
+ schema_exports7 as SpeechmaticsTypes,
39738
39650
  speechmaticsASRRESTAPI_zod_exports as SpeechmaticsZodSchemas,
39739
39651
  StreamingProviders,
39740
39652
  StreamingSupportedBitDepthEnum,