voice-router-dev 0.9.0 → 0.9.2

package/dist/index.js CHANGED
@@ -82,7 +82,7 @@ __export(src_exports, {
  DeepgramTTSSampleRate: () => DeepgramTTSSampleRate,
  DeepgramTopicMode: () => DeepgramTopicMode,
  DeepgramTranscriptionSchema: () => DeepgramTranscriptionSchema,
- DeepgramTypes: () => schema_exports4,
+ DeepgramTypes: () => schema_exports5,
  DeepgramZodSchemas: () => deepgramAPI_zod_exports,
  ElevenLabsAdapter: () => ElevenLabsAdapter,
  ElevenLabsCapabilities: () => ElevenLabsCapabilities,
@@ -119,7 +119,7 @@ __export(src_exports, {
  OpenAIResponseFormat: () => OpenAIResponseFormat,
  OpenAIStreamingTypes: () => streaming_types_exports,
  OpenAITranscriptionSchema: () => OpenAITranscriptionSchema,
- OpenAITypes: () => schema_exports5,
+ OpenAITypes: () => schema_exports6,
  OpenAIWhisperAdapter: () => OpenAIWhisperAdapter,
  OpenAIZodSchemas: () => openAIAudioRealtimeAPI_zod_exports,
  ProfanityFilterMode: () => ProfanityFilterMode,
@@ -148,7 +148,7 @@ __export(src_exports, {
  SonioxStreamingUpdateSchema: () => SonioxStreamingUpdateSchema,
  SonioxStreamingZodSchemas: () => streaming_types_zod_exports,
  SonioxTranscriptionSchema: () => SonioxTranscriptionSchema,
- SonioxTypes: () => schema_exports7,
+ SonioxTypes: () => schema_exports4,
  SpeakV1ContainerParameter: () => SpeakV1ContainerParameter,
  SpeakV1EncodingParameter: () => SpeakV1EncodingParameter,
  SpeakV1SampleRateParameter: () => SpeakV1SampleRateParameter,
@@ -163,7 +163,7 @@ __export(src_exports, {
  SpeechmaticsStreamingSchema: () => SpeechmaticsStreamingSchema,
  SpeechmaticsStreamingUpdateSchema: () => SpeechmaticsStreamingUpdateSchema,
  SpeechmaticsTranscriptionSchema: () => SpeechmaticsTranscriptionSchema,
- SpeechmaticsTypes: () => schema_exports6,
+ SpeechmaticsTypes: () => schema_exports7,
  SpeechmaticsZodSchemas: () => speechmaticsASRRESTAPI_zod_exports,
  StreamingProviders: () => StreamingProviders,
  StreamingSupportedBitDepthEnum: () => StreamingSupportedBitDepthEnum,
@@ -6064,23 +6064,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
  "AssemblyAI adapter currently only supports URL-based audio input. Use audio.type='url'"
  );
  }
- const aaiOpts = { ...options?.assemblyai };
- if ("speech_model" in aaiOpts && aaiOpts.speech_model != null) {
- if (!aaiOpts.speech_models) {
- aaiOpts.speech_models = [aaiOpts.speech_model];
- }
- delete aaiOpts.speech_model;
+ const passthrough = options?.assemblyai;
+ let speechModels;
+ if (passthrough?.speech_model != null && !passthrough.speech_models) {
+ speechModels = [passthrough.speech_model];
+ } else if (passthrough?.speech_models) {
+ speechModels = passthrough.speech_models;
  }
+ const { speech_model: _deprecated, ...typedOpts } = passthrough ?? {};
  const request = {
- ...aaiOpts,
+ ...typedOpts,
  audio_url: audioUrl,
  // speech_models is required — default to universal-3-pro
- speech_models: aaiOpts.speech_models ?? [
- "universal-3-pro"
- ],
+ speech_models: speechModels ?? ["universal-3-pro"],
  // Enable punctuation and formatting by default
- punctuate: aaiOpts.punctuate ?? true,
- format_text: aaiOpts.format_text ?? true
+ punctuate: typedOpts.punctuate ?? true,
+ format_text: typedOpts.format_text ?? true
  };
  if (options) {
  if (options.model) {
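For callers, the practical effect of the rewritten option handling is that the deprecated singular `speech_model` is folded into `speech_models` before the request is built. A minimal sketch, assuming the same initialize/transcribe surface shown in the JSDoc examples elsewhere in this file (the audio URL is a placeholder):

```typescript
// Sketch only: adapter setup mirrors the DeepgramAdapter JSDoc example in
// this file; the audio URL is illustrative.
const adapter = new AssemblyAIAdapter();
adapter.initialize({ apiKey: process.env.ASSEMBLYAI_API_KEY });

const result = await adapter.transcribe(
  { type: "url", url: "https://example.com/meeting.mp3" },
  { assemblyai: { speech_model: "universal-3-pro" } } // deprecated singular form
);
// The outgoing request carries speech_models: ["universal-3-pro"] plus the
// defaults punctuate: true and format_text: true.
```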
@@ -6128,22 +6127,22 @@ var AssemblyAIAdapter = class extends BaseAdapter {
  normalizeResponse(response) {
  let status;
  switch (response.status) {
- case TranscriptStatus.queued:
+ case "queued":
  status = "queued";
  break;
- case TranscriptStatus.processing:
+ case "processing":
  status = "processing";
  break;
- case TranscriptStatus.completed:
+ case "completed":
  status = "completed";
  break;
- case TranscriptStatus.error:
+ case "error":
  status = "error";
  break;
  default:
  status = "queued";
  }
- if (response.status === TranscriptStatus.error) {
+ if (response.status === "error") {
  return {
  success: false,
  provider: this.name,
@@ -6795,8 +6794,14 @@ var DeepgramAdapter = class extends BaseAdapter {
  /**
  * Submit audio for transcription
  *
- * Sends audio to Deepgram API for transcription. Deepgram processes
- * synchronously and returns results immediately (no polling required).
+ * Sends audio to Deepgram API for transcription. Deepgram normally processes
+ * synchronously and returns results immediately.
+ *
+ * **Callback mode:** When `webhookUrl` is set, Deepgram returns immediately
+ * with a `request_id` (status `"queued"`). The full transcript is POSTed to
+ * the webhook URL — this is the primary delivery mechanism. `getTranscript()`
+ * can attempt to retrieve the result later via request history, but that
+ * endpoint is best-effort and not a guaranteed durable store.
  *
  * @param audio - Audio input (URL or file buffer)
  * @param options - Transcription options
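A sketch of the callback flow this doc comment describes, where `deepgram` is an initialized DeepgramAdapter and both URLs are placeholders; option and result field names are taken from this diff:

```typescript
// Callback-mode sketch: setting webhookUrl makes transcribe() return a
// queued acknowledgement; the transcript itself arrives at the webhook.
const result = await deepgram.transcribe(
  { type: "url", url: "https://example.com/call.wav" },
  { webhookUrl: "https://example.com/hooks/deepgram" } // placeholder endpoint
);
if (result.success) {
  console.log(result.data?.status);        // "queued"; text is empty for now
  console.log(result.tracking?.requestId); // request_id for correlating the webhook POST
}
```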
@@ -6847,47 +6852,81 @@ var DeepgramAdapter = class extends BaseAdapter {
  { params }
  ).then((res) => res.data);
  } else if (audio.type === "file") {
- response = await this.client.post("/listen", audio.file, {
- params,
- headers: {
- "Content-Type": "audio/*"
+ response = await this.client.post(
+ "/listen",
+ audio.file,
+ {
+ params,
+ headers: {
+ "Content-Type": "audio/*"
+ }
  }
- }).then((res) => res.data);
+ ).then((res) => res.data);
  } else {
  throw new Error(
  "Deepgram adapter does not support stream type for pre-recorded transcription. Use transcribeStream() for real-time streaming."
  );
  }
+ if (options?.webhookUrl) {
+ const requestId = ("request_id" in response ? response.request_id : void 0) || ("metadata" in response ? response.metadata?.request_id : void 0);
+ if (!requestId) {
+ return {
+ success: false,
+ provider: this.name,
+ error: {
+ code: "MISSING_REQUEST_ID",
+ message: "Deepgram callback mode did not return a request ID"
+ },
+ raw: response
+ };
+ }
+ return {
+ success: true,
+ provider: this.name,
+ data: {
+ id: requestId,
+ text: "",
+ status: "queued"
+ },
+ tracking: {
+ requestId
+ },
+ raw: response
+ };
+ }
+ if (!("results" in response) || !("metadata" in response)) {
+ return {
+ success: false,
+ provider: this.name,
+ error: {
+ code: "INVALID_RESPONSE",
+ message: "Deepgram did not return a synchronous transcription payload"
+ },
+ raw: response
+ };
+ }
  return this.normalizeResponse(response);
  } catch (error) {
  return this.createErrorResponse(error);
  }
  }
  /**
- * Get transcription result by ID
+ * Get transcription result by ID (best-effort)
  *
- * Retrieves a previous transcription from Deepgram's request history.
- *
- * Unlike the list endpoint, getting a single request DOES include the full
- * transcript response. Requires `projectId` to be set during initialization.
+ * Retrieves a previous transcription from Deepgram's request history API.
+ * Requires `projectId` to be set during initialization.
  *
- * @param transcriptId - Request ID from a previous transcription
- * @returns Full transcript response including text, words, and metadata
+ * **Important:** Deepgram's request history is best-effort. Requests may
+ * expire or be unavailable depending on your plan and retention settings.
+ * This is NOT a durable transcript store — for reliable retrieval, use
+ * callback mode (`webhookUrl`) and persist the webhook payload yourself.
  *
- * @example Get a transcript by request ID
- * ```typescript
- * const adapter = new DeepgramAdapter()
- * adapter.initialize({
- * apiKey: process.env.DEEPGRAM_API_KEY,
- * projectId: process.env.DEEPGRAM_PROJECT_ID
- * })
+ * The response field on the request history entry is cast to
+ * `ListenV1Response` — this appears to work in practice but is not
+ * explicitly documented by Deepgram as a guaranteed contract.
  *
- * const result = await adapter.getTranscript('abc123-request-id')
- * if (result.success) {
- * console.log(result.data?.text)
- * console.log(result.data?.words)
- * }
- * ```
+ * @param transcriptId - Request ID from a previous transcription
+ * @returns Transcript response if still available in request history
  *
  * @see https://developers.deepgram.com/reference/get-request
  */
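The JSDoc example removed above is still a fair illustration of the retrieval path, provided a miss is treated as expected rather than exceptional; restored here in hedged form:

```typescript
// Best-effort retrieval from request history (requires projectId).
const adapter = new DeepgramAdapter();
adapter.initialize({
  apiKey: process.env.DEEPGRAM_API_KEY,
  projectId: process.env.DEEPGRAM_PROJECT_ID
});

const result = await adapter.getTranscript("abc123-request-id");
if (result.success) {
  console.log(result.data?.text);
} else {
  // The entry may have expired from request history; fall back to the
  // payload you persisted from the webhook delivery.
}
```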
@@ -7518,7 +7557,8 @@ var DeepgramAdapter = class extends BaseAdapter {
  break;
  }
  case "Metadata": {
- callbacks?.onMetadata?.(message);
+ const { type: _, ...metadata } = message;
+ callbacks?.onMetadata?.(metadata);
  break;
  }
  case "Error": {
@@ -7954,10 +7994,7 @@ var AzureSTTAdapter = class extends BaseAdapter {
  contentUrls: [audio.url],
  properties: this.buildTranscriptionProperties(options)
  };
- const response = await transcriptionsCreate(
- transcriptionRequest,
- this.getAxiosConfig()
- );
+ const response = await transcriptionsCreate(transcriptionRequest, this.getAxiosConfig());
  const transcription = response.data;
  const transcriptId = transcription.self?.split("/").pop() || "";
  return await this.pollForCompletion(transcriptId);
@@ -8497,7 +8534,6 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
  const request = {
  ...options?.openai,
  file: audioData,
- // Buffer/Blob both accepted at runtime; generated type expects Blob
  model
  };
  if (options?.language) {
@@ -8517,11 +8553,7 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
  request.response_format = OpenAIResponseFormat.json;
  }
  const response = await createTranscription(request, this.getAxiosConfig());
- return this.normalizeResponse(
- response.data,
- model,
- isDiarization
- );
+ return this.normalizeResponse(response.data, model, isDiarization);
  } catch (error) {
  return this.createErrorResponse(error);
  }
@@ -8928,7 +8960,6 @@ function createOpenAIWhisperAdapter(config) {
 
  // src/adapters/speechmatics-adapter.ts
  var import_axios8 = __toESM(require("axios"));
- var import_ws5 = __toESM(require("ws"));
 
  // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
  var NotificationConfigContentsItem = {
@@ -9113,16 +9144,13 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  jobConfig.fetch_data = {
  url: audio.url
  };
- const formData = new FormData();
- formData.append("config", JSON.stringify(jobConfig));
- requestBody = formData;
- headers = { "Content-Type": "multipart/form-data" };
+ requestBody = { config: JSON.stringify(jobConfig) };
+ headers = { "Content-Type": "application/json" };
  } else if (audio.type === "file") {
- const formData = new FormData();
- formData.append("config", JSON.stringify(jobConfig));
- const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
- formData.append("data_file", audioBlob, audio.filename || "audio.wav");
- requestBody = formData;
+ requestBody = {
+ config: JSON.stringify(jobConfig),
+ data_file: audio.file
+ };
  headers = { "Content-Type": "multipart/form-data" };
  } else {
  return {
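Caller-facing usage is unchanged by this transport switch; only the wire format moved (JSON for URL-fetched jobs, multipart for uploads). A sketch, with illustrative paths, where `speechmatics` is an initialized SpeechmaticsAdapter:

```typescript
import { readFileSync } from "node:fs";

// URL job: now submitted as application/json with fetch_data.
await speechmatics.transcribe({ type: "url", url: "https://example.com/a.mp3" });

// File job: still multipart, with the raw file passed as data_file.
await speechmatics.transcribe({
  type: "file",
  file: readFileSync("a.wav"),
  mimeType: "audio/wav",
  filename: "a.wav"
});
```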
@@ -9228,216 +9256,224 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  }
  }
  /**
- * Build WebSocket URL for real-time streaming
+ * Get the regional WebSocket host for real-time streaming
  *
- * Note: Real-time API uses a different host from the batch API:
- * - Batch: {region}.asr.api.speechmatics.com
- * - Real-time: {region}.rt.speechmatics.com
- *
- * @param region - Regional endpoint identifier
- * @returns WebSocket URL for real-time API
+ * Speechmatics RT uses a different host pattern: {region}.rt.speechmatics.com
  */
- getRegionalWsUrl(region) {
- if (this.config?.wsBaseUrl) {
- return this.config.wsBaseUrl;
- }
- const rtRegionMap = {
- eu1: "eu",
- eu2: "eu",
- us1: "us",
- us2: "us",
- au1: "eu"
- // No AU RT endpoint — fall back to EU
- };
- const rtPrefix = rtRegionMap[region || ""] || "eu";
- return `wss://${rtPrefix}.rt.speechmatics.com/v2`;
+ getRegionalWsHost(region) {
+ const regionPrefix = region || "eu1";
+ return `${regionPrefix}.rt.speechmatics.com`;
  }
  /**
- * Stream audio for real-time transcription via WebSocket
- *
- * Connects to Speechmatics' real-time API and sends audio chunks
- * for transcription with results returned via callbacks.
+ * Stream audio for real-time transcription
  *
- * @param options - Streaming configuration options
- * @param callbacks - Event callbacks for transcription results
- * @returns Promise that resolves with a StreamingSession
+ * Creates a WebSocket connection to the Speechmatics Real-Time API.
+ * Protocol: send StartRecognition config, then AddAudio binary frames,
+ * receive AddPartialTranscript/AddTranscript/EndOfUtterance messages.
  *
- * @example Basic streaming
- * ```typescript
- * const session = await adapter.transcribeStream({
- * language: 'en',
- * speechmaticsStreaming: {
- * enablePartials: true,
- * operatingPoint: 'enhanced'
- * }
- * }, {
- * onTranscript: (event) => console.log(event.text),
- * onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
- * onError: (error) => console.error(error)
- * });
+ * @param options - Streaming configuration
+ * @param callbacks - Event callbacks
+ * @returns StreamingSession for sending audio and closing
  *
- * await session.sendAudio({ data: audioBuffer });
- * await session.close();
- * ```
+ * @see https://docs.speechmatics.com/rt-api-ref
  */
  async transcribeStream(options, callbacks) {
  this.validateConfig();
- const smOpts = options?.speechmaticsStreaming || {};
- const region = smOpts.region || this.config?.region;
- const wsUrl = this.getRegionalWsUrl(region);
- const ws = new import_ws5.default(wsUrl, {
- headers: {
- Authorization: `Bearer ${this.config.apiKey}`
- }
- });
- let sessionStatus = "connecting";
- const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
- let seqNo = 0;
- let utteranceResults = [];
- const sessionReady = new Promise((resolve, reject) => {
- const timeout = setTimeout(() => {
- reject(new Error("WebSocket connection timeout"));
- }, 1e4);
- let wsOpen = false;
- ws.once("error", (error) => {
- clearTimeout(timeout);
- reject(error);
- });
- ws.once("open", () => {
- wsOpen = true;
- const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
- const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
- const startMsg = {
- message: "StartRecognition",
- audio_format: {
- type: "raw",
- encoding,
- sample_rate: sampleRate
- },
- transcription_config: {
- language: smOpts.language || options?.language || "en",
- enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
- }
- };
- const txConfig = startMsg.transcription_config;
- if (smOpts.domain) txConfig.domain = smOpts.domain;
- if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
- if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
- if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
- if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
- if (smOpts.diarization === "speaker" || options?.diarization) {
- txConfig.diarization = "speaker";
- if (smOpts.maxSpeakers) {
- txConfig.speaker_diarization_config = {
- max_speakers: smOpts.maxSpeakers
- };
- } else if (options?.speakersExpected) {
- txConfig.speaker_diarization_config = {
- max_speakers: options.speakersExpected
- };
- }
- }
- if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
- txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
- content: word
- }));
- } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
- txConfig.additional_vocab = options.customVocabulary.map((word) => ({
- content: word
- }));
- }
- if (smOpts.conversationConfig) {
- txConfig.conversation_config = {
- end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
- };
- }
- const startPayload = JSON.stringify(startMsg);
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: "speechmatics",
- direction: "outgoing",
- timestamp: Date.now(),
- payload: startPayload,
- messageType: "StartRecognition"
- });
+ const sessionId = `speechmatics_${Date.now()}_${Math.random().toString(36).substring(7)}`;
+ const createdAt = /* @__PURE__ */ new Date();
+ const smOpts = options?.speechmaticsStreaming;
+ const region = smOpts?.region || this.config?.region;
+ const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost(region)}`);
+ const wsUrl = `${wsBase}/v2`;
+ let status = "connecting";
+ let recognitionStarted = false;
+ const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
+ const ws = new WebSocketImpl(wsUrl);
+ const language = smOpts?.language || options?.language || "en";
+ const transcriptionConfig = {
+ language,
+ enable_entities: smOpts?.enableEntities ?? options?.entityDetection ?? false,
+ enable_partials: smOpts?.enablePartials ?? options?.interimResults !== false,
+ operating_point: smOpts?.operatingPoint || OperatingPoint.enhanced,
+ ...smOpts?.maxDelay !== void 0 && { max_delay: smOpts.maxDelay },
+ ...smOpts?.maxDelayMode && {
+ max_delay_mode: smOpts.maxDelayMode
+ },
+ ...smOpts?.domain && { domain: smOpts.domain },
+ ...(options?.diarization || smOpts?.diarization === TranscriptionConfigDiarization.speaker) && {
+ diarization: TranscriptionConfigDiarization.speaker,
+ ...smOpts?.maxSpeakers !== void 0 && {
+ speaker_diarization_config: { max_speakers: smOpts.maxSpeakers }
  }
- ws.send(startPayload);
- });
- const onMessage = (data) => {
- const rawPayload = data.toString();
- try {
- const msg = JSON.parse(rawPayload);
- if (msg.message === "RecognitionStarted") {
- clearTimeout(timeout);
- ws.removeListener("message", onMessage);
- ws.emit("message", data);
- resolve();
- } else if (msg.message === "Error") {
- clearTimeout(timeout);
- ws.removeListener("message", onMessage);
- reject(new Error(msg.reason || "Recognition failed to start"));
- }
- } catch {
+ },
+ ...(options?.customVocabulary?.length || smOpts?.additionalVocab?.length) && {
+ additional_vocab: (smOpts?.additionalVocab || options?.customVocabulary || []).map(
+ (term) => ({ content: term })
+ )
+ }
+ };
+ const startRecognition = {
+ message: "StartRecognition",
+ audio_format: {
+ type: "raw",
+ encoding: smOpts?.encoding || "pcm_s16le",
+ sample_rate: smOpts?.sampleRate || options?.sampleRate || 16e3
+ },
+ transcription_config: transcriptionConfig,
+ ...smOpts?.conversationConfig && {
+ conversation_config: {
+ end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
  }
- };
- ws.on("message", onMessage);
- });
- ws.on("message", (data) => {
- const rawPayload = data.toString();
+ }
+ };
+ ws.onopen = () => {
+ status = "open";
+ const msg = JSON.stringify(startRecognition);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: msg,
+ messageType: "StartRecognition"
+ });
+ }
+ ws.send(msg);
+ };
+ ws.onmessage = (event) => {
+ const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
  try {
- const message = JSON.parse(rawPayload);
+ const data = JSON.parse(rawPayload);
+ const messageType = data.message;
  if (callbacks?.onRawMessage) {
  callbacks.onRawMessage({
- provider: "speechmatics",
+ provider: this.name,
  direction: "incoming",
  timestamp: Date.now(),
  payload: rawPayload,
- messageType: message.message
+ messageType
  });
  }
- this.handleStreamingMessage(message, callbacks, utteranceResults);
- } catch (error) {
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: "speechmatics",
- direction: "incoming",
- timestamp: Date.now(),
- payload: rawPayload,
- messageType: "parse_error"
- });
+ switch (messageType) {
+ case "RecognitionStarted": {
+ recognitionStarted = true;
+ callbacks?.onOpen?.();
+ callbacks?.onMetadata?.({
+ id: data.id,
+ languagePackInfo: data.language_pack_info
+ });
+ break;
+ }
+ case "AddPartialTranscript": {
+ const partial = data;
+ const words = this.resultsToWords(partial.results);
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text: partial.metadata.transcript,
+ isFinal: false,
+ words,
+ speaker: words[0]?.speaker,
+ confidence: partial.results[0]?.alternatives?.[0]?.confidence,
+ channel: partial.channel ? parseInt(partial.channel) : void 0
+ });
+ break;
+ }
+ case "AddTranscript": {
+ const final = data;
+ const words = this.resultsToWords(final.results);
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text: final.metadata.transcript,
+ isFinal: true,
+ words,
+ speaker: words[0]?.speaker,
+ confidence: final.results[0]?.alternatives?.[0]?.confidence,
+ channel: final.channel ? parseInt(final.channel) : void 0
+ });
+ if (options?.diarization || smOpts?.diarization === "speaker") {
+ const utterances = buildUtterancesFromWords(words);
+ for (const utterance of utterances) {
+ callbacks?.onUtterance?.(utterance);
+ }
+ }
+ break;
+ }
+ case "EndOfUtterance": {
+ break;
+ }
+ case "EndOfTranscript": {
+ callbacks?.onClose?.(1e3, "Transcription complete");
+ break;
+ }
+ case "Error": {
+ const err = data;
+ callbacks?.onError?.({
+ code: err.type || "SPEECHMATICS_ERROR",
+ message: err.reason || "Unknown error"
+ });
+ break;
+ }
+ case "Warning": {
+ const warn = data;
+ callbacks?.onMetadata?.({
+ warning: warn.type,
+ reason: warn.reason
+ });
+ break;
+ }
+ case "Info": {
+ callbacks?.onMetadata?.(data);
+ break;
+ }
+ case "AudioAdded":
+ case "ChannelAudioAdded":
+ break;
+ default:
+ callbacks?.onMetadata?.(data);
+ break;
  }
+ } catch (error) {
  callbacks?.onError?.({
  code: "PARSE_ERROR",
- message: "Failed to parse WebSocket message",
- details: error
+ message: `Failed to parse message: ${error}`
  });
  }
- });
- ws.on("error", (error) => {
+ };
+ ws.onerror = () => {
  callbacks?.onError?.({
  code: "WEBSOCKET_ERROR",
- message: error.message,
- details: error
+ message: "WebSocket error occurred"
  });
+ };
+ ws.onclose = (event) => {
+ status = "closed";
+ callbacks?.onClose?.(event.code, event.reason);
+ };
+ await new Promise((resolve, reject) => {
+ const timeout = setTimeout(() => {
+ reject(new Error("WebSocket connection timeout"));
+ }, 1e4);
+ const checkReady = () => {
+ if (recognitionStarted) {
+ clearTimeout(timeout);
+ resolve();
+ } else if (status === "closed") {
+ clearTimeout(timeout);
+ reject(new Error("WebSocket connection failed"));
+ } else {
+ setTimeout(checkReady, 100);
+ }
+ };
+ checkReady();
  });
- ws.on("close", (code, reason) => {
- sessionStatus = "closed";
- callbacks?.onClose?.(code, reason.toString());
- });
- await sessionReady;
- sessionStatus = "open";
- callbacks?.onOpen?.();
  return {
  id: sessionId,
  provider: this.name,
- createdAt: /* @__PURE__ */ new Date(),
- getStatus: () => sessionStatus,
+ createdAt,
+ getStatus: () => status,
  sendAudio: async (chunk) => {
- if (sessionStatus !== "open") {
- throw new Error(`Cannot send audio: session is ${sessionStatus}`);
- }
- if (ws.readyState !== import_ws5.default.OPEN) {
- throw new Error("WebSocket is not open");
+ if (status !== "open") {
+ throw new Error("Session is not open");
  }
  if (callbacks?.onRawMessage) {
  const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
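The @example deleted from the JSDoc above still matches the rewritten method's surface; it is worth restating, with `adapter` an initialized SpeechmaticsAdapter and `audioBuffer` a placeholder chunk of raw PCM:

```typescript
// Restored from the removed JSDoc example; option and callback names all
// appear in the new implementation.
const session = await adapter.transcribeStream(
  {
    language: "en",
    speechmaticsStreaming: { enablePartials: true, operatingPoint: "enhanced" }
  },
  {
    onTranscript: (event) => console.log(event.text),
    onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
    onError: (error) => console.error(error)
  }
);
await session.sendAudio({ data: audioBuffer }); // audioBuffer: raw PCM chunk
await session.close();
```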
@@ -9453,12 +9489,11 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  });
  }
  ws.send(chunk.data);
- seqNo++;
- if (chunk.isLast) {
- const endMsg = JSON.stringify({
- message: "EndOfStream",
- last_seq_no: seqNo
- });
+ },
+ close: async () => {
+ if (status === "open") {
+ status = "closing";
+ const endMsg = JSON.stringify({ message: "EndOfStream", last_seq_no: 0 });
  if (callbacks?.onRawMessage) {
  callbacks.onRawMessage({
  provider: this.name,
@@ -9470,144 +9505,19 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  }
  ws.send(endMsg);
  }
- },
- close: async () => {
- if (sessionStatus === "closed" || sessionStatus === "closing") {
- return;
- }
- sessionStatus = "closing";
- if (ws.readyState === import_ws5.default.OPEN) {
- seqNo++;
- ws.send(
- JSON.stringify({
- message: "EndOfStream",
- last_seq_no: seqNo
- })
- );
- }
- return new Promise((resolve) => {
- const timeout = setTimeout(() => {
- ws.terminate();
- sessionStatus = "closed";
- resolve();
- }, 5e3);
- const onMsg = (data) => {
- try {
- const msg = JSON.parse(data.toString());
- if (msg.message === "EndOfTranscript") {
- ws.removeListener("message", onMsg);
- clearTimeout(timeout);
- ws.close();
- }
- } catch {
- }
- };
- ws.on("message", onMsg);
- ws.once("close", () => {
- clearTimeout(timeout);
- sessionStatus = "closed";
- resolve();
- });
- });
  }
  };
  }
  /**
- * Handle incoming Speechmatics real-time WebSocket messages
- */
- handleStreamingMessage(message, callbacks, utteranceResults) {
- switch (message.message) {
- case "RecognitionStarted": {
- break;
- }
- case "AddPartialTranscript": {
- const results = message.results || [];
- const text = buildTextFromSpeechmaticsResults(results);
- if (text) {
- callbacks?.onTranscript?.({
- type: "transcript",
- text,
- isFinal: false,
- words: this.extractWordsFromResults(results),
- data: message
- });
- }
- break;
- }
- case "AddTranscript": {
- const results = message.results || [];
- const text = buildTextFromSpeechmaticsResults(results);
- if (utteranceResults) {
- utteranceResults.push(...results);
- }
- if (text) {
- callbacks?.onTranscript?.({
- type: "transcript",
- text,
- isFinal: true,
- words: this.extractWordsFromResults(results),
- data: message
- });
- }
- break;
- }
- case "EndOfUtterance": {
- if (utteranceResults && utteranceResults.length > 0) {
- const text = buildTextFromSpeechmaticsResults(utteranceResults);
- const words = this.extractWordsFromResults(utteranceResults);
- const utterances = buildUtterancesFromWords(words);
- if (utterances.length > 0) {
- for (const utt of utterances) {
- callbacks?.onUtterance?.(utt);
- }
- } else if (text) {
- callbacks?.onUtterance?.({
- text,
- start: words.length > 0 ? words[0].start : 0,
- end: words.length > 0 ? words[words.length - 1].end : 0,
- words
- });
- }
- utteranceResults.length = 0;
- }
- break;
- }
- case "AudioAdded": {
- break;
- }
- case "EndOfTranscript": {
- break;
- }
- case "Info":
- case "Warning": {
- callbacks?.onMetadata?.(message);
- break;
- }
- case "Error": {
- const errMsg = message;
- callbacks?.onError?.({
- code: errMsg.type || "SPEECHMATICS_ERROR",
- message: errMsg.reason || "Unknown error",
- details: message
- });
- break;
- }
- default: {
- callbacks?.onMetadata?.(message);
- break;
- }
- }
- }
- /**
- * Extract unified Word[] from Speechmatics recognition results
+ * Convert Speechmatics RecognitionResult[] to unified Word[]
  */
- extractWordsFromResults(results) {
- return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
- word: result.alternatives?.[0]?.content || "",
- start: result.start_time,
- end: result.end_time,
- confidence: result.alternatives?.[0]?.confidence,
- speaker: result.alternatives?.[0]?.speaker
+ resultsToWords(results) {
+ return results.filter((r) => r.type === "word").map((r) => ({
+ word: r.alternatives?.[0]?.content || "",
+ start: r.start_time,
+ end: r.end_time,
+ confidence: r.alternatives?.[0]?.confidence,
+ speaker: r.alternatives?.[0]?.speaker
  }));
  }
  /**
@@ -9678,9 +9588,6 @@ function createSpeechmaticsAdapter(config) {
  return adapter;
  }
 
- // src/adapters/soniox-adapter.ts
- var import_axios9 = __toESM(require("axios"));
-
  // src/generated/soniox/schema/transcriptionStatus.ts
  var TranscriptionStatus = {
  queued: "queued",
@@ -9689,6 +9596,57 @@ var TranscriptionStatus = {
  error: "error"
  };
 
+ // src/generated/soniox/api/sonioxPublicAPI.ts
+ var import_axios9 = __toESM(require("axios"));
+
+ // src/generated/soniox/schema/index.ts
+ var schema_exports4 = {};
+ __export(schema_exports4, {
+ TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
+ TranscriptionMode: () => TranscriptionMode,
+ TranscriptionStatus: () => TranscriptionStatus,
+ TranslationConfigType: () => TranslationConfigType
+ });
+
+ // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
+ var TemporaryApiKeyUsageType = {
+ transcribe_websocket: "transcribe_websocket"
+ };
+
+ // src/generated/soniox/schema/transcriptionMode.ts
+ var TranscriptionMode = {
+ real_time: "real_time",
+ async: "async"
+ };
+
+ // src/generated/soniox/schema/translationConfigType.ts
+ var TranslationConfigType = {
+ one_way: "one_way",
+ two_way: "two_way"
+ };
+
+ // src/generated/soniox/api/sonioxPublicAPI.ts
+ var uploadFile = (uploadFileBody2, options) => {
+ const formData = new FormData();
+ if (uploadFileBody2.client_reference_id !== void 0 && uploadFileBody2.client_reference_id !== null) {
+ formData.append("client_reference_id", uploadFileBody2.client_reference_id);
+ }
+ formData.append("file", uploadFileBody2.file);
+ return import_axios9.default.post("/v1/files", formData, options);
+ };
+ var createTranscription2 = (createTranscriptionPayload, options) => {
+ return import_axios9.default.post("/v1/transcriptions", createTranscriptionPayload, options);
+ };
+ var getTranscription = (transcriptionId, options) => {
+ return import_axios9.default.get(`/v1/transcriptions/${transcriptionId}`, options);
+ };
+ var getTranscriptionTranscript = (transcriptionId, options) => {
+ return import_axios9.default.get(`/v1/transcriptions/${transcriptionId}/transcript`, options);
+ };
+ var getModels = (options) => {
+ return import_axios9.default.get("/v1/models", options);
+ };
+
  // src/adapters/soniox-adapter.ts
  var SonioxAdapter = class extends BaseAdapter {
  constructor() {
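These generated helpers encode the Soniox async v1 sequence: upload or reference audio, create a transcription, poll its status, then fetch the transcript. A sketch of that sequence, with `axiosConfig` standing in for the adapter's getAxiosConfig() result and the model name and URL illustrative:

```typescript
// Flow sketch over the generated helpers defined above (internal functions,
// shown only to make the sequence concrete).
const created = await createTranscription2(
  { model: "stt-async-example", audio_url: "https://example.com/a.mp3" },
  axiosConfig
);
let meta = (await getTranscription(created.data.id, axiosConfig)).data;
while (meta.status !== TranscriptionStatus.completed && meta.status !== TranscriptionStatus.error) {
  await new Promise((r) => setTimeout(r, 1000)); // poll interval illustrative
  meta = (await getTranscription(meta.id, axiosConfig)).data;
}
if (meta.status === TranscriptionStatus.completed) {
  const transcript = (await getTranscriptionTranscript(meta.id, axiosConfig)).data;
  console.log(transcript.text);
}
```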
@@ -9743,11 +9701,17 @@ var SonioxAdapter = class extends BaseAdapter {
  }
  }
  /**
- * Get the base URL for API requests
+ * Get the base URL for API requests (no /v1 suffix — generated functions include /v1 in paths)
  */
  get baseUrl() {
  if (this.config?.baseUrl) return this.config.baseUrl;
- return `https://${this.getRegionalHost()}/v1`;
+ return `https://${this.getRegionalHost()}`;
+ }
+ /**
+ * Build axios config with Soniox Bearer auth
+ */
+ getAxiosConfig() {
+ return super.getAxiosConfig("Authorization", (key) => `Bearer ${key}`);
  }
  initialize(config) {
  super.initialize(config);
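One consequence of dropping the /v1 suffix, for anyone overriding `baseUrl` (the hostname below is a placeholder, not from this diff):

```typescript
// The generated Soniox functions already prefix paths with /v1, so a custom
// baseUrl must be the bare host.
adapter.initialize({
  apiKey: process.env.SONIOX_API_KEY,
  baseUrl: "https://api.soniox.example"
  // "https://api.soniox.example/v1" would produce doubled /v1/v1/... paths
});
```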
@@ -9757,15 +9721,6 @@ var SonioxAdapter = class extends BaseAdapter {
  if (config.model) {
  this.defaultModel = config.model;
  }
- this.client = import_axios9.default.create({
- baseURL: this.baseUrl,
- timeout: config.timeout || 12e4,
- headers: {
- Authorization: `Bearer ${config.apiKey}`,
- "Content-Type": "application/json",
- ...config.headers
- }
- });
  }
  /**
  * Get current region
@@ -9795,23 +9750,12 @@ var SonioxAdapter = class extends BaseAdapter {
  */
  setRegion(region) {
  this.region = region;
- if (this.config?.apiKey) {
- this.client = import_axios9.default.create({
- baseURL: this.baseUrl,
- timeout: this.config.timeout || 12e4,
- headers: {
- Authorization: `Bearer ${this.config.apiKey}`,
- "Content-Type": "application/json",
- ...this.config.headers
- }
- });
- }
  }
  /**
  * Submit audio for transcription
  *
- * Soniox uses async batch processing. The transcribe method submits audio
- * and waits for completion (or use getTranscript for polling).
+ * Uses the async v1 API: createTranscription returns status `queued`,
+ * then polls until completed (or returns immediately if webhook is set).
  *
  * @param audio - Audio input (URL or file)
  * @param options - Transcription options
@@ -9820,21 +9764,44 @@ var SonioxAdapter = class extends BaseAdapter {
  async transcribe(audio, options) {
  this.validateConfig();
  try {
- const requestBody = {
- model: options?.model || this.defaultModel
- };
- if (audio.type === "url") {
- requestBody.audio_url = audio.url;
- } else if (audio.type === "file") {
- const formData = new FormData();
+ const sonioxOpts = options?.soniox;
+ if (audio.type === "file") {
  const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
- formData.append("file", audioBlob, audio.filename || "audio.wav");
- const uploadResponse = await this.client.post("/files", formData, {
- headers: {
- "Content-Type": "multipart/form-data"
- }
- });
- requestBody.file_id = uploadResponse.data.id;
+ const uploadBody = { file: audioBlob };
+ const fileResp = await uploadFile(uploadBody, this.getAxiosConfig());
+ const payload = {
+ ...sonioxOpts,
+ model: options?.model || this.defaultModel,
+ file_id: fileResp.data.id,
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
+ };
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
+ const meta = createResp.data;
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
+ return this.normalizeTranscription(meta);
+ }
+ return this.pollForCompletion(meta.id);
+ } else if (audio.type === "url") {
+ const payload = {
+ ...sonioxOpts,
+ model: options?.model || this.defaultModel,
+ audio_url: audio.url,
+ language_hints: options?.language ? [options.language] : sonioxOpts?.language_hints,
+ enable_speaker_diarization: options?.diarization || sonioxOpts?.enable_speaker_diarization,
+ enable_language_identification: options?.languageDetection || sonioxOpts?.enable_language_identification,
+ context: options?.customVocabulary?.length ? { terms: options.customVocabulary } : sonioxOpts?.context,
+ webhook_url: options?.webhookUrl || sonioxOpts?.webhook_url
+ };
+ const createResp = await createTranscription2(payload, this.getAxiosConfig());
+ const meta = createResp.data;
+ if (options?.webhookUrl || sonioxOpts?.webhook_url) {
+ return this.normalizeTranscription(meta);
+ }
+ return this.pollForCompletion(meta.id);
  } else {
  return {
  success: false,
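From the caller's side, the webhook/polling split described in the JSDoc looks like this, with `soniox` an initialized SonioxAdapter and URLs illustrative:

```typescript
// With webhookUrl: returns immediately with status "queued"; results are
// delivered to your endpoint.
const queued = await soniox.transcribe(
  { type: "url", url: "https://example.com/a.mp3" },
  { webhookUrl: "https://example.com/hooks/soniox" }
);

// Without webhookUrl: transcribe() polls internally until completion.
const done = await soniox.transcribe({ type: "url", url: "https://example.com/a.mp3" });
console.log(done.data?.text);
```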
@@ -9845,38 +9812,6 @@ var SonioxAdapter = class extends BaseAdapter {
  }
  };
  }
- if (options?.language) {
- requestBody.language_hints = [options.language];
- }
- if (options?.diarization) {
- requestBody.enable_speaker_diarization = true;
- }
- if (options?.languageDetection) {
- requestBody.enable_language_identification = true;
- }
- if (options?.customVocabulary && options.customVocabulary.length > 0) {
- requestBody.context = {
- terms: options.customVocabulary
- };
- }
- if (options?.webhookUrl) {
- requestBody.webhook_url = options.webhookUrl;
- }
- const response = await this.client.post("/transcriptions", requestBody);
- const transcriptionId = response.data.id;
- if (options?.webhookUrl) {
- return {
- success: true,
- provider: this.name,
- data: {
- id: transcriptionId,
- text: "",
- status: "queued"
- },
- raw: response.data
- };
- }
- return await this.pollForCompletion(transcriptionId);
  } catch (error) {
  return this.createErrorResponse(error);
  }
@@ -9884,9 +9819,8 @@ var SonioxAdapter = class extends BaseAdapter {
  /**
  * Get transcription result by ID
  *
- * Checks job status via GET /v1/transcriptions/{id}, then fetches
- * the full transcript via GET /v1/transcriptions/{id}/transcript
- * when completed.
+ * Fetches transcription metadata and, if completed, the transcript text/tokens.
+ * Used by pollForCompletion() for async polling.
  *
  * @param transcriptId - Transcript ID
  * @returns Transcription response
@@ -9894,39 +9828,20 @@ var SonioxAdapter = class extends BaseAdapter {
  async getTranscript(transcriptId) {
  this.validateConfig();
  try {
- const statusResponse = await this.client.get(`/transcriptions/${transcriptId}`);
- const job = statusResponse.data;
- if (job.status === "error") {
- return {
- success: false,
- provider: this.name,
- error: {
- code: "TRANSCRIPTION_ERROR",
- message: job.error_message || "Transcription failed"
- }
- };
- }
- if (job.status !== "completed") {
- return {
- success: true,
- provider: this.name,
- data: {
- id: job.id,
- text: "",
- status: job.status
- },
- raw: job
- };
+ const metaResp = await getTranscription(transcriptId, this.getAxiosConfig());
+ const meta = metaResp.data;
+ if (meta.status === TranscriptionStatus.completed) {
+ try {
+ const transcriptResp = await getTranscriptionTranscript(
+ transcriptId,
+ this.getAxiosConfig()
+ );
+ return this.normalizeTranscription(meta, transcriptResp.data);
+ } catch (transcriptError) {
+ return this.createErrorResponse(transcriptError);
+ }
  }
- const transcriptResponse = await this.client.get(
- `/transcriptions/${transcriptId}/transcript`
- );
- return this.normalizeResponse({
- ...transcriptResponse.data,
- // Carry over job metadata
- id: job.id,
- audio_duration_ms: job.audio_duration_ms
- });
+ return this.normalizeTranscription(meta);
  } catch (error) {
  return this.createErrorResponse(error);
  }
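For webhook-mode jobs, the same two-step fetch is available for manual polling through getTranscript(); a sketch with an illustrative interval:

```typescript
// Manual polling sketch; transcriptionId is the ID returned from transcribe().
// normalizeTranscription returns success: false once the job reaches "error",
// which also terminates this loop.
let result = await soniox.getTranscript(transcriptionId);
while (result.success && result.data?.status !== "completed") {
  await new Promise((r) => setTimeout(r, 2000));
  result = await soniox.getTranscript(transcriptionId);
}
if (result.success) console.log(result.data?.text);
```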
@@ -9946,51 +9861,50 @@ var SonioxAdapter = class extends BaseAdapter {
  const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
  const createdAt = /* @__PURE__ */ new Date();
  const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
- const wsUrl = `${wsBase}/transcribe-websocket`;
- const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
- const sonioxOpts = options?.sonioxStreaming;
- const initMessage = {
- api_key: this.config.apiKey,
- model: modelId
- };
- if (sonioxOpts?.audioFormat) {
- initMessage.audio_format = sonioxOpts.audioFormat;
- } else if (options?.encoding) {
+ const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
+ wsUrl.searchParams.set("api_key", this.config.apiKey);
+ const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
+ wsUrl.searchParams.set("model", modelId);
+ if (options?.encoding) {
  const encodingMap = {
  linear16: "pcm_s16le",
  pcm: "pcm_s16le",
  mulaw: "mulaw",
  alaw: "alaw"
  };
- initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
+ wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
  }
- if (sonioxOpts?.sampleRate || options?.sampleRate) {
- initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
+ if (options?.sampleRate) {
+ wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
  }
- if (sonioxOpts?.numChannels || options?.channels) {
- initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
+ if (options?.channels) {
+ wsUrl.searchParams.set("num_channels", options.channels.toString());
  }
+ const sonioxOpts = options?.sonioxStreaming;
  if (sonioxOpts) {
  if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
- initMessage.language_hints = sonioxOpts.languageHints;
+ wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
  }
  if (sonioxOpts.enableLanguageIdentification) {
- initMessage.enable_language_identification = true;
+ wsUrl.searchParams.set("enable_language_identification", "true");
  }
  if (sonioxOpts.enableEndpointDetection) {
- initMessage.enable_endpoint_detection = true;
+ wsUrl.searchParams.set("enable_endpoint_detection", "true");
  }
  if (sonioxOpts.enableSpeakerDiarization) {
- initMessage.enable_speaker_diarization = true;
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
  }
  if (sonioxOpts.context) {
- initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
+ wsUrl.searchParams.set(
+ "context",
+ typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
+ );
  }
  if (sonioxOpts.translation) {
- initMessage.translation = sonioxOpts.translation;
+ wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
  }
  if (sonioxOpts.clientReferenceId) {
- initMessage.client_reference_id = sonioxOpts.clientReferenceId;
+ wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
  }
  }
  if (!sonioxOpts?.languageHints && options?.language) {
@@ -9999,33 +9913,24 @@ var SonioxAdapter = class extends BaseAdapter {
  `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
  );
  }
- initMessage.language_hints = [options.language];
+ wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
  }
  if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
- initMessage.enable_speaker_diarization = true;
+ wsUrl.searchParams.set("enable_speaker_diarization", "true");
  }
  if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
- initMessage.enable_language_identification = true;
+ wsUrl.searchParams.set("enable_language_identification", "true");
+ }
+ if (options?.interimResults !== false) {
  }
  let status = "connecting";
  let openedAt = null;
  let receivedData = false;
  const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
- const ws = new WebSocketImpl(wsUrl);
+ const ws = new WebSocketImpl(wsUrl.toString());
  ws.onopen = () => {
- openedAt = Date.now();
- const initPayload = JSON.stringify(initMessage);
- if (callbacks?.onRawMessage) {
- callbacks.onRawMessage({
- provider: this.name,
- direction: "outgoing",
- timestamp: Date.now(),
- payload: initPayload,
- messageType: "init"
- });
- }
- ws.send(initPayload);
  status = "open";
+ openedAt = Date.now();
  callbacks?.onOpen?.();
  };
  ws.onmessage = (event) => {
@@ -10034,7 +9939,8 @@ var SonioxAdapter = class extends BaseAdapter {
  let messageType;
  try {
  const data = JSON.parse(rawPayload);
- if (data.error) {
+ const errorMessage = data.error_message;
+ if (errorMessage) {
  messageType = "error";
  } else if (data.finished) {
  messageType = "finished";
@@ -10050,10 +9956,10 @@ var SonioxAdapter = class extends BaseAdapter {
  messageType
  });
  }
- if (data.error) {
+ if (errorMessage) {
  callbacks?.onError?.({
  code: data.error_code?.toString() || "STREAM_ERROR",
- message: data.error
+ message: errorMessage
  });
  return;
  }
@@ -10067,7 +9973,7 @@ var SonioxAdapter = class extends BaseAdapter {
  start: token.start_ms ? token.start_ms / 1e3 : 0,
  end: token.end_ms ? token.end_ms / 1e3 : 0,
  confidence: token.confidence,
- speaker: token.speaker
+ speaker: token.speaker ?? void 0
  }));
  const text = data.text || data.tokens.map((t) => t.text).join("");
  const isFinal = data.tokens.every((t) => t.is_final);
@@ -10076,8 +9982,8 @@ var SonioxAdapter = class extends BaseAdapter {
  text,
  isFinal,
  words,
- speaker: data.tokens[0]?.speaker,
- language: data.tokens[0]?.language,
+ speaker: data.tokens[0]?.speaker ?? void 0,
+ language: data.tokens[0]?.language ?? void 0,
  confidence: data.tokens[0]?.confidence
  };
  callbacks?.onTranscript?.(event2);
@@ -10104,10 +10010,10 @@ var SonioxAdapter = class extends BaseAdapter {
  ws.onclose = (event) => {
  status = "closed";
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
- const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
- if (isEarlyClose && event.code === 1e3) {
+ const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
+ if (isImmediateClose && event.code === 1e3) {
  const errorMessage = [
- "Soniox closed connection shortly after opening.",
+ "Soniox closed connection immediately after opening.",
  `Current config: region=${this.region}, model=${modelId}`,
  "Likely causes:",
  " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -10193,7 +10099,7 @@ var SonioxAdapter = class extends BaseAdapter {
  async getModels() {
  this.validateConfig();
  try {
- const response = await this.client.get("/models");
+ const response = await getModels(this.getAxiosConfig());
  return response.data.models || [];
  } catch (error) {
  console.error("Failed to fetch Soniox models:", error);
@@ -10225,11 +10131,44 @@ var SonioxAdapter = class extends BaseAdapter {
  return buildUtterancesFromWords(words);
  }
  /**
- * Normalize Soniox response to unified format
+ * Normalize v1 API response to unified format
+ *
+ * @param meta - Transcription metadata from getTranscription/createTranscription
+ * @param transcript - Transcript data (text/tokens), only present when status is completed
  */
- normalizeResponse(response) {
- const { text, tokens } = response;
- const words = tokens.map((token) => ({
+ normalizeTranscription(meta, transcript) {
+ if (meta.status === TranscriptionStatus.error) {
+ return {
+ success: false,
+ provider: this.name,
+ data: {
+ id: meta.id,
+ text: "",
+ status: "error"
+ },
+ error: {
+ code: meta.error_type || "TRANSCRIPTION_ERROR",
+ message: meta.error_message || "Transcription failed"
+ },
+ raw: { meta, transcript }
+ };
+ }
+ if (!transcript) {
+ return {
+ success: true,
+ provider: this.name,
+ data: {
+ id: meta.id,
+ text: "",
+ status: meta.status,
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0
+ },
+ raw: { meta }
+ };
+ }
+ const tokens = transcript.tokens || [];
+ const text = transcript.text || tokens.map((t) => t.text).join("");
+ const words = tokens.filter((t) => t.start_ms !== void 0 && t.end_ms !== void 0).map((token) => ({
  word: token.text,
  start: token.start_ms / 1e3,
  end: token.end_ms / 1e3,
@@ -10237,33 +10176,32 @@ var SonioxAdapter = class extends BaseAdapter {
  speaker: token.speaker ?? void 0
  }));
  const speakerSet = /* @__PURE__ */ new Set();
- for (const token of tokens) {
- if (token.speaker) speakerSet.add(token.speaker);
- }
+ tokens.forEach((t) => {
+ if (t.speaker) speakerSet.add(String(t.speaker));
+ });
  const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
  id,
  label: `Speaker ${id}`
  })) : void 0;
- const utterances = tokens.length > 0 ? this.buildUtterancesFromTokens(tokens) : [];
+ const utterances = this.buildUtterancesFromTokens(tokens);
  const language = tokens.find((t) => t.language)?.language ?? void 0;
  return {
  success: true,
  provider: this.name,
  data: {
- id: response.id || `soniox_${Date.now()}`,
+ id: meta.id,
  text,
  status: TranscriptionStatus.completed,
  language,
- duration: response.audio_duration_ms ? response.audio_duration_ms / 1e3 : response.total_audio_proc_ms ? response.total_audio_proc_ms / 1e3 : void 0,
+ duration: meta.audio_duration_ms ? meta.audio_duration_ms / 1e3 : void 0,
  speakers,
  words: words.length > 0 ? words : void 0,
  utterances: utterances.length > 0 ? utterances : void 0
  },
  tracking: {
- requestId: response.id,
- processingTimeMs: response.total_audio_proc_ms
+ requestId: meta.id
  },
- raw: response
+ raw: { meta, transcript }
  };
  }
  };
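Read off normalizeTranscription above, the unified shape the Soniox adapter now emits looks roughly like this (an inferred sketch, not an exported type; optionality is guessed from the code paths):

```typescript
// Inferred from normalizeTranscription; field names come from the code above,
// the interface itself is illustrative.
interface UnifiedTranscriptionResult {
  success: boolean;
  provider: string;
  data?: {
    id: string;
    text: string;
    status: string; // "queued" | "completed" | "error" | ...
    language?: string;
    duration?: number; // seconds, from audio_duration_ms / 1000
    speakers?: { id: string; label: string }[];
    words?: { word: string; start: number; end: number; confidence?: number; speaker?: string }[];
    utterances?: unknown[];
  };
  error?: { code: string; message: string };
  tracking?: { requestId?: string };
  raw?: unknown;
}
```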
@@ -10360,7 +10298,15 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  /**
  * Submit audio for transcription
  *
- * ElevenLabs batch is synchronous - the API returns the result directly.
+ * ElevenLabs batch is normally synchronous; the API returns results directly.
+ *
+ * **Webhook mode:** When `webhookUrl` is set (or `elevenlabs.webhook` is true),
+ * the request is processed asynchronously. ElevenLabs returns a 202 with a
+ * `request_id` and delivers results to a webhook configured in the ElevenLabs
+ * dashboard. The unified `webhookUrl` acts as an intent flag to enable async
+ * mode — the actual delivery destination must be pre-configured in your
+ * ElevenLabs dashboard. Use `elevenlabs.webhook_id` to target a specific
+ * webhook endpoint.
  */
  async transcribe(audio, options) {
  this.validateConfig();
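
Note: a sketch of invoking the async path described in the doc comment above. The option names webhookUrl, elevenlabs.webhook, and elevenlabs.webhook_id appear in this bundle; the audio input shape and constructor config are assumptions:

    import { ElevenLabsAdapter } from "voice-router-dev";

    const adapter = new ElevenLabsAdapter({ apiKey: process.env.ELEVENLABS_API_KEY ?? "" });

    // webhookUrl only flags async intent; delivery still goes to the webhook
    // registered in the ElevenLabs dashboard, optionally selected by webhook_id.
    const ack = await adapter.transcribe(
      { type: "url", url: "https://example.com/call.wav" }, // assumed input shape
      {
        webhookUrl: "https://my-app.example.com/hooks/stt",
        elevenlabs: { webhook_id: "wh_abc123" } // hypothetical id
      }
    );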
@@ -10383,6 +10329,11 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  }
  };
  }
+ const elevenlabsOpts = options?.elevenlabs;
+ const useWebhook = options?.webhookUrl || elevenlabsOpts?.webhook;
+ if (useWebhook) {
+ formData.append("webhook", "true");
+ }
  if (options?.language) {
  formData.append("language_code", options.language);
  }
@@ -10401,7 +10352,6 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  if (options?.entityDetection) {
  formData.append("entity_detection", "all");
  }
- const elevenlabsOpts = options?.elevenlabs;
  if (elevenlabsOpts) {
  for (const [key, value] of Object.entries(elevenlabsOpts)) {
  if (value === void 0 || value === null) continue;
@@ -10419,26 +10369,24 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  }
  }
  }
- if (options?.webhookUrl) {
- if (!formData.has("webhook")) {
- formData.append("webhook", "true");
- }
- }
  const response = await this.client.post("/v1/speech-to-text", formData, {
  headers: {
  "Content-Type": "multipart/form-data"
  }
  });
- if (options?.webhookUrl) {
- const transcriptionId = response.data.transcription_id || response.data.id || `elevenlabs_${Date.now()}`;
+ if (useWebhook) {
+ const ack = response.data;
  return {
  success: true,
  provider: this.name,
  data: {
- id: transcriptionId,
+ id: ack.request_id || ack.transcription_id || `elevenlabs_${Date.now()}`,
  text: "",
  status: "queued"
  },
+ tracking: {
+ requestId: ack.request_id
+ },
  raw: response.data
  };
  }
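
Note: in webhook mode the call now resolves immediately with a queued placeholder, and 0.9.2 mirrors the ElevenLabs request_id into tracking.requestId for later correlation with the webhook delivery. Continuing the sketch above (jobStore is a hypothetical persistence layer):

    // jobStore stands in for whatever persistence the application uses.
    declare const jobStore: { save(job: { requestId: string }): Promise<void> };

    if (ack.success && ack.data.status === "queued") {
      // The eventual webhook payload is matched against this request_id.
      await jobStore.save({ requestId: ack.tracking?.requestId ?? ack.data.id });
    }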
@@ -10534,20 +10482,9 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  ws.onmessage = (event) => {
  receivedData = true;
  const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
- let messageType;
  try {
  const data = JSON.parse(rawPayload);
- if (data.error) {
- messageType = "error";
- } else if (data.message_type === "session_started") {
- messageType = "session_started";
- } else if (data.message_type === "partial_transcript") {
- messageType = "partial_transcript";
- } else if (data.message_type === "committed_transcript") {
- messageType = "committed_transcript";
- } else if (data.message_type === "committed_transcript_with_timestamps") {
- messageType = "committed_transcript_with_timestamps";
- }
+ const messageType = "error" in data ? "error" : data.message_type;
  if (callbacks?.onRawMessage) {
  callbacks.onRawMessage({
  provider: this.name,
@@ -10557,50 +10494,62 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  messageType
  });
  }
- if (data.error) {
+ if ("error" in data) {
  callbacks?.onError?.({
- code: data.error_code?.toString() || "STREAM_ERROR",
+ code: data.message_type || "STREAM_ERROR",
  message: data.error
  });
  return;
  }
- if (data.message_type === "session_started") {
- return;
- }
- if (data.message_type === "partial_transcript") {
- const streamEvent = {
- type: "transcript",
- text: data.text || "",
- isFinal: false,
- confidence: void 0,
- language: data.language_code
- };
- callbacks?.onTranscript?.(streamEvent);
- return;
- }
- if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
- const words = data.words ? data.words.map((w) => ({
- word: w.text || "",
- start: w.start || 0,
- end: w.end || 0,
- confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
- speaker: w.speaker_id
- })) : [];
- const streamEvent = {
- type: "transcript",
- text: data.text || "",
- isFinal: true,
- words: words.length > 0 ? words : void 0,
- speaker: words[0]?.speaker,
- language: data.language_code,
- confidence: void 0
- };
- callbacks?.onTranscript?.(streamEvent);
- if (options?.diarization && words.length > 0) {
- const utterances = buildUtterancesFromWords(words);
- for (const utterance of utterances) {
- callbacks?.onUtterance?.(utterance);
+ switch (data.message_type) {
+ case "session_started":
+ break;
+ case "partial_transcript": {
+ const streamEvent = {
+ type: "transcript",
+ text: data.text || "",
+ isFinal: false,
+ confidence: void 0
+ };
+ callbacks?.onTranscript?.(streamEvent);
+ break;
+ }
+ case "committed_transcript": {
+ const streamEvent = {
+ type: "transcript",
+ text: data.text || "",
+ isFinal: true,
+ confidence: void 0
+ };
+ callbacks?.onTranscript?.(streamEvent);
+ break;
+ }
+ case "committed_transcript_with_timestamps": {
+ const tsData = data;
+ const words = tsData.words ? tsData.words.map((w) => ({
+ word: w.text || "",
+ start: w.start || 0,
+ end: w.end || 0,
+ confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
+ speaker: w.speaker_id
+ })) : [];
+ const streamEvent = {
+ type: "transcript",
+ text: tsData.text || "",
+ isFinal: true,
+ words: words.length > 0 ? words : void 0,
+ speaker: words[0]?.speaker,
+ language: tsData.language_code,
+ confidence: void 0
+ };
+ callbacks?.onTranscript?.(streamEvent);
+ if (options?.diarization && words.length > 0) {
+ const utterances = buildUtterancesFromWords(words);
+ for (const utterance of utterances) {
+ callbacks?.onUtterance?.(utterance);
+ }
  }
+ break;
  }
  }
  } catch (error) {
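
Note: the refactor above folds the ElevenLabs realtime message handling into one switch but keeps the callback contract: partial_transcript events arrive with isFinal: false, committed transcripts with isFinal: true, and word timings (plus utterance building under diarization) only for committed_transcript_with_timestamps. A sketch of the callback side; the callback names come from the handler above, while the method name and its options argument are guesses about the public surface:

    // transcribeStream is a hypothetical name for the streaming entry point.
    const stream = await adapter.transcribeStream(
      { diarization: true }, // assumed streaming options
      {
        onTranscript: (e) => {
          if (e.isFinal) {
            console.log("final:", e.text, e.words?.length ?? 0, "timed words");
          } else {
            console.log("partial:", e.text);
          }
        },
        onUtterance: (u) => console.log(`[speaker ${u.speaker}] ${u.text}`),
        onError: (err) => console.error(err.code, err.message)
      }
    );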
@@ -10755,7 +10704,7 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  }
  }
  }
- const transcriptionId = ("transcription_id" in response ? response.transcription_id : response.transcription_id) || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
+ const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
  return {
  success: true,
  provider: this.name,
@@ -36673,12 +36622,10 @@ var createTemporaryApiKeyBody = import_zod10.z.object({
  var streaming_types_zod_exports = {};
  __export(streaming_types_zod_exports, {
  sonioxAudioFormatSchema: () => sonioxAudioFormatSchema,
- sonioxAutoDetectedAudioFormatSchema: () => sonioxAutoDetectedAudioFormatSchema,
  sonioxContextGeneralItemSchema: () => sonioxContextGeneralItemSchema,
  sonioxContextSchema: () => sonioxContextSchema,
  sonioxErrorStatusSchema: () => sonioxErrorStatusSchema,
  sonioxOneWayTranslationSchema: () => sonioxOneWayTranslationSchema,
- sonioxPcmAudioEncodingSchema: () => sonioxPcmAudioEncodingSchema,
  sonioxRealtimeModelSchema: () => sonioxRealtimeModelSchema,
  sonioxRecorderStateSchema: () => sonioxRecorderStateSchema,
  sonioxStreamingResponseSchema: () => sonioxStreamingResponseSchema,
@@ -36692,7 +36639,7 @@ __export(streaming_types_zod_exports, {
  streamingUpdateConfigParams: () => streamingUpdateConfigParams3
  });
  var import_zod11 = require("zod");
- var sonioxAutoDetectedAudioFormatSchema = import_zod11.z.enum([
+ var sonioxAudioFormatSchema = import_zod11.z.enum([
  "auto",
  "aac",
  "aiff",
@@ -36702,10 +36649,7 @@ var sonioxAutoDetectedAudioFormatSchema = import_zod11.z.enum([
  "mp3",
  "ogg",
  "wav",
- "webm"
- ]);
- var sonioxPcmAudioEncodingSchema = import_zod11.z.enum([
- // Signed PCM
+ "webm",
  "pcm_s8",
  "pcm_s16le",
  "pcm_s16be",
@@ -36713,7 +36657,6 @@ var sonioxPcmAudioEncodingSchema = import_zod11.z.enum([
  "pcm_s24be",
  "pcm_s32le",
  "pcm_s32be",
- // Unsigned PCM
  "pcm_u8",
  "pcm_u16le",
  "pcm_u16be",
@@ -36721,86 +36664,81 @@ var sonioxPcmAudioEncodingSchema = import_zod11.z.enum([
  "pcm_u24be",
  "pcm_u32le",
  "pcm_u32be",
- // Float PCM
  "pcm_f32le",
  "pcm_f32be",
  "pcm_f64le",
  "pcm_f64be",
- // Companded
  "mulaw",
  "alaw"
  ]);
- var sonioxAudioFormatSchema = import_zod11.z.union([
- sonioxAutoDetectedAudioFormatSchema,
- sonioxPcmAudioEncodingSchema
- ]);
  var sonioxOneWayTranslationSchema = import_zod11.z.object({
  type: import_zod11.z.literal("one_way"),
- target_language: import_zod11.z.string().describe("Target language code for translation")
+ target_language: import_zod11.z.string()
  });
  var sonioxTwoWayTranslationSchema = import_zod11.z.object({
  type: import_zod11.z.literal("two_way"),
- language_a: import_zod11.z.string().describe("First language for bidirectional translation"),
- language_b: import_zod11.z.string().describe("Second language for bidirectional translation")
+ language_a: import_zod11.z.string(),
+ language_b: import_zod11.z.string()
  });
  var sonioxTranslationConfigSchema = import_zod11.z.union([
  sonioxOneWayTranslationSchema,
  sonioxTwoWayTranslationSchema
  ]);
  var sonioxContextGeneralItemSchema = import_zod11.z.object({
- key: import_zod11.z.string().describe("Context item key (e.g. 'Domain')"),
- value: import_zod11.z.string().describe("Context item value (e.g. 'medicine')")
+ key: import_zod11.z.string(),
+ value: import_zod11.z.string()
  });
  var sonioxTranslationTermSchema = import_zod11.z.object({
- source: import_zod11.z.string().describe("Source term"),
- target: import_zod11.z.string().describe("Target term to translate to")
+ source: import_zod11.z.string(),
+ target: import_zod11.z.string()
  });
  var sonioxStructuredContextSchema = import_zod11.z.object({
- general: import_zod11.z.array(sonioxContextGeneralItemSchema).optional().describe("General context items (key-value pairs)"),
- text: import_zod11.z.string().optional().describe("Text context"),
- terms: import_zod11.z.array(import_zod11.z.string()).optional().describe("Terms that might occur in speech"),
- translation_terms: import_zod11.z.array(sonioxTranslationTermSchema).optional().describe("Hints how to translate specific terms (ignored if translation is not enabled)")
+ general: import_zod11.z.array(sonioxContextGeneralItemSchema).optional(),
+ text: import_zod11.z.string().optional(),
+ terms: import_zod11.z.array(import_zod11.z.string()).optional(),
+ translation_terms: import_zod11.z.array(sonioxTranslationTermSchema).optional()
  });
  var sonioxContextSchema = import_zod11.z.union([sonioxStructuredContextSchema, import_zod11.z.string()]);
  var sonioxRealtimeModelSchema = import_zod11.z.enum([
+ "stt-rt-v4",
  "stt-rt-v3",
  "stt-rt-preview",
  "stt-rt-v3-preview",
  "stt-rt-preview-v2"
  ]);
  var streamingTranscriberParams3 = import_zod11.z.object({
- model: sonioxRealtimeModelSchema.describe("Real-time model to use"),
- audioFormat: sonioxAudioFormatSchema.optional().describe("Audio format specification. Use 'auto' for automatic detection"),
- sampleRate: import_zod11.z.number().optional().describe("Sample rate in Hz (required for raw PCM formats)"),
- numChannels: import_zod11.z.number().min(1).max(2).optional().describe("Number of audio channels (1 for mono, 2 for stereo) - required for raw PCM formats"),
- languageHints: import_zod11.z.array(import_zod11.z.string()).optional().describe("Expected languages in the audio (ISO language codes)"),
- context: sonioxContextSchema.optional().describe("Additional context to improve transcription accuracy"),
- enableSpeakerDiarization: import_zod11.z.boolean().optional().describe("Enable speaker diarization - each token will include a speaker field"),
- enableLanguageIdentification: import_zod11.z.boolean().optional().describe("Enable language identification - each token will include a language field"),
- enableEndpointDetection: import_zod11.z.boolean().optional().describe("Enable endpoint detection to detect when a speaker has finished talking"),
- translation: sonioxTranslationConfigSchema.optional().describe("Translation configuration"),
- clientReferenceId: import_zod11.z.string().optional().describe("Optional tracking identifier (client-defined)")
- });
- var sonioxTranslationStatusSchema = import_zod11.z.enum(["none", "original", "translation"]);
+ model: sonioxRealtimeModelSchema,
+ audioFormat: sonioxAudioFormatSchema.optional(),
+ sampleRate: import_zod11.z.number().optional(),
+ numChannels: import_zod11.z.number().optional(),
+ languageHints: import_zod11.z.array(import_zod11.z.string()).optional(),
+ context: sonioxContextSchema.optional(),
+ enableSpeakerDiarization: import_zod11.z.boolean().optional(),
+ enableLanguageIdentification: import_zod11.z.boolean().optional(),
+ enableEndpointDetection: import_zod11.z.boolean().optional(),
+ translation: sonioxTranslationConfigSchema.optional(),
+ clientReferenceId: import_zod11.z.string().optional()
+ });
+ var sonioxTranslationStatusSchema = import_zod11.z.enum(["original", "translation", "none"]);
  var sonioxTokenSchema = import_zod11.z.object({
- text: import_zod11.z.string().describe("Token text content (subword, word, or space)"),
- start_ms: import_zod11.z.number().optional().describe("Start time of the token in milliseconds"),
- end_ms: import_zod11.z.number().optional().describe("End time of the token in milliseconds"),
- confidence: import_zod11.z.number().min(0).max(1).optional().describe("Confidence score between 0.0 and 1.0"),
- is_final: import_zod11.z.boolean().describe("Whether this token is final (confirmed) or provisional"),
- speaker: import_zod11.z.string().optional().describe("Speaker identifier (only present when speaker diarization is enabled)"),
- language: import_zod11.z.string().optional().describe("Detected language code (only present when language identification is enabled)"),
- source_language: import_zod11.z.string().optional().describe("Original language code for translated tokens"),
- translation_status: sonioxTranslationStatusSchema.optional().describe("Translation status: 'none', 'original', or 'translation'")
+ text: import_zod11.z.string(),
+ start_ms: import_zod11.z.number().optional(),
+ end_ms: import_zod11.z.number().optional(),
+ confidence: import_zod11.z.number(),
+ is_final: import_zod11.z.boolean(),
+ speaker: import_zod11.z.string().optional(),
+ translation_status: sonioxTranslationStatusSchema.optional(),
+ language: import_zod11.z.string().optional(),
+ source_language: import_zod11.z.string().optional()
  });
  var sonioxStreamingResponseSchema = import_zod11.z.object({
- text: import_zod11.z.string().optional().describe("Complete transcribed text"),
- tokens: import_zod11.z.array(sonioxTokenSchema).describe("List of recognized tokens"),
- final_audio_proc_ms: import_zod11.z.number().optional().describe("Milliseconds of audio processed into final tokens"),
- total_audio_proc_ms: import_zod11.z.number().optional().describe("Milliseconds of audio processed (final + non-final)"),
- finished: import_zod11.z.boolean().optional().describe("Whether the transcription is complete"),
- error: import_zod11.z.string().optional().describe("Error message if an error occurred"),
- error_code: import_zod11.z.number().optional().describe("Error code if an error occurred")
+ text: import_zod11.z.string(),
+ tokens: import_zod11.z.array(sonioxTokenSchema),
+ final_audio_proc_ms: import_zod11.z.number(),
+ total_audio_proc_ms: import_zod11.z.number(),
+ finished: import_zod11.z.boolean().optional(),
+ error_code: import_zod11.z.number().optional(),
+ error_message: import_zod11.z.string().optional()
  });
  var sonioxRecorderStateSchema = import_zod11.z.enum([
  "Init",
@@ -37366,8 +37304,8 @@ var BatchOnlyProviders = AllProviders.filter(
  );

  // src/generated/deepgram/schema/index.ts
- var schema_exports4 = {};
- __export(schema_exports4, {
+ var schema_exports5 = {};
+ __export(schema_exports5, {
  V1ListenPostParametersCallbackMethod: () => V1ListenPostParametersCallbackMethod,
  V1ListenPostParametersCustomIntentMode: () => V1ListenPostParametersCustomIntentMode,
  V1ListenPostParametersCustomTopicMode: () => V1ListenPostParametersCustomTopicMode,
@@ -37622,8 +37560,8 @@ var V1SpeakPostParametersSampleRate = {
  };

  // src/generated/openai/schema/index.ts
- var schema_exports5 = {};
- __export(schema_exports5, {
+ var schema_exports6 = {};
+ __export(schema_exports6, {
  AudioResponseFormat: () => AudioResponseFormat,
  CreateSpeechRequestResponseFormat: () => CreateSpeechRequestResponseFormat,
  CreateSpeechRequestStreamFormat: () => CreateSpeechRequestStreamFormat,
@@ -37963,8 +37901,8 @@ var VoiceResourceObject = {
  };

  // src/generated/speechmatics/schema/index.ts
- var schema_exports6 = {};
- __export(schema_exports6, {
+ var schema_exports7 = {};
+ __export(schema_exports7, {
  AutoChaptersResultErrorType: () => AutoChaptersResultErrorType,
  ErrorResponseError: () => ErrorResponseError,
  GetJobsJobidAlignmentTags: () => GetJobsJobidAlignmentTags,
@@ -38153,32 +38091,6 @@ var WrittenFormRecognitionResultType = {
  word: "word"
  };

- // src/generated/soniox/schema/index.ts
- var schema_exports7 = {};
- __export(schema_exports7, {
- TemporaryApiKeyUsageType: () => TemporaryApiKeyUsageType,
- TranscriptionMode: () => TranscriptionMode,
- TranscriptionStatus: () => TranscriptionStatus,
- TranslationConfigType: () => TranslationConfigType
- });
-
- // src/generated/soniox/schema/temporaryApiKeyUsageType.ts
- var TemporaryApiKeyUsageType = {
- transcribe_websocket: "transcribe_websocket"
- };
-
- // src/generated/soniox/schema/transcriptionMode.ts
- var TranscriptionMode = {
- real_time: "real_time",
- async: "async"
- };
-
- // src/generated/soniox/schema/translationConfigType.ts
- var TranslationConfigType = {
- one_way: "one_way",
- two_way: "two_way"
- };
-
  // src/generated/elevenlabs/schema/index.ts
  var schema_exports8 = {};
  __export(schema_exports8, {