voice-router-dev 0.9.1 → 0.9.2

package/dist/index.js CHANGED
@@ -6795,9 +6795,13 @@ var DeepgramAdapter = class extends BaseAdapter {
  * Submit audio for transcription
  *
  * Sends audio to Deepgram API for transcription. Deepgram normally processes
- * synchronously and returns results immediately. When `webhookUrl` is set,
- * Deepgram can instead return an async callback acknowledgment containing a
- * request ID.
+ * synchronously and returns results immediately.
+ *
+ * **Callback mode:** When `webhookUrl` is set, Deepgram returns immediately
+ * with a `request_id` (status `"queued"`). The full transcript is POSTed to
+ * the webhook URL — this is the primary delivery mechanism. `getTranscript()`
+ * can attempt to retrieve the result later via request history, but that
+ * endpoint is best-effort and not a guaranteed durable store.
  *
  * @param audio - Audio input (URL or file buffer)
  * @param options - Transcription options
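A rough usage sketch of the callback flow described above, reusing the adapter surface shown in the `@example` removed further down in this diff (`new DeepgramAdapter()`, `initialize()`, `transcribe()`); the audio-input shape, the acknowledgment fields, and the webhook URL are illustrative assumptions:

```typescript
// Hedged sketch (not from the package docs): submitting audio in callback mode.
const adapter = new DeepgramAdapter();
adapter.initialize({
  apiKey: process.env.DEEPGRAM_API_KEY,
  projectId: process.env.DEEPGRAM_PROJECT_ID
});

// With webhookUrl set, Deepgram acknowledges immediately instead of returning
// the transcript inline; the full result is POSTed to the webhook later.
const ack = await adapter.transcribe(
  { url: "https://example.com/call-recording.wav" }, // assumed input shape
  { webhookUrl: "https://api.example.com/webhooks/deepgram" }
);

if (ack.success) {
  // Assumed acknowledgment fields: a request ID and a "queued" status.
  console.log(ack.data?.id, ack.data?.status);
}
```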
@@ -6907,30 +6911,22 @@ var DeepgramAdapter = class extends BaseAdapter {
  }
  }
  /**
- * Get transcription result by ID
- *
- * Retrieves a previous transcription from Deepgram's request history.
+ * Get transcription result by ID (best-effort)
  *
- * Unlike the list endpoint, getting a single request DOES include the full
- * transcript response. Requires `projectId` to be set during initialization.
+ * Retrieves a previous transcription from Deepgram's request history API.
+ * Requires `projectId` to be set during initialization.
  *
- * @param transcriptId - Request ID from a previous transcription
- * @returns Full transcript response including text, words, and metadata
+ * **Important:** Deepgram's request history is best-effort. Requests may
+ * expire or be unavailable depending on your plan and retention settings.
+ * This is NOT a durable transcript store — for reliable retrieval, use
+ * callback mode (`webhookUrl`) and persist the webhook payload yourself.
  *
- * @example Get a transcript by request ID
- * ```typescript
- * const adapter = new DeepgramAdapter()
- * adapter.initialize({
- * apiKey: process.env.DEEPGRAM_API_KEY,
- * projectId: process.env.DEEPGRAM_PROJECT_ID
- * })
+ * The response field on the request history entry is cast to
+ * `ListenV1Response` — this appears to work in practice but is not
+ * explicitly documented by Deepgram as a guaranteed contract.
  *
- * const result = await adapter.getTranscript('abc123-request-id')
- * if (result.success) {
- * console.log(result.data?.text)
- * console.log(result.data?.words)
- * }
- * ```
+ * @param transcriptId - Request ID from a previous transcription
+ * @returns Transcript response if still available in request history
  *
  * @see https://developers.deepgram.com/reference/get-request
  */
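For reference, a hedged variant of the `@example` removed above, extended with a fallback for the best-effort nature of request history (the failure branch is an illustrative assumption, not documented package behaviour):

```typescript
// Identifiers (DeepgramAdapter, initialize, getTranscript, result.success,
// result.data?.text / .words) come from the removed example; the else branch
// is an assumption.
const adapter = new DeepgramAdapter();
adapter.initialize({
  apiKey: process.env.DEEPGRAM_API_KEY,
  projectId: process.env.DEEPGRAM_PROJECT_ID // required for request-history lookups
});

const result = await adapter.getTranscript("abc123-request-id");
if (result.success) {
  console.log(result.data?.text);
  console.log(result.data?.words);
} else {
  // The history entry may have expired; fall back to whatever your webhook
  // handler persisted when the transcript was originally delivered.
  console.warn("transcript no longer available via request history");
}
```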
@@ -9013,8 +9009,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  super(...arguments);
  this.name = "speechmatics";
  this.capabilities = {
- streaming: false,
- // Batch only (streaming available via separate WebSocket API)
+ streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: false,
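Since Speechmatics now advertises `streaming: true`, routing code can gate on the capability flag before opening a session; a minimal sketch where `adapter`, `audio`, `options`, and `callbacks` are assumed to be in scope:

```typescript
// Capability gate (illustrative): `capabilities.streaming`, `transcribeStream`,
// and `transcribe` are taken from this diff; the surrounding variables are not.
if (adapter.capabilities.streaming) {
  const session = await adapter.transcribeStream(options, callbacks);
  // ...push audio into `session`, then close it when the source ends
} else {
  const result = await adapter.transcribe(audio, options);
}
```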
@@ -9260,6 +9255,271 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
  throw error;
  }
  }
+ /**
+ * Get the regional WebSocket host for real-time streaming
+ *
+ * Speechmatics RT uses a different host pattern: {region}.rt.speechmatics.com
+ */
+ getRegionalWsHost(region) {
+ const regionPrefix = region || "eu1";
+ return `${regionPrefix}.rt.speechmatics.com`;
+ }
+ /**
+ * Stream audio for real-time transcription
+ *
+ * Creates a WebSocket connection to the Speechmatics Real-Time API.
+ * Protocol: send StartRecognition config, then AddAudio binary frames,
+ * receive AddPartialTranscript/AddTranscript/EndOfUtterance messages.
+ *
+ * @param options - Streaming configuration
+ * @param callbacks - Event callbacks
+ * @returns StreamingSession for sending audio and closing
+ *
+ * @see https://docs.speechmatics.com/rt-api-ref
+ */
+ async transcribeStream(options, callbacks) {
+ this.validateConfig();
+ const sessionId = `speechmatics_${Date.now()}_${Math.random().toString(36).substring(7)}`;
+ const createdAt = /* @__PURE__ */ new Date();
+ const smOpts = options?.speechmaticsStreaming;
+ const region = smOpts?.region || this.config?.region;
+ const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost(region)}`);
+ const wsUrl = `${wsBase}/v2`;
+ let status = "connecting";
+ let recognitionStarted = false;
+ const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
+ const ws = new WebSocketImpl(wsUrl);
+ const language = smOpts?.language || options?.language || "en";
+ const transcriptionConfig = {
+ language,
+ enable_entities: smOpts?.enableEntities ?? options?.entityDetection ?? false,
+ enable_partials: smOpts?.enablePartials ?? options?.interimResults !== false,
+ operating_point: smOpts?.operatingPoint || OperatingPoint.enhanced,
+ ...smOpts?.maxDelay !== void 0 && { max_delay: smOpts.maxDelay },
+ ...smOpts?.maxDelayMode && {
+ max_delay_mode: smOpts.maxDelayMode
+ },
+ ...smOpts?.domain && { domain: smOpts.domain },
+ ...(options?.diarization || smOpts?.diarization === TranscriptionConfigDiarization.speaker) && {
+ diarization: TranscriptionConfigDiarization.speaker,
+ ...smOpts?.maxSpeakers !== void 0 && {
+ speaker_diarization_config: { max_speakers: smOpts.maxSpeakers }
+ }
+ },
+ ...(options?.customVocabulary?.length || smOpts?.additionalVocab?.length) && {
+ additional_vocab: (smOpts?.additionalVocab || options?.customVocabulary || []).map(
+ (term) => ({ content: term })
+ )
+ }
+ };
+ const startRecognition = {
+ message: "StartRecognition",
+ audio_format: {
+ type: "raw",
+ encoding: smOpts?.encoding || "pcm_s16le",
+ sample_rate: smOpts?.sampleRate || options?.sampleRate || 16e3
+ },
+ transcription_config: transcriptionConfig,
+ ...smOpts?.conversationConfig && {
+ conversation_config: {
+ end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
+ }
+ }
+ };
+ ws.onopen = () => {
+ status = "open";
+ const msg = JSON.stringify(startRecognition);
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: msg,
+ messageType: "StartRecognition"
+ });
+ }
+ ws.send(msg);
+ };
+ ws.onmessage = (event) => {
+ const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
+ try {
+ const data = JSON.parse(rawPayload);
+ const messageType = data.message;
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "incoming",
+ timestamp: Date.now(),
+ payload: rawPayload,
+ messageType
+ });
+ }
+ switch (messageType) {
+ case "RecognitionStarted": {
+ recognitionStarted = true;
+ callbacks?.onOpen?.();
+ callbacks?.onMetadata?.({
+ id: data.id,
+ languagePackInfo: data.language_pack_info
+ });
+ break;
+ }
+ case "AddPartialTranscript": {
+ const partial = data;
+ const words = this.resultsToWords(partial.results);
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text: partial.metadata.transcript,
+ isFinal: false,
+ words,
+ speaker: words[0]?.speaker,
+ confidence: partial.results[0]?.alternatives?.[0]?.confidence,
+ channel: partial.channel ? parseInt(partial.channel) : void 0
+ });
+ break;
+ }
+ case "AddTranscript": {
+ const final = data;
+ const words = this.resultsToWords(final.results);
+ callbacks?.onTranscript?.({
+ type: "transcript",
+ text: final.metadata.transcript,
+ isFinal: true,
+ words,
+ speaker: words[0]?.speaker,
+ confidence: final.results[0]?.alternatives?.[0]?.confidence,
+ channel: final.channel ? parseInt(final.channel) : void 0
+ });
+ if (options?.diarization || smOpts?.diarization === "speaker") {
+ const utterances = buildUtterancesFromWords(words);
+ for (const utterance of utterances) {
+ callbacks?.onUtterance?.(utterance);
+ }
+ }
+ break;
+ }
+ case "EndOfUtterance": {
+ break;
+ }
+ case "EndOfTranscript": {
+ callbacks?.onClose?.(1e3, "Transcription complete");
+ break;
+ }
+ case "Error": {
+ const err = data;
+ callbacks?.onError?.({
+ code: err.type || "SPEECHMATICS_ERROR",
+ message: err.reason || "Unknown error"
+ });
+ break;
+ }
+ case "Warning": {
+ const warn = data;
+ callbacks?.onMetadata?.({
+ warning: warn.type,
+ reason: warn.reason
+ });
+ break;
+ }
+ case "Info": {
+ callbacks?.onMetadata?.(data);
+ break;
+ }
+ case "AudioAdded":
+ case "ChannelAudioAdded":
+ break;
+ default:
+ callbacks?.onMetadata?.(data);
+ break;
+ }
+ } catch (error) {
+ callbacks?.onError?.({
+ code: "PARSE_ERROR",
+ message: `Failed to parse message: ${error}`
+ });
+ }
+ };
+ ws.onerror = () => {
+ callbacks?.onError?.({
+ code: "WEBSOCKET_ERROR",
+ message: "WebSocket error occurred"
+ });
+ };
+ ws.onclose = (event) => {
+ status = "closed";
+ callbacks?.onClose?.(event.code, event.reason);
+ };
+ await new Promise((resolve, reject) => {
+ const timeout = setTimeout(() => {
+ reject(new Error("WebSocket connection timeout"));
+ }, 1e4);
+ const checkReady = () => {
+ if (recognitionStarted) {
+ clearTimeout(timeout);
+ resolve();
+ } else if (status === "closed") {
+ clearTimeout(timeout);
+ reject(new Error("WebSocket connection failed"));
+ } else {
+ setTimeout(checkReady, 100);
+ }
+ };
+ checkReady();
+ });
+ return {
+ id: sessionId,
+ provider: this.name,
+ createdAt,
+ getStatus: () => status,
+ sendAudio: async (chunk) => {
+ if (status !== "open") {
+ throw new Error("Session is not open");
+ }
+ if (callbacks?.onRawMessage) {
+ const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
+ chunk.data.byteOffset,
+ chunk.data.byteOffset + chunk.data.byteLength
+ );
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: audioPayload,
+ messageType: "audio"
+ });
+ }
+ ws.send(chunk.data);
+ },
+ close: async () => {
+ if (status === "open") {
+ status = "closing";
+ const endMsg = JSON.stringify({ message: "EndOfStream", last_seq_no: 0 });
+ if (callbacks?.onRawMessage) {
+ callbacks.onRawMessage({
+ provider: this.name,
+ direction: "outgoing",
+ timestamp: Date.now(),
+ payload: endMsg,
+ messageType: "EndOfStream"
+ });
+ }
+ ws.send(endMsg);
+ }
+ }
+ };
+ }
+ /**
+ * Convert Speechmatics RecognitionResult[] to unified Word[]
+ */
+ resultsToWords(results) {
+ return results.filter((r) => r.type === "word").map((r) => ({
+ word: r.alternatives?.[0]?.content || "",
+ start: r.start_time,
+ end: r.end_time,
+ confidence: r.alternatives?.[0]?.confidence,
+ speaker: r.alternatives?.[0]?.speaker
+ }));
+ }
  /**
  * Normalize Speechmatics status to unified status
  * Uses generated JobDetailsStatus enum values
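A sketch of driving the new real-time path end to end. `transcribeStream()`, the callback names, the `speechmaticsStreaming` options, and the session methods come from the code above; the adapter import, the `initialize()` keys, and the PCM source are assumptions:

```typescript
const adapter = new SpeechmaticsAdapter();
adapter.initialize({ apiKey: process.env.SPEECHMATICS_API_KEY }); // assumed config key

const session = await adapter.transcribeStream(
  {
    language: "en",
    diarization: true,
    interimResults: true,
    speechmaticsStreaming: { region: "eu2", encoding: "pcm_s16le", sampleRate: 16000 }
  },
  {
    onTranscript: (event) => {
      if (event.isFinal) console.log("final:", event.text);
      else console.log("partial:", event.text);
    },
    onError: (err) => console.error(err.code, err.message),
    onClose: (code, reason) => console.log("closed", code, reason)
  }
);

// Feed raw PCM frames; `data` may be an ArrayBuffer or a typed-array view.
await session.sendAudio({ data: new Int16Array(16000).buffer });

// Sends EndOfStream; the server's EndOfTranscript then surfaces via onClose.
await session.close();
```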
@@ -9679,7 +9939,7 @@ var SonioxAdapter = class extends BaseAdapter {
  let messageType;
  try {
  const data = JSON.parse(rawPayload);
- const errorMessage = data.error_message || data.error;
+ const errorMessage = data.error_message;
  if (errorMessage) {
  messageType = "error";
  } else if (data.finished) {
@@ -10038,7 +10298,15 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  /**
  * Submit audio for transcription
  *
- * ElevenLabs batch is synchronous - the API returns the result directly.
+ * ElevenLabs batch is normally synchronous: the API returns results directly.
+ *
+ * **Webhook mode:** When `webhookUrl` is set (or `elevenlabs.webhook` is true),
+ * the request is processed asynchronously. ElevenLabs returns a 202 with a
+ * `request_id` and delivers results to a webhook configured in the ElevenLabs
+ * dashboard. The unified `webhookUrl` acts as an intent flag to enable async
+ * mode — the actual delivery destination must be pre-configured in your
+ * ElevenLabs dashboard. Use `elevenlabs.webhook_id` to target a specific
+ * webhook endpoint.
  */
  async transcribe(audio, options) {
  this.validateConfig();
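A sketch of the async path documented above; the acknowledgment fields mirror the queued response built later in this hunk series (`id`, `status: "queued"`, `tracking.requestId`), while the adapter import, audio shape, and webhook ID are assumptions:

```typescript
const adapter = new ElevenLabsAdapter();
adapter.initialize({ apiKey: process.env.ELEVENLABS_API_KEY }); // assumed config key

const ack = await adapter.transcribe(
  { url: "https://example.com/interview.mp3" }, // assumed input shape
  {
    // Acts only as an intent flag; delivery goes to the webhook configured
    // in the ElevenLabs dashboard, optionally selected via webhook_id.
    webhookUrl: "https://example.com/placeholder",
    elevenlabs: { webhook_id: "wh_123" }
  }
);

if (ack.success && ack.data?.status === "queued") {
  // Persist the request ID so the dashboard webhook's payload can be matched back.
  console.log("queued:", ack.tracking?.requestId);
}
```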
@@ -10061,6 +10329,11 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  }
  };
  }
+ const elevenlabsOpts = options?.elevenlabs;
+ const useWebhook = options?.webhookUrl || elevenlabsOpts?.webhook;
+ if (useWebhook) {
+ formData.append("webhook", "true");
+ }
  if (options?.language) {
  formData.append("language_code", options.language);
  }
@@ -10079,7 +10352,6 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  if (options?.entityDetection) {
  formData.append("entity_detection", "all");
  }
- const elevenlabsOpts = options?.elevenlabs;
  if (elevenlabsOpts) {
  for (const [key, value] of Object.entries(elevenlabsOpts)) {
  if (value === void 0 || value === null) continue;
@@ -10102,6 +10374,22 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  "Content-Type": "multipart/form-data"
  }
  });
+ if (useWebhook) {
+ const ack = response.data;
+ return {
+ success: true,
+ provider: this.name,
+ data: {
+ id: ack.request_id || ack.transcription_id || `elevenlabs_${Date.now()}`,
+ text: "",
+ status: "queued"
+ },
+ tracking: {
+ requestId: ack.request_id
+ },
+ raw: response.data
+ };
+ }
  return this.normalizeResponse(response.data);
  } catch (error) {
  return this.createErrorResponse(error);
@@ -10194,20 +10482,9 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  ws.onmessage = (event) => {
  receivedData = true;
  const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
- let messageType;
  try {
  const data = JSON.parse(rawPayload);
- if (data.error) {
- messageType = "error";
- } else if (data.message_type === "session_started") {
- messageType = "session_started";
- } else if (data.message_type === "partial_transcript") {
- messageType = "partial_transcript";
- } else if (data.message_type === "committed_transcript") {
- messageType = "committed_transcript";
- } else if (data.message_type === "committed_transcript_with_timestamps") {
- messageType = "committed_transcript_with_timestamps";
- }
+ const messageType = "error" in data ? "error" : data.message_type;
  if (callbacks?.onRawMessage) {
  callbacks.onRawMessage({
  provider: this.name,
@@ -10217,50 +10494,62 @@ var ElevenLabsAdapter = class extends BaseAdapter {
  messageType
  });
  }
- if (data.error) {
+ if ("error" in data) {
  callbacks?.onError?.({
- code: data.error_code?.toString() || "STREAM_ERROR",
+ code: data.message_type || "STREAM_ERROR",
  message: data.error
  });
  return;
  }
- if (data.message_type === "session_started") {
- return;
- }
- if (data.message_type === "partial_transcript") {
- const streamEvent = {
- type: "transcript",
- text: data.text || "",
- isFinal: false,
- confidence: void 0,
- language: data.language_code
- };
- callbacks?.onTranscript?.(streamEvent);
- return;
- }
- if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
- const words = data.words ? data.words.map((w) => ({
- word: w.text || "",
- start: w.start || 0,
- end: w.end || 0,
- confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
- speaker: w.speaker_id
- })) : [];
- const streamEvent = {
- type: "transcript",
- text: data.text || "",
- isFinal: true,
- words: words.length > 0 ? words : void 0,
- speaker: words[0]?.speaker,
- language: data.language_code,
- confidence: void 0
- };
- callbacks?.onTranscript?.(streamEvent);
- if (options?.diarization && words.length > 0) {
- const utterances = buildUtterancesFromWords(words);
- for (const utterance of utterances) {
- callbacks?.onUtterance?.(utterance);
+ switch (data.message_type) {
+ case "session_started":
+ break;
+ case "partial_transcript": {
+ const streamEvent = {
+ type: "transcript",
+ text: data.text || "",
+ isFinal: false,
+ confidence: void 0
+ };
+ callbacks?.onTranscript?.(streamEvent);
+ break;
+ }
+ case "committed_transcript": {
+ const streamEvent = {
+ type: "transcript",
+ text: data.text || "",
+ isFinal: true,
+ confidence: void 0
+ };
+ callbacks?.onTranscript?.(streamEvent);
+ break;
+ }
+ case "committed_transcript_with_timestamps": {
+ const tsData = data;
+ const words = tsData.words ? tsData.words.map((w) => ({
+ word: w.text || "",
+ start: w.start || 0,
+ end: w.end || 0,
+ confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
+ speaker: w.speaker_id
+ })) : [];
+ const streamEvent = {
+ type: "transcript",
+ text: tsData.text || "",
+ isFinal: true,
+ words: words.length > 0 ? words : void 0,
+ speaker: words[0]?.speaker,
+ language: tsData.language_code,
+ confidence: void 0
+ };
+ callbacks?.onTranscript?.(streamEvent);
+ if (options?.diarization && words.length > 0) {
+ const utterances = buildUtterancesFromWords(words);
+ for (const utterance of utterances) {
+ callbacks?.onUtterance?.(utterance);
+ }
  }
+ break;
  }
  }
  } catch (error) {