voice-router-dev 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +36 -0
- package/dist/field-configs.d.mts +1 -1
- package/dist/field-configs.d.ts +1 -1
- package/dist/index.d.mts +181 -153
- package/dist/index.d.ts +181 -153
- package/dist/index.js +377 -78
- package/dist/index.mjs +375 -78
- package/package.json +1 -1
- package/dist/{field-configs-CH0lgAe8.d.mts → field-configs-FbtCPxzs.d.mts} +60 -60
- package/dist/{field-configs-CH0lgAe8.d.ts → field-configs-FbtCPxzs.d.ts} +60 -60
package/dist/index.mjs
CHANGED
@@ -2591,6 +2591,12 @@ var AssemblyAISampleRate = {
   rate48000: 48e3
 };
 var AssemblyAIStatus = TranscriptStatus;
+var AssemblyAIRegion = {
+  /** United States (default) */
+  us: "us",
+  /** European Union — data never leaves the EU */
+  eu: "eu"
+};
 var GladiaStatus = TranscriptionControllerListV2StatusItem;
 var DeepgramStatus = V1ProjectsProjectIdRequestsGetParametersStatus;
 var SpeechmaticsRegion = {
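`AssemblyAIRegion` is also added to the public exports (see the export hunk further down). A minimal usage sketch, assuming the AssemblyAI adapter follows the same `initialize()` pattern as the other adapters and accepts a `region` field in its config (both are assumptions, not confirmed by this diff):

```ts
import { AssemblyAIAdapter, AssemblyAIRegion } from "voice-router-dev";

// Hypothetical EU pinning: the `region` config key and the AssemblyAIAdapter
// export name are assumed by analogy with the other adapters in this package.
const adapter = new AssemblyAIAdapter();
adapter.initialize({
  apiKey: process.env.ASSEMBLYAI_API_KEY!,
  region: AssemblyAIRegion.eu, // "eu": data never leaves the EU
});
```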
@@ -6566,9 +6572,13 @@ var DeepgramAdapter = class extends BaseAdapter {
    * Submit audio for transcription
    *
    * Sends audio to Deepgram API for transcription. Deepgram normally processes
-   * synchronously and returns results immediately.
-   *
-   *
+   * synchronously and returns results immediately.
+   *
+   * **Callback mode:** When `webhookUrl` is set, Deepgram returns immediately
+   * with a `request_id` (status `"queued"`). The full transcript is POSTed to
+   * the webhook URL — this is the primary delivery mechanism. `getTranscript()`
+   * can attempt to retrieve the result later via request history, but that
+   * endpoint is best-effort and not a guaranteed durable store.
    *
    * @param audio - Audio input (URL or file buffer)
    * @param options - Transcription options
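A sketch of the callback flow this doc block describes: submit with `webhookUrl`, receive an immediate `"queued"` acknowledgement carrying the `request_id`, and rely on the webhook POST for the transcript itself. The audio-input shape and the exact fields on the acknowledgement are assumptions here:

```ts
const adapter = new DeepgramAdapter();
adapter.initialize({ apiKey: process.env.DEEPGRAM_API_KEY! });

// Callback mode: Deepgram answers right away; the finished transcript is
// POSTed to webhookUrl (the primary delivery mechanism per the doc above).
const submitted = await adapter.transcribe(
  { url: "https://example.com/call-recording.wav" }, // assumed input shape
  { webhookUrl: "https://api.example.com/hooks/deepgram" }
);

if (submitted.success && submitted.data?.status === "queued") {
  // Persist this id so the webhook payload can be correlated later.
  console.log("queued request:", submitted.data.id);
}
```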
@@ -6678,30 +6688,22 @@ var DeepgramAdapter = class extends BaseAdapter {
     }
   }
   /**
-   * Get transcription result by ID
+   * Get transcription result by ID (best-effort)
    *
-   * Retrieves a previous transcription from Deepgram's request history.
+   * Retrieves a previous transcription from Deepgram's request history API.
+   * Requires `projectId` to be set during initialization.
    *
-   *
-   *
+   * **Important:** Deepgram's request history is best-effort. Requests may
+   * expire or be unavailable depending on your plan and retention settings.
+   * This is NOT a durable transcript store — for reliable retrieval, use
+   * callback mode (`webhookUrl`) and persist the webhook payload yourself.
    *
-   *
-   *
+   * The response field on the request history entry is cast to
+   * `ListenV1Response` — this appears to work in practice but is not
+   * explicitly documented by Deepgram as a guaranteed contract.
    *
-   * @
-   *
-   * const adapter = new DeepgramAdapter()
-   * adapter.initialize({
-   *   apiKey: process.env.DEEPGRAM_API_KEY,
-   *   projectId: process.env.DEEPGRAM_PROJECT_ID
-   * })
-   *
-   * const result = await adapter.getTranscript('abc123-request-id')
-   * if (result.success) {
-   *   console.log(result.data?.text)
-   *   console.log(result.data?.words)
-   * }
-   * ```
+   * @param transcriptId - Request ID from a previous transcription
+   * @returns Transcript response if still available in request history
    *
    * @see https://developers.deepgram.com/reference/get-request
    */
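The inline example was dropped from the JSDoc; the same flow still applies, now with the best-effort caveat in mind:

```ts
const adapter = new DeepgramAdapter();
adapter.initialize({
  apiKey: process.env.DEEPGRAM_API_KEY!,
  projectId: process.env.DEEPGRAM_PROJECT_ID!, // required for request-history lookups
});

// Best-effort: may fail if the request has aged out of Deepgram's history.
const result = await adapter.getTranscript("abc123-request-id");
if (result.success) {
  console.log(result.data?.text);
  console.log(result.data?.words);
}
```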
@@ -8784,8 +8786,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
     super(...arguments);
     this.name = "speechmatics";
     this.capabilities = {
-      streaming: false,
-      // Batch only (streaming available via separate WebSocket API)
+      streaming: true,
       diarization: true,
       wordTimestamps: true,
       languageDetection: false,
@@ -9031,6 +9032,271 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
       throw error;
     }
   }
+  /**
+   * Get the regional WebSocket host for real-time streaming
+   *
+   * Speechmatics RT uses a different host pattern: {region}.rt.speechmatics.com
+   */
+  getRegionalWsHost(region) {
+    const regionPrefix = region || "eu1";
+    return `${regionPrefix}.rt.speechmatics.com`;
+  }
+  /**
+   * Stream audio for real-time transcription
+   *
+   * Creates a WebSocket connection to the Speechmatics Real-Time API.
+   * Protocol: send StartRecognition config, then AddAudio binary frames,
+   * receive AddPartialTranscript/AddTranscript/EndOfUtterance messages.
+   *
+   * @param options - Streaming configuration
+   * @param callbacks - Event callbacks
+   * @returns StreamingSession for sending audio and closing
+   *
+   * @see https://docs.speechmatics.com/rt-api-ref
+   */
+  async transcribeStream(options, callbacks) {
+    this.validateConfig();
+    const sessionId = `speechmatics_${Date.now()}_${Math.random().toString(36).substring(7)}`;
+    const createdAt = /* @__PURE__ */ new Date();
+    const smOpts = options?.speechmaticsStreaming;
+    const region = smOpts?.region || this.config?.region;
+    const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost(region)}`);
+    const wsUrl = `${wsBase}/v2`;
+    let status = "connecting";
+    let recognitionStarted = false;
+    const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
+    const ws = new WebSocketImpl(wsUrl);
+    const language = smOpts?.language || options?.language || "en";
+    const transcriptionConfig = {
+      language,
+      enable_entities: smOpts?.enableEntities ?? options?.entityDetection ?? false,
+      enable_partials: smOpts?.enablePartials ?? options?.interimResults !== false,
+      operating_point: smOpts?.operatingPoint || OperatingPoint.enhanced,
+      ...smOpts?.maxDelay !== void 0 && { max_delay: smOpts.maxDelay },
+      ...smOpts?.maxDelayMode && {
+        max_delay_mode: smOpts.maxDelayMode
+      },
+      ...smOpts?.domain && { domain: smOpts.domain },
+      ...(options?.diarization || smOpts?.diarization === TranscriptionConfigDiarization.speaker) && {
+        diarization: TranscriptionConfigDiarization.speaker,
+        ...smOpts?.maxSpeakers !== void 0 && {
+          speaker_diarization_config: { max_speakers: smOpts.maxSpeakers }
+        }
+      },
+      ...(options?.customVocabulary?.length || smOpts?.additionalVocab?.length) && {
+        additional_vocab: (smOpts?.additionalVocab || options?.customVocabulary || []).map(
+          (term) => ({ content: term })
+        )
+      }
+    };
+    const startRecognition = {
+      message: "StartRecognition",
+      audio_format: {
+        type: "raw",
+        encoding: smOpts?.encoding || "pcm_s16le",
+        sample_rate: smOpts?.sampleRate || options?.sampleRate || 16e3
+      },
+      transcription_config: transcriptionConfig,
+      ...smOpts?.conversationConfig && {
+        conversation_config: {
+          end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
+        }
+      }
+    };
+    ws.onopen = () => {
+      status = "open";
+      const msg = JSON.stringify(startRecognition);
+      if (callbacks?.onRawMessage) {
+        callbacks.onRawMessage({
+          provider: this.name,
+          direction: "outgoing",
+          timestamp: Date.now(),
+          payload: msg,
+          messageType: "StartRecognition"
+        });
+      }
+      ws.send(msg);
+    };
+    ws.onmessage = (event) => {
+      const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
+      try {
+        const data = JSON.parse(rawPayload);
+        const messageType = data.message;
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: this.name,
+            direction: "incoming",
+            timestamp: Date.now(),
+            payload: rawPayload,
+            messageType
+          });
+        }
+        switch (messageType) {
+          case "RecognitionStarted": {
+            recognitionStarted = true;
+            callbacks?.onOpen?.();
+            callbacks?.onMetadata?.({
+              id: data.id,
+              languagePackInfo: data.language_pack_info
+            });
+            break;
+          }
+          case "AddPartialTranscript": {
+            const partial = data;
+            const words = this.resultsToWords(partial.results);
+            callbacks?.onTranscript?.({
+              type: "transcript",
+              text: partial.metadata.transcript,
+              isFinal: false,
+              words,
+              speaker: words[0]?.speaker,
+              confidence: partial.results[0]?.alternatives?.[0]?.confidence,
+              channel: partial.channel ? parseInt(partial.channel) : void 0
+            });
+            break;
+          }
+          case "AddTranscript": {
+            const final = data;
+            const words = this.resultsToWords(final.results);
+            callbacks?.onTranscript?.({
+              type: "transcript",
+              text: final.metadata.transcript,
+              isFinal: true,
+              words,
+              speaker: words[0]?.speaker,
+              confidence: final.results[0]?.alternatives?.[0]?.confidence,
+              channel: final.channel ? parseInt(final.channel) : void 0
+            });
+            if (options?.diarization || smOpts?.diarization === "speaker") {
+              const utterances = buildUtterancesFromWords(words);
+              for (const utterance of utterances) {
+                callbacks?.onUtterance?.(utterance);
+              }
+            }
+            break;
+          }
+          case "EndOfUtterance": {
+            break;
+          }
+          case "EndOfTranscript": {
+            callbacks?.onClose?.(1e3, "Transcription complete");
+            break;
+          }
+          case "Error": {
+            const err = data;
+            callbacks?.onError?.({
+              code: err.type || "SPEECHMATICS_ERROR",
+              message: err.reason || "Unknown error"
+            });
+            break;
+          }
+          case "Warning": {
+            const warn = data;
+            callbacks?.onMetadata?.({
+              warning: warn.type,
+              reason: warn.reason
+            });
+            break;
+          }
+          case "Info": {
+            callbacks?.onMetadata?.(data);
+            break;
+          }
+          case "AudioAdded":
+          case "ChannelAudioAdded":
+            break;
+          default:
+            callbacks?.onMetadata?.(data);
+            break;
+        }
+      } catch (error) {
+        callbacks?.onError?.({
+          code: "PARSE_ERROR",
+          message: `Failed to parse message: ${error}`
+        });
+      }
+    };
+    ws.onerror = () => {
+      callbacks?.onError?.({
+        code: "WEBSOCKET_ERROR",
+        message: "WebSocket error occurred"
+      });
+    };
+    ws.onclose = (event) => {
+      status = "closed";
+      callbacks?.onClose?.(event.code, event.reason);
+    };
+    await new Promise((resolve, reject) => {
+      const timeout = setTimeout(() => {
+        reject(new Error("WebSocket connection timeout"));
+      }, 1e4);
+      const checkReady = () => {
+        if (recognitionStarted) {
+          clearTimeout(timeout);
+          resolve();
+        } else if (status === "closed") {
+          clearTimeout(timeout);
+          reject(new Error("WebSocket connection failed"));
+        } else {
+          setTimeout(checkReady, 100);
+        }
+      };
+      checkReady();
+    });
+    return {
+      id: sessionId,
+      provider: this.name,
+      createdAt,
+      getStatus: () => status,
+      sendAudio: async (chunk) => {
+        if (status !== "open") {
+          throw new Error("Session is not open");
+        }
+        if (callbacks?.onRawMessage) {
+          const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
+            chunk.data.byteOffset,
+            chunk.data.byteOffset + chunk.data.byteLength
+          );
+          callbacks.onRawMessage({
+            provider: this.name,
+            direction: "outgoing",
+            timestamp: Date.now(),
+            payload: audioPayload,
+            messageType: "audio"
+          });
+        }
+        ws.send(chunk.data);
+      },
+      close: async () => {
+        if (status === "open") {
+          status = "closing";
+          const endMsg = JSON.stringify({ message: "EndOfStream", last_seq_no: 0 });
+          if (callbacks?.onRawMessage) {
+            callbacks.onRawMessage({
+              provider: this.name,
+              direction: "outgoing",
+              timestamp: Date.now(),
+              payload: endMsg,
+              messageType: "EndOfStream"
+            });
+          }
+          ws.send(endMsg);
+        }
+      }
+    };
+  }
+  /**
+   * Convert Speechmatics RecognitionResult[] to unified Word[]
+   */
+  resultsToWords(results) {
+    return results.filter((r) => r.type === "word").map((r) => ({
+      word: r.alternatives?.[0]?.content || "",
+      start: r.start_time,
+      end: r.end_time,
+      confidence: r.alternatives?.[0]?.confidence,
+      speaker: r.alternatives?.[0]?.speaker
+    }));
+  }
   /**
    * Normalize Speechmatics status to unified status
    * Uses generated JobDetailsStatus enum values
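A usage sketch for the new Speechmatics streaming path, using the session and callback surface visible in the added code (`transcribeStream`, `sendAudio`, `close`, `onTranscript`, `onUtterance`); the audio source, chunk shape, and utterance fields are assumptions:

```ts
const adapter = new SpeechmaticsAdapter();
adapter.initialize({ apiKey: process.env.SPEECHMATICS_API_KEY! });

const session = await adapter.transcribeStream(
  {
    language: "en",
    sampleRate: 16000,    // matches the pcm_s16le default above
    diarization: true,
    interimResults: true, // maps to enable_partials
    speechmaticsStreaming: { region: "eu1" },
  },
  {
    onTranscript: (event) => {
      if (event.isFinal) console.log("final:", event.text);
    },
    onUtterance: (u) => console.log("utterance:", u), // shape assumed
    onError: (err) => console.error(err.code, err.message),
    onClose: (code, reason) => console.log("closed", code, reason),
  }
);

// Send raw PCM chunks as they arrive, then signal EndOfStream via close().
for await (const chunk of pcmChunks()) { // pcmChunks() is a hypothetical audio source
  await session.sendAudio({ data: chunk });
}
await session.close();
```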
@@ -9450,7 +9716,7 @@ var SonioxAdapter = class extends BaseAdapter {
       let messageType;
       try {
         const data = JSON.parse(rawPayload);
-        const errorMessage = data.error_message
+        const errorMessage = data.error_message;
         if (errorMessage) {
           messageType = "error";
         } else if (data.finished) {
@@ -9809,7 +10075,15 @@ var ElevenLabsAdapter = class extends BaseAdapter {
   /**
    * Submit audio for transcription
    *
-   * ElevenLabs batch is synchronous
+   * ElevenLabs batch is normally synchronous — the API returns results directly.
+   *
+   * **Webhook mode:** When `webhookUrl` is set (or `elevenlabs.webhook` is true),
+   * the request is processed asynchronously. ElevenLabs returns a 202 with a
+   * `request_id` and delivers results to a webhook configured in the ElevenLabs
+   * dashboard. The unified `webhookUrl` acts as an intent flag to enable async
+   * mode — the actual delivery destination must be pre-configured in your
+   * ElevenLabs dashboard. Use `elevenlabs.webhook_id` to target a specific
+   * webhook endpoint.
    */
   async transcribe(audio, options) {
     this.validateConfig();
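A sketch of the async submission path described above. `webhookUrl` only flips the request into webhook mode; the delivery endpoint itself must already be configured in the ElevenLabs dashboard. The audio-input shape and the `webhook_id` value are placeholders:

```ts
const adapter = new ElevenLabsAdapter();
adapter.initialize({ apiKey: process.env.ELEVENLABS_API_KEY! });

const ack = await adapter.transcribe(
  { url: "https://example.com/podcast-episode.mp3" }, // assumed input shape
  {
    webhookUrl: "https://api.example.com/hooks/elevenlabs", // intent flag only
    elevenlabs: { webhook_id: "wh_123" },                   // placeholder id
  }
);
// `ack` is the immediate "queued" acknowledgement, not the transcript
// (see the response handling added further down).
console.log(ack.data?.id, ack.data?.status);
```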
@@ -9832,6 +10106,11 @@ var ElevenLabsAdapter = class extends BaseAdapter {
         }
       };
     }
+    const elevenlabsOpts = options?.elevenlabs;
+    const useWebhook = options?.webhookUrl || elevenlabsOpts?.webhook;
+    if (useWebhook) {
+      formData.append("webhook", "true");
+    }
     if (options?.language) {
       formData.append("language_code", options.language);
     }
@@ -9850,7 +10129,6 @@ var ElevenLabsAdapter = class extends BaseAdapter {
     if (options?.entityDetection) {
       formData.append("entity_detection", "all");
     }
-    const elevenlabsOpts = options?.elevenlabs;
     if (elevenlabsOpts) {
       for (const [key, value] of Object.entries(elevenlabsOpts)) {
         if (value === void 0 || value === null) continue;
@@ -9873,6 +10151,22 @@ var ElevenLabsAdapter = class extends BaseAdapter {
          "Content-Type": "multipart/form-data"
        }
      });
+      if (useWebhook) {
+        const ack = response.data;
+        return {
+          success: true,
+          provider: this.name,
+          data: {
+            id: ack.request_id || ack.transcription_id || `elevenlabs_${Date.now()}`,
+            text: "",
+            status: "queued"
+          },
+          tracking: {
+            requestId: ack.request_id
+          },
+          raw: response.data
+        };
+      }
      return this.normalizeResponse(response.data);
    } catch (error) {
      return this.createErrorResponse(error);
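For reference, the acknowledgement built in this branch has roughly the following shape (field names taken from the added code; the library's full result type carries more fields than shown):

```ts
// Rough shape of the webhook-mode acknowledgement returned above.
interface ElevenLabsQueuedAck {
  success: true;
  provider: "elevenlabs";
  data: {
    id: string;   // request_id, transcription_id, or a generated fallback
    text: "";     // empty until the webhook delivers the transcript
    status: "queued";
  };
  tracking: { requestId?: string };
  raw: unknown;   // the raw ElevenLabs response body
}
```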
@@ -9965,20 +10259,9 @@ var ElevenLabsAdapter = class extends BaseAdapter {
     ws.onmessage = (event) => {
       receivedData = true;
       const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
-      let messageType;
       try {
         const data = JSON.parse(rawPayload);
-        if (data.error) {
-          messageType = "error";
-        } else if (data.message_type === "session_started") {
-          messageType = "session_started";
-        } else if (data.message_type === "partial_transcript") {
-          messageType = "partial_transcript";
-        } else if (data.message_type === "committed_transcript") {
-          messageType = "committed_transcript";
-        } else if (data.message_type === "committed_transcript_with_timestamps") {
-          messageType = "committed_transcript_with_timestamps";
-        }
+        const messageType = "error" in data ? "error" : data.message_type;
         if (callbacks?.onRawMessage) {
           callbacks.onRawMessage({
             provider: this.name,
@@ -9988,50 +10271,62 @@ var ElevenLabsAdapter = class extends BaseAdapter {
             messageType
           });
         }
-        if (data.error) {
+        if ("error" in data) {
         callbacks?.onError?.({
-          code: data.
+          code: data.message_type || "STREAM_ERROR",
           message: data.error
         });
         return;
       }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        switch (data.message_type) {
+          case "session_started":
+            break;
+          case "partial_transcript": {
+            const streamEvent = {
+              type: "transcript",
+              text: data.text || "",
+              isFinal: false,
+              confidence: void 0
+            };
+            callbacks?.onTranscript?.(streamEvent);
+            break;
+          }
+          case "committed_transcript": {
+            const streamEvent = {
+              type: "transcript",
+              text: data.text || "",
+              isFinal: true,
+              confidence: void 0
+            };
+            callbacks?.onTranscript?.(streamEvent);
+            break;
+          }
+          case "committed_transcript_with_timestamps": {
+            const tsData = data;
+            const words = tsData.words ? tsData.words.map((w) => ({
+              word: w.text || "",
+              start: w.start || 0,
+              end: w.end || 0,
+              confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
+              speaker: w.speaker_id
+            })) : [];
+            const streamEvent = {
+              type: "transcript",
+              text: tsData.text || "",
+              isFinal: true,
+              words: words.length > 0 ? words : void 0,
+              speaker: words[0]?.speaker,
+              language: tsData.language_code,
+              confidence: void 0
+            };
+            callbacks?.onTranscript?.(streamEvent);
+            if (options?.diarization && words.length > 0) {
+              const utterances = buildUtterancesFromWords(words);
+              for (const utterance of utterances) {
+                callbacks?.onUtterance?.(utterance);
+              }
             }
+            break;
           }
         }
       } catch (error) {
@@ -39233,6 +39528,7 @@ export {
   AssemblyAILanguage,
   AssemblyAILanguageCodes,
   AssemblyAIListFilterSchema,
+  AssemblyAIRegion,
   AssemblyAISampleRate,
   AssemblyAISpeechModel,
   AssemblyAIStatus,
@@ -39283,6 +39579,7 @@ export {
   ElevenLabsLanguageCodes,
   ElevenLabsLanguageLabels,
   ElevenLabsLanguages,
+  ElevenLabsRegion,
   schema_exports8 as ElevenLabsTypes,
   elevenLabsSpeechToTextAPI_zod_exports as ElevenLabsZodSchemas,
   GladiaAdapter,
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "voice-router-dev",
-  "version": "0.9.1",
+  "version": "0.9.3",
   "description": "Universal speech-to-text router for Gladia, AssemblyAI, Deepgram, Azure, OpenAI Whisper, Speechmatics, Soniox, and ElevenLabs",
   "main": "dist/index.js",
   "module": "dist/index.mjs",