voice-router-dev 0.9.1 → 0.9.2

package/dist/index.mjs CHANGED
@@ -6566,9 +6566,13 @@ var DeepgramAdapter = class extends BaseAdapter {
    * Submit audio for transcription
    *
    * Sends audio to Deepgram API for transcription. Deepgram normally processes
-   * synchronously and returns results immediately. When `webhookUrl` is set,
-   * Deepgram can instead return an async callback acknowledgment containing a
-   * request ID.
+   * synchronously and returns results immediately.
+   *
+   * **Callback mode:** When `webhookUrl` is set, Deepgram returns immediately
+   * with a `request_id` (status `"queued"`). The full transcript is POSTed to
+   * the webhook URL — this is the primary delivery mechanism. `getTranscript()`
+   * can attempt to retrieve the result later via request history, but that
+   * endpoint is best-effort and not a guaranteed durable store.
    *
    * @param audio - Audio input (URL or file buffer)
    * @param options - Transcription options
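
The callback flow documented above, as a minimal sketch. The `initialize` fields come from the `@example` removed later in this diff; the audio-by-URL input shape and the `queued`/`tracking` result fields mirror patterns elsewhere in this version and are assumptions, not a documented contract.

```typescript
import { DeepgramAdapter } from 'voice-router-dev' // assumed export

const adapter = new DeepgramAdapter()
adapter.initialize({
  apiKey: process.env.DEEPGRAM_API_KEY,
  projectId: process.env.DEEPGRAM_PROJECT_ID
})

const submitted = await adapter.transcribe(
  { url: 'https://example.com/recordings/call.wav' }, // hypothetical input shape
  { webhookUrl: 'https://api.example.com/hooks/deepgram' }
)

if (submitted.success && submitted.data?.status === 'queued') {
  // No transcript yet: Deepgram POSTs it to webhookUrl. Persist the request
  // ID now so the webhook delivery can be correlated with this submission.
  console.log('queued request', submitted.data.id)
}
```
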
@@ -6678,30 +6682,22 @@ var DeepgramAdapter = class extends BaseAdapter {
     }
   }
   /**
-   * Get transcription result by ID
-   *
-   * Retrieves a previous transcription from Deepgram's request history.
+   * Get transcription result by ID (best-effort)
    *
-   * Unlike the list endpoint, getting a single request DOES include the full
-   * transcript response. Requires `projectId` to be set during initialization.
+   * Retrieves a previous transcription from Deepgram's request history API.
+   * Requires `projectId` to be set during initialization.
    *
-   * @param transcriptId - Request ID from a previous transcription
-   * @returns Full transcript response including text, words, and metadata
+   * **Important:** Deepgram's request history is best-effort. Requests may
+   * expire or be unavailable depending on your plan and retention settings.
+   * This is NOT a durable transcript store — for reliable retrieval, use
+   * callback mode (`webhookUrl`) and persist the webhook payload yourself.
    *
-   * @example Get a transcript by request ID
-   * ```typescript
-   * const adapter = new DeepgramAdapter()
-   * adapter.initialize({
-   *   apiKey: process.env.DEEPGRAM_API_KEY,
-   *   projectId: process.env.DEEPGRAM_PROJECT_ID
-   * })
+   * The `response` field on the request history entry is cast to
+   * `ListenV1Response` — this appears to work in practice but is not
+   * explicitly documented by Deepgram as a guaranteed contract.
    *
-   * const result = await adapter.getTranscript('abc123-request-id')
-   * if (result.success) {
-   *   console.log(result.data?.text)
-   *   console.log(result.data?.words)
-   * }
-   * ```
+   * @param transcriptId - Request ID from a previous transcription
+   * @returns Transcript response if still available in request history
    *
    * @see https://developers.deepgram.com/reference/get-request
    */
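
With the `@example` gone from the JSDoc, retrieval stays available but is explicitly best-effort; this sketch reuses the exact calls from the removed example.

```typescript
// Best-effort retrieval; if the request has aged out of Deepgram's retention
// window, fall back to the transcript your webhook handler persisted.
const result = await adapter.getTranscript('abc123-request-id')
if (result.success) {
  console.log(result.data?.text)
  console.log(result.data?.words)
}
```
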
@@ -8784,8 +8780,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
     super(...arguments);
     this.name = "speechmatics";
     this.capabilities = {
-      streaming: false,
-      // Batch only (streaming available via separate WebSocket API)
+      streaming: true,
       diarization: true,
       wordTimestamps: true,
       languageDetection: false,
@@ -9031,6 +9026,271 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
       throw error;
     }
   }
+  /**
+   * Get the regional WebSocket host for real-time streaming
+   *
+   * Speechmatics RT uses a different host pattern: {region}.rt.speechmatics.com
+   */
+  getRegionalWsHost(region) {
+    const regionPrefix = region || "eu1";
+    return `${regionPrefix}.rt.speechmatics.com`;
+  }
+  /**
+   * Stream audio for real-time transcription
+   *
+   * Creates a WebSocket connection to the Speechmatics Real-Time API.
+   * Protocol: send StartRecognition config, then AddAudio binary frames,
+   * receive AddPartialTranscript/AddTranscript/EndOfUtterance messages.
+   *
+   * @param options - Streaming configuration
+   * @param callbacks - Event callbacks
+   * @returns StreamingSession for sending audio and closing
+   *
+   * @see https://docs.speechmatics.com/rt-api-ref
+   */
+  async transcribeStream(options, callbacks) {
+    this.validateConfig();
+    const sessionId = `speechmatics_${Date.now()}_${Math.random().toString(36).substring(7)}`;
+    const createdAt = /* @__PURE__ */ new Date();
+    const smOpts = options?.speechmaticsStreaming;
+    const region = smOpts?.region || this.config?.region;
+    const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost(region)}`);
+    const wsUrl = `${wsBase}/v2`;
+    let status = "connecting";
+    let recognitionStarted = false;
+    const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
+    const ws = new WebSocketImpl(wsUrl);
+    const language = smOpts?.language || options?.language || "en";
+    const transcriptionConfig = {
+      language,
+      enable_entities: smOpts?.enableEntities ?? options?.entityDetection ?? false,
+      enable_partials: smOpts?.enablePartials ?? options?.interimResults !== false,
+      operating_point: smOpts?.operatingPoint || OperatingPoint.enhanced,
+      ...smOpts?.maxDelay !== void 0 && { max_delay: smOpts.maxDelay },
+      ...smOpts?.maxDelayMode && {
+        max_delay_mode: smOpts.maxDelayMode
+      },
+      ...smOpts?.domain && { domain: smOpts.domain },
+      ...(options?.diarization || smOpts?.diarization === TranscriptionConfigDiarization.speaker) && {
+        diarization: TranscriptionConfigDiarization.speaker,
+        ...smOpts?.maxSpeakers !== void 0 && {
+          speaker_diarization_config: { max_speakers: smOpts.maxSpeakers }
+        }
+      },
+      ...(options?.customVocabulary?.length || smOpts?.additionalVocab?.length) && {
+        additional_vocab: (smOpts?.additionalVocab || options?.customVocabulary || []).map(
+          (term) => ({ content: term })
+        )
+      }
+    };
+    const startRecognition = {
+      message: "StartRecognition",
+      audio_format: {
+        type: "raw",
+        encoding: smOpts?.encoding || "pcm_s16le",
+        sample_rate: smOpts?.sampleRate || options?.sampleRate || 16e3
+      },
+      transcription_config: transcriptionConfig,
+      ...smOpts?.conversationConfig && {
+        conversation_config: {
+          end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
+        }
+      }
+    };
+    ws.onopen = () => {
+      status = "open";
+      const msg = JSON.stringify(startRecognition);
+      if (callbacks?.onRawMessage) {
+        callbacks.onRawMessage({
+          provider: this.name,
+          direction: "outgoing",
+          timestamp: Date.now(),
+          payload: msg,
+          messageType: "StartRecognition"
+        });
+      }
+      ws.send(msg);
+    };
+    ws.onmessage = (event) => {
+      const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
+      try {
+        const data = JSON.parse(rawPayload);
+        const messageType = data.message;
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: this.name,
+            direction: "incoming",
+            timestamp: Date.now(),
+            payload: rawPayload,
+            messageType
+          });
+        }
+        switch (messageType) {
+          case "RecognitionStarted": {
+            recognitionStarted = true;
+            callbacks?.onOpen?.();
+            callbacks?.onMetadata?.({
+              id: data.id,
+              languagePackInfo: data.language_pack_info
+            });
+            break;
+          }
+          case "AddPartialTranscript": {
+            const partial = data;
+            const words = this.resultsToWords(partial.results);
+            callbacks?.onTranscript?.({
+              type: "transcript",
+              text: partial.metadata.transcript,
+              isFinal: false,
+              words,
+              speaker: words[0]?.speaker,
+              confidence: partial.results[0]?.alternatives?.[0]?.confidence,
+              channel: partial.channel ? parseInt(partial.channel) : void 0
+            });
+            break;
+          }
+          case "AddTranscript": {
+            const final = data;
+            const words = this.resultsToWords(final.results);
+            callbacks?.onTranscript?.({
+              type: "transcript",
+              text: final.metadata.transcript,
+              isFinal: true,
+              words,
+              speaker: words[0]?.speaker,
+              confidence: final.results[0]?.alternatives?.[0]?.confidence,
+              channel: final.channel ? parseInt(final.channel) : void 0
+            });
+            if (options?.diarization || smOpts?.diarization === "speaker") {
+              const utterances = buildUtterancesFromWords(words);
+              for (const utterance of utterances) {
+                callbacks?.onUtterance?.(utterance);
+              }
+            }
+            break;
+          }
+          case "EndOfUtterance": {
+            break;
+          }
+          case "EndOfTranscript": {
+            callbacks?.onClose?.(1e3, "Transcription complete");
+            break;
+          }
+          case "Error": {
+            const err = data;
+            callbacks?.onError?.({
+              code: err.type || "SPEECHMATICS_ERROR",
+              message: err.reason || "Unknown error"
+            });
+            break;
+          }
+          case "Warning": {
+            const warn = data;
+            callbacks?.onMetadata?.({
+              warning: warn.type,
+              reason: warn.reason
+            });
+            break;
+          }
+          case "Info": {
+            callbacks?.onMetadata?.(data);
+            break;
+          }
+          case "AudioAdded":
+          case "ChannelAudioAdded":
+            break;
+          default:
+            callbacks?.onMetadata?.(data);
+            break;
+        }
+      } catch (error) {
+        callbacks?.onError?.({
+          code: "PARSE_ERROR",
+          message: `Failed to parse message: ${error}`
+        });
+      }
+    };
+    ws.onerror = () => {
+      callbacks?.onError?.({
+        code: "WEBSOCKET_ERROR",
+        message: "WebSocket error occurred"
+      });
+    };
+    ws.onclose = (event) => {
+      status = "closed";
+      callbacks?.onClose?.(event.code, event.reason);
+    };
+    await new Promise((resolve, reject) => {
+      const timeout = setTimeout(() => {
+        reject(new Error("WebSocket connection timeout"));
+      }, 1e4);
+      const checkReady = () => {
+        if (recognitionStarted) {
+          clearTimeout(timeout);
+          resolve();
+        } else if (status === "closed") {
+          clearTimeout(timeout);
+          reject(new Error("WebSocket connection failed"));
+        } else {
+          setTimeout(checkReady, 100);
+        }
+      };
+      checkReady();
+    });
+    return {
+      id: sessionId,
+      provider: this.name,
+      createdAt,
+      getStatus: () => status,
+      sendAudio: async (chunk) => {
+        if (status !== "open") {
+          throw new Error("Session is not open");
+        }
+        if (callbacks?.onRawMessage) {
+          const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
+            chunk.data.byteOffset,
+            chunk.data.byteOffset + chunk.data.byteLength
+          );
+          callbacks.onRawMessage({
+            provider: this.name,
+            direction: "outgoing",
+            timestamp: Date.now(),
+            payload: audioPayload,
+            messageType: "audio"
+          });
+        }
+        ws.send(chunk.data);
+      },
+      close: async () => {
+        if (status === "open") {
+          status = "closing";
+          const endMsg = JSON.stringify({ message: "EndOfStream", last_seq_no: 0 });
+          if (callbacks?.onRawMessage) {
+            callbacks.onRawMessage({
+              provider: this.name,
+              direction: "outgoing",
+              timestamp: Date.now(),
+              payload: endMsg,
+              messageType: "EndOfStream"
+            });
+          }
+          ws.send(endMsg);
+        }
+      }
+    };
+  }
+  /**
+   * Convert Speechmatics RecognitionResult[] to unified Word[]
+   */
+  resultsToWords(results) {
+    return results.filter((r) => r.type === "word").map((r) => ({
+      word: r.alternatives?.[0]?.content || "",
+      start: r.start_time,
+      end: r.end_time,
+      confidence: r.alternatives?.[0]?.confidence,
+      speaker: r.alternatives?.[0]?.speaker
+    }));
+  }
   /**
    * Normalize Speechmatics status to unified status
    * Uses generated JobDetailsStatus enum values
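
Together with the `streaming: true` capability flip above, the new method can be driven roughly as follows. A sketch: the option and callback names are taken from the `transcribeStream` code in this hunk, while the adapter export, `initialize` fields, and the `getAudioChunks` helper are assumptions for illustration.

```typescript
import { SpeechmaticsAdapter } from 'voice-router-dev' // assumed export

const adapter = new SpeechmaticsAdapter()
adapter.initialize({ apiKey: process.env.SPEECHMATICS_API_KEY, region: 'eu1' })

const session = await adapter.transcribeStream(
  { language: 'en', sampleRate: 16000, diarization: true, interimResults: true },
  {
    onTranscript: (e) => {
      if (e.isFinal) console.log(`[${e.speaker ?? '?'}]`, e.text)
    },
    onError: (err) => console.error(err.code, err.message),
    onClose: (code, reason) => console.log('closed:', code, reason)
  }
)

// Audio must match the declared format (raw pcm_s16le at 16 kHz by default).
for await (const pcmChunk of getAudioChunks()) { // hypothetical audio source
  await session.sendAudio({ data: pcmChunk })
}
await session.close() // sends EndOfStream; the server replies EndOfTranscript, then closes
```
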
@@ -9450,7 +9710,7 @@ var SonioxAdapter = class extends BaseAdapter {
       let messageType;
       try {
         const data = JSON.parse(rawPayload);
-        const errorMessage = data.error_message || data.error;
+        const errorMessage = data.error_message;
         if (errorMessage) {
           messageType = "error";
         } else if (data.finished) {
@@ -9809,7 +10069,15 @@ var ElevenLabsAdapter = class extends BaseAdapter {
   /**
    * Submit audio for transcription
    *
-   * ElevenLabs batch is synchronous - the API returns the result directly.
+   * ElevenLabs batch is normally synchronous: the API returns results directly.
+   *
+   * **Webhook mode:** When `webhookUrl` is set (or `elevenlabs.webhook` is true),
+   * the request is processed asynchronously. ElevenLabs returns a 202 with a
+   * `request_id` and delivers results to a webhook configured in the ElevenLabs
+   * dashboard. The unified `webhookUrl` acts as an intent flag to enable async
+   * mode — the actual delivery destination must be pre-configured in your
+   * ElevenLabs dashboard. Use `elevenlabs.webhook_id` to target a specific
+   * webhook endpoint.
    */
   async transcribe(audio, options) {
     this.validateConfig();
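
A sketch of the async path, assuming an initialized `ElevenLabsAdapter` instance (`elevenlabs`) and a Node `Buffer` of audio; the file-input shape is an assumption.

```typescript
const res = await elevenlabs.transcribe(
  { buffer: audioBuffer, filename: 'call.wav' }, // hypothetical input shape
  {
    webhookUrl: 'https://api.example.com/hooks/elevenlabs', // intent flag only
    elevenlabs: { webhook_id: 'wh_abc123' } // optional: target a specific endpoint
  }
)
// res.data?.status === "queued"; correlate deliveries via res.tracking?.requestId
```
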
@@ -9832,6 +10100,11 @@ var ElevenLabsAdapter = class extends BaseAdapter {
         }
       };
     }
+    const elevenlabsOpts = options?.elevenlabs;
+    const useWebhook = options?.webhookUrl || elevenlabsOpts?.webhook;
+    if (useWebhook) {
+      formData.append("webhook", "true");
+    }
     if (options?.language) {
       formData.append("language_code", options.language);
     }
@@ -9850,7 +10123,6 @@ var ElevenLabsAdapter = class extends BaseAdapter {
     if (options?.entityDetection) {
       formData.append("entity_detection", "all");
     }
-    const elevenlabsOpts = options?.elevenlabs;
     if (elevenlabsOpts) {
       for (const [key, value] of Object.entries(elevenlabsOpts)) {
         if (value === void 0 || value === null) continue;
@@ -9873,6 +10145,22 @@ var ElevenLabsAdapter = class extends BaseAdapter {
           "Content-Type": "multipart/form-data"
         }
       });
+      if (useWebhook) {
+        const ack = response.data;
+        return {
+          success: true,
+          provider: this.name,
+          data: {
+            id: ack.request_id || ack.transcription_id || `elevenlabs_${Date.now()}`,
+            text: "",
+            status: "queued"
+          },
+          tracking: {
+            requestId: ack.request_id
+          },
+          raw: response.data
+        };
+      }
       return this.normalizeResponse(response.data);
     } catch (error) {
       return this.createErrorResponse(error);
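
Because the queued response carries no text, the webhook delivery is where the transcript actually arrives; a receiver can persist it keyed by request ID. A hypothetical Express sketch: the delivery payload's field names are assumptions (only `request_id` is suggested by the ack handling above).

```typescript
import express from 'express'

const app = express()
app.use(express.json())

// In-memory store for illustration; use a database in practice.
const transcriptStore = new Map<string, unknown>()

app.post('/hooks/elevenlabs', (req, res) => {
  const { request_id } = req.body // assumed field, matching ack.request_id
  transcriptStore.set(request_id, req.body)
  res.sendStatus(200)
})

app.listen(3000)
```
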
@@ -9965,20 +10253,9 @@ var ElevenLabsAdapter = class extends BaseAdapter {
     ws.onmessage = (event) => {
       receivedData = true;
       const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
-      let messageType;
       try {
         const data = JSON.parse(rawPayload);
-        if (data.error) {
-          messageType = "error";
-        } else if (data.message_type === "session_started") {
-          messageType = "session_started";
-        } else if (data.message_type === "partial_transcript") {
-          messageType = "partial_transcript";
-        } else if (data.message_type === "committed_transcript") {
-          messageType = "committed_transcript";
-        } else if (data.message_type === "committed_transcript_with_timestamps") {
-          messageType = "committed_transcript_with_timestamps";
-        }
+        const messageType = "error" in data ? "error" : data.message_type;
         if (callbacks?.onRawMessage) {
           callbacks.onRawMessage({
             provider: this.name,
@@ -9988,50 +10265,62 @@ var ElevenLabsAdapter = class extends BaseAdapter {
             messageType
           });
         }
-        if (data.error) {
+        if ("error" in data) {
           callbacks?.onError?.({
-            code: data.error_code?.toString() || "STREAM_ERROR",
+            code: data.message_type || "STREAM_ERROR",
             message: data.error
           });
           return;
         }
-        if (data.message_type === "session_started") {
-          return;
-        }
-        if (data.message_type === "partial_transcript") {
-          const streamEvent = {
-            type: "transcript",
-            text: data.text || "",
-            isFinal: false,
-            confidence: void 0,
-            language: data.language_code
-          };
-          callbacks?.onTranscript?.(streamEvent);
-          return;
-        }
-        if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
-          const words = data.words ? data.words.map((w) => ({
-            word: w.text || "",
-            start: w.start || 0,
-            end: w.end || 0,
-            confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
-            speaker: w.speaker_id
-          })) : [];
-          const streamEvent = {
-            type: "transcript",
-            text: data.text || "",
-            isFinal: true,
-            words: words.length > 0 ? words : void 0,
-            speaker: words[0]?.speaker,
-            language: data.language_code,
-            confidence: void 0
-          };
-          callbacks?.onTranscript?.(streamEvent);
-          if (options?.diarization && words.length > 0) {
-            const utterances = buildUtterancesFromWords(words);
-            for (const utterance of utterances) {
-              callbacks?.onUtterance?.(utterance);
+        switch (data.message_type) {
+          case "session_started":
+            break;
+          case "partial_transcript": {
+            const streamEvent = {
+              type: "transcript",
+              text: data.text || "",
+              isFinal: false,
+              confidence: void 0
+            };
+            callbacks?.onTranscript?.(streamEvent);
+            break;
+          }
+          case "committed_transcript": {
+            const streamEvent = {
+              type: "transcript",
+              text: data.text || "",
+              isFinal: true,
+              confidence: void 0
+            };
+            callbacks?.onTranscript?.(streamEvent);
+            break;
+          }
+          case "committed_transcript_with_timestamps": {
+            const tsData = data;
+            const words = tsData.words ? tsData.words.map((w) => ({
+              word: w.text || "",
+              start: w.start || 0,
+              end: w.end || 0,
+              confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
+              speaker: w.speaker_id
+            })) : [];
+            const streamEvent = {
+              type: "transcript",
+              text: tsData.text || "",
+              isFinal: true,
+              words: words.length > 0 ? words : void 0,
+              speaker: words[0]?.speaker,
+              language: tsData.language_code,
+              confidence: void 0
+            };
+            callbacks?.onTranscript?.(streamEvent);
+            if (options?.diarization && words.length > 0) {
+              const utterances = buildUtterancesFromWords(words);
+              for (const utterance of utterances) {
+                callbacks?.onUtterance?.(utterance);
+              }
             }
+            break;
           }
         }
       } catch (error) {
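
For reference, the message shapes the reworked handler distinguishes, reconstructed purely from the fields this hunk reads; this is an inferred union, not a published ElevenLabs schema.

```typescript
type ElevenLabsStreamMessage =
  | { error: string; message_type?: string } // routed to onError
  | { message_type: 'session_started' }
  | { message_type: 'partial_transcript'; text?: string }
  | { message_type: 'committed_transcript'; text?: string }
  | {
      message_type: 'committed_transcript_with_timestamps'
      text?: string
      language_code?: string
      words?: Array<{
        text?: string
        start?: number
        end?: number
        logprob?: number // converted to confidence via Math.exp(logprob)
        speaker_id?: string
      }>
    }
```
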
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "voice-router-dev",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "description": "Universal speech-to-text router for Gladia, AssemblyAI, Deepgram, Azure, OpenAI Whisper, Speechmatics, Soniox, and ElevenLabs",
   "main": "dist/index.js",
   "module": "dist/index.mjs",