voice-router-dev 0.8.7 → 0.8.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -8920,6 +8920,7 @@ function createOpenAIWhisperAdapter(config) {
8920
8920
 
8921
8921
  // src/adapters/speechmatics-adapter.ts
8922
8922
  var import_axios8 = __toESM(require("axios"));
8923
+ var import_ws5 = __toESM(require("ws"));
8923
8924
 
8924
8925
  // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
8925
8926
  var NotificationConfigContentsItem = {
@@ -8969,8 +8970,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
8969
8970
  super(...arguments);
8970
8971
  this.name = "speechmatics";
8971
8972
  this.capabilities = {
8972
- streaming: false,
8973
- // Batch only (streaming available via separate WebSocket API)
8973
+ streaming: true,
8974
8974
  diarization: true,
8975
8975
  wordTimestamps: true,
8976
8976
  languageDetection: false,
@@ -9219,6 +9219,389 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
9219
9219
  throw error;
9220
9220
  }
9221
9221
  }
9222
+ /**
9223
+ * Build WebSocket URL for real-time streaming
9224
+ *
9225
+ * Note: Real-time API uses a different host from the batch API:
9226
+ * - Batch: {region}.asr.api.speechmatics.com
9227
+ * - Real-time: {region}.rt.speechmatics.com
9228
+ *
9229
+ * @param region - Regional endpoint identifier
9230
+ * @returns WebSocket URL for real-time API
9231
+ */
9232
+ getRegionalWsUrl(region) {
9233
+ if (this.config?.wsBaseUrl) {
9234
+ return this.config.wsBaseUrl;
9235
+ }
9236
+ const rtRegionMap = {
9237
+ eu1: "eu",
9238
+ eu2: "eu",
9239
+ us1: "us",
9240
+ us2: "us",
9241
+ au1: "eu"
9242
+ // No AU RT endpoint — fall back to EU
9243
+ };
9244
+ const rtPrefix = rtRegionMap[region || ""] || "eu";
9245
+ return `wss://${rtPrefix}.rt.speechmatics.com/v2`;
9246
+ }
9247
+ /**
9248
+ * Stream audio for real-time transcription via WebSocket
9249
+ *
9250
+ * Connects to Speechmatics' real-time API and sends audio chunks
9251
+ * for transcription with results returned via callbacks.
9252
+ *
9253
+ * @param options - Streaming configuration options
9254
+ * @param callbacks - Event callbacks for transcription results
9255
+ * @returns Promise that resolves with a StreamingSession
9256
+ *
9257
+ * @example Basic streaming
9258
+ * ```typescript
9259
+ * const session = await adapter.transcribeStream({
9260
+ * language: 'en',
9261
+ * speechmaticsStreaming: {
9262
+ * enablePartials: true,
9263
+ * operatingPoint: 'enhanced'
9264
+ * }
9265
+ * }, {
9266
+ * onTranscript: (event) => console.log(event.text),
9267
+ * onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
9268
+ * onError: (error) => console.error(error)
9269
+ * });
9270
+ *
9271
+ * await session.sendAudio({ data: audioBuffer });
9272
+ * await session.close();
9273
+ * ```
9274
+ */
9275
+ async transcribeStream(options, callbacks) {
9276
+ this.validateConfig();
9277
+ const smOpts = options?.speechmaticsStreaming || {};
9278
+ const region = smOpts.region || this.config?.region;
9279
+ const wsUrl = this.getRegionalWsUrl(region);
9280
+ const ws = new import_ws5.default(wsUrl, {
9281
+ headers: {
9282
+ Authorization: `Bearer ${this.config.apiKey}`
9283
+ }
9284
+ });
9285
+ let sessionStatus = "connecting";
9286
+ const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
9287
+ let seqNo = 0;
9288
+ let utteranceResults = [];
9289
+ const sessionReady = new Promise((resolve, reject) => {
9290
+ const timeout = setTimeout(() => {
9291
+ reject(new Error("WebSocket connection timeout"));
9292
+ }, 1e4);
9293
+ let wsOpen = false;
9294
+ ws.once("error", (error) => {
9295
+ clearTimeout(timeout);
9296
+ reject(error);
9297
+ });
9298
+ ws.once("open", () => {
9299
+ wsOpen = true;
9300
+ const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
9301
+ const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
9302
+ const startMsg = {
9303
+ message: "StartRecognition",
9304
+ audio_format: {
9305
+ type: "raw",
9306
+ encoding,
9307
+ sample_rate: sampleRate
9308
+ },
9309
+ transcription_config: {
9310
+ language: smOpts.language || options?.language || "en",
9311
+ enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
9312
+ }
9313
+ };
9314
+ const txConfig = startMsg.transcription_config;
9315
+ if (smOpts.domain) txConfig.domain = smOpts.domain;
9316
+ if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
9317
+ if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
9318
+ if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
9319
+ if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
9320
+ if (smOpts.diarization === "speaker" || options?.diarization) {
9321
+ txConfig.diarization = "speaker";
9322
+ if (smOpts.maxSpeakers) {
9323
+ txConfig.speaker_diarization_config = {
9324
+ max_speakers: smOpts.maxSpeakers
9325
+ };
9326
+ } else if (options?.speakersExpected) {
9327
+ txConfig.speaker_diarization_config = {
9328
+ max_speakers: options.speakersExpected
9329
+ };
9330
+ }
9331
+ }
9332
+ if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
9333
+ txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
9334
+ content: word
9335
+ }));
9336
+ } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
9337
+ txConfig.additional_vocab = options.customVocabulary.map((word) => ({
9338
+ content: word
9339
+ }));
9340
+ }
9341
+ if (smOpts.conversationConfig) {
9342
+ txConfig.conversation_config = {
9343
+ end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
9344
+ };
9345
+ }
9346
+ const startPayload = JSON.stringify(startMsg);
9347
+ if (callbacks?.onRawMessage) {
9348
+ callbacks.onRawMessage({
9349
+ provider: "speechmatics",
9350
+ direction: "outgoing",
9351
+ timestamp: Date.now(),
9352
+ payload: startPayload,
9353
+ messageType: "StartRecognition"
9354
+ });
9355
+ }
9356
+ ws.send(startPayload);
9357
+ });
9358
+ const onMessage = (data) => {
9359
+ const rawPayload = data.toString();
9360
+ try {
9361
+ const msg = JSON.parse(rawPayload);
9362
+ if (msg.message === "RecognitionStarted") {
9363
+ clearTimeout(timeout);
9364
+ ws.removeListener("message", onMessage);
9365
+ ws.emit("message", data);
9366
+ resolve();
9367
+ } else if (msg.message === "Error") {
9368
+ clearTimeout(timeout);
9369
+ ws.removeListener("message", onMessage);
9370
+ reject(new Error(msg.reason || "Recognition failed to start"));
9371
+ }
9372
+ } catch {
9373
+ }
9374
+ };
9375
+ ws.on("message", onMessage);
9376
+ });
9377
+ ws.on("message", (data) => {
9378
+ const rawPayload = data.toString();
9379
+ try {
9380
+ const message = JSON.parse(rawPayload);
9381
+ if (callbacks?.onRawMessage) {
9382
+ callbacks.onRawMessage({
9383
+ provider: "speechmatics",
9384
+ direction: "incoming",
9385
+ timestamp: Date.now(),
9386
+ payload: rawPayload,
9387
+ messageType: message.message
9388
+ });
9389
+ }
9390
+ this.handleStreamingMessage(message, callbacks, utteranceResults);
9391
+ } catch (error) {
9392
+ if (callbacks?.onRawMessage) {
9393
+ callbacks.onRawMessage({
9394
+ provider: "speechmatics",
9395
+ direction: "incoming",
9396
+ timestamp: Date.now(),
9397
+ payload: rawPayload,
9398
+ messageType: "parse_error"
9399
+ });
9400
+ }
9401
+ callbacks?.onError?.({
9402
+ code: "PARSE_ERROR",
9403
+ message: "Failed to parse WebSocket message",
9404
+ details: error
9405
+ });
9406
+ }
9407
+ });
9408
+ ws.on("error", (error) => {
9409
+ callbacks?.onError?.({
9410
+ code: "WEBSOCKET_ERROR",
9411
+ message: error.message,
9412
+ details: error
9413
+ });
9414
+ });
9415
+ ws.on("close", (code, reason) => {
9416
+ sessionStatus = "closed";
9417
+ callbacks?.onClose?.(code, reason.toString());
9418
+ });
9419
+ await sessionReady;
9420
+ sessionStatus = "open";
9421
+ callbacks?.onOpen?.();
9422
+ return {
9423
+ id: sessionId,
9424
+ provider: this.name,
9425
+ createdAt: /* @__PURE__ */ new Date(),
9426
+ getStatus: () => sessionStatus,
9427
+ sendAudio: async (chunk) => {
9428
+ if (sessionStatus !== "open") {
9429
+ throw new Error(`Cannot send audio: session is ${sessionStatus}`);
9430
+ }
9431
+ if (ws.readyState !== import_ws5.default.OPEN) {
9432
+ throw new Error("WebSocket is not open");
9433
+ }
9434
+ if (callbacks?.onRawMessage) {
9435
+ const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
9436
+ chunk.data.byteOffset,
9437
+ chunk.data.byteOffset + chunk.data.byteLength
9438
+ );
9439
+ callbacks.onRawMessage({
9440
+ provider: this.name,
9441
+ direction: "outgoing",
9442
+ timestamp: Date.now(),
9443
+ payload: audioPayload,
9444
+ messageType: "audio"
9445
+ });
9446
+ }
9447
+ ws.send(chunk.data);
9448
+ seqNo++;
9449
+ if (chunk.isLast) {
9450
+ const endMsg = JSON.stringify({
9451
+ message: "EndOfStream",
9452
+ last_seq_no: seqNo
9453
+ });
9454
+ if (callbacks?.onRawMessage) {
9455
+ callbacks.onRawMessage({
9456
+ provider: this.name,
9457
+ direction: "outgoing",
9458
+ timestamp: Date.now(),
9459
+ payload: endMsg,
9460
+ messageType: "EndOfStream"
9461
+ });
9462
+ }
9463
+ ws.send(endMsg);
9464
+ }
9465
+ },
9466
+ close: async () => {
9467
+ if (sessionStatus === "closed" || sessionStatus === "closing") {
9468
+ return;
9469
+ }
9470
+ sessionStatus = "closing";
9471
+ if (ws.readyState === import_ws5.default.OPEN) {
9472
+ seqNo++;
9473
+ ws.send(
9474
+ JSON.stringify({
9475
+ message: "EndOfStream",
9476
+ last_seq_no: seqNo
9477
+ })
9478
+ );
9479
+ }
9480
+ return new Promise((resolve) => {
9481
+ const timeout = setTimeout(() => {
9482
+ ws.terminate();
9483
+ sessionStatus = "closed";
9484
+ resolve();
9485
+ }, 5e3);
9486
+ const onMsg = (data) => {
9487
+ try {
9488
+ const msg = JSON.parse(data.toString());
9489
+ if (msg.message === "EndOfTranscript") {
9490
+ ws.removeListener("message", onMsg);
9491
+ clearTimeout(timeout);
9492
+ ws.close();
9493
+ }
9494
+ } catch {
9495
+ }
9496
+ };
9497
+ ws.on("message", onMsg);
9498
+ ws.once("close", () => {
9499
+ clearTimeout(timeout);
9500
+ sessionStatus = "closed";
9501
+ resolve();
9502
+ });
9503
+ });
9504
+ }
9505
+ };
9506
+ }
9507
+ /**
9508
+ * Handle incoming Speechmatics real-time WebSocket messages
9509
+ */
9510
+ handleStreamingMessage(message, callbacks, utteranceResults) {
9511
+ switch (message.message) {
9512
+ case "RecognitionStarted": {
9513
+ break;
9514
+ }
9515
+ case "AddPartialTranscript": {
9516
+ const results = message.results || [];
9517
+ const text = buildTextFromSpeechmaticsResults(results);
9518
+ if (text) {
9519
+ callbacks?.onTranscript?.({
9520
+ type: "transcript",
9521
+ text,
9522
+ isFinal: false,
9523
+ words: this.extractWordsFromResults(results),
9524
+ data: message
9525
+ });
9526
+ }
9527
+ break;
9528
+ }
9529
+ case "AddTranscript": {
9530
+ const results = message.results || [];
9531
+ const text = buildTextFromSpeechmaticsResults(results);
9532
+ if (utteranceResults) {
9533
+ utteranceResults.push(...results);
9534
+ }
9535
+ if (text) {
9536
+ callbacks?.onTranscript?.({
9537
+ type: "transcript",
9538
+ text,
9539
+ isFinal: true,
9540
+ words: this.extractWordsFromResults(results),
9541
+ data: message
9542
+ });
9543
+ }
9544
+ break;
9545
+ }
9546
+ case "EndOfUtterance": {
9547
+ if (utteranceResults && utteranceResults.length > 0) {
9548
+ const text = buildTextFromSpeechmaticsResults(utteranceResults);
9549
+ const words = this.extractWordsFromResults(utteranceResults);
9550
+ const utterances = buildUtterancesFromWords(words);
9551
+ if (utterances.length > 0) {
9552
+ for (const utt of utterances) {
9553
+ callbacks?.onUtterance?.(utt);
9554
+ }
9555
+ } else if (text) {
9556
+ callbacks?.onUtterance?.({
9557
+ text,
9558
+ start: words.length > 0 ? words[0].start : 0,
9559
+ end: words.length > 0 ? words[words.length - 1].end : 0,
9560
+ words
9561
+ });
9562
+ }
9563
+ utteranceResults.length = 0;
9564
+ }
9565
+ break;
9566
+ }
9567
+ case "AudioAdded": {
9568
+ break;
9569
+ }
9570
+ case "EndOfTranscript": {
9571
+ break;
9572
+ }
9573
+ case "Info":
9574
+ case "Warning": {
9575
+ callbacks?.onMetadata?.(message);
9576
+ break;
9577
+ }
9578
+ case "Error": {
9579
+ const errMsg = message;
9580
+ callbacks?.onError?.({
9581
+ code: errMsg.type || "SPEECHMATICS_ERROR",
9582
+ message: errMsg.reason || "Unknown error",
9583
+ details: message
9584
+ });
9585
+ break;
9586
+ }
9587
+ default: {
9588
+ callbacks?.onMetadata?.(message);
9589
+ break;
9590
+ }
9591
+ }
9592
+ }
9593
+ /**
9594
+ * Extract unified Word[] from Speechmatics recognition results
9595
+ */
9596
+ extractWordsFromResults(results) {
9597
+ return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
9598
+ word: result.alternatives?.[0]?.content || "",
9599
+ start: result.start_time,
9600
+ end: result.end_time,
9601
+ confidence: result.alternatives?.[0]?.confidence,
9602
+ speaker: result.alternatives?.[0]?.speaker
9603
+ }));
9604
+ }
9222
9605
  /**
9223
9606
  * Normalize Speechmatics status to unified status
9224
9607
  * Uses generated JobDetailsStatus enum values
@@ -9540,50 +9923,51 @@ var SonioxAdapter = class extends BaseAdapter {
9540
9923
  const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
9541
9924
  const createdAt = /* @__PURE__ */ new Date();
9542
9925
  const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
9543
- const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
9544
- wsUrl.searchParams.set("api_key", this.config.apiKey);
9545
- const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
9546
- wsUrl.searchParams.set("model", modelId);
9547
- if (options?.encoding) {
9926
+ const wsUrl = `${wsBase}/transcribe-websocket`;
9927
+ const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
9928
+ const sonioxOpts = options?.sonioxStreaming;
9929
+ const initMessage = {
9930
+ api_key: this.config.apiKey,
9931
+ model: modelId
9932
+ };
9933
+ if (sonioxOpts?.audioFormat) {
9934
+ initMessage.audio_format = sonioxOpts.audioFormat;
9935
+ } else if (options?.encoding) {
9548
9936
  const encodingMap = {
9549
9937
  linear16: "pcm_s16le",
9550
9938
  pcm: "pcm_s16le",
9551
9939
  mulaw: "mulaw",
9552
9940
  alaw: "alaw"
9553
9941
  };
9554
- wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
9942
+ initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
9555
9943
  }
9556
- if (options?.sampleRate) {
9557
- wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
9944
+ if (sonioxOpts?.sampleRate || options?.sampleRate) {
9945
+ initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
9558
9946
  }
9559
- if (options?.channels) {
9560
- wsUrl.searchParams.set("num_channels", options.channels.toString());
9947
+ if (sonioxOpts?.numChannels || options?.channels) {
9948
+ initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
9561
9949
  }
9562
- const sonioxOpts = options?.sonioxStreaming;
9563
9950
  if (sonioxOpts) {
9564
9951
  if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
9565
- wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
9952
+ initMessage.language_hints = sonioxOpts.languageHints;
9566
9953
  }
9567
9954
  if (sonioxOpts.enableLanguageIdentification) {
9568
- wsUrl.searchParams.set("enable_language_identification", "true");
9955
+ initMessage.enable_language_identification = true;
9569
9956
  }
9570
9957
  if (sonioxOpts.enableEndpointDetection) {
9571
- wsUrl.searchParams.set("enable_endpoint_detection", "true");
9958
+ initMessage.enable_endpoint_detection = true;
9572
9959
  }
9573
9960
  if (sonioxOpts.enableSpeakerDiarization) {
9574
- wsUrl.searchParams.set("enable_speaker_diarization", "true");
9961
+ initMessage.enable_speaker_diarization = true;
9575
9962
  }
9576
9963
  if (sonioxOpts.context) {
9577
- wsUrl.searchParams.set(
9578
- "context",
9579
- typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
9580
- );
9964
+ initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
9581
9965
  }
9582
9966
  if (sonioxOpts.translation) {
9583
- wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
9967
+ initMessage.translation = sonioxOpts.translation;
9584
9968
  }
9585
9969
  if (sonioxOpts.clientReferenceId) {
9586
- wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
9970
+ initMessage.client_reference_id = sonioxOpts.clientReferenceId;
9587
9971
  }
9588
9972
  }
9589
9973
  if (!sonioxOpts?.languageHints && options?.language) {
@@ -9592,24 +9976,33 @@ var SonioxAdapter = class extends BaseAdapter {
9592
9976
  `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
9593
9977
  );
9594
9978
  }
9595
- wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
9979
+ initMessage.language_hints = [options.language];
9596
9980
  }
9597
9981
  if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
9598
- wsUrl.searchParams.set("enable_speaker_diarization", "true");
9982
+ initMessage.enable_speaker_diarization = true;
9599
9983
  }
9600
9984
  if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
9601
- wsUrl.searchParams.set("enable_language_identification", "true");
9602
- }
9603
- if (options?.interimResults !== false) {
9985
+ initMessage.enable_language_identification = true;
9604
9986
  }
9605
9987
  let status = "connecting";
9606
9988
  let openedAt = null;
9607
9989
  let receivedData = false;
9608
9990
  const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
9609
- const ws = new WebSocketImpl(wsUrl.toString());
9991
+ const ws = new WebSocketImpl(wsUrl);
9610
9992
  ws.onopen = () => {
9611
- status = "open";
9612
9993
  openedAt = Date.now();
9994
+ const initPayload = JSON.stringify(initMessage);
9995
+ if (callbacks?.onRawMessage) {
9996
+ callbacks.onRawMessage({
9997
+ provider: this.name,
9998
+ direction: "outgoing",
9999
+ timestamp: Date.now(),
10000
+ payload: initPayload,
10001
+ messageType: "init"
10002
+ });
10003
+ }
10004
+ ws.send(initPayload);
10005
+ status = "open";
9613
10006
  callbacks?.onOpen?.();
9614
10007
  };
9615
10008
  ws.onmessage = (event) => {
@@ -9688,10 +10081,10 @@ var SonioxAdapter = class extends BaseAdapter {
9688
10081
  ws.onclose = (event) => {
9689
10082
  status = "closed";
9690
10083
  const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
9691
- const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
9692
- if (isImmediateClose && event.code === 1e3) {
10084
+ const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
10085
+ if (isEarlyClose && event.code === 1e3) {
9693
10086
  const errorMessage = [
9694
- "Soniox closed connection immediately after opening.",
10087
+ "Soniox closed connection shortly after opening.",
9695
10088
  `Current config: region=${this.region}, model=${modelId}`,
9696
10089
  "Likely causes:",
9697
10090
  " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -36710,7 +37103,7 @@ var AzureCapabilities = {
36710
37103
  deleteTranscript: true
36711
37104
  };
36712
37105
  var SpeechmaticsCapabilities = {
36713
- streaming: false,
37106
+ streaming: true,
36714
37107
  diarization: true,
36715
37108
  wordTimestamps: true,
36716
37109
  languageDetection: false,