voice-router-dev 0.8.7 → 0.8.8

This diff compares the published contents of two package versions as released to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
package/dist/index.js CHANGED
@@ -8920,6 +8920,7 @@ function createOpenAIWhisperAdapter(config) {
 
 // src/adapters/speechmatics-adapter.ts
 var import_axios8 = __toESM(require("axios"));
+var import_ws5 = __toESM(require("ws"));
 
 // src/generated/speechmatics/schema/notificationConfigContentsItem.ts
 var NotificationConfigContentsItem = {
@@ -8969,8 +8970,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
     super(...arguments);
     this.name = "speechmatics";
     this.capabilities = {
-      streaming: false,
-      // Batch only (streaming available via separate WebSocket API)
+      streaming: true,
       diarization: true,
       wordTimestamps: true,
       languageDetection: false,
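
The two hunks above, together with the SpeechmaticsCapabilities map at the end of this diff, flip the Speechmatics `streaming` capability from `false` to `true` now that the adapter ships a real WebSocket implementation (hence the new `ws` import). A minimal consumer-side sketch of what that unlocks, assuming your app resolves adapters through some `getAdapter` helper (not part of this diff); `capabilities.streaming` and `transcribeStream(options, callbacks)` come from the code added below:

```typescript
// Hypothetical consumer: getAdapter is an assumption; the capability flag and
// transcribeStream are what this release introduces for Speechmatics.
declare function getAdapter(name: string): any;

async function run(): Promise<void> {
  const adapter = getAdapter("speechmatics");
  if (adapter.capabilities.streaming) {
    // New in 0.8.8: this branch is now taken for Speechmatics.
    const session = await adapter.transcribeStream(
      { language: "en" },
      { onTranscript: (e: any) => console.log(e.isFinal ? "final:" : "partial:", e.text) }
    );
    // ...feed audio with session.sendAudio(...), then:
    await session.close();
  } else {
    // 0.8.7 behaviour: Speechmatics was batch-only, so callers had to fall
    // back to job-based transcription.
  }
}
```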
@@ -9219,6 +9219,381 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
       throw error;
     }
   }
+  /**
+   * Build WebSocket URL for real-time streaming
+   *
+   * Note: Real-time API uses a different host from the batch API:
+   * - Batch: {region}.asr.api.speechmatics.com
+   * - Real-time: {region}.rt.speechmatics.com
+   *
+   * @param region - Regional endpoint identifier
+   * @returns WebSocket URL for real-time API
+   */
+  getRegionalWsUrl(region) {
+    if (this.config?.wsBaseUrl) {
+      return this.config.wsBaseUrl;
+    }
+    const regionPrefix = region || "eu1";
+    return `wss://${regionPrefix}.rt.speechmatics.com/v2`;
+  }
+  /**
+   * Stream audio for real-time transcription via WebSocket
+   *
+   * Connects to Speechmatics' real-time API and sends audio chunks
+   * for transcription with results returned via callbacks.
+   *
+   * @param options - Streaming configuration options
+   * @param callbacks - Event callbacks for transcription results
+   * @returns Promise that resolves with a StreamingSession
+   *
+   * @example Basic streaming
+   * ```typescript
+   * const session = await adapter.transcribeStream({
+   *   language: 'en',
+   *   speechmaticsStreaming: {
+   *     enablePartials: true,
+   *     operatingPoint: 'enhanced'
+   *   }
+   * }, {
+   *   onTranscript: (event) => console.log(event.text),
+   *   onUtterance: (utt) => console.log(`[${utt.speaker}]: ${utt.text}`),
+   *   onError: (error) => console.error(error)
+   * });
+   *
+   * await session.sendAudio({ data: audioBuffer });
+   * await session.close();
+   * ```
+   */
+  async transcribeStream(options, callbacks) {
+    this.validateConfig();
+    const smOpts = options?.speechmaticsStreaming || {};
+    const region = smOpts.region || this.config?.region;
+    const wsUrl = this.getRegionalWsUrl(region);
+    const ws = new import_ws5.default(wsUrl, {
+      headers: {
+        Authorization: `Bearer ${this.config.apiKey}`
+      }
+    });
+    let sessionStatus = "connecting";
+    const sessionId = `speechmatics-${Date.now()}-${Math.random().toString(36).substring(7)}`;
+    let seqNo = 0;
+    let utteranceResults = [];
+    const sessionReady = new Promise((resolve, reject) => {
+      const timeout = setTimeout(() => {
+        reject(new Error("WebSocket connection timeout"));
+      }, 1e4);
+      let wsOpen = false;
+      ws.once("error", (error) => {
+        clearTimeout(timeout);
+        reject(error);
+      });
+      ws.once("open", () => {
+        wsOpen = true;
+        const encoding = smOpts.encoding || options?.encoding || "pcm_s16le";
+        const sampleRate = smOpts.sampleRate || options?.sampleRate || 16e3;
+        const startMsg = {
+          message: "StartRecognition",
+          audio_format: {
+            type: "raw",
+            encoding,
+            sample_rate: sampleRate
+          },
+          transcription_config: {
+            language: smOpts.language || options?.language || "en",
+            enable_partials: smOpts.enablePartials ?? options?.interimResults ?? true
+          }
+        };
+        const txConfig = startMsg.transcription_config;
+        if (smOpts.domain) txConfig.domain = smOpts.domain;
+        if (smOpts.operatingPoint) txConfig.operating_point = smOpts.operatingPoint;
+        if (smOpts.maxDelay !== void 0) txConfig.max_delay = smOpts.maxDelay;
+        if (smOpts.maxDelayMode) txConfig.max_delay_mode = smOpts.maxDelayMode;
+        if (smOpts.enableEntities !== void 0) txConfig.enable_entities = smOpts.enableEntities;
+        if (smOpts.diarization === "speaker" || options?.diarization) {
+          txConfig.diarization = "speaker";
+          if (smOpts.maxSpeakers) {
+            txConfig.speaker_diarization_config = {
+              max_speakers: smOpts.maxSpeakers
+            };
+          } else if (options?.speakersExpected) {
+            txConfig.speaker_diarization_config = {
+              max_speakers: options.speakersExpected
+            };
+          }
+        }
+        if (smOpts.additionalVocab && smOpts.additionalVocab.length > 0) {
+          txConfig.additional_vocab = smOpts.additionalVocab.map((word) => ({
+            content: word
+          }));
+        } else if (options?.customVocabulary && options.customVocabulary.length > 0) {
+          txConfig.additional_vocab = options.customVocabulary.map((word) => ({
+            content: word
+          }));
+        }
+        if (smOpts.conversationConfig) {
+          txConfig.conversation_config = {
+            end_of_utterance_silence_trigger: smOpts.conversationConfig.endOfUtteranceSilenceTrigger
+          };
+        }
+        const startPayload = JSON.stringify(startMsg);
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: "speechmatics",
+            direction: "outgoing",
+            timestamp: Date.now(),
+            payload: startPayload,
+            messageType: "StartRecognition"
+          });
+        }
+        ws.send(startPayload);
+      });
+      const onMessage = (data) => {
+        const rawPayload = data.toString();
+        try {
+          const msg = JSON.parse(rawPayload);
+          if (msg.message === "RecognitionStarted") {
+            clearTimeout(timeout);
+            ws.removeListener("message", onMessage);
+            ws.emit("message", data);
+            resolve();
+          } else if (msg.message === "Error") {
+            clearTimeout(timeout);
+            ws.removeListener("message", onMessage);
+            reject(new Error(msg.reason || "Recognition failed to start"));
+          }
+        } catch {
+        }
+      };
+      ws.on("message", onMessage);
+    });
+    ws.on("message", (data) => {
+      const rawPayload = data.toString();
+      try {
+        const message = JSON.parse(rawPayload);
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: "speechmatics",
+            direction: "incoming",
+            timestamp: Date.now(),
+            payload: rawPayload,
+            messageType: message.message
+          });
+        }
+        this.handleStreamingMessage(message, callbacks, utteranceResults);
+      } catch (error) {
+        if (callbacks?.onRawMessage) {
+          callbacks.onRawMessage({
+            provider: "speechmatics",
+            direction: "incoming",
+            timestamp: Date.now(),
+            payload: rawPayload,
+            messageType: "parse_error"
+          });
+        }
+        callbacks?.onError?.({
+          code: "PARSE_ERROR",
+          message: "Failed to parse WebSocket message",
+          details: error
+        });
+      }
+    });
+    ws.on("error", (error) => {
+      callbacks?.onError?.({
+        code: "WEBSOCKET_ERROR",
+        message: error.message,
+        details: error
+      });
+    });
+    ws.on("close", (code, reason) => {
+      sessionStatus = "closed";
+      callbacks?.onClose?.(code, reason.toString());
+    });
+    await sessionReady;
+    sessionStatus = "open";
+    callbacks?.onOpen?.();
+    return {
+      id: sessionId,
+      provider: this.name,
+      createdAt: /* @__PURE__ */ new Date(),
+      getStatus: () => sessionStatus,
+      sendAudio: async (chunk) => {
+        if (sessionStatus !== "open") {
+          throw new Error(`Cannot send audio: session is ${sessionStatus}`);
+        }
+        if (ws.readyState !== import_ws5.default.OPEN) {
+          throw new Error("WebSocket is not open");
+        }
+        if (callbacks?.onRawMessage) {
+          const audioPayload = chunk.data instanceof ArrayBuffer ? chunk.data : chunk.data.buffer.slice(
+            chunk.data.byteOffset,
+            chunk.data.byteOffset + chunk.data.byteLength
+          );
+          callbacks.onRawMessage({
+            provider: this.name,
+            direction: "outgoing",
+            timestamp: Date.now(),
+            payload: audioPayload,
+            messageType: "audio"
+          });
+        }
+        ws.send(chunk.data);
+        seqNo++;
+        if (chunk.isLast) {
+          const endMsg = JSON.stringify({
+            message: "EndOfStream",
+            last_seq_no: seqNo
+          });
+          if (callbacks?.onRawMessage) {
+            callbacks.onRawMessage({
+              provider: this.name,
+              direction: "outgoing",
+              timestamp: Date.now(),
+              payload: endMsg,
+              messageType: "EndOfStream"
+            });
+          }
+          ws.send(endMsg);
+        }
+      },
+      close: async () => {
+        if (sessionStatus === "closed" || sessionStatus === "closing") {
+          return;
+        }
+        sessionStatus = "closing";
+        if (ws.readyState === import_ws5.default.OPEN) {
+          seqNo++;
+          ws.send(
+            JSON.stringify({
+              message: "EndOfStream",
+              last_seq_no: seqNo
+            })
+          );
+        }
+        return new Promise((resolve) => {
+          const timeout = setTimeout(() => {
+            ws.terminate();
+            sessionStatus = "closed";
+            resolve();
+          }, 5e3);
+          const onMsg = (data) => {
+            try {
+              const msg = JSON.parse(data.toString());
+              if (msg.message === "EndOfTranscript") {
+                ws.removeListener("message", onMsg);
+                clearTimeout(timeout);
+                ws.close();
+              }
+            } catch {
+            }
+          };
+          ws.on("message", onMsg);
+          ws.once("close", () => {
+            clearTimeout(timeout);
+            sessionStatus = "closed";
+            resolve();
+          });
+        });
+      }
+    };
+  }
+  /**
+   * Handle incoming Speechmatics real-time WebSocket messages
+   */
+  handleStreamingMessage(message, callbacks, utteranceResults) {
+    switch (message.message) {
+      case "RecognitionStarted": {
+        break;
+      }
+      case "AddPartialTranscript": {
+        const results = message.results || [];
+        const text = buildTextFromSpeechmaticsResults(results);
+        if (text) {
+          callbacks?.onTranscript?.({
+            type: "transcript",
+            text,
+            isFinal: false,
+            words: this.extractWordsFromResults(results),
+            data: message
+          });
+        }
+        break;
+      }
+      case "AddTranscript": {
+        const results = message.results || [];
+        const text = buildTextFromSpeechmaticsResults(results);
+        if (utteranceResults) {
+          utteranceResults.push(...results);
+        }
+        if (text) {
+          callbacks?.onTranscript?.({
+            type: "transcript",
+            text,
+            isFinal: true,
+            words: this.extractWordsFromResults(results),
+            data: message
+          });
+        }
+        break;
+      }
+      case "EndOfUtterance": {
+        if (utteranceResults && utteranceResults.length > 0) {
+          const text = buildTextFromSpeechmaticsResults(utteranceResults);
+          const words = this.extractWordsFromResults(utteranceResults);
+          const utterances = buildUtterancesFromWords(words);
+          if (utterances.length > 0) {
+            for (const utt of utterances) {
+              callbacks?.onUtterance?.(utt);
+            }
+          } else if (text) {
+            callbacks?.onUtterance?.({
+              text,
+              start: words.length > 0 ? words[0].start : 0,
+              end: words.length > 0 ? words[words.length - 1].end : 0,
+              words
+            });
+          }
+          utteranceResults.length = 0;
+        }
+        break;
+      }
+      case "AudioAdded": {
+        break;
+      }
+      case "EndOfTranscript": {
+        break;
+      }
+      case "Info":
+      case "Warning": {
+        callbacks?.onMetadata?.(message);
+        break;
+      }
+      case "Error": {
+        const errMsg = message;
+        callbacks?.onError?.({
+          code: errMsg.type || "SPEECHMATICS_ERROR",
+          message: errMsg.reason || "Unknown error",
+          details: message
+        });
+        break;
+      }
+      default: {
+        callbacks?.onMetadata?.(message);
+        break;
+      }
+    }
+  }
+  /**
+   * Extract unified Word[] from Speechmatics recognition results
+   */
+  extractWordsFromResults(results) {
+    return results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
+      word: result.alternatives?.[0]?.content || "",
+      start: result.start_time,
+      end: result.end_time,
+      confidence: result.alternatives?.[0]?.confidence,
+      speaker: result.alternatives?.[0]?.speaker
+    }));
+  }
   /**
    * Normalize Speechmatics status to unified status
    * Uses generated JobDetailsStatus enum values
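
The method added above speaks the Speechmatics real-time protocol directly over `ws`: a `StartRecognition` JSON frame first, binary audio frames counted by `seqNo`, an `EndOfStream` frame carrying `last_seq_no`, and `RecognitionStarted`, `AddPartialTranscript`, `AddTranscript`, `EndOfUtterance`, and `EndOfTranscript` coming back and fanned out to the callbacks. A sketch of driving it end to end with raw 16 kHz mono PCM from disk; the adapter instance, file path, and 100 ms chunk size are assumptions, while every option and callback name comes from the diff:

```typescript
import { createReadStream } from "node:fs";

async function streamFile(adapter: any, path: string): Promise<void> {
  const session = await adapter.transcribeStream(
    {
      language: "en",
      diarization: true, // maps to transcription_config.diarization = "speaker"
      speechmaticsStreaming: {
        enablePartials: true,
        operatingPoint: "enhanced",
        sampleRate: 16000
      }
    },
    {
      onTranscript: (e: any) => {
        if (e.isFinal) console.log("final:", e.text);
      },
      onUtterance: (u: any) => console.log(`[${u.speaker ?? "?"}] ${u.text}`),
      onError: (err: any) => console.error(err.code, err.message)
    }
  );
  // 3200 bytes = 100 ms of 16-bit 16 kHz mono audio; each send bumps seqNo.
  for await (const chunk of createReadStream(path, { highWaterMark: 3200 })) {
    await session.sendAudio({ data: chunk as Buffer });
  }
  // Sends EndOfStream and waits for EndOfTranscript (or the 5 s timeout above).
  await session.close();
}
```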
@@ -9540,50 +9915,51 @@ var SonioxAdapter = class extends BaseAdapter {
     const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
     const createdAt = /* @__PURE__ */ new Date();
     const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
-    const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
-    wsUrl.searchParams.set("api_key", this.config.apiKey);
-    const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
-    wsUrl.searchParams.set("model", modelId);
-    if (options?.encoding) {
+    const wsUrl = `${wsBase}/transcribe-websocket`;
+    const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-v4";
+    const sonioxOpts = options?.sonioxStreaming;
+    const initMessage = {
+      api_key: this.config.apiKey,
+      model: modelId
+    };
+    if (sonioxOpts?.audioFormat) {
+      initMessage.audio_format = sonioxOpts.audioFormat;
+    } else if (options?.encoding) {
       const encodingMap = {
         linear16: "pcm_s16le",
         pcm: "pcm_s16le",
         mulaw: "mulaw",
         alaw: "alaw"
       };
-      wsUrl.searchParams.set("audio_format", encodingMap[options.encoding] || options.encoding);
+      initMessage.audio_format = encodingMap[options.encoding] || options.encoding;
     }
-    if (options?.sampleRate) {
-      wsUrl.searchParams.set("sample_rate", options.sampleRate.toString());
+    if (sonioxOpts?.sampleRate || options?.sampleRate) {
+      initMessage.sample_rate = sonioxOpts?.sampleRate || options?.sampleRate;
     }
-    if (options?.channels) {
-      wsUrl.searchParams.set("num_channels", options.channels.toString());
+    if (sonioxOpts?.numChannels || options?.channels) {
+      initMessage.num_channels = sonioxOpts?.numChannels || options?.channels;
     }
-    const sonioxOpts = options?.sonioxStreaming;
     if (sonioxOpts) {
       if (sonioxOpts.languageHints && sonioxOpts.languageHints.length > 0) {
-        wsUrl.searchParams.set("language_hints", JSON.stringify(sonioxOpts.languageHints));
+        initMessage.language_hints = sonioxOpts.languageHints;
       }
       if (sonioxOpts.enableLanguageIdentification) {
-        wsUrl.searchParams.set("enable_language_identification", "true");
+        initMessage.enable_language_identification = true;
      }
       if (sonioxOpts.enableEndpointDetection) {
-        wsUrl.searchParams.set("enable_endpoint_detection", "true");
+        initMessage.enable_endpoint_detection = true;
       }
       if (sonioxOpts.enableSpeakerDiarization) {
-        wsUrl.searchParams.set("enable_speaker_diarization", "true");
+        initMessage.enable_speaker_diarization = true;
       }
       if (sonioxOpts.context) {
-        wsUrl.searchParams.set(
-          "context",
-          typeof sonioxOpts.context === "string" ? sonioxOpts.context : JSON.stringify(sonioxOpts.context)
-        );
+        initMessage.context = typeof sonioxOpts.context === "string" ? sonioxOpts.context : sonioxOpts.context;
       }
       if (sonioxOpts.translation) {
-        wsUrl.searchParams.set("translation", JSON.stringify(sonioxOpts.translation));
+        initMessage.translation = sonioxOpts.translation;
       }
       if (sonioxOpts.clientReferenceId) {
-        wsUrl.searchParams.set("client_reference_id", sonioxOpts.clientReferenceId);
+        initMessage.client_reference_id = sonioxOpts.clientReferenceId;
       }
     }
     if (!sonioxOpts?.languageHints && options?.language) {
@@ -9592,24 +9968,33 @@ var SonioxAdapter = class extends BaseAdapter {
           `[Soniox] Warning: language="multi" is Deepgram-specific and not supported by Soniox. For automatic language detection, use languageDetection: true instead, or specify a language code like 'en'.`
         );
       }
-      wsUrl.searchParams.set("language_hints", JSON.stringify([options.language]));
+      initMessage.language_hints = [options.language];
     }
     if (!sonioxOpts?.enableSpeakerDiarization && options?.diarization) {
-      wsUrl.searchParams.set("enable_speaker_diarization", "true");
+      initMessage.enable_speaker_diarization = true;
     }
     if (!sonioxOpts?.enableLanguageIdentification && options?.languageDetection) {
-      wsUrl.searchParams.set("enable_language_identification", "true");
-    }
-    if (options?.interimResults !== false) {
+      initMessage.enable_language_identification = true;
     }
     let status = "connecting";
     let openedAt = null;
     let receivedData = false;
     const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
-    const ws = new WebSocketImpl(wsUrl.toString());
+    const ws = new WebSocketImpl(wsUrl);
     ws.onopen = () => {
-      status = "open";
       openedAt = Date.now();
+      const initPayload = JSON.stringify(initMessage);
+      if (callbacks?.onRawMessage) {
+        callbacks.onRawMessage({
+          provider: this.name,
+          direction: "outgoing",
+          timestamp: Date.now(),
+          payload: initPayload,
+          messageType: "init"
+        });
+      }
+      ws.send(initPayload);
+      status = "open";
       callbacks?.onOpen?.();
     };
     ws.onmessage = (event) => {
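
The Soniox change above moves every session parameter out of the WebSocket URL's query string and into a single JSON "init" frame sent from `onopen`, before any audio, so the API key no longer appears in URLs or access logs; the default model also changes from `stt-rt-preview` to `stt-rt-v4`. The shape of that first frame, with field names taken from the diff and illustrative values:

```typescript
// Illustrative values; field names come from the initMessage assembly above.
const initMessage = {
  api_key: process.env.SONIOX_API_KEY, // 0.8.7 sent this as a ?api_key= query parameter
  model: "stt-rt-v4",                  // new default (0.8.7 defaulted to "stt-rt-preview")
  audio_format: "pcm_s16le",
  sample_rate: 16000,
  num_channels: 1,
  language_hints: ["en"],              // plain array now, not JSON.stringify-ed into the URL
  enable_speaker_diarization: true
};
// The adapter serializes this and sends it as the first WebSocket frame,
// then streams audio frames after it.
```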
@@ -9688,10 +10073,10 @@ var SonioxAdapter = class extends BaseAdapter {
     ws.onclose = (event) => {
       status = "closed";
       const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
-      const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
-      if (isImmediateClose && event.code === 1e3) {
+      const isEarlyClose = timeSinceOpen !== null && timeSinceOpen < 5e3 && !receivedData;
+      if (isEarlyClose && event.code === 1e3) {
         const errorMessage = [
-          "Soniox closed connection immediately after opening.",
+          "Soniox closed connection shortly after opening.",
           `Current config: region=${this.region}, model=${modelId}`,
           "Likely causes:",
           " - Invalid API key or region mismatch (keys are region-specific, current: " + this.region + ")",
@@ -36710,7 +37095,7 @@ var AzureCapabilities = {
   deleteTranscript: true
 };
 var SpeechmaticsCapabilities = {
-  streaming: false,
+  streaming: true,
   diarization: true,
   wordTimestamps: true,
   languageDetection: false,