voice-router-dev 0.7.9 → 0.8.1

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
package/dist/webhooks.js CHANGED
@@ -35,12 +35,14 @@ __export(webhooks_exports, {
  AzureWebhookHandler: () => AzureWebhookHandler,
  BaseWebhookHandler: () => BaseWebhookHandler,
  DeepgramWebhookHandler: () => DeepgramWebhookHandler,
+ ElevenLabsWebhookHandler: () => ElevenLabsWebhookHandler,
  GladiaWebhookHandler: () => GladiaWebhookHandler,
  SpeechmaticsWebhookHandler: () => SpeechmaticsWebhookHandler,
  WebhookRouter: () => WebhookRouter,
  createAssemblyAIWebhookHandler: () => createAssemblyAIWebhookHandler,
  createAzureWebhookHandler: () => createAzureWebhookHandler,
  createDeepgramWebhookHandler: () => createDeepgramWebhookHandler,
+ createElevenLabsWebhookHandler: () => createElevenLabsWebhookHandler,
  createGladiaWebhookHandler: () => createGladiaWebhookHandler,
  createWebhookRouter: () => createWebhookRouter
  });
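
The hunk above adds the new ElevenLabs handler class and its factory to the module's export map. A minimal sketch of importing them, assuming the bundle is reachable at a webhooks entry point (the exact subpath is an assumption, not confirmed by this diff):

import {
  ElevenLabsWebhookHandler,
  createElevenLabsWebhookHandler,
} from "voice-router-dev/dist/webhooks"; // entry-point path is an assumption

const handler = createElevenLabsWebhookHandler(); // equivalent to new ElevenLabsWebhookHandler()
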
@@ -134,7 +136,7 @@ var GladiaWebhookHandler = class extends BaseWebhookHandler {
  end: utterance.end,
  confidence: utterance.confidence,
  speaker: utterance.speaker !== void 0 ? String(utterance.speaker) : void 0,
- words: utterance.words?.map((w) => this.mapWord(w))
+ words: utterance.words?.map((w) => this.mapWord(w)) ?? []
  };
  }
  /**
@@ -196,7 +198,7 @@ var GladiaWebhookHandler = class extends BaseWebhookHandler {
  speakerIds.add(u.speaker);
  }
  });
- const speakers = speakerIds.size > 0 ? Array.from(speakerIds).map((id) => ({ id: String(id) })) : void 0;
+ const speakers = speakerIds.size > 0 ? Array.from(speakerIds).map((id) => ({ id: String(id), label: `Speaker ${id}` })) : void 0;
  const summary = result.summarization?.success && result.summarization.results ? result.summarization.results : void 0;
  return {
  success: true,
@@ -274,34 +276,68 @@ var AssemblyAIWebhookHandler = class extends BaseWebhookHandler {
  }
  /**
  * Check if payload matches AssemblyAI webhook format
+ *
+ * Supports two formats:
+ * - Notification format: `{ transcript_id, status }` (lightweight callback)
+ * - Full transcript format: `{ id, status, audio_url, text, words, ... }` (complete response)
  */
  matches(payload, _options) {
  if (!payload || typeof payload !== "object") {
  return false;
  }
  const obj = payload;
- if (!("transcript_id" in obj) || !("status" in obj)) {
- return false;
- }
- if (typeof obj.transcript_id !== "string") {
- return false;
+ if ("transcript_id" in obj && "status" in obj) {
+ if (typeof obj.transcript_id !== "string") return false;
+ if (obj.status !== "completed" && obj.status !== "error") return false;
+ return true;
  }
- if (obj.status !== "completed" && obj.status !== "error") {
- return false;
+ if ("id" in obj && "status" in obj && "audio_url" in obj) {
+ if (typeof obj.id !== "string") return false;
+ if (obj.status !== "completed" && obj.status !== "error") return false;
+ return true;
  }
- return true;
+ return false;
+ }
+ /**
+ * Determine if the payload is a full transcript (vs a lightweight notification)
+ */
+ isFullTranscript(payload) {
+ return "audio_url" in payload && "id" in payload;
  }
  /**
  * Parse AssemblyAI webhook payload to unified format
+ *
+ * Supports two payload formats:
+ * - Notification: `{ transcript_id, status }` — returns minimal event (ID + status only)
+ * - Full transcript: `{ id, status, text, words, utterances, ... }` — returns complete data
  */
  parse(payload, _options) {
  if (!this.matches(payload)) {
  return this.createErrorEvent(payload, "Invalid AssemblyAI webhook payload");
  }
- const notification = payload;
- const transcriptId = notification.transcript_id;
- const status = notification.status;
+ const obj = payload;
+ const isFullFormat = this.isFullTranscript(obj);
+ const transcriptId = isFullFormat ? payload.id : payload.transcript_id;
+ const status = obj.status;
+ if (status === "error") {
+ const error = isFullFormat ? payload.error : void 0;
+ return {
+ success: false,
+ provider: this.provider,
+ eventType: "transcription.failed",
+ data: {
+ id: transcriptId,
+ status: "error",
+ error: error || "Transcription failed"
+ },
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+ raw: payload
+ };
+ }
  if (status === "completed") {
+ if (isFullFormat) {
+ return this.parseFullTranscript(payload, payload);
+ }
  return {
  success: true,
  provider: this.provider,
@@ -309,28 +345,76 @@ var AssemblyAIWebhookHandler = class extends BaseWebhookHandler {
  data: {
  id: transcriptId,
  status: "completed"
- // Note: Full transcript data needs to be fetched via API
- // using AssemblyAIAdapter.getTranscript(transcriptId)
  },
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
  raw: payload
  };
  }
- if (status === "error") {
+ return this.createErrorEvent(payload, `Unknown AssemblyAI status: ${status}`);
+ }
+ /**
+ * Parse a full AssemblyAI transcript response into unified format
+ *
+ * AssemblyAI times are in milliseconds — converted to seconds for unified format.
+ */
+ parseFullTranscript(transcript, raw) {
+ try {
+ const words = transcript.words ? transcript.words.map((w) => ({
+ word: w.text,
+ start: w.start / 1e3,
+ end: w.end / 1e3,
+ confidence: w.confidence,
+ speaker: w.speaker ?? void 0
+ })) : void 0;
+ const utterances = transcript.utterances ? transcript.utterances.map((u) => ({
+ text: u.text,
+ start: u.start / 1e3,
+ end: u.end / 1e3,
+ speaker: u.speaker,
+ confidence: u.confidence,
+ words: u.words.map((w) => ({
+ word: w.text,
+ start: w.start / 1e3,
+ end: w.end / 1e3,
+ confidence: w.confidence,
+ speaker: w.speaker ?? void 0
+ }))
+ })) : void 0;
+ const speakerIds = /* @__PURE__ */ new Set();
+ transcript.utterances?.forEach((u) => {
+ if (u.speaker) speakerIds.add(u.speaker);
+ });
+ const speakers = speakerIds.size > 0 ? Array.from(speakerIds).map((id) => ({ id, label: `Speaker ${id}` })) : void 0;
  return {
- success: false,
+ success: true,
  provider: this.provider,
- eventType: "transcription.failed",
+ eventType: "transcription.completed",
  data: {
- id: transcriptId,
- status: "error",
- error: "Transcription failed"
+ id: transcript.id,
+ status: "completed",
+ text: transcript.text ?? void 0,
+ confidence: transcript.confidence ?? void 0,
+ duration: transcript.audio_duration ?? void 0,
+ language: transcript.language_code ?? void 0,
+ speakers,
+ words,
+ utterances,
+ summary: transcript.summary ?? void 0,
+ metadata: {
+ speech_model: transcript.speech_model,
+ audio_channels: transcript.audio_channels,
+ webhook_status_code: transcript.webhook_status_code
+ }
  },
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
- raw: payload
+ raw
  };
+ } catch (error) {
+ return this.createErrorEvent(
+ raw,
+ `Failed to parse AssemblyAI transcript: ${error instanceof Error ? error.message : "Unknown error"}`
+ );
  }
- return this.createErrorEvent(payload, `Unknown AssemblyAI status: ${status}`);
  }
  /**
  * Verify AssemblyAI webhook signature
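
The reworked AssemblyAI handler now accepts both the lightweight notification payload and a full transcript payload, and converts AssemblyAI's millisecond word timings to seconds. A hedged sketch of both paths, using only the handler API shown in this diff; the payload values and the import path are invented for illustration:

import { createAssemblyAIWebhookHandler } from "voice-router-dev/dist/webhooks"; // path is an assumption

const handler = createAssemblyAIWebhookHandler();

// Notification format: only an ID and status are available, so the parsed
// event carries no transcript text.
const minimalEvent = handler.parse({ transcript_id: "abc123", status: "completed" });
// minimalEvent.eventType === "transcription.completed"; minimalEvent.data.text is undefined

// Full transcript format: routed through parseFullTranscript, which converts
// millisecond timings to seconds.
const fullEvent = handler.parse({
  id: "abc123",
  status: "completed",
  audio_url: "https://example.com/audio.wav",
  text: "hello world",
  words: [{ text: "hello", start: 0, end: 480, confidence: 0.99, speaker: "A" }],
});
// fullEvent.data.words?.[0].end === 0.48 (480 ms -> 0.48 s)
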
@@ -465,24 +549,33 @@ var DeepgramWebhookHandler = class extends BaseWebhookHandler {
  end: w.end || 0,
  confidence: w.confidence
  })) : void 0;
- const speakers = response.results.utterances && response.results.utterances.length > 0 ? response.results.utterances.map((utterance) => ({
- id: utterance.speaker?.toString() || "unknown",
- speaker: utterance.speaker?.toString() || "unknown",
- text: utterance.transcript || "",
- confidence: utterance.confidence
+ const speakerIds = /* @__PURE__ */ new Set();
+ if (response.results.utterances) {
+ for (const utterance of response.results.utterances) {
+ if (utterance.speaker !== void 0) {
+ speakerIds.add(utterance.speaker.toString());
+ }
+ }
+ }
+ const speakers = speakerIds.size > 0 ? Array.from(speakerIds).map((id) => ({
+ id,
+ label: `Speaker ${id}`
  })) : void 0;
  const utterances = response.results.utterances && response.results.utterances.length > 0 ? response.results.utterances.map((utterance) => ({
+ id: utterance.id,
  text: utterance.transcript || "",
  start: utterance.start || 0,
  end: utterance.end || 0,
  speaker: utterance.speaker?.toString(),
+ channel: utterance.channel,
  confidence: utterance.confidence,
- words: utterance.words && utterance.words.length > 0 ? utterance.words.map((w) => ({
+ words: utterance.words?.map((w) => ({
  word: w.word || "",
  start: w.start || 0,
  end: w.end || 0,
- confidence: w.confidence
- })) : void 0
+ confidence: w.confidence,
+ speaker: w.speaker?.toString()
+ })) ?? []
  })) : void 0;
  const summary = alternative.summaries?.[0]?.summary;
  return {
@@ -495,7 +588,7 @@ var DeepgramWebhookHandler = class extends BaseWebhookHandler {
  text: transcript,
  confidence: alternative.confidence,
  duration,
- language: response.metadata.models?.[0] || void 0,
+ language: channel.detected_language,
  speakers: speakers && speakers.length > 0 ? speakers : void 0,
  words: words && words.length > 0 ? words : void 0,
  utterances: utterances && utterances.length > 0 ? utterances : void 0,
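
For Deepgram payloads, data.speakers is now a de-duplicated list of speaker IDs with a label (one entry per distinct speaker rather than per utterance), and language is taken from channel.detected_language instead of the model list. Illustrative shape only, with invented values:

// One entry per distinct speaker ID after this change:
const speakersExample = [
  { id: "0", label: "Speaker 0" },
  { id: "1", label: "Speaker 1" },
];
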
@@ -682,6 +775,72 @@ function createAzureWebhookHandler() {
  return new AzureWebhookHandler();
  }

+ // src/utils/transcription-helpers.ts
+ function buildUtterancesFromWords(words) {
+ const utterances = [];
+ let currentSpeaker;
+ let currentWords = [];
+ let utteranceStart = 0;
+ for (const word of words) {
+ if (!word.speaker) continue;
+ if (word.speaker !== currentSpeaker) {
+ if (currentSpeaker && currentWords.length > 0) {
+ utterances.push({
+ text: currentWords.map((w) => w.word).join(" "),
+ start: utteranceStart,
+ end: currentWords[currentWords.length - 1].end,
+ speaker: currentSpeaker,
+ words: currentWords
+ });
+ }
+ currentSpeaker = word.speaker;
+ currentWords = [word];
+ utteranceStart = word.start;
+ } else {
+ currentWords.push(word);
+ }
+ }
+ if (currentSpeaker && currentWords.length > 0) {
+ utterances.push({
+ text: currentWords.map((w) => w.word).join(" "),
+ start: utteranceStart,
+ end: currentWords[currentWords.length - 1].end,
+ speaker: currentSpeaker,
+ words: currentWords
+ });
+ }
+ return utterances;
+ }
+ function buildTextFromSpeechmaticsResults(results) {
+ const parts = [];
+ let attachNext = false;
+ for (const result of results) {
+ if (result.type !== "word" && result.type !== "punctuation") continue;
+ const content = result.alternatives?.[0]?.content;
+ if (!content) continue;
+ if (result.type === "punctuation") {
+ const attaches = result.attaches_to;
+ if (attaches === "previous" || attaches === "both") {
+ parts.push(content);
+ attachNext = attaches === "both";
+ } else if (attaches === "next") {
+ if (parts.length > 0) parts.push(" ");
+ parts.push(content);
+ attachNext = true;
+ } else {
+ if (parts.length > 0 && !attachNext) parts.push(" ");
+ parts.push(content);
+ attachNext = false;
+ }
+ } else {
+ if (parts.length > 0 && !attachNext) parts.push(" ");
+ parts.push(content);
+ attachNext = false;
+ }
+ }
+ return parts.join("");
+ }
+
  // src/webhooks/speechmatics-webhook.ts
  var SpeechmaticsWebhookHandler = class extends BaseWebhookHandler {
  constructor() {
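
The new buildUtterancesFromWords helper groups consecutive words by speaker into utterances and skips words that carry no speaker, while buildTextFromSpeechmaticsResults rebuilds readable text from Speechmatics word and punctuation results. Both helpers are bundled but not exported, so the following only illustrates the grouping behavior with invented values:

const words = [
  { word: "hello", start: 0.0, end: 0.4, speaker: "S1" },
  { word: "there", start: 0.5, end: 0.9, speaker: "S1" },
  { word: "hi", start: 1.2, end: 1.4, speaker: "S2" },
];
// buildUtterancesFromWords(words) would yield two utterances:
//   { text: "hello there", start: 0.0, end: 0.9, speaker: "S1", words: [...] }
//   { text: "hi", start: 1.2, end: 1.4, speaker: "S2", words: [...] }
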
@@ -763,18 +922,25 @@ var SpeechmaticsWebhookHandler = class extends BaseWebhookHandler {
  if (status === "success" && payload && typeof payload === "object") {
  const transcript = payload;
  if (transcript.results && transcript.job) {
- const text = transcript.results.filter((r) => r.type === "word" && r.alternatives).map((r) => r.alternatives[0]?.content || "").join(" ");
+ const text = buildTextFromSpeechmaticsResults(transcript.results);
+ const wordResults = transcript.results.filter((r) => r.type === "word" && r.alternatives);
+ const words = wordResults.filter((r) => r.start_time !== void 0 && r.end_time !== void 0).map((r) => ({
+ word: r.alternatives[0]?.content || "",
+ start: r.start_time,
+ end: r.end_time,
+ confidence: r.alternatives[0]?.confidence,
+ speaker: r.alternatives[0]?.speaker
+ }));
  const speakerSet = /* @__PURE__ */ new Set();
- transcript.results.forEach((r) => {
- if (r.alternatives) {
- const speaker = r.alternatives[0]?.speaker;
- if (speaker) speakerSet.add(speaker);
- }
+ wordResults.forEach((r) => {
+ const speaker = r.alternatives[0]?.speaker;
+ if (speaker) speakerSet.add(speaker);
  });
  const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
  id,
  label: `Speaker ${id}`
  })) : void 0;
+ const utterances = buildUtterancesFromWords(words);
  return {
  success: true,
  provider: this.provider,
@@ -787,6 +953,8 @@ var SpeechmaticsWebhookHandler = class extends BaseWebhookHandler {
  language: transcript.metadata.transcription_config?.language,
  duration: transcript.job.duration,
  speakers,
+ words: words.length > 0 ? words : void 0,
+ utterances: utterances.length > 0 ? utterances : void 0,
  createdAt: transcript.job.created_at
  },
  raw: payload
@@ -808,6 +976,138 @@ var SpeechmaticsWebhookHandler = class extends BaseWebhookHandler {
  }
  };

+ // src/webhooks/elevenlabs-webhook.ts
+ var ElevenLabsWebhookHandler = class extends BaseWebhookHandler {
+ constructor() {
+ super(...arguments);
+ this.provider = "elevenlabs";
+ }
+ /**
+ * Check if payload matches ElevenLabs webhook format
+ *
+ * ElevenLabs webhook payloads contain the full transcription result
+ * with `words` array and `language_code` / `language_probability` fields.
+ */
+ matches(payload, _options) {
+ if (!payload || typeof payload !== "object") {
+ return false;
+ }
+ const obj = payload;
+ if (!("words" in obj) || !("language_code" in obj) || !("language_probability" in obj)) {
+ return false;
+ }
+ if (!Array.isArray(obj.words)) {
+ return false;
+ }
+ if (!("text" in obj)) {
+ return false;
+ }
+ if (obj.words.length > 0) {
+ const firstWord = obj.words[0];
+ if (!("logprob" in firstWord) || !("type" in firstWord)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ /**
+ * Parse ElevenLabs webhook payload to unified format
+ */
+ parse(payload, _options) {
+ if (!this.matches(payload)) {
+ return this.createErrorEvent(payload, "Invalid ElevenLabs webhook payload");
+ }
+ const response = payload;
+ try {
+ const transcriptionId = response.transcription_id?.toString() || "";
+ const transcript = response.text;
+ if (!transcript) {
+ return {
+ success: false,
+ provider: this.provider,
+ eventType: "transcription.failed",
+ data: {
+ id: transcriptionId,
+ status: "error",
+ error: "Empty transcript"
+ },
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+ raw: payload
+ };
+ }
+ const words = response.words && response.words.length > 0 ? response.words.filter((w) => w.type === "word").map((w) => ({
+ word: w.text || "",
+ start: typeof w.start === "number" ? w.start : 0,
+ end: typeof w.end === "number" ? w.end : 0,
+ confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
+ speaker: w.speaker_id?.toString()
+ })) : void 0;
+ const speakerIds = /* @__PURE__ */ new Set();
+ if (response.words) {
+ for (const w of response.words) {
+ if (w.speaker_id !== void 0 && w.speaker_id !== null) {
+ speakerIds.add(w.speaker_id.toString());
+ }
+ }
+ }
+ const speakers = speakerIds.size > 0 ? Array.from(speakerIds).map((id) => ({
+ id,
+ label: `Speaker ${id}`
+ })) : void 0;
+ const utterances = words && words.length > 0 ? buildUtterancesFromWords(
+ words.map((w) => ({
+ word: w.word,
+ start: w.start,
+ end: w.end,
+ confidence: w.confidence,
+ speaker: w.speaker
+ }))
+ ) : void 0;
+ return {
+ success: true,
+ provider: this.provider,
+ eventType: "transcription.completed",
+ data: {
+ id: transcriptionId,
+ status: "completed",
+ text: transcript,
+ language: response.language_code,
+ speakers: speakers && speakers.length > 0 ? speakers : void 0,
+ words: words && words.length > 0 ? words : void 0,
+ utterances: utterances && utterances.length > 0 ? utterances : void 0,
+ metadata: {
+ language_probability: response.language_probability,
+ entities: response.entities,
+ channel_index: response.channel_index
+ }
+ },
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+ raw: payload
+ };
+ } catch (error) {
+ return this.createErrorEvent(
+ payload,
+ `Failed to parse ElevenLabs webhook: ${error instanceof Error ? error.message : "Unknown error"}`
+ );
+ }
+ }
+ /**
+ * Verify ElevenLabs webhook signature
+ *
+ * Note: ElevenLabs does not currently provide a standard webhook signature
+ * verification mechanism for STT webhooks. For security, use HTTPS and
+ * validate the request source.
+ *
+ * @returns Always returns true (no verification available)
+ */
+ verify() {
+ return true;
+ }
+ };
+ function createElevenLabsWebhookHandler() {
+ return new ElevenLabsWebhookHandler();
+ }
+
  // src/webhooks/webhook-router.ts
  var WebhookRouter = class {
  constructor() {
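
The new ElevenLabs handler matches payloads that carry text, a words array, language_code and language_probability, derives per-word confidence from logprob via Math.exp, and builds speakers and utterances from speaker_id. A hedged sketch with an invented payload (field names mirror what matches()/parse() read in this diff; the import path is an assumption):

import { createElevenLabsWebhookHandler } from "voice-router-dev/dist/webhooks"; // path is an assumption

const handler = createElevenLabsWebhookHandler();
const payload = {
  transcription_id: "txn_123",
  text: "hello world",
  language_code: "en",
  language_probability: 0.98,
  words: [
    { type: "word", text: "hello", start: 0.0, end: 0.4, logprob: -0.05, speaker_id: "speaker_0" },
    { type: "spacing", text: " ", start: 0.4, end: 0.5, logprob: 0 },
    { type: "word", text: "world", start: 0.5, end: 0.9, logprob: -0.1, speaker_id: "speaker_0" },
  ],
};

if (handler.matches(payload)) {
  const event = handler.parse(payload);
  // event.data.words keeps only type === "word" entries;
  // confidence === Math.exp(logprob), e.g. Math.exp(-0.05) ≈ 0.951
}
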
@@ -816,7 +1116,8 @@ var WebhookRouter = class {
  ["assemblyai", new AssemblyAIWebhookHandler()],
  ["deepgram", new DeepgramWebhookHandler()],
  ["azure-stt", new AzureWebhookHandler()],
- ["speechmatics", new SpeechmaticsWebhookHandler()]
+ ["speechmatics", new SpeechmaticsWebhookHandler()],
+ ["elevenlabs", new ElevenLabsWebhookHandler()]
  ]);
  }
  /**
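
The router's handler map now registers "elevenlabs" alongside the existing providers. The router's public dispatch method is not shown in this hunk, so the sketch below routes a request body manually using only the handler API visible in this diff (matches/parse); WebhookRouter presumably does something similar internally:

import {
  createAssemblyAIWebhookHandler,
  createDeepgramWebhookHandler,
  createElevenLabsWebhookHandler,
  createGladiaWebhookHandler,
} from "voice-router-dev/dist/webhooks"; // path is an assumption

const handlers = [
  createAssemblyAIWebhookHandler(),
  createDeepgramWebhookHandler(),
  createGladiaWebhookHandler(),
  createElevenLabsWebhookHandler(), // new in 0.8.1
];

function parseWebhook(body: unknown) {
  const handler = handlers.find((h) => h.matches(body));
  return handler ? handler.parse(body) : undefined;
}
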
@@ -996,12 +1297,14 @@ function createWebhookRouter() {
  AzureWebhookHandler,
  BaseWebhookHandler,
  DeepgramWebhookHandler,
+ ElevenLabsWebhookHandler,
  GladiaWebhookHandler,
  SpeechmaticsWebhookHandler,
  WebhookRouter,
  createAssemblyAIWebhookHandler,
  createAzureWebhookHandler,
  createDeepgramWebhookHandler,
+ createElevenLabsWebhookHandler,
  createGladiaWebhookHandler,
  createWebhookRouter
  });