voice-router-dev 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
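
The headline change in 0.3.0 is a set of newly exported streaming enums (see the `__export` additions in the first hunks below). As a rough sketch of how they could be consumed — assuming the package exposes them from its root entry point, as the export list suggests — usage might look like this; the property names are copied from the enum objects added in this diff:

```typescript
// Sketch only: import names come from the __export() additions in this diff;
// importing from the package root is an assumption.
import {
  GladiaEncoding,
  GladiaSampleRate,
  DeepgramModel,
  AssemblyAIEncoding,
  AssemblyAISampleRate,
} from "voice-router-dev";

// The enums are plain value maps, so they can replace hard-coded literals.
const gladiaAudio = {
  encoding: GladiaEncoding["wav/pcm"],        // "wav/pcm"
  sampleRate: GladiaSampleRate.NUMBER_16000,  // 16000
};

const deepgramModel = DeepgramModel["nova-3"]; // "nova-3"

const assemblyAIAudio = {
  encoding: AssemblyAIEncoding.pcmS16le,       // "pcm_s16le"
  sampleRate: AssemblyAISampleRate.rate16000,  // 16000
};
```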
package/dist/index.js CHANGED
@@ -32,6 +32,9 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
  var src_exports = {};
  __export(src_exports, {
  AssemblyAIAdapter: () => AssemblyAIAdapter,
+ AssemblyAIEncoding: () => AssemblyAIEncoding,
+ AssemblyAISampleRate: () => AssemblyAISampleRate,
+ AssemblyAISpeechModel: () => AssemblyAISpeechModel,
  AssemblyAITypes: () => schema_exports2,
  AssemblyAIWebhookHandler: () => AssemblyAIWebhookHandler,
  AzureSTTAdapter: () => AzureSTTAdapter,
@@ -39,8 +42,18 @@ __export(src_exports, {
  BaseAdapter: () => BaseAdapter,
  BaseWebhookHandler: () => BaseWebhookHandler,
  DeepgramAdapter: () => DeepgramAdapter,
+ DeepgramEncoding: () => ListenV1EncodingParameter,
+ DeepgramModel: () => DeepgramModel,
+ DeepgramRedact: () => ListenV1RedactParameterOneOfItem,
+ DeepgramTopicMode: () => SharedCustomTopicModeParameter,
  DeepgramWebhookHandler: () => DeepgramWebhookHandler,
  GladiaAdapter: () => GladiaAdapter,
+ GladiaBitDepth: () => StreamingSupportedBitDepthEnum,
+ GladiaEncoding: () => StreamingSupportedEncodingEnum,
+ GladiaLanguage: () => TranscriptionLanguageCodeEnum,
+ GladiaModel: () => StreamingSupportedModels,
+ GladiaSampleRate: () => StreamingSupportedSampleRateEnum,
+ GladiaTranslationLanguage: () => TranslationLanguageCodeEnum,
  GladiaTypes: () => schema_exports,
  GladiaWebhookHandler: () => GladiaWebhookHandler,
  ListenV1EncodingParameter: () => ListenV1EncodingParameter,
@@ -285,6 +298,312 @@ var ListenV1EncodingParameter = {
  g729: "g729"
  };

+ // src/generated/deepgram/schema/listenV1RedactParameterOneOfItem.ts
+ var ListenV1RedactParameterOneOfItem = {
+ pci: "pci",
+ pii: "pii",
+ numbers: "numbers"
+ };
+
+ // src/generated/deepgram/schema/sharedCustomTopicModeParameter.ts
+ var SharedCustomTopicModeParameter = {
+ extended: "extended",
+ strict: "strict"
+ };
+
+ // src/generated/gladia/schema/streamingSupportedEncodingEnum.ts
+ var StreamingSupportedEncodingEnum = {
+ "wav/pcm": "wav/pcm",
+ "wav/alaw": "wav/alaw",
+ "wav/ulaw": "wav/ulaw"
+ };
+
+ // src/generated/gladia/schema/streamingSupportedSampleRateEnum.ts
+ var StreamingSupportedSampleRateEnum = {
+ NUMBER_8000: 8e3,
+ NUMBER_16000: 16e3,
+ NUMBER_32000: 32e3,
+ NUMBER_44100: 44100,
+ NUMBER_48000: 48e3
+ };
+
+ // src/generated/gladia/schema/streamingSupportedBitDepthEnum.ts
+ var StreamingSupportedBitDepthEnum = {
+ NUMBER_8: 8,
+ NUMBER_16: 16,
+ NUMBER_24: 24,
+ NUMBER_32: 32
+ };
+
+ // src/generated/gladia/schema/streamingSupportedModels.ts
+ var StreamingSupportedModels = {
+ "solaria-1": "solaria-1"
+ };
+
343
+ // src/generated/gladia/schema/transcriptionLanguageCodeEnum.ts
344
+ var TranscriptionLanguageCodeEnum = {
345
+ af: "af",
346
+ am: "am",
347
+ ar: "ar",
348
+ as: "as",
349
+ az: "az",
350
+ ba: "ba",
351
+ be: "be",
352
+ bg: "bg",
353
+ bn: "bn",
354
+ bo: "bo",
355
+ br: "br",
356
+ bs: "bs",
357
+ ca: "ca",
358
+ cs: "cs",
359
+ cy: "cy",
360
+ da: "da",
361
+ de: "de",
362
+ el: "el",
363
+ en: "en",
364
+ es: "es",
365
+ et: "et",
366
+ eu: "eu",
367
+ fa: "fa",
368
+ fi: "fi",
369
+ fo: "fo",
370
+ fr: "fr",
371
+ gl: "gl",
372
+ gu: "gu",
373
+ ha: "ha",
374
+ haw: "haw",
375
+ he: "he",
376
+ hi: "hi",
377
+ hr: "hr",
378
+ ht: "ht",
379
+ hu: "hu",
380
+ hy: "hy",
381
+ id: "id",
382
+ is: "is",
383
+ it: "it",
384
+ ja: "ja",
385
+ jw: "jw",
386
+ ka: "ka",
387
+ kk: "kk",
388
+ km: "km",
389
+ kn: "kn",
390
+ ko: "ko",
391
+ la: "la",
392
+ lb: "lb",
393
+ ln: "ln",
394
+ lo: "lo",
395
+ lt: "lt",
396
+ lv: "lv",
397
+ mg: "mg",
398
+ mi: "mi",
399
+ mk: "mk",
400
+ ml: "ml",
401
+ mn: "mn",
402
+ mr: "mr",
403
+ ms: "ms",
404
+ mt: "mt",
405
+ my: "my",
406
+ ne: "ne",
407
+ nl: "nl",
408
+ nn: "nn",
409
+ no: "no",
410
+ oc: "oc",
411
+ pa: "pa",
412
+ pl: "pl",
413
+ ps: "ps",
414
+ pt: "pt",
415
+ ro: "ro",
416
+ ru: "ru",
417
+ sa: "sa",
418
+ sd: "sd",
419
+ si: "si",
420
+ sk: "sk",
421
+ sl: "sl",
422
+ sn: "sn",
423
+ so: "so",
424
+ sq: "sq",
425
+ sr: "sr",
426
+ su: "su",
427
+ sv: "sv",
428
+ sw: "sw",
429
+ ta: "ta",
430
+ te: "te",
431
+ tg: "tg",
432
+ th: "th",
433
+ tk: "tk",
434
+ tl: "tl",
435
+ tr: "tr",
436
+ tt: "tt",
437
+ uk: "uk",
438
+ ur: "ur",
439
+ uz: "uz",
440
+ vi: "vi",
441
+ yi: "yi",
442
+ yo: "yo",
443
+ zh: "zh"
444
+ };
445
+
446
+ // src/generated/gladia/schema/translationLanguageCodeEnum.ts
447
+ var TranslationLanguageCodeEnum = {
448
+ af: "af",
449
+ am: "am",
450
+ ar: "ar",
451
+ as: "as",
452
+ az: "az",
453
+ ba: "ba",
454
+ be: "be",
455
+ bg: "bg",
456
+ bn: "bn",
457
+ bo: "bo",
458
+ br: "br",
459
+ bs: "bs",
460
+ ca: "ca",
461
+ cs: "cs",
462
+ cy: "cy",
463
+ da: "da",
464
+ de: "de",
465
+ el: "el",
466
+ en: "en",
467
+ es: "es",
468
+ et: "et",
469
+ eu: "eu",
470
+ fa: "fa",
471
+ fi: "fi",
472
+ fo: "fo",
473
+ fr: "fr",
474
+ gl: "gl",
475
+ gu: "gu",
476
+ ha: "ha",
477
+ haw: "haw",
478
+ he: "he",
479
+ hi: "hi",
480
+ hr: "hr",
481
+ ht: "ht",
482
+ hu: "hu",
483
+ hy: "hy",
484
+ id: "id",
485
+ is: "is",
486
+ it: "it",
487
+ ja: "ja",
488
+ jw: "jw",
489
+ ka: "ka",
490
+ kk: "kk",
491
+ km: "km",
492
+ kn: "kn",
493
+ ko: "ko",
494
+ la: "la",
495
+ lb: "lb",
496
+ ln: "ln",
497
+ lo: "lo",
498
+ lt: "lt",
499
+ lv: "lv",
500
+ mg: "mg",
501
+ mi: "mi",
502
+ mk: "mk",
503
+ ml: "ml",
504
+ mn: "mn",
505
+ mr: "mr",
506
+ ms: "ms",
507
+ mt: "mt",
508
+ my: "my",
509
+ ne: "ne",
510
+ nl: "nl",
511
+ nn: "nn",
512
+ no: "no",
513
+ oc: "oc",
514
+ pa: "pa",
515
+ pl: "pl",
516
+ ps: "ps",
517
+ pt: "pt",
518
+ ro: "ro",
519
+ ru: "ru",
520
+ sa: "sa",
521
+ sd: "sd",
522
+ si: "si",
523
+ sk: "sk",
524
+ sl: "sl",
525
+ sn: "sn",
526
+ so: "so",
527
+ sq: "sq",
528
+ sr: "sr",
529
+ su: "su",
530
+ sv: "sv",
531
+ sw: "sw",
532
+ ta: "ta",
533
+ te: "te",
534
+ tg: "tg",
535
+ th: "th",
536
+ tk: "tk",
537
+ tl: "tl",
538
+ tr: "tr",
539
+ tt: "tt",
540
+ uk: "uk",
541
+ ur: "ur",
542
+ uz: "uz",
543
+ vi: "vi",
544
+ wo: "wo",
545
+ yi: "yi",
546
+ yo: "yo",
547
+ zh: "zh"
548
+ };
+
+ // src/router/streaming-enums.ts
+ var DeepgramModel = {
+ // Nova 3 models (latest)
+ "nova-3": "nova-3",
+ "nova-3-general": "nova-3-general",
+ "nova-3-medical": "nova-3-medical",
+ // Nova 2 models
+ "nova-2": "nova-2",
+ "nova-2-general": "nova-2-general",
+ "nova-2-meeting": "nova-2-meeting",
+ "nova-2-finance": "nova-2-finance",
+ "nova-2-conversationalai": "nova-2-conversationalai",
+ "nova-2-voicemail": "nova-2-voicemail",
+ "nova-2-video": "nova-2-video",
+ "nova-2-medical": "nova-2-medical",
+ "nova-2-drivethru": "nova-2-drivethru",
+ "nova-2-automotive": "nova-2-automotive",
+ // Nova 1 models
+ nova: "nova",
+ "nova-general": "nova-general",
+ "nova-phonecall": "nova-phonecall",
+ "nova-medical": "nova-medical",
+ // Enhanced models
+ enhanced: "enhanced",
+ "enhanced-general": "enhanced-general",
+ "enhanced-meeting": "enhanced-meeting",
+ "enhanced-phonecall": "enhanced-phonecall",
+ "enhanced-finance": "enhanced-finance",
+ // Base models
+ base: "base",
+ meeting: "meeting",
+ phonecall: "phonecall",
+ finance: "finance",
+ conversationalai: "conversationalai",
+ voicemail: "voicemail",
+ video: "video"
+ };
+ var AssemblyAIEncoding = {
+ /** PCM signed 16-bit little-endian (recommended) */
+ pcmS16le: "pcm_s16le",
+ /** μ-law (telephony) */
+ pcmMulaw: "pcm_mulaw"
+ };
+ var AssemblyAISpeechModel = {
+ /** Optimized for English */
+ english: "universal-streaming-english",
+ /** Supports 20+ languages */
+ multilingual: "universal-streaming-multilingual"
+ };
+ var AssemblyAISampleRate = {
+ rate8000: 8e3,
+ rate16000: 16e3,
+ rate22050: 22050,
+ rate44100: 44100,
+ rate48000: 48e3
+ };
+
288
607
  // src/generated/deepgram/schema/speakV1EncodingParameter.ts
289
608
  var SpeakV1EncodingParameter = {
290
609
  linear16: "linear16",
@@ -314,30 +633,6 @@ var SpeakV1SampleRateParameter = {
314
633
  NUMBER_22050: 22050
315
634
  };
316
635
 
317
- // src/generated/gladia/schema/streamingSupportedEncodingEnum.ts
318
- var StreamingSupportedEncodingEnum = {
319
- "wav/pcm": "wav/pcm",
320
- "wav/alaw": "wav/alaw",
321
- "wav/ulaw": "wav/ulaw"
322
- };
323
-
324
- // src/generated/gladia/schema/streamingSupportedSampleRateEnum.ts
325
- var StreamingSupportedSampleRateEnum = {
326
- NUMBER_8000: 8e3,
327
- NUMBER_16000: 16e3,
328
- NUMBER_32000: 32e3,
329
- NUMBER_44100: 44100,
330
- NUMBER_48000: 48e3
331
- };
332
-
333
- // src/generated/gladia/schema/streamingSupportedBitDepthEnum.ts
334
- var StreamingSupportedBitDepthEnum = {
335
- NUMBER_8: 8,
336
- NUMBER_16: 16,
337
- NUMBER_24: 24,
338
- NUMBER_32: 32
339
- };
340
-
341
636
  // src/constants/defaults.ts
342
637
  var DEFAULT_TIMEOUTS = {
343
638
  /** Standard HTTP request timeout for API calls (60 seconds) */
@@ -1225,11 +1520,6 @@ var StreamingResponseStatus = {
1225
1520
  error: "error"
1226
1521
  };
1227
1522
 
1228
- // src/generated/gladia/schema/streamingSupportedModels.ts
1229
- var StreamingSupportedModels = {
1230
- "solaria-1": "solaria-1"
1231
- };
1232
-
1233
1523
  // src/generated/gladia/schema/streamingSupportedRegions.ts
1234
1524
  var StreamingSupportedRegions = {
1235
1525
  "us-west": "us-west",
@@ -1255,232 +1545,25 @@ var SummaryTypesEnum = {
1255
1545
  concise: "concise"
1256
1546
  };
1257
1547
 
1258
- // src/generated/gladia/schema/transcriptionControllerListV2KindItem.ts
1259
- var TranscriptionControllerListV2KindItem = {
1260
- "pre-recorded": "pre-recorded",
1261
- live: "live"
1262
- };
1263
-
1264
- // src/generated/gladia/schema/transcriptionControllerListV2StatusItem.ts
1265
- var TranscriptionControllerListV2StatusItem = {
1266
- queued: "queued",
1267
- processing: "processing",
1268
- done: "done",
1269
- error: "error"
1270
- };
1271
-
1272
- // src/generated/gladia/schema/transcriptionLanguageCodeEnum.ts
1273
- var TranscriptionLanguageCodeEnum = {
1274
- af: "af",
1275
- am: "am",
1276
- ar: "ar",
1277
- as: "as",
1278
- az: "az",
1279
- ba: "ba",
1280
- be: "be",
1281
- bg: "bg",
1282
- bn: "bn",
1283
- bo: "bo",
1284
- br: "br",
1285
- bs: "bs",
1286
- ca: "ca",
1287
- cs: "cs",
1288
- cy: "cy",
1289
- da: "da",
1290
- de: "de",
1291
- el: "el",
1292
- en: "en",
1293
- es: "es",
1294
- et: "et",
1295
- eu: "eu",
1296
- fa: "fa",
1297
- fi: "fi",
1298
- fo: "fo",
1299
- fr: "fr",
1300
- gl: "gl",
1301
- gu: "gu",
1302
- ha: "ha",
1303
- haw: "haw",
1304
- he: "he",
1305
- hi: "hi",
1306
- hr: "hr",
1307
- ht: "ht",
1308
- hu: "hu",
1309
- hy: "hy",
1310
- id: "id",
1311
- is: "is",
1312
- it: "it",
1313
- ja: "ja",
1314
- jw: "jw",
1315
- ka: "ka",
1316
- kk: "kk",
1317
- km: "km",
1318
- kn: "kn",
1319
- ko: "ko",
1320
- la: "la",
1321
- lb: "lb",
1322
- ln: "ln",
1323
- lo: "lo",
1324
- lt: "lt",
1325
- lv: "lv",
1326
- mg: "mg",
1327
- mi: "mi",
1328
- mk: "mk",
1329
- ml: "ml",
1330
- mn: "mn",
1331
- mr: "mr",
1332
- ms: "ms",
1333
- mt: "mt",
1334
- my: "my",
1335
- ne: "ne",
1336
- nl: "nl",
1337
- nn: "nn",
1338
- no: "no",
1339
- oc: "oc",
1340
- pa: "pa",
1341
- pl: "pl",
1342
- ps: "ps",
1343
- pt: "pt",
1344
- ro: "ro",
1345
- ru: "ru",
1346
- sa: "sa",
1347
- sd: "sd",
1348
- si: "si",
1349
- sk: "sk",
1350
- sl: "sl",
1351
- sn: "sn",
1352
- so: "so",
1353
- sq: "sq",
1354
- sr: "sr",
1355
- su: "su",
1356
- sv: "sv",
1357
- sw: "sw",
1358
- ta: "ta",
1359
- te: "te",
1360
- tg: "tg",
1361
- th: "th",
1362
- tk: "tk",
1363
- tl: "tl",
1364
- tr: "tr",
1365
- tt: "tt",
1366
- uk: "uk",
1367
- ur: "ur",
1368
- uz: "uz",
1369
- vi: "vi",
1370
- yi: "yi",
1371
- yo: "yo",
1372
- zh: "zh"
1373
- };
1374
-
1548
+ // src/generated/gladia/schema/transcriptionControllerListV2KindItem.ts
1549
+ var TranscriptionControllerListV2KindItem = {
1550
+ "pre-recorded": "pre-recorded",
1551
+ live: "live"
1552
+ };
1553
+
1554
+ // src/generated/gladia/schema/transcriptionControllerListV2StatusItem.ts
1555
+ var TranscriptionControllerListV2StatusItem = {
1556
+ queued: "queued",
1557
+ processing: "processing",
1558
+ done: "done",
1559
+ error: "error"
1560
+ };
1561
+
1375
1562
  // src/generated/gladia/schema/transcriptMessageType.ts
1376
1563
  var TranscriptMessageType = {
1377
1564
  transcript: "transcript"
1378
1565
  };
1379
1566
 
1380
- // src/generated/gladia/schema/translationLanguageCodeEnum.ts
1381
- var TranslationLanguageCodeEnum = {
1382
- af: "af",
1383
- am: "am",
1384
- ar: "ar",
1385
- as: "as",
1386
- az: "az",
1387
- ba: "ba",
1388
- be: "be",
1389
- bg: "bg",
1390
- bn: "bn",
1391
- bo: "bo",
1392
- br: "br",
1393
- bs: "bs",
1394
- ca: "ca",
1395
- cs: "cs",
1396
- cy: "cy",
1397
- da: "da",
1398
- de: "de",
1399
- el: "el",
1400
- en: "en",
1401
- es: "es",
1402
- et: "et",
1403
- eu: "eu",
1404
- fa: "fa",
1405
- fi: "fi",
1406
- fo: "fo",
1407
- fr: "fr",
1408
- gl: "gl",
1409
- gu: "gu",
1410
- ha: "ha",
1411
- haw: "haw",
1412
- he: "he",
1413
- hi: "hi",
1414
- hr: "hr",
1415
- ht: "ht",
1416
- hu: "hu",
1417
- hy: "hy",
1418
- id: "id",
1419
- is: "is",
1420
- it: "it",
1421
- ja: "ja",
1422
- jw: "jw",
1423
- ka: "ka",
1424
- kk: "kk",
1425
- km: "km",
1426
- kn: "kn",
1427
- ko: "ko",
1428
- la: "la",
1429
- lb: "lb",
1430
- ln: "ln",
1431
- lo: "lo",
1432
- lt: "lt",
1433
- lv: "lv",
1434
- mg: "mg",
1435
- mi: "mi",
1436
- mk: "mk",
1437
- ml: "ml",
1438
- mn: "mn",
1439
- mr: "mr",
1440
- ms: "ms",
1441
- mt: "mt",
1442
- my: "my",
1443
- ne: "ne",
1444
- nl: "nl",
1445
- nn: "nn",
1446
- no: "no",
1447
- oc: "oc",
1448
- pa: "pa",
1449
- pl: "pl",
1450
- ps: "ps",
1451
- pt: "pt",
1452
- ro: "ro",
1453
- ru: "ru",
1454
- sa: "sa",
1455
- sd: "sd",
1456
- si: "si",
1457
- sk: "sk",
1458
- sl: "sl",
1459
- sn: "sn",
1460
- so: "so",
1461
- sq: "sq",
1462
- sr: "sr",
1463
- su: "su",
1464
- sv: "sv",
1465
- sw: "sw",
1466
- ta: "ta",
1467
- te: "te",
1468
- tg: "tg",
1469
- th: "th",
1470
- tk: "tk",
1471
- tl: "tl",
1472
- tr: "tr",
1473
- tt: "tt",
1474
- uk: "uk",
1475
- ur: "ur",
1476
- uz: "uz",
1477
- vi: "vi",
1478
- wo: "wo",
1479
- yi: "yi",
1480
- yo: "yo",
1481
- zh: "zh"
1482
- };
1483
-
1484
1567
  // src/generated/gladia/schema/translationMessageType.ts
1485
1568
  var TranslationMessageType = {
1486
1569
  translation: "translation"
@@ -2048,7 +2131,7 @@ var GladiaAdapter = class extends BaseAdapter {
  }))
  );
  return extractWords(allWords, (item) => ({
- text: item.word.word,
+ word: item.word.word,
  start: item.word.start,
  end: item.word.end,
  confidence: item.word.confidence,
@@ -2068,11 +2151,11 @@ var GladiaAdapter = class extends BaseAdapter {
  end: utterance.end,
  speaker: utterance.speaker?.toString(),
  confidence: utterance.confidence,
- words: utterance.words.map((word) => ({
- text: word.word,
- start: word.start,
- end: word.end,
- confidence: word.confidence
+ words: utterance.words.map((w) => ({
+ word: w.word,
+ start: w.start,
+ end: w.end,
+ confidence: w.confidence
  }))
  }));
  }
@@ -2124,11 +2207,46 @@ var GladiaAdapter = class extends BaseAdapter {
2124
2207
  * Creates a WebSocket connection to Gladia for streaming transcription.
2125
2208
  * First initializes a session via REST API, then connects to WebSocket.
2126
2209
  *
2210
+ * Supports all Gladia streaming features:
2211
+ * - Real-time transcription with interim/final results
2212
+ * - Speech detection events (speech_start, speech_end)
2213
+ * - Real-time translation to other languages
2214
+ * - Real-time sentiment analysis
2215
+ * - Real-time named entity recognition
2216
+ * - Post-processing summarization and chapterization
2217
+ * - Audio preprocessing (audio enhancement, speech threshold)
2218
+ * - Custom vocabulary and spelling
2219
+ * - Multi-language code switching
2220
+ *
2127
2221
  * @param options - Streaming configuration options
2222
+ * @param options.encoding - Audio encoding (wav/pcm, wav/alaw, wav/ulaw)
2223
+ * @param options.sampleRate - Sample rate (8000, 16000, 32000, 44100, 48000)
2224
+ * @param options.bitDepth - Bit depth (8, 16, 24, 32)
2225
+ * @param options.channels - Number of channels (1-8)
2226
+ * @param options.language - Language code for transcription
2227
+ * @param options.interimResults - Enable partial/interim transcripts
2228
+ * @param options.endpointing - Silence duration to end utterance (0.01-10 seconds)
2229
+ * @param options.maxSilence - Max duration without endpointing (5-60 seconds)
2230
+ * @param options.customVocabulary - Words to boost in recognition
2231
+ * @param options.sentimentAnalysis - Enable real-time sentiment analysis
2232
+ * @param options.entityDetection - Enable named entity recognition
2233
+ * @param options.summarization - Enable post-processing summarization
2234
+ * @param options.gladiaStreaming - Full Gladia streaming options (pre_processing, realtime_processing, post_processing, messages_config)
2128
2235
  * @param callbacks - Event callbacks for transcription results
2236
+ * @param callbacks.onTranscript - Interim/final transcript received
2237
+ * @param callbacks.onUtterance - Complete utterance detected
2238
+ * @param callbacks.onSpeechStart - Speech detected (requires messages_config.receive_speech_events)
2239
+ * @param callbacks.onSpeechEnd - Speech ended (requires messages_config.receive_speech_events)
2240
+ * @param callbacks.onTranslation - Translation result (requires translation enabled)
2241
+ * @param callbacks.onSentiment - Sentiment analysis result
2242
+ * @param callbacks.onEntity - Named entity detected
2243
+ * @param callbacks.onSummarization - Summarization completed
2244
+ * @param callbacks.onChapterization - Chapterization completed
2245
+ * @param callbacks.onAudioAck - Audio chunk acknowledged
2246
+ * @param callbacks.onLifecycle - Session lifecycle events
2129
2247
  * @returns Promise that resolves with a StreamingSession
2130
2248
  *
2131
- * @example Real-time streaming
2249
+ * @example Basic real-time streaming
2132
2250
  * ```typescript
2133
2251
  * const session = await adapter.transcribeStream({
2134
2252
  * encoding: 'wav/pcm',
@@ -2150,15 +2268,124 @@ var GladiaAdapter = class extends BaseAdapter {
2150
2268
  * });
2151
2269
  *
2152
2270
  * // Send audio chunks
2153
- * const audioChunk = getAudioChunk(); // Your audio source
2271
+ * const audioChunk = getAudioChunk();
2154
2272
  * await session.sendAudio({ data: audioChunk });
2155
2273
  *
2156
2274
  * // Close when done
2157
2275
  * await session.close();
2158
2276
  * ```
2277
+ *
2278
+ * @example Advanced streaming with all features
2279
+ * ```typescript
2280
+ * const session = await adapter.transcribeStream({
2281
+ * encoding: 'wav/pcm',
2282
+ * sampleRate: 16000,
2283
+ * language: 'en',
2284
+ * sentimentAnalysis: true,
2285
+ * entityDetection: true,
2286
+ * summarization: true,
2287
+ * gladiaStreaming: {
2288
+ * pre_processing: {
2289
+ * audio_enhancer: true,
2290
+ * speech_threshold: 0.5
2291
+ * },
2292
+ * realtime_processing: {
2293
+ * translation: true,
2294
+ * translation_config: { target_languages: ['fr', 'es'] }
2295
+ * },
2296
+ * post_processing: {
2297
+ * chapterization: true
2298
+ * },
2299
+ * messages_config: {
2300
+ * receive_speech_events: true,
2301
+ * receive_acknowledgments: true,
2302
+ * receive_lifecycle_events: true
2303
+ * }
2304
+ * }
2305
+ * }, {
2306
+ * onTranscript: (e) => console.log('Transcript:', e.text),
2307
+ * onSpeechStart: (e) => console.log('Speech started at:', e.timestamp),
2308
+ * onSpeechEnd: (e) => console.log('Speech ended at:', e.timestamp),
2309
+ * onTranslation: (e) => console.log(`${e.targetLanguage}: ${e.translatedText}`),
2310
+ * onSentiment: (e) => console.log('Sentiment:', e.sentiment),
2311
+ * onEntity: (e) => console.log(`Entity: ${e.type} - ${e.text}`),
2312
+ * onSummarization: (e) => console.log('Summary:', e.summary),
2313
+ * onChapterization: (e) => console.log('Chapters:', e.chapters),
2314
+ * onAudioAck: (e) => console.log('Audio ack:', e.byteRange),
2315
+ * onLifecycle: (e) => console.log('Lifecycle:', e.eventType)
2316
+ * });
2317
+ * ```
2159
2318
  */
2160
2319
  async transcribeStream(options, callbacks) {
2161
2320
  this.validateConfig();
2321
+ const streamingRequest = this.buildStreamingRequest(options);
2322
+ const initResponse = await streamingControllerInitStreamingSessionV2(
2323
+ streamingRequest,
2324
+ void 0,
2325
+ // no params
2326
+ this.getAxiosConfig()
2327
+ );
2328
+ const { id, url: wsUrl } = initResponse.data;
2329
+ const ws = new import_ws.default(wsUrl);
2330
+ let sessionStatus = "connecting";
2331
+ setupWebSocketHandlers(ws, callbacks, (status) => {
2332
+ sessionStatus = status;
2333
+ });
2334
+ ws.on("message", (data) => {
2335
+ try {
2336
+ const message = JSON.parse(data.toString());
2337
+ this.handleWebSocketMessage(message, callbacks);
2338
+ } catch (error) {
2339
+ callbacks?.onError?.({
2340
+ code: ERROR_CODES.PARSE_ERROR,
2341
+ message: "Failed to parse WebSocket message",
2342
+ details: error
2343
+ });
2344
+ }
2345
+ });
2346
+ await waitForWebSocketOpen(ws);
2347
+ return {
2348
+ id,
2349
+ provider: this.name,
2350
+ createdAt: /* @__PURE__ */ new Date(),
2351
+ getStatus: () => sessionStatus,
2352
+ sendAudio: async (chunk) => {
2353
+ validateSessionForAudio(sessionStatus, ws.readyState, import_ws.default.OPEN);
2354
+ ws.send(chunk.data);
2355
+ if (chunk.isLast) {
2356
+ ws.send(
2357
+ JSON.stringify({
2358
+ type: "stop_recording"
2359
+ })
2360
+ );
2361
+ }
2362
+ },
2363
+ close: async () => {
2364
+ if (sessionStatus === "closed" || sessionStatus === "closing") {
2365
+ return;
2366
+ }
2367
+ sessionStatus = "closing";
2368
+ if (ws.readyState === import_ws.default.OPEN) {
2369
+ ws.send(
2370
+ JSON.stringify({
2371
+ type: "stop_recording"
2372
+ })
2373
+ );
2374
+ }
2375
+ await closeWebSocket(ws);
2376
+ sessionStatus = "closed";
2377
+ }
2378
+ };
2379
+ }
2380
+ /**
2381
+ * Build streaming request with full type safety from OpenAPI specs
2382
+ *
2383
+ * Maps normalized options to Gladia streaming request format,
2384
+ * including all advanced features like pre-processing, real-time
2385
+ * processing, post-processing, and message configuration.
2386
+ */
2387
+ buildStreamingRequest(options) {
2388
+ const gladiaOpts = options?.gladiaStreaming || {};
2162
2389
  let validatedSampleRate;
2163
2390
  if (options?.sampleRate) {
2164
2391
  validatedSampleRate = validateEnumValue(
@@ -2168,112 +2395,376 @@ var GladiaAdapter = class extends BaseAdapter {
2168
2395
  "Gladia"
2169
2396
  );
2170
2397
  }
2398
+ let validatedBitDepth;
2399
+ if (options?.bitDepth) {
2400
+ validatedBitDepth = validateEnumValue(
2401
+ options.bitDepth,
2402
+ StreamingSupportedBitDepthEnum,
2403
+ "bit depth",
2404
+ "Gladia"
2405
+ );
2406
+ }
2171
2407
  const streamingRequest = {
2408
+ // Spread any direct Gladia streaming options first
2409
+ ...gladiaOpts,
2410
+ // Audio format configuration (these are excluded from gladiaStreaming to avoid conflicts)
2172
2411
  encoding: options?.encoding ? mapEncodingToProvider(options.encoding, "gladia") : void 0,
2173
2412
  sample_rate: validatedSampleRate,
2413
+ bit_depth: validatedBitDepth,
2174
2414
  channels: options?.channels,
2175
- endpointing: options?.endpointing,
2176
- model: options?.model
2415
+ // Model and processing
2416
+ model: options?.model ?? gladiaOpts.model,
2417
+ endpointing: options?.endpointing ?? gladiaOpts.endpointing,
2418
+ maximum_duration_without_endpointing: options?.maxSilence ?? gladiaOpts.maximum_duration_without_endpointing
2177
2419
  };
2178
- if (options?.language) {
2420
+ if (options?.language || options?.codeSwitching || gladiaOpts.language_config) {
2179
2421
  streamingRequest.language_config = {
2180
- languages: [options.language]
2422
+ ...gladiaOpts.language_config,
2423
+ languages: options?.language ? [options.language] : gladiaOpts.language_config?.languages,
2424
+ code_switching: options?.codeSwitching ?? gladiaOpts.language_config?.code_switching
2181
2425
  };
2182
2426
  }
2183
- const initResponse = await streamingControllerInitStreamingSessionV2(
2184
- streamingRequest,
2185
- void 0,
2186
- // no params
2187
- this.getAxiosConfig()
2188
- );
2189
- const { id, url: wsUrl } = initResponse.data;
2190
- const ws = new import_ws.default(wsUrl);
2191
- let sessionStatus = "connecting";
2192
- setupWebSocketHandlers(ws, callbacks, (status) => {
2193
- sessionStatus = status;
2194
- });
2195
- ws.on("message", (data) => {
2196
- try {
2197
- const message = JSON.parse(data.toString());
2198
- if (message.type === "transcript") {
2199
- const transcriptMessage = message;
2200
- const messageData = transcriptMessage.data;
2201
- const utterance = messageData.utterance;
2202
- callbacks?.onTranscript?.({
2203
- type: "transcript",
2204
- text: utterance.text,
2205
- isFinal: messageData.is_final,
2206
- confidence: utterance.confidence,
2207
- words: utterance.words.map((word) => ({
2208
- text: word.word,
2209
- start: word.start,
2210
- end: word.end,
2211
- confidence: word.confidence
2212
- })),
2213
- data: message
2427
+ if (gladiaOpts.pre_processing) {
2428
+ streamingRequest.pre_processing = gladiaOpts.pre_processing;
2429
+ }
2430
+ const realtimeProcessing = gladiaOpts.realtime_processing || {};
2431
+ const hasRealtimeOptions = options?.customVocabulary || options?.sentimentAnalysis || options?.entityDetection || realtimeProcessing.translation || realtimeProcessing.custom_vocabulary || realtimeProcessing.custom_spelling || realtimeProcessing.named_entity_recognition || realtimeProcessing.sentiment_analysis;
2432
+ if (hasRealtimeOptions) {
2433
+ streamingRequest.realtime_processing = {
2434
+ ...realtimeProcessing,
2435
+ // Custom vocabulary
2436
+ custom_vocabulary: options?.customVocabulary && options.customVocabulary.length > 0 || realtimeProcessing.custom_vocabulary,
2437
+ custom_vocabulary_config: options?.customVocabulary && options.customVocabulary.length > 0 ? {
2438
+ ...realtimeProcessing.custom_vocabulary_config,
2439
+ vocabulary: options.customVocabulary
2440
+ } : realtimeProcessing.custom_vocabulary_config,
2441
+ // Sentiment analysis
2442
+ sentiment_analysis: options?.sentimentAnalysis ?? realtimeProcessing.sentiment_analysis,
2443
+ // Named entity recognition
2444
+ named_entity_recognition: options?.entityDetection ?? realtimeProcessing.named_entity_recognition
2445
+ };
2446
+ }
2447
+ const postProcessing = gladiaOpts.post_processing || {};
2448
+ if (options?.summarization || postProcessing.summarization || postProcessing.chapterization) {
2449
+ streamingRequest.post_processing = {
2450
+ ...postProcessing,
2451
+ summarization: options?.summarization ?? postProcessing.summarization
2452
+ };
2453
+ }
2454
+ if (gladiaOpts.messages_config) {
2455
+ streamingRequest.messages_config = gladiaOpts.messages_config;
2456
+ } else if (options?.interimResults !== void 0) {
2457
+ streamingRequest.messages_config = {
2458
+ receive_partial_transcripts: options.interimResults,
2459
+ receive_final_transcripts: true
2460
+ };
2461
+ }
2462
+ if (gladiaOpts.callback || gladiaOpts.callback_config) {
2463
+ streamingRequest.callback = gladiaOpts.callback;
2464
+ streamingRequest.callback_config = gladiaOpts.callback_config;
2465
+ }
2466
+ if (gladiaOpts.custom_metadata) {
2467
+ streamingRequest.custom_metadata = gladiaOpts.custom_metadata;
2468
+ }
2469
+ return streamingRequest;
2470
+ }
2471
+ /**
2472
+ * Handle all WebSocket message types from Gladia streaming
2473
+ *
2474
+ * Processes transcript, utterance, speech events, real-time processing
2475
+ * results (translation, sentiment, NER), post-processing results
2476
+ * (summarization, chapterization), acknowledgments, and lifecycle events.
2477
+ */
2478
+ handleWebSocketMessage(message, callbacks) {
2479
+ const msg = message;
2480
+ const messageType = msg.type;
2481
+ switch (messageType) {
2482
+ // ─────────────────────────────────────────────────────────────────
2483
+ // Transcript events
2484
+ // ─────────────────────────────────────────────────────────────────
2485
+ case "transcript": {
2486
+ const transcriptMessage = message;
2487
+ const messageData = transcriptMessage.data;
2488
+ const utterance = messageData.utterance;
2489
+ callbacks?.onTranscript?.({
2490
+ type: "transcript",
2491
+ text: utterance.text,
2492
+ isFinal: messageData.is_final,
2493
+ confidence: utterance.confidence,
2494
+ language: utterance.language,
2495
+ channel: utterance.channel,
2496
+ speaker: utterance.speaker?.toString(),
2497
+ words: utterance.words.map((w) => ({
2498
+ word: w.word,
2499
+ start: w.start,
2500
+ end: w.end,
2501
+ confidence: w.confidence
2502
+ })),
2503
+ data: message
2504
+ });
2505
+ break;
2506
+ }
2507
+ case "utterance": {
2508
+ const transcriptMessage = message;
2509
+ const messageData = transcriptMessage.data;
2510
+ const utterance = messageData.utterance;
2511
+ callbacks?.onUtterance?.({
2512
+ text: utterance.text,
2513
+ start: utterance.start,
2514
+ end: utterance.end,
2515
+ speaker: utterance.speaker?.toString(),
2516
+ confidence: utterance.confidence,
2517
+ words: utterance.words.map((w) => ({
2518
+ word: w.word,
2519
+ start: w.start,
2520
+ end: w.end,
2521
+ confidence: w.confidence
2522
+ }))
2523
+ });
2524
+ break;
2525
+ }
2526
+ // Post-processing transcripts (final accumulated transcript)
2527
+ case "post_transcript": {
2528
+ const postTranscript = message;
2529
+ callbacks?.onTranscript?.({
2530
+ type: "transcript",
2531
+ text: postTranscript.data?.full_transcript || "",
2532
+ isFinal: true,
2533
+ data: message
2534
+ });
2535
+ break;
2536
+ }
2537
+ case "post_final_transcript": {
2538
+ const postFinal = message;
2539
+ callbacks?.onTranscript?.({
2540
+ type: "transcript",
2541
+ text: postFinal.data?.transcription?.full_transcript || "",
2542
+ isFinal: true,
2543
+ data: message
2544
+ });
2545
+ break;
2546
+ }
2547
+ // ─────────────────────────────────────────────────────────────────
2548
+ // Speech detection events
2549
+ // ─────────────────────────────────────────────────────────────────
2550
+ case "speech_start": {
2551
+ const speechStart = message;
2552
+ const event = {
2553
+ type: "speech_start",
2554
+ timestamp: speechStart.data.time,
2555
+ channel: speechStart.data.channel,
2556
+ sessionId: speechStart.session_id
2557
+ };
2558
+ callbacks?.onSpeechStart?.(event);
2559
+ break;
2560
+ }
2561
+ case "speech_end": {
2562
+ const speechEnd = message;
2563
+ const event = {
2564
+ type: "speech_end",
2565
+ timestamp: speechEnd.data.time,
2566
+ channel: speechEnd.data.channel,
2567
+ sessionId: speechEnd.session_id
2568
+ };
2569
+ callbacks?.onSpeechEnd?.(event);
2570
+ break;
2571
+ }
2572
+ // ─────────────────────────────────────────────────────────────────
2573
+ // Real-time processing events
2574
+ // ─────────────────────────────────────────────────────────────────
2575
+ case "translation": {
2576
+ const translationMsg = message;
2577
+ if (translationMsg.error) {
2578
+ callbacks?.onError?.({
2579
+ code: ERROR_CODES.TRANSCRIPTION_ERROR,
2580
+ message: "Translation failed",
2581
+ details: translationMsg.error
2582
+ });
2583
+ } else if (translationMsg.data) {
2584
+ const event = {
2585
+ utteranceId: translationMsg.data.utterance_id,
2586
+ original: translationMsg.data.utterance.text,
2587
+ targetLanguage: translationMsg.data.target_language,
2588
+ translatedText: translationMsg.data.translated_utterance.text,
2589
+ isFinal: true
2590
+ };
2591
+ callbacks?.onTranslation?.(event);
2592
+ }
2593
+ break;
2594
+ }
2595
+ case "sentiment_analysis": {
2596
+ const sentimentMsg = message;
2597
+ if (sentimentMsg.error) {
2598
+ callbacks?.onError?.({
2599
+ code: ERROR_CODES.TRANSCRIPTION_ERROR,
2600
+ message: "Sentiment analysis failed",
2601
+ details: sentimentMsg.error
2602
+ });
2603
+ } else if (sentimentMsg.data) {
2604
+ for (const result of sentimentMsg.data.results) {
2605
+ const event = {
2606
+ utteranceId: sentimentMsg.data.utterance_id,
2607
+ sentiment: result.sentiment,
2608
+ confidence: void 0
2609
+ // Gladia doesn't provide confidence for sentiment
2610
+ };
2611
+ callbacks?.onSentiment?.(event);
2612
+ }
2613
+ }
2614
+ break;
2615
+ }
2616
+ case "named_entity_recognition": {
2617
+ const nerMsg = message;
2618
+ if (nerMsg.error) {
2619
+ callbacks?.onError?.({
2620
+ code: ERROR_CODES.TRANSCRIPTION_ERROR,
2621
+ message: "Named entity recognition failed",
2622
+ details: nerMsg.error
2214
2623
  });
2215
- } else if (message.type === "utterance") {
2216
- const transcriptMessage = message;
2217
- const messageData = transcriptMessage.data;
2218
- const utterance = messageData.utterance;
2219
- const utteranceData = {
2220
- text: utterance.text,
2221
- start: utterance.start,
2222
- end: utterance.end,
2223
- speaker: utterance.speaker?.toString(),
2224
- confidence: utterance.confidence,
2225
- words: utterance.words.map((word) => ({
2226
- text: word.word,
2227
- start: word.start,
2228
- end: word.end,
2229
- confidence: word.confidence
2230
- }))
2231
- };
2232
- callbacks?.onUtterance?.(utteranceData);
2233
- } else if (message.type === "metadata") {
2234
- callbacks?.onMetadata?.(message);
2624
+ } else if (nerMsg.data) {
2625
+ for (const entity of nerMsg.data.results) {
2626
+ const event = {
2627
+ utteranceId: nerMsg.data.utterance_id,
2628
+ text: entity.text,
2629
+ type: entity.entity_type,
2630
+ start: entity.start,
2631
+ end: entity.end
2632
+ };
2633
+ callbacks?.onEntity?.(event);
2634
+ }
2235
2635
  }
2236
- } catch (error) {
2237
- callbacks?.onError?.({
2238
- code: ERROR_CODES.PARSE_ERROR,
2239
- message: "Failed to parse WebSocket message",
2240
- details: error
2241
- });
2636
+ break;
2242
2637
  }
2243
- });
2244
- await waitForWebSocketOpen(ws);
2245
- return {
2246
- id,
2247
- provider: this.name,
2248
- createdAt: /* @__PURE__ */ new Date(),
2249
- getStatus: () => sessionStatus,
2250
- sendAudio: async (chunk) => {
2251
- validateSessionForAudio(sessionStatus, ws.readyState, import_ws.default.OPEN);
2252
- ws.send(chunk.data);
2253
- if (chunk.isLast) {
2254
- ws.send(
2255
- JSON.stringify({
2256
- type: "stop_recording"
2257
- })
2258
- );
2638
+ // ─────────────────────────────────────────────────────────────────
2639
+ // Post-processing events
2640
+ // ─────────────────────────────────────────────────────────────────
2641
+ case "post_summarization": {
2642
+ const summaryMsg = message;
2643
+ if (summaryMsg.error) {
2644
+ callbacks?.onSummarization?.({
2645
+ summary: "",
2646
+ error: typeof summaryMsg.error === "string" ? summaryMsg.error : "Summarization failed"
2647
+ });
2648
+ } else if (summaryMsg.data) {
2649
+ callbacks?.onSummarization?.({
2650
+ summary: summaryMsg.data.results
2651
+ });
2259
2652
  }
2260
- },
2261
- close: async () => {
2262
- if (sessionStatus === "closed" || sessionStatus === "closing") {
2263
- return;
2653
+ break;
2654
+ }
2655
+ case "post_chapterization": {
2656
+ const chapterMsg = message;
2657
+ if (chapterMsg.error) {
2658
+ callbacks?.onChapterization?.({
2659
+ chapters: [],
2660
+ error: typeof chapterMsg.error === "string" ? chapterMsg.error : "Chapterization failed"
2661
+ });
2662
+ } else if (chapterMsg.data) {
2663
+ callbacks?.onChapterization?.({
2664
+ chapters: chapterMsg.data.results.map((ch) => ({
2665
+ headline: ch.headline,
2666
+ summary: ch.summary || ch.abstractive_summary || ch.extractive_summary || "",
2667
+ start: ch.start,
2668
+ end: ch.end
2669
+ }))
2670
+ });
2264
2671
  }
2265
- sessionStatus = "closing";
2266
- if (ws.readyState === import_ws.default.OPEN) {
2267
- ws.send(
2268
- JSON.stringify({
2269
- type: "stop_recording"
2270
- })
2271
- );
2672
+ break;
2673
+ }
2674
+ // ─────────────────────────────────────────────────────────────────
2675
+ // Acknowledgment events
2676
+ // ─────────────────────────────────────────────────────────────────
2677
+ case "audio_chunk_ack": {
2678
+ const ackMsg = message;
2679
+ if (ackMsg.error) {
2680
+ callbacks?.onError?.({
2681
+ code: ERROR_CODES.TRANSCRIPTION_ERROR,
2682
+ message: "Audio chunk not acknowledged",
2683
+ details: ackMsg.error
2684
+ });
2685
+ } else if (ackMsg.data) {
2686
+ const event = {
2687
+ byteRange: ackMsg.data.byte_range,
2688
+ timeRange: ackMsg.data.time_range,
2689
+ timestamp: ackMsg.created_at
2690
+ };
2691
+ callbacks?.onAudioAck?.(event);
2272
2692
  }
2273
- await closeWebSocket(ws);
2274
- sessionStatus = "closed";
2693
+ break;
2275
2694
  }
2276
- };
2695
+ case "stop_recording_ack": {
2696
+ const stopAck = message;
2697
+ if (stopAck.error) {
2698
+ callbacks?.onError?.({
2699
+ code: ERROR_CODES.TRANSCRIPTION_ERROR,
2700
+ message: "Stop recording not acknowledged",
2701
+ details: stopAck.error
2702
+ });
2703
+ }
2704
+ break;
2705
+ }
2706
+ // ─────────────────────────────────────────────────────────────────
2707
+ // Lifecycle events
2708
+ // ─────────────────────────────────────────────────────────────────
2709
+ case "start_session": {
2710
+ const startSession = message;
2711
+ const event = {
2712
+ eventType: "start_session",
2713
+ timestamp: startSession.created_at,
2714
+ sessionId: startSession.session_id
2715
+ };
2716
+ callbacks?.onLifecycle?.(event);
2717
+ break;
2718
+ }
2719
+ case "start_recording": {
2720
+ const startRecording = message;
2721
+ const event = {
2722
+ eventType: "start_recording",
2723
+ timestamp: startRecording.created_at,
2724
+ sessionId: startRecording.session_id
2725
+ };
2726
+ callbacks?.onLifecycle?.(event);
2727
+ break;
2728
+ }
2729
+ case "end_recording": {
2730
+ const endRecording = message;
2731
+ const event = {
2732
+ eventType: "end_recording",
2733
+ timestamp: endRecording.created_at,
2734
+ sessionId: endRecording.session_id
2735
+ };
2736
+ callbacks?.onLifecycle?.(event);
2737
+ break;
2738
+ }
2739
+ case "end_session": {
2740
+ const endSession = message;
2741
+ const event = {
2742
+ eventType: "end_session",
2743
+ timestamp: endSession.created_at,
2744
+ sessionId: endSession.session_id
2745
+ };
2746
+ callbacks?.onLifecycle?.(event);
2747
+ break;
2748
+ }
2749
+ // ─────────────────────────────────────────────────────────────────
2750
+ // Metadata and other events
2751
+ // ─────────────────────────────────────────────────────────────────
2752
+ case "metadata":
2753
+ callbacks?.onMetadata?.(msg);
2754
+ break;
2755
+ case "error": {
2756
+ const errorMsg = msg;
2757
+ callbacks?.onError?.({
2758
+ code: errorMsg.error?.code || ERROR_CODES.TRANSCRIPTION_ERROR,
2759
+ message: errorMsg.error?.message || "Unknown streaming error",
2760
+ details: msg
2761
+ });
2762
+ break;
2763
+ }
2764
+ default:
2765
+ callbacks?.onMetadata?.(msg);
2766
+ break;
2767
+ }
2277
2768
  }
2278
2769
  };
2279
2770
  function createGladiaAdapter(config) {
@@ -2931,14 +3422,14 @@ var AssemblyAIAdapter = class extends BaseAdapter {
2931
3422
  if (!transcript.words || transcript.words.length === 0) {
2932
3423
  return void 0;
2933
3424
  }
2934
- return transcript.words.map((word) => ({
2935
- text: word.text,
2936
- start: word.start / 1e3,
3425
+ return transcript.words.map((w) => ({
3426
+ word: w.text,
3427
+ start: w.start / 1e3,
2937
3428
  // Convert ms to seconds
2938
- end: word.end / 1e3,
3429
+ end: w.end / 1e3,
2939
3430
  // Convert ms to seconds
2940
- confidence: word.confidence,
2941
- speaker: word.speaker || void 0
3431
+ confidence: w.confidence,
3432
+ speaker: w.speaker || void 0
2942
3433
  }));
2943
3434
  }
2944
3435
  /**
@@ -2956,11 +3447,11 @@ var AssemblyAIAdapter = class extends BaseAdapter {
2956
3447
  // Convert ms to seconds
2957
3448
  speaker: utterance.speaker || void 0,
2958
3449
  confidence: utterance.confidence,
2959
- words: utterance.words.map((word) => ({
2960
- text: word.text,
2961
- start: word.start / 1e3,
2962
- end: word.end / 1e3,
2963
- confidence: word.confidence
3450
+ words: utterance.words.map((w) => ({
3451
+ word: w.text,
3452
+ start: w.start / 1e3,
3453
+ end: w.end / 1e3,
3454
+ confidence: w.confidence
2964
3455
  }))
2965
3456
  }));
2966
3457
  }
@@ -2968,19 +3459,37 @@ var AssemblyAIAdapter = class extends BaseAdapter {
2968
3459
  * Stream audio for real-time transcription
2969
3460
  *
2970
3461
  * Creates a WebSocket connection to AssemblyAI for streaming transcription.
2971
- * First obtains a temporary token, then connects and streams audio chunks.
3462
+ * Uses the v3 Universal Streaming API with full support for all parameters.
3463
+ *
3464
+ * Supports all AssemblyAI streaming features:
3465
+ * - Real-time transcription with interim/final results (Turn events)
3466
+ * - End-of-turn detection tuning (confidence threshold, silence duration)
3467
+ * - Voice Activity Detection (VAD) threshold tuning
3468
+ * - Real-time text formatting
3469
+ * - Profanity filtering
3470
+ * - Custom vocabulary (keyterms)
3471
+ * - Language detection
3472
+ * - Model selection (English or Multilingual)
3473
+ * - Dynamic configuration updates mid-stream
3474
+ * - Force endpoint command
2972
3475
  *
2973
3476
  * @param options - Streaming configuration options
3477
+ * @param options.sampleRate - Sample rate (8000, 16000, 22050, 44100, 48000)
3478
+ * @param options.encoding - Audio encoding (pcm_s16le, pcm_mulaw)
3479
+ * @param options.assemblyaiStreaming - All AssemblyAI-specific streaming options
2974
3480
  * @param callbacks - Event callbacks for transcription results
2975
- * @returns Promise that resolves with a StreamingSession
3481
+ * @param callbacks.onTranscript - Interim/final transcript received (Turn event)
3482
+ * @param callbacks.onUtterance - Complete utterance (Turn with end_of_turn=true)
3483
+ * @param callbacks.onMetadata - Session metadata (Begin, Termination events)
3484
+ * @param callbacks.onError - Error occurred
3485
+ * @param callbacks.onClose - Connection closed
3486
+ * @returns Promise that resolves with an extended StreamingSession
2976
3487
  *
2977
- * @example Real-time streaming
3488
+ * @example Basic real-time streaming
2978
3489
  * ```typescript
2979
3490
  * const session = await adapter.transcribeStream({
2980
- * encoding: 'pcm_s16le',
2981
3491
  * sampleRate: 16000,
2982
- * language: 'en',
2983
- * interimResults: true
3492
+ * encoding: 'pcm_s16le'
2984
3493
  * }, {
2985
3494
  * onOpen: () => console.log('Connected'),
2986
3495
  * onTranscript: (event) => {
@@ -2995,21 +3504,50 @@ var AssemblyAIAdapter = class extends BaseAdapter {
2995
3504
  * });
2996
3505
  *
2997
3506
  * // Send audio chunks
2998
- * const audioChunk = getAudioChunk(); // Your audio source
3507
+ * const audioChunk = getAudioChunk();
2999
3508
  * await session.sendAudio({ data: audioChunk });
3000
3509
  *
3001
3510
  * // Close when done
3002
3511
  * await session.close();
3003
3512
  * ```
3513
+ *
3514
+ * @example Advanced streaming with all features
3515
+ * ```typescript
3516
+ * const session = await adapter.transcribeStream({
3517
+ * sampleRate: 16000,
3518
+ * assemblyaiStreaming: {
3519
+ * speechModel: 'universal-streaming-multilingual',
3520
+ * languageDetection: true,
3521
+ * endOfTurnConfidenceThreshold: 0.7,
3522
+ * minEndOfTurnSilenceWhenConfident: 500,
3523
+ * maxTurnSilence: 15000,
3524
+ * vadThreshold: 0.3,
3525
+ * formatTurns: true,
3526
+ * filterProfanity: true,
3527
+ * keyterms: ['TypeScript', 'JavaScript', 'API'],
3528
+ * inactivityTimeout: 60000
3529
+ * }
3530
+ * }, {
3531
+ * onTranscript: (e) => console.log('Transcript:', e.text),
3532
+ * onMetadata: (m) => console.log('Metadata:', m)
3533
+ * });
3534
+ *
3535
+ * // Update configuration mid-stream
3536
+ * session.updateConfiguration?.({
3537
+ * end_of_turn_confidence_threshold: 0.5,
3538
+ * vad_threshold: 0.2
3539
+ * });
3540
+ *
3541
+ * // Force endpoint detection
3542
+ * session.forceEndpoint?.();
3543
+ * ```
3004
3544
  */
3005
3545
  async transcribeStream(options, callbacks) {
3006
3546
  this.validateConfig();
3007
3547
  if (!this.config?.apiKey) {
3008
3548
  throw new Error("API key is required for streaming");
3009
3549
  }
3010
- const sampleRate = options?.sampleRate || 16e3;
3011
- const encoding = options?.encoding ? mapEncodingToProvider(options.encoding, "assemblyai") : "pcm_s16le";
3012
- const wsUrl = `${this.wsBaseUrl}?sample_rate=${sampleRate}&encoding=${encoding}`;
3550
+ const wsUrl = this.buildStreamingUrl(options);
3013
3551
  const ws = new import_ws2.default(wsUrl, {
3014
3552
  headers: {
3015
3553
  Authorization: this.config.apiKey
@@ -3033,43 +3571,7 @@ var AssemblyAIAdapter = class extends BaseAdapter {
3033
3571
  ws.on("message", (data) => {
3034
3572
  try {
3035
3573
  const message = JSON.parse(data.toString());
3036
- if ("error" in message) {
3037
- callbacks?.onError?.({
3038
- code: "API_ERROR",
3039
- message: message.error
3040
- });
3041
- return;
3042
- }
3043
- if (message.type === "Begin") {
3044
- const beginMsg = message;
3045
- callbacks?.onMetadata?.({
3046
- sessionId: beginMsg.id,
3047
- expiresAt: new Date(beginMsg.expires_at).toISOString()
3048
- });
3049
- } else if (message.type === "Turn") {
3050
- const turnMsg = message;
3051
- callbacks?.onTranscript?.({
3052
- type: "transcript",
3053
- text: turnMsg.transcript,
3054
- isFinal: turnMsg.end_of_turn,
3055
- confidence: turnMsg.end_of_turn_confidence,
3056
- words: turnMsg.words.map((word) => ({
3057
- text: word.text,
3058
- start: word.start / 1e3,
3059
- // Convert ms to seconds
3060
- end: word.end / 1e3,
3061
- confidence: word.confidence
3062
- })),
3063
- data: turnMsg
3064
- });
3065
- } else if (message.type === "Termination") {
3066
- const termMsg = message;
3067
- callbacks?.onMetadata?.({
3068
- terminated: true,
3069
- audioDurationSeconds: termMsg.audio_duration_seconds,
3070
- sessionDurationSeconds: termMsg.session_duration_seconds
3071
- });
3072
- }
3574
+ this.handleWebSocketMessage(message, callbacks);
3073
3575
  } catch (error) {
3074
3576
  callbacks?.onError?.({
3075
3577
  code: "PARSE_ERROR",
@@ -3121,11 +3623,7 @@ var AssemblyAIAdapter = class extends BaseAdapter {
3121
3623
  }
3122
3624
  if (chunk.isLast) {
3123
3625
  flushAudioBuffer();
3124
- ws.send(
3125
- JSON.stringify({
3126
- terminate_session: true
3127
- })
3128
- );
3626
+ ws.send(JSON.stringify({ type: "Terminate" }));
3129
3627
  }
3130
3628
  },
3131
3629
  close: async () => {
@@ -3135,11 +3633,7 @@ var AssemblyAIAdapter = class extends BaseAdapter {
3135
3633
  sessionStatus = "closing";
3136
3634
  flushAudioBuffer();
3137
3635
  if (ws.readyState === import_ws2.default.OPEN) {
3138
- ws.send(
3139
- JSON.stringify({
3140
- terminate_session: true
3141
- })
3142
- );
3636
+ ws.send(JSON.stringify({ type: "Terminate" }));
3143
3637
  }
3144
3638
  return new Promise((resolve) => {
3145
3639
  const timeout = setTimeout(() => {
@@ -3153,9 +3647,166 @@ var AssemblyAIAdapter = class extends BaseAdapter {
3153
3647
  resolve();
3154
3648
  });
3155
3649
  });
3650
+ },
3651
+ /**
3652
+ * Update streaming configuration mid-session
3653
+ *
3654
+ * Allows changing VAD, end-of-turn, and formatting settings
3655
+ * without restarting the stream.
3656
+ *
3657
+ * @param config - Configuration parameters to update
3658
+ */
3659
+ updateConfiguration: (config) => {
3660
+ if (ws.readyState !== import_ws2.default.OPEN) {
3661
+ throw new Error("Cannot update configuration: WebSocket is not open");
3662
+ }
3663
+ const updateMsg = {
3664
+ type: "UpdateConfiguration",
3665
+ ...config
3666
+ };
3667
+ ws.send(JSON.stringify(updateMsg));
3668
+ },
3669
+ /**
3670
+ * Force endpoint detection
3671
+ *
3672
+ * Immediately triggers end-of-turn, useful for manual control
3673
+ * of turn boundaries (e.g., when user presses a button).
3674
+ */
3675
+ forceEndpoint: () => {
3676
+ if (ws.readyState !== import_ws2.default.OPEN) {
3677
+ throw new Error("Cannot force endpoint: WebSocket is not open");
3678
+ }
3679
+ const forceMsg = {
3680
+ type: "ForceEndpoint"
3681
+ };
3682
+ ws.send(JSON.stringify(forceMsg));
3156
3683
  }
3157
3684
  };
3158
3685
  }
3686
+ /**
3687
+ * Build WebSocket URL with all streaming parameters
3688
+ */
3689
+ buildStreamingUrl(options) {
3690
+ const params = new URLSearchParams();
3691
+ const aaiOpts = options?.assemblyaiStreaming || {};
3692
+ const sampleRate = options?.sampleRate || aaiOpts.sampleRate || 16e3;
3693
+ params.append("sample_rate", String(sampleRate));
3694
+ const encoding = options?.encoding ? mapEncodingToProvider(options.encoding, "assemblyai") : aaiOpts.encoding || "pcm_s16le";
3695
+ params.append("encoding", encoding);
3696
+ if (aaiOpts.speechModel) {
3697
+ params.append("speech_model", aaiOpts.speechModel);
3698
+ }
3699
+ if (aaiOpts.languageDetection) {
3700
+ params.append("language_detection", "true");
3701
+ }
3702
+ if (aaiOpts.endOfTurnConfidenceThreshold !== void 0) {
3703
+ params.append(
3704
+ "end_of_turn_confidence_threshold",
3705
+ String(aaiOpts.endOfTurnConfidenceThreshold)
3706
+ );
3707
+ }
3708
+ if (aaiOpts.minEndOfTurnSilenceWhenConfident !== void 0) {
3709
+ params.append(
3710
+ "min_end_of_turn_silence_when_confident",
3711
+ String(aaiOpts.minEndOfTurnSilenceWhenConfident)
3712
+ );
3713
+ }
3714
+ if (aaiOpts.maxTurnSilence !== void 0) {
3715
+ params.append("max_turn_silence", String(aaiOpts.maxTurnSilence));
3716
+ }
3717
+ if (aaiOpts.vadThreshold !== void 0) {
3718
+ params.append("vad_threshold", String(aaiOpts.vadThreshold));
3719
+ }
3720
+ if (aaiOpts.formatTurns !== void 0) {
3721
+ params.append("format_turns", String(aaiOpts.formatTurns));
3722
+ }
3723
+ if (aaiOpts.filterProfanity) {
3724
+ params.append("filter_profanity", "true");
3725
+ }
3726
+ const keyterms = options?.customVocabulary || aaiOpts.keyterms;
3727
+ if (keyterms && keyterms.length > 0) {
3728
+ keyterms.forEach((term) => params.append("keyterms", term));
3729
+ }
3730
+ if (aaiOpts.keytermsPrompt && aaiOpts.keytermsPrompt.length > 0) {
3731
+ aaiOpts.keytermsPrompt.forEach((prompt) => params.append("keyterms_prompt", prompt));
3732
+ }
3733
+ if (aaiOpts.inactivityTimeout !== void 0) {
3734
+ params.append("inactivity_timeout", String(aaiOpts.inactivityTimeout));
3735
+ }
3736
+ return `${this.wsBaseUrl}?${params.toString()}`;
3737
+ }
3738
+ /**
3739
+ * Handle all WebSocket message types from AssemblyAI streaming
3740
+ */
3741
+ handleWebSocketMessage(message, callbacks) {
3742
+ if ("error" in message) {
3743
+ callbacks?.onError?.({
3744
+ code: "API_ERROR",
3745
+ message: message.error
3746
+ });
3747
+ return;
3748
+ }
3749
+ const typedMessage = message;
3750
+ switch (typedMessage.type) {
3751
+ case "Begin": {
3752
+ const beginMsg = typedMessage;
3753
+ callbacks?.onMetadata?.({
3754
+ type: "begin",
3755
+ sessionId: beginMsg.id,
3756
+ expiresAt: new Date(beginMsg.expires_at).toISOString()
3757
+ });
3758
+ break;
3759
+ }
3760
+ case "Turn": {
3761
+ const turnMsg = typedMessage;
3762
+ callbacks?.onTranscript?.({
3763
+ type: "transcript",
3764
+ text: turnMsg.transcript,
3765
+ isFinal: turnMsg.end_of_turn,
3766
+ confidence: turnMsg.end_of_turn_confidence,
3767
+ language: turnMsg.language_code,
3768
+ words: turnMsg.words.map((w) => ({
3769
+ word: w.text,
3770
+ start: w.start / 1e3,
3771
+ // Convert ms to seconds
3772
+ end: w.end / 1e3,
3773
+ confidence: w.confidence
3774
+ })),
3775
+ data: turnMsg
3776
+ });
3777
+ if (turnMsg.end_of_turn) {
3778
+ const words = turnMsg.words;
3779
+ const start = words.length > 0 ? words[0].start / 1e3 : 0;
3780
+ const end = words.length > 0 ? words[words.length - 1].end / 1e3 : 0;
3781
+ callbacks?.onUtterance?.({
3782
+ text: turnMsg.transcript,
3783
+ start,
3784
+ end,
3785
+ confidence: turnMsg.end_of_turn_confidence,
3786
+ words: turnMsg.words.map((w) => ({
3787
+ word: w.text,
3788
+ start: w.start / 1e3,
3789
+ end: w.end / 1e3,
3790
+ confidence: w.confidence
3791
+ }))
3792
+ });
3793
+ }
3794
+ break;
3795
+ }
3796
+ case "Termination": {
3797
+ const termMsg = typedMessage;
3798
+ callbacks?.onMetadata?.({
3799
+ type: "termination",
3800
+ audioDurationSeconds: termMsg.audio_duration_seconds,
3801
+ sessionDurationSeconds: termMsg.session_duration_seconds
3802
+ });
3803
+ break;
3804
+ }
3805
+ default:
3806
+ callbacks?.onMetadata?.(message);
3807
+ break;
3808
+ }
3809
+ }
3159
3810
  };
3160
3811
  function createAssemblyAIAdapter(config) {
3161
3812
  const adapter = new AssemblyAIAdapter();
@@ -3417,11 +4068,11 @@ var DeepgramAdapter = class extends BaseAdapter {
3417
4068
  return void 0;
3418
4069
  }
3419
4070
  return alternative.words.map(
3420
- (word) => ({
3421
- text: word.word || "",
3422
- start: word.start || 0,
3423
- end: word.end || 0,
3424
- confidence: word.confidence,
4071
+ (w) => ({
4072
+ word: w.word || "",
4073
+ start: w.start || 0,
4074
+ end: w.end || 0,
4075
+ confidence: w.confidence,
3425
4076
  speaker: void 0
3426
4077
  // Speaker info is at utterance level, not word level
3427
4078
  })
@@ -3441,11 +4092,11 @@ var DeepgramAdapter = class extends BaseAdapter {
3441
4092
  end: utterance.end || 0,
3442
4093
  speaker: utterance.speaker?.toString(),
3443
4094
  confidence: utterance.confidence,
3444
- words: utterance.words?.map((word) => ({
3445
- text: word.word || "",
3446
- start: word.start || 0,
3447
- end: word.end || 0,
3448
- confidence: word.confidence
4095
+ words: utterance.words?.map((w) => ({
4096
+ word: w.word || "",
4097
+ start: w.start || 0,
4098
+ end: w.end || 0,
4099
+ confidence: w.confidence
3449
4100
  }))
3450
4101
  }));
3451
4102
  }
@@ -3464,11 +4115,44 @@ var DeepgramAdapter = class extends BaseAdapter {
3464
4115
  * Creates a WebSocket connection to Deepgram for streaming transcription.
3465
4116
  * Send audio chunks via session.sendAudio() and receive results via callbacks.
3466
4117
  *
4118
+ * Supports all Deepgram streaming features:
4119
+ * - Real-time transcription with interim/final results
4120
+ * - Speech detection events (SpeechStarted, UtteranceEnd)
4121
+ * - Speaker diarization
4122
+ * - Language detection
4123
+ * - Real-time sentiment, entity detection, topics, intents
4124
+ * - Custom vocabulary (keywords, keyterms)
4125
+ * - PII redaction
4126
+ * - Filler words, numerals, measurements, paragraphs
4127
+ * - Profanity filtering
4128
+ * - Dictation mode
4129
+ *
3467
4130
  * @param options - Streaming configuration options
4131
+ * @param options.encoding - Audio encoding (linear16, flac, mulaw, opus, speex, g729)
4132
+ * @param options.sampleRate - Sample rate in Hz
4133
+ * @param options.channels - Number of audio channels
4134
+ * @param options.language - Language code for transcription
4135
+ * @param options.model - Model to use (nova-2, nova-3, base, enhanced, etc.)
4136
+ * @param options.diarization - Enable speaker identification
4137
+ * @param options.languageDetection - Auto-detect language
4138
+ * @param options.interimResults - Enable partial transcripts
4139
+ * @param options.summarization - Enable summarization
4140
+ * @param options.sentimentAnalysis - Enable sentiment analysis
4141
+ * @param options.entityDetection - Enable entity detection
4142
+ * @param options.piiRedaction - Enable PII redaction
4143
+ * @param options.customVocabulary - Keywords to boost recognition
4144
+ * @param options.deepgramStreaming - All Deepgram-specific streaming options
3468
4145
  * @param callbacks - Event callbacks for transcription results
4146
+ * @param callbacks.onTranscript - Interim/final transcript received
4147
+ * @param callbacks.onUtterance - Complete utterance detected
4148
+ * @param callbacks.onSpeechStart - Speech detected (Deepgram SpeechStarted)
4149
+ * @param callbacks.onSpeechEnd - Speech ended (Deepgram UtteranceEnd)
4150
+ * @param callbacks.onMetadata - Metadata received
4151
+ * @param callbacks.onError - Error occurred
4152
+ * @param callbacks.onClose - Connection closed
3469
4153
  * @returns Promise that resolves with a StreamingSession
3470
4154
  *
3471
- * @example Real-time streaming
4155
+ * @example Basic real-time streaming
3472
4156
  * ```typescript
3473
4157
  * const session = await adapter.transcribeStream({
3474
4158
  * encoding: 'linear16',
@@ -3491,32 +4175,47 @@ var DeepgramAdapter = class extends BaseAdapter {
3491
4175
  * });
3492
4176
  *
3493
4177
  * // Send audio chunks
3494
- * const audioChunk = getAudioChunk(); // Your audio source
4178
+ * const audioChunk = getAudioChunk();
3495
4179
  * await session.sendAudio({ data: audioChunk });
3496
4180
  *
3497
4181
  * // Close when done
3498
4182
  * await session.close();
3499
4183
  * ```
4184
+ *
4185
+ * @example Advanced streaming with all features
4186
+ * ```typescript
4187
+ * const session = await adapter.transcribeStream({
4188
+ * encoding: 'linear16',
4189
+ * sampleRate: 16000,
4190
+ * language: 'en',
4191
+ * model: 'nova-3',
4192
+ * diarization: true,
4193
+ * sentimentAnalysis: true,
4194
+ * entityDetection: true,
4195
+ * deepgramStreaming: {
4196
+ * fillerWords: true,
4197
+ * numerals: true,
4198
+ * profanityFilter: true,
4199
+ * topics: true,
4200
+ * intents: true,
4201
+ * customTopic: ['sales', 'support'],
4202
+ * customIntent: ['purchase', 'complaint'],
4203
+ * keyterm: ['TypeScript', 'JavaScript'],
4204
+ * utteranceSplit: 800,
4205
+ * punctuate: true,
4206
+ * smartFormat: true
4207
+ * }
4208
+ * }, {
4209
+ * onTranscript: (e) => console.log('Transcript:', e.text),
4210
+ * onSpeechStart: (e) => console.log('Speech started at:', e.timestamp),
4211
+ * onSpeechEnd: (e) => console.log('Utterance ended'),
4212
+ * onMetadata: (m) => console.log('Metadata:', m)
4213
+ * });
4214
+ * ```
3500
4215
  */
3501
4216
  async transcribeStream(options, callbacks) {
3502
4217
  this.validateConfig();
3503
- const params = new URLSearchParams();
3504
- if (options?.encoding) params.append("encoding", options.encoding);
3505
- if (options?.sampleRate) params.append("sample_rate", options.sampleRate.toString());
3506
- if (options?.channels) params.append("channels", options.channels.toString());
3507
- if (options?.language) params.append("language", options.language);
3508
- if (options?.model) params.append("model", options.model);
3509
- if (options?.languageDetection) params.append("detect_language", "true");
3510
- if (options?.diarization) params.append("diarize", "true");
3511
- if (options?.interimResults) params.append("interim_results", "true");
3512
- if (options?.summarization) params.append("summarize", "true");
3513
- if (options?.sentimentAnalysis) params.append("sentiment", "true");
3514
- if (options?.entityDetection) params.append("detect_entities", "true");
3515
- if (options?.piiRedaction) params.append("redact", "pii");
3516
- if (options?.customVocabulary && options.customVocabulary.length > 0) {
3517
- params.append("keywords", options.customVocabulary.join(","));
3518
- }
3519
- const wsUrl = `${this.wsBaseUrl}?${params.toString()}`;
4218
+ const wsUrl = this.buildStreamingUrl(options);
3520
4219
  const ws = new import_ws3.default(wsUrl, {
3521
4220
  headers: {
3522
4221
  Authorization: `Token ${this.config.apiKey}`
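
The JSDoc examples above stop at an abstract `getAudioChunk()`. As one concrete way to drive the session, here is a minimal sketch that streams a raw PCM file into it, assuming a 16 kHz / 16-bit mono file named `meeting.raw` (hypothetical) and that `session.sendAudio` accepts Node `Buffer` chunks in `data`, as the examples suggest:

```typescript
import { createReadStream } from "node:fs";

// Hypothetical local file; ~100 ms of 16 kHz, 16-bit mono audio per chunk.
const pcm = createReadStream("meeting.raw", { highWaterMark: 3200 });

pcm.on("data", (chunk) => {
  // Forward each chunk to the open streaming session from the example above.
  void session.sendAudio({ data: chunk });
});

pcm.on("end", async () => {
  await session.close();
});
```
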
@@ -3531,31 +4230,7 @@ var DeepgramAdapter = class extends BaseAdapter {
3531
4230
  ws.on("message", (data) => {
3532
4231
  try {
3533
4232
  const message = JSON.parse(data.toString());
3534
- if (message.type === "Results") {
3535
- const channel = message.channel.alternatives[0];
3536
- if (channel) {
3537
- const transcript = channel.transcript;
3538
- const isFinal = message.is_final;
3539
- const words = channel.words?.map((word) => ({
3540
- text: word.word,
3541
- start: word.start,
3542
- end: word.end,
3543
- confidence: word.confidence
3544
- }));
3545
- callbacks?.onTranscript?.({
3546
- type: "transcript",
3547
- text: transcript,
3548
- isFinal,
3549
- words,
3550
- confidence: channel.confidence,
3551
- data: message
3552
- });
3553
- }
3554
- } else if (message.type === "UtteranceEnd") {
3555
- callbacks?.onMetadata?.(message);
3556
- } else if (message.type === "Metadata") {
3557
- callbacks?.onMetadata?.(message);
3558
- }
4233
+ this.handleWebSocketMessage(message, callbacks);
3559
4234
  } catch (error) {
3560
4235
  callbacks?.onError?.({
3561
4236
  code: "PARSE_ERROR",
@@ -3628,6 +4303,210 @@ var DeepgramAdapter = class extends BaseAdapter {
3628
4303
  }
3629
4304
  };
3630
4305
  }
4306
+ /**
4307
+ * Build WebSocket URL with all streaming parameters
4308
+ */
4309
+ buildStreamingUrl(options) {
4310
+ const params = new URLSearchParams();
4311
+ const dgOpts = options?.deepgramStreaming || {};
4312
+ if (options?.encoding || dgOpts.encoding) {
4313
+ params.append("encoding", options?.encoding || dgOpts.encoding);
4314
+ }
4315
+ if (options?.sampleRate || dgOpts.sampleRate) {
4316
+ params.append("sample_rate", String(options?.sampleRate || dgOpts.sampleRate));
4317
+ }
4318
+ if (options?.channels || dgOpts.channels) {
4319
+ params.append("channels", String(options?.channels || dgOpts.channels));
4320
+ }
4321
+ if (options?.language || dgOpts.language) {
4322
+ params.append("language", options?.language || dgOpts.language);
4323
+ }
4324
+ if (options?.model || dgOpts.model) {
4325
+ params.append("model", options?.model || dgOpts.model);
4326
+ }
4327
+ if (dgOpts.version) {
4328
+ params.append("version", dgOpts.version);
4329
+ }
4330
+ if (options?.languageDetection || dgOpts.languageDetection) {
4331
+ params.append("detect_language", "true");
4332
+ }
4333
+ if (options?.diarization || dgOpts.diarization) {
4334
+ params.append("diarize", "true");
4335
+ }
4336
+ if (options?.interimResults || dgOpts.interimResults) {
4337
+ params.append("interim_results", "true");
4338
+ }
4339
+ if (dgOpts.punctuate !== void 0) {
4340
+ params.append("punctuate", String(dgOpts.punctuate));
4341
+ }
4342
+ if (dgOpts.smartFormat !== void 0) {
4343
+ params.append("smart_format", String(dgOpts.smartFormat));
4344
+ }
4345
+ if (dgOpts.fillerWords) {
4346
+ params.append("filler_words", "true");
4347
+ }
4348
+ if (dgOpts.numerals) {
4349
+ params.append("numerals", "true");
4350
+ }
4351
+ if (dgOpts.measurements) {
4352
+ params.append("measurements", "true");
4353
+ }
4354
+ if (dgOpts.paragraphs) {
4355
+ params.append("paragraphs", "true");
4356
+ }
4357
+ if (dgOpts.profanityFilter) {
4358
+ params.append("profanity_filter", "true");
4359
+ }
4360
+ if (dgOpts.dictation) {
4361
+ params.append("dictation", "true");
4362
+ }
4363
+ if (dgOpts.utteranceSplit) {
4364
+ params.append("utt_split", String(dgOpts.utteranceSplit));
4365
+ }
4366
+ if (options?.summarization || dgOpts.summarize) {
4367
+ params.append("summarize", "true");
4368
+ }
4369
+ if (options?.sentimentAnalysis || dgOpts.sentiment) {
4370
+ params.append("sentiment", "true");
4371
+ }
4372
+ if (options?.entityDetection || dgOpts.detectEntities) {
4373
+ params.append("detect_entities", "true");
4374
+ }
4375
+ if (dgOpts.topics) {
4376
+ params.append("topics", "true");
4377
+ }
4378
+ if (dgOpts.customTopic && dgOpts.customTopic.length > 0) {
4379
+ dgOpts.customTopic.forEach((topic) => params.append("custom_topic", topic));
4380
+ }
4381
+ if (dgOpts.customTopicMode) {
4382
+ params.append("custom_topic_mode", dgOpts.customTopicMode);
4383
+ }
4384
+ if (dgOpts.intents) {
4385
+ params.append("intents", "true");
4386
+ }
4387
+ if (dgOpts.customIntent && dgOpts.customIntent.length > 0) {
4388
+ dgOpts.customIntent.forEach((intent) => params.append("custom_intent", intent));
4389
+ }
4390
+ if (dgOpts.customIntentMode) {
4391
+ params.append("custom_intent_mode", dgOpts.customIntentMode);
4392
+ }
4393
+ const keywords = options?.customVocabulary || dgOpts.keywords;
4394
+ if (keywords) {
4395
+ const keywordList = Array.isArray(keywords) ? keywords : [keywords];
4396
+ keywordList.forEach((kw) => params.append("keywords", kw));
4397
+ }
4398
+ if (dgOpts.keyterm && dgOpts.keyterm.length > 0) {
4399
+ dgOpts.keyterm.forEach((term) => params.append("keyterm", term));
4400
+ }
4401
+ if (options?.piiRedaction || dgOpts.redact) {
4402
+ if (Array.isArray(dgOpts.redact)) {
4403
+ dgOpts.redact.forEach((r) => params.append("redact", r));
4404
+ } else if (dgOpts.redact === true || options?.piiRedaction) {
4405
+ params.append("redact", "pii");
4406
+ params.append("redact", "pci");
4407
+ }
4408
+ }
4409
+ if (dgOpts.callback) {
4410
+ params.append("callback", dgOpts.callback);
4411
+ }
4412
+ if (dgOpts.tag && dgOpts.tag.length > 0) {
4413
+ dgOpts.tag.forEach((t) => params.append("tag", t));
4414
+ }
4415
+ if (dgOpts.extra) {
4416
+ params.append("extra", JSON.stringify(dgOpts.extra));
4417
+ }
4418
+ if (options?.endpointing !== void 0 || dgOpts.endpointing !== void 0) {
4419
+ const ep = options?.endpointing ?? dgOpts.endpointing;
4420
+ if (ep === false) {
4421
+ params.append("endpointing", "false");
4422
+ } else if (typeof ep === "number") {
4423
+ params.append("endpointing", String(ep));
4424
+ }
4425
+ }
4426
+ if (dgOpts.vadThreshold !== void 0) {
4427
+ params.append("vad_events", "true");
4428
+ }
4429
+ return `${this.wsBaseUrl}?${params.toString()}`;
4430
+ }
4431
+ /**
4432
+ * Handle all WebSocket message types from Deepgram streaming
4433
+ */
4434
+ handleWebSocketMessage(message, callbacks) {
4435
+ switch (message.type) {
4436
+ case "Results": {
4437
+ const channel = message.channel.alternatives[0];
4438
+ if (channel && channel.transcript) {
4439
+ callbacks?.onTranscript?.({
4440
+ type: "transcript",
4441
+ text: channel.transcript,
4442
+ isFinal: message.is_final,
4443
+ confidence: channel.confidence,
4444
+ language: message.channel.detected_language,
4445
+ words: channel.words?.map((w) => ({
4446
+ word: w.punctuated_word || w.word,
4447
+ start: w.start,
4448
+ end: w.end,
4449
+ confidence: w.confidence,
4450
+ speaker: w.speaker?.toString()
4451
+ })),
4452
+ data: message
4453
+ });
4454
+ }
4455
+ if (message.speech_final && channel && channel.transcript) {
4456
+ callbacks?.onUtterance?.({
4457
+ text: channel.transcript,
4458
+ start: message.start,
4459
+ end: message.start + message.duration,
4460
+ confidence: channel.confidence,
4461
+ words: channel.words?.map((w) => ({
4462
+ word: w.punctuated_word || w.word,
4463
+ start: w.start,
4464
+ end: w.end,
4465
+ confidence: w.confidence
4466
+ }))
4467
+ });
4468
+ }
4469
+ break;
4470
+ }
4471
+ case "SpeechStarted": {
4472
+ const event = {
4473
+ type: "speech_start",
4474
+ timestamp: message.timestamp,
4475
+ channel: message.channel[0]
4476
+ };
4477
+ callbacks?.onSpeechStart?.(event);
4478
+ break;
4479
+ }
4480
+ case "UtteranceEnd": {
4481
+ const event = {
4482
+ type: "speech_end",
4483
+ timestamp: message.last_word_end,
4484
+ channel: message.channel[0]
4485
+ };
4486
+ callbacks?.onSpeechEnd?.(event);
4487
+ break;
4488
+ }
4489
+ case "Metadata": {
4490
+ callbacks?.onMetadata?.(message);
4491
+ break;
4492
+ }
4493
+ case "Error": {
4494
+ callbacks?.onError?.({
4495
+ code: message.variant || "DEEPGRAM_ERROR",
4496
+ message: message.message || message.description || "Unknown error",
4497
+ details: message
4498
+ });
4499
+ break;
4500
+ }
4501
+ case "CloseStream": {
4502
+ break;
4503
+ }
4504
+ default: {
4505
+ callbacks?.onMetadata?.(message);
4506
+ break;
4507
+ }
4508
+ }
4509
+ }
3631
4510
  };
3632
4511
  function createDeepgramAdapter(config) {
3633
4512
  const adapter = new DeepgramAdapter();
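
Taken together, `buildStreamingUrl` and `handleWebSocketMessage` define the round trip: the options become query parameters on the WebSocket URL, and incoming `Results` messages fan out to `onTranscript` (every result with text) and, when `speech_final` is set, to `onUtterance`. A sketch of that mapping for a small option set; the host is elided because `wsBaseUrl` is not shown in this hunk, everything else follows the code above:

```typescript
const session = await adapter.transcribeStream({
  encoding: "linear16",
  sampleRate: 16000,
  language: "en",
  model: "nova-2",
  diarization: true,
  interimResults: true
}, {
  // Every Results message with text; isFinal mirrors Deepgram's is_final flag.
  onTranscript: (t) => process.stdout.write(`${t.isFinal ? "[final] " : "[interim] "}${t.text}\n`),
  // Fired only when Deepgram marks speech_final, i.e. a complete utterance.
  onUtterance: (u) => console.log(`utterance ${u.start}s-${u.end}s:`, u.text),
  onSpeechStart: (e) => console.log("speech started at", e.timestamp),
  onSpeechEnd: (e) => console.log("utterance ended at", e.timestamp)
});

// Connection URL produced by buildStreamingUrl for the options above:
// `${wsBaseUrl}?encoding=linear16&sample_rate=16000&language=en&model=nova-2&diarize=true&interim_results=true`
```
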
@@ -3881,12 +4760,12 @@ var AzureSTTAdapter = class extends BaseAdapter {
3881
4760
  const recognizedPhrases = transcriptionData.recognizedPhrases || [];
3882
4761
  const fullText = combinedPhrases.map((phrase) => phrase.display || phrase.lexical).join(" ") || "";
3883
4762
  const words = recognizedPhrases.flatMap(
3884
- (phrase) => (phrase.nBest?.[0]?.words || []).map((word) => ({
3885
- text: word.word,
3886
- start: word.offsetInTicks / 1e7,
4763
+ (phrase) => (phrase.nBest?.[0]?.words || []).map((w) => ({
4764
+ word: w.word,
4765
+ start: w.offsetInTicks / 1e7,
3887
4766
  // Convert ticks to seconds
3888
- end: (word.offsetInTicks + word.durationInTicks) / 1e7,
3889
- confidence: word.confidence,
4767
+ end: (w.offsetInTicks + w.durationInTicks) / 1e7,
4768
+ confidence: w.confidence,
3890
4769
  speaker: phrase.speaker !== void 0 ? phrase.speaker.toString() : void 0
3891
4770
  }))
3892
4771
  );
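
The Azure mapping divides by 1e7 because the batch transcription results report `offsetInTicks`/`durationInTicks` in 100-nanosecond ticks, so 10,000,000 ticks equal one second. A worked example with hypothetical values:

```typescript
// 1 tick = 100 ns, so seconds = ticks / 1e7.
const offsetInTicks = 23_500_000;   // word begins 2.35 s into the audio
const durationInTicks = 4_200_000;  // word lasts 0.42 s
const start = offsetInTicks / 1e7;                     // 2.35
const end = (offsetInTicks + durationInTicks) / 1e7;   // 2.77
console.log({ start, end });
```
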
@@ -4167,10 +5046,10 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
4167
5046
  }
4168
5047
  if ("duration" in response && "language" in response) {
4169
5048
  const verboseResponse = response;
4170
- const words = verboseResponse.words?.map((word) => ({
4171
- text: word.word,
4172
- start: word.start,
4173
- end: word.end,
5049
+ const words = verboseResponse.words?.map((w) => ({
5050
+ word: w.word,
5051
+ start: w.start,
5052
+ end: w.end,
4174
5053
  confidence: void 0
4175
5054
  }));
4176
5055
  const requestId2 = `openai-${Date.now()}`;
@@ -4436,7 +5315,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
4436
5315
  normalizeResponse(response) {
4437
5316
  const text = response.results.filter((r) => r.type === "word" && r.alternatives).map((r) => r.alternatives[0]?.content || "").join(" ");
4438
5317
  const words = response.results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
4439
- text: result.alternatives?.[0]?.content || "",
5318
+ word: result.alternatives?.[0]?.content || "",
4440
5319
  start: result.start_time,
4441
5320
  end: result.end_time,
4442
5321
  confidence: result.alternatives?.[0]?.confidence,
@@ -4587,12 +5466,12 @@ var GladiaWebhookHandler = class extends BaseWebhookHandler {
4587
5466
  /**
4588
5467
  * Convert Gladia WordDTO to unified Word type
4589
5468
  */
4590
- mapWord(word) {
5469
+ mapWord(w) {
4591
5470
  return {
4592
- text: word.word,
4593
- start: word.start,
4594
- end: word.end,
4595
- confidence: word.confidence
5471
+ word: w.word,
5472
+ start: w.start,
5473
+ end: w.end,
5474
+ confidence: w.confidence
4596
5475
  };
4597
5476
  }
4598
5477
  /**
@@ -4930,11 +5809,11 @@ var DeepgramWebhookHandler = class extends BaseWebhookHandler {
4930
5809
  raw: payload
4931
5810
  };
4932
5811
  }
4933
- const words = alternative.words && alternative.words.length > 0 ? alternative.words.map((word) => ({
4934
- text: word.word || "",
4935
- start: word.start || 0,
4936
- end: word.end || 0,
4937
- confidence: word.confidence
5812
+ const words = alternative.words && alternative.words.length > 0 ? alternative.words.map((w) => ({
5813
+ word: w.word || "",
5814
+ start: w.start || 0,
5815
+ end: w.end || 0,
5816
+ confidence: w.confidence
4938
5817
  })) : void 0;
4939
5818
  const speakers = response.results.utterances && response.results.utterances.length > 0 ? response.results.utterances.map((utterance) => ({
4940
5819
  id: utterance.speaker?.toString() || "unknown",
@@ -4948,11 +5827,11 @@ var DeepgramWebhookHandler = class extends BaseWebhookHandler {
4948
5827
  end: utterance.end || 0,
4949
5828
  speaker: utterance.speaker?.toString(),
4950
5829
  confidence: utterance.confidence,
4951
- words: utterance.words && utterance.words.length > 0 ? utterance.words.map((word) => ({
4952
- text: word.word || "",
4953
- start: word.start || 0,
4954
- end: word.end || 0,
4955
- confidence: word.confidence
5830
+ words: utterance.words && utterance.words.length > 0 ? utterance.words.map((w) => ({
5831
+ word: w.word || "",
5832
+ start: w.start || 0,
5833
+ end: w.end || 0,
5834
+ confidence: w.confidence
4956
5835
  })) : void 0
4957
5836
  })) : void 0;
4958
5837
  const summary = alternative.summaries?.[0]?.summary;
@@ -5464,6 +6343,9 @@ function createWebhookRouter() {
5464
6343
  // Annotate the CommonJS export names for ESM import in node:
5465
6344
  0 && (module.exports = {
5466
6345
  AssemblyAIAdapter,
6346
+ AssemblyAIEncoding,
6347
+ AssemblyAISampleRate,
6348
+ AssemblyAISpeechModel,
5467
6349
  AssemblyAITypes,
5468
6350
  AssemblyAIWebhookHandler,
5469
6351
  AzureSTTAdapter,
@@ -5471,8 +6353,18 @@ function createWebhookRouter() {
5471
6353
  BaseAdapter,
5472
6354
  BaseWebhookHandler,
5473
6355
  DeepgramAdapter,
6356
+ DeepgramEncoding,
6357
+ DeepgramModel,
6358
+ DeepgramRedact,
6359
+ DeepgramTopicMode,
5474
6360
  DeepgramWebhookHandler,
5475
6361
  GladiaAdapter,
6362
+ GladiaBitDepth,
6363
+ GladiaEncoding,
6364
+ GladiaLanguage,
6365
+ GladiaModel,
6366
+ GladiaSampleRate,
6367
+ GladiaTranslationLanguage,
5476
6368
  GladiaTypes,
5477
6369
  GladiaWebhookHandler,
5478
6370
  ListenV1EncodingParameter,