assemblyai 4.33.3 → 4.34.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39):
  1. package/README.md +22 -0
  2. package/dist/assemblyai.streaming.umd.js +1291 -3
  3. package/dist/assemblyai.streaming.umd.min.js +1 -1
  4. package/dist/assemblyai.umd.js +802 -7
  5. package/dist/assemblyai.umd.min.js +1 -1
  6. package/dist/browser.mjs +775 -5
  7. package/dist/bun.mjs +775 -5
  8. package/dist/deno.mjs +775 -5
  9. package/dist/exports/streaming.d.ts +7 -0
  10. package/dist/index.cjs +802 -7
  11. package/dist/index.mjs +794 -8
  12. package/dist/node.cjs +783 -4
  13. package/dist/node.mjs +775 -5
  14. package/dist/services/index.d.ts +2 -2
  15. package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
  16. package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
  17. package/dist/services/streaming/energy-vad.d.ts +35 -0
  18. package/dist/services/streaming/index.d.ts +4 -0
  19. package/dist/services/streaming/label-mapper.d.ts +44 -0
  20. package/dist/services/streaming/resampler.d.ts +22 -0
  21. package/dist/services/streaming/service.d.ts +71 -2
  22. package/dist/streaming.browser.mjs +1247 -4
  23. package/dist/streaming.cjs +1287 -3
  24. package/dist/streaming.mjs +1276 -4
  25. package/dist/types/streaming/dual-channel.d.ts +48 -0
  26. package/dist/types/streaming/index.d.ts +140 -4
  27. package/dist/workerd.mjs +775 -5
  28. package/package.json +1 -1
  29. package/src/exports/streaming.ts +7 -0
  30. package/src/services/index.ts +20 -1
  31. package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
  32. package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
  33. package/src/services/streaming/energy-vad.ts +75 -0
  34. package/src/services/streaming/index.ts +4 -0
  35. package/src/services/streaming/label-mapper.ts +128 -0
  36. package/src/services/streaming/resampler.ts +69 -0
  37. package/src/services/streaming/service.ts +405 -3
  38. package/src/types/streaming/dual-channel.ts +57 -0
  39. package/src/types/streaming/index.ts +144 -1
@@ -6,6 +6,7 @@ import {
6
6
  import { ErrorEvent, MessageEvent, CloseEvent } from "ws";
7
7
  import { conditions } from "#conditions";
8
8
  import {
9
+ ChannelAttributionParams,
9
10
  StreamingEvents,
10
11
  StreamingListeners,
11
12
  StreamingTranscriberParams,
@@ -14,16 +15,67 @@ import {
14
15
  StreamingEventMessage,
15
16
  TurnEvent,
16
17
  LLMGatewayResponseEvent,
18
+ SpeakerRevisionEvent,
17
19
  StreamingUpdateConfiguration,
18
20
  StreamingForceEndpoint,
19
21
  WarningEvent,
20
22
  } from "../..";
23
+ import type { VadDetector, VadFrame } from "../../types/streaming/dual-channel";
24
+ import { EnergyVad } from "./energy-vad";
25
+ import { attributeTurn, rollUpTurnChannel, VadTimeline } from "./label-mapper";
21
26
  import { StreamingError, StreamingErrorMessages } from "../../utils/errors";
22
27
  import { StreamingErrorTypeCodes } from "../../utils/errors/streaming";
23
28
 
29
/**
 * Per-call options accepted by `sendAudio`.
 *
 * `channel` names the declared input channel the PCM belongs to. It is
 * mandatory (and validated against the declared names) when the transcriber
 * was configured with `channels`; without `channels` the option is not read.
 */
export type SendAudioOptions = {
  /** Name of the declared channel this audio chunk was captured from. */
  channel?: string;
};
37
+
38
/**
 * Interpret any `AudioData` (ArrayBuffer / ArrayBufferView / typed array) as a
 * sequence of 16-bit PCM samples in platform byte order.
 *
 * The result is a zero-copy view whenever the input allows it:
 * - an `Int16Array` is returned as-is;
 * - any other view whose `byteOffset` is 2-byte aligned is re-viewed in place;
 * - a plain `ArrayBuffer` is viewed from offset 0.
 *
 * `Int16Array` requires a 2-byte-aligned start offset and throws `RangeError`
 * otherwise, so a view with an odd `byteOffset` (e.g. a slice of a larger
 * `Uint8Array`) is copied into a fresh, aligned buffer instead. A trailing odd
 * byte is ignored rather than raising.
 *
 * @param audio Raw PCM16 bytes.
 * @returns The samples contained in `audio`; length is `floor(byteLength / 2)`.
 */
function toInt16View(audio: AudioData): Int16Array {
  if (audio instanceof Int16Array) return audio;

  if (ArrayBuffer.isView(audio)) {
    const view = audio as ArrayBufferView;
    const sampleCount = Math.floor(view.byteLength / 2);
    if (view.byteOffset % 2 === 0) {
      return new Int16Array(view.buffer, view.byteOffset, sampleCount);
    }
    // Unaligned start: an in-place Int16Array would throw, so copy the bytes
    // into a new buffer that starts at offset 0.
    const bytes = new Uint8Array(view.buffer, view.byteOffset, sampleCount * 2);
    return new Int16Array(bytes.slice().buffer);
  }

  const buffer = audio as ArrayBuffer;
  // Pass an explicit length so an odd byteLength drops the dangling byte
  // instead of throwing RangeError.
  return new Int16Array(buffer, 0, Math.floor(buffer.byteLength / 2));
}
57
+
24
58
  const defaultStreamingUrl = "wss://streaming.assemblyai.com/v3/ws";
25
59
  const terminateSessionMessage = `{"type":"Terminate"}`;
26
60
 
61
/**
 * Upper bound, in milliseconds of audio, for a single message emitted by the
 * dual-channel mixer.
 *
 * The streaming server refuses audio messages longer than 1000 ms
 * (`Input Duration Error`). When audio piles up between flushes — for example
 * a backgrounded browser tab whose `setInterval` is throttled to roughly 1 Hz —
 * `flushMix` keeps looping and sends several messages, none longer than this
 * cap, until the per-channel buffers are drained.
 */
const MAX_CHUNK_MS = 200;
69
+
70
/**
 * Lower bound, in milliseconds of audio, for a single message emitted by the
 * dual-channel mixer.
 *
 * Messages shorter than 50 ms are rejected by the streaming server with the
 * same `Input Duration Error` as over-long ones, so the mixer holds off until
 * every per-channel buffer contains at least this much audio. The final flush
 * performed while closing ignores this floor so the last partial buffer is
 * still transmitted.
 */
const MIN_CHUNK_MS = 50;
78
+
27
79
  type BufferLike =
28
80
  | string
29
81
  | Buffer
@@ -51,6 +103,25 @@ export class StreamingTranscriber {
51
103
  private listeners: StreamingListeners = {};
52
104
  private sessionTerminatedResolve?: () => void;
53
105
 
106
+ // Dual-channel mode state (allocated only when params.channels is set).
107
+ private isDualChannel = false;
108
+ private channelNames?: string[];
109
+ private channelBuffers?: Map<string, number[]>;
110
+ private channelSamplesReceived?: Map<string, number>;
111
+ private channelVadFloatBuffers?: Map<string, Float32Array>;
112
+ private channelVadBufferIdx?: Map<string, number>;
113
+ private channelVads?: Map<string, VadDetector>;
114
+ private timeline?: VadTimeline;
115
+ private flushTimer?: ReturnType<typeof setInterval>;
116
+ private attributionParams?: Required<ChannelAttributionParams>;
117
+ private vadFrameSamples = 0;
118
+ private minChunkSamples = 0;
119
+ private maxChunkSamples = 0;
120
+ // For resolveUnknownChannelsMethod === "speaker-history": per-speaker_label
121
+ // cumulative active-VAD RMS per channel. Allocated only when that method is
122
+ // configured.
123
+ private speakerHistory?: Map<string, Map<string, number>>;
124
+
54
125
  constructor(params: StreamingTranscriberParams) {
55
126
  this.params = {
56
127
  ...params,
@@ -63,6 +134,58 @@ export class StreamingTranscriber {
63
134
  if (!(this.token || this.apiKey)) {
64
135
  throw new Error("API key or temporary token is required.");
65
136
  }
137
+
138
+ if (params.channels) {
139
+ if (params.channels.length !== 2) {
140
+ throw new Error(
141
+ "StreamingTranscriber.channels must have exactly 2 entries.",
142
+ );
143
+ }
144
+ const names = params.channels.map((c) => c.name);
145
+ if (new Set(names).size !== names.length) {
146
+ throw new Error("StreamingTranscriber.channels names must be unique.");
147
+ }
148
+ this.isDualChannel = true;
149
+ this.channelNames = names;
150
+ const att = params.channelAttribution ?? {};
151
+ this.attributionParams = {
152
+ dominanceRatio: att.dominanceRatio ?? 4,
153
+ timelineWindowMs: att.timelineWindowMs ?? 30_000,
154
+ createVad: att.createVad ?? (() => new EnergyVad()),
155
+ flushIntervalMs: att.flushIntervalMs ?? 50,
156
+ resolveUnknownChannelsMethod:
157
+ att.resolveUnknownChannelsMethod ?? "window",
158
+ resolutionWindowWords: att.resolutionWindowWords ?? 2,
159
+ speakerHistoryMinRmsEvidence: att.speakerHistoryMinRmsEvidence ?? 0.5,
160
+ speakerHistoryDominanceRatio: att.speakerHistoryDominanceRatio ?? 3,
161
+ };
162
+ if (
163
+ this.attributionParams.resolveUnknownChannelsMethod ===
164
+ "speaker-history"
165
+ ) {
166
+ this.speakerHistory = new Map();
167
+ }
168
+ // 20 ms VAD frames at the transcriber's target sample rate.
169
+ this.vadFrameSamples = Math.max(1, Math.round(params.sampleRate * 0.02));
170
+ this.minChunkSamples = Math.max(
171
+ 1,
172
+ Math.round(params.sampleRate * (MIN_CHUNK_MS / 1000)),
173
+ );
174
+ this.maxChunkSamples = Math.max(
175
+ this.minChunkSamples,
176
+ Math.round(params.sampleRate * (MAX_CHUNK_MS / 1000)),
177
+ );
178
+ this.channelBuffers = new Map(names.map((n) => [n, [] as number[]]));
179
+ this.channelSamplesReceived = new Map(names.map((n) => [n, 0]));
180
+ this.channelVadFloatBuffers = new Map(
181
+ names.map((n) => [n, new Float32Array(this.vadFrameSamples)]),
182
+ );
183
+ this.channelVadBufferIdx = new Map(names.map((n) => [n, 0]));
184
+ this.channelVads = new Map(
185
+ names.map((n) => [n, this.attributionParams!.createVad(n)]),
186
+ );
187
+ this.timeline = new VadTimeline(this.attributionParams.timelineWindowMs);
188
+ }
66
189
  }
67
190
 
68
191
  private connectionUrl(): URL {
@@ -140,6 +263,10 @@ export class StreamingTranscriber {
140
263
  searchParams.set("prompt", this.params.prompt);
141
264
  }
142
265
 
266
+ if (this.params.agentContext) {
267
+ searchParams.set("agent_context", this.params.agentContext);
268
+ }
269
+
143
270
  if (this.params.filterProfanity) {
144
271
  searchParams.set(
145
272
  "filter_profanity",
@@ -152,7 +279,9 @@ export class StreamingTranscriber {
152
279
  "[Deprecation Warning] The speech model `u3-pro` is deprecated and will be removed in a future release. Please use `u3-rt-pro` instead.",
153
280
  );
154
281
  }
155
- searchParams.set("speech_model", this.params.speechModel.toString());
282
+ if (this.params.speechModel !== undefined) {
283
+ searchParams.set("speech_model", this.params.speechModel.toString());
284
+ }
156
285
 
157
286
  if (this.params.languageDetection !== undefined) {
158
287
  searchParams.set(
@@ -267,6 +396,10 @@ export class StreamingTranscriber {
267
396
  searchParams.set("redact_pii_sub", this.params.redactPiiSub);
268
397
  }
269
398
 
399
+ if (this.params.mode !== undefined) {
400
+ searchParams.set("mode", this.params.mode);
401
+ }
402
+
270
403
  if (this.params.llmGateway !== undefined) {
271
404
  searchParams.set("llm_gateway", JSON.stringify(this.params.llmGateway));
272
405
  }
@@ -282,7 +415,12 @@ export class StreamingTranscriber {
282
415
  event: "llmGatewayResponse",
283
416
  listener: (event: LLMGatewayResponseEvent) => void,
284
417
  ): void;
418
+ on(
419
+ event: "speakerRevision",
420
+ listener: (event: SpeakerRevisionEvent) => void,
421
+ ): void;
285
422
  on(event: "warning", listener: (event: WarningEvent) => void): void;
423
+ on(event: "vad", listener: (event: VadFrame) => void): void;
286
424
  on(event: "error", listener: (error: Error) => void): void;
287
425
  on(event: "close", listener: (code: number, reason: string) => void): void;
288
426
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -323,6 +461,13 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
323
461
  reason = StreamingErrorMessages[code as StreamingErrorTypeCodes];
324
462
  }
325
463
  }
464
+ // Stop the flush timer when the socket is gone (server-initiated close,
465
+ // network drop, etc.) — otherwise subsequent ticks call send() on a
466
+ // closed socket and spam the error listener.
467
+ if (this.flushTimer) {
468
+ clearInterval(this.flushTimer);
469
+ this.flushTimer = undefined;
470
+ }
326
471
  this.listeners.close?.(code, reason);
327
472
  };
328
473
 
@@ -351,6 +496,22 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
351
496
  break;
352
497
  }
353
498
  case "Turn": {
499
+ if (this.isDualChannel && this.timeline && this.attributionParams) {
500
+ attributeTurn(message, this.timeline, {
501
+ dominanceRatio: this.attributionParams.dominanceRatio,
502
+ });
503
+ switch (this.attributionParams.resolveUnknownChannelsMethod) {
504
+ case "window":
505
+ this.resolveUnknownChannelsByWindow(message);
506
+ break;
507
+ case "speaker-history":
508
+ this.resolveUnknownChannelsBySpeakerHistory(message);
509
+ break;
510
+ case "none":
511
+ // Leave "unknown" words as-is.
512
+ break;
513
+ }
514
+ }
354
515
  this.listeners.turn?.(message);
355
516
  break;
356
517
  }
@@ -362,6 +523,10 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
362
523
  this.listeners.llmGatewayResponse?.(message);
363
524
  break;
364
525
  }
526
+ case "SpeakerRevision": {
527
+ this.listeners.speakerRevision?.(message);
528
+ break;
529
+ }
365
530
  case "Warning": {
366
531
  const warning = message as WarningEvent;
367
532
  console.warn(
@@ -379,6 +544,11 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
379
544
  });
380
545
  }
381
546
 
547
+ /**
548
+ * Returns a WritableStream that pumps PCM chunks into `sendAudio`. Single-channel
549
+ * only — in dual-channel mode use `sendAudio(pcm, { channel })` directly, since
550
+ * `WritableStream` has no place to carry a channel tag.
551
+ */
382
552
  stream(): WritableStream<AudioData> {
383
553
  return new WritableStream<AudioData>({
384
554
  write: (chunk: AudioData) => {
@@ -387,8 +557,231 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
387
557
  });
388
558
  }
389
559
 
390
- sendAudio(audio: AudioData) {
391
- this.send(audio);
560
/**
 * Submit a chunk of PCM audio.
 *
 * Single-channel mode: the chunk goes straight to the WebSocket and `options`
 * is not consulted.
 *
 * Dual-channel mode (the transcriber was created with `channels`): the chunk
 * is routed to the named channel's ingest path, which runs that channel's VAD
 * and queues the samples; a timer (`channelAttribution.flushIntervalMs`,
 * default 50ms) later mixes the queued channels to mono and sends the result.
 *
 * @param audio PCM audio bytes.
 * @param options Must carry `channel` in dual-channel mode.
 * @throws Error if dual-channel mode is active and `options.channel` is
 *   missing/empty or is not one of the declared channel names.
 */
sendAudio(audio: AudioData, options?: SendAudioOptions) {
  if (this.isDualChannel) {
    const requested = options?.channel;
    if (!requested) {
      throw new Error(
        "StreamingTranscriber is in dual-channel mode; sendAudio requires { channel }.",
      );
    }
    const declared = this.channelNames!;
    if (!declared.includes(requested)) {
      throw new Error(
        `Unknown channel "${requested}"; declared channels: ${declared.join(", ")}.`,
      );
    }
    this.ingestChannelAudio(requested, audio);
    return;
  }
  this.send(audio);
}
589
+
590
/**
 * Accept one chunk of PCM for a declared channel.
 *
 * Every sample is appended to the channel's pending-mix queue and, scaled by
 * 1/0x8000, written into the channel's VAD frame buffer. Each time that buffer
 * fills (`vadFrameSamples` samples) the channel's detector is run, and the
 * resulting `VadFrame` is pushed onto the timeline and handed to the `vad`
 * listener. The frame timestamp is computed from the channel's running sample
 * count, so it marks the end of the frame. Starts the flush timer if it is not
 * already running.
 */
private ingestChannelAudio(name: string, audio: AudioData) {
  const pcm = toInt16View(audio);
  const pending = this.channelBuffers!.get(name)!;
  const frameBuffer = this.channelVadFloatBuffers!.get(name)!;
  let cursor = this.channelVadBufferIdx!.get(name)!;
  let sampleCount = this.channelSamplesReceived!.get(name)!;
  const detector = this.channelVads!.get(name)!;
  const rate = this.params.sampleRate;
  const samplesPerFrame = this.vadFrameSamples;

  for (let n = 0; n < pcm.length; n++) {
    const sample = pcm[n];
    pending.push(sample);
    frameBuffer[cursor] = sample / 0x8000;
    cursor += 1;
    sampleCount += 1;

    // Keep filling until a whole VAD frame is available.
    if (cursor !== samplesPerFrame) continue;

    const verdict = detector.process(frameBuffer);
    const frame: VadFrame = {
      ts: (sampleCount / rate) * 1000,
      channel: name,
      active: verdict.active,
      rms: verdict.energy,
    };
    this.timeline!.pushFrame(frame);
    this.listeners.vad?.(frame);
    cursor = 0;
  }

  // Persist the partial-frame position and running total for the next chunk.
  this.channelVadBufferIdx!.set(name, cursor);
  this.channelSamplesReceived!.set(name, sampleCount);

  if (!this.flushTimer) {
    this.startFlushTimer();
  }
}
624
+
625
/**
 * Begin the periodic mix-and-send cycle: `flushMix` is invoked every
 * `attributionParams.flushIntervalMs` milliseconds and the interval handle is
 * kept in `flushTimer`.
 */
private startFlushTimer() {
  const periodMs = this.attributionParams!.flushIntervalMs;
  this.flushTimer = setInterval(() => {
    this.flushMix();
  }, periodMs);
}
631
+
632
/**
 * Mix the queued per-channel samples down to mono and send them.
 *
 * Only as many samples as the shortest channel queue holds can be mixed. The
 * method repeats until nothing mixable is left, sending at most
 * `maxChunkSamples` per message so a backlog (e.g. built up while a browser
 * tab was throttled in the background) never produces a message over the
 * server's 1000 ms input limit (rejected with code 3007).
 *
 * @param force When false, nothing is sent while fewer than `minChunkSamples`
 *   are mixable (the server rejects messages under 50 ms). When true — the
 *   final flush on close — that floor is skipped so the tail is still sent.
 */
private flushMix(force = false) {
  if (!this.channelNames || !this.channelBuffers) return;
  const queues = this.channelNames.map((n) => this.channelBuffers!.get(n)!);
  const channelCount = queues.length;

  const shortestQueue = (): number => {
    let shortest = Infinity;
    for (const q of queues) {
      if (q.length < shortest) shortest = q.length;
    }
    return shortest;
  };

  for (
    let mixable = shortestQueue();
    Number.isFinite(mixable) && mixable !== 0;
    mixable = shortestQueue()
  ) {
    // Below the server's minimum message duration: wait for more audio.
    if (!force && mixable < this.minChunkSamples) return;

    const take =
      mixable > this.maxChunkSamples ? this.maxChunkSamples : mixable;
    const mono = new Int16Array(take);
    for (let pos = 0; pos < take; pos++) {
      let total = 0;
      for (let ch = 0; ch < channelCount; ch++) total += queues[ch][pos];
      const mean = Math.round(total / channelCount);
      // Clamp to the signed 16-bit range.
      let clamped = mean;
      if (mean < -32768) clamped = -32768;
      else if (mean > 32767) clamped = 32767;
      mono[pos] = clamped;
    }

    // Consume the mixed samples before sending; they are not re-queued if
    // the send fails.
    for (const q of queues) q.splice(0, take);

    try {
      this.send(mono.buffer);
    } catch (err) {
      this.listeners.error?.(err as Error);
      return;
    }
  }
}
667
+
668
/**
 * Assign a channel to words attributed as `"unknown"` by majority vote of
 * their neighbours within the same turn.
 *
 * For each `"unknown"` word, the channels of up to `resolutionWindowWords`
 * words on either side are tallied, skipping neighbours that are themselves
 * `"unknown"` or have no channel. The word takes the channel with the strictly
 * highest count; with no usable neighbours, or on a tie, it stays `"unknown"`.
 * Words that already carry a channel are never changed.
 *
 * Words are processed left to right and updated in place, so a word resolved
 * earlier in the pass counts as a neighbour for later words.
 *
 * The heuristic is purely positional and does not look at `speaker_label`.
 * Each word filled in here is flagged `channelResolved: true` so consumers can
 * tell inferred channels from measured ones. If anything changed, the
 * turn-level `channel` is recomputed with `rollUpTurnChannel`.
 */
private resolveUnknownChannelsByWindow(turn: TurnEvent): void {
  const params = this.attributionParams;
  if (!params) return;
  const radius = params.resolutionWindowWords;
  const words = turn.words;
  let changed = false;

  for (let idx = 0; idx < words.length; idx++) {
    const word = words[idx];
    if (word.channel !== "unknown") continue;

    // Count neighbour channels inside the window, excluding the word itself.
    const votes = new Map<string, number>();
    const first = Math.max(0, idx - radius);
    const last = Math.min(words.length - 1, idx + radius);
    for (let n = first; n <= last; n++) {
      if (n === idx) continue;
      const neighbourChannel = words[n].channel;
      if (neighbourChannel && neighbourChannel !== "unknown") {
        votes.set(neighbourChannel, (votes.get(neighbourChannel) ?? 0) + 1);
      }
    }
    if (votes.size === 0) continue;

    // Find the strict winner; an equal top count means no decision.
    let winner: string | undefined;
    let winnerVotes = 0;
    let isTie = false;
    for (const [candidate, count] of votes) {
      if (count > winnerVotes) {
        winner = candidate;
        winnerVotes = count;
        isTie = false;
      } else if (count === winnerVotes) {
        isTie = true;
      }
    }

    if (winner && !isTie) {
      word.channel = winner;
      word.channelResolved = true;
      changed = true;
    }
  }

  // The turn-level rollup only needs refreshing when a word was updated.
  if (changed) {
    turn.channel = rollUpTurnChannel(words);
  }
}
722
+
723
/**
 * Assign a channel to `"unknown"` words using what has been observed about
 * each speaker over the whole session.
 *
 * Phase 1 adds, for every word with a `speaker`, the RMS of each active VAD
 * frame inside the word's `[start, end]` window to that speaker's per-channel
 * running total.
 * Phase 2 fills an `"unknown"` word with its speaker's top channel when the
 * speaker's summed evidence is at least `speakerHistoryMinRmsEvidence` and the
 * top channel's score is at least `speakerHistoryDominanceRatio` times the
 * runner-up's (a speaker seen on one channel only passes automatically).
 *
 * Only `"unknown"` words are touched; existing channel decisions and
 * `speaker` values are left alone. Filled words get `channelResolved: true`
 * and the turn-level `channel` is recomputed if anything changed.
 *
 * NOTE(review): phase 1 runs on every call, so a turn delivered more than once
 * contributes its frames again each time — confirm this weighting is intended.
 */
private resolveUnknownChannelsBySpeakerHistory(turn: TurnEvent): void {
  const timeline = this.timeline;
  const params = this.attributionParams;
  const history = this.speakerHistory;
  if (!timeline || !params || !history) return;

  const evidenceFloor = params.speakerHistoryMinRmsEvidence;
  const requiredRatio = params.speakerHistoryDominanceRatio;

  // Phase 1: fold this turn's words into the per-speaker evidence.
  for (const word of turn.words) {
    if (!word.speaker) continue;
    const frames = timeline.framesInWindow(word.start, word.end);
    let perChannel = history.get(word.speaker);
    if (!perChannel) {
      perChannel = new Map();
      history.set(word.speaker, perChannel);
    }
    for (const frame of frames) {
      if (frame.active) {
        perChannel.set(
          frame.channel,
          (perChannel.get(frame.channel) ?? 0) + frame.rms,
        );
      }
    }
  }

  // Phase 2: resolve unknown words whose speaker has a dominant channel.
  let changed = false;
  for (const word of turn.words) {
    if (word.channel !== "unknown" || !word.speaker) continue;
    const scores = history.get(word.speaker);
    if (!scores || scores.size === 0) continue;

    let evidence = 0;
    let leader: string | undefined;
    let leaderScore = 0;
    let secondScore = 0;
    for (const [channel, score] of scores) {
      evidence += score;
      if (score > leaderScore) {
        secondScore = leaderScore;
        leaderScore = score;
        leader = channel;
      } else if (score > secondScore) {
        secondScore = score;
      }
    }

    if (evidence < evidenceFloor) continue;
    if (secondScore > 0 && leaderScore < requiredRatio * secondScore) continue;
    if (leader) {
      word.channel = leader;
      word.channelResolved = true;
      changed = true;
    }
  }

  if (changed) {
    turn.channel = rollUpTurnChannel(turn.words);
  }
}
393
786
 
394
787
  /**
@@ -440,6 +833,15 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
440
833
  }
441
834
 
442
835
  async close(waitForSessionTermination = true) {
836
+ if (this.flushTimer) {
837
+ clearInterval(this.flushTimer);
838
+ this.flushTimer = undefined;
839
+ // Best-effort: drain any final partial mix so the server gets the tail.
840
+ // Bypass the 50ms floor here since this is the last flush; if the tail
841
+ // is <50ms the server will reject that single message, but we'd lose
842
+ // the audio either way.
843
+ this.flushMix(true);
844
+ }
443
845
  if (this.socket) {
444
846
  if (this.socket.readyState === this.socket.OPEN) {
445
847
  if (waitForSessionTermination) {
@@ -0,0 +1,57 @@
1
/**
 * The physical input channel a word or turn was attributed to.
 *
 * Either one of the names declared in `StreamingTranscriberParams.channels`
 * (for example `"mic"` or `"system"`), or the literal `"unknown"` when no
 * channel clearly dominated the word's time window — it was silent, or all
 * channels were comparably active below the dominance threshold.
 *
 * Channel attribution is a separate dimension from AssemblyAI's diarization
 * (`speaker_label` / `words[i].speaker`), which distinguishes voices by their
 * acoustic characteristics. One speaker label may appear on any physical
 * channel, and the two can disagree.
 */
export type Channel = string | "unknown";
12
+
13
/**
 * One voice-activity observation for one channel, emitted by
 * `StreamingTranscriber` while in dual-channel mode.
 */
export type VadFrame = {
  /**
   * Stream-relative time in milliseconds, derived from the channel's sample
   * counter. It shares its reference frame with `StreamingWord.start` /
   * `.end`, so word lookups need no conversion.
   */
  ts: number;
  /** Name of the channel the frame was measured on. */
  channel: string;
  /** Whether the detector judged the frame to contain voice activity. */
  active: boolean;
  /** Energy value reported by the detector for the frame. */
  rms: number;
};
25
+
26
/** Outcome of running a `VadDetector` on a single frame. */
export type VadDetectorResult = {
  /** Whether the frame was classified as containing voice activity. */
  active: boolean;
  /** Energy measurement the detector computed for the frame. */
  energy: number;
};
30
+
31
/**
 * Contract for a swappable per-channel voice-activity detector.
 *
 * The built-in `EnergyVad` is energy-based with an adaptive noise-floor
 * threshold. A DNN-backed detector (e.g. Silero via `@ricky0123/vad-web`) can
 * be substituted for noisier environments.
 *
 * Each channel gets its own `VadDetector` instance, so implementations must
 * not rely on state shared between channels. Frames have a fixed size at the
 * transcriber's target sample rate.
 */
export interface VadDetector {
  /** Classify one fixed-size frame of float samples. */
  process(frame: Float32Array): VadDetectorResult;
  /** Reset the detector's internal state. */
  reset(): void;
}
43
+
44
/**
 * Raised when `DualChannelCapture` is constructed outside a browser, i.e.
 * when `globalThis.AudioContext` is not available.
 *
 * The capture helper is exported from the main entrypoint so the import path
 * is the same on every runtime; the environment check therefore happens at
 * construction time rather than at import time.
 */
export class BrowserOnlyError extends Error {
  /**
   * @param message Optional override for the default explanation.
   */
  constructor(
    message = "DualChannelCapture requires a browser environment (AudioContext is undefined).",
  ) {
    super(message);
    // When compiled to an ES5 target, `super(...)` on a built-in Error returns
    // a plain Error, which breaks `instanceof BrowserOnlyError`. Restoring the
    // prototype explicitly keeps the check reliable; on ES2015+ it is a no-op.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "BrowserOnlyError";
  }
}