assemblyai 4.33.3 → 4.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assemblyai.streaming.umd.js +1279 -3
- package/dist/assemblyai.streaming.umd.min.js +1 -1
- package/dist/assemblyai.umd.js +786 -3
- package/dist/assemblyai.umd.min.js +1 -1
- package/dist/browser.mjs +762 -4
- package/dist/bun.mjs +762 -4
- package/dist/deno.mjs +762 -4
- package/dist/exports/streaming.d.ts +7 -0
- package/dist/index.cjs +786 -3
- package/dist/index.mjs +778 -4
- package/dist/node.cjs +770 -3
- package/dist/node.mjs +762 -4
- package/dist/services/index.d.ts +2 -2
- package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
- package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
- package/dist/services/streaming/energy-vad.d.ts +35 -0
- package/dist/services/streaming/index.d.ts +4 -0
- package/dist/services/streaming/label-mapper.d.ts +44 -0
- package/dist/services/streaming/resampler.d.ts +22 -0
- package/dist/services/streaming/service.d.ts +69 -1
- package/dist/streaming.browser.mjs +1235 -4
- package/dist/streaming.cjs +1275 -3
- package/dist/streaming.mjs +1264 -4
- package/dist/types/streaming/dual-channel.d.ts +48 -0
- package/dist/types/streaming/index.d.ts +110 -1
- package/dist/workerd.mjs +762 -4
- package/package.json +1 -1
- package/src/exports/streaming.ts +7 -0
- package/src/services/index.ts +20 -1
- package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
- package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
- package/src/services/streaming/energy-vad.ts +75 -0
- package/src/services/streaming/index.ts +4 -0
- package/src/services/streaming/label-mapper.ts +128 -0
- package/src/services/streaming/resampler.ts +69 -0
- package/src/services/streaming/service.ts +385 -2
- package/src/types/streaming/dual-channel.ts +57 -0
- package/src/types/streaming/index.ts +110 -0
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
import { ErrorEvent, MessageEvent, CloseEvent } from "ws";
|
|
7
7
|
import { conditions } from "#conditions";
|
|
8
8
|
import {
|
|
9
|
+
ChannelAttributionParams,
|
|
9
10
|
StreamingEvents,
|
|
10
11
|
StreamingListeners,
|
|
11
12
|
StreamingTranscriberParams,
|
|
@@ -18,12 +19,62 @@ import {
|
|
|
18
19
|
StreamingForceEndpoint,
|
|
19
20
|
WarningEvent,
|
|
20
21
|
} from "../..";
|
|
22
|
+
import type { VadDetector, VadFrame } from "../../types/streaming/dual-channel";
|
|
23
|
+
import { EnergyVad } from "./energy-vad";
|
|
24
|
+
import { attributeTurn, rollUpTurnChannel, VadTimeline } from "./label-mapper";
|
|
21
25
|
import { StreamingError, StreamingErrorMessages } from "../../utils/errors";
|
|
22
26
|
import { StreamingErrorTypeCodes } from "../../utils/errors/streaming";
|
|
23
27
|
|
|
28
|
+
/**
|
|
29
|
+
* Options for `sendAudio`. In dual-channel mode (when `channels` is configured
|
|
30
|
+
* on the transcriber), `channel` is required and must match one of the declared
|
|
31
|
+
* channel names; in single-channel mode it is ignored.
|
|
32
|
+
*/
|
|
33
|
+
export type SendAudioOptions = {
|
|
34
|
+
channel?: string;
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
/**
 * View any `AudioData` (ArrayBuffer / ArrayBufferView / typed array) as a
 * sequence of signed 16-bit PCM samples. Samples are read in platform byte
 * order, which is little-endian on every mainstream runtime.
 *
 * The returned array aliases the caller's memory whenever possible. A copy is
 * made only when the view starts at an odd byte offset (common for Node.js
 * `Buffer`s carved out of the shared pool and for sub-array views), because
 * `Int16Array` requires a 2-byte-aligned offset and would otherwise throw a
 * `RangeError`.
 *
 * A trailing odd byte (half a sample) is dropped rather than throwing, for
 * both raw buffers and views.
 *
 * @param audio - Raw PCM16 bytes as an ArrayBuffer or any typed-array/DataView.
 * @returns An `Int16Array` over the samples (shared memory unless misaligned).
 */
function toInt16View(audio: AudioData): Int16Array {
  // Already in the target representation — hand it back untouched.
  if (audio instanceof Int16Array) return audio;

  let buffer: ArrayBufferLike;
  let byteOffset = 0;
  let byteLength: number;
  if (ArrayBuffer.isView(audio)) {
    // AudioData is ArrayBufferLike per the public type, but in practice
    // callers also pass typed-array views (Uint8Array, Buffer, DataView).
    const view = audio as ArrayBufferView;
    buffer = view.buffer;
    byteOffset = view.byteOffset;
    byteLength = view.byteLength;
  } else {
    buffer = audio as ArrayBuffer;
    byteLength = buffer.byteLength;
  }

  // Whole samples only; a dangling odd byte cannot form a 16-bit sample.
  const sampleCount = Math.floor(byteLength / 2);

  if (byteOffset % 2 !== 0) {
    // Int16Array cannot start at an odd offset; realign via a byte copy.
    const aligned = new Uint8Array(sampleCount * 2);
    aligned.set(new Uint8Array(buffer, byteOffset, sampleCount * 2));
    return new Int16Array(aligned.buffer);
  }

  return new Int16Array(buffer, byteOffset, sampleCount);
}
|
|
56
|
+
|
|
24
57
|
const defaultStreamingUrl = "wss://streaming.assemblyai.com/v3/ws";
|
|
25
58
|
const terminateSessionMessage = `{"type":"Terminate"}`;
|
|
26
59
|
|
|
60
|
+
/**
|
|
61
|
+
* Per-send chunk cap in milliseconds for the dual-channel mixer. The streaming
|
|
62
|
+
* server rejects audio messages longer than 1000 ms (`Input Duration Error`).
|
|
63
|
+
* If a backlog accumulates (e.g. when a browser tab is backgrounded and
|
|
64
|
+
* `setInterval` is throttled to ~1 Hz), `flushMix` loops and emits multiple
|
|
65
|
+
* sends each ≤ this cap until the buffers drain.
|
|
66
|
+
*/
|
|
67
|
+
const MAX_CHUNK_MS = 200;
|
|
68
|
+
|
|
69
|
+
/**
|
|
70
|
+
* Per-send minimum chunk size in milliseconds. The streaming server also
|
|
71
|
+
* rejects audio messages shorter than 50 ms with the same
|
|
72
|
+
* `Input Duration Error`, so the mixer waits until both per-channel buffers
|
|
73
|
+
* have at least this much accumulated before emitting. Final-flush (close
|
|
74
|
+
* path) bypasses this floor so the trailing partial buffer still gets sent.
|
|
75
|
+
*/
|
|
76
|
+
const MIN_CHUNK_MS = 50;
|
|
77
|
+
|
|
27
78
|
type BufferLike =
|
|
28
79
|
| string
|
|
29
80
|
| Buffer
|
|
@@ -51,6 +102,25 @@ export class StreamingTranscriber {
|
|
|
51
102
|
private listeners: StreamingListeners = {};
|
|
52
103
|
private sessionTerminatedResolve?: () => void;
|
|
53
104
|
|
|
105
|
+
// Dual-channel mode state (allocated only when params.channels is set).
|
|
106
|
+
private isDualChannel = false;
|
|
107
|
+
private channelNames?: string[];
|
|
108
|
+
private channelBuffers?: Map<string, number[]>;
|
|
109
|
+
private channelSamplesReceived?: Map<string, number>;
|
|
110
|
+
private channelVadFloatBuffers?: Map<string, Float32Array>;
|
|
111
|
+
private channelVadBufferIdx?: Map<string, number>;
|
|
112
|
+
private channelVads?: Map<string, VadDetector>;
|
|
113
|
+
private timeline?: VadTimeline;
|
|
114
|
+
private flushTimer?: ReturnType<typeof setInterval>;
|
|
115
|
+
private attributionParams?: Required<ChannelAttributionParams>;
|
|
116
|
+
private vadFrameSamples = 0;
|
|
117
|
+
private minChunkSamples = 0;
|
|
118
|
+
private maxChunkSamples = 0;
|
|
119
|
+
// For resolveUnknownChannelsMethod === "speaker-history": per-speaker_label
|
|
120
|
+
// cumulative active-VAD RMS per channel. Allocated only when that method is
|
|
121
|
+
// configured.
|
|
122
|
+
private speakerHistory?: Map<string, Map<string, number>>;
|
|
123
|
+
|
|
54
124
|
constructor(params: StreamingTranscriberParams) {
|
|
55
125
|
this.params = {
|
|
56
126
|
...params,
|
|
@@ -63,6 +133,58 @@ export class StreamingTranscriber {
|
|
|
63
133
|
if (!(this.token || this.apiKey)) {
|
|
64
134
|
throw new Error("API key or temporary token is required.");
|
|
65
135
|
}
|
|
136
|
+
|
|
137
|
+
if (params.channels) {
|
|
138
|
+
if (params.channels.length !== 2) {
|
|
139
|
+
throw new Error(
|
|
140
|
+
"StreamingTranscriber.channels must have exactly 2 entries.",
|
|
141
|
+
);
|
|
142
|
+
}
|
|
143
|
+
const names = params.channels.map((c) => c.name);
|
|
144
|
+
if (new Set(names).size !== names.length) {
|
|
145
|
+
throw new Error("StreamingTranscriber.channels names must be unique.");
|
|
146
|
+
}
|
|
147
|
+
this.isDualChannel = true;
|
|
148
|
+
this.channelNames = names;
|
|
149
|
+
const att = params.channelAttribution ?? {};
|
|
150
|
+
this.attributionParams = {
|
|
151
|
+
dominanceRatio: att.dominanceRatio ?? 4,
|
|
152
|
+
timelineWindowMs: att.timelineWindowMs ?? 30_000,
|
|
153
|
+
createVad: att.createVad ?? (() => new EnergyVad()),
|
|
154
|
+
flushIntervalMs: att.flushIntervalMs ?? 50,
|
|
155
|
+
resolveUnknownChannelsMethod:
|
|
156
|
+
att.resolveUnknownChannelsMethod ?? "window",
|
|
157
|
+
resolutionWindowWords: att.resolutionWindowWords ?? 2,
|
|
158
|
+
speakerHistoryMinRmsEvidence: att.speakerHistoryMinRmsEvidence ?? 0.5,
|
|
159
|
+
speakerHistoryDominanceRatio: att.speakerHistoryDominanceRatio ?? 3,
|
|
160
|
+
};
|
|
161
|
+
if (
|
|
162
|
+
this.attributionParams.resolveUnknownChannelsMethod ===
|
|
163
|
+
"speaker-history"
|
|
164
|
+
) {
|
|
165
|
+
this.speakerHistory = new Map();
|
|
166
|
+
}
|
|
167
|
+
// 20 ms VAD frames at the transcriber's target sample rate.
|
|
168
|
+
this.vadFrameSamples = Math.max(1, Math.round(params.sampleRate * 0.02));
|
|
169
|
+
this.minChunkSamples = Math.max(
|
|
170
|
+
1,
|
|
171
|
+
Math.round(params.sampleRate * (MIN_CHUNK_MS / 1000)),
|
|
172
|
+
);
|
|
173
|
+
this.maxChunkSamples = Math.max(
|
|
174
|
+
this.minChunkSamples,
|
|
175
|
+
Math.round(params.sampleRate * (MAX_CHUNK_MS / 1000)),
|
|
176
|
+
);
|
|
177
|
+
this.channelBuffers = new Map(names.map((n) => [n, [] as number[]]));
|
|
178
|
+
this.channelSamplesReceived = new Map(names.map((n) => [n, 0]));
|
|
179
|
+
this.channelVadFloatBuffers = new Map(
|
|
180
|
+
names.map((n) => [n, new Float32Array(this.vadFrameSamples)]),
|
|
181
|
+
);
|
|
182
|
+
this.channelVadBufferIdx = new Map(names.map((n) => [n, 0]));
|
|
183
|
+
this.channelVads = new Map(
|
|
184
|
+
names.map((n) => [n, this.attributionParams!.createVad(n)]),
|
|
185
|
+
);
|
|
186
|
+
this.timeline = new VadTimeline(this.attributionParams.timelineWindowMs);
|
|
187
|
+
}
|
|
66
188
|
}
|
|
67
189
|
|
|
68
190
|
private connectionUrl(): URL {
|
|
@@ -283,6 +405,7 @@ export class StreamingTranscriber {
|
|
|
283
405
|
listener: (event: LLMGatewayResponseEvent) => void,
|
|
284
406
|
): void;
|
|
285
407
|
on(event: "warning", listener: (event: WarningEvent) => void): void;
|
|
408
|
+
on(event: "vad", listener: (event: VadFrame) => void): void;
|
|
286
409
|
on(event: "error", listener: (error: Error) => void): void;
|
|
287
410
|
on(event: "close", listener: (code: number, reason: string) => void): void;
|
|
288
411
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
@@ -323,6 +446,13 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
323
446
|
reason = StreamingErrorMessages[code as StreamingErrorTypeCodes];
|
|
324
447
|
}
|
|
325
448
|
}
|
|
449
|
+
// Stop the flush timer when the socket is gone (server-initiated close,
|
|
450
|
+
// network drop, etc.) — otherwise subsequent ticks call send() on a
|
|
451
|
+
// closed socket and spam the error listener.
|
|
452
|
+
if (this.flushTimer) {
|
|
453
|
+
clearInterval(this.flushTimer);
|
|
454
|
+
this.flushTimer = undefined;
|
|
455
|
+
}
|
|
326
456
|
this.listeners.close?.(code, reason);
|
|
327
457
|
};
|
|
328
458
|
|
|
@@ -351,6 +481,22 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
351
481
|
break;
|
|
352
482
|
}
|
|
353
483
|
case "Turn": {
|
|
484
|
+
if (this.isDualChannel && this.timeline && this.attributionParams) {
|
|
485
|
+
attributeTurn(message, this.timeline, {
|
|
486
|
+
dominanceRatio: this.attributionParams.dominanceRatio,
|
|
487
|
+
});
|
|
488
|
+
switch (this.attributionParams.resolveUnknownChannelsMethod) {
|
|
489
|
+
case "window":
|
|
490
|
+
this.resolveUnknownChannelsByWindow(message);
|
|
491
|
+
break;
|
|
492
|
+
case "speaker-history":
|
|
493
|
+
this.resolveUnknownChannelsBySpeakerHistory(message);
|
|
494
|
+
break;
|
|
495
|
+
case "none":
|
|
496
|
+
// Leave "unknown" words as-is.
|
|
497
|
+
break;
|
|
498
|
+
}
|
|
499
|
+
}
|
|
354
500
|
this.listeners.turn?.(message);
|
|
355
501
|
break;
|
|
356
502
|
}
|
|
@@ -379,6 +525,11 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
379
525
|
});
|
|
380
526
|
}
|
|
381
527
|
|
|
528
|
+
/**
|
|
529
|
+
* Returns a WritableStream that pumps PCM chunks into `sendAudio`. Single-channel
|
|
530
|
+
* only — in dual-channel mode use `sendAudio(pcm, { channel })` directly, since
|
|
531
|
+
* `WritableStream` has no place to carry a channel tag.
|
|
532
|
+
*/
|
|
382
533
|
stream(): WritableStream<AudioData> {
|
|
383
534
|
return new WritableStream<AudioData>({
|
|
384
535
|
write: (chunk: AudioData) => {
|
|
@@ -387,8 +538,231 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
387
538
|
});
|
|
388
539
|
}
|
|
389
540
|
|
|
390
|
-
|
|
391
|
-
|
|
541
|
+
/**
|
|
542
|
+
* Send PCM audio.
|
|
543
|
+
*
|
|
544
|
+
* In single-channel mode, `audio` is forwarded directly to the WebSocket and
|
|
545
|
+
* `options` is ignored.
|
|
546
|
+
*
|
|
547
|
+
* In dual-channel mode (when `channels` is configured), `options.channel` is
|
|
548
|
+
* REQUIRED and must match one of the declared channel names. Per-channel PCM is
|
|
549
|
+
* fed into that channel's VAD, appended to a per-channel sample queue, and
|
|
550
|
+
* a scheduled flush (`channelAttribution.flushIntervalMs`, default 50ms) mixes
|
|
551
|
+
* the buffers into mono before sending to the WebSocket.
|
|
552
|
+
*/
|
|
553
|
+
sendAudio(audio: AudioData, options?: SendAudioOptions) {
|
|
554
|
+
if (!this.isDualChannel) {
|
|
555
|
+
this.send(audio);
|
|
556
|
+
return;
|
|
557
|
+
}
|
|
558
|
+
if (!options?.channel) {
|
|
559
|
+
throw new Error(
|
|
560
|
+
"StreamingTranscriber is in dual-channel mode; sendAudio requires { channel }.",
|
|
561
|
+
);
|
|
562
|
+
}
|
|
563
|
+
if (!this.channelNames!.includes(options.channel)) {
|
|
564
|
+
throw new Error(
|
|
565
|
+
`Unknown channel "${options.channel}"; declared channels: ${this.channelNames!.join(", ")}.`,
|
|
566
|
+
);
|
|
567
|
+
}
|
|
568
|
+
this.ingestChannelAudio(options.channel, audio);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
private ingestChannelAudio(name: string, audio: AudioData) {
|
|
572
|
+
const samples = toInt16View(audio);
|
|
573
|
+
const buf = this.channelBuffers!.get(name)!;
|
|
574
|
+
const vadBuf = this.channelVadFloatBuffers!.get(name)!;
|
|
575
|
+
let vadIdx = this.channelVadBufferIdx!.get(name)!;
|
|
576
|
+
let received = this.channelSamplesReceived!.get(name)!;
|
|
577
|
+
const vad = this.channelVads!.get(name)!;
|
|
578
|
+
const sampleRate = this.params.sampleRate;
|
|
579
|
+
const frameSize = this.vadFrameSamples;
|
|
580
|
+
|
|
581
|
+
for (let i = 0; i < samples.length; i++) {
|
|
582
|
+
const s = samples[i];
|
|
583
|
+
buf.push(s);
|
|
584
|
+
vadBuf[vadIdx++] = s / 0x8000;
|
|
585
|
+
received++;
|
|
586
|
+
if (vadIdx === frameSize) {
|
|
587
|
+
const result = vad.process(vadBuf);
|
|
588
|
+
const frame: VadFrame = {
|
|
589
|
+
ts: (received / sampleRate) * 1000,
|
|
590
|
+
channel: name,
|
|
591
|
+
active: result.active,
|
|
592
|
+
rms: result.energy,
|
|
593
|
+
};
|
|
594
|
+
this.timeline!.pushFrame(frame);
|
|
595
|
+
this.listeners.vad?.(frame);
|
|
596
|
+
vadIdx = 0;
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
this.channelVadBufferIdx!.set(name, vadIdx);
|
|
601
|
+
this.channelSamplesReceived!.set(name, received);
|
|
602
|
+
|
|
603
|
+
if (!this.flushTimer) this.startFlushTimer();
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
private startFlushTimer() {
|
|
607
|
+
this.flushTimer = setInterval(
|
|
608
|
+
() => this.flushMix(),
|
|
609
|
+
this.attributionParams!.flushIntervalMs,
|
|
610
|
+
);
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
private flushMix(force = false) {
|
|
614
|
+
if (!this.channelNames || !this.channelBuffers) return;
|
|
615
|
+
const bufs = this.channelNames.map((n) => this.channelBuffers!.get(n)!);
|
|
616
|
+
const divisor = bufs.length;
|
|
617
|
+
// Loop so a backlog (e.g. accumulated while a browser tab was throttled in
|
|
618
|
+
// the background) drains as multiple sends, each capped at MAX_CHUNK_MS.
|
|
619
|
+
// Without the cap a single message could exceed the server's 1000 ms input
|
|
620
|
+
// duration limit and be rejected with code 3007.
|
|
621
|
+
for (;;) {
|
|
622
|
+
let mixLen = Infinity;
|
|
623
|
+
for (const b of bufs) if (b.length < mixLen) mixLen = b.length;
|
|
624
|
+
if (!Number.isFinite(mixLen) || mixLen === 0) return;
|
|
625
|
+
// The streaming server rejects audio messages shorter than 50 ms with
|
|
626
|
+
// `Input Duration Error`. Wait until both per-channel buffers have at
|
|
627
|
+
// least minChunkSamples worth queued before emitting. The `force` path
|
|
628
|
+
// (final flush on close) bypasses this so the trailing partial buffer
|
|
629
|
+
// still gets through.
|
|
630
|
+
if (!force && mixLen < this.minChunkSamples) return;
|
|
631
|
+
if (mixLen > this.maxChunkSamples) mixLen = this.maxChunkSamples;
|
|
632
|
+
const out = new Int16Array(mixLen);
|
|
633
|
+
for (let i = 0; i < mixLen; i++) {
|
|
634
|
+
let sum = 0;
|
|
635
|
+
for (let c = 0; c < divisor; c++) sum += bufs[c][i];
|
|
636
|
+
const avg = Math.round(sum / divisor);
|
|
637
|
+
out[i] = avg < -32768 ? -32768 : avg > 32767 ? 32767 : avg;
|
|
638
|
+
}
|
|
639
|
+
for (const b of bufs) b.splice(0, mixLen);
|
|
640
|
+
try {
|
|
641
|
+
this.send(out.buffer);
|
|
642
|
+
} catch (err) {
|
|
643
|
+
this.listeners.error?.(err as Error);
|
|
644
|
+
return;
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
/**
 * Fill in words whose per-word VAD attribution was `"unknown"` by taking the
 * majority directly-measured channel among the ±`resolutionWindowWords`
 * neighbors in the same turn. Words with no measured neighbors, or with a
 * tie between channels, stay `"unknown"`. Confident per-word VAD decisions
 * are never modified.
 *
 * Neighbor evidence is read from a snapshot taken before any word is
 * resolved, so a channel inferred earlier in this pass is never counted as
 * evidence for a later word. Without the snapshot, inferred labels would
 * cascade left-to-right across arbitrarily long runs of `"unknown"` words
 * while runs to the left of a measured word stayed bounded by the window.
 *
 * Local temporal heuristic — ignores `speaker_label`, so it works even when
 * AAI's diarization re-uses the same label for two physically distinct
 * voices. Each resolved word gets `channelResolved: true` so downstream
 * renderers can distinguish inferred channels from directly-measured ones.
 *
 * @param turn - Turn event whose `words` are updated in place; `turn.channel`
 *   is recomputed via `rollUpTurnChannel` only if at least one word changed.
 */
private resolveUnknownChannelsByWindow(turn: TurnEvent): void {
  if (!this.attributionParams) return;
  // Named `halfWindow` to avoid shadowing the browser global `window`.
  const halfWindow = this.attributionParams.resolutionWindowWords;
  const words = turn.words;
  // Snapshot of the VAD-measured channels; the only source of evidence.
  const measured = words.map((w) => w.channel);
  let mutated = false;

  for (let i = 0; i < words.length; i++) {
    if (measured[i] !== "unknown") continue;

    // Count measured (non-"unknown") channels among the neighbors.
    const tally = new Map<string, number>();
    const lo = Math.max(0, i - halfWindow);
    const hi = Math.min(words.length - 1, i + halfWindow);
    for (let j = lo; j <= hi; j++) {
      if (j === i) continue;
      const ch = measured[j];
      if (!ch || ch === "unknown") continue;
      tally.set(ch, (tally.get(ch) ?? 0) + 1);
    }
    if (tally.size === 0) continue;

    // Pick the dominant neighbor channel. Ties → leave `"unknown"`.
    let top: string | undefined;
    let topCount = 0;
    let tied = false;
    for (const [name, count] of tally) {
      if (count > topCount) {
        top = name;
        topCount = count;
        tied = false;
      } else if (count === topCount) {
        tied = true;
      }
    }
    if (top && !tied) {
      words[i].channel = top;
      words[i].channelResolved = true;
      mutated = true;
    }
  }

  // Recompute the rollup only if any per-word channel changed.
  if (mutated) turn.channel = rollUpTurnChannel(words);
}
|
|
703
|
+
|
|
704
|
+
/**
|
|
705
|
+
* Fill `"unknown"` words by looking up the speaker's session-wide channel
|
|
706
|
+
* evidence. For each `speaker_label`, sums active VAD frame RMS per channel
|
|
707
|
+
* across every word the speaker has uttered to date. A speaker is
|
|
708
|
+
* "resolvable" if their total evidence clears
|
|
709
|
+
* `speakerHistoryMinRmsEvidence` and their top channel exceeds the
|
|
710
|
+
* runner-up by `speakerHistoryDominanceRatio`.
|
|
711
|
+
*
|
|
712
|
+
* Only touches `"unknown"` words. Confident per-word VAD decisions are
|
|
713
|
+
* never modified. `speaker_label` is never modified.
|
|
714
|
+
*/
|
|
715
|
+
private resolveUnknownChannelsBySpeakerHistory(turn: TurnEvent): void {
|
|
716
|
+
if (!this.timeline || !this.attributionParams || !this.speakerHistory)
|
|
717
|
+
return;
|
|
718
|
+
const minEvidence = this.attributionParams.speakerHistoryMinRmsEvidence;
|
|
719
|
+
const dominanceRatio = this.attributionParams.speakerHistoryDominanceRatio;
|
|
720
|
+
|
|
721
|
+
// 1. Accumulate evidence from this turn's words.
|
|
722
|
+
for (const w of turn.words) {
|
|
723
|
+
if (!w.speaker) continue;
|
|
724
|
+
const frames = this.timeline.framesInWindow(w.start, w.end);
|
|
725
|
+
let entry = this.speakerHistory.get(w.speaker);
|
|
726
|
+
if (!entry) {
|
|
727
|
+
entry = new Map();
|
|
728
|
+
this.speakerHistory.set(w.speaker, entry);
|
|
729
|
+
}
|
|
730
|
+
for (const f of frames) {
|
|
731
|
+
if (!f.active) continue;
|
|
732
|
+
entry.set(f.channel, (entry.get(f.channel) ?? 0) + f.rms);
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
// 2. Fill unknown words whose speakers have dominant evidence.
|
|
737
|
+
let mutated = false;
|
|
738
|
+
for (const w of turn.words) {
|
|
739
|
+
if (w.channel !== "unknown" || !w.speaker) continue;
|
|
740
|
+
const entry = this.speakerHistory.get(w.speaker);
|
|
741
|
+
if (!entry || entry.size === 0) continue;
|
|
742
|
+
let total = 0;
|
|
743
|
+
let topName: string | undefined;
|
|
744
|
+
let topScore = 0;
|
|
745
|
+
let runnerScore = 0;
|
|
746
|
+
for (const [name, score] of entry) {
|
|
747
|
+
total += score;
|
|
748
|
+
if (score > topScore) {
|
|
749
|
+
runnerScore = topScore;
|
|
750
|
+
topScore = score;
|
|
751
|
+
topName = name;
|
|
752
|
+
} else if (score > runnerScore) {
|
|
753
|
+
runnerScore = score;
|
|
754
|
+
}
|
|
755
|
+
}
|
|
756
|
+
if (total < minEvidence) continue;
|
|
757
|
+
if (runnerScore > 0 && topScore < dominanceRatio * runnerScore) continue;
|
|
758
|
+
if (topName) {
|
|
759
|
+
w.channel = topName;
|
|
760
|
+
w.channelResolved = true;
|
|
761
|
+
mutated = true;
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
if (mutated) turn.channel = rollUpTurnChannel(turn.words);
|
|
392
766
|
}
|
|
393
767
|
|
|
394
768
|
/**
|
|
@@ -440,6 +814,15 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
440
814
|
}
|
|
441
815
|
|
|
442
816
|
async close(waitForSessionTermination = true) {
|
|
817
|
+
if (this.flushTimer) {
|
|
818
|
+
clearInterval(this.flushTimer);
|
|
819
|
+
this.flushTimer = undefined;
|
|
820
|
+
// Best-effort: drain any final partial mix so the server gets the tail.
|
|
821
|
+
// Bypass the 50ms floor here since this is the last flush; if the tail
|
|
822
|
+
// is <50ms the server will reject that single message, but we'd lose
|
|
823
|
+
// the audio either way.
|
|
824
|
+
this.flushMix(true);
|
|
825
|
+
}
|
|
443
826
|
if (this.socket) {
|
|
444
827
|
if (this.socket.readyState === this.socket.OPEN) {
|
|
445
828
|
if (waitForSessionTermination) {
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Physical input channel that a word/turn was attributed to.
|
|
3
|
+
* - A channel name declared in `StreamingTranscriberParams.channels` (e.g. `"mic"`, `"system"`).
|
|
4
|
+
* - `"unknown"`: no channel was clearly dominant during the word's time window (silent
|
|
5
|
+
* or all channels evenly active under our threshold).
|
|
6
|
+
*
|
|
7
|
+
* This is independent of AssemblyAI's diarization `speaker_label` / `words[i].speaker`,
|
|
8
|
+
* which identifies voices by acoustic characteristics. A given speaker_label can map
|
|
9
|
+
* to any physical channel; the two dimensions can disagree.
|
|
10
|
+
*/
|
|
11
|
+
export type Channel = string | "unknown";
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Per-channel, per-frame VAD observation emitted by `StreamingTranscriber` when running
|
|
15
|
+
* in dual-channel mode. `ts` is stream-relative milliseconds, derived from the
|
|
16
|
+
* per-channel sample counter — the same reference frame as `StreamingWord.start` /
|
|
17
|
+
* `.end`, so per-word lookups need no conversion.
|
|
18
|
+
*/
|
|
19
|
+
export type VadFrame = {
|
|
20
|
+
ts: number;
|
|
21
|
+
channel: string;
|
|
22
|
+
active: boolean;
|
|
23
|
+
rms: number;
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
export type VadDetectorResult = {
|
|
27
|
+
active: boolean;
|
|
28
|
+
energy: number;
|
|
29
|
+
};
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Pluggable per-channel voice-activity detector. The default `EnergyVad` is energy-based
|
|
33
|
+
* with an adaptive noise-floor threshold; callers can drop in a DNN-backed detector
|
|
34
|
+
* (e.g. Silero via `@ricky0123/vad-web`) for noisier environments.
|
|
35
|
+
*
|
|
36
|
+
* A separate `VadDetector` instance is held per channel; do not assume cross-channel
|
|
37
|
+
* state. Frames are fixed-size at the transcriber's target sample rate.
|
|
38
|
+
*/
|
|
39
|
+
export interface VadDetector {
|
|
40
|
+
process(frame: Float32Array): VadDetectorResult;
|
|
41
|
+
reset(): void;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Thrown when `DualChannelCapture` is constructed in a non-browser environment
|
|
46
|
+
* (no `globalThis.AudioContext`). The helper is intentionally surfaced from the
|
|
47
|
+
* main entrypoint so the import path is uniform across runtimes; the runtime
|
|
48
|
+
* guard moves to construction time.
|
|
49
|
+
*/
|
|
50
|
+
export class BrowserOnlyError extends Error {
|
|
51
|
+
constructor(
|
|
52
|
+
message = "DualChannelCapture requires a browser environment (AudioContext is undefined).",
|
|
53
|
+
) {
|
|
54
|
+
super(message);
|
|
55
|
+
this.name = "BrowserOnlyError";
|
|
56
|
+
}
|
|
57
|
+
}
|
|
@@ -1,4 +1,65 @@
|
|
|
1
1
|
import { AudioEncoding } from "..";
|
|
2
|
+
import type { Channel, VadDetector, VadFrame } from "./dual-channel";
|
|
3
|
+
|
|
4
|
+
export * from "./dual-channel";
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Per-channel attribution tuning for dual-channel mode. All fields optional;
|
|
8
|
+
* ignored when `StreamingTranscriberParams.channels` is not set.
|
|
9
|
+
*/
|
|
10
|
+
export type ChannelAttributionParams = {
|
|
11
|
+
/** Energy ratio above which a channel is declared dominant for a word. Default 4. */
|
|
12
|
+
dominanceRatio?: number;
|
|
13
|
+
/** Rolling VAD timeline window in ms. Default 30_000. */
|
|
14
|
+
timelineWindowMs?: number;
|
|
15
|
+
/**
|
|
16
|
+
* Factory for the per-channel VAD detector. Called once per declared channel
|
|
17
|
+
* at transcriber construction time. The channel name is passed so factories
|
|
18
|
+
* that wrap higher-level VAD libraries (which manage their own audio source)
|
|
19
|
+
* can map each `VadDetector` instance to its corresponding channel.
|
|
20
|
+
*/
|
|
21
|
+
createVad?: (channelName: string) => VadDetector;
|
|
22
|
+
/** Mix flush interval in ms — how often per-channel buffers are averaged into mono and sent. Default 50. */
|
|
23
|
+
flushIntervalMs?: number;
|
|
24
|
+
/**
|
|
25
|
+
* Strategy used to fill words whose per-word VAD attribution resolved to
|
|
26
|
+
* `"unknown"`. Confident per-word VAD decisions (`"mic"` / `"system"`) are
|
|
27
|
+
* never modified by any strategy.
|
|
28
|
+
*
|
|
29
|
+
* - `"window"` (default): look at the dominant non-`"unknown"` channel
|
|
30
|
+
* among ±`resolutionWindowWords` neighboring words in the same turn.
|
|
31
|
+
* Ignores `speaker_label`, so it works even when AAI re-uses a label for
|
|
32
|
+
* two physically distinct voices.
|
|
33
|
+
* - `"speaker-history"`: accumulate per-`speaker_label` per-channel active
|
|
34
|
+
* VAD energy across the session, then fill `"unknown"` words with the
|
|
35
|
+
* speaker's dominant channel when it clears
|
|
36
|
+
* `speakerHistoryMinRmsEvidence` and beats runner-up by
|
|
37
|
+
* `speakerHistoryDominanceRatio`. Robust for stable speaker labels but
|
|
38
|
+
* does nothing when a speaker has split evidence.
|
|
39
|
+
* - `"none"`: disable resolution; `"unknown"` words remain `"unknown"` in
|
|
40
|
+
* the output.
|
|
41
|
+
*/
|
|
42
|
+
resolveUnknownChannelsMethod?: "none" | "window" | "speaker-history";
|
|
43
|
+
/**
|
|
44
|
+
* Half-window (in words) on each side of an `"unknown"` word for the
|
|
45
|
+
* `"window"` method. Default 2 — so the full window is up to 5 words
|
|
46
|
+
* (2 before + the unknown + 2 after).
|
|
47
|
+
*/
|
|
48
|
+
resolutionWindowWords?: number;
|
|
49
|
+
/**
|
|
50
|
+
* Minimum cumulative active-RMS evidence (sum across all the speaker's
|
|
51
|
+
* frames to date) before a speaker can be resolved via the
|
|
52
|
+
* `"speaker-history"` method. Default 0.5 — roughly a few seconds of
|
|
53
|
+
* sustained speech.
|
|
54
|
+
*/
|
|
55
|
+
speakerHistoryMinRmsEvidence?: number;
|
|
56
|
+
/**
|
|
57
|
+
* For the `"speaker-history"` method, the top channel's evidence must
|
|
58
|
+
* exceed the runner-up's by at least this factor for the speaker to be
|
|
59
|
+
* considered pinned to that channel. Default 3.
|
|
60
|
+
*/
|
|
61
|
+
speakerHistoryDominanceRatio?: number;
|
|
62
|
+
};
|
|
2
63
|
|
|
3
64
|
export type LLMGatewayMessage = {
|
|
4
65
|
role: string;
|
|
@@ -50,6 +111,33 @@ export type StreamingTranscriberParams = {
|
|
|
50
111
|
webhookUrl?: string;
|
|
51
112
|
webhookAuthHeaderName?: string;
|
|
52
113
|
webhookAuthHeaderValue?: string;
|
|
114
|
+
/**
|
|
115
|
+
* Enable dual-channel mode. Presence of `channels` switches the
|
|
116
|
+
* transcriber into channel-tagged mode: `sendAudio(audio, { channel })` is required,
|
|
117
|
+
* per-channel VAD runs on the raw PCM, the streams are mixed to mono before being
|
|
118
|
+
* sent to the server, and emitted `TurnEvent`s are enriched with `channel` and
|
|
119
|
+
* per-word `channel` attribution.
|
|
120
|
+
*
|
|
121
|
+
* Must contain exactly 2 entries with unique names. The names are echoed back in
|
|
122
|
+
* `TurnEvent.channel` / `words[i].channel`.
|
|
123
|
+
*
|
|
124
|
+
* **Acoustic-leak caveat.** Per-word channel attribution uses energy-based
|
|
125
|
+
* VAD on each channel. If your capture setup lets one channel's audio bleed
|
|
126
|
+
* into another at similar amplitude — typically system audio playing
|
|
127
|
+
* through speakers and being picked up by an open mic — attribution can
|
|
128
|
+
* misfire (mic-tagged words that were actually system). Transcription
|
|
129
|
+
* quality is unaffected; only the `channel` field is. To preserve
|
|
130
|
+
* attribution in speaker-leak setups, apply echo cancellation at capture
|
|
131
|
+
* before feeding audio to the SDK. In browsers, that's
|
|
132
|
+
* `getUserMedia({ audio: { echoCancellation: true } })`. On macOS native,
|
|
133
|
+
* `AVAudioEngine.setVoiceProcessingEnabled(true)` on the input node. If
|
|
134
|
+
* platform-level AEC isn't available, swap in a DNN VAD (e.g. Silero) via
|
|
135
|
+
* `channelAttribution.createVad`. See the dual-channel sample app's
|
|
136
|
+
* README for worked examples.
|
|
137
|
+
*/
|
|
138
|
+
channels?: Array<{ name: string }>;
|
|
139
|
+
/** Tuning for dual-channel attribution. Ignored when `channels` is unset. */
|
|
140
|
+
channelAttribution?: ChannelAttributionParams;
|
|
53
141
|
};
|
|
54
142
|
|
|
55
143
|
export type StreamingEvents =
|
|
@@ -59,6 +147,7 @@ export type StreamingEvents =
|
|
|
59
147
|
| "speechStarted"
|
|
60
148
|
| "llmGatewayResponse"
|
|
61
149
|
| "warning"
|
|
150
|
+
| "vad"
|
|
62
151
|
| "error";
|
|
63
152
|
|
|
64
153
|
export type StreamingListeners = {
|
|
@@ -68,6 +157,7 @@ export type StreamingListeners = {
|
|
|
68
157
|
speechStarted?: (event: SpeechStartedEvent) => void;
|
|
69
158
|
llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void;
|
|
70
159
|
warning?: (event: WarningEvent) => void;
|
|
160
|
+
vad?: (event: VadFrame) => void;
|
|
71
161
|
error?: (error: Error) => void;
|
|
72
162
|
};
|
|
73
163
|
|
|
@@ -186,6 +276,12 @@ export type TurnEvent = {
|
|
|
186
276
|
language_code?: string;
|
|
187
277
|
language_confidence?: number;
|
|
188
278
|
speaker_label?: string;
|
|
279
|
+
/**
|
|
280
|
+
* Duration-weighted majority channel across `words[i].channel`. Populated only
|
|
281
|
+
* when the transcriber is configured with `channels`. Independent from
|
|
282
|
+
* `speaker_label`.
|
|
283
|
+
*/
|
|
284
|
+
channel?: Channel;
|
|
189
285
|
};
|
|
190
286
|
|
|
191
287
|
export type StreamingWord = {
|
|
@@ -195,6 +291,20 @@ export type StreamingWord = {
|
|
|
195
291
|
text: string;
|
|
196
292
|
word_is_final: boolean;
|
|
197
293
|
speaker?: string;
|
|
294
|
+
/**
|
|
295
|
+
* Physical input channel attributed by client-side VAD during this word's
|
|
296
|
+
* time window. Populated only when the transcriber is configured with
|
|
297
|
+
* `channels`. Independent from `speaker`.
|
|
298
|
+
*/
|
|
299
|
+
channel?: Channel;
|
|
300
|
+
/**
|
|
301
|
+
* True if `channel` was filled in by `channelAttribution.resolveUnknownChannelsMethod`
|
|
302
|
+
* rather than by the per-word VAD. Only set on words whose per-word VAD
|
|
303
|
+
* attribution was `"unknown"` and whose resolution method produced a
|
|
304
|
+
* confident channel. Useful for debugging or rendering an indicator that a
|
|
305
|
+
* word's channel came from context, not direct VAD evidence.
|
|
306
|
+
*/
|
|
307
|
+
channelResolved?: boolean;
|
|
198
308
|
};
|
|
199
309
|
|
|
200
310
|
export type TerminationEvent = {
|