assemblyai 4.33.3 → 4.34.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -0
- package/dist/assemblyai.streaming.umd.js +1291 -3
- package/dist/assemblyai.streaming.umd.min.js +1 -1
- package/dist/assemblyai.umd.js +802 -7
- package/dist/assemblyai.umd.min.js +1 -1
- package/dist/browser.mjs +775 -5
- package/dist/bun.mjs +775 -5
- package/dist/deno.mjs +775 -5
- package/dist/exports/streaming.d.ts +7 -0
- package/dist/index.cjs +802 -7
- package/dist/index.mjs +794 -8
- package/dist/node.cjs +783 -4
- package/dist/node.mjs +775 -5
- package/dist/services/index.d.ts +2 -2
- package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
- package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
- package/dist/services/streaming/energy-vad.d.ts +35 -0
- package/dist/services/streaming/index.d.ts +4 -0
- package/dist/services/streaming/label-mapper.d.ts +44 -0
- package/dist/services/streaming/resampler.d.ts +22 -0
- package/dist/services/streaming/service.d.ts +71 -2
- package/dist/streaming.browser.mjs +1247 -4
- package/dist/streaming.cjs +1287 -3
- package/dist/streaming.mjs +1276 -4
- package/dist/types/streaming/dual-channel.d.ts +48 -0
- package/dist/types/streaming/index.d.ts +140 -4
- package/dist/workerd.mjs +775 -5
- package/package.json +1 -1
- package/src/exports/streaming.ts +7 -0
- package/src/services/index.ts +20 -1
- package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
- package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
- package/src/services/streaming/energy-vad.ts +75 -0
- package/src/services/streaming/index.ts +4 -0
- package/src/services/streaming/label-mapper.ts +128 -0
- package/src/services/streaming/resampler.ts +69 -0
- package/src/services/streaming/service.ts +405 -3
- package/src/types/streaming/dual-channel.ts +57 -0
- package/src/types/streaming/index.ts +144 -1
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
import { ErrorEvent, MessageEvent, CloseEvent } from "ws";
|
|
7
7
|
import { conditions } from "#conditions";
|
|
8
8
|
import {
|
|
9
|
+
ChannelAttributionParams,
|
|
9
10
|
StreamingEvents,
|
|
10
11
|
StreamingListeners,
|
|
11
12
|
StreamingTranscriberParams,
|
|
@@ -14,16 +15,67 @@ import {
|
|
|
14
15
|
StreamingEventMessage,
|
|
15
16
|
TurnEvent,
|
|
16
17
|
LLMGatewayResponseEvent,
|
|
18
|
+
SpeakerRevisionEvent,
|
|
17
19
|
StreamingUpdateConfiguration,
|
|
18
20
|
StreamingForceEndpoint,
|
|
19
21
|
WarningEvent,
|
|
20
22
|
} from "../..";
|
|
23
|
+
import type { VadDetector, VadFrame } from "../../types/streaming/dual-channel";
|
|
24
|
+
import { EnergyVad } from "./energy-vad";
|
|
25
|
+
import { attributeTurn, rollUpTurnChannel, VadTimeline } from "./label-mapper";
|
|
21
26
|
import { StreamingError, StreamingErrorMessages } from "../../utils/errors";
|
|
22
27
|
import { StreamingErrorTypeCodes } from "../../utils/errors/streaming";
|
|
23
28
|
|
|
29
|
+
/**
|
|
30
|
+
* Options for `sendAudio`. In dual-channel mode (when `channels` is configured
|
|
31
|
+
* on the transcriber), `channel` is required and must match one of the declared
|
|
32
|
+
* channel names; in single-channel mode it is ignored.
|
|
33
|
+
*/
|
|
34
|
+
export type SendAudioOptions = {
  /**
   * Name of the declared channel this audio belongs to. Must be supplied in
   * dual-channel mode; not consulted in single-channel mode.
   */
  channel?: string;
};
|
|
37
|
+
|
|
38
|
+
/**
 * View any `AudioData` (ArrayBuffer / ArrayBufferView / typed array) as a
 * sequence of Int16 samples in the platform's native byte order
 * (little-endian on all mainstream JavaScript hosts).
 *
 * No copy is made when the input is already an `Int16Array`, a bare
 * `ArrayBuffer`, or a view whose `byteOffset` is 2-byte aligned. A view that
 * starts on an odd byte offset (common for slices of pooled Node `Buffer`s)
 * cannot be aliased by an `Int16Array`, so its bytes are copied into a fresh,
 * aligned buffer instead of throwing a `RangeError`.
 *
 * A trailing odd byte, if any, is ignored for both buffers and views.
 *
 * @param audio Raw PCM16 bytes.
 * @returns The samples as an `Int16Array`.
 */
function toInt16View(audio: AudioData): Int16Array {
  // AudioData is ArrayBufferLike per the public type, but in practice callers
  // pass ArrayBuffer or a typed-array view. Handle both.
  if (audio instanceof Int16Array) return audio;

  const bytesPerSample = Int16Array.BYTES_PER_ELEMENT;

  if (ArrayBuffer.isView(audio)) {
    const view = audio as ArrayBufferView;
    const sampleCount = Math.floor(view.byteLength / bytesPerSample);
    if (view.byteOffset % bytesPerSample === 0) {
      return new Int16Array(view.buffer, view.byteOffset, sampleCount);
    }
    // Int16Array requires its start offset to be a multiple of 2; realign the
    // bytes by copying them into a new buffer that starts at offset 0.
    const byteCount = sampleCount * bytesPerSample;
    const aligned = new Uint8Array(byteCount);
    aligned.set(new Uint8Array(view.buffer, view.byteOffset, byteCount));
    return new Int16Array(aligned.buffer);
  }

  // Bare buffer: pass an explicit length so an odd byte length is floored
  // (matching the view branch) rather than rejected by the constructor.
  const buffer = audio as ArrayBuffer;
  return new Int16Array(
    buffer,
    0,
    Math.floor(buffer.byteLength / bytesPerSample),
  );
}
|
|
57
|
+
|
|
24
58
|
const defaultStreamingUrl = "wss://streaming.assemblyai.com/v3/ws";
|
|
25
59
|
const terminateSessionMessage = `{"type":"Terminate"}`;
|
|
26
60
|
|
|
61
|
+
/**
|
|
62
|
+
* Per-send chunk cap in milliseconds for the dual-channel mixer. The streaming
|
|
63
|
+
* server rejects audio messages longer than 1000 ms (`Input Duration Error`).
|
|
64
|
+
* If a backlog accumulates (e.g. when a browser tab is backgrounded and
|
|
65
|
+
* `setInterval` is throttled to ~1 Hz), `flushMix` loops and emits multiple
|
|
66
|
+
* sends each ≤ this cap until the buffers drain.
|
|
67
|
+
*/
|
|
68
|
+
const MAX_CHUNK_MS = 200;
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Per-send minimum chunk size in milliseconds. The streaming server also
|
|
72
|
+
* rejects audio messages shorter than 50 ms with the same
|
|
73
|
+
* `Input Duration Error`, so the mixer waits until both per-channel buffers
|
|
74
|
+
* have at least this much accumulated before emitting. Final-flush (close
|
|
75
|
+
* path) bypasses this floor so the trailing partial buffer still gets sent.
|
|
76
|
+
*/
|
|
77
|
+
const MIN_CHUNK_MS = 50;
|
|
78
|
+
|
|
27
79
|
type BufferLike =
|
|
28
80
|
| string
|
|
29
81
|
| Buffer
|
|
@@ -51,6 +103,25 @@ export class StreamingTranscriber {
|
|
|
51
103
|
private listeners: StreamingListeners = {};
|
|
52
104
|
private sessionTerminatedResolve?: () => void;
|
|
53
105
|
|
|
106
|
+
// Dual-channel mode state (allocated only when params.channels is set).
|
|
107
|
+
private isDualChannel = false;
|
|
108
|
+
private channelNames?: string[];
|
|
109
|
+
private channelBuffers?: Map<string, number[]>;
|
|
110
|
+
private channelSamplesReceived?: Map<string, number>;
|
|
111
|
+
private channelVadFloatBuffers?: Map<string, Float32Array>;
|
|
112
|
+
private channelVadBufferIdx?: Map<string, number>;
|
|
113
|
+
private channelVads?: Map<string, VadDetector>;
|
|
114
|
+
private timeline?: VadTimeline;
|
|
115
|
+
private flushTimer?: ReturnType<typeof setInterval>;
|
|
116
|
+
private attributionParams?: Required<ChannelAttributionParams>;
|
|
117
|
+
private vadFrameSamples = 0;
|
|
118
|
+
private minChunkSamples = 0;
|
|
119
|
+
private maxChunkSamples = 0;
|
|
120
|
+
// For resolveUnknownChannelsMethod === "speaker-history": per-speaker_label
|
|
121
|
+
// cumulative active-VAD RMS per channel. Allocated only when that method is
|
|
122
|
+
// configured.
|
|
123
|
+
private speakerHistory?: Map<string, Map<string, number>>;
|
|
124
|
+
|
|
54
125
|
constructor(params: StreamingTranscriberParams) {
|
|
55
126
|
this.params = {
|
|
56
127
|
...params,
|
|
@@ -63,6 +134,58 @@ export class StreamingTranscriber {
|
|
|
63
134
|
if (!(this.token || this.apiKey)) {
|
|
64
135
|
throw new Error("API key or temporary token is required.");
|
|
65
136
|
}
|
|
137
|
+
|
|
138
|
+
if (params.channels) {
|
|
139
|
+
if (params.channels.length !== 2) {
|
|
140
|
+
throw new Error(
|
|
141
|
+
"StreamingTranscriber.channels must have exactly 2 entries.",
|
|
142
|
+
);
|
|
143
|
+
}
|
|
144
|
+
const names = params.channels.map((c) => c.name);
|
|
145
|
+
if (new Set(names).size !== names.length) {
|
|
146
|
+
throw new Error("StreamingTranscriber.channels names must be unique.");
|
|
147
|
+
}
|
|
148
|
+
this.isDualChannel = true;
|
|
149
|
+
this.channelNames = names;
|
|
150
|
+
const att = params.channelAttribution ?? {};
|
|
151
|
+
this.attributionParams = {
|
|
152
|
+
dominanceRatio: att.dominanceRatio ?? 4,
|
|
153
|
+
timelineWindowMs: att.timelineWindowMs ?? 30_000,
|
|
154
|
+
createVad: att.createVad ?? (() => new EnergyVad()),
|
|
155
|
+
flushIntervalMs: att.flushIntervalMs ?? 50,
|
|
156
|
+
resolveUnknownChannelsMethod:
|
|
157
|
+
att.resolveUnknownChannelsMethod ?? "window",
|
|
158
|
+
resolutionWindowWords: att.resolutionWindowWords ?? 2,
|
|
159
|
+
speakerHistoryMinRmsEvidence: att.speakerHistoryMinRmsEvidence ?? 0.5,
|
|
160
|
+
speakerHistoryDominanceRatio: att.speakerHistoryDominanceRatio ?? 3,
|
|
161
|
+
};
|
|
162
|
+
if (
|
|
163
|
+
this.attributionParams.resolveUnknownChannelsMethod ===
|
|
164
|
+
"speaker-history"
|
|
165
|
+
) {
|
|
166
|
+
this.speakerHistory = new Map();
|
|
167
|
+
}
|
|
168
|
+
// 20 ms VAD frames at the transcriber's target sample rate.
|
|
169
|
+
this.vadFrameSamples = Math.max(1, Math.round(params.sampleRate * 0.02));
|
|
170
|
+
this.minChunkSamples = Math.max(
|
|
171
|
+
1,
|
|
172
|
+
Math.round(params.sampleRate * (MIN_CHUNK_MS / 1000)),
|
|
173
|
+
);
|
|
174
|
+
this.maxChunkSamples = Math.max(
|
|
175
|
+
this.minChunkSamples,
|
|
176
|
+
Math.round(params.sampleRate * (MAX_CHUNK_MS / 1000)),
|
|
177
|
+
);
|
|
178
|
+
this.channelBuffers = new Map(names.map((n) => [n, [] as number[]]));
|
|
179
|
+
this.channelSamplesReceived = new Map(names.map((n) => [n, 0]));
|
|
180
|
+
this.channelVadFloatBuffers = new Map(
|
|
181
|
+
names.map((n) => [n, new Float32Array(this.vadFrameSamples)]),
|
|
182
|
+
);
|
|
183
|
+
this.channelVadBufferIdx = new Map(names.map((n) => [n, 0]));
|
|
184
|
+
this.channelVads = new Map(
|
|
185
|
+
names.map((n) => [n, this.attributionParams!.createVad(n)]),
|
|
186
|
+
);
|
|
187
|
+
this.timeline = new VadTimeline(this.attributionParams.timelineWindowMs);
|
|
188
|
+
}
|
|
66
189
|
}
|
|
67
190
|
|
|
68
191
|
private connectionUrl(): URL {
|
|
@@ -140,6 +263,10 @@ export class StreamingTranscriber {
|
|
|
140
263
|
searchParams.set("prompt", this.params.prompt);
|
|
141
264
|
}
|
|
142
265
|
|
|
266
|
+
if (this.params.agentContext) {
|
|
267
|
+
searchParams.set("agent_context", this.params.agentContext);
|
|
268
|
+
}
|
|
269
|
+
|
|
143
270
|
if (this.params.filterProfanity) {
|
|
144
271
|
searchParams.set(
|
|
145
272
|
"filter_profanity",
|
|
@@ -152,7 +279,9 @@ export class StreamingTranscriber {
|
|
|
152
279
|
"[Deprecation Warning] The speech model `u3-pro` is deprecated and will be removed in a future release. Please use `u3-rt-pro` instead.",
|
|
153
280
|
);
|
|
154
281
|
}
|
|
155
|
-
|
|
282
|
+
if (this.params.speechModel !== undefined) {
|
|
283
|
+
searchParams.set("speech_model", this.params.speechModel.toString());
|
|
284
|
+
}
|
|
156
285
|
|
|
157
286
|
if (this.params.languageDetection !== undefined) {
|
|
158
287
|
searchParams.set(
|
|
@@ -267,6 +396,10 @@ export class StreamingTranscriber {
|
|
|
267
396
|
searchParams.set("redact_pii_sub", this.params.redactPiiSub);
|
|
268
397
|
}
|
|
269
398
|
|
|
399
|
+
if (this.params.mode !== undefined) {
|
|
400
|
+
searchParams.set("mode", this.params.mode);
|
|
401
|
+
}
|
|
402
|
+
|
|
270
403
|
if (this.params.llmGateway !== undefined) {
|
|
271
404
|
searchParams.set("llm_gateway", JSON.stringify(this.params.llmGateway));
|
|
272
405
|
}
|
|
@@ -282,7 +415,12 @@ export class StreamingTranscriber {
|
|
|
282
415
|
event: "llmGatewayResponse",
|
|
283
416
|
listener: (event: LLMGatewayResponseEvent) => void,
|
|
284
417
|
): void;
|
|
418
|
+
on(
|
|
419
|
+
event: "speakerRevision",
|
|
420
|
+
listener: (event: SpeakerRevisionEvent) => void,
|
|
421
|
+
): void;
|
|
285
422
|
on(event: "warning", listener: (event: WarningEvent) => void): void;
|
|
423
|
+
on(event: "vad", listener: (event: VadFrame) => void): void;
|
|
286
424
|
on(event: "error", listener: (error: Error) => void): void;
|
|
287
425
|
on(event: "close", listener: (code: number, reason: string) => void): void;
|
|
288
426
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
@@ -323,6 +461,13 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
323
461
|
reason = StreamingErrorMessages[code as StreamingErrorTypeCodes];
|
|
324
462
|
}
|
|
325
463
|
}
|
|
464
|
+
// Stop the flush timer when the socket is gone (server-initiated close,
|
|
465
|
+
// network drop, etc.) — otherwise subsequent ticks call send() on a
|
|
466
|
+
// closed socket and spam the error listener.
|
|
467
|
+
if (this.flushTimer) {
|
|
468
|
+
clearInterval(this.flushTimer);
|
|
469
|
+
this.flushTimer = undefined;
|
|
470
|
+
}
|
|
326
471
|
this.listeners.close?.(code, reason);
|
|
327
472
|
};
|
|
328
473
|
|
|
@@ -351,6 +496,22 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
351
496
|
break;
|
|
352
497
|
}
|
|
353
498
|
case "Turn": {
|
|
499
|
+
if (this.isDualChannel && this.timeline && this.attributionParams) {
|
|
500
|
+
attributeTurn(message, this.timeline, {
|
|
501
|
+
dominanceRatio: this.attributionParams.dominanceRatio,
|
|
502
|
+
});
|
|
503
|
+
switch (this.attributionParams.resolveUnknownChannelsMethod) {
|
|
504
|
+
case "window":
|
|
505
|
+
this.resolveUnknownChannelsByWindow(message);
|
|
506
|
+
break;
|
|
507
|
+
case "speaker-history":
|
|
508
|
+
this.resolveUnknownChannelsBySpeakerHistory(message);
|
|
509
|
+
break;
|
|
510
|
+
case "none":
|
|
511
|
+
// Leave "unknown" words as-is.
|
|
512
|
+
break;
|
|
513
|
+
}
|
|
514
|
+
}
|
|
354
515
|
this.listeners.turn?.(message);
|
|
355
516
|
break;
|
|
356
517
|
}
|
|
@@ -362,6 +523,10 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
362
523
|
this.listeners.llmGatewayResponse?.(message);
|
|
363
524
|
break;
|
|
364
525
|
}
|
|
526
|
+
case "SpeakerRevision": {
|
|
527
|
+
this.listeners.speakerRevision?.(message);
|
|
528
|
+
break;
|
|
529
|
+
}
|
|
365
530
|
case "Warning": {
|
|
366
531
|
const warning = message as WarningEvent;
|
|
367
532
|
console.warn(
|
|
@@ -379,6 +544,11 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
379
544
|
});
|
|
380
545
|
}
|
|
381
546
|
|
|
547
|
+
/**
|
|
548
|
+
* Returns a WritableStream that pumps PCM chunks into `sendAudio`. Single-channel
|
|
549
|
+
* only — in dual-channel mode use `sendAudio(pcm, { channel })` directly, since
|
|
550
|
+
* `WritableStream` has no place to carry a channel tag.
|
|
551
|
+
*/
|
|
382
552
|
stream(): WritableStream<AudioData> {
|
|
383
553
|
return new WritableStream<AudioData>({
|
|
384
554
|
write: (chunk: AudioData) => {
|
|
@@ -387,8 +557,231 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
387
557
|
});
|
|
388
558
|
}
|
|
389
559
|
|
|
390
|
-
|
|
391
|
-
|
|
560
|
+
/**
 * Send PCM audio.
 *
 * Single-channel mode: `audio` goes straight to the WebSocket via `send` and
 * `options` is not consulted.
 *
 * Dual-channel mode (when `channels` is configured): `options.channel` is
 * REQUIRED and must be one of the declared channel names. The audio is handed
 * to `ingestChannelAudio`, which runs it through that channel's VAD and
 * queues it in a per-channel buffer; a scheduled flush
 * (`channelAttribution.flushIntervalMs`, default 50ms) mixes the queued
 * channels into mono before sending to the WebSocket.
 *
 * @throws Error in dual-channel mode when `options.channel` is missing/empty
 * or is not a declared channel name.
 */
sendAudio(audio: AudioData, options?: SendAudioOptions) {
  if (this.isDualChannel) {
    const channel = options?.channel;
    if (!channel) {
      throw new Error(
        "StreamingTranscriber is in dual-channel mode; sendAudio requires { channel }.",
      );
    }
    const declared = this.channelNames!;
    if (!declared.includes(channel)) {
      throw new Error(
        `Unknown channel "${channel}"; declared channels: ${declared.join(", ")}.`,
      );
    }
    this.ingestChannelAudio(channel, audio);
  } else {
    this.send(audio);
  }
}
|
|
589
|
+
|
|
590
|
+
/**
 * Queue one channel's PCM for mixing and feed it through that channel's VAD.
 *
 * Every sample is appended to the channel's pending-mix queue and, scaled by
 * 1 / 0x8000, written into the channel's reusable VAD frame buffer. Each time
 * that buffer fills (`vadFrameSamples` samples) the detector is run and the
 * resulting `VadFrame` is pushed onto the timeline and emitted to the `vad`
 * listener. The frame's `ts` is computed from the channel's running sample
 * count at the moment the frame completes.
 *
 * A partially filled VAD frame carries over to the next call. The flush timer
 * is started if it is not already running.
 *
 * @param name  Declared channel name (already validated by the caller).
 * @param audio Raw PCM16 bytes for that channel.
 */
private ingestChannelAudio(name: string, audio: AudioData) {
  const pcm = toInt16View(audio);
  const pending = this.channelBuffers!.get(name)!;
  const frameBuf = this.channelVadFloatBuffers!.get(name)!;
  let fill = this.channelVadBufferIdx!.get(name)!;
  let total = this.channelSamplesReceived!.get(name)!;
  const detector = this.channelVads!.get(name)!;
  const rate = this.params.sampleRate;
  const frameLen = this.vadFrameSamples;

  for (const sample of pcm) {
    pending.push(sample);
    frameBuf[fill] = sample / 0x8000;
    fill += 1;
    total += 1;
    if (fill !== frameLen) continue;

    // Frame buffer is full: classify it and publish the observation.
    const { active, energy } = detector.process(frameBuf);
    const frame: VadFrame = {
      ts: (total / rate) * 1000,
      channel: name,
      active,
      rms: energy,
    };
    this.timeline!.pushFrame(frame);
    this.listeners.vad?.(frame);
    fill = 0;
  }

  // Persist the carry-over position and running sample count.
  this.channelVadBufferIdx!.set(name, fill);
  this.channelSamplesReceived!.set(name, total);

  if (!this.flushTimer) {
    this.startFlushTimer();
  }
}
|
|
624
|
+
|
|
625
|
+
/**
 * Start the periodic mixer: calls `flushMix()` every
 * `attributionParams.flushIntervalMs` milliseconds and stores the interval
 * handle in `flushTimer` so it can be cleared later.
 */
private startFlushTimer() {
  const tick = () => this.flushMix();
  const everyMs = this.attributionParams!.flushIntervalMs;
  this.flushTimer = setInterval(tick, everyMs);
}
|
|
631
|
+
|
|
632
|
+
/**
 * Mix the queued per-channel samples down to mono and send them.
 *
 * Each pass takes as many samples as every channel has available (the
 * shortest queue), capped at `maxChunkSamples`, averages the channels
 * sample-by-sample, removes those samples from the queues and sends the
 * result. Passes repeat until a queue is empty or, unless `force` is set,
 * fewer than `minChunkSamples` remain.
 *
 * Samples are removed from the queues before `send` is attempted, so a chunk
 * whose send throws is dropped; the error is forwarded to the `error`
 * listener and the drain stops.
 *
 * @param force When true (final flush on close), ignore the minimum chunk
 * size so the trailing partial buffer is still emitted.
 */
private flushMix(force = false) {
  const names = this.channelNames;
  const store = this.channelBuffers;
  if (!names || !store) return;

  const queues = names.map((n) => store.get(n)!);
  const channelCount = queues.length;

  // Keep draining so a backlog (e.g. built up while a browser tab was
  // throttled in the background) goes out as several sends, each capped at
  // MAX_CHUNK_MS. Uncapped, one message could exceed the server's 1000 ms
  // input duration limit and be rejected with code 3007.
  while (true) {
    const available = Math.min(...queues.map((q) => q.length));
    if (!Number.isFinite(available) || available === 0) return;

    // The server also rejects messages shorter than 50 ms with
    // `Input Duration Error`, so hold back until every channel has at least
    // minChunkSamples queued. The forced final flush skips this floor.
    if (!force && available < this.minChunkSamples) return;

    const take = Math.min(available, this.maxChunkSamples);
    const mixed = new Int16Array(take);
    for (let i = 0; i < take; i++) {
      let total = 0;
      for (const q of queues) total += q[i];
      const mean = Math.round(total / channelCount);
      mixed[i] = Math.max(-32768, Math.min(32767, mean));
    }

    queues.forEach((q) => q.splice(0, take));

    try {
      this.send(mixed.buffer);
    } catch (err) {
      this.listeners.error?.(err as Error);
      return;
    }
  }
}
|
|
667
|
+
|
|
668
|
+
/**
 * Fill in words whose per-word VAD attribution was `"unknown"` using the
 * majority non-`"unknown"` channel among the ±`resolutionWindowWords`
 * neighbouring words of the same turn. A word with no attributed neighbours,
 * or whose neighbours tie, stays `"unknown"`. Words that already carry a
 * channel are never modified.
 *
 * Resolution happens in place, left to right: a word resolved earlier in the
 * pass counts as an attributed neighbour for the words after it.
 *
 * This is a local temporal heuristic that ignores `speaker_label`, so it
 * works even when diarization re-uses one label for two physically distinct
 * voices. Every word filled in here is flagged `channelResolved: true` so
 * downstream renderers can tell inferred channels from measured ones. The
 * turn-level `channel` is recomputed only when at least one word changed.
 */
private resolveUnknownChannelsByWindow(turn: TurnEvent): void {
  const params = this.attributionParams;
  if (!params) return;

  const radius = params.resolutionWindowWords;
  const { words } = turn;
  let changed = false;

  words.forEach((word, idx) => {
    if (word.channel !== "unknown") return;

    // Count attributed neighbours per channel inside the window.
    const votes = new Map<string, number>();
    const first = Math.max(0, idx - radius);
    const last = Math.min(words.length - 1, idx + radius);
    for (let n = first; n <= last; n++) {
      if (n === idx) continue;
      const neighbour = words[n].channel;
      if (neighbour && neighbour !== "unknown") {
        votes.set(neighbour, (votes.get(neighbour) ?? 0) + 1);
      }
    }
    if (votes.size === 0) return;

    // Find the strict winner. A tie for first place leaves the word
    // `"unknown"` (rare; needs an equal count of neighbours per channel).
    let winner: string | undefined;
    let best = 0;
    let tie = false;
    for (const [channel, count] of votes) {
      if (count > best) {
        winner = channel;
        best = count;
        tie = false;
      } else if (count === best) {
        tie = true;
      }
    }
    if (!winner || tie) return;

    word.channel = winner;
    word.channelResolved = true;
    changed = true;
  });

  // Recompute the rollup only if any per-word channel changed.
  if (changed) {
    turn.channel = rollUpTurnChannel(words);
  }
}
|
|
722
|
+
|
|
723
|
+
/**
 * Fill `"unknown"` words from the speaker's accumulated channel evidence.
 *
 * Pass 1 adds, for every word that has a `speaker`, the RMS of each active
 * VAD frame in the word's `[start, end]` window to that speaker's per-channel
 * running total in `speakerHistory`.
 *
 * Pass 2 resolves each `"unknown"` word whose speaker's summed evidence is at
 * least `speakerHistoryMinRmsEvidence` and whose strongest channel is at
 * least `speakerHistoryDominanceRatio` times the runner-up (a speaker with a
 * single channel of evidence only needs to clear the minimum).
 *
 * Only `"unknown"` words are touched; words that already carry a channel and
 * `speaker` values are never modified. Resolved words get
 * `channelResolved: true`, and the turn-level `channel` is recomputed only
 * when something changed.
 *
 * NOTE(review): pass 1 runs on every call with no de-duplication, so if the
 * same words arrive in more than one Turn message their frames are added to
 * the history each time — confirm whether that is intended.
 */
private resolveUnknownChannelsBySpeakerHistory(turn: TurnEvent): void {
  const { timeline, attributionParams: config, speakerHistory: history } = this;
  if (!timeline || !config || !history) return;

  const evidenceFloor = config.speakerHistoryMinRmsEvidence;
  const requiredRatio = config.speakerHistoryDominanceRatio;

  // Pass 1: fold this turn's active VAD frames into the per-speaker totals.
  for (const word of turn.words) {
    if (!word.speaker) continue;
    const frames = timeline.framesInWindow(word.start, word.end);
    let perChannel = history.get(word.speaker);
    if (!perChannel) {
      perChannel = new Map();
      history.set(word.speaker, perChannel);
    }
    for (const frame of frames) {
      if (frame.active) {
        perChannel.set(
          frame.channel,
          (perChannel.get(frame.channel) ?? 0) + frame.rms,
        );
      }
    }
  }

  // Returns the clearly dominant channel for a speaker, or undefined when the
  // evidence is too thin or too evenly split.
  const pickDominant = (scores: Map<string, number>): string | undefined => {
    let sum = 0;
    let leader: string | undefined;
    let leaderScore = 0;
    let secondScore = 0;
    for (const [channel, score] of scores) {
      sum += score;
      if (score > leaderScore) {
        secondScore = leaderScore;
        leaderScore = score;
        leader = channel;
      } else if (score > secondScore) {
        secondScore = score;
      }
    }
    if (sum < evidenceFloor) return undefined;
    if (secondScore > 0 && leaderScore < requiredRatio * secondScore) {
      return undefined;
    }
    return leader;
  };

  // Pass 2: fill unknown words whose speakers have dominant evidence.
  let changed = false;
  for (const word of turn.words) {
    if (word.channel !== "unknown" || !word.speaker) continue;
    const scores = history.get(word.speaker);
    if (!scores || scores.size === 0) continue;
    const resolved = pickDominant(scores);
    if (!resolved) continue;
    word.channel = resolved;
    word.channelResolved = true;
    changed = true;
  }

  if (changed) {
    turn.channel = rollUpTurnChannel(turn.words);
  }
}
|
|
393
786
|
|
|
394
787
|
/**
|
|
@@ -440,6 +833,15 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
|
|
|
440
833
|
}
|
|
441
834
|
|
|
442
835
|
async close(waitForSessionTermination = true) {
|
|
836
|
+
if (this.flushTimer) {
|
|
837
|
+
clearInterval(this.flushTimer);
|
|
838
|
+
this.flushTimer = undefined;
|
|
839
|
+
// Best-effort: drain any final partial mix so the server gets the tail.
|
|
840
|
+
// Bypass the 50ms floor here since this is the last flush; if the tail
|
|
841
|
+
// is <50ms the server will reject that single message, but we'd lose
|
|
842
|
+
// the audio either way.
|
|
843
|
+
this.flushMix(true);
|
|
844
|
+
}
|
|
443
845
|
if (this.socket) {
|
|
444
846
|
if (this.socket.readyState === this.socket.OPEN) {
|
|
445
847
|
if (waitForSessionTermination) {
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Physical input channel that a word/turn was attributed to.
|
|
3
|
+
* - A channel name declared in `StreamingTranscriberParams.channels` (e.g. `"mic"`, `"system"`).
|
|
4
|
+
* - `"unknown"`: no channel was clearly dominant during the word's time window (silent
|
|
5
|
+
* or all channels evenly active under our threshold).
|
|
6
|
+
*
|
|
7
|
+
* This is independent of AssemblyAI's diarization `speaker_label` / `words[i].speaker`,
|
|
8
|
+
* which identifies voices by acoustic characteristics. A given speaker_label can map
|
|
9
|
+
* to any physical channel; the two dimensions can disagree.
|
|
10
|
+
*/
|
|
11
|
+
// NOTE: to the type checker `string | "unknown"` is just `string` (the literal
// is absorbed by the wider type); the `"unknown"` member is spelled out to
// document the sentinel value, not to constrain callers.
export type Channel = string | "unknown";
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Per-channel, per-frame VAD observation emitted by `StreamingTranscriber` when running
|
|
15
|
+
* in dual-channel mode. `ts` is stream-relative milliseconds, derived from the
|
|
16
|
+
* per-channel sample counter — the same reference frame as `StreamingWord.start` /
|
|
17
|
+
* `.end`, so per-word lookups need no conversion.
|
|
18
|
+
*/
|
|
19
|
+
export type VadFrame = {
  /** Stream-relative time of the observation, in milliseconds. */
  ts: number;
  /** Name of the channel the frame was captured on. */
  channel: string;
  /** Whether the detector classified this frame as voice activity. */
  active: boolean;
  /** Energy value the detector reported for this frame. */
  rms: number;
};
|
|
25
|
+
|
|
26
|
+
/** Outcome of running a `VadDetector` on a single frame. */
export type VadDetectorResult = {
  /** Whether the detector considers the frame to contain voice activity. */
  active: boolean;
  /**
   * Energy measure for the frame; the transcriber reports it as
   * `VadFrame.rms`. Scale/range is detector-defined — TODO confirm.
   */
  energy: number;
};
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Pluggable per-channel voice-activity detector. The default `EnergyVad` is energy-based
|
|
33
|
+
* with an adaptive noise-floor threshold; callers can drop in a DNN-backed detector
|
|
34
|
+
* (e.g. Silero via `@ricky0123/vad-web`) for noisier environments.
|
|
35
|
+
*
|
|
36
|
+
* A separate `VadDetector` instance is held per channel; do not assume cross-channel
|
|
37
|
+
* state. Frames are fixed-size at the transcriber's target sample rate.
|
|
38
|
+
*/
|
|
39
|
+
export interface VadDetector {
  /**
   * Classify one frame of audio.
   *
   * The transcriber passes samples scaled from PCM16 by 1 / 0x8000 and reuses
   * the same `Float32Array` instance for every call on a channel, so
   * implementations should not hold on to `frame` after returning.
   *
   * @param frame One fixed-size frame of samples.
   * @returns Activity decision and energy for the frame.
   */
  process(frame: Float32Array): VadDetectorResult;

  /**
   * Presumably discards any internal detector state; it is not invoked by the
   * code visible alongside this declaration — confirm against callers.
   */
  reset(): void;
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Thrown when `DualChannelCapture` is constructed in a non-browser environment
|
|
46
|
+
* (no `globalThis.AudioContext`). The helper is intentionally surfaced from the
|
|
47
|
+
* main entrypoint so the import path is uniform across runtimes; the runtime
|
|
48
|
+
* guard moves to construction time.
|
|
49
|
+
*/
|
|
50
|
+
export class BrowserOnlyError extends Error {
  /**
   * @param message Optional override; defaults to a description of the
   * missing browser `AudioContext`.
   */
  constructor(
    message = "DualChannelCapture requires a browser environment (AudioContext is undefined).",
  ) {
    super(message);
    // Give the error a stable, recognisable name instead of the generic "Error".
    this.name = "BrowserOnlyError";
  }
}
|