assemblyai 4.33.2 → 4.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assemblyai.streaming.umd.js +1279 -3
- package/dist/assemblyai.streaming.umd.min.js +1 -1
- package/dist/assemblyai.umd.js +789 -3
- package/dist/assemblyai.umd.min.js +1 -1
- package/dist/browser.mjs +765 -4
- package/dist/bun.mjs +765 -4
- package/dist/deno.mjs +765 -4
- package/dist/exports/streaming.d.ts +7 -0
- package/dist/index.cjs +789 -3
- package/dist/index.mjs +781 -4
- package/dist/node.cjs +773 -3
- package/dist/node.mjs +765 -4
- package/dist/services/index.d.ts +2 -2
- package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
- package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
- package/dist/services/streaming/energy-vad.d.ts +35 -0
- package/dist/services/streaming/index.d.ts +4 -0
- package/dist/services/streaming/label-mapper.d.ts +44 -0
- package/dist/services/streaming/resampler.d.ts +22 -0
- package/dist/services/streaming/service.d.ts +69 -1
- package/dist/streaming.browser.mjs +1235 -4
- package/dist/streaming.cjs +1275 -3
- package/dist/streaming.mjs +1264 -4
- package/dist/types/streaming/dual-channel.d.ts +48 -0
- package/dist/types/streaming/index.d.ts +112 -1
- package/dist/workerd.mjs +765 -4
- package/package.json +1 -1
- package/src/exports/streaming.ts +7 -0
- package/src/services/index.ts +20 -1
- package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
- package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
- package/src/services/streaming/energy-vad.ts +75 -0
- package/src/services/streaming/index.ts +4 -0
- package/src/services/streaming/label-mapper.ts +128 -0
- package/src/services/streaming/resampler.ts +69 -0
- package/src/services/streaming/service.ts +392 -2
- package/src/types/streaming/dual-channel.ts +57 -0
- package/src/types/streaming/index.ts +112 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "assemblyai",
|
|
3
|
-
"version": "4.33.2",
|
|
3
|
+
"version": "4.34.0",
|
|
4
4
|
"description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=18"
|
package/src/exports/streaming.ts
CHANGED
|
@@ -1,4 +1,11 @@
|
|
|
1
1
|
export * from "../types/asyncapi.generated";
|
|
2
2
|
export * from "../types/realtime";
|
|
3
3
|
export * from "../types/helpers";
|
|
4
|
+
export * from "../types/streaming/dual-channel";
|
|
4
5
|
export * from "../services/realtime/service";
|
|
6
|
+
export * from "../services/streaming/service";
|
|
7
|
+
export * from "../services/streaming/factory";
|
|
8
|
+
export * from "../services/streaming/browser/dual-channel-capture";
|
|
9
|
+
export * from "../services/streaming/energy-vad";
|
|
10
|
+
export * from "../services/streaming/label-mapper";
|
|
11
|
+
export * from "../services/streaming/resampler";
|
package/src/services/index.ts
CHANGED
|
@@ -8,7 +8,18 @@ import {
|
|
|
8
8
|
} from "./realtime";
|
|
9
9
|
import { TranscriptService } from "./transcripts";
|
|
10
10
|
import { FileService } from "./files";
|
|
11
|
-
import {
|
|
11
|
+
import {
|
|
12
|
+
StreamingTranscriber,
|
|
13
|
+
StreamingTranscriberFactory,
|
|
14
|
+
DualChannelCapture,
|
|
15
|
+
EnergyVad,
|
|
16
|
+
LinearResampler,
|
|
17
|
+
VadTimeline,
|
|
18
|
+
attributeTurn,
|
|
19
|
+
attributeWord,
|
|
20
|
+
rollUpTurnChannel,
|
|
21
|
+
float32ToPcm16,
|
|
22
|
+
} from "./streaming";
|
|
12
23
|
|
|
13
24
|
const defaultBaseUrl = "https://api.assemblyai.com";
|
|
14
25
|
const defaultStreamingUrl = "https://streaming.assemblyai.com";
|
|
@@ -71,4 +82,12 @@ export {
|
|
|
71
82
|
TranscriptService,
|
|
72
83
|
FileService,
|
|
73
84
|
StreamingTranscriber,
|
|
85
|
+
DualChannelCapture,
|
|
86
|
+
EnergyVad,
|
|
87
|
+
LinearResampler,
|
|
88
|
+
VadTimeline,
|
|
89
|
+
attributeTurn,
|
|
90
|
+
attributeWord,
|
|
91
|
+
rollUpTurnChannel,
|
|
92
|
+
float32ToPcm16,
|
|
74
93
|
};
|
|
package/src/services/streaming/browser/dual-channel-capture.ts
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import { StreamingTranscriber } from "../service";
|
|
2
|
+
import { BrowserOnlyError } from "../../../types/streaming/dual-channel";
|
|
3
|
+
|
|
4
|
+
export { BrowserOnlyError } from "../../../types/streaming/dual-channel";
|
|
5
|
+
import {
|
|
6
|
+
PCM16_ENCODER_PROCESSOR_NAME,
|
|
7
|
+
Pcm16EncoderMessage,
|
|
8
|
+
pcm16EncoderWorkletSource,
|
|
9
|
+
} from "./worklets/pcm16-encoder";
|
|
10
|
+
|
|
11
|
+
const DEFAULT_TARGET_RATE = 16_000;
|
|
12
|
+
const DEFAULT_CHUNK_MS = 50;
|
|
13
|
+
const MIC_CHANNEL = "mic";
|
|
14
|
+
const SYSTEM_CHANNEL = "system";
|
|
15
|
+
|
|
16
|
+
type ErrorListener = (err: Error) => void;
|
|
17
|
+
|
|
18
|
+
export type DualChannelCaptureParams = {
|
|
19
|
+
/** Microphone MediaStream. Caller should set `echoCancellation: true` at `getUserMedia` time. */
|
|
20
|
+
micStream: MediaStream;
|
|
21
|
+
/** System-audio MediaStream (e.g. `getDisplayMedia({ audio: true })`). */
|
|
22
|
+
systemStream: MediaStream;
|
|
23
|
+
/**
|
|
24
|
+
* The transcriber to push tagged PCM into. MUST be constructed with
|
|
25
|
+
* `channels: [{ name: "mic" }, { name: "system" }]` so the per-channel
|
|
26
|
+
* `sendAudio` calls succeed.
|
|
27
|
+
*/
|
|
28
|
+
transcriber: StreamingTranscriber;
|
|
29
|
+
/**
|
|
30
|
+
* Target sample rate sent to the transcriber. Defaults to 16000. The
|
|
31
|
+
* AudioContext runs at the device's native rate; resampling happens inside
|
|
32
|
+
* the encoder worklet (forcing `AudioContext({ sampleRate })` is unreliable
|
|
33
|
+
* across browsers).
|
|
34
|
+
*/
|
|
35
|
+
targetSampleRate?: number;
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
/**
 * Browser-only adapter that pumps two `MediaStream`s into a `StreamingTranscriber`
 * configured for dual-channel mode. Each `MediaStream` runs through its own
 * `pcm16-encoder` AudioWorklet (resample to `targetSampleRate`, encode to Int16
 * PCM); each PCM chunk is forwarded via `transcriber.sendAudio(pcm, { channel })`.
 *
 * All dual-channel orchestration (mixing, VAD, per-word attribution) lives inside
 * `StreamingTranscriber` — this class is a pure I/O adapter. Non-browser runtimes
 * can replicate its job by pushing tagged PCM into `transcriber.sendAudio` directly.
 *
 * Caller responsibilities:
 *  - **Echo cancellation** is set at `getUserMedia` time (`audio: { echoCancellation: true }`).
 *  - **System-audio capture** is platform-dependent. Chrome's `getDisplayMedia({ audio: true })`
 *    captures tab audio (and on Windows, full system audio when sharing the whole screen).
 *    macOS requires a virtual loopback driver (e.g. BlackHole) to expose system audio at all.
 *  - **Token auth.** Construct the transcriber with `token` — API-key auth is unsupported in browsers.
 *  - **Stream ownership.** `stop()` tears down the AudioContext but does NOT stop the
 *    `MediaStreamTrack`s passed in — callers own those.
 */
export class DualChannelCapture {
  private readonly params: Required<
    Omit<DualChannelCaptureParams, "targetSampleRate">
  > & { targetSampleRate: number };
  private errorListener?: ErrorListener;
  private context?: AudioContext;
  private micSource?: MediaStreamAudioSourceNode;
  private sysSource?: MediaStreamAudioSourceNode;
  private micEncoder?: AudioWorkletNode;
  private sysEncoder?: AudioWorkletNode;
  private running = false;

  /**
   * @param params Streams, transcriber and optional target sample rate
   *   (falls back to `DEFAULT_TARGET_RATE` when omitted).
   * @throws BrowserOnlyError when `globalThis.AudioContext` is not defined.
   */
  constructor(params: DualChannelCaptureParams) {
    if (typeof globalThis.AudioContext === "undefined") {
      throw new BrowserOnlyError();
    }
    this.params = {
      micStream: params.micStream,
      systemStream: params.systemStream,
      transcriber: params.transcriber,
      targetSampleRate: params.targetSampleRate ?? DEFAULT_TARGET_RATE,
    };
  }

  /**
   * Register the listener that receives errors thrown by
   * `transcriber.sendAudio` while forwarding PCM. Only one listener is kept;
   * a later call replaces the earlier one.
   */
  on(event: "error", listener: ErrorListener): void {
    if (event === "error") this.errorListener = listener;
  }

  /**
   * Wire the capture pipeline and start pumping tagged PCM into the transcriber.
   * The transcriber must already be connected. Returns once the worklet is
   * registered and the audio graph is live.
   *
   * If any step fails, everything created so far (AudioContext, source and
   * encoder nodes) is released before the error is rethrown, so a failed
   * `start()` leaves no live audio resources behind and can be retried.
   *
   * @throws Error when already started (or a start is in flight), or when
   *   `stop()` is called before the start completes.
   */
  async start(): Promise<void> {
    if (this.running) {
      throw new Error("DualChannelCapture already started");
    }
    // Claim the running state before the first await so that a concurrent
    // start() is rejected and a stop() issued mid-start actually tears down.
    this.running = true;

    const context = new AudioContext();
    this.context = context;

    try {
      const blob = new Blob([pcm16EncoderWorkletSource], {
        type: "application/javascript",
      });
      const url = URL.createObjectURL(blob);
      try {
        await context.audioWorklet.addModule(url);
      } finally {
        URL.revokeObjectURL(url);
      }

      // stop() ran while the worklet module was loading: this context has
      // already been released, so do not build a graph on top of it.
      if (this.context !== context) {
        throw new Error("DualChannelCapture stopped before start completed");
      }

      // Assign each node to its field as soon as it exists so that a failure
      // further down can still release it via teardown().
      const micSource = context.createMediaStreamSource(this.params.micStream);
      this.micSource = micSource;
      const sysSource = context.createMediaStreamSource(
        this.params.systemStream,
      );
      this.sysSource = sysSource;

      const micEncoder = this.makeEncoder(context, MIC_CHANNEL);
      this.micEncoder = micEncoder;
      const sysEncoder = this.makeEncoder(context, SYSTEM_CHANNEL);
      this.sysEncoder = sysEncoder;

      micSource.connect(micEncoder);
      sysSource.connect(sysEncoder);
    } catch (err) {
      // Only release state that still belongs to this start() attempt.
      if (this.context === context) {
        this.running = false;
        try {
          await this.teardown();
        } catch {
          // Surface the original failure, not a secondary cleanup error.
        }
      }
      throw err;
    }
  }

  /**
   * Create one encoder worklet node (mono input, no outputs) whose PCM chunks
   * are forwarded to the transcriber tagged with `channel`.
   */
  private makeEncoder(context: AudioContext, channel: string): AudioWorkletNode {
    const node = new AudioWorkletNode(context, PCM16_ENCODER_PROCESSOR_NAME, {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      channelCountMode: "explicit",
      channelInterpretation: "speakers",
      processorOptions: {
        targetRate: this.params.targetSampleRate,
        chunkMs: DEFAULT_CHUNK_MS,
      },
    });
    node.port.onmessage = (e: MessageEvent<Pcm16EncoderMessage>) => {
      try {
        this.params.transcriber.sendAudio(e.data.pcm, { channel });
      } catch (err) {
        // Always hand the listener a real Error, even if something else was thrown.
        this.errorListener?.(err instanceof Error ? err : new Error(String(err)));
      }
    };
    return node;
  }

  /**
   * Tear down internal nodes and close the AudioContext. Does NOT stop the
   * caller-provided MediaStream tracks — they remain available for preview UI,
   * recording, etc. Idempotent.
   */
  async stop(): Promise<void> {
    if (!this.running) return;
    this.running = false;
    await this.teardown();
  }

  /**
   * Release every node and the AudioContext currently held. Fields are cleared
   * first so the instance is reusable even if closing the context rejects, and
   * each release step is isolated so one failure does not skip the others.
   */
  private async teardown(): Promise<void> {
    const { context, micSource, sysSource, micEncoder, sysEncoder } = this;
    this.context = undefined;
    this.micSource = undefined;
    this.sysSource = undefined;
    this.micEncoder = undefined;
    this.sysEncoder = undefined;

    const releases: Array<() => void> = [
      () => micEncoder?.port.close(),
      () => sysEncoder?.port.close(),
      () => micEncoder?.disconnect(),
      () => sysEncoder?.disconnect(),
      () => micSource?.disconnect(),
      () => sysSource?.disconnect(),
    ];
    for (const release of releases) {
      try {
        release();
      } catch {
        // Disconnecting already-disconnected nodes throws in some browsers; ignore.
      }
    }

    if (context && context.state !== "closed") {
      await context.close();
    }
  }
}
|
|
package/src/services/streaming/browser/worklets/pcm16-encoder.ts
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AudioWorklet processor that ingests mono Float32 audio at the AudioContext's
|
|
3
|
+
* native sample rate, resamples to `targetRate` (linear interpolation, stateful
|
|
4
|
+
* across `process()` calls), packs to little-endian Int16 PCM, and posts
|
|
5
|
+
* fixed-size chunks via `port.postMessage` with a running `samplesSent` counter.
|
|
6
|
+
*
|
|
7
|
+
* `samplesSent` is in **target-rate samples**, so the main thread can derive a
|
|
8
|
+
* stream-relative timestamp = `samplesSent / targetRate * 1000` (ms) — the same
|
|
9
|
+
* frame AAI uses for `StreamingWord.start` / `.end`.
|
|
10
|
+
*
|
|
11
|
+
* Defined as a string so it can be registered via a Blob URL — the SDK ships as
|
|
12
|
+
* a single ESM file, so a separate `.js` worklet asset isn't viable.
|
|
13
|
+
*/
|
|
14
|
+
export const pcm16EncoderWorkletSource = `
|
|
15
|
+
class Pcm16EncoderProcessor extends AudioWorkletProcessor {
|
|
16
|
+
constructor(options) {
|
|
17
|
+
super();
|
|
18
|
+
const opts = (options && options.processorOptions) || {};
|
|
19
|
+
this.targetRate = opts.targetRate || 16000;
|
|
20
|
+
this.chunkMs = opts.chunkMs || 50;
|
|
21
|
+
this.ratio = sampleRate / this.targetRate;
|
|
22
|
+
this.chunkSize = Math.round(this.targetRate * this.chunkMs / 1000);
|
|
23
|
+
this.buffer = new Int16Array(this.chunkSize);
|
|
24
|
+
this.bufferIdx = 0;
|
|
25
|
+
this.samplesSent = 0;
|
|
26
|
+
this.lastSample = 0;
|
|
27
|
+
this.fractional = 0;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
process(inputs) {
|
|
31
|
+
const input = inputs[0];
|
|
32
|
+
if (!input || input.length === 0 || !input[0] || input[0].length === 0) {
|
|
33
|
+
return true;
|
|
34
|
+
}
|
|
35
|
+
const mono = input[0];
|
|
36
|
+
let pos = this.fractional;
|
|
37
|
+
while (pos < mono.length) {
|
|
38
|
+
const i = Math.floor(pos);
|
|
39
|
+
const frac = pos - i;
|
|
40
|
+
const a = i === 0 ? this.lastSample : mono[i - 1];
|
|
41
|
+
const b = mono[i];
|
|
42
|
+
const sample = a + (b - a) * frac;
|
|
43
|
+
const clamped = sample < -1 ? -1 : sample > 1 ? 1 : sample;
|
|
44
|
+
this.buffer[this.bufferIdx++] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
|
|
45
|
+
if (this.bufferIdx === this.chunkSize) {
|
|
46
|
+
const out = new Int16Array(this.chunkSize);
|
|
47
|
+
out.set(this.buffer);
|
|
48
|
+
this.samplesSent += this.chunkSize;
|
|
49
|
+
this.port.postMessage(
|
|
50
|
+
{ pcm: out.buffer, samplesSent: this.samplesSent },
|
|
51
|
+
[out.buffer],
|
|
52
|
+
);
|
|
53
|
+
this.bufferIdx = 0;
|
|
54
|
+
}
|
|
55
|
+
pos += this.ratio;
|
|
56
|
+
}
|
|
57
|
+
this.lastSample = mono[mono.length - 1];
|
|
58
|
+
this.fractional = pos - mono.length;
|
|
59
|
+
return true;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
registerProcessor("aai-pcm16-encoder", Pcm16EncoderProcessor);
|
|
63
|
+
`;
|
|
64
|
+
|
|
65
|
+
export const PCM16_ENCODER_PROCESSOR_NAME = "aai-pcm16-encoder";
|
|
66
|
+
|
|
67
|
+
export type Pcm16EncoderMessage = {
|
|
68
|
+
pcm: ArrayBuffer;
|
|
69
|
+
samplesSent: number;
|
|
70
|
+
};
|
|
package/src/services/streaming/energy-vad.ts
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import {
|
|
2
|
+
VadDetector,
|
|
3
|
+
VadDetectorResult,
|
|
4
|
+
} from "../../types/streaming/dual-channel";
|
|
5
|
+
|
|
6
|
+
export type EnergyVadParams = {
|
|
7
|
+
/** Threshold = noiseFloor * thresholdRatio. Default 3.0 (~ +9.5 dB above noise). */
|
|
8
|
+
thresholdRatio?: number;
|
|
9
|
+
/** EMA smoothing for the noise-floor estimate when frame is non-speech. Default 0.05. */
|
|
10
|
+
noiseFloorAlpha?: number;
|
|
11
|
+
/** Hangover in frames: stay "active" this many frames after the last speech frame. Default 10 (\~200 ms at 20 ms frames). */
|
|
12
|
+
hangoverFrames?: number;
|
|
13
|
+
/** Initial noise floor estimate. Default 1e-4. Adaptive after the first non-speech frame. */
|
|
14
|
+
initialNoiseFloor?: number;
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
/**
 * Energy-based VAD with adaptive noise-floor tracking and hangover. Pure JS,
 * no dependencies. Suitable for the "which physical channel is speaking" task
 * because the channels are already physically separated at capture — the harder
 * problem (speech vs. non-speech in the wild) is one a customer can swap in a
 * DNN VAD for via the `createVad` parameter.
 *
 * Tuning notes:
 *  - thresholdRatio below 2 will treat anything above noise as speech (too sensitive).
 *  - thresholdRatio above 6 will miss quiet utterance onsets/offsets.
 *  - noiseFloorAlpha above 0.1 makes the floor track quickly (good for non-stationary
 *    background) but risks slowly adapting *up* to a sustained low voice.
 *
 * Corrupted frames (any NaN/Infinity sample) are treated as carrying no
 * measurement: they report `energy: 0`, never trigger speech on their own and
 * never touch the noise floor, so one bad frame cannot disable the detector.
 */
export class EnergyVad implements VadDetector {
  private readonly thresholdRatio: number;
  private readonly noiseFloorAlpha: number;
  private readonly hangoverFrames: number;
  private readonly initialNoiseFloor: number;
  private noiseFloor: number;
  private hangoverRemaining = 0;

  /**
   * @param params Optional tuning overrides. Defaults: `thresholdRatio` 3.0,
   *   `noiseFloorAlpha` 0.05, `hangoverFrames` 10, `initialNoiseFloor` 1e-4.
   */
  constructor(params: EnergyVadParams = {}) {
    this.thresholdRatio = params.thresholdRatio ?? 3.0;
    this.noiseFloorAlpha = params.noiseFloorAlpha ?? 0.05;
    this.hangoverFrames = params.hangoverFrames ?? 10;
    this.initialNoiseFloor = params.initialNoiseFloor ?? 1e-4;
    this.noiseFloor = this.initialNoiseFloor;
  }

  /**
   * Classify one frame of Float32 samples.
   *
   * @param frame Samples for a single analysis frame; may be empty (RMS 0).
   * @returns `active` — whether the frame counts as speech (above threshold,
   *   or within the hangover period after the last such frame); `energy` —
   *   the frame RMS, or 0 when the RMS is not a finite number.
   */
  process(frame: Float32Array): VadDetectorResult {
    let sumSq = 0;
    for (let i = 0; i < frame.length; i++) {
      sumSq += frame[i] * frame[i];
    }
    const measured = frame.length > 0 ? Math.sqrt(sumSq / frame.length) : 0;

    // A NaN/Infinity RMS would otherwise be blended into the noise floor and
    // turn it into NaN, after which `rms > threshold` is false forever.
    const measurable = Number.isFinite(measured);
    const rms = measurable ? measured : 0;

    const threshold = this.noiseFloor * this.thresholdRatio;
    let active = rms > threshold;

    if (active) {
      this.hangoverRemaining = this.hangoverFrames;
    } else if (this.hangoverRemaining > 0) {
      this.hangoverRemaining--;
      active = true;
      // While in hangover, do not update noise floor — RMS may still reflect tail energy.
    } else if (measurable) {
      // Exponential moving average towards the RMS of this non-speech frame.
      this.noiseFloor =
        this.noiseFloor * (1 - this.noiseFloorAlpha) +
        rms * this.noiseFloorAlpha;
    }

    return { active, energy: rms };
  }

  /** Restore the initial noise floor and cancel any pending hangover. */
  reset(): void {
    this.noiseFloor = this.initialNoiseFloor;
    this.hangoverRemaining = 0;
  }
}
|
|
package/src/services/streaming/label-mapper.ts
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import { StreamingWord, TurnEvent } from "../../types/streaming";
|
|
2
|
+
import { Channel, VadFrame } from "../../types/streaming/dual-channel";
|
|
3
|
+
|
|
4
|
+
export type LabelMapperParams = {
|
|
5
|
+
/** Per-word energy ratio above which a channel is declared dominant. */
|
|
6
|
+
dominanceRatio: number;
|
|
7
|
+
};
|
|
8
|
+
|
|
9
|
+
/**
 * Append-only store of VAD frames with a sliding retention window.
 *
 * Frames are expected to arrive in non-decreasing `ts` (stream-relative ms)
 * order: eviction walks forward from the oldest kept frame, and window lookups
 * stop at the first frame past the end of the window. Expired frames are
 * skipped by advancing a head index; the backing array is compacted only once
 * more than 1024 entries, and more than half of the array, are dead.
 *
 * Runtime-agnostic — no DOM or Web Audio dependencies.
 */
export class VadTimeline {
  private readonly windowMs: number;
  private frames: VadFrame[] = [];
  private head = 0;

  /** @param windowMs How far behind the newest frame older frames are retained. */
  constructor(windowMs: number) {
    this.windowMs = windowMs;
  }

  /** Append `frame`, then drop every frame older than `frame.ts - windowMs`. */
  pushFrame(frame: VadFrame): void {
    this.frames.push(frame);

    const oldestAllowed = frame.ts - this.windowMs;
    const total = this.frames.length;
    let firstLive = this.head;
    while (firstLive < total && this.frames[firstLive].ts < oldestAllowed) {
      firstLive += 1;
    }
    this.head = firstLive;

    // Compact lazily so the amortised cost of a push stays constant.
    const mostlyDead = firstLive > 1024 && firstLive * 2 > total;
    if (mostlyDead) {
      this.frames = this.frames.slice(firstLive);
      this.head = 0;
    }
  }

  /**
   * Collect the retained frames whose `ts` lies in `[startMs, endMs]`, scanning
   * from the oldest retained frame and stopping at the first one after `endMs`.
   */
  framesInWindow(startMs: number, endMs: number): VadFrame[] {
    const selected: VadFrame[] = [];
    let cursor = this.head;
    while (cursor < this.frames.length) {
      const candidate = this.frames[cursor];
      cursor += 1;
      if (candidate.ts < startMs) continue;
      if (candidate.ts > endMs) break;
      selected.push(candidate);
    }
    return selected;
  }

  /** Forget every frame. */
  clear(): void {
    this.head = 0;
    this.frames = [];
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Sum per-channel RMS over the *active* frames of a window. Returns a Map from
 * channel name to total score. Channels with no active frame in the window are
 * omitted; a channel whose active frames all have `rms === 0` is present with
 * score 0.
 */
function scoreChannels(frames: VadFrame[]): Map<string, number> {
  const scores = new Map<string, number>();
  for (const f of frames) {
    if (!f.active) continue;
    scores.set(f.channel, (scores.get(f.channel) ?? 0) + f.rms);
  }
  return scores;
}

/**
 * Decide which channel was dominant during a word's `[start, end]` window.
 *
 *  - If no channel has an active VAD frame in the window → `"unknown"`.
 *  - If exactly one channel is active → that channel.
 *  - If the two best channels have exactly the same score → `"unknown"`,
 *    whatever `dominanceRatio` is (this includes both scores being 0).
 *  - If the top channel beats the runner-up by at least `dominanceRatio` → top channel.
 *  - Else: top channel wins on absolute score.
 *
 * @param word Word whose `start`/`end` delimit the lookup window.
 * @param timeline Source of VAD frames, queried via `framesInWindow`.
 * @param params Attribution settings (`dominanceRatio`).
 * @returns The winning channel name, or `"unknown"`.
 */
export function attributeWord(
  word: StreamingWord,
  timeline: VadTimeline,
  params: LabelMapperParams,
): Channel {
  const scores = scoreChannels(timeline.framesInWindow(word.start, word.end));
  if (scores.size === 0) return "unknown";

  const ranked = [...scores.entries()].sort((a, b) => b[1] - a[1]);
  const [topName, topScore] = ranked[0];
  if (ranked.length === 1) return topName;
  const runnerScore = ranked[1][1];

  // Check the tie first: with `dominanceRatio <= 1`, or with both scores at 0,
  // `top >= ratio * runner` holds for a tie and would otherwise hand the word
  // to whichever channel happened to be inserted first.
  if (topScore === runnerScore) return "unknown";

  const clearlyDominant = topScore >= params.dominanceRatio * runnerScore;
  // Dominant or merely ahead, the higher-scoring channel wins. Anything else
  // (only reachable with non-finite scores) stays unattributed.
  return clearlyDominant || topScore > runnerScore ? topName : "unknown";
}
|
|
91
|
+
|
|
92
|
+
/**
 * Pick a turn-level channel by summing, per channel, the duration of the words
 * attributed to it (negative durations count as 0) and taking the largest sum.
 *
 * @param words Words of the turn; words without a channel, or labelled
 *   `"unknown"`, are ignored.
 * @returns The channel with the largest total duration, or `"unknown"` when no
 *   word carries a usable channel or the two largest totals are equal.
 */
export function rollUpTurnChannel(words: StreamingWord[]): Channel {
  const msByChannel = new Map<string, number>();
  for (let idx = 0; idx < words.length; idx++) {
    const { channel, start, end } = words[idx];
    const attributed = Boolean(channel) && channel !== "unknown";
    if (!attributed) continue;
    const elapsed = Math.max(0, end - start);
    const soFar = msByChannel.get(channel as string) ?? 0;
    msByChannel.set(channel as string, soFar + elapsed);
  }

  if (msByChannel.size === 0) {
    return "unknown";
  }

  const ranking = Array.from(msByChannel.entries());
  ranking.sort((left, right) => right[1] - left[1]);

  const winner = ranking[0];
  if (ranking.length === 1) {
    return winner[0];
  }
  const second = ranking[1];
  return winner[1] === second[1] ? "unknown" : winner[0];
}
|
|
111
|
+
|
|
112
|
+
/**
 * Label a turn in place: every entry of `turn.words` gets its `channel` from
 * `attributeWord`, then `turn.channel` is set from `rollUpTurnChannel` over
 * those freshly labelled words.
 *
 * Nothing is returned — the caller's `TurnEvent` object itself is updated, so
 * no copy is allocated.
 *
 * @param turn Turn event to annotate (mutated).
 * @param timeline VAD frames consulted for each word.
 * @param params Attribution settings passed through to `attributeWord`.
 */
export function attributeTurn(
  turn: TurnEvent,
  timeline: VadTimeline,
  params: LabelMapperParams,
): void {
  const { words } = turn;
  for (let idx = 0; idx < words.length; idx++) {
    const current = words[idx];
    current.channel = attributeWord(current, timeline, params);
  }
  turn.channel = rollUpTurnChannel(words);
}
|
|
package/src/services/streaming/resampler.ts
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
 * Linear-interpolation resampler for streaming Float32 audio. Stateful across
 * `process()` calls so chunk boundaries don't introduce phase discontinuities:
 * the last input sample and a fractional read position are carried over.
 *
 * Linear interpolation is good enough for ASR ingest — the downstream
 * StreamingTranscriber band-limits at the target rate anyway, and a polyphase
 * filter would be overkill in the AudioWorklet hot path. If a customer needs
 * higher quality they can supply their own VadDetector + bypass the encoder.
 */
export class LinearResampler {
  /** Input samples consumed per output sample (`sourceRate / targetRate`). */
  private readonly ratio: number;
  private lastSample = 0;
  private fractional = 0;

  /**
   * @param sourceRate Sample rate of the audio passed to `process()`.
   * @param targetRate Sample rate of the audio returned by `process()`.
   * @throws Error when either rate is not a positive, finite number. Without
   *   the finiteness check an infinite `targetRate` yields a ratio of 0, and
   *   `process()` would never advance its read position.
   */
  constructor(
    private readonly sourceRate: number,
    private readonly targetRate: number,
  ) {
    const usable = (rate: number): boolean => Number.isFinite(rate) && rate > 0;
    if (!usable(sourceRate) || !usable(targetRate)) {
      throw new Error(
        "sourceRate and targetRate must be positive finite numbers",
      );
    }
    this.ratio = sourceRate / targetRate;
  }

  /**
   * Resample one chunk.
   *
   * @param input Samples at `sourceRate`; may be empty.
   * @returns Samples at `targetRate`. When both rates are equal the `input`
   *   array itself is returned, not a copy.
   */
  process(input: Float32Array): Float32Array {
    if (this.sourceRate === this.targetRate) {
      return input;
    }

    // Worst-case output length; we'll slice to actual.
    const out = new Float32Array(Math.ceil(input.length / this.ratio) + 1);
    let outIdx = 0;
    let pos = this.fractional;

    while (pos < input.length) {
      const i = Math.floor(pos);
      const frac = pos - i;
      // Interpolate from the previous sample; at the chunk start that is the
      // last sample of the previous chunk.
      const a = i === 0 ? this.lastSample : input[i - 1];
      const b = input[i];
      out[outIdx++] = a + (b - a) * frac;
      pos += this.ratio;
    }

    // An empty chunk leaves the carried-over sample untouched.
    this.lastSample = input[input.length - 1] ?? this.lastSample;
    this.fractional = pos - input.length;
    return out.subarray(0, outIdx);
  }

  /** Drop the carried-over sample and read position (start of a new stream). */
  reset(): void {
    this.lastSample = 0;
    this.fractional = 0;
  }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Encode Float32 PCM as 16-bit signed little-endian PCM.
 *
 * Each sample is clamped to [-1, 1]; negative values are scaled by 0x8000 and
 * non-negative values by 0x7fff before being written as an Int16.
 *
 * @param input Float32 samples.
 * @returns A new ArrayBuffer holding two bytes per input sample.
 */
export function float32ToPcm16(input: Float32Array): ArrayBuffer {
  const BYTES_PER_SAMPLE = 2;
  const LITTLE_ENDIAN = true;

  const encoded = new ArrayBuffer(input.length * BYTES_PER_SAMPLE);
  const writer = new DataView(encoded);

  input.forEach((value, index) => {
    const limited = Math.max(-1, Math.min(1, value));
    const scale = limited < 0 ? 0x8000 : 0x7fff;
    writer.setInt16(index * BYTES_PER_SAMPLE, limited * scale, LITTLE_ENDIAN);
  });

  return encoded;
}
|