assemblyai 4.33.3 → 4.34.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +22 -0
  2. package/dist/assemblyai.streaming.umd.js +1291 -3
  3. package/dist/assemblyai.streaming.umd.min.js +1 -1
  4. package/dist/assemblyai.umd.js +802 -7
  5. package/dist/assemblyai.umd.min.js +1 -1
  6. package/dist/browser.mjs +775 -5
  7. package/dist/bun.mjs +775 -5
  8. package/dist/deno.mjs +775 -5
  9. package/dist/exports/streaming.d.ts +7 -0
  10. package/dist/index.cjs +802 -7
  11. package/dist/index.mjs +794 -8
  12. package/dist/node.cjs +783 -4
  13. package/dist/node.mjs +775 -5
  14. package/dist/services/index.d.ts +2 -2
  15. package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
  16. package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
  17. package/dist/services/streaming/energy-vad.d.ts +35 -0
  18. package/dist/services/streaming/index.d.ts +4 -0
  19. package/dist/services/streaming/label-mapper.d.ts +44 -0
  20. package/dist/services/streaming/resampler.d.ts +22 -0
  21. package/dist/services/streaming/service.d.ts +71 -2
  22. package/dist/streaming.browser.mjs +1247 -4
  23. package/dist/streaming.cjs +1287 -3
  24. package/dist/streaming.mjs +1276 -4
  25. package/dist/types/streaming/dual-channel.d.ts +48 -0
  26. package/dist/types/streaming/index.d.ts +140 -4
  27. package/dist/workerd.mjs +775 -5
  28. package/package.json +1 -1
  29. package/src/exports/streaming.ts +7 -0
  30. package/src/services/index.ts +20 -1
  31. package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
  32. package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
  33. package/src/services/streaming/energy-vad.ts +75 -0
  34. package/src/services/streaming/index.ts +4 -0
  35. package/src/services/streaming/label-mapper.ts +128 -0
  36. package/src/services/streaming/resampler.ts +69 -0
  37. package/src/services/streaming/service.ts +405 -3
  38. package/src/types/streaming/dual-channel.ts +57 -0
  39. package/src/types/streaming/index.ts +144 -1
package/dist/node.cjs CHANGED
@@ -5,6 +5,19 @@ var ws = require('ws');
5
5
  var fs = require('fs');
6
6
  var stream = require('stream');
7
7
 
8
/**
 * Error type signalling that `DualChannelCapture` was constructed somewhere
 * without a browser audio stack (no `globalThis.AudioContext`).
 *
 * The helper is exported from the main entrypoint so the import path is the
 * same on every runtime; the environment check therefore happens when the
 * object is constructed rather than when the module is imported.
 *
 * @param {string} [message] Optional override for the default explanation.
 */
class BrowserOnlyError extends Error {
  constructor(message = "DualChannelCapture requires a browser environment (AudioContext is undefined).") {
    super(message);
    // Give the error a stable, recognisable name for logs and `err.name` checks.
    this.name = "BrowserOnlyError";
  }
}
20
+
8
21
  const DEFAULT_FETCH_INIT = {
9
22
  cache: "no-store",
10
23
  };
@@ -22,7 +35,7 @@ if (typeof navigator !== "undefined" && navigator.userAgent) {
22
35
  defaultUserAgentString += navigator.userAgent;
23
36
  }
24
37
  const defaultUserAgent = {
25
- sdk: { name: "JavaScript", version: "4.33.3" },
38
+ sdk: { name: "JavaScript", version: "4.34.4" },
26
39
  };
27
40
  if (typeof process !== "undefined") {
28
41
  if (process.versions.node && defaultUserAgentString.indexOf("Node") === -1) {
@@ -781,11 +794,216 @@ function dataUrlToBlob(dataUrl) {
781
794
  return new Blob([u8arr], { type: mime });
782
795
  }
783
796
 
797
/**
 * Dependency-free, energy-based voice-activity detector with an adaptive
 * noise floor and a hangover period.
 *
 * It is aimed at deciding which physical channel is speaking: the channels are
 * already separated at capture time, so a simple RMS test is enough. Callers
 * who need real speech/non-speech discrimination can supply their own detector
 * through the `createVad` parameter instead.
 *
 * Tuning notes:
 * - thresholdRatio below 2 treats almost anything above the noise floor as
 *   speech (too sensitive).
 * - thresholdRatio above 6 misses quiet utterance onsets/offsets.
 * - noiseFloorAlpha above 0.1 lets the floor track quickly (useful for
 *   non-stationary background) but risks drifting *up* towards a sustained
 *   low voice.
 */
class EnergyVad {
  /**
   * @param {object} [params]
   * @param {number} [params.thresholdRatio=3.0] RMS must exceed noiseFloor × this to count as active.
   * @param {number} [params.noiseFloorAlpha=0.05] Smoothing factor for the noise-floor update.
   * @param {number} [params.hangoverFrames=10] Frames kept active after the last above-threshold frame.
   * @param {number} [params.initialNoiseFloor=1e-4] Starting (and reset) noise-floor value.
   */
  constructor(params = {}) {
    this.hangoverRemaining = 0;
    this.thresholdRatio = params.thresholdRatio ?? 3.0;
    this.noiseFloorAlpha = params.noiseFloorAlpha ?? 0.05;
    this.hangoverFrames = params.hangoverFrames ?? 10;
    this.initialNoiseFloor = params.initialNoiseFloor ?? 1e-4;
    this.noiseFloor = this.initialNoiseFloor;
  }

  /**
   * Classify one frame of samples.
   *
   * @param {ArrayLike<number>} frame Samples for a single VAD frame.
   * @returns {{active: boolean, energy: number}} `energy` is the frame RMS
   *   (0 for an empty frame); `active` is true above threshold or during hangover.
   */
  process(frame) {
    const sampleCount = frame.length;
    let energySum = 0;
    for (let idx = 0; idx < sampleCount; idx++) {
      const value = frame[idx];
      energySum += value * value;
    }
    const rms = sampleCount > 0 ? Math.sqrt(energySum / sampleCount) : 0;

    // Above threshold: speech. Re-arm the hangover counter.
    if (rms > this.noiseFloor * this.thresholdRatio) {
      this.hangoverRemaining = this.hangoverFrames;
      return { active: true, energy: rms };
    }

    // Below threshold but still inside the hangover: report active and leave
    // the noise floor alone, because the RMS may still contain tail energy.
    if (this.hangoverRemaining > 0) {
      this.hangoverRemaining--;
      return { active: true, energy: rms };
    }

    // Quiet frame: fold it into the exponentially smoothed noise floor.
    this.noiseFloor =
      this.noiseFloor * (1 - this.noiseFloorAlpha) +
      rms * this.noiseFloorAlpha;
    return { active: false, energy: rms };
  }

  /** Restore the noise floor to its initial value and cancel any hangover. */
  reset() {
    this.noiseFloor = this.initialNoiseFloor;
    this.hangoverRemaining = 0;
  }
}
847
+
848
/**
 * Append-only buffer of VAD frames with a sliding retention window.
 *
 * Frames are stored in arrival order. When several channels share one
 * timeline, each channel appends its own frames with its own stream-relative
 * `ts`, so the array as a whole is NOT guaranteed to be sorted by `ts`.
 * Lookups therefore never assume global ordering.
 *
 * `pushFrame` is O(1) amortized; `framesInWindow` is O(n) over retained
 * frames, which is fine for per-word lookups (a 30 s window at 50 frames/s
 * per channel × 2 channels = 3000 entries, scanned once per word).
 *
 * Runtime-agnostic — no DOM or Web Audio dependencies.
 */
class VadTimeline {
  /**
   * @param {number} windowMs How far behind the newest pushed frame (in ms)
   *   frames are retained.
   */
  constructor(windowMs) {
    this.windowMs = windowMs;
    this.frames = [];
    // Index of the first retained frame; entries before it are logically evicted.
    this.head = 0;
  }

  /**
   * Append a frame and evict leading frames older than `frame.ts - windowMs`.
   *
   * @param {{ts: number, channel: string, active: boolean, rms: number}} frame
   */
  pushFrame(frame) {
    this.frames.push(frame);
    const cutoff = frame.ts - this.windowMs;
    while (this.head < this.frames.length &&
      this.frames[this.head].ts < cutoff) {
      this.head++;
    }
    // Compact only occasionally so eviction stays O(1) amortized.
    if (this.head > 1024 && this.head * 2 > this.frames.length) {
      this.frames = this.frames.slice(this.head);
      this.head = 0;
    }
  }

  /**
   * Return every retained frame whose `ts` lies in `[startMs, endMs]`
   * (both bounds inclusive), in arrival order.
   *
   * The scan deliberately visits all retained frames instead of stopping at
   * the first frame past `endMs`: frames from different channels are
   * interleaved in batches, so an early exit would drop in-window frames of
   * whichever channel happened to be appended later.
   *
   * @param {number} startMs
   * @param {number} endMs
   * @returns {Array<{ts: number, channel: string, active: boolean, rms: number}>}
   */
  framesInWindow(startMs, endMs) {
    const out = [];
    for (let i = this.head; i < this.frames.length; i++) {
      const f = this.frames[i];
      if (f.ts < startMs || f.ts > endMs)
        continue;
      out.push(f);
    }
    return out;
  }

  /** Drop every frame and reset the eviction cursor. */
  clear() {
    this.frames = [];
    this.head = 0;
  }
}
891
/**
 * Total the RMS of active VAD frames per channel.
 *
 * Inactive frames are ignored, so a channel with no active frame in `frames`
 * does not appear in the result at all.
 *
 * @param {Iterable<{channel: string, active: boolean, rms: number}>} frames
 * @returns {Map<string, number>} Channel name → summed RMS of its active frames.
 */
function scoreChannels(frames) {
  const totals = new Map();
  for (const frame of frames) {
    if (frame.active) {
      const previous = totals.get(frame.channel) ?? 0;
      totals.set(frame.channel, previous + frame.rms);
    }
  }
  return totals;
}
904
/**
 * Decide which channel was dominant during a word's `[start, end]` window.
 *
 * - No channel has an active VAD frame in the window → `"unknown"`.
 * - Only one channel has active frames → that channel.
 * - The top channel beats the runner-up by at least `dominanceRatio` → top channel.
 * - Otherwise the top channel still wins on absolute score; an exact tie
 *   that does not satisfy the ratio test → `"unknown"`.
 *
 * The scores are sorted in descending order first, so the runner-up can never
 * exceed the top score; no separate "runner-up wins" case is needed.
 *
 * @param {{start: number, end: number}} word Word timing in stream-relative ms.
 * @param {{framesInWindow(startMs: number, endMs: number): Array}} timeline
 * @param {{dominanceRatio: number}} params
 * @returns {string} A channel name or `"unknown"`.
 */
function attributeWord(word, timeline, params) {
  const scores = scoreChannels(timeline.framesInWindow(word.start, word.end));
  if (scores.size === 0)
    return "unknown";
  const ranked = [...scores.entries()].sort((a, b) => b[1] - a[1]);
  if (ranked.length === 1)
    return ranked[0][0];
  const [topName, topScore] = ranked[0];
  const [, runnerScore] = ranked[1];
  if (topScore >= params.dominanceRatio * runnerScore || topScore > runnerScore)
    return topName;
  return "unknown";
}
928
/**
 * Pick a turn-level channel by duration-weighted majority of its words.
 *
 * Words without a channel, or whose channel is `"unknown"`, are ignored.
 * Negative word durations are clamped to zero.
 *
 * @param {Array<{channel?: string, start: number, end: number}>} words
 * @returns {string} The channel with the largest total word duration, or
 *   `"unknown"` when no word carries a usable channel or the two largest
 *   totals are exactly equal.
 */
function rollUpTurnChannel(words) {
  const durationByChannel = new Map();
  for (const word of words) {
    const name = word.channel;
    if (name && name !== "unknown") {
      const span = Math.max(0, word.end - word.start);
      durationByChannel.set(name, (durationByChannel.get(name) ?? 0) + span);
    }
  }
  if (durationByChannel.size === 0) {
    return "unknown";
  }
  const ranked = [...durationByChannel.entries()].sort((a, b) => b[1] - a[1]);
  if (ranked.length === 1) {
    return ranked[0][0];
  }
  const [winner, winnerMs] = ranked[0];
  const runnerUpMs = ranked[1][1];
  return winnerMs === runnerUpMs ? "unknown" : winner;
}
951
/**
 * Annotate a turn in place with channel attribution.
 *
 * Every entry of `turn.words` gets its `channel` overwritten with the result
 * of `attributeWord`, then `turn.channel` is set to the duration-weighted
 * rollup of those words.
 *
 * Nothing is returned: the caller keeps the same `turn` object and forwards
 * it onward, so no copy is allocated.
 *
 * @param {{words: Array<object>, channel?: string}} turn
 * @param {object} timeline VAD timeline passed through to `attributeWord`.
 * @param {{dominanceRatio: number}} params
 * @returns {void}
 */
function attributeTurn(turn, timeline, params) {
  for (const word of turn.words) {
    const resolved = attributeWord(word, timeline, params);
    word.channel = resolved;
  }
  const rollup = rollUpTurnChannel(turn.words);
  turn.channel = rollup;
}
964
+
965
/**
 * Interpret any `AudioData` (ArrayBuffer / ArrayBufferView / typed array) as a
 * sequence of 16-bit PCM samples in the platform's native byte order.
 *
 * The common cases are zero-copy:
 * - an `Int16Array` is returned as-is;
 * - a view that starts on an even byte offset is re-viewed over the same buffer;
 * - an `ArrayBuffer` is wrapped directly.
 *
 * Two inputs that would make the `Int16Array` constructor throw a
 * `RangeError` are handled explicitly:
 * - a view starting on an odd byte offset (for example a byte-level
 *   `subarray`) is copied into a fresh, aligned buffer;
 * - a trailing odd byte (in a view or in an `ArrayBuffer`) is ignored.
 *
 * @param {ArrayBufferLike | ArrayBufferView} audio Raw PCM16 bytes.
 * @returns {Int16Array} Samples; shares memory with `audio` unless a copy was
 *   required for alignment.
 */
function toInt16View(audio) {
  if (audio instanceof Int16Array)
    return audio;
  if (ArrayBuffer.isView(audio)) {
    const sampleCount = Math.floor(audio.byteLength / 2);
    if (audio.byteOffset % 2 === 0) {
      return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
    }
    // Int16Array requires a 2-byte-aligned start offset; copy the bytes into a
    // new buffer (which starts at offset 0) and view that instead.
    const bytes = new Uint8Array(audio.buffer, audio.byteOffset, sampleCount * 2);
    return new Int16Array(bytes.slice().buffer);
  }
  // Raw buffer: drop a trailing odd byte rather than letting the constructor
  // throw, mirroring the view branch above.
  const byteLength = audio?.byteLength;
  if (typeof byteLength === "number" && byteLength % 2 !== 0) {
    return new Int16Array(audio, 0, (byteLength - 1) / 2);
  }
  return new Int16Array(audio);
}
784
981
  const defaultStreamingUrl$1 = "wss://streaming.assemblyai.com/v3/ws";
785
982
  const terminateSessionMessage = `{"type":"Terminate"}`;
983
+ /**
984
+ * Per-send chunk cap in milliseconds for the dual-channel mixer. The streaming
985
+ * server rejects audio messages longer than 1000 ms (`Input Duration Error`).
986
+ * If a backlog accumulates (e.g. when a browser tab is backgrounded and
987
+ * `setInterval` is throttled to ~1 Hz), `flushMix` loops and emits multiple
988
+ * sends each ≤ this cap until the buffers drain.
+ * The value is kept well below that 1000 ms server limit.
989
+ */
990
+ const MAX_CHUNK_MS = 200;
991
+ /**
992
+ * Per-send minimum chunk size in milliseconds. The streaming server also
993
+ * rejects audio messages shorter than 50 ms with the same
994
+ * `Input Duration Error`, so the mixer waits until both per-channel buffers
995
+ * have at least this much accumulated before emitting. Final-flush (close
996
+ * path) bypasses this floor so the trailing partial buffer still gets sent.
+ * The bypass is the `force` argument of `flushMix`.
997
+ */
998
+ const MIN_CHUNK_MS = 50;
786
999
  class StreamingTranscriber {
787
1000
  constructor(params) {
788
1001
  this.listeners = {};
1002
+ // Dual-channel mode state (allocated only when params.channels is set).
1003
+ this.isDualChannel = false;
1004
+ this.vadFrameSamples = 0;
1005
+ this.minChunkSamples = 0;
1006
+ this.maxChunkSamples = 0;
789
1007
  this.params = {
790
1008
  ...params,
791
1009
  websocketBaseUrl: params.websocketBaseUrl || defaultStreamingUrl$1,
@@ -797,6 +1015,42 @@ class StreamingTranscriber {
797
1015
  if (!(this.token || this.apiKey)) {
798
1016
  throw new Error("API key or temporary token is required.");
799
1017
  }
1018
+ if (params.channels) {
1019
+ if (params.channels.length !== 2) {
1020
+ throw new Error("StreamingTranscriber.channels must have exactly 2 entries.");
1021
+ }
1022
+ const names = params.channels.map((c) => c.name);
1023
+ if (new Set(names).size !== names.length) {
1024
+ throw new Error("StreamingTranscriber.channels names must be unique.");
1025
+ }
1026
+ this.isDualChannel = true;
1027
+ this.channelNames = names;
1028
+ const att = params.channelAttribution ?? {};
1029
+ this.attributionParams = {
1030
+ dominanceRatio: att.dominanceRatio ?? 4,
1031
+ timelineWindowMs: att.timelineWindowMs ?? 30_000,
1032
+ createVad: att.createVad ?? (() => new EnergyVad()),
1033
+ flushIntervalMs: att.flushIntervalMs ?? 50,
1034
+ resolveUnknownChannelsMethod: att.resolveUnknownChannelsMethod ?? "window",
1035
+ resolutionWindowWords: att.resolutionWindowWords ?? 2,
1036
+ speakerHistoryMinRmsEvidence: att.speakerHistoryMinRmsEvidence ?? 0.5,
1037
+ speakerHistoryDominanceRatio: att.speakerHistoryDominanceRatio ?? 3,
1038
+ };
1039
+ if (this.attributionParams.resolveUnknownChannelsMethod ===
1040
+ "speaker-history") {
1041
+ this.speakerHistory = new Map();
1042
+ }
1043
+ // 20 ms VAD frames at the transcriber's target sample rate.
1044
+ this.vadFrameSamples = Math.max(1, Math.round(params.sampleRate * 0.02));
1045
+ this.minChunkSamples = Math.max(1, Math.round(params.sampleRate * (MIN_CHUNK_MS / 1000)));
1046
+ this.maxChunkSamples = Math.max(this.minChunkSamples, Math.round(params.sampleRate * (MAX_CHUNK_MS / 1000)));
1047
+ this.channelBuffers = new Map(names.map((n) => [n, []]));
1048
+ this.channelSamplesReceived = new Map(names.map((n) => [n, 0]));
1049
+ this.channelVadFloatBuffers = new Map(names.map((n) => [n, new Float32Array(this.vadFrameSamples)]));
1050
+ this.channelVadBufferIdx = new Map(names.map((n) => [n, 0]));
1051
+ this.channelVads = new Map(names.map((n) => [n, this.attributionParams.createVad(n)]));
1052
+ this.timeline = new VadTimeline(this.attributionParams.timelineWindowMs);
1053
+ }
800
1054
  }
801
1055
  connectionUrl() {
802
1056
  const url = new URL(this.params.websocketBaseUrl ?? "");
@@ -846,13 +1100,18 @@ class StreamingTranscriber {
846
1100
  if (this.params.prompt) {
847
1101
  searchParams.set("prompt", this.params.prompt);
848
1102
  }
1103
+ if (this.params.agentContext) {
1104
+ searchParams.set("agent_context", this.params.agentContext);
1105
+ }
849
1106
  if (this.params.filterProfanity) {
850
1107
  searchParams.set("filter_profanity", this.params.filterProfanity.toString());
851
1108
  }
852
1109
  if (this.params.speechModel === "u3-pro") {
853
1110
  console.warn("[Deprecation Warning] The speech model `u3-pro` is deprecated and will be removed in a future release. Please use `u3-rt-pro` instead.");
854
1111
  }
855
- searchParams.set("speech_model", this.params.speechModel.toString());
1112
+ if (this.params.speechModel !== undefined) {
1113
+ searchParams.set("speech_model", this.params.speechModel.toString());
1114
+ }
856
1115
  if (this.params.languageDetection !== undefined) {
857
1116
  searchParams.set("language_detection", this.params.languageDetection.toString());
858
1117
  }
@@ -913,6 +1172,9 @@ class StreamingTranscriber {
913
1172
  if (this.params.redactPiiSub !== undefined) {
914
1173
  searchParams.set("redact_pii_sub", this.params.redactPiiSub);
915
1174
  }
1175
+ if (this.params.mode !== undefined) {
1176
+ searchParams.set("mode", this.params.mode);
1177
+ }
916
1178
  if (this.params.llmGateway !== undefined) {
917
1179
  searchParams.set("llm_gateway", JSON.stringify(this.params.llmGateway));
918
1180
  }
@@ -945,6 +1207,13 @@ class StreamingTranscriber {
945
1207
  reason = StreamingErrorMessages[code];
946
1208
  }
947
1209
  }
1210
+ // Stop the flush timer when the socket is gone (server-initiated close,
1211
+ // network drop, etc.) — otherwise subsequent ticks call send() on a
1212
+ // closed socket and spam the error listener.
1213
+ if (this.flushTimer) {
1214
+ clearInterval(this.flushTimer);
1215
+ this.flushTimer = undefined;
1216
+ }
948
1217
  this.listeners.close?.(code, reason);
949
1218
  };
950
1219
  this.socket.onerror = (event) => {
@@ -971,6 +1240,19 @@ class StreamingTranscriber {
971
1240
  break;
972
1241
  }
973
1242
  case "Turn": {
1243
+ if (this.isDualChannel && this.timeline && this.attributionParams) {
1244
+ attributeTurn(message, this.timeline, {
1245
+ dominanceRatio: this.attributionParams.dominanceRatio,
1246
+ });
1247
+ switch (this.attributionParams.resolveUnknownChannelsMethod) {
1248
+ case "window":
1249
+ this.resolveUnknownChannelsByWindow(message);
1250
+ break;
1251
+ case "speaker-history":
1252
+ this.resolveUnknownChannelsBySpeakerHistory(message);
1253
+ break;
1254
+ }
1255
+ }
974
1256
  this.listeners.turn?.(message);
975
1257
  break;
976
1258
  }
@@ -982,6 +1264,10 @@ class StreamingTranscriber {
982
1264
  this.listeners.llmGatewayResponse?.(message);
983
1265
  break;
984
1266
  }
1267
+ case "SpeakerRevision": {
1268
+ this.listeners.speakerRevision?.(message);
1269
+ break;
1270
+ }
985
1271
  case "Warning": {
986
1272
  const warning = message;
987
1273
  console.warn(`Streaming warning (code=${warning.warning_code}): ${warning.warning}`);
@@ -996,6 +1282,11 @@ class StreamingTranscriber {
996
1282
  };
997
1283
  });
998
1284
  }
1285
+ /**
1286
+ * Returns a WritableStream that pumps PCM chunks into `sendAudio`. Single-channel
1287
+ * only — in dual-channel mode use `sendAudio(pcm, { channel })` directly, since
1288
+ * `WritableStream` has no place to carry a channel tag.
1289
+ */
999
1290
  stream() {
1000
1291
  return new web.WritableStream({
1001
1292
  write: (chunk) => {
@@ -1003,8 +1294,235 @@ class StreamingTranscriber {
1003
1294
  },
1004
1295
  });
1005
1296
  }
1006
- sendAudio(audio) {
1007
- this.send(audio);
1297
+ /**
1298
+ * Send PCM audio.
1299
+ *
1300
+ * In single-channel mode, `audio` is forwarded directly to the WebSocket and
1301
+ * `options` is ignored.
1302
+ *
1303
+ * In dual-channel mode (when `channels` is configured), `options.channel` is
1304
+ * REQUIRED and must match one of the declared channel names. Per-channel PCM is
1305
+ * fed into that channel's VAD, accumulated into a per-channel ring buffer, and
1306
+ * a scheduled flush (`channelAttribution.flushIntervalMs`, default 50ms) mixes
1307
+ * the buffers into mono before sending to the WebSocket.
1308
+ */
1309
+ sendAudio(audio, options) {
1310
+ if (!this.isDualChannel) {
1311
+ this.send(audio);
1312
+ return;
1313
+ }
1314
+ if (!options?.channel) {
1315
+ throw new Error("StreamingTranscriber is in dual-channel mode; sendAudio requires { channel }.");
1316
+ }
1317
+ if (!this.channelNames.includes(options.channel)) {
1318
+ throw new Error(`Unknown channel "${options.channel}"; declared channels: ${this.channelNames.join(", ")}.`);
1319
+ }
1320
+ this.ingestChannelAudio(options.channel, audio);
1321
+ }
1322
+ ingestChannelAudio(name, audio) {
1323
+ const samples = toInt16View(audio);
1324
+ const buf = this.channelBuffers.get(name);
1325
+ const vadBuf = this.channelVadFloatBuffers.get(name);
1326
+ let vadIdx = this.channelVadBufferIdx.get(name);
1327
+ let received = this.channelSamplesReceived.get(name);
1328
+ const vad = this.channelVads.get(name);
1329
+ const sampleRate = this.params.sampleRate;
1330
+ const frameSize = this.vadFrameSamples;
1331
+ for (let i = 0; i < samples.length; i++) {
1332
+ const s = samples[i];
1333
+ buf.push(s);
1334
+ vadBuf[vadIdx++] = s / 0x8000;
1335
+ received++;
1336
+ if (vadIdx === frameSize) {
1337
+ const result = vad.process(vadBuf);
1338
+ const frame = {
1339
+ ts: (received / sampleRate) * 1000,
1340
+ channel: name,
1341
+ active: result.active,
1342
+ rms: result.energy,
1343
+ };
1344
+ this.timeline.pushFrame(frame);
1345
+ this.listeners.vad?.(frame);
1346
+ vadIdx = 0;
1347
+ }
1348
+ }
1349
+ this.channelVadBufferIdx.set(name, vadIdx);
1350
+ this.channelSamplesReceived.set(name, received);
1351
+ if (!this.flushTimer)
1352
+ this.startFlushTimer();
1353
+ }
1354
+ startFlushTimer() {
1355
+ this.flushTimer = setInterval(() => this.flushMix(), this.attributionParams.flushIntervalMs);
1356
+ }
1357
+ flushMix(force = false) {
1358
+ if (!this.channelNames || !this.channelBuffers)
1359
+ return;
1360
+ const bufs = this.channelNames.map((n) => this.channelBuffers.get(n));
1361
+ const divisor = bufs.length;
1362
+ // Loop so a backlog (e.g. accumulated while a browser tab was throttled in
1363
+ // the background) drains as multiple sends, each capped at MAX_CHUNK_MS.
1364
+ // Without the cap a single message could exceed the server's 1000 ms input
1365
+ // duration limit and be rejected with code 3007.
1366
+ for (;;) {
1367
+ let mixLen = Infinity;
1368
+ for (const b of bufs)
1369
+ if (b.length < mixLen)
1370
+ mixLen = b.length;
1371
+ if (!Number.isFinite(mixLen) || mixLen === 0)
1372
+ return;
1373
+ // The streaming server rejects audio messages shorter than 50 ms with
1374
+ // `Input Duration Error`. Wait until both per-channel buffers have at
1375
+ // least minChunkSamples worth queued before emitting. The `force` path
1376
+ // (final flush on close) bypasses this so the trailing partial buffer
1377
+ // still gets through.
1378
+ if (!force && mixLen < this.minChunkSamples)
1379
+ return;
1380
+ if (mixLen > this.maxChunkSamples)
1381
+ mixLen = this.maxChunkSamples;
1382
+ const out = new Int16Array(mixLen);
1383
+ for (let i = 0; i < mixLen; i++) {
1384
+ let sum = 0;
1385
+ for (let c = 0; c < divisor; c++)
1386
+ sum += bufs[c][i];
1387
+ const avg = Math.round(sum / divisor);
1388
+ out[i] = avg < -32768 ? -32768 : avg > 32767 ? 32767 : avg;
1389
+ }
1390
+ for (const b of bufs)
1391
+ b.splice(0, mixLen);
1392
+ try {
1393
+ this.send(out.buffer);
1394
+ }
1395
+ catch (err) {
1396
+ this.listeners.error?.(err);
1397
+ return;
1398
+ }
1399
+ }
1400
+ }
1401
+ /**
1402
+ * Fill in words whose per-word VAD attribution was `"unknown"` by looking
1403
+ * at the dominant non-`"unknown"` channel among ±N neighbors in the same
1404
+ * turn. Words with no non-`"unknown"` neighbors stay `"unknown"`. Confident
1405
+ * per-word VAD decisions are never modified.
1406
+ *
1407
+ * Local temporal heuristic — ignores `speaker_label`, so it works even when
1408
+ * AAI's diarization re-uses the same label for two physically distinct
1409
+ * voices. Each resolved word gets `channelResolved: true` so downstream
1410
+ * renderers can distinguish inferred channels from directly-measured ones.
1411
+ */
1412
+ resolveUnknownChannelsByWindow(turn) {
1413
+ if (!this.attributionParams)
1414
+ return;
1415
+ const window = this.attributionParams.resolutionWindowWords;
1416
+ const words = turn.words;
1417
+ let mutated = false;
1418
+ for (let i = 0; i < words.length; i++) {
1419
+ if (words[i].channel !== "unknown")
1420
+ continue;
1421
+ const tally = new Map();
1422
+ const lo = Math.max(0, i - window);
1423
+ const hi = Math.min(words.length - 1, i + window);
1424
+ for (let j = lo; j <= hi; j++) {
1425
+ if (j === i)
1426
+ continue;
1427
+ const ch = words[j].channel;
1428
+ if (!ch || ch === "unknown")
1429
+ continue;
1430
+ tally.set(ch, (tally.get(ch) ?? 0) + 1);
1431
+ }
1432
+ if (tally.size === 0)
1433
+ continue;
1434
+ // Pick the dominant neighbor channel. Ties → leave `"unknown"` (rare;
1435
+ // would require an equal count of mic and system neighbors).
1436
+ let top;
1437
+ let topCount = 0;
1438
+ let tied = false;
1439
+ for (const [name, count] of tally) {
1440
+ if (count > topCount) {
1441
+ top = name;
1442
+ topCount = count;
1443
+ tied = false;
1444
+ }
1445
+ else if (count === topCount) {
1446
+ tied = true;
1447
+ }
1448
+ }
1449
+ if (top && !tied) {
1450
+ words[i].channel = top;
1451
+ words[i].channelResolved = true;
1452
+ mutated = true;
1453
+ }
1454
+ }
1455
+ // Recompute the rollup only if any per-word channel changed.
1456
+ if (mutated)
1457
+ turn.channel = rollUpTurnChannel(words);
1458
+ }
1459
+ /**
1460
+ * Fill `"unknown"` words by looking up the speaker's session-wide channel
1461
+ * evidence. For each `speaker_label`, sums active VAD frame RMS per channel
1462
+ * across every word the speaker has uttered to date. A speaker is
1463
+ * "resolvable" if their total evidence clears
1464
+ * `speakerHistoryMinRmsEvidence` and their top channel exceeds the
1465
+ * runner-up by `speakerHistoryDominanceRatio`.
1466
+ *
1467
+ * Only touches `"unknown"` words. Confident per-word VAD decisions are
1468
+ * never modified. `speaker_label` is never modified.
1469
+ */
1470
+ resolveUnknownChannelsBySpeakerHistory(turn) {
1471
+ if (!this.timeline || !this.attributionParams || !this.speakerHistory)
1472
+ return;
1473
+ const minEvidence = this.attributionParams.speakerHistoryMinRmsEvidence;
1474
+ const dominanceRatio = this.attributionParams.speakerHistoryDominanceRatio;
1475
+ // 1. Accumulate evidence from this turn's words.
1476
+ for (const w of turn.words) {
1477
+ if (!w.speaker)
1478
+ continue;
1479
+ const frames = this.timeline.framesInWindow(w.start, w.end);
1480
+ let entry = this.speakerHistory.get(w.speaker);
1481
+ if (!entry) {
1482
+ entry = new Map();
1483
+ this.speakerHistory.set(w.speaker, entry);
1484
+ }
1485
+ for (const f of frames) {
1486
+ if (!f.active)
1487
+ continue;
1488
+ entry.set(f.channel, (entry.get(f.channel) ?? 0) + f.rms);
1489
+ }
1490
+ }
1491
+ // 2. Fill unknown words whose speakers have dominant evidence.
1492
+ let mutated = false;
1493
+ for (const w of turn.words) {
1494
+ if (w.channel !== "unknown" || !w.speaker)
1495
+ continue;
1496
+ const entry = this.speakerHistory.get(w.speaker);
1497
+ if (!entry || entry.size === 0)
1498
+ continue;
1499
+ let total = 0;
1500
+ let topName;
1501
+ let topScore = 0;
1502
+ let runnerScore = 0;
1503
+ for (const [name, score] of entry) {
1504
+ total += score;
1505
+ if (score > topScore) {
1506
+ runnerScore = topScore;
1507
+ topScore = score;
1508
+ topName = name;
1509
+ }
1510
+ else if (score > runnerScore) {
1511
+ runnerScore = score;
1512
+ }
1513
+ }
1514
+ if (total < minEvidence)
1515
+ continue;
1516
+ if (runnerScore > 0 && topScore < dominanceRatio * runnerScore)
1517
+ continue;
1518
+ if (topName) {
1519
+ w.channel = topName;
1520
+ w.channelResolved = true;
1521
+ mutated = true;
1522
+ }
1523
+ }
1524
+ if (mutated)
1525
+ turn.channel = rollUpTurnChannel(turn.words);
1008
1526
  }
1009
1527
  /**
1010
1528
  * Update the streaming configuration mid-stream.
@@ -1044,6 +1562,15 @@ class StreamingTranscriber {
1044
1562
  this.socket.send(data);
1045
1563
  }
1046
1564
  async close(waitForSessionTermination = true) {
1565
+ if (this.flushTimer) {
1566
+ clearInterval(this.flushTimer);
1567
+ this.flushTimer = undefined;
1568
+ // Best-effort: drain any final partial mix so the server gets the tail.
1569
+ // Bypass the 50ms floor here since this is the last flush; if the tail
1570
+ // is <50ms the server will reject that single message, but we'd lose
1571
+ // the audio either way.
1572
+ this.flushMix(true);
1573
+ }
1047
1574
  if (this.socket) {
1048
1575
  if (this.socket.readyState === this.socket.OPEN) {
1049
1576
  if (waitForSessionTermination) {
@@ -1095,6 +1622,249 @@ class StreamingTranscriberFactory extends BaseService {
1095
1622
  }
1096
1623
  }
1097
1624
 
1625
/**
 * Name under which the PCM16 encoder worklet registers itself. It is
 * interpolated into the worklet source below so the name passed to
 * `registerProcessor` and the name later handed to `new AudioWorkletNode`
 * come from a single definition and cannot drift apart.
 */
const PCM16_ENCODER_PROCESSOR_NAME = "aai-pcm16-encoder";
/**
 * Source text of an AudioWorklet processor that reads the first channel of its
 * first input as Float32 audio at the AudioContext's native `sampleRate`,
 * resamples it to `targetRate` by linear interpolation (the last input sample
 * and the fractional read position are carried across `process()` calls),
 * clamps to [-1, 1], stores the result in an `Int16Array`, and posts fixed-size
 * chunks via `port.postMessage` as `{ pcm: ArrayBuffer, samplesSent: number }`,
 * transferring the buffer.
 *
 * Processor options (`processorOptions`):
 * - `targetRate`: output sample rate in Hz; falsy values fall back to 16000.
 * - `chunkMs`: chunk duration in milliseconds; falsy values fall back to 50.
 *
 * `samplesSent` is in **target-rate samples** and already includes the chunk it
 * is posted with, so the main thread can derive a stream-relative timestamp as
 * `samplesSent / targetRate * 1000` (ms).
 *
 * Notes:
 * - Interpolation at read position `pos` blends input samples `floor(pos) - 1`
 *   and `floor(pos)`, so the output lags the input by one source sample.
 * - A chunk is only posted once it is full; the processor has no flush path,
 *   so a trailing partial chunk is never emitted.
 *
 * Defined as a string so it can be registered via a Blob URL — the SDK ships as
 * a single file, so a separate `.js` worklet asset isn't viable.
 */
const pcm16EncoderWorkletSource = `
class Pcm16EncoderProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super();
    const opts = (options && options.processorOptions) || {};
    this.targetRate = opts.targetRate || 16000;
    this.chunkMs = opts.chunkMs || 50;
    this.ratio = sampleRate / this.targetRate;
    this.chunkSize = Math.round(this.targetRate * this.chunkMs / 1000);
    this.buffer = new Int16Array(this.chunkSize);
    this.bufferIdx = 0;
    this.samplesSent = 0;
    this.lastSample = 0;
    this.fractional = 0;
  }

  process(inputs) {
    const input = inputs[0];
    if (!input || input.length === 0 || !input[0] || input[0].length === 0) {
      return true;
    }
    const mono = input[0];
    let pos = this.fractional;
    while (pos < mono.length) {
      const i = Math.floor(pos);
      const frac = pos - i;
      const a = i === 0 ? this.lastSample : mono[i - 1];
      const b = mono[i];
      const sample = a + (b - a) * frac;
      const clamped = sample < -1 ? -1 : sample > 1 ? 1 : sample;
      this.buffer[this.bufferIdx++] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
      if (this.bufferIdx === this.chunkSize) {
        const out = new Int16Array(this.chunkSize);
        out.set(this.buffer);
        this.samplesSent += this.chunkSize;
        this.port.postMessage(
          { pcm: out.buffer, samplesSent: this.samplesSent },
          [out.buffer],
        );
        this.bufferIdx = 0;
      }
      pos += this.ratio;
    }
    this.lastSample = mono[mono.length - 1];
    this.fractional = pos - mono.length;
    return true;
  }
}
registerProcessor("${PCM16_ENCODER_PROCESSOR_NAME}", Pcm16EncoderProcessor);
`;
1689
+
1690
/** Sample rate (Hz) the encoders resample to when the caller gives none. */
const DEFAULT_TARGET_RATE = 16_000;
/** Duration (ms) of each PCM chunk requested from the encoder worklet. */
const DEFAULT_CHUNK_MS = 50;
/** Channel tag attached to audio coming from `micStream`. */
const MIC_CHANNEL = "mic";
/** Channel tag attached to audio coming from `systemStream`. */
const SYSTEM_CHANNEL = "system";
/**
 * Browser-only adapter that pumps two `MediaStream`s into a transcriber.
 * Each stream is wired through its own PCM16 encoder AudioWorklet node, and
 * every chunk the worklet posts is forwarded via
 * `transcriber.sendAudio(pcm, { channel })` with channel `"mic"` or `"system"`.
 *
 * This class only moves audio; it performs no mixing or attribution itself.
 *
 * Caller responsibilities:
 * - **Echo cancellation** is set at `getUserMedia` time (`audio: { echoCancellation: true }`).
 * - **System-audio capture** is platform-dependent. Chrome's `getDisplayMedia({ audio: true })`
 *   captures tab audio (and on Windows, full system audio when sharing the whole screen).
 *   macOS requires a virtual loopback driver (e.g. BlackHole) to expose system audio at all.
 * - **Token auth.** Construct the transcriber with `token` — API-key auth is unsupported in browsers.
 * - **Stream ownership.** `stop()` tears down the AudioContext but does NOT stop the
 *   `MediaStreamTrack`s passed in — callers own those.
 */
class DualChannelCapture {
  /**
   * @param {object} params
   * @param {MediaStream} params.micStream - Stream tagged as the `"mic"` channel.
   * @param {MediaStream} params.systemStream - Stream tagged as the `"system"` channel.
   * @param {object} params.transcriber - Receiver; must expose `sendAudio(pcm, { channel })`.
   * @param {number} [params.targetSampleRate=16000] - Rate the encoders resample to.
   * @throws {BrowserOnlyError} When `globalThis.AudioContext` is not defined.
   */
  constructor(params) {
    this.running = false;
    // True while a start() call is between its first line and its settlement.
    this.starting = false;
    if (typeof globalThis.AudioContext === "undefined") {
      throw new BrowserOnlyError();
    }
    this.params = {
      micStream: params.micStream,
      systemStream: params.systemStream,
      transcriber: params.transcriber,
      targetSampleRate: params.targetSampleRate ?? DEFAULT_TARGET_RATE,
    };
  }

  /**
   * Register a listener. Only the `"error"` event is recognised; a later
   * registration replaces the earlier one. Other event names are ignored.
   *
   * @param {string} event
   * @param {(err: unknown) => void} listener
   */
  on(event, listener) {
    if (event === "error") {
      this.errorListener = listener;
    }
  }

  /**
   * Wire the capture pipeline and start pumping tagged PCM into the transcriber.
   * Resolves once the worklet module is registered and both sources are
   * connected to their encoders.
   *
   * If any step fails, everything created so far (including the AudioContext)
   * is released before the original error is rethrown, so a failed start does
   * not leak an audio context and `start()` may simply be retried.
   *
   * @throws {Error} "DualChannelCapture already started" when capture is
   *   running or another `start()` call is still in flight.
   */
  async start() {
    if (this.running || this.starting) {
      throw new Error("DualChannelCapture already started");
    }
    // Set before the first await so an overlapping start() is rejected instead
    // of building a second AudioContext.
    this.starting = true;
    try {
      this.context = new AudioContext();
      const blob = new Blob([pcm16EncoderWorkletSource], {
        type: "application/javascript",
      });
      const url = URL.createObjectURL(blob);
      try {
        await this.context.audioWorklet.addModule(url);
      } finally {
        URL.revokeObjectURL(url);
      }
      this.micSource = this.context.createMediaStreamSource(this.params.micStream);
      this.sysSource = this.context.createMediaStreamSource(this.params.systemStream);
      this.micEncoder = this.makeEncoder(MIC_CHANNEL);
      this.sysEncoder = this.makeEncoder(SYSTEM_CHANNEL);
      this.micSource.connect(this.micEncoder);
      this.sysSource.connect(this.sysEncoder);
      this.running = true;
    } catch (err) {
      try {
        await this.teardownGraph();
      } catch {
        // A failure while cleaning up must not mask the error that caused it.
      }
      throw err;
    } finally {
      this.starting = false;
    }
  }

  /**
   * Create one encoder worklet node whose posted chunks are forwarded to the
   * transcriber under `channel`. Errors thrown by `sendAudio` are passed to the
   * registered `"error"` listener; with no listener they are dropped so the
   * audio pipeline keeps running.
   *
   * @param {string} channel - Channel tag to attach to every chunk.
   * @returns {AudioWorkletNode}
   */
  makeEncoder(channel) {
    const node = new AudioWorkletNode(this.context, PCM16_ENCODER_PROCESSOR_NAME, {
      numberOfInputs: 1,
      numberOfOutputs: 0,
      channelCount: 1,
      channelCountMode: "explicit",
      channelInterpretation: "speakers",
      processorOptions: {
        targetRate: this.params.targetSampleRate,
        chunkMs: DEFAULT_CHUNK_MS,
      },
    });
    node.port.onmessage = (e) => {
      try {
        this.params.transcriber.sendAudio(e.data.pcm, { channel });
      } catch (err) {
        this.errorListener?.(err);
      }
    };
    return node;
  }

  /**
   * Tear down internal nodes and close the AudioContext. Does NOT stop the
   * caller-provided MediaStream tracks — they remain available for preview UI,
   * recording, etc. Idempotent: a call while not running returns immediately.
   */
  async stop() {
    if (!this.running) {
      return;
    }
    this.running = false;
    await this.teardownGraph();
  }

  /**
   * Release every node and the AudioContext held by this instance. Safe to
   * call on a partially built graph.
   *
   * References are cleared up front so the instance is left clean even if
   * `context.close()` rejects, and each teardown step runs in isolation so one
   * throwing call cannot prevent the remaining nodes from being released.
   *
   * @private
   */
  async teardownGraph() {
    const { context, micSource, sysSource, micEncoder, sysEncoder } = this;
    this.context = undefined;
    this.micSource = undefined;
    this.sysSource = undefined;
    this.micEncoder = undefined;
    this.sysEncoder = undefined;
    const steps = [
      () => micEncoder?.port.close(),
      () => sysEncoder?.port.close(),
      () => micEncoder?.disconnect(),
      () => sysEncoder?.disconnect(),
      () => micSource?.disconnect(),
      () => sysSource?.disconnect(),
    ];
    for (const step of steps) {
      try {
        step();
      } catch {
        // Disconnecting already-disconnected nodes throws in some browsers; ignore.
      }
    }
    if (context && context.state !== "closed") {
      await context.close();
    }
  }
}
1810
+
1811
/**
 * Linear-interpolation resampler for streaming Float32 audio. Stateful across
 * `process()` calls so chunk boundaries don't introduce discontinuities: the
 * last input sample and a fractional read position are carried over.
 *
 * A read position `pos` blends input samples `floor(pos) - 1` and `floor(pos)`,
 * so the output lags the input by one source sample; the very first output of
 * a fresh (or reset) instance interpolates from an implicit leading 0.
 *
 * No low-pass filtering is applied here; this is plain linear interpolation.
 */
class LinearResampler {
  /**
   * @param {number} sourceRate - Sample rate of the audio passed to `process()`.
   * @param {number} targetRate - Sample rate of the audio `process()` returns.
   * @throws {Error} When either rate is not a positive, finite number.
   */
  constructor(sourceRate, targetRate) {
    // `rate > 0 && rate < Infinity` is false for NaN, undefined, null, zero,
    // negatives and Infinity. A plain `rate <= 0` check lets NaN/undefined/
    // Infinity through, which yields a NaN or zero ratio and a resampler that
    // silently emits nothing (or never advances).
    const isUsableRate = (rate) => rate > 0 && rate < Infinity;
    if (!isUsableRate(sourceRate) || !isUsableRate(targetRate)) {
      throw new Error("sourceRate and targetRate must be positive");
    }
    this.sourceRate = sourceRate;
    this.targetRate = targetRate;
    this.lastSample = 0;
    this.fractional = 0;
    // Source samples consumed per output sample.
    this.ratio = sourceRate / targetRate;
  }

  /**
   * Resample one chunk.
   *
   * @param {Float32Array} input - Samples at `sourceRate`.
   * @returns {Float32Array} Samples at `targetRate`. When both rates are equal
   *   the SAME `input` object is returned (no copy). Otherwise the result is a
   *   `subarray` view, so its underlying buffer may be longer than the view.
   */
  process(input) {
    if (this.sourceRate === this.targetRate) {
      return input;
    }
    // Worst-case output length; we'll slice to actual.
    const out = new Float32Array(Math.ceil(input.length / this.ratio) + 1);
    let outIdx = 0;
    let pos = this.fractional;
    while (pos < input.length) {
      const i = Math.floor(pos);
      const frac = pos - i;
      // Index 0 needs the sample before this chunk: the carried-over one.
      const a = i === 0 ? this.lastSample : input[i - 1];
      const b = input[i];
      out[outIdx++] = a + (b - a) * frac;
      pos += this.ratio;
    }
    // An empty chunk leaves the carried-over sample untouched.
    this.lastSample = input[input.length - 1] ?? this.lastSample;
    // Read position relative to the start of the NEXT chunk.
    this.fractional = pos - input.length;
    return out.subarray(0, outIdx);
  }

  /** Forget carried-over state so the next chunk is treated as a new stream. */
  reset() {
    this.lastSample = 0;
    this.fractional = 0;
  }
}
1857
/**
 * Encode Float32 PCM samples as 16-bit signed little-endian PCM.
 *
 * Each sample is clamped to [-1, 1]; negative values are scaled by 0x8000 and
 * non-negative values by 0x7fff, then written with `DataView.setInt16`.
 *
 * @param {ArrayLike<number>} input - Samples, nominally in the range -1..1.
 * @returns {ArrayBuffer} Buffer of `input.length * 2` bytes.
 */
function float32ToPcm16(input) {
  const sampleCount = input.length;
  const pcm = new ArrayBuffer(sampleCount * 2);
  const writer = new DataView(pcm);
  let byteOffset = 0;
  for (let n = 0; n < sampleCount; n += 1) {
    const sample = Math.max(-1, Math.min(1, input[n]));
    let scaled;
    if (sample < 0) {
      scaled = sample * 0x8000;
    } else {
      scaled = sample * 0x7fff;
    }
    writer.setInt16(byteOffset, scaled, true);
    byteOffset += 2;
  }
  return pcm;
}
1867
+
1098
1868
  const defaultBaseUrl = "https://api.assemblyai.com";
1099
1869
  const defaultStreamingUrl = "https://streaming.assemblyai.com";
1100
1870
  class AssemblyAI {
@@ -1119,11 +1889,20 @@ class AssemblyAI {
1119
1889
  }
1120
1890
 
1121
1891
  exports.AssemblyAI = AssemblyAI;
1892
+ exports.BrowserOnlyError = BrowserOnlyError;
1893
+ exports.DualChannelCapture = DualChannelCapture;
1894
+ exports.EnergyVad = EnergyVad;
1122
1895
  exports.FileService = FileService;
1123
1896
  exports.LemurService = LemurService;
1897
+ exports.LinearResampler = LinearResampler;
1124
1898
  exports.RealtimeService = RealtimeService;
1125
1899
  exports.RealtimeServiceFactory = RealtimeServiceFactory;
1126
1900
  exports.RealtimeTranscriber = RealtimeTranscriber;
1127
1901
  exports.RealtimeTranscriberFactory = RealtimeTranscriberFactory;
1128
1902
  exports.StreamingTranscriber = StreamingTranscriber;
1129
1903
  exports.TranscriptService = TranscriptService;
1904
+ exports.VadTimeline = VadTimeline;
1905
+ exports.attributeTurn = attributeTurn;
1906
+ exports.attributeWord = attributeWord;
1907
+ exports.float32ToPcm16 = float32ToPcm16;
1908
+ exports.rollUpTurnChannel = rollUpTurnChannel;