assemblyai 4.33.3 → 4.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assemblyai.streaming.umd.js +1279 -3
- package/dist/assemblyai.streaming.umd.min.js +1 -1
- package/dist/assemblyai.umd.js +786 -3
- package/dist/assemblyai.umd.min.js +1 -1
- package/dist/browser.mjs +762 -4
- package/dist/bun.mjs +762 -4
- package/dist/deno.mjs +762 -4
- package/dist/exports/streaming.d.ts +7 -0
- package/dist/index.cjs +786 -3
- package/dist/index.mjs +778 -4
- package/dist/node.cjs +770 -3
- package/dist/node.mjs +762 -4
- package/dist/services/index.d.ts +2 -2
- package/dist/services/streaming/browser/dual-channel-capture.d.ts +66 -0
- package/dist/services/streaming/browser/worklets/pcm16-encoder.d.ts +19 -0
- package/dist/services/streaming/energy-vad.d.ts +35 -0
- package/dist/services/streaming/index.d.ts +4 -0
- package/dist/services/streaming/label-mapper.d.ts +44 -0
- package/dist/services/streaming/resampler.d.ts +22 -0
- package/dist/services/streaming/service.d.ts +69 -1
- package/dist/streaming.browser.mjs +1235 -4
- package/dist/streaming.cjs +1275 -3
- package/dist/streaming.mjs +1264 -4
- package/dist/types/streaming/dual-channel.d.ts +48 -0
- package/dist/types/streaming/index.d.ts +110 -1
- package/dist/workerd.mjs +762 -4
- package/package.json +1 -1
- package/src/exports/streaming.ts +7 -0
- package/src/services/index.ts +20 -1
- package/src/services/streaming/browser/dual-channel-capture.ts +177 -0
- package/src/services/streaming/browser/worklets/pcm16-encoder.ts +70 -0
- package/src/services/streaming/energy-vad.ts +75 -0
- package/src/services/streaming/index.ts +4 -0
- package/src/services/streaming/label-mapper.ts +128 -0
- package/src/services/streaming/resampler.ts +69 -0
- package/src/services/streaming/service.ts +385 -2
- package/src/types/streaming/dual-channel.ts +57 -0
- package/src/types/streaming/index.ts +110 -0
package/dist/streaming.mjs
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
import ws from 'ws';
|
|
2
2
|
|
|
3
|
+
/**
 * Thrown when `DualChannelCapture` is constructed in a non-browser environment
 * (no `globalThis.AudioContext`). The helper is intentionally surfaced from the
 * main entrypoint so the import path is uniform across runtimes; the runtime
 * guard moves to construction time.
 */
class BrowserOnlyError extends Error {
    /**
     * @param {string} [message] Overrides the default explanation.
     */
    constructor(message = "DualChannelCapture requires a browser environment (AudioContext is undefined).") {
        super(message);
        // Give the error a distinguishable name in logs and stack traces.
        this.name = "BrowserOnlyError";
    }
}
|
|
15
|
+
|
|
3
16
|
/******************************************************************************
|
|
4
17
|
Copyright (c) Microsoft Corporation.
|
|
5
18
|
|
|
@@ -17,6 +30,18 @@ PERFORMANCE OF THIS SOFTWARE.
|
|
|
17
30
|
/* global Reflect, Promise, SuppressedError, Symbol, Iterator */
|
|
18
31
|
|
|
19
32
|
|
|
33
|
+
/**
 * Object-rest helper: shallow-copies the own enumerable properties of `s`,
 * skipping every key listed in `e`.
 *
 * @param {object|null|undefined} s Source object; `null`/`undefined` yields `{}`.
 * @param {Array<string|symbol>} e Keys to leave out of the copy.
 * @returns {object} A new plain object holding the remaining properties.
 */
function __rest(s, e) {
    const rest = {};
    // String keys: for-in also walks inherited keys, so filter to own ones.
    for (const key in s) {
        if (Object.prototype.hasOwnProperty.call(s, key) && e.indexOf(key) < 0) {
            rest[key] = s[key];
        }
    }
    // Symbol keys are invisible to for-in; copy the enumerable ones explicitly.
    if (s != null && typeof Object.getOwnPropertySymbols === "function") {
        for (const sym of Object.getOwnPropertySymbols(s)) {
            if (e.indexOf(sym) < 0 && Object.prototype.propertyIsEnumerable.call(s, sym)) {
                rest[sym] = s[sym];
            }
        }
    }
    return rest;
}
|
|
44
|
+
|
|
20
45
|
function __awaiter(thisArg, _arguments, P, generator) {
|
|
21
46
|
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
22
47
|
return new (P || (P = Promise))(function (resolve, reject) {
|
|
@@ -95,9 +120,58 @@ const RealtimeErrorMessages = {
|
|
|
95
120
|
/**
 * Error type for the real-time transcriber. Adds no members of its own, so it
 * behaves exactly like `Error` apart from `instanceof` identity.
 */
class RealtimeError extends Error {
}
|
|
97
122
|
|
|
123
|
+
/**
 * Numeric codes for streaming-session failures, keyed by a readable name.
 * The same numbers are used as keys of `StreamingErrorMessages`.
 */
const StreamingErrorType = {
    // 40xx / 41xx codes
    BadSampleRate: 4000,
    AuthFailed: 4001,
    InsufficientFunds: 4002,
    FreeTierUser: 4003,
    NonexistentSessionId: 4004,
    SessionExpired: 4008,
    ClosedSession: 4010,
    RateLimited: 4029,
    UniqueSessionViolation: 4030,
    SessionTimeout: 4031,
    AudioTooShort: 4032,
    AudioTooLong: 4033,
    AudioTooSmallToTranscode: 4034,
    BadSchema: 4101,
    TooManyStreams: 4102,
    Reconnected: 4103,
    // 30xx codes
    ServerError: 3005,
    InputValidationError: 3006,
    AudioChunkDurationViolation: 3007,
    MaxSessionDurationExceeded: 3008,
    ConcurrencyLimitExceeded: 3009,
};
|
|
146
|
+
/**
 * Human-readable text for each `StreamingErrorType` code. The socket close
 * handler falls back to this table when a close event carries no reason.
 */
const StreamingErrorMessages = {
    [StreamingErrorType.ServerError]: "Server error",
    [StreamingErrorType.InputValidationError]: "Input validation error",
    [StreamingErrorType.AudioChunkDurationViolation]: "Audio chunk duration violation",
    [StreamingErrorType.MaxSessionDurationExceeded]: "Session expired: maximum session duration exceeded",
    [StreamingErrorType.ConcurrencyLimitExceeded]: "Too many concurrent sessions",
    [StreamingErrorType.BadSampleRate]: "Sample rate must be a positive integer",
    [StreamingErrorType.AuthFailed]: "Not Authorized",
    [StreamingErrorType.InsufficientFunds]: "Insufficient funds",
    [StreamingErrorType.FreeTierUser]: "This feature is paid-only and requires you to add a credit card. Please visit https://app.assemblyai.com/ to add a credit card to your account.",
    [StreamingErrorType.NonexistentSessionId]: "Session ID does not exist",
    [StreamingErrorType.SessionExpired]: "Session has expired",
    [StreamingErrorType.ClosedSession]: "Session is closed",
    [StreamingErrorType.RateLimited]: "Rate limited",
    [StreamingErrorType.UniqueSessionViolation]: "Unique session violation",
    [StreamingErrorType.SessionTimeout]: "Session Timeout",
    [StreamingErrorType.AudioTooShort]: "Audio too short",
    [StreamingErrorType.AudioTooLong]: "Audio too long",
    [StreamingErrorType.AudioTooSmallToTranscode]: "Audio too small to transcode",
    [StreamingErrorType.BadSchema]: "Bad schema",
    [StreamingErrorType.TooManyStreams]: "Too many streams",
    [StreamingErrorType.Reconnected]: "This session has been reconnected. This WebSocket is no longer valid.",
};
|
|
169
|
+
/**
 * Error delivered to the `error` listener when a server message contains an
 * `error` field. The class declares no members; the message handler assigns
 * the server's `error_code` to `code` on the instance when one is present.
 */
class StreamingError extends Error {
}
|
|
171
|
+
|
|
98
172
|
const defaultRealtimeUrl = "wss://api.assemblyai.com/v2/realtime/ws";
|
|
99
173
|
const forceEndOfUtteranceMessage = `{"force_end_utterance":true}`;
|
|
100
|
-
const terminateSessionMessage = `{"terminate_session":true}`;
|
|
174
|
+
const terminateSessionMessage$1 = `{"terminate_session":true}`;
|
|
101
175
|
/**
|
|
102
176
|
* RealtimeTranscriber connects to the Streaming Speech-to-Text API and lets you transcribe audio in real-time.
|
|
103
177
|
*/
|
|
@@ -292,11 +366,11 @@ class RealtimeTranscriber {
|
|
|
292
366
|
const sessionTerminatedPromise = new Promise((resolve) => {
|
|
293
367
|
this.sessionTerminatedResolve = resolve;
|
|
294
368
|
});
|
|
295
|
-
this.socket.send(terminateSessionMessage);
|
|
369
|
+
this.socket.send(terminateSessionMessage$1);
|
|
296
370
|
yield sessionTerminatedPromise;
|
|
297
371
|
}
|
|
298
372
|
else {
|
|
299
|
-
this.socket.send(terminateSessionMessage);
|
|
373
|
+
this.socket.send(terminateSessionMessage$1);
|
|
300
374
|
}
|
|
301
375
|
}
|
|
302
376
|
if ((_a = this.socket) === null || _a === void 0 ? void 0 : _a.removeAllListeners)
|
|
@@ -314,4 +388,1190 @@ class RealtimeTranscriber {
|
|
|
314
388
|
/**
 * Subclass of `RealtimeTranscriber` that adds no members of its own; it is the
 * same transcriber under a second class name.
 */
class RealtimeService extends RealtimeTranscriber {
}
|
|
316
390
|
|
|
317
|
-
|
|
391
|
+
/**
 * Energy-based VAD with adaptive noise-floor tracking and hangover. Pure JS,
 * no dependencies. Suitable for the "which physical channel is speaking" task
 * because the channels are already physically separated at capture — the harder
 * problem (speech vs. non-speech in the wild) is one a customer can swap in a
 * DNN VAD for via the `createVad` parameter.
 *
 * Tuning notes:
 *  - thresholdRatio below 2 will treat anything above noise as speech (too sensitive).
 *  - thresholdRatio above 6 will miss quiet utterance onsets/offsets.
 *  - noiseFloorAlpha above 0.1 makes the floor track quickly (good for non-stationary
 *    background) but risks slowly adapting *up* to a sustained low voice.
 */
class EnergyVad {
    /**
     * @param {object} [params]
     * @param {number} [params.thresholdRatio=3.0] Multiple of the noise floor a frame's RMS must exceed to count as active.
     * @param {number} [params.noiseFloorAlpha=0.05] Smoothing weight given to the newest inactive frame when updating the floor.
     * @param {number} [params.hangoverFrames=10] Number of frames reported active after the last frame that exceeded the threshold.
     * @param {number} [params.initialNoiseFloor=1e-4] Noise floor used at construction and after `reset()`.
     */
    constructor(params = {}) {
        this.hangoverRemaining = 0;
        // `null` and `undefined` both select the default; 0 is kept as given.
        this.thresholdRatio = params.thresholdRatio != null ? params.thresholdRatio : 3.0;
        this.noiseFloorAlpha = params.noiseFloorAlpha != null ? params.noiseFloorAlpha : 0.05;
        this.hangoverFrames = params.hangoverFrames != null ? params.hangoverFrames : 10;
        this.initialNoiseFloor = params.initialNoiseFloor != null ? params.initialNoiseFloor : 1e-4;
        this.noiseFloor = this.initialNoiseFloor;
    }
    /**
     * Classify one frame of samples.
     *
     * @param {ArrayLike<number>} frame Samples of a single frame; an empty frame has energy 0.
     * @returns {{active: boolean, energy: number}} `energy` is the frame's RMS;
     *   `active` is true when that RMS exceeds the threshold or hangover is still running.
     */
    process(frame) {
        let sumOfSquares = 0;
        for (let i = 0; i < frame.length; i++) {
            sumOfSquares += frame[i] * frame[i];
        }
        const rms = frame.length > 0 ? Math.sqrt(sumOfSquares / frame.length) : 0;
        const aboveThreshold = rms > this.noiseFloor * this.thresholdRatio;
        if (aboveThreshold) {
            // Re-arm the hangover on every frame that crosses the threshold.
            this.hangoverRemaining = this.hangoverFrames;
            return { active: true, energy: rms };
        }
        if (this.hangoverRemaining > 0) {
            // While in hangover, do not update noise floor — RMS may still reflect tail energy.
            this.hangoverRemaining--;
            return { active: true, energy: rms };
        }
        // Inactive frame: move the floor toward this frame's RMS (exponential smoothing).
        this.noiseFloor =
            this.noiseFloor * (1 - this.noiseFloorAlpha) +
                rms * this.noiseFloorAlpha;
        return { active: false, energy: rms };
    }
    /** Restore the initial noise floor and cancel any running hangover. */
    reset() {
        this.noiseFloor = this.initialNoiseFloor;
        this.hangoverRemaining = 0;
    }
}
|
|
442
|
+
|
|
443
|
+
/**
 * Append-only ring buffer of VAD frames in stream-relative ms order.
 * `pushFrame` is O(1) amortized; `framesInWindow` is O(n) over kept frames,
 * which is fine for the per-word lookups we do (a 30 s window at 50 frames/s
 * per channel × 2 channels = 3000 entries, scanned once per word).
 *
 * Runtime-agnostic — no DOM or Web Audio dependencies.
 *
 * NOTE(review): both eviction and the early exit in `framesInWindow` rely on
 * frames being pushed in non-decreasing `ts` order — confirm that callers
 * feeding several channels preserve that ordering.
 */
class VadTimeline {
    /**
     * @param {number} windowMs How far behind the newest frame's `ts` a frame may lag before it is evicted.
     */
    constructor(windowMs) {
        this.windowMs = windowMs;
        this.frames = [];
        // Index of the oldest live frame; entries before it are logically evicted.
        this.head = 0;
    }
    /**
     * Append a frame and evict frames older than `frame.ts - windowMs`.
     *
     * @param {{ts: number}} frame Frame to store; only `ts` is read here.
     */
    pushFrame(frame) {
        this.frames.push(frame);
        const oldestAllowedTs = frame.ts - this.windowMs;
        while (this.head < this.frames.length &&
            this.frames[this.head].ts < oldestAllowedTs) {
            this.head++;
        }
        // Compact only once the dead prefix is large and more than half of the
        // array, so the copy cost is amortized over many pushes.
        const deadPrefixIsLarge = this.head > 1024;
        const deadPrefixDominates = this.head * 2 > this.frames.length;
        if (deadPrefixIsLarge && deadPrefixDominates) {
            this.frames = this.frames.slice(this.head);
            this.head = 0;
        }
    }
    /**
     * Collect the live frames whose `ts` lies in `[startMs, endMs]` (inclusive).
     *
     * @param {number} startMs
     * @param {number} endMs
     * @returns {Array<object>} Matching frames in storage order.
     */
    framesInWindow(startMs, endMs) {
        const matches = [];
        for (let i = this.head; i < this.frames.length; i++) {
            const candidate = this.frames[i];
            if (candidate.ts > endMs) {
                // Stop at the first frame past the window.
                break;
            }
            if (candidate.ts >= startMs) {
                matches.push(candidate);
            }
        }
        return matches;
    }
    /** Drop every stored frame. */
    clear() {
        this.frames = [];
        this.head = 0;
    }
}
|
|
486
|
+
/**
 * Sum per-channel active RMS over a window. Returns a Map from channel name
 * to total score. Channels with zero score are omitted.
 *
 * @param {Iterable<{channel: string, active: boolean, rms: number}>} frames
 * @returns {Map<string, number>} Strictly positive totals, keyed by channel.
 */
function scoreChannels(frames) {
    const totals = new Map();
    for (const frame of frames) {
        if (!frame.active) {
            continue;
        }
        const soFar = totals.has(frame.channel) ? totals.get(frame.channel) : 0;
        totals.set(frame.channel, soFar + frame.rms);
    }
    // An active frame can carry rms 0 (e.g. a hangover frame over digital
    // silence). Such a channel has no energy evidence, so drop it; this also
    // discards NaN totals, which would otherwise make score ordering undefined.
    for (const [channel, total] of totals) {
        if (!(total > 0)) {
            totals.delete(channel);
        }
    }
    return totals;
}
|
|
500
|
+
/**
 * Decide which channel was dominant during a word's `[start, end]` window.
 *
 *  - If no channel has any active VAD energy → `"unknown"`.
 *  - If the top channel beats the runner-up by at least `dominanceRatio` → top channel.
 *  - Else: top channel wins on absolute score; exact ties → `"unknown"`.
 *
 * @param {{start: number, end: number}} word Word whose time span is looked up.
 * @param {{framesInWindow: function(number, number): Array}} timeline Source of VAD frames.
 * @param {{dominanceRatio: number}} params
 * @returns {string} A channel name, or `"unknown"`.
 */
function attributeWord(word, timeline, params) {
    const scores = scoreChannels(timeline.framesInWindow(word.start, word.end));
    if (scores.size === 0) {
        return "unknown";
    }
    // Highest score first.
    const ranked = [...scores.entries()].sort((a, b) => b[1] - a[1]);
    const [topName, topScore] = ranked[0];
    if (ranked.length === 1) {
        return topName;
    }
    const runnerScore = ranked[1][1];
    // After the descending sort the runner-up can never out-score the top
    // entry, so only "top wins" and "tie" remain to be told apart.
    const clearlyDominant = topScore >= params.dominanceRatio * runnerScore;
    if (clearlyDominant || topScore > runnerScore) {
        return topName;
    }
    return "unknown";
}
|
|
524
|
+
/**
 * Duration-weighted majority of word channels. `"unknown"` if there are no
 * words, every word resolved to `"unknown"`, or two channels tie exactly.
 *
 * @param {Iterable<{channel?: string, start: number, end: number}>} words
 * @returns {string} The channel with the largest summed word duration, or `"unknown"`.
 */
function rollUpTurnChannel(words) {
    const msByChannel = new Map();
    for (const word of words) {
        const attributed = Boolean(word.channel) && word.channel !== "unknown";
        if (!attributed) {
            continue;
        }
        // A word whose end precedes its start contributes nothing.
        const durationMs = Math.max(0, word.end - word.start);
        const soFar = msByChannel.has(word.channel) ? msByChannel.get(word.channel) : 0;
        msByChannel.set(word.channel, soFar + durationMs);
    }
    if (msByChannel.size === 0) {
        return "unknown";
    }
    // Longest total first.
    const ranked = [...msByChannel.entries()].sort((a, b) => b[1] - a[1]);
    const [topName, topMs] = ranked[0];
    if (ranked.length > 1 && topMs === ranked[1][1]) {
        return "unknown";
    }
    return topName;
}
|
|
548
|
+
/**
 * Mutate `turn` in place: write `turn.words[i].channel` for every word and set
 * `turn.channel` to the duration-weighted rollup.
 *
 * Returns `void` because the transcriber owns the `TurnEvent` ref and forwards
 * the same object to the customer listener — no need to allocate a copy.
 *
 * A turn that has no `words` array is treated as having zero words, so its
 * `channel` becomes `"unknown"` rather than the call throwing.
 *
 * @param {{words?: Array<object>, channel?: string}} turn Turn message to annotate.
 * @param {object} timeline VAD timeline passed through to `attributeWord`.
 * @param {{dominanceRatio: number}} params Passed through to `attributeWord`.
 */
function attributeTurn(turn, timeline, params) {
    const words = Array.isArray(turn.words) ? turn.words : [];
    for (const word of words) {
        word.channel = attributeWord(word, timeline, params);
    }
    turn.channel = rollUpTurnChannel(words);
}
|
|
561
|
+
|
|
562
|
+
/**
 * View any `AudioData` (ArrayBuffer / ArrayBufferView / typed array) as an
 * Int16 sample sequence. Samples are read in the platform's native byte order,
 * which is what `Int16Array` uses.
 *
 * No copy is made when the bytes are already 2-byte aligned. A view that
 * starts at an odd byte offset cannot back an `Int16Array` directly, so its
 * bytes are copied into a fresh, aligned buffer. A trailing odd byte is
 * ignored for both views and raw buffers.
 *
 * @param {ArrayBufferLike|ArrayBufferView} audio PCM16 bytes.
 * @returns {Int16Array} The samples; shares memory with `audio` unless a copy was required.
 */
function toInt16View(audio) {
    if (audio instanceof Int16Array) {
        return audio;
    }
    if (ArrayBuffer.isView(audio)) {
        const sampleCount = Math.floor(audio.byteLength / 2);
        if (audio.byteOffset % 2 === 0) {
            return new Int16Array(audio.buffer, audio.byteOffset, sampleCount);
        }
        // Int16Array requires an even byte offset; realign by copying the bytes.
        const alignedBytes = new Uint8Array(audio.buffer, audio.byteOffset, sampleCount * 2).slice();
        return new Int16Array(alignedBytes.buffer);
    }
    // Raw buffer: floor the length so an odd byte count does not throw.
    return new Int16Array(audio, 0, Math.floor(audio.byteLength / 2));
}
|
|
578
|
+
const defaultStreamingUrl = "wss://streaming.assemblyai.com/v3/ws";
|
|
579
|
+
const terminateSessionMessage = `{"type":"Terminate"}`;
|
|
580
|
+
/**
|
|
581
|
+
* Per-send chunk cap in milliseconds for the dual-channel mixer. The streaming
|
|
582
|
+
* server rejects audio messages longer than 1000 ms (`Input Duration Error`).
|
|
583
|
+
* If a backlog accumulates (e.g. when a browser tab is backgrounded and
|
|
584
|
+
* `setInterval` is throttled to ~1 Hz), `flushMix` loops and emits multiple
|
|
585
|
+
* sends each ≤ this cap until the buffers drain.
|
|
586
|
+
*/
|
|
587
|
+
const MAX_CHUNK_MS = 200;
|
|
588
|
+
/**
|
|
589
|
+
* Per-send minimum chunk size in milliseconds. The streaming server also
|
|
590
|
+
* rejects audio messages shorter than 50 ms with the same
|
|
591
|
+
* `Input Duration Error`, so the mixer waits until both per-channel buffers
|
|
592
|
+
* have at least this much accumulated before emitting. Final-flush (close
|
|
593
|
+
* path) bypasses this floor so the trailing partial buffer still gets sent.
|
|
594
|
+
*/
|
|
595
|
+
const MIN_CHUNK_MS = 50;
|
|
596
|
+
class StreamingTranscriber {
|
|
597
|
+
constructor(params) {
|
|
598
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
|
|
599
|
+
this.listeners = {};
|
|
600
|
+
// Dual-channel mode state (allocated only when params.channels is set).
|
|
601
|
+
this.isDualChannel = false;
|
|
602
|
+
this.vadFrameSamples = 0;
|
|
603
|
+
this.minChunkSamples = 0;
|
|
604
|
+
this.maxChunkSamples = 0;
|
|
605
|
+
this.params = Object.assign(Object.assign({}, params), { websocketBaseUrl: params.websocketBaseUrl || defaultStreamingUrl });
|
|
606
|
+
if ("token" in params && params.token)
|
|
607
|
+
this.token = params.token;
|
|
608
|
+
if ("apiKey" in params && params.apiKey)
|
|
609
|
+
this.apiKey = params.apiKey;
|
|
610
|
+
if (!(this.token || this.apiKey)) {
|
|
611
|
+
throw new Error("API key or temporary token is required.");
|
|
612
|
+
}
|
|
613
|
+
if (params.channels) {
|
|
614
|
+
if (params.channels.length !== 2) {
|
|
615
|
+
throw new Error("StreamingTranscriber.channels must have exactly 2 entries.");
|
|
616
|
+
}
|
|
617
|
+
const names = params.channels.map((c) => c.name);
|
|
618
|
+
if (new Set(names).size !== names.length) {
|
|
619
|
+
throw new Error("StreamingTranscriber.channels names must be unique.");
|
|
620
|
+
}
|
|
621
|
+
this.isDualChannel = true;
|
|
622
|
+
this.channelNames = names;
|
|
623
|
+
const att = (_a = params.channelAttribution) !== null && _a !== void 0 ? _a : {};
|
|
624
|
+
this.attributionParams = {
|
|
625
|
+
dominanceRatio: (_b = att.dominanceRatio) !== null && _b !== void 0 ? _b : 4,
|
|
626
|
+
timelineWindowMs: (_c = att.timelineWindowMs) !== null && _c !== void 0 ? _c : 30000,
|
|
627
|
+
createVad: (_d = att.createVad) !== null && _d !== void 0 ? _d : (() => new EnergyVad()),
|
|
628
|
+
flushIntervalMs: (_e = att.flushIntervalMs) !== null && _e !== void 0 ? _e : 50,
|
|
629
|
+
resolveUnknownChannelsMethod: (_f = att.resolveUnknownChannelsMethod) !== null && _f !== void 0 ? _f : "window",
|
|
630
|
+
resolutionWindowWords: (_g = att.resolutionWindowWords) !== null && _g !== void 0 ? _g : 2,
|
|
631
|
+
speakerHistoryMinRmsEvidence: (_h = att.speakerHistoryMinRmsEvidence) !== null && _h !== void 0 ? _h : 0.5,
|
|
632
|
+
speakerHistoryDominanceRatio: (_j = att.speakerHistoryDominanceRatio) !== null && _j !== void 0 ? _j : 3,
|
|
633
|
+
};
|
|
634
|
+
if (this.attributionParams.resolveUnknownChannelsMethod ===
|
|
635
|
+
"speaker-history") {
|
|
636
|
+
this.speakerHistory = new Map();
|
|
637
|
+
}
|
|
638
|
+
// 20 ms VAD frames at the transcriber's target sample rate.
|
|
639
|
+
this.vadFrameSamples = Math.max(1, Math.round(params.sampleRate * 0.02));
|
|
640
|
+
this.minChunkSamples = Math.max(1, Math.round(params.sampleRate * (MIN_CHUNK_MS / 1000)));
|
|
641
|
+
this.maxChunkSamples = Math.max(this.minChunkSamples, Math.round(params.sampleRate * (MAX_CHUNK_MS / 1000)));
|
|
642
|
+
this.channelBuffers = new Map(names.map((n) => [n, []]));
|
|
643
|
+
this.channelSamplesReceived = new Map(names.map((n) => [n, 0]));
|
|
644
|
+
this.channelVadFloatBuffers = new Map(names.map((n) => [n, new Float32Array(this.vadFrameSamples)]));
|
|
645
|
+
this.channelVadBufferIdx = new Map(names.map((n) => [n, 0]));
|
|
646
|
+
this.channelVads = new Map(names.map((n) => [n, this.attributionParams.createVad(n)]));
|
|
647
|
+
this.timeline = new VadTimeline(this.attributionParams.timelineWindowMs);
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
connectionUrl() {
|
|
651
|
+
var _a, _b;
|
|
652
|
+
const url = new URL((_a = this.params.websocketBaseUrl) !== null && _a !== void 0 ? _a : "");
|
|
653
|
+
if (url.protocol !== "wss:") {
|
|
654
|
+
throw new Error("Invalid protocol, must be wss");
|
|
655
|
+
}
|
|
656
|
+
const searchParams = new URLSearchParams();
|
|
657
|
+
if (this.token) {
|
|
658
|
+
searchParams.set("token", this.token);
|
|
659
|
+
}
|
|
660
|
+
searchParams.set("sample_rate", this.params.sampleRate.toString());
|
|
661
|
+
if (this.params.endOfTurnConfidenceThreshold) {
|
|
662
|
+
searchParams.set("end_of_turn_confidence_threshold", this.params.endOfTurnConfidenceThreshold.toString());
|
|
663
|
+
}
|
|
664
|
+
if (this.params.minEndOfTurnSilenceWhenConfident !== undefined) {
|
|
665
|
+
if (this.params.minTurnSilence !== undefined) {
|
|
666
|
+
console.warn("[Deprecation Warning] Both `minEndOfTurnSilenceWhenConfident` and `minTurnSilence` are set. Using `minTurnSilence`; `minEndOfTurnSilenceWhenConfident` is deprecated.");
|
|
667
|
+
}
|
|
668
|
+
else {
|
|
669
|
+
console.warn("[Deprecation Warning] `minEndOfTurnSilenceWhenConfident` is deprecated and will be removed in a future release. Please use `minTurnSilence` instead.");
|
|
670
|
+
}
|
|
671
|
+
}
|
|
672
|
+
const effectiveMinTurnSilence = (_b = this.params.minTurnSilence) !== null && _b !== void 0 ? _b : this.params.minEndOfTurnSilenceWhenConfident;
|
|
673
|
+
if (effectiveMinTurnSilence !== undefined) {
|
|
674
|
+
searchParams.set("min_turn_silence", effectiveMinTurnSilence.toString());
|
|
675
|
+
}
|
|
676
|
+
if (this.params.maxTurnSilence) {
|
|
677
|
+
searchParams.set("max_turn_silence", this.params.maxTurnSilence.toString());
|
|
678
|
+
}
|
|
679
|
+
if (this.params.vadThreshold !== undefined) {
|
|
680
|
+
searchParams.set("vad_threshold", this.params.vadThreshold.toString());
|
|
681
|
+
}
|
|
682
|
+
if (this.params.formatTurns) {
|
|
683
|
+
searchParams.set("format_turns", this.params.formatTurns.toString());
|
|
684
|
+
}
|
|
685
|
+
if (this.params.encoding) {
|
|
686
|
+
searchParams.set("encoding", this.params.encoding.toString());
|
|
687
|
+
}
|
|
688
|
+
if (this.params.keytermsPrompt) {
|
|
689
|
+
searchParams.set("keyterms_prompt", JSON.stringify(this.params.keytermsPrompt));
|
|
690
|
+
}
|
|
691
|
+
else if (this.params.keyterms) {
|
|
692
|
+
console.warn("[Deprecation Warning] `keyterms` is deprecated and will be removed in a future release. Please use `keytermsPrompt` instead.");
|
|
693
|
+
searchParams.set("keyterms_prompt", JSON.stringify(this.params.keyterms));
|
|
694
|
+
}
|
|
695
|
+
if (this.params.prompt) {
|
|
696
|
+
searchParams.set("prompt", this.params.prompt);
|
|
697
|
+
}
|
|
698
|
+
if (this.params.filterProfanity) {
|
|
699
|
+
searchParams.set("filter_profanity", this.params.filterProfanity.toString());
|
|
700
|
+
}
|
|
701
|
+
if (this.params.speechModel === "u3-pro") {
|
|
702
|
+
console.warn("[Deprecation Warning] The speech model `u3-pro` is deprecated and will be removed in a future release. Please use `u3-rt-pro` instead.");
|
|
703
|
+
}
|
|
704
|
+
searchParams.set("speech_model", this.params.speechModel.toString());
|
|
705
|
+
if (this.params.languageDetection !== undefined) {
|
|
706
|
+
searchParams.set("language_detection", this.params.languageDetection.toString());
|
|
707
|
+
}
|
|
708
|
+
if (this.params.domain) {
|
|
709
|
+
searchParams.set("domain", this.params.domain);
|
|
710
|
+
}
|
|
711
|
+
if (this.params.inactivityTimeout !== undefined) {
|
|
712
|
+
searchParams.set("inactivity_timeout", this.params.inactivityTimeout.toString());
|
|
713
|
+
}
|
|
714
|
+
if (this.params.speakerLabels !== undefined) {
|
|
715
|
+
searchParams.set("speaker_labels", this.params.speakerLabels.toString());
|
|
716
|
+
}
|
|
717
|
+
if (this.params.maxSpeakers !== undefined) {
|
|
718
|
+
searchParams.set("max_speakers", this.params.maxSpeakers.toString());
|
|
719
|
+
}
|
|
720
|
+
if (this.params.voiceFocus) {
|
|
721
|
+
searchParams.set("voice_focus", this.params.voiceFocus);
|
|
722
|
+
}
|
|
723
|
+
if (this.params.voiceFocusThreshold !== undefined) {
|
|
724
|
+
searchParams.set("voice_focus_threshold", this.params.voiceFocusThreshold.toString());
|
|
725
|
+
}
|
|
726
|
+
if (this.params.continuousPartials !== undefined) {
|
|
727
|
+
searchParams.set("continuous_partials", this.params.continuousPartials.toString());
|
|
728
|
+
}
|
|
729
|
+
if (this.params.interruptionDelay !== undefined) {
|
|
730
|
+
searchParams.set("interruption_delay", this.params.interruptionDelay.toString());
|
|
731
|
+
}
|
|
732
|
+
if (this.params.turnLeftPadMs !== undefined) {
|
|
733
|
+
searchParams.set("turn_left_pad_ms", this.params.turnLeftPadMs.toString());
|
|
734
|
+
}
|
|
735
|
+
if (this.params.customerSupportAudioCapture) {
|
|
736
|
+
console.warn("`customerSupportAudioCapture=true` will record session audio. Only enable this when explicitly coordinating with AssemblyAI support.");
|
|
737
|
+
// The server's canonical wire name is `_customer_support_audio_capture`
|
|
738
|
+
// (leading underscore = "not officially supported / unstable"). The
|
|
739
|
+
// server also accepts `customer_support_audio_capture` via
|
|
740
|
+
// `populate_by_name=True`, but we send the underscore form to honor
|
|
741
|
+
// the server's stability marker.
|
|
742
|
+
searchParams.set("_customer_support_audio_capture", this.params.customerSupportAudioCapture.toString());
|
|
743
|
+
}
|
|
744
|
+
if (this.params.webhookUrl) {
|
|
745
|
+
searchParams.set("webhook_url", this.params.webhookUrl);
|
|
746
|
+
}
|
|
747
|
+
if (this.params.webhookAuthHeaderName) {
|
|
748
|
+
searchParams.set("webhook_auth_header_name", this.params.webhookAuthHeaderName);
|
|
749
|
+
}
|
|
750
|
+
if (this.params.webhookAuthHeaderValue) {
|
|
751
|
+
searchParams.set("webhook_auth_header_value", this.params.webhookAuthHeaderValue);
|
|
752
|
+
}
|
|
753
|
+
if (this.params.includePartialTurns !== undefined) {
|
|
754
|
+
searchParams.set("include_partial_turns", this.params.includePartialTurns.toString());
|
|
755
|
+
}
|
|
756
|
+
if (this.params.redactPii !== undefined) {
|
|
757
|
+
searchParams.set("redact_pii", this.params.redactPii.toString());
|
|
758
|
+
}
|
|
759
|
+
if (this.params.redactPiiPolicies !== undefined) {
|
|
760
|
+
searchParams.set("redact_pii_policies", JSON.stringify(this.params.redactPiiPolicies));
|
|
761
|
+
}
|
|
762
|
+
if (this.params.redactPiiSub !== undefined) {
|
|
763
|
+
searchParams.set("redact_pii_sub", this.params.redactPiiSub);
|
|
764
|
+
}
|
|
765
|
+
if (this.params.llmGateway !== undefined) {
|
|
766
|
+
searchParams.set("llm_gateway", JSON.stringify(this.params.llmGateway));
|
|
767
|
+
}
|
|
768
|
+
url.search = searchParams.toString();
|
|
769
|
+
return url;
|
|
770
|
+
}
|
|
771
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
772
|
+
on(event, listener) {
|
|
773
|
+
this.listeners[event] = listener;
|
|
774
|
+
}
|
|
775
|
+
connect() {
|
|
776
|
+
return new Promise((resolve) => {
|
|
777
|
+
if (this.socket) {
|
|
778
|
+
throw new Error("Already connected");
|
|
779
|
+
}
|
|
780
|
+
const url = this.connectionUrl();
|
|
781
|
+
if (this.token) {
|
|
782
|
+
this.socket = factory(url.toString());
|
|
783
|
+
}
|
|
784
|
+
else {
|
|
785
|
+
this.socket = factory(url.toString(), {
|
|
786
|
+
headers: { Authorization: this.apiKey },
|
|
787
|
+
});
|
|
788
|
+
}
|
|
789
|
+
this.socket.binaryType = "arraybuffer";
|
|
790
|
+
this.socket.onopen = () => { };
|
|
791
|
+
this.socket.onclose = ({ code, reason }) => {
|
|
792
|
+
var _a, _b;
|
|
793
|
+
if (!reason) {
|
|
794
|
+
if (code in StreamingErrorMessages) {
|
|
795
|
+
reason = StreamingErrorMessages[code];
|
|
796
|
+
}
|
|
797
|
+
}
|
|
798
|
+
// Stop the flush timer when the socket is gone (server-initiated close,
|
|
799
|
+
// network drop, etc.) — otherwise subsequent ticks call send() on a
|
|
800
|
+
// closed socket and spam the error listener.
|
|
801
|
+
if (this.flushTimer) {
|
|
802
|
+
clearInterval(this.flushTimer);
|
|
803
|
+
this.flushTimer = undefined;
|
|
804
|
+
}
|
|
805
|
+
(_b = (_a = this.listeners).close) === null || _b === void 0 ? void 0 : _b.call(_a, code, reason);
|
|
806
|
+
};
|
|
807
|
+
this.socket.onerror = (event) => {
|
|
808
|
+
var _a, _b, _c, _d;
|
|
809
|
+
if (event.error)
|
|
810
|
+
(_b = (_a = this.listeners).error) === null || _b === void 0 ? void 0 : _b.call(_a, event.error);
|
|
811
|
+
else
|
|
812
|
+
(_d = (_c = this.listeners).error) === null || _d === void 0 ? void 0 : _d.call(_c, new Error(event.message));
|
|
813
|
+
};
|
|
814
|
+
this.socket.onmessage = ({ data }) => {
|
|
815
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o;
|
|
816
|
+
const message = JSON.parse(data.toString());
|
|
817
|
+
if ("error" in message) {
|
|
818
|
+
const err = new StreamingError(message.error);
|
|
819
|
+
if ("error_code" in message) {
|
|
820
|
+
err.code =
|
|
821
|
+
message.error_code;
|
|
822
|
+
}
|
|
823
|
+
(_b = (_a = this.listeners).error) === null || _b === void 0 ? void 0 : _b.call(_a, err);
|
|
824
|
+
return;
|
|
825
|
+
}
|
|
826
|
+
switch (message.type) {
|
|
827
|
+
case "Begin": {
|
|
828
|
+
resolve(message);
|
|
829
|
+
(_d = (_c = this.listeners).open) === null || _d === void 0 ? void 0 : _d.call(_c, message);
|
|
830
|
+
break;
|
|
831
|
+
}
|
|
832
|
+
case "Turn": {
|
|
833
|
+
if (this.isDualChannel && this.timeline && this.attributionParams) {
|
|
834
|
+
attributeTurn(message, this.timeline, {
|
|
835
|
+
dominanceRatio: this.attributionParams.dominanceRatio,
|
|
836
|
+
});
|
|
837
|
+
switch (this.attributionParams.resolveUnknownChannelsMethod) {
|
|
838
|
+
case "window":
|
|
839
|
+
this.resolveUnknownChannelsByWindow(message);
|
|
840
|
+
break;
|
|
841
|
+
case "speaker-history":
|
|
842
|
+
this.resolveUnknownChannelsBySpeakerHistory(message);
|
|
843
|
+
break;
|
|
844
|
+
}
|
|
845
|
+
}
|
|
846
|
+
(_f = (_e = this.listeners).turn) === null || _f === void 0 ? void 0 : _f.call(_e, message);
|
|
847
|
+
break;
|
|
848
|
+
}
|
|
849
|
+
case "SpeechStarted": {
|
|
850
|
+
(_h = (_g = this.listeners).speechStarted) === null || _h === void 0 ? void 0 : _h.call(_g, message);
|
|
851
|
+
break;
|
|
852
|
+
}
|
|
853
|
+
case "LLMGatewayResponse": {
|
|
854
|
+
(_k = (_j = this.listeners).llmGatewayResponse) === null || _k === void 0 ? void 0 : _k.call(_j, message);
|
|
855
|
+
break;
|
|
856
|
+
}
|
|
857
|
+
case "Warning": {
|
|
858
|
+
const warning = message;
|
|
859
|
+
console.warn(`Streaming warning (code=${warning.warning_code}): ${warning.warning}`);
|
|
860
|
+
(_m = (_l = this.listeners).warning) === null || _m === void 0 ? void 0 : _m.call(_l, warning);
|
|
861
|
+
break;
|
|
862
|
+
}
|
|
863
|
+
case "Termination": {
|
|
864
|
+
(_o = this.sessionTerminatedResolve) === null || _o === void 0 ? void 0 : _o.call(this);
|
|
865
|
+
break;
|
|
866
|
+
}
|
|
867
|
+
}
|
|
868
|
+
};
|
|
869
|
+
});
|
|
870
|
+
}
|
|
871
|
+
/**
|
|
872
|
+
* Returns a WritableStream that pumps PCM chunks into `sendAudio`. Single-channel
|
|
873
|
+
* only — in dual-channel mode use `sendAudio(pcm, { channel })` directly, since
|
|
874
|
+
* `WritableStream` has no place to carry a channel tag.
|
|
875
|
+
*/
|
|
876
|
+
stream() {
|
|
877
|
+
return new WritableStream({
|
|
878
|
+
write: (chunk) => {
|
|
879
|
+
this.sendAudio(chunk);
|
|
880
|
+
},
|
|
881
|
+
});
|
|
882
|
+
}
|
|
883
|
+
/**
|
|
884
|
+
* Send PCM audio.
|
|
885
|
+
*
|
|
886
|
+
* In single-channel mode, `audio` is forwarded directly to the WebSocket and
|
|
887
|
+
* `options` is ignored.
|
|
888
|
+
*
|
|
889
|
+
* In dual-channel mode (when `channels` is configured), `options.channel` is
|
|
890
|
+
* REQUIRED and must match one of the declared channel names. Per-channel PCM is
|
|
891
|
+
* fed into that channel's VAD, accumulated into a per-channel ring buffer, and
|
|
892
|
+
* a scheduled flush (`channelAttribution.flushIntervalMs`, default 50ms) mixes
|
|
893
|
+
* the buffers into mono before sending to the WebSocket.
|
|
894
|
+
*/
|
|
895
|
+
sendAudio(audio, options) {
|
|
896
|
+
if (!this.isDualChannel) {
|
|
897
|
+
this.send(audio);
|
|
898
|
+
return;
|
|
899
|
+
}
|
|
900
|
+
if (!(options === null || options === void 0 ? void 0 : options.channel)) {
|
|
901
|
+
throw new Error("StreamingTranscriber is in dual-channel mode; sendAudio requires { channel }.");
|
|
902
|
+
}
|
|
903
|
+
if (!this.channelNames.includes(options.channel)) {
|
|
904
|
+
throw new Error(`Unknown channel "${options.channel}"; declared channels: ${this.channelNames.join(", ")}.`);
|
|
905
|
+
}
|
|
906
|
+
this.ingestChannelAudio(options.channel, audio);
|
|
907
|
+
}
|
|
908
|
+
ingestChannelAudio(name, audio) {
|
|
909
|
+
var _a, _b;
|
|
910
|
+
const samples = toInt16View(audio);
|
|
911
|
+
const buf = this.channelBuffers.get(name);
|
|
912
|
+
const vadBuf = this.channelVadFloatBuffers.get(name);
|
|
913
|
+
let vadIdx = this.channelVadBufferIdx.get(name);
|
|
914
|
+
let received = this.channelSamplesReceived.get(name);
|
|
915
|
+
const vad = this.channelVads.get(name);
|
|
916
|
+
const sampleRate = this.params.sampleRate;
|
|
917
|
+
const frameSize = this.vadFrameSamples;
|
|
918
|
+
for (let i = 0; i < samples.length; i++) {
|
|
919
|
+
const s = samples[i];
|
|
920
|
+
buf.push(s);
|
|
921
|
+
vadBuf[vadIdx++] = s / 0x8000;
|
|
922
|
+
received++;
|
|
923
|
+
if (vadIdx === frameSize) {
|
|
924
|
+
const result = vad.process(vadBuf);
|
|
925
|
+
const frame = {
|
|
926
|
+
ts: (received / sampleRate) * 1000,
|
|
927
|
+
channel: name,
|
|
928
|
+
active: result.active,
|
|
929
|
+
rms: result.energy,
|
|
930
|
+
};
|
|
931
|
+
this.timeline.pushFrame(frame);
|
|
932
|
+
(_b = (_a = this.listeners).vad) === null || _b === void 0 ? void 0 : _b.call(_a, frame);
|
|
933
|
+
vadIdx = 0;
|
|
934
|
+
}
|
|
935
|
+
}
|
|
936
|
+
this.channelVadBufferIdx.set(name, vadIdx);
|
|
937
|
+
this.channelSamplesReceived.set(name, received);
|
|
938
|
+
if (!this.flushTimer)
|
|
939
|
+
this.startFlushTimer();
|
|
940
|
+
}
|
|
941
|
+
startFlushTimer() {
|
|
942
|
+
this.flushTimer = setInterval(() => this.flushMix(), this.attributionParams.flushIntervalMs);
|
|
943
|
+
}
|
|
944
|
+
flushMix(force = false) {
|
|
945
|
+
var _a, _b;
|
|
946
|
+
if (!this.channelNames || !this.channelBuffers)
|
|
947
|
+
return;
|
|
948
|
+
const bufs = this.channelNames.map((n) => this.channelBuffers.get(n));
|
|
949
|
+
const divisor = bufs.length;
|
|
950
|
+
// Loop so a backlog (e.g. accumulated while a browser tab was throttled in
|
|
951
|
+
// the background) drains as multiple sends, each capped at MAX_CHUNK_MS.
|
|
952
|
+
// Without the cap a single message could exceed the server's 1000 ms input
|
|
953
|
+
// duration limit and be rejected with code 3007.
|
|
954
|
+
for (;;) {
|
|
955
|
+
let mixLen = Infinity;
|
|
956
|
+
for (const b of bufs)
|
|
957
|
+
if (b.length < mixLen)
|
|
958
|
+
mixLen = b.length;
|
|
959
|
+
if (!Number.isFinite(mixLen) || mixLen === 0)
|
|
960
|
+
return;
|
|
961
|
+
// The streaming server rejects audio messages shorter than 50 ms with
|
|
962
|
+
// `Input Duration Error`. Wait until both per-channel buffers have at
|
|
963
|
+
// least minChunkSamples worth queued before emitting. The `force` path
|
|
964
|
+
// (final flush on close) bypasses this so the trailing partial buffer
|
|
965
|
+
// still gets through.
|
|
966
|
+
if (!force && mixLen < this.minChunkSamples)
|
|
967
|
+
return;
|
|
968
|
+
if (mixLen > this.maxChunkSamples)
|
|
969
|
+
mixLen = this.maxChunkSamples;
|
|
970
|
+
const out = new Int16Array(mixLen);
|
|
971
|
+
for (let i = 0; i < mixLen; i++) {
|
|
972
|
+
let sum = 0;
|
|
973
|
+
for (let c = 0; c < divisor; c++)
|
|
974
|
+
sum += bufs[c][i];
|
|
975
|
+
const avg = Math.round(sum / divisor);
|
|
976
|
+
out[i] = avg < -32768 ? -32768 : avg > 32767 ? 32767 : avg;
|
|
977
|
+
}
|
|
978
|
+
for (const b of bufs)
|
|
979
|
+
b.splice(0, mixLen);
|
|
980
|
+
try {
|
|
981
|
+
this.send(out.buffer);
|
|
982
|
+
}
|
|
983
|
+
catch (err) {
|
|
984
|
+
(_b = (_a = this.listeners).error) === null || _b === void 0 ? void 0 : _b.call(_a, err);
|
|
985
|
+
return;
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
}
|
|
989
|
+
/**
|
|
990
|
+
* Fill in words whose per-word VAD attribution was `"unknown"` by looking
|
|
991
|
+
* at the dominant non-`"unknown"` channel among ±N neighbors in the same
|
|
992
|
+
* turn. Words with no non-`"unknown"` neighbors stay `"unknown"`. Confident
|
|
993
|
+
* per-word VAD decisions are never modified.
|
|
994
|
+
*
|
|
995
|
+
* Local temporal heuristic — ignores `speaker_label`, so it works even when
|
|
996
|
+
* AAI's diarization re-uses the same label for two physically distinct
|
|
997
|
+
* voices. Each resolved word gets `channelResolved: true` so downstream
|
|
998
|
+
* renderers can distinguish inferred channels from directly-measured ones.
|
|
999
|
+
*/
|
|
1000
|
+
resolveUnknownChannelsByWindow(turn) {
|
|
1001
|
+
var _a;
|
|
1002
|
+
if (!this.attributionParams)
|
|
1003
|
+
return;
|
|
1004
|
+
const window = this.attributionParams.resolutionWindowWords;
|
|
1005
|
+
const words = turn.words;
|
|
1006
|
+
let mutated = false;
|
|
1007
|
+
for (let i = 0; i < words.length; i++) {
|
|
1008
|
+
if (words[i].channel !== "unknown")
|
|
1009
|
+
continue;
|
|
1010
|
+
const tally = new Map();
|
|
1011
|
+
const lo = Math.max(0, i - window);
|
|
1012
|
+
const hi = Math.min(words.length - 1, i + window);
|
|
1013
|
+
for (let j = lo; j <= hi; j++) {
|
|
1014
|
+
if (j === i)
|
|
1015
|
+
continue;
|
|
1016
|
+
const ch = words[j].channel;
|
|
1017
|
+
if (!ch || ch === "unknown")
|
|
1018
|
+
continue;
|
|
1019
|
+
tally.set(ch, ((_a = tally.get(ch)) !== null && _a !== void 0 ? _a : 0) + 1);
|
|
1020
|
+
}
|
|
1021
|
+
if (tally.size === 0)
|
|
1022
|
+
continue;
|
|
1023
|
+
// Pick the dominant neighbor channel. Ties → leave `"unknown"` (rare;
|
|
1024
|
+
// would require an equal count of mic and system neighbors).
|
|
1025
|
+
let top;
|
|
1026
|
+
let topCount = 0;
|
|
1027
|
+
let tied = false;
|
|
1028
|
+
for (const [name, count] of tally) {
|
|
1029
|
+
if (count > topCount) {
|
|
1030
|
+
top = name;
|
|
1031
|
+
topCount = count;
|
|
1032
|
+
tied = false;
|
|
1033
|
+
}
|
|
1034
|
+
else if (count === topCount) {
|
|
1035
|
+
tied = true;
|
|
1036
|
+
}
|
|
1037
|
+
}
|
|
1038
|
+
if (top && !tied) {
|
|
1039
|
+
words[i].channel = top;
|
|
1040
|
+
words[i].channelResolved = true;
|
|
1041
|
+
mutated = true;
|
|
1042
|
+
}
|
|
1043
|
+
}
|
|
1044
|
+
// Recompute the rollup only if any per-word channel changed.
|
|
1045
|
+
if (mutated)
|
|
1046
|
+
turn.channel = rollUpTurnChannel(words);
|
|
1047
|
+
}
|
|
1048
|
+
/**
|
|
1049
|
+
* Fill `"unknown"` words by looking up the speaker's session-wide channel
|
|
1050
|
+
* evidence. For each `speaker_label`, sums active VAD frame RMS per channel
|
|
1051
|
+
* across every word the speaker has uttered to date. A speaker is
|
|
1052
|
+
* "resolvable" if their total evidence clears
|
|
1053
|
+
* `speakerHistoryMinRmsEvidence` and their top channel exceeds the
|
|
1054
|
+
* runner-up by `speakerHistoryDominanceRatio`.
|
|
1055
|
+
*
|
|
1056
|
+
* Only touches `"unknown"` words. Confident per-word VAD decisions are
|
|
1057
|
+
* never modified. `speaker_label` is never modified.
|
|
1058
|
+
*/
|
|
1059
|
+
resolveUnknownChannelsBySpeakerHistory(turn) {
|
|
1060
|
+
var _a;
|
|
1061
|
+
if (!this.timeline || !this.attributionParams || !this.speakerHistory)
|
|
1062
|
+
return;
|
|
1063
|
+
const minEvidence = this.attributionParams.speakerHistoryMinRmsEvidence;
|
|
1064
|
+
const dominanceRatio = this.attributionParams.speakerHistoryDominanceRatio;
|
|
1065
|
+
// 1. Accumulate evidence from this turn's words.
|
|
1066
|
+
for (const w of turn.words) {
|
|
1067
|
+
if (!w.speaker)
|
|
1068
|
+
continue;
|
|
1069
|
+
const frames = this.timeline.framesInWindow(w.start, w.end);
|
|
1070
|
+
let entry = this.speakerHistory.get(w.speaker);
|
|
1071
|
+
if (!entry) {
|
|
1072
|
+
entry = new Map();
|
|
1073
|
+
this.speakerHistory.set(w.speaker, entry);
|
|
1074
|
+
}
|
|
1075
|
+
for (const f of frames) {
|
|
1076
|
+
if (!f.active)
|
|
1077
|
+
continue;
|
|
1078
|
+
entry.set(f.channel, ((_a = entry.get(f.channel)) !== null && _a !== void 0 ? _a : 0) + f.rms);
|
|
1079
|
+
}
|
|
1080
|
+
}
|
|
1081
|
+
// 2. Fill unknown words whose speakers have dominant evidence.
|
|
1082
|
+
let mutated = false;
|
|
1083
|
+
for (const w of turn.words) {
|
|
1084
|
+
if (w.channel !== "unknown" || !w.speaker)
|
|
1085
|
+
continue;
|
|
1086
|
+
const entry = this.speakerHistory.get(w.speaker);
|
|
1087
|
+
if (!entry || entry.size === 0)
|
|
1088
|
+
continue;
|
|
1089
|
+
let total = 0;
|
|
1090
|
+
let topName;
|
|
1091
|
+
let topScore = 0;
|
|
1092
|
+
let runnerScore = 0;
|
|
1093
|
+
for (const [name, score] of entry) {
|
|
1094
|
+
total += score;
|
|
1095
|
+
if (score > topScore) {
|
|
1096
|
+
runnerScore = topScore;
|
|
1097
|
+
topScore = score;
|
|
1098
|
+
topName = name;
|
|
1099
|
+
}
|
|
1100
|
+
else if (score > runnerScore) {
|
|
1101
|
+
runnerScore = score;
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
if (total < minEvidence)
|
|
1105
|
+
continue;
|
|
1106
|
+
if (runnerScore > 0 && topScore < dominanceRatio * runnerScore)
|
|
1107
|
+
continue;
|
|
1108
|
+
if (topName) {
|
|
1109
|
+
w.channel = topName;
|
|
1110
|
+
w.channelResolved = true;
|
|
1111
|
+
mutated = true;
|
|
1112
|
+
}
|
|
1113
|
+
}
|
|
1114
|
+
if (mutated)
|
|
1115
|
+
turn.channel = rollUpTurnChannel(turn.words);
|
|
1116
|
+
}
|
|
1117
|
+
/**
|
|
1118
|
+
* Update the streaming configuration mid-stream.
|
|
1119
|
+
* @param config - The configuration parameters to update
|
|
1120
|
+
*/
|
|
1121
|
+
updateConfiguration(config) {
|
|
1122
|
+
const { min_end_of_turn_silence_when_confident, min_turn_silence } = config, rest = __rest(config, ["min_end_of_turn_silence_when_confident", "min_turn_silence"]);
|
|
1123
|
+
if (min_end_of_turn_silence_when_confident !== undefined) {
|
|
1124
|
+
if (min_turn_silence !== undefined) {
|
|
1125
|
+
console.warn("[Deprecation Warning] Both `min_end_of_turn_silence_when_confident` and `min_turn_silence` are set. Using `min_turn_silence`; `min_end_of_turn_silence_when_confident` is deprecated.");
|
|
1126
|
+
}
|
|
1127
|
+
else {
|
|
1128
|
+
console.warn("[Deprecation Warning] `min_end_of_turn_silence_when_confident` is deprecated and will be removed in a future release. Please use `min_turn_silence` instead.");
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
const effective = min_turn_silence !== null && min_turn_silence !== void 0 ? min_turn_silence : min_end_of_turn_silence_when_confident;
|
|
1132
|
+
const message = Object.assign(Object.assign({ type: "UpdateConfiguration" }, rest), (effective !== undefined ? { min_turn_silence: effective } : {}));
|
|
1133
|
+
this.send(JSON.stringify(message));
|
|
1134
|
+
}
|
|
1135
|
+
/**
|
|
1136
|
+
* Force the current turn to end immediately.
|
|
1137
|
+
*/
|
|
1138
|
+
forceEndpoint() {
|
|
1139
|
+
const message = {
|
|
1140
|
+
type: "ForceEndpoint",
|
|
1141
|
+
};
|
|
1142
|
+
this.send(JSON.stringify(message));
|
|
1143
|
+
}
|
|
1144
|
+
send(data) {
|
|
1145
|
+
if (!this.socket || this.socket.readyState !== this.socket.OPEN) {
|
|
1146
|
+
throw new Error("Socket is not open for communication");
|
|
1147
|
+
}
|
|
1148
|
+
this.socket.send(data);
|
|
1149
|
+
}
|
|
1150
|
+
close() {
|
|
1151
|
+
return __awaiter(this, arguments, void 0, function* (waitForSessionTermination = true) {
|
|
1152
|
+
var _a;
|
|
1153
|
+
if (this.flushTimer) {
|
|
1154
|
+
clearInterval(this.flushTimer);
|
|
1155
|
+
this.flushTimer = undefined;
|
|
1156
|
+
// Best-effort: drain any final partial mix so the server gets the tail.
|
|
1157
|
+
// Bypass the 50ms floor here since this is the last flush; if the tail
|
|
1158
|
+
// is <50ms the server will reject that single message, but we'd lose
|
|
1159
|
+
// the audio either way.
|
|
1160
|
+
this.flushMix(true);
|
|
1161
|
+
}
|
|
1162
|
+
if (this.socket) {
|
|
1163
|
+
if (this.socket.readyState === this.socket.OPEN) {
|
|
1164
|
+
if (waitForSessionTermination) {
|
|
1165
|
+
const sessionTerminatedPromise = new Promise((resolve) => {
|
|
1166
|
+
this.sessionTerminatedResolve = resolve;
|
|
1167
|
+
});
|
|
1168
|
+
this.socket.send(terminateSessionMessage);
|
|
1169
|
+
yield sessionTerminatedPromise;
|
|
1170
|
+
}
|
|
1171
|
+
else {
|
|
1172
|
+
this.socket.send(terminateSessionMessage);
|
|
1173
|
+
}
|
|
1174
|
+
}
|
|
1175
|
+
if ((_a = this.socket) === null || _a === void 0 ? void 0 : _a.removeAllListeners)
|
|
1176
|
+
this.socket.removeAllListeners();
|
|
1177
|
+
this.socket.close();
|
|
1178
|
+
}
|
|
1179
|
+
this.listeners = {};
|
|
1180
|
+
this.socket = undefined;
|
|
1181
|
+
});
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
/** Fetch options applied to every API request unless the caller overrides them. */
const DEFAULT_FETCH_INIT = {
    cache: "no-store",
};

/**
 * Works out the `runtime_env` user-agent entry for the current JavaScript
 * runtime.
 *
 * A runtime is only reported when its name is not already present in
 * `baseAgent` (the `navigator.userAgent` string, if any). Later checks
 * override earlier ones, so Bun wins over Node and Deno wins over both.
 *
 * @param baseAgent - User-agent string gathered so far.
 * @param globals - Object exposing the `process` and `Deno` globals, each
 *   `undefined` when absent. Injectable so the detection can be unit-tested.
 * @returns `{ name, version }` for the detected runtime, or `undefined`.
 */
function detectRuntimeEnv(baseAgent, globals) {
    let runtimeEnv;
    // `process` may be a partial shim without `versions`; guard before reading.
    const versions = globals.process ? globals.process.versions : undefined;
    if (versions) {
        if (versions.node && baseAgent.indexOf("Node") === -1) {
            runtimeEnv = { name: "Node", version: versions.node };
        }
        if (versions.bun && baseAgent.indexOf("Bun") === -1) {
            runtimeEnv = { name: "Bun", version: versions.bun };
        }
    }
    // Deno reports its version on `Deno.version.deno`. The previous check read
    // `process.versions.bun` here, which never matched under Deno and threw
    // where `process` is not defined.
    const denoVersion = globals.Deno && globals.Deno.version ? globals.Deno.version.deno : undefined;
    if (denoVersion && baseAgent.indexOf("Deno") === -1) {
        runtimeEnv = { name: "Deno", version: denoVersion };
    }
    return runtimeEnv;
}

let defaultUserAgentString = "";
if (typeof navigator !== "undefined" && navigator.userAgent) {
    defaultUserAgentString += navigator.userAgent;
}
const defaultUserAgent = {
    sdk: { name: "JavaScript", version: "__SDK_VERSION__" },
};
{
    const detectedRuntime = detectRuntimeEnv(defaultUserAgentString, {
        process: typeof process !== "undefined" ? process : undefined,
        Deno: typeof Deno !== "undefined" ? Deno : undefined,
    });
    if (detectedRuntime) {
        defaultUserAgent.runtime_env = detectedRuntime;
    }
}

/**
 * Builds the user-agent string sent with API requests.
 *
 * @param userAgent - Extra `{ key: { name, version } }` entries merged over
 *   the defaults, or `false` to return only the base user-agent string.
 * @returns The base string, followed (unless `userAgent` is `false`) by
 *   ` AssemblyAI/1.0 (key=name/version ...)`.
 */
const buildUserAgent = (userAgent) => defaultUserAgentString +
    (userAgent === false
        ? ""
        : " AssemblyAI/1.0 (" +
            Object.entries(Object.assign(Object.assign({}, defaultUserAgent), userAgent))
                .map(([key, item]) => item ? `${key}=${item.name}/${item.version}` : "")
                .join(" ") +
            ")");
|
|
1223
|
+
|
|
1224
|
+
/**
 * Common base for services that talk to the HTTP API: holds the service
 * parameters, the computed user-agent string, and authenticated fetch helpers.
 */
class BaseService {
    /**
     * Create a new service.
     * @param params - Service parameters. `params.userAgent === false`
     *   disables the custom user-agent; otherwise it is merged into the
     *   default user-agent entries.
     */
    constructor(params) {
        this.params = params;
        this.userAgent = params.userAgent === false
            ? undefined
            : buildUserAgent(params.userAgent || {});
    }
    /**
     * Performs an authenticated request.
     *
     * Headers are layered in this order: API key and JSON content type,
     * default fetch headers, caller headers, then the user-agent headers.
     * Inputs that do not start with "http" are resolved against
     * `params.baseUrl`.
     *
     * @param input - Absolute URL or API path.
     * @param init - Optional fetch options merged over the defaults.
     * @returns The response, when its status is below 400.
     * @throws Error carrying the JSON `error` field, the raw body text, or an
     *   `HTTP Error: <status> <statusText>` message for error statuses.
     */
    fetch(input, init) {
        return __awaiter(this, void 0, void 0, function* () {
            const requestInit = Object.assign({}, DEFAULT_FETCH_INIT, init);
            const headers = Object.assign({
                Authorization: this.params.apiKey,
                "Content-Type": "application/json",
            }, DEFAULT_FETCH_INIT ? DEFAULT_FETCH_INIT.headers : undefined, requestInit.headers);
            if (this.userAgent) {
                headers["User-Agent"] = this.userAgent;
                // chromium browsers have a bug where the user agent can't be modified
                if (typeof window !== "undefined" && "chrome" in window) {
                    headers["AssemblyAI-Agent"] = this.userAgent;
                }
            }
            requestInit.headers = headers;
            const url = input.startsWith("http") ? input : this.params.baseUrl + input;
            const response = yield fetch(url, requestInit);
            if (!(response.status >= 400)) {
                return response;
            }
            const text = yield response.text();
            if (!text) {
                throw new Error(`HTTP Error: ${response.status} ${response.statusText}`);
            }
            let parsed;
            try {
                parsed = JSON.parse(text);
            }
            catch (_ignored) {
                // Body is not JSON; the raw text is used as the message below.
            }
            const hasApiError = parsed !== null && parsed !== undefined && parsed.error;
            throw new Error(hasApiError ? parsed.error : text);
        });
    }
    /**
     * Performs an authenticated request and parses the response body as JSON.
     * @param input - Absolute URL or API path.
     * @param init - Optional fetch options.
     * @returns The parsed JSON body.
     */
    fetchJson(input, init) {
        return __awaiter(this, void 0, void 0, function* () {
            const response = yield this.fetch(input, init);
            return response.json();
        });
    }
}
|
|
1292
|
+
|
|
1293
|
+
/**
 * Creates `StreamingTranscriber` instances and temporary streaming tokens
 * using the credentials of the enclosing service.
 */
class StreamingTranscriberFactory extends BaseService {
    /**
     * @param params - Base service parameters; also kept so transcribers can
     *   fall back to the service's API key.
     */
    constructor(params) {
        super(params);
        this.baseServiceParams = params;
    }
    /**
     * Creates a streaming transcriber.
     * @param params - Transcriber parameters. When neither `token` nor
     *   `apiKey` is supplied, the service's API key is used.
     * @returns A new `StreamingTranscriber`.
     */
    transcriber(params) {
        const transcriberParams = Object.assign({}, params);
        const hasCredentials = Boolean(transcriberParams.token) || Boolean(transcriberParams.apiKey);
        if (!hasCredentials) {
            transcriberParams.apiKey = this.baseServiceParams.apiKey;
        }
        return new StreamingTranscriber(transcriberParams);
    }
    /**
     * Requests a temporary token from `/v3/token`.
     * @param params - Token options; every entry that is neither `undefined`
     *   nor `null` is sent as a query-string parameter.
     * @returns The `token` field of the response.
     */
    createTemporaryToken(params) {
        return __awaiter(this, void 0, void 0, function* () {
            const query = new URLSearchParams();
            for (const [key, value] of Object.entries(params)) {
                if (value === undefined || value === null) {
                    continue;
                }
                query.append(key, String(value));
            }
            const encoded = query.toString();
            const path = encoded ? `/v3/token?${encoded}` : "/v3/token";
            const body = yield this.fetchJson(path, { method: "GET" });
            return body.token;
        });
    }
}
/** Subclass of `StreamingTranscriberFactory` that adds no members of its own. */
class StreamingServiceFactory extends StreamingTranscriberFactory {
}
|
|
1325
|
+
|
|
1326
|
+
/** Name under which the PCM16 encoder worklet processor is registered. */
const PCM16_ENCODER_PROCESSOR_NAME = "aai-pcm16-encoder";
/**
 * Source text of an AudioWorklet processor that takes mono Float32 audio at
 * the AudioContext's native sample rate, resamples it to `targetRate` by
 * linear interpolation (carrying the last sample and fractional read position
 * across `process()` calls), converts it to Int16 PCM, and posts fixed-size
 * chunks through `port.postMessage` together with a running `samplesSent`
 * count.
 *
 * `samplesSent` counts **target-rate samples**, so the main thread can turn
 * it into a stream-relative timestamp with `samplesSent / targetRate * 1000`
 * (ms) — the same time base AAI uses for `StreamingWord.start` / `.end`.
 *
 * Kept as a string so it can be loaded through a Blob URL: the SDK ships as a
 * single ESM file, so a separate `.js` worklet asset isn't viable. The
 * registered name is interpolated from `PCM16_ENCODER_PROCESSOR_NAME` so the
 * worklet and the code that instantiates it cannot drift apart.
 */
const pcm16EncoderWorkletSource = `
class Pcm16EncoderProcessor extends AudioWorkletProcessor {
constructor(options) {
super();
const opts = (options && options.processorOptions) || {};
this.targetRate = opts.targetRate || 16000;
this.chunkMs = opts.chunkMs || 50;
this.ratio = sampleRate / this.targetRate;
this.chunkSize = Math.round(this.targetRate * this.chunkMs / 1000);
this.buffer = new Int16Array(this.chunkSize);
this.bufferIdx = 0;
this.samplesSent = 0;
this.lastSample = 0;
this.fractional = 0;
}

process(inputs) {
const input = inputs[0];
if (!input || input.length === 0 || !input[0] || input[0].length === 0) {
return true;
}
const mono = input[0];
let pos = this.fractional;
while (pos < mono.length) {
const i = Math.floor(pos);
const frac = pos - i;
const a = i === 0 ? this.lastSample : mono[i - 1];
const b = mono[i];
const sample = a + (b - a) * frac;
const clamped = sample < -1 ? -1 : sample > 1 ? 1 : sample;
this.buffer[this.bufferIdx++] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
if (this.bufferIdx === this.chunkSize) {
const out = new Int16Array(this.chunkSize);
out.set(this.buffer);
this.samplesSent += this.chunkSize;
this.port.postMessage(
{ pcm: out.buffer, samplesSent: this.samplesSent },
[out.buffer],
);
this.bufferIdx = 0;
}
pos += this.ratio;
}
this.lastSample = mono[mono.length - 1];
this.fractional = pos - mono.length;
return true;
}
}
registerProcessor("${PCM16_ENCODER_PROCESSOR_NAME}", Pcm16EncoderProcessor);
`;
|
|
1390
|
+
|
|
1391
|
+
/** Sample rate (Hz) the encoders resample to when the caller gives none. */
const DEFAULT_TARGET_RATE = 16000;
/** Length in milliseconds of each PCM chunk posted by the encoder worklet. */
const DEFAULT_CHUNK_MS = 50;
/** Channel tag attached to audio coming from `micStream`. */
const MIC_CHANNEL = "mic";
/** Channel tag attached to audio coming from `systemStream`. */
const SYSTEM_CHANNEL = "system";
/**
 * Browser-only adapter that pumps two `MediaStream`s into a `StreamingTranscriber`
 * configured for dual-channel mode. Each `MediaStream` runs through its own
 * `pcm16-encoder` AudioWorklet (resample to `targetSampleRate`, encode to Int16
 * PCM); each PCM chunk is forwarded via `transcriber.sendAudio(pcm, { channel })`.
 *
 * All dual-channel orchestration (mixing, VAD, per-word attribution) lives inside
 * `StreamingTranscriber` — this class is a pure I/O adapter. Non-browser runtimes
 * can replicate its job by pushing tagged PCM into `transcriber.sendAudio` directly.
 *
 * Caller responsibilities:
 * - **Echo cancellation** is set at `getUserMedia` time (`audio: { echoCancellation: true }`).
 * - **System-audio capture** is platform-dependent. Chrome's `getDisplayMedia({ audio: true })`
 *   captures tab audio (and on Windows, full system audio when sharing the whole screen).
 *   macOS requires a virtual loopback driver (e.g. BlackHole) to expose system audio at all.
 * - **Token auth.** Construct the transcriber with `token` — API-key auth is unsupported in browsers.
 * - **Stream ownership.** `stop()` tears down the AudioContext but does NOT stop the
 *   `MediaStreamTrack`s passed in — callers own those.
 */
class DualChannelCapture {
    /**
     * @param params - `{ micStream, systemStream, transcriber, targetSampleRate? }`;
     *   `targetSampleRate` defaults to `DEFAULT_TARGET_RATE`.
     * @throws BrowserOnlyError when `AudioContext` is not available.
     */
    constructor(params) {
        this.running = false;
        if (typeof globalThis.AudioContext === "undefined") {
            throw new BrowserOnlyError();
        }
        const requestedRate = params.targetSampleRate;
        this.params = {
            micStream: params.micStream,
            systemStream: params.systemStream,
            transcriber: params.transcriber,
            targetSampleRate: requestedRate !== null && requestedRate !== undefined
                ? requestedRate
                : DEFAULT_TARGET_RATE,
        };
    }
    /**
     * Registers a listener. Only the `"error"` event is supported; it receives
     * errors thrown while forwarding audio to the transcriber. A later call
     * replaces the previous listener.
     */
    on(event, listener) {
        if (event === "error")
            this.errorListener = listener;
    }
    /**
     * Wire the capture pipeline and start pumping tagged PCM into the transcriber.
     * The transcriber must already be connected. Returns once the worklet is
     * registered and the audio graph is live.
     *
     * If any step fails (worklet registration, or creating a source from a
     * stream that has no audio track) everything created so far is torn down
     * and the AudioContext is closed before the error is rethrown, so a failed
     * start does not leak an audio context and can be retried.
     *
     * @throws Error when capture has already been started.
     */
    start() {
        return __awaiter(this, void 0, void 0, function* () {
            if (this.running) {
                throw new Error("DualChannelCapture already started");
            }
            this.context = new AudioContext();
            try {
                const blob = new Blob([pcm16EncoderWorkletSource], {
                    type: "application/javascript",
                });
                const url = URL.createObjectURL(blob);
                try {
                    yield this.context.audioWorklet.addModule(url);
                }
                finally {
                    // The module is loaded (or failed); the Blob URL is no longer needed.
                    URL.revokeObjectURL(url);
                }
                this.micSource = this.context.createMediaStreamSource(this.params.micStream);
                this.sysSource = this.context.createMediaStreamSource(this.params.systemStream);
                this.micEncoder = this.makeEncoder(MIC_CHANNEL);
                this.sysEncoder = this.makeEncoder(SYSTEM_CHANNEL);
                this.micSource.connect(this.micEncoder);
                this.sysSource.connect(this.sysEncoder);
            }
            catch (err) {
                try {
                    yield this.teardown();
                }
                catch (_cleanupError) {
                    // Cleanup is best-effort; surface the original failure.
                }
                throw err;
            }
            this.running = true;
        });
    }
    /**
     * Creates one encoder worklet node whose PCM chunks are forwarded to the
     * transcriber tagged with `channel`. Errors thrown by `sendAudio` go to
     * the `"error"` listener instead of escaping the message handler.
     */
    makeEncoder(channel) {
        const node = new AudioWorkletNode(this.context, PCM16_ENCODER_PROCESSOR_NAME, {
            numberOfInputs: 1,
            numberOfOutputs: 0,
            channelCount: 1,
            channelCountMode: "explicit",
            channelInterpretation: "speakers",
            processorOptions: {
                targetRate: this.params.targetSampleRate,
                chunkMs: DEFAULT_CHUNK_MS,
            },
        });
        node.port.onmessage = (e) => {
            try {
                this.params.transcriber.sendAudio(e.data.pcm, { channel });
            }
            catch (err) {
                if (this.errorListener !== null && this.errorListener !== undefined) {
                    this.errorListener.call(this, err);
                }
            }
        };
        return node;
    }
    /**
     * Tear down internal nodes and close the AudioContext. Does NOT stop the
     * caller-provided MediaStream tracks — they remain available for preview UI,
     * recording, etc. Idempotent.
     */
    stop() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.running)
                return;
            this.running = false;
            yield this.teardown();
        });
    }
    /**
     * Releases every audio resource this instance holds: closes the encoder
     * ports, disconnects all nodes, closes the AudioContext and clears the
     * references. Each step is attempted independently so one failing
     * disconnect cannot prevent the rest of the cleanup.
     */
    teardown() {
        return __awaiter(this, void 0, void 0, function* () {
            const encoders = [this.micEncoder, this.sysEncoder];
            const steps = [
                ...encoders.map((encoder) => () => encoder && encoder.port.close()),
                ...encoders.map((encoder) => () => encoder && encoder.disconnect()),
                () => this.micSource && this.micSource.disconnect(),
                () => this.sysSource && this.sysSource.disconnect(),
            ];
            for (const step of steps) {
                try {
                    step();
                }
                catch (_ignored) {
                    // Disconnecting already-disconnected nodes throws in some browsers; ignore.
                }
            }
            const context = this.context;
            this.context = undefined;
            this.micSource = undefined;
            this.sysSource = undefined;
            this.micEncoder = undefined;
            this.sysEncoder = undefined;
            if (context && context.state !== "closed") {
                yield context.close();
            }
        });
    }
}
|
|
1518
|
+
|
|
1519
|
+
/**
 * Linear-interpolation resampler for streaming Float32 audio. Stateful across
 * `process()` calls so chunk boundaries don't introduce phase discontinuities:
 * the last input sample and a fractional read position are carried over.
 *
 * Linear interpolation is good enough for ASR ingest — the downstream
 * StreamingTranscriber band-limits at the target rate anyway, and a polyphase
 * filter would be overkill in the AudioWorklet hot path. If a customer needs
 * higher quality they can supply their own VadDetector + bypass the encoder.
 */
class LinearResampler {
    /**
     * @param sourceRate - Sample rate of the incoming audio, in Hz.
     * @param targetRate - Sample rate of the produced audio, in Hz.
     * @throws Error when either rate is not a positive, finite number. A
     *   non-finite rate would give a step of 0 or NaN, which makes
     *   `process()` spin forever or corrupts the carried read position.
     */
    constructor(sourceRate, targetRate) {
        const isUsableRate = (rate) => rate > 0 && rate < Infinity;
        if (!isUsableRate(sourceRate) || !isUsableRate(targetRate)) {
            throw new Error("sourceRate and targetRate must be positive finite numbers");
        }
        this.sourceRate = sourceRate;
        this.targetRate = targetRate;
        this.lastSample = 0;
        this.fractional = 0;
        // Input samples advanced per output sample.
        this.ratio = sourceRate / targetRate;
    }
    /**
     * Resamples one chunk.
     *
     * @param input - Float32 samples at `sourceRate`.
     * @returns Samples at `targetRate`. When both rates are equal, `input`
     *   itself is returned unchanged; otherwise a view over a new buffer.
     */
    process(input) {
        if (this.sourceRate === this.targetRate) {
            return input;
        }
        // Worst-case output length; we'll slice to actual.
        const out = new Float32Array(Math.ceil(input.length / this.ratio) + 1);
        let outIdx = 0;
        let pos = this.fractional;
        while (pos < input.length) {
            const i = Math.floor(pos);
            const frac = pos - i;
            // At index 0 the left neighbour is the final sample of the previous chunk.
            const a = i === 0 ? this.lastSample : input[i - 1];
            const b = input[i];
            out[outIdx++] = a + (b - a) * frac;
            pos += this.ratio;
        }
        // An empty chunk has no last element; keep the previous carry-over.
        const tail = input[input.length - 1];
        this.lastSample = tail !== null && tail !== undefined ? tail : this.lastSample;
        this.fractional = pos - input.length;
        return out.subarray(0, outIdx);
    }
    /** Clears the carried-over sample and read position, e.g. between streams. */
    reset() {
        this.lastSample = 0;
        this.fractional = 0;
    }
}
|
|
1566
|
+
/**
 * Converts Float32 PCM samples (nominal range -1..1) to little-endian Int16
 * PCM. Samples outside the range are clamped; negative values are scaled by
 * 0x8000 and non-negative values by 0x7fff.
 *
 * @param input - Float32 samples.
 * @returns An ArrayBuffer holding two bytes per input sample.
 */
function float32ToPcm16(input) {
    const BYTES_PER_SAMPLE = 2;
    const pcm = new DataView(new ArrayBuffer(input.length * BYTES_PER_SAMPLE));
    for (let idx = 0, offset = 0; idx < input.length; idx += 1, offset += BYTES_PER_SAMPLE) {
        const bounded = Math.min(1, Math.max(-1, input[idx]));
        const scale = bounded < 0 ? 0x8000 : 0x7fff;
        pcm.setInt16(offset, bounded * scale, true);
    }
    return pcm.buffer;
}
|
|
1576
|
+
|
|
1577
|
+
export { BrowserOnlyError, DualChannelCapture, EnergyVad, LinearResampler, RealtimeService, RealtimeTranscriber, StreamingServiceFactory, StreamingTranscriber, StreamingTranscriberFactory, VadTimeline, attributeTurn, attributeWord, float32ToPcm16, rollUpTurnChannel };
|