@whereby.com/assistant-sdk 0.0.0-canary-20250916140846 → 0.0.0-canary-20250923130059

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -35,6 +35,10 @@ const TRIGGER_EVENT_SUCCESS = "trigger_event_success";
  const AUDIO_STREAM_READY = "AUDIO_STREAM_READY";
  const ASSISTANT_JOINED_ROOM = "ASSISTANT_JOINED_ROOM";
  const ASSISTANT_LEFT_ROOM = "ASSISTANT_LEFT_ROOM";
+ const PARTICIPANT_VIDEO_TRACK_ADDED = "PARTICIPANT_VIDEO_TRACK_ADDED";
+ const PARTICIPANT_VIDEO_TRACK_REMOVED = "PARTICIPANT_VIDEO_TRACK_REMOVED";
+ const PARTICIPANT_AUDIO_TRACK_ADDED = "PARTICIPANT_AUDIO_TRACK_ADDED";
+ const PARTICIPANT_AUDIO_TRACK_REMOVED = "PARTICIPANT_AUDIO_TRACK_REMOVED";

  /******************************************************************************
  Copyright (c) Microsoft Corporation.
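
The four constants added above back new per-track events on `Assistant`, and they are re-exported from the CJS entry point (see the exports hunk at the end of this diff). A minimal consumer sketch, assuming the constructor takes the assistant key positionally (as `this.assistantKey = assistantKey` further down suggests) and using a placeholder room URL; the payload shape matches the `emit` calls added later in this diff:

```ts
import {
    Assistant,
    PARTICIPANT_VIDEO_TRACK_ADDED,
    PARTICIPANT_VIDEO_TRACK_REMOVED,
} from "@whereby.com/assistant-sdk";

// Hypothetical key and room URL, for illustration only.
const assistant = new Assistant("my-assistant-key");

assistant.on(PARTICIPANT_VIDEO_TRACK_ADDED, ({ participantId, track }) => {
    console.log(`participant ${participantId} added video track ${track.id}`);
});
assistant.on(PARTICIPANT_VIDEO_TRACK_REMOVED, ({ participantId, track }) => {
    console.log(`participant ${participantId} removed video track ${track.id}`);
});

void assistant.joinRoom("https://example.whereby.com/room");
```
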
@@ -99,301 +103,362 @@ const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
  const BYTES_PER_SAMPLE = 2;
  // 480 samples per 10ms frame at 48kHz
  const FRAME_10MS_SAMPLES = 480;
- const slotBuffers = new Map();
- function appendAndDrainTo480(slot, newSamples) {
- var _a;
- const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
- const merged = new Int16Array(prev.length + newSamples.length);
- merged.set(prev, 0);
- merged.set(newSamples, prev.length);
- let offset = 0;
- while (merged.length - offset >= FRAME_10MS_SAMPLES) {
- const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
- enqueueFrame(slot, chunk); // always 480
- offset += FRAME_10MS_SAMPLES;
- }
- slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
- }
- ({
- enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
- enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
- wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
- wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
- lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
- });
- let slots = [];
- let stopPacerFn = null;
- let outputPacerState = null;
- /**
- * Simple linear interpolation resampler to convert audio to 48kHz.
- * This handles the common case of 16kHz -> 48kHz (3x upsampling).
- */
- function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
- const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
- const outputLength = Math.floor(inputFrames * ratio);
- const output = new Int16Array(outputLength);
- for (let i = 0; i < outputLength; i++) {
- const inputIndex = i / ratio;
- const index = Math.floor(inputIndex);
- const fraction = inputIndex - index;
- if (index + 1 < inputSamples.length) {
- const sample1 = inputSamples[index];
- const sample2 = inputSamples[index + 1];
- output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
+ function createFfmpegMixer() {
+ const slotBuffers = new Map();
+ function appendAndDrainTo480(slot, newSamples) {
+ var _a;
+ const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
+ const merged = new Int16Array(prev.length + newSamples.length);
+ merged.set(prev, 0);
+ merged.set(newSamples, prev.length);
+ let offset = 0;
+ while (merged.length - offset >= FRAME_10MS_SAMPLES) {
+ const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
+ enqueueFrame(slot, chunk); // always 480
+ offset += FRAME_10MS_SAMPLES;
  }
- else {
- output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
+ }
+ ({
+ enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
+ enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
+ wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
+ wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
+ lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
+ });
+ let slots = [];
+ let stopPacerFn = null;
+ let outputPacerState = null;
+ /**
+ * Simple linear interpolation resampler to convert audio to 48kHz.
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
+ */
+ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
+ const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
+ const outputLength = Math.floor(inputFrames * ratio);
+ const output = new Int16Array(outputLength);
+ for (let i = 0; i < outputLength; i++) {
+ const inputIndex = i / ratio;
+ const index = Math.floor(inputIndex);
+ const fraction = inputIndex - index;
+ if (index + 1 < inputSamples.length) {
+ const sample1 = inputSamples[index];
+ const sample2 = inputSamples[index + 1];
+ output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
+ }
+ else {
+ output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
+ }
+ }
+ return output;
+ }
+ /**
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
+ */
+ function enqueueOutputFrame(samples) {
+ if (outputPacerState) {
+ outputPacerState.frameQueue.push(samples);
  }
  }
- return output;
- }
- /**
- * Enqueue an audio frame for paced delivery to the RTCAudioSource.
- */
- function enqueueOutputFrame(samples) {
- if (outputPacerState) {
- outputPacerState.frameQueue.push(samples);
- }
- }
- /**
- * Start the audio pacer loop for all input slots in an FFmpeg process.
- *
- * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
- * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
- * arrive jittery, bursty, or with slightly different clocks.
- *
- * Key behavior:
- * - Writes exactly one frame per period, on a shared wall-clock grid.
- * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
- * never stalls.
- * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
- * - Honors Node stream backpressure (`write()` return false) without breaking
- * the timing grid.
- *
- * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
- * can mix them without slow-downs or drift.
- *
- * Call this once right after spawning FFmpeg:
- * ```ts
- * const ff = spawnFFmpegProcess();
- * startPacer(ff, PARTICIPANT_SLOTS);
- * ```
- *
- * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
- *
- * @param ff Child process handle from spawn("ffmpeg", ...)
- * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
- */
- function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
- if (stopPacerFn) {
- stopPacerFn();
- stopPacerFn = null;
- }
- const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
- const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
- const t0 = nowMs();
- slots = Array.from({ length: slotCount }, () => ({
- q: [],
- lastFrames: FRAME_10MS_SAMPLES, // keep constant
- nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
- }));
- outputPacerState = {
- frameQueue: [],
- nextDueMs: t0 + outputFrameMs,
- rtcAudioSource,
- onAudioStreamReady,
- didEmitReadyEvent: false,
- };
- const iv = setInterval(() => {
- const t = nowMs();
- for (let s = 0; s < slotCount; s++) {
- const st = slots[s];
- const w = writers[s];
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
- if (t >= st.nextDueMs) {
- const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
- if (!w.write(buf)) {
- // Just continue without adding drain listener - backpressure will naturally resolve
+ /**
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
+ *
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
+ * arrive jittery, bursty, or with slightly different clocks.
+ *
+ * Key behavior:
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
+ * never stalls.
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
+ * - Honors Node stream backpressure (`write()` return false) without breaking
+ * the timing grid.
+ *
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
+ * can mix them without slow-downs or drift.
+ *
+ * Call this once right after spawning FFmpeg:
+ * ```ts
+ * const ff = spawnFFmpegProcess();
+ * startPacer(ff, PARTICIPANT_SLOTS);
+ * ```
+ *
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
+ *
+ * @param ff Child process handle from spawn("ffmpeg", ...)
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
+ */
+ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
+ if (stopPacerFn) {
+ stopPacerFn();
+ stopPacerFn = null;
+ }
+ const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
+ const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
+ const t0 = nowMs();
+ slots = Array.from({ length: slotCount }, () => ({
+ q: [],
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
+ nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
+ }));
+ outputPacerState = {
+ frameQueue: [],
+ nextDueMs: t0 + outputFrameMs,
+ rtcAudioSource,
+ onAudioStreamReady,
+ didEmitReadyEvent: false,
+ };
+ const iv = setInterval(() => {
+ const t = nowMs();
+ for (let s = 0; s < slotCount; s++) {
+ const st = slots[s];
+ const w = writers[s];
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
+ if (t >= st.nextDueMs) {
+ const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
+ if (!w.write(buf)) {
+ // Just continue without adding drain listener - backpressure will naturally resolve
+ const late = t - st.nextDueMs;
+ const steps = Math.max(1, Math.ceil(late / frameMs));
+ st.nextDueMs += steps * frameMs;
+ continue;
+ }
  const late = t - st.nextDueMs;
  const steps = Math.max(1, Math.ceil(late / frameMs));
  st.nextDueMs += steps * frameMs;
- continue;
  }
- const late = t - st.nextDueMs;
- const steps = Math.max(1, Math.ceil(late / frameMs));
- st.nextDueMs += steps * frameMs;
  }
- }
- if (!outputPacerState)
- return;
- // Handle output pacer for RTCAudioSource
- const state = outputPacerState;
- if (t >= state.nextDueMs) {
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
- if (!state.didEmitReadyEvent) {
- state.onAudioStreamReady();
- state.didEmitReadyEvent = true;
+ if (!outputPacerState)
+ return;
+ // Handle output pacer for RTCAudioSource
+ const state = outputPacerState;
+ if (t >= state.nextDueMs) {
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
+ if (!state.didEmitReadyEvent) {
+ state.onAudioStreamReady();
+ state.didEmitReadyEvent = true;
+ }
+ state.rtcAudioSource.onData({
+ samples: samples,
+ sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
+ });
+ const late = t - state.nextDueMs;
+ const steps = Math.max(1, Math.ceil(late / outputFrameMs));
+ state.nextDueMs += steps * outputFrameMs;
  }
- state.rtcAudioSource.onData({
- samples: samples,
- sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
- });
- const late = t - state.nextDueMs;
- const steps = Math.max(1, Math.ceil(late / outputFrameMs));
- state.nextDueMs += steps * outputFrameMs;
+ }, 5);
+ stopPacerFn = () => clearInterval(iv);
+ }
+ /**
+ * Stop the audio pacer loop and clear all input slots.
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
+ */
+ function stopPacer() {
+ if (stopPacerFn)
+ stopPacerFn();
+ stopPacerFn = null;
+ slots = [];
+ slotBuffers.clear();
+ outputPacerState = null;
+ }
+ /**
+ * Queue a live frame for a given slot (0..N-1).
+ * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
+ */
+ function enqueueFrame(slot, samples, numberOfFrames) {
+ const st = slots[slot];
+ if (!st)
+ return;
+ const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
+ st.q.push(buf);
+ }
+ /**
+ * Clear the audio queue for a specific slot when a participant leaves.
+ * This prevents stale audio data from continuing to play after disconnect.
+ */
+ function clearSlotQueue(slot) {
+ const st = slots[slot];
+ if (st) {
+ st.q = [];
+ slotBuffers.delete(slot);
+ const now = Number(process.hrtime.bigint()) / 1e6;
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
+ st.nextDueMs = now + frameMs;
  }
- }, 5);
- stopPacerFn = () => clearInterval(iv);
- }
- /**
- * Stop the audio pacer loop and clear all input slots.
- * Call this before killing the FFmpeg process to ensure clean shutdown.
- */
- function stopPacer() {
- if (stopPacerFn)
- stopPacerFn();
- stopPacerFn = null;
- slots = [];
- }
- /**
- * Queue a live frame for a given slot (0..N-1).
- * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
- */
- function enqueueFrame(slot, samples, numberOfFrames) {
- const st = slots[slot];
- if (!st)
- return;
- const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
- st.q.push(buf);
- }
- /**
- * Clear the audio queue for a specific slot when a participant leaves.
- * This prevents stale audio data from continuing to play after disconnect.
- */
- function clearSlotQueue(slot) {
- const st = slots[slot];
- if (st) {
- st.q = [];
- const now = Number(process.hrtime.bigint()) / 1e6;
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
- st.nextDueMs = now + frameMs;
  }
- }
- /**
- * Get the FFmpeg arguments for mixing audio from multiple participants.
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
- */
- function getFFmpegArguments() {
- const N = PARTICIPANT_SLOTS;
- const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
- const ffArgs = [];
- for (let i = 0; i < N; i++) {
- ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
- }
- const pre = [];
- for (let i = 0; i < N; i++) {
- pre.push(`[${i}:a]aresample=async=1:first_pts=0,asetpts=N/SR/TB[a${i}]`);
- }
- const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
- const amix = `${labels}amix=inputs=${N}:duration=longest:dropout_transition=250:normalize=0[mix]`;
- const filter = `${pre.join(";")};${amix}`;
- ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
- return ffArgs;
- }
- /**
- * Spawn a new FFmpeg process for mixing audio from multiple participants.
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
- * The process will log its output to stderr.
- * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
- * @return The spawned FFmpeg process.
- */
- function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
- const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
- const args = getFFmpegArguments();
- const ffmpegProcess = child_process.spawn("ffmpeg", args, { stdio });
- startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
- ffmpegProcess.stderr.setEncoding("utf8");
- ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
- ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
- let audioBuffer = Buffer.alloc(0);
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
- ffmpegProcess.stdout.on("data", (chunk) => {
- audioBuffer = Buffer.concat([audioBuffer, chunk]);
- while (audioBuffer.length >= FRAME_SIZE_BYTES) {
- const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
- const samples = new Int16Array(FRAME_10MS_SAMPLES);
- for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
- samples[i] = frameData.readInt16LE(i * 2);
- }
- enqueueOutputFrame(samples);
- audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
+ /**
+ * Get the FFmpeg arguments for debugging, which writes each participant's audio to a separate WAV file
+ * and also mixes them into a single WAV file.
+ * This is useful for inspecting the audio quality and timing of each participant.
+ */
+ function getFFmpegArgumentsDebug() {
+ const N = PARTICIPANT_SLOTS;
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
+ const ffArgs = [];
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
  }
- });
- return ffmpegProcess;
- }
- /**
- * Write audio data from a MediaStreamTrack to the FFmpeg process.
- * This function creates an AudioSink for the track and sets up a data handler
- * that enqueues audio frames into the pacer.
- *
- * @param ffmpegProcess The FFmpeg process to which audio data will be written.
- * @param slot The participant slot number (0..N-1) to which this track belongs.
- * @param audioTrack The MediaStreamTrack containing the audio data.
- * @return An object containing the AudioSink, the writable stream, and a stop function.
- */
- function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
- const writer = ffmpegProcess.stdio[3 + slot];
- const sink = new AudioSink(audioTrack);
- const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
- if (ch !== 1 || bitsPerSample !== 16)
- return;
- let out = samples;
- if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
- const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
- out = resampled;
- }
- appendAndDrainTo480(slot, out);
- });
- const stop = () => {
- try {
- unsubscribe();
- sink.stop();
+ const pre = [];
+ for (let i = 0; i < N; i++) {
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS,asplit=2[a${i}tap][a${i}mix]`);
  }
- catch (_a) {
- console.error("Failed to stop AudioSink");
+ const mixInputs = Array.from({ length: N }, (_, i) => `[a${i}mix]`).join("");
+ const filter = `${pre.join(";")};${mixInputs}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "info", "-y", "-filter_complex", filter);
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-map", `[a${i}tap]`, "-f", "wav", "-c:a", "pcm_s16le", `pre${i}.wav`);
  }
- };
- return { sink, writer, stop };
- }
- /**
- * Stop the FFmpeg process and clean up all resources.
- * This function will unpipe the stdout, end all writable streams for each participant slot,
- * and kill the FFmpeg process.
- * @param ffmpegProcess The FFmpeg process to stop.
- */
- function stopFFmpegProcess(ffmpegProcess) {
- stopPacer();
- if (ffmpegProcess && !ffmpegProcess.killed) {
- try {
- ffmpegProcess.stdout.unpipe();
+ ffArgs.push("-map", "[mix]", "-f", "wav", "-c:a", "pcm_s16le", "mixed.wav");
+ return ffArgs;
+ }
+ /**
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
+ */
+ function getFFmpegArguments() {
+ const N = PARTICIPANT_SLOTS;
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
+ const ffArgs = [];
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
  }
- catch (_a) {
- console.error("Failed to unpipe ffmpeg stdout");
+ const pre = [];
+ for (let i = 0; i < N; i++) {
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a${i}]`);
  }
- for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
- const w = ffmpegProcess.stdio[3 + i];
+ const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
+ const amix = `${labels}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
+ const filter = `${pre.join(";")};${amix}`;
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
+ return ffArgs;
+ }
+ /*
+ * Spawn a new FFmpeg process for debugging purposes.
+ * This will write each participant's audio to a separate WAV file and also mix them into a single WAV file.
+ * The output files will be named pre0.wav, pre1.wav, ..., and mixed.wav.
+ * The process will log its output to stderr.
+ * @return The spawned FFmpeg process.
+ */
+ function spawnFFmpegProcessDebug(rtcAudioSource, onAudioStreamReady) {
+ const stdio = ["ignore", "ignore", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
+ const args = getFFmpegArgumentsDebug();
+ const ffmpegProcess = child_process.spawn("ffmpeg", args, { stdio });
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
+ ffmpegProcess.stderr.setEncoding("utf8");
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error (debug): is ffmpeg installed?"));
+ return ffmpegProcess;
+ }
+ /**
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
+ * The process will log its output to stderr.
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
+ * @return The spawned FFmpeg process.
+ */
+ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
+ const stdio = ["pipe", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
+ const args = getFFmpegArguments();
+ const ffmpegProcess = child_process.spawn("ffmpeg", args, { stdio });
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
+ ffmpegProcess.stderr.setEncoding("utf8");
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
+ let audioBuffer = Buffer.alloc(0);
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
+ ffmpegProcess.stdout.on("data", (chunk) => {
+ audioBuffer = Buffer.concat([audioBuffer, chunk]);
+ while (audioBuffer.length >= FRAME_SIZE_BYTES) {
+ const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
+ const samples = new Int16Array(FRAME_10MS_SAMPLES);
+ for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
+ samples[i] = frameData.readInt16LE(i * 2);
+ }
+ enqueueOutputFrame(samples);
+ audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
+ }
+ });
+ return ffmpegProcess;
+ }
+ /**
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
+ * This function creates an AudioSink for the track and sets up a data handler
+ * that enqueues audio frames into the pacer.
+ *
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
+ * @param audioTrack The MediaStreamTrack containing the audio data.
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
+ */
+ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
+ const writer = ffmpegProcess.stdio[3 + slot];
+ const sink = new AudioSink(audioTrack);
+ const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
+ if (ch !== 1 || bitsPerSample !== 16)
+ return;
+ let out = samples;
+ if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
+ const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
+ out = resampled;
+ }
+ appendAndDrainTo480(slot, out);
+ });
+ const stop = () => {
+ try {
+ unsubscribe();
+ sink.stop();
+ }
+ catch (_a) {
+ console.error("Failed to stop AudioSink");
+ }
+ };
+ return { sink, writer, stop };
+ }
+ /**
+ * Stop the FFmpeg process and clean up all resources.
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
+ * and kill the FFmpeg process.
+ * @param ffmpegProcess The FFmpeg process to stop.
+ */
+ function stopFFmpegProcess(ffmpegProcess) {
+ var _a, _b;
+ stopPacer();
+ if (ffmpegProcess && !ffmpegProcess.killed) {
+ try {
+ ffmpegProcess.stdout.unpipe();
+ }
+ catch (_c) {
+ console.error("Failed to unpipe ffmpeg stdout");
+ }
+ for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
+ const w = ffmpegProcess.stdio[3 + i];
+ try {
+ w.end();
+ }
+ catch (_d) {
+ console.error("Failed to end ffmpeg writable stream");
+ }
+ }
  try {
- w.end();
+ (_a = ffmpegProcess.stdin) === null || _a === void 0 ? void 0 : _a.write("q\n");
+ (_b = ffmpegProcess.stdin) === null || _b === void 0 ? void 0 : _b.end();
  }
- catch (_b) {
- console.error("Failed to end ffmpeg writable stream");
+ catch (_e) {
+ console.error("Failed to end ffmpeg stdin");
  }
  }
- ffmpegProcess.kill("SIGTERM");
  }
+ return {
+ spawnFFmpegProcess,
+ spawnFFmpegProcessDebug,
+ writeAudioDataToFFmpeg,
+ stopFFmpegProcess,
+ clearSlotQueue,
+ };
  }

  class AudioMixer extends EventEmitter.EventEmitter {
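
Two behavioral changes ride along with the `createFfmpegMixer` refactor above: the per-input chain drops async resampling in favor of start-normalized timestamps, and `amix` switches from `duration=longest:dropout_transition=250` to `duration=first:dropout_transition=0`. For concreteness, here is the filter graph `getFFmpegArguments` now assembles, sketched for an assumed two-slot build:

```ts
// Old per-input chain (async stretch/squeeze resampling, frame-counter PTS):
//   [0:a]aresample=async=1:first_pts=0,asetpts=N/SR/TB[a0];
//   [1:a]aresample=async=1:first_pts=0,asetpts=N/SR/TB[a1];
//   [a0][a1]amix=inputs=2:duration=longest:dropout_transition=250:normalize=0[mix]
//
// New chain: no async resampling, PTS rebased to zero, mix ends with the first
// input and drops inputs out instantly.
const N = 2; // assumed slot count, for illustration only
const pre = Array.from({ length: N }, (_, i) =>
    `[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a${i}]`);
const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
const filter =
    `${pre.join(";")};${labels}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
// => "[0:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a0];
//     [1:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a1];
//     [a0][a1]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[mix]"
console.log(filter);
```
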
@@ -404,6 +469,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  this.rtcAudioSource = null;
  this.participantSlots = new Map();
  this.activeSlots = {};
+ this.mixer = createFfmpegMixer();
  this.setupMediaStream();
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
  this.onStreamReady = onStreamReady;
@@ -422,7 +488,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  return;
  }
  if (!this.ffmpegProcess && this.rtcAudioSource) {
- this.ffmpegProcess = spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
+ this.ffmpegProcess = this.mixer.spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
  }
  for (const p of participants)
  this.attachParticipantIfNeeded(p);
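
The ffmpeg process spawned lazily above is fed by the pacer from the large hunk earlier; when a timer tick fires late, the pacer re-snaps the deadline forward by whole frames rather than writing a burst. A small sketch of that catch-up arithmetic using this file's constants:

```ts
const FRAME_10MS_SAMPLES = 480;
const SAMPLE_RATE_HZ = 48000;
const frameMs = (FRAME_10MS_SAMPLES / SAMPLE_RATE_HZ) * 1000; // 10 ms

// If a tick lands 27 ms past a slot's deadline, the pacer advances the
// deadline by ceil(27 / 10) = 3 whole frames, so later writes stay on the
// original 10 ms wall-clock grid instead of drifting or bunching up.
const late = 27; // hypothetical lateness, for illustration
const steps = Math.max(1, Math.ceil(late / frameMs)); // 3
const advanceMs = steps * frameMs; // 30 ms
console.log({ frameMs, steps, advanceMs });
```
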
@@ -435,7 +501,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  }
  stopAudioMixer() {
  if (this.ffmpegProcess) {
- stopFFmpegProcess(this.ffmpegProcess);
+ this.mixer.stopFFmpegProcess(this.ffmpegProcess);
  this.ffmpegProcess = null;
  }
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
@@ -488,7 +554,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  }
  this.activeSlots[slot] = undefined;
  }
- const { sink, writer, stop } = writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
+ const { sink, writer, stop } = this.mixer.writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
  this.activeSlots[slot] = { sink, writer, stop, trackId: audioTrack.id };
  (_a = audioTrack.addEventListener) === null || _a === void 0 ? void 0 : _a.call(audioTrack, "ended", () => this.detachParticipant(participantId));
  }
@@ -507,7 +573,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  this.activeSlots[slot] = undefined;
  }
  // Clear any queued audio data for this slot to prevent stale audio
- clearSlotQueue(slot);
+ this.mixer.clearSlotQueue(slot);
  this.participantSlots.set(slot, "");
  }
  }
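
Note that the factory's `clearSlotQueue` now also calls `slotBuffers.delete(slot)`, discarding the sub-frame remainder kept by `appendAndDrainTo480`, not just the queued frames. A sketch of why that matters, with hypothetical numbers:

```ts
// Append-and-drain semantics, mirrored from appendAndDrainTo480: feeding
// 1000 samples into a slot emits two full 480-sample frames and keeps a
// 40-sample tail in slotBuffers.
const fed = 1000; // hypothetical sample count
const FRAME = 480;
const fullFrames = Math.floor(fed / FRAME); // 2
const remainder = fed - fullFrames * FRAME; // 40 samples retained
// Without slotBuffers.delete(slot) on detach, those 40 stale samples would
// be prepended to the first frame of whichever participant is assigned the
// slot next.
console.log({ fullFrames, remainder });
```
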
@@ -518,6 +584,7 @@ class Assistant extends EventEmitter {
  this.mediaStream = null;
  this.audioSource = null;
  this.combinedStream = null;
+ this.remoteMediaTracks = {};
  this.roomUrl = null;
  this.handleConnectionStatusChange = (status) => {
  if (status === "connected") {
@@ -527,6 +594,41 @@ class Assistant extends EventEmitter {
  this.emit(ASSISTANT_LEFT_ROOM, { roomUrl: this.roomUrl || "" });
  }
  };
+ this.handleRemoteParticipantsTracksChange = (remoteParticipants) => {
+ const currentRemoteMediaTracks = remoteParticipants.flatMap(({ id: participantId, stream }) => {
+ if (!stream) {
+ return [];
+ }
+ const tracks = stream.getTracks();
+ tracks.forEach((track) => {
+ if (!this.remoteMediaTracks[track.id]) {
+ const eventName = track.kind === "video" ? PARTICIPANT_VIDEO_TRACK_ADDED : PARTICIPANT_AUDIO_TRACK_ADDED;
+ this.emit(eventName, {
+ participantId,
+ stream,
+ track,
+ });
+ this.remoteMediaTracks[track.id] = {
+ participantId,
+ stream,
+ track,
+ };
+ }
+ });
+ return tracks;
+ });
+ Object.values(this.remoteMediaTracks).forEach(({ participantId, stream, track }) => {
+ if (!currentRemoteMediaTracks.includes(track)) {
+ const eventName = track.kind === "video" ? PARTICIPANT_VIDEO_TRACK_REMOVED : PARTICIPANT_AUDIO_TRACK_REMOVED;
+ this.emit(eventName, {
+ participantId,
+ stream,
+ track,
+ });
+ delete this.remoteMediaTracks[track.id];
+ }
+ });
+ };
  this.assistantKey = assistantKey;
  this.client = new core.WherebyClient();
  this.roomConnection = this.client.getRoomConnection();
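
The handler added above is essentially a set difference keyed by `track.id`: tracks present now but not cached are emitted as added, and cached tracks no longer present are emitted as removed. A condensed restatement of that pattern (not the SDK's API, just the shape of the bookkeeping):

```ts
type TrackInfo = { participantId: string; stream: MediaStream; track: MediaStreamTrack };

// Anything seen now but not cached is "added"; anything cached but no longer
// seen is "removed". The cache mirrors Assistant's remoteMediaTracks map.
function diffTracks(cache: Map<string, TrackInfo>, current: TrackInfo[]) {
    const added = current.filter(({ track }) => !cache.has(track.id));
    const currentIds = new Set(current.map(({ track }) => track.id));
    const removed = [...cache.values()].filter(({ track }) => !currentIds.has(track.id));
    return { added, removed };
}
```
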
@@ -551,8 +653,9 @@ class Assistant extends EventEmitter {
  const audioMixer = new AudioMixer(handleStreamReady);
  this.combinedStream = audioMixer.getCombinedAudioStream();
  this.roomConnection.subscribeToRemoteParticipants(audioMixer.handleRemoteParticipants.bind(audioMixer));
- this.roomConnection.subscribeToConnectionStatus(this.handleConnectionStatusChange);
  }
+ this.roomConnection.subscribeToConnectionStatus(this.handleConnectionStatusChange);
+ this.roomConnection.subscribeToRemoteParticipants(this.handleRemoteParticipantsTracksChange);
  }
  joinRoom(roomUrl) {
  return __awaiter(this, void 0, void 0, function* () {
@@ -669,7 +772,7 @@ const webhookRouter = (webhookTriggers, emitter) => {
  res.status(200);
  res.end();
  });
- router.post("/", jsonParser, (req, res) => {
+ router.post("/", jsonParser, (req, res) => __awaiter(void 0, void 0, void 0, function* () {
  var _a, _b, _c, _d, _e, _f, _g, _h;
  assert(req.body, "message body is required");
  assert("type" in req.body, "webhook type is required");
@@ -677,19 +780,19 @@ const webhookRouter = (webhookTriggers, emitter) => {
  switch (req.body.type) {
  case "room.client.joined":
  shouldTriggerOnReceivedWebhook =
- (_b = (_a = webhookTriggers["room.client.joined"]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body)) !== null && _b !== void 0 ? _b : false;
+ (_b = (yield ((_a = webhookTriggers["room.client.joined"]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body)))) !== null && _b !== void 0 ? _b : false;
  break;
  case "room.client.left":
  shouldTriggerOnReceivedWebhook =
- (_d = (_c = webhookTriggers["room.client.left"]) === null || _c === void 0 ? void 0 : _c.call(webhookTriggers, req.body)) !== null && _d !== void 0 ? _d : false;
+ (_d = (yield ((_c = webhookTriggers["room.client.left"]) === null || _c === void 0 ? void 0 : _c.call(webhookTriggers, req.body)))) !== null && _d !== void 0 ? _d : false;
  break;
  case "room.session.started":
  shouldTriggerOnReceivedWebhook =
- (_f = (_e = webhookTriggers["room.session.started"]) === null || _e === void 0 ? void 0 : _e.call(webhookTriggers, req.body)) !== null && _f !== void 0 ? _f : false;
+ (_f = (yield ((_e = webhookTriggers["room.session.started"]) === null || _e === void 0 ? void 0 : _e.call(webhookTriggers, req.body)))) !== null && _f !== void 0 ? _f : false;
  break;
  case "room.session.ended":
  shouldTriggerOnReceivedWebhook =
- (_h = (_g = webhookTriggers["room.session.ended"]) === null || _g === void 0 ? void 0 : _g.call(webhookTriggers, req.body)) !== null && _h !== void 0 ? _h : false;
+ (_h = (yield ((_g = webhookTriggers["room.session.ended"]) === null || _g === void 0 ? void 0 : _g.call(webhookTriggers, req.body)))) !== null && _h !== void 0 ? _h : false;
  break;
  }
  if (shouldTriggerOnReceivedWebhook) {
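
Because the route body is now an `__awaiter` coroutine and every trigger result is yielded, `webhookTriggers` callbacks may return a boolean or a `Promise<boolean>`; `undefined` still coalesces to `false`. A sketch of a trigger map that takes advantage of this; `checkGuestList` is a hypothetical async predicate:

```ts
// Triggers may now be async; the router awaits each result before deciding
// whether to emit the trigger event.
const webhookTriggers = {
    "room.client.joined": async (event: { type: string }) => {
        // Hypothetical async lookup; a Promise<boolean> now works where
        // previously only a plain boolean did.
        return await checkGuestList(event);
    },
    "room.session.ended": () => true, // synchronous booleans still work
};

async function checkGuestList(_event: unknown): Promise<boolean> {
    return true; // placeholder, for illustration only
}
```
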
@@ -698,7 +801,7 @@ const webhookRouter = (webhookTriggers, emitter) => {
  }
  res.status(200);
  res.end();
- });
+ }));
  return router;
  };
  class Trigger extends EventEmitter.EventEmitter {
@@ -711,12 +814,9 @@ class Trigger extends EventEmitter.EventEmitter {
  const app = express();
  const router = webhookRouter(this.webhookTriggers, this);
  app.use(router);
- const server = app.listen(this.port, () => {
+ app.listen(this.port, () => {
  // console.log(`Bot trigger server now running on port[${this.port}]`);
  });
- process.on("SIGTERM", () => {
- server.close();
- });
  }
  }

@@ -726,5 +826,9 @@ exports.AUDIO_STREAM_READY = AUDIO_STREAM_READY;
  exports.Assistant = Assistant;
  exports.AudioSink = AudioSink;
  exports.AudioSource = AudioSource;
+ exports.PARTICIPANT_AUDIO_TRACK_ADDED = PARTICIPANT_AUDIO_TRACK_ADDED;
+ exports.PARTICIPANT_AUDIO_TRACK_REMOVED = PARTICIPANT_AUDIO_TRACK_REMOVED;
+ exports.PARTICIPANT_VIDEO_TRACK_ADDED = PARTICIPANT_VIDEO_TRACK_ADDED;
+ exports.PARTICIPANT_VIDEO_TRACK_REMOVED = PARTICIPANT_VIDEO_TRACK_REMOVED;
  exports.TRIGGER_EVENT_SUCCESS = TRIGGER_EVENT_SUCCESS;
  exports.Trigger = Trigger;