@whereby.com/assistant-sdk 0.0.0-canary-20250916140846 → 0.0.0-canary-20250923130059

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -14,6 +14,10 @@ const TRIGGER_EVENT_SUCCESS = "trigger_event_success";
  const AUDIO_STREAM_READY = "AUDIO_STREAM_READY";
  const ASSISTANT_JOINED_ROOM = "ASSISTANT_JOINED_ROOM";
  const ASSISTANT_LEFT_ROOM = "ASSISTANT_LEFT_ROOM";
+ const PARTICIPANT_VIDEO_TRACK_ADDED = "PARTICIPANT_VIDEO_TRACK_ADDED";
+ const PARTICIPANT_VIDEO_TRACK_REMOVED = "PARTICIPANT_VIDEO_TRACK_REMOVED";
+ const PARTICIPANT_AUDIO_TRACK_ADDED = "PARTICIPANT_AUDIO_TRACK_ADDED";
+ const PARTICIPANT_AUDIO_TRACK_REMOVED = "PARTICIPANT_AUDIO_TRACK_REMOVED";

  /******************************************************************************
  Copyright (c) Microsoft Corporation.
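These four constants name the new per-track lifecycle events. Later in this diff, `Assistant` emits them with a `{ participantId, stream, track }` payload whenever a remote participant's media tracks appear or disappear. A minimal consumer sketch (the handler bodies are illustrative, and construction of the `assistant` instance is outside this hunk):

```ts
import {
    Assistant,
    PARTICIPANT_VIDEO_TRACK_ADDED,
    PARTICIPANT_AUDIO_TRACK_REMOVED,
} from "@whereby.com/assistant-sdk";

declare const assistant: Assistant; // constructed elsewhere

// Payload shape taken from handleRemoteParticipantsTracksChange below.
assistant.on(PARTICIPANT_VIDEO_TRACK_ADDED, ({ participantId, track }) => {
    console.log(`participant ${participantId} added video track ${track.id}`);
});

assistant.on(PARTICIPANT_AUDIO_TRACK_REMOVED, ({ participantId, track }) => {
    console.log(`participant ${participantId} removed audio track ${track.id}`);
});
```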
@@ -78,301 +82,362 @@ const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
  const BYTES_PER_SAMPLE = 2;
  // 480 samples per 10ms frame at 48kHz
  const FRAME_10MS_SAMPLES = 480;
- const slotBuffers = new Map();
- function appendAndDrainTo480(slot, newSamples) {
- var _a;
- const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
- const merged = new Int16Array(prev.length + newSamples.length);
- merged.set(prev, 0);
- merged.set(newSamples, prev.length);
- let offset = 0;
- while (merged.length - offset >= FRAME_10MS_SAMPLES) {
- const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
- enqueueFrame(slot, chunk); // always 480
- offset += FRAME_10MS_SAMPLES;
- }
- slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
- }
- ({
- enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
- enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
- wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
- wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
- lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
- });
- let slots = [];
- let stopPacerFn = null;
- let outputPacerState = null;
- /**
- * Simple linear interpolation resampler to convert audio to 48kHz.
- * This handles the common case of 16kHz -> 48kHz (3x upsampling).
- */
- function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
- const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
- const outputLength = Math.floor(inputFrames * ratio);
- const output = new Int16Array(outputLength);
- for (let i = 0; i < outputLength; i++) {
- const inputIndex = i / ratio;
- const index = Math.floor(inputIndex);
- const fraction = inputIndex - index;
- if (index + 1 < inputSamples.length) {
- const sample1 = inputSamples[index];
- const sample2 = inputSamples[index + 1];
- output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
+ function createFfmpegMixer() {
+ const slotBuffers = new Map();
+ function appendAndDrainTo480(slot, newSamples) {
+ var _a;
+ const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
+ const merged = new Int16Array(prev.length + newSamples.length);
+ merged.set(prev, 0);
+ merged.set(newSamples, prev.length);
+ let offset = 0;
+ while (merged.length - offset >= FRAME_10MS_SAMPLES) {
+ const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
+ enqueueFrame(slot, chunk); // always 480
+ offset += FRAME_10MS_SAMPLES;
  }
- else {
- output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
+ }
+ ({
+ enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
+ enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
+ wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
+ wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
+ lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
+ });
+ let slots = [];
+ let stopPacerFn = null;
+ let outputPacerState = null;
+ /**
+ * Simple linear interpolation resampler to convert audio to 48kHz.
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
+ */
+ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
+ const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
+ const outputLength = Math.floor(inputFrames * ratio);
+ const output = new Int16Array(outputLength);
+ for (let i = 0; i < outputLength; i++) {
+ const inputIndex = i / ratio;
+ const index = Math.floor(inputIndex);
+ const fraction = inputIndex - index;
+ if (index + 1 < inputSamples.length) {
+ const sample1 = inputSamples[index];
+ const sample2 = inputSamples[index + 1];
+ output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
+ }
+ else {
+ output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
+ }
+ }
+ return output;
+ }
+ /**
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
+ */
+ function enqueueOutputFrame(samples) {
+ if (outputPacerState) {
+ outputPacerState.frameQueue.push(samples);
  }
  }
- return output;
- }
- /**
- * Enqueue an audio frame for paced delivery to the RTCAudioSource.
- */
- function enqueueOutputFrame(samples) {
- if (outputPacerState) {
- outputPacerState.frameQueue.push(samples);
- }
- }
- /**
- * Start the audio pacer loop for all input slots in an FFmpeg process.
- *
- * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
- * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
- * arrive jittery, bursty, or with slightly different clocks.
- *
- * Key behavior:
- * - Writes exactly one frame per period, on a shared wall-clock grid.
- * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
- * never stalls.
- * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
- * - Honors Node stream backpressure (`write()` return false) without breaking
- * the timing grid.
- *
- * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
- * can mix them without slow-downs or drift.
- *
- * Call this once right after spawning FFmpeg:
- * ```ts
- * const ff = spawnFFmpegProcess();
- * startPacer(ff, PARTICIPANT_SLOTS);
- * ```
- *
- * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
- *
- * @param ff Child process handle from spawn("ffmpeg", ...)
- * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
- */
- function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
- if (stopPacerFn) {
- stopPacerFn();
- stopPacerFn = null;
- }
- const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
- const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
- const t0 = nowMs();
- slots = Array.from({ length: slotCount }, () => ({
- q: [],
- lastFrames: FRAME_10MS_SAMPLES, // keep constant
- nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
- }));
- outputPacerState = {
- frameQueue: [],
- nextDueMs: t0 + outputFrameMs,
- rtcAudioSource,
- onAudioStreamReady,
- didEmitReadyEvent: false,
- };
- const iv = setInterval(() => {
- const t = nowMs();
- for (let s = 0; s < slotCount; s++) {
- const st = slots[s];
- const w = writers[s];
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
- if (t >= st.nextDueMs) {
- const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
- if (!w.write(buf)) {
- // Just continue without adding drain listener - backpressure will naturally resolve
+ /**
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
+ *
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
+ * arrive jittery, bursty, or with slightly different clocks.
+ *
+ * Key behavior:
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
+ * never stalls.
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
+ * - Honors Node stream backpressure (`write()` return false) without breaking
+ * the timing grid.
+ *
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
+ * can mix them without slow-downs or drift.
+ *
+ * Call this once right after spawning FFmpeg:
+ * ```ts
+ * const ff = spawnFFmpegProcess();
+ * startPacer(ff, PARTICIPANT_SLOTS);
+ * ```
+ *
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
+ *
+ * @param ff Child process handle from spawn("ffmpeg", ...)
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
+ */
+ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
+ if (stopPacerFn) {
+ stopPacerFn();
+ stopPacerFn = null;
+ }
+ const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
+ const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
+ const t0 = nowMs();
+ slots = Array.from({ length: slotCount }, () => ({
+ q: [],
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
+ nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
+ }));
+ outputPacerState = {
+ frameQueue: [],
+ nextDueMs: t0 + outputFrameMs,
+ rtcAudioSource,
+ onAudioStreamReady,
+ didEmitReadyEvent: false,
+ };
+ const iv = setInterval(() => {
+ const t = nowMs();
+ for (let s = 0; s < slotCount; s++) {
+ const st = slots[s];
+ const w = writers[s];
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
+ if (t >= st.nextDueMs) {
+ const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
+ if (!w.write(buf)) {
+ // Just continue without adding drain listener - backpressure will naturally resolve
+ const late = t - st.nextDueMs;
+ const steps = Math.max(1, Math.ceil(late / frameMs));
+ st.nextDueMs += steps * frameMs;
+ continue;
+ }
  const late = t - st.nextDueMs;
  const steps = Math.max(1, Math.ceil(late / frameMs));
  st.nextDueMs += steps * frameMs;
- continue;
  }
- const late = t - st.nextDueMs;
- const steps = Math.max(1, Math.ceil(late / frameMs));
- st.nextDueMs += steps * frameMs;
  }
- }
- if (!outputPacerState)
- return;
- // Handle output pacer for RTCAudioSource
- const state = outputPacerState;
- if (t >= state.nextDueMs) {
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
- if (!state.didEmitReadyEvent) {
- state.onAudioStreamReady();
- state.didEmitReadyEvent = true;
+ if (!outputPacerState)
+ return;
+ // Handle output pacer for RTCAudioSource
+ const state = outputPacerState;
+ if (t >= state.nextDueMs) {
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
+ if (!state.didEmitReadyEvent) {
+ state.onAudioStreamReady();
+ state.didEmitReadyEvent = true;
+ }
+ state.rtcAudioSource.onData({
+ samples: samples,
+ sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
+ });
+ const late = t - state.nextDueMs;
+ const steps = Math.max(1, Math.ceil(late / outputFrameMs));
+ state.nextDueMs += steps * outputFrameMs;
  }
- state.rtcAudioSource.onData({
- samples: samples,
- sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
- });
- const late = t - state.nextDueMs;
- const steps = Math.max(1, Math.ceil(late / outputFrameMs));
- state.nextDueMs += steps * outputFrameMs;
+ }, 5);
+ stopPacerFn = () => clearInterval(iv);
+ }
+ /**
+ * Stop the audio pacer loop and clear all input slots.
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
+ */
+ function stopPacer() {
+ if (stopPacerFn)
+ stopPacerFn();
+ stopPacerFn = null;
+ slots = [];
+ slotBuffers.clear();
+ outputPacerState = null;
+ }
+ /**
+ * Queue a live frame for a given slot (0..N-1).
+ * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
+ */
+ function enqueueFrame(slot, samples, numberOfFrames) {
+ const st = slots[slot];
+ if (!st)
+ return;
+ const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
+ st.q.push(buf);
+ }
+ /**
+ * Clear the audio queue for a specific slot when a participant leaves.
+ * This prevents stale audio data from continuing to play after disconnect.
+ */
+ function clearSlotQueue(slot) {
+ const st = slots[slot];
+ if (st) {
+ st.q = [];
+ slotBuffers.delete(slot);
+ const now = Number(process.hrtime.bigint()) / 1e6;
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
+ st.nextDueMs = now + frameMs;
  }
- }, 5);
- stopPacerFn = () => clearInterval(iv);
- }
- /**
- * Stop the audio pacer loop and clear all input slots.
- * Call this before killing the FFmpeg process to ensure clean shutdown.
- */
- function stopPacer() {
- if (stopPacerFn)
- stopPacerFn();
- stopPacerFn = null;
- slots = [];
- }
- /**
- * Queue a live frame for a given slot (0..N-1).
- * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
- */
- function enqueueFrame(slot, samples, numberOfFrames) {
- const st = slots[slot];
- if (!st)
- return;
- const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
- st.q.push(buf);
- }
- /**
- * Clear the audio queue for a specific slot when a participant leaves.
- * This prevents stale audio data from continuing to play after disconnect.
- */
- function clearSlotQueue(slot) {
- const st = slots[slot];
- if (st) {
- st.q = [];
- const now = Number(process.hrtime.bigint()) / 1e6;
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
- st.nextDueMs = now + frameMs;
  }
- }
- /**
- * Get the FFmpeg arguments for mixing audio from multiple participants.
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
- */
- function getFFmpegArguments() {
- const N = PARTICIPANT_SLOTS;
- const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
- const ffArgs = [];
- for (let i = 0; i < N; i++) {
- ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
- }
- const pre = [];
- for (let i = 0; i < N; i++) {
- pre.push(`[${i}:a]aresample=async=1:first_pts=0,asetpts=N/SR/TB[a${i}]`);
- }
- const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
- const amix = `${labels}amix=inputs=${N}:duration=longest:dropout_transition=250:normalize=0[mix]`;
- const filter = `${pre.join(";")};${amix}`;
- ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
- return ffArgs;
- }
- /**
- * Spawn a new FFmpeg process for mixing audio from multiple participants.
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
- * The process will log its output to stderr.
- * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
- * @return The spawned FFmpeg process.
- */
- function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
- const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
- const args = getFFmpegArguments();
- const ffmpegProcess = spawn("ffmpeg", args, { stdio });
- startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
- ffmpegProcess.stderr.setEncoding("utf8");
- ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
- ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
- let audioBuffer = Buffer.alloc(0);
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
- ffmpegProcess.stdout.on("data", (chunk) => {
- audioBuffer = Buffer.concat([audioBuffer, chunk]);
- while (audioBuffer.length >= FRAME_SIZE_BYTES) {
- const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
- const samples = new Int16Array(FRAME_10MS_SAMPLES);
- for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
- samples[i] = frameData.readInt16LE(i * 2);
- }
- enqueueOutputFrame(samples);
- audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
+ /**
+ * Get the FFmpeg arguments for debugging, which writes each participant's audio to a separate WAV file
+ * and also mixes them into a single WAV file.
+ * This is useful for inspecting the audio quality and timing of each participant.
+ */
+ function getFFmpegArgumentsDebug() {
+ const N = PARTICIPANT_SLOTS;
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
+ const ffArgs = [];
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
  }
- });
- return ffmpegProcess;
- }
- /**
- * Write audio data from a MediaStreamTrack to the FFmpeg process.
- * This function creates an AudioSink for the track and sets up a data handler
- * that enqueues audio frames into the pacer.
- *
- * @param ffmpegProcess The FFmpeg process to which audio data will be written.
- * @param slot The participant slot number (0..N-1) to which this track belongs.
- * @param audioTrack The MediaStreamTrack containing the audio data.
- * @return An object containing the AudioSink, the writable stream, and a stop function.
- */
- function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
- const writer = ffmpegProcess.stdio[3 + slot];
- const sink = new AudioSink(audioTrack);
- const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
- if (ch !== 1 || bitsPerSample !== 16)
- return;
- let out = samples;
- if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
- const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
- out = resampled;
- }
- appendAndDrainTo480(slot, out);
- });
- const stop = () => {
- try {
- unsubscribe();
- sink.stop();
+ const pre = [];
+ for (let i = 0; i < N; i++) {
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS,asplit=2[a${i}tap][a${i}mix]`);
  }
- catch (_a) {
- console.error("Failed to stop AudioSink");
+ const mixInputs = Array.from({ length: N }, (_, i) => `[a${i}mix]`).join("");
+ const filter = `${pre.join(";")};${mixInputs}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "info", "-y", "-filter_complex", filter);
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-map", `[a${i}tap]`, "-f", "wav", "-c:a", "pcm_s16le", `pre${i}.wav`);
  }
- };
- return { sink, writer, stop };
- }
- /**
- * Stop the FFmpeg process and clean up all resources.
- * This function will unpipe the stdout, end all writable streams for each participant slot,
- * and kill the FFmpeg process.
- * @param ffmpegProcess The FFmpeg process to stop.
- */
- function stopFFmpegProcess(ffmpegProcess) {
- stopPacer();
- if (ffmpegProcess && !ffmpegProcess.killed) {
- try {
- ffmpegProcess.stdout.unpipe();
+ ffArgs.push("-map", "[mix]", "-f", "wav", "-c:a", "pcm_s16le", "mixed.wav");
+ return ffArgs;
+ }
+ /**
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
+ */
+ function getFFmpegArguments() {
+ const N = PARTICIPANT_SLOTS;
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
+ const ffArgs = [];
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
  }
- catch (_a) {
- console.error("Failed to unpipe ffmpeg stdout");
+ const pre = [];
+ for (let i = 0; i < N; i++) {
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a${i}]`);
  }
- for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
- const w = ffmpegProcess.stdio[3 + i];
+ const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
+ const amix = `${labels}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
+ const filter = `${pre.join(";")};${amix}`;
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
+ return ffArgs;
+ }
+ /*
+ * Spawn a new FFmpeg process for debugging purposes.
+ * This will write each participant's audio to a separate WAV file and also mix them into a single WAV file.
+ * The output files will be named pre0.wav, pre1.wav, ..., and mixed.wav.
+ * The process will log its output to stderr.
+ * @return The spawned FFmpeg process.
+ */
+ function spawnFFmpegProcessDebug(rtcAudioSource, onAudioStreamReady) {
+ const stdio = ["ignore", "ignore", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
+ const args = getFFmpegArgumentsDebug();
+ const ffmpegProcess = spawn("ffmpeg", args, { stdio });
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
+ ffmpegProcess.stderr.setEncoding("utf8");
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error (debug): is ffmpeg installed?"));
+ return ffmpegProcess;
+ }
+ /**
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
+ * The process will log its output to stderr.
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
+ * @return The spawned FFmpeg process.
+ */
+ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
+ const stdio = ["pipe", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
+ const args = getFFmpegArguments();
+ const ffmpegProcess = spawn("ffmpeg", args, { stdio });
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
+ ffmpegProcess.stderr.setEncoding("utf8");
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
+ let audioBuffer = Buffer.alloc(0);
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
+ ffmpegProcess.stdout.on("data", (chunk) => {
+ audioBuffer = Buffer.concat([audioBuffer, chunk]);
+ while (audioBuffer.length >= FRAME_SIZE_BYTES) {
+ const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
+ const samples = new Int16Array(FRAME_10MS_SAMPLES);
+ for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
+ samples[i] = frameData.readInt16LE(i * 2);
+ }
+ enqueueOutputFrame(samples);
+ audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
+ }
+ });
+ return ffmpegProcess;
+ }
+ /**
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
+ * This function creates an AudioSink for the track and sets up a data handler
+ * that enqueues audio frames into the pacer.
+ *
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
+ * @param audioTrack The MediaStreamTrack containing the audio data.
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
+ */
+ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
+ const writer = ffmpegProcess.stdio[3 + slot];
+ const sink = new AudioSink(audioTrack);
+ const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
+ if (ch !== 1 || bitsPerSample !== 16)
+ return;
+ let out = samples;
+ if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
+ const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
+ out = resampled;
+ }
+ appendAndDrainTo480(slot, out);
+ });
+ const stop = () => {
+ try {
+ unsubscribe();
+ sink.stop();
+ }
+ catch (_a) {
+ console.error("Failed to stop AudioSink");
+ }
+ };
+ return { sink, writer, stop };
+ }
+ /**
+ * Stop the FFmpeg process and clean up all resources.
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
+ * and kill the FFmpeg process.
+ * @param ffmpegProcess The FFmpeg process to stop.
+ */
+ function stopFFmpegProcess(ffmpegProcess) {
+ var _a, _b;
+ stopPacer();
+ if (ffmpegProcess && !ffmpegProcess.killed) {
+ try {
+ ffmpegProcess.stdout.unpipe();
+ }
+ catch (_c) {
+ console.error("Failed to unpipe ffmpeg stdout");
+ }
+ for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
+ const w = ffmpegProcess.stdio[3 + i];
+ try {
+ w.end();
+ }
+ catch (_d) {
+ console.error("Failed to end ffmpeg writable stream");
+ }
+ }
  try {
- w.end();
+ (_a = ffmpegProcess.stdin) === null || _a === void 0 ? void 0 : _a.write("q\n");
+ (_b = ffmpegProcess.stdin) === null || _b === void 0 ? void 0 : _b.end();
  }
- catch (_b) {
- console.error("Failed to end ffmpeg writable stream");
+ catch (_e) {
+ console.error("Failed to end ffmpeg stdin");
  }
  }
- ffmpegProcess.kill("SIGTERM");
  }
+ return {
+ spawnFFmpegProcess,
+ spawnFFmpegProcessDebug,
+ writeAudioDataToFFmpeg,
+ stopFFmpegProcess,
+ clearSlotQueue,
+ };
  }

  class AudioMixer extends EventEmitter {
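Most of the hunk above is a refactor rather than new logic: the module-level mixer state (`slotBuffers`, `slots`, `stopPacerFn`, `outputPacerState`) and the functions closing over it move into a `createFfmpegMixer()` factory, so each `AudioMixer` owns isolated pacer state instead of sharing module globals; `stopPacer()` now also clears `slotBuffers`, and `clearSlotQueue()` drops a departing participant's buffered remainder. The catch-up arithmetic is unchanged: when a tick fires `late` ms past a deadline, `Math.max(1, Math.ceil(late / frameMs))` advances the deadline whole periods, so for example a tick 23 ms late on a 10 ms grid skips ahead 3 periods and stays on the wall-clock grid. A minimal sketch of the closure-factory pattern (simplified names, not the package's full internals):

```ts
// Sketch: state that used to sit at module scope is captured per factory call,
// so two mixers can no longer clobber each other's buffers or pacer handle.
function createMixer() {
    const slotBuffers = new Map<number, Int16Array>(); // per-mixer remainders
    let stopPacerFn: (() => void) | null = null;

    function startPacer(onTick: () => void) {
        const iv = setInterval(onTick, 5);
        stopPacerFn = () => clearInterval(iv);
    }

    function stopPacer() {
        stopPacerFn?.();
        stopPacerFn = null;
        slotBuffers.clear(); // mirrors the new cleanup in this diff
    }

    return { startPacer, stopPacer };
}

const mixerA = createMixer();
const mixerB = createMixer(); // independent state; stopping A cannot stall B
```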
@@ -383,6 +448,7 @@ class AudioMixer extends EventEmitter {
  this.rtcAudioSource = null;
  this.participantSlots = new Map();
  this.activeSlots = {};
+ this.mixer = createFfmpegMixer();
  this.setupMediaStream();
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
  this.onStreamReady = onStreamReady;
@@ -401,7 +467,7 @@ class AudioMixer extends EventEmitter {
  return;
  }
  if (!this.ffmpegProcess && this.rtcAudioSource) {
- this.ffmpegProcess = spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
+ this.ffmpegProcess = this.mixer.spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
  }
  for (const p of participants)
  this.attachParticipantIfNeeded(p);
@@ -414,7 +480,7 @@ class AudioMixer extends EventEmitter {
  }
  stopAudioMixer() {
  if (this.ffmpegProcess) {
- stopFFmpegProcess(this.ffmpegProcess);
+ this.mixer.stopFFmpegProcess(this.ffmpegProcess);
  this.ffmpegProcess = null;
  }
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
@@ -467,7 +533,7 @@ class AudioMixer extends EventEmitter {
  }
  this.activeSlots[slot] = undefined;
  }
- const { sink, writer, stop } = writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
+ const { sink, writer, stop } = this.mixer.writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
  this.activeSlots[slot] = { sink, writer, stop, trackId: audioTrack.id };
  (_a = audioTrack.addEventListener) === null || _a === void 0 ? void 0 : _a.call(audioTrack, "ended", () => this.detachParticipant(participantId));
  }
@@ -486,7 +552,7 @@ class AudioMixer extends EventEmitter {
  this.activeSlots[slot] = undefined;
  }
  // Clear any queued audio data for this slot to prevent stale audio
- clearSlotQueue(slot);
+ this.mixer.clearSlotQueue(slot);
  this.participantSlots.set(slot, "");
  }
  }
@@ -497,6 +563,7 @@ class Assistant extends EventEmitter$1 {
  this.mediaStream = null;
  this.audioSource = null;
  this.combinedStream = null;
+ this.remoteMediaTracks = {};
  this.roomUrl = null;
  this.handleConnectionStatusChange = (status) => {
  if (status === "connected") {
@@ -506,6 +573,41 @@ class Assistant extends EventEmitter$1 {
  this.emit(ASSISTANT_LEFT_ROOM, { roomUrl: this.roomUrl || "" });
  }
  };
+ this.handleRemoteParticipantsTracksChange = (remoteParticipants) => {
+ const currentRemoteMediaTracks = remoteParticipants.flatMap(({ id: participantId, stream }) => {
+ if (!stream) {
+ return [];
+ }
+ const tracks = stream.getTracks();
+ tracks.forEach((track) => {
+ if (!this.remoteMediaTracks[track.id]) {
+ const eventName = track.kind === "video" ? PARTICIPANT_VIDEO_TRACK_ADDED : PARTICIPANT_AUDIO_TRACK_ADDED;
+ this.emit(eventName, {
+ participantId,
+ stream,
+ track,
+ });
+ this.remoteMediaTracks[track.id] = {
+ participantId,
+ stream,
+ track,
+ };
+ }
+ });
+ return tracks;
+ });
+ Object.values(this.remoteMediaTracks).forEach(({ participantId, stream, track }) => {
+ if (!currentRemoteMediaTracks.includes(track)) {
+ const eventName = track.kind === "video" ? PARTICIPANT_VIDEO_TRACK_REMOVED : PARTICIPANT_AUDIO_TRACK_REMOVED;
+ this.emit(eventName, {
+ participantId,
+ stream,
+ track,
+ });
+ delete this.remoteMediaTracks[track.id];
+ }
+ });
+ };
  this.assistantKey = assistantKey;
  this.client = new WherebyClient();
  this.roomConnection = this.client.getRoomConnection();
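`handleRemoteParticipantsTracksChange` diffs the flattened track list against the `remoteMediaTracks` cache keyed by `track.id`: a track not yet in the cache fires an ADDED event and is recorded, and a cached track absent from the current list fires a REMOVED event and is evicted. Removal detection uses `currentRemoteMediaTracks.includes(track)`, a linear scan per cached track; at typical room sizes this is negligible, but a `Set` of track ids would make each lookup constant-time. A hedged, self-contained variant of just the removal pass, not part of the package:

```ts
// Hypothetical O(1)-membership variant of the removal pass (not in the package).
type TrackEntry = { participantId: string; stream: MediaStream; track: MediaStreamTrack };

function findRemovedTracks(
    cache: Record<string, TrackEntry>,
    currentTracks: MediaStreamTrack[],
): TrackEntry[] {
    const currentIds = new Set(currentTracks.map((t) => t.id));
    // One Set lookup per cached entry instead of an Array.includes scan.
    return Object.values(cache).filter(({ track }) => !currentIds.has(track.id));
}
```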
@@ -530,8 +632,9 @@ class Assistant extends EventEmitter$1 {
  const audioMixer = new AudioMixer(handleStreamReady);
  this.combinedStream = audioMixer.getCombinedAudioStream();
  this.roomConnection.subscribeToRemoteParticipants(audioMixer.handleRemoteParticipants.bind(audioMixer));
- this.roomConnection.subscribeToConnectionStatus(this.handleConnectionStatusChange);
  }
+ this.roomConnection.subscribeToConnectionStatus(this.handleConnectionStatusChange);
+ this.roomConnection.subscribeToRemoteParticipants(this.handleRemoteParticipantsTracksChange);
  }
  joinRoom(roomUrl) {
  return __awaiter(this, void 0, void 0, function* () {
@@ -648,7 +751,7 @@ const webhookRouter = (webhookTriggers, emitter) => {
  res.status(200);
  res.end();
  });
- router.post("/", jsonParser, (req, res) => {
+ router.post("/", jsonParser, (req, res) => __awaiter(void 0, void 0, void 0, function* () {
  var _a, _b, _c, _d, _e, _f, _g, _h;
  assert(req.body, "message body is required");
  assert("type" in req.body, "webhook type is required");
@@ -656,19 +759,19 @@ const webhookRouter = (webhookTriggers, emitter) => {
  switch (req.body.type) {
  case "room.client.joined":
  shouldTriggerOnReceivedWebhook =
- (_b = (_a = webhookTriggers["room.client.joined"]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body)) !== null && _b !== void 0 ? _b : false;
+ (_b = (yield ((_a = webhookTriggers["room.client.joined"]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body)))) !== null && _b !== void 0 ? _b : false;
  break;
  case "room.client.left":
  shouldTriggerOnReceivedWebhook =
- (_d = (_c = webhookTriggers["room.client.left"]) === null || _c === void 0 ? void 0 : _c.call(webhookTriggers, req.body)) !== null && _d !== void 0 ? _d : false;
+ (_d = (yield ((_c = webhookTriggers["room.client.left"]) === null || _c === void 0 ? void 0 : _c.call(webhookTriggers, req.body)))) !== null && _d !== void 0 ? _d : false;
  break;
  case "room.session.started":
  shouldTriggerOnReceivedWebhook =
- (_f = (_e = webhookTriggers["room.session.started"]) === null || _e === void 0 ? void 0 : _e.call(webhookTriggers, req.body)) !== null && _f !== void 0 ? _f : false;
+ (_f = (yield ((_e = webhookTriggers["room.session.started"]) === null || _e === void 0 ? void 0 : _e.call(webhookTriggers, req.body)))) !== null && _f !== void 0 ? _f : false;
  break;
  case "room.session.ended":
  shouldTriggerOnReceivedWebhook =
- (_h = (_g = webhookTriggers["room.session.ended"]) === null || _g === void 0 ? void 0 : _g.call(webhookTriggers, req.body)) !== null && _h !== void 0 ? _h : false;
+ (_h = (yield ((_g = webhookTriggers["room.session.ended"]) === null || _g === void 0 ? void 0 : _g.call(webhookTriggers, req.body)))) !== null && _h !== void 0 ? _h : false;
  break;
  }
  if (shouldTriggerOnReceivedWebhook) {
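The `yield` wrappers in this hunk (together with the `__awaiter` wrapper on the route handler in the previous hunk) mean each webhook trigger callback may now be asynchronous: the router awaits its result before deciding whether to emit, and a missing callback or `undefined` result still falls back to `false`. A sketch of a triggers map under that reading; `isAllowedRoom` is a placeholder, and the exact event type is not shown in this diff:

```ts
// Hypothetical triggers map: callbacks may resolve asynchronously to a boolean.
declare function isAllowedRoom(event: unknown): Promise<boolean>; // placeholder

const webhookTriggers = {
    // Async callback: the router now awaits this before emitting.
    "room.client.joined": async (event: unknown) => isAllowedRoom(event),
    // Synchronous callbacks keep working; awaiting a plain value is a no-op.
    "room.session.ended": (event: unknown) => true,
};
```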
@@ -677,7 +780,7 @@ const webhookRouter = (webhookTriggers, emitter) => {
  }
  res.status(200);
  res.end();
- });
+ }));
  return router;
  };
  class Trigger extends EventEmitter {
@@ -690,13 +793,10 @@ class Trigger extends EventEmitter {
  const app = express();
  const router = webhookRouter(this.webhookTriggers, this);
  app.use(router);
- const server = app.listen(this.port, () => {
+ app.listen(this.port, () => {
  // console.log(`Bot trigger server now running on port[${this.port}]`);
  });
- process.on("SIGTERM", () => {
- server.close();
- });
  }
  }

- export { ASSISTANT_JOINED_ROOM, ASSISTANT_LEFT_ROOM, AUDIO_STREAM_READY, Assistant, AudioSink, AudioSource, TRIGGER_EVENT_SUCCESS, Trigger };
+ export { ASSISTANT_JOINED_ROOM, ASSISTANT_LEFT_ROOM, AUDIO_STREAM_READY, Assistant, AudioSink, AudioSource, PARTICIPANT_AUDIO_TRACK_ADDED, PARTICIPANT_AUDIO_TRACK_REMOVED, PARTICIPANT_VIDEO_TRACK_ADDED, PARTICIPANT_VIDEO_TRACK_REMOVED, TRIGGER_EVENT_SUCCESS, Trigger };
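The final hunk removes the SDK's built-in SIGTERM handler, and the `Trigger` server startup no longer keeps the `server` handle, so graceful shutdown of HTTP servers is now the application's concern. A generic sketch of handling it at the application level (this assumes you run your own Express server; the diff does not show the SDK exposing its internal one):

```ts
import express from "express";

const app = express();
// Mount your own routes/webhook handling here.
const server = app.listen(8080);

process.on("SIGTERM", () => {
    // Stop accepting new connections, then exit once in-flight requests finish.
    server.close(() => process.exit(0));
});
```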