@whereby.com/assistant-sdk 0.0.0-canary-20250916072551 → 0.0.0-canary-20250917154617

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,8 @@ import * as dotenv from 'dotenv';
12
12
  const TRIGGER_EVENT_SUCCESS = "trigger_event_success";
13
13
 
14
14
  const AUDIO_STREAM_READY = "AUDIO_STREAM_READY";
15
+ const ASSISTANT_JOINED_ROOM = "ASSISTANT_JOINED_ROOM";
16
+ const ASSISTANT_LEFT_ROOM = "ASSISTANT_LEFT_ROOM";
15
17
 
16
18
  /******************************************************************************
17
19
  Copyright (c) Microsoft Corporation.
@@ -76,301 +78,362 @@ const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
76
78
  const BYTES_PER_SAMPLE = 2;
77
79
  // 480 samples per 10ms frame at 48kHz
78
80
  const FRAME_10MS_SAMPLES = 480;
79
- const slotBuffers = new Map();
80
- function appendAndDrainTo480(slot, newSamples) {
81
- var _a;
82
- const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
83
- const merged = new Int16Array(prev.length + newSamples.length);
84
- merged.set(prev, 0);
85
- merged.set(newSamples, prev.length);
86
- let offset = 0;
87
- while (merged.length - offset >= FRAME_10MS_SAMPLES) {
88
- const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
89
- enqueueFrame(slot, chunk); // always 480
90
- offset += FRAME_10MS_SAMPLES;
91
- }
92
- slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
93
- }
94
- ({
95
- enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
96
- enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
97
- wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
98
- wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
99
- lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
100
- });
101
- let slots = [];
102
- let stopPacerFn = null;
103
- let outputPacerState = null;
104
- /**
105
- * Simple linear interpolation resampler to convert audio to 48kHz.
106
- * This handles the common case of 16kHz -> 48kHz (3x upsampling).
107
- */
108
- function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
109
- const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
110
- const outputLength = Math.floor(inputFrames * ratio);
111
- const output = new Int16Array(outputLength);
112
- for (let i = 0; i < outputLength; i++) {
113
- const inputIndex = i / ratio;
114
- const index = Math.floor(inputIndex);
115
- const fraction = inputIndex - index;
116
- if (index + 1 < inputSamples.length) {
117
- const sample1 = inputSamples[index];
118
- const sample2 = inputSamples[index + 1];
119
- output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
81
+ function createFfmpegMixer() {
82
+ const slotBuffers = new Map();
83
+ function appendAndDrainTo480(slot, newSamples) {
84
+ var _a;
85
+ const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
86
+ const merged = new Int16Array(prev.length + newSamples.length);
87
+ merged.set(prev, 0);
88
+ merged.set(newSamples, prev.length);
89
+ let offset = 0;
90
+ while (merged.length - offset >= FRAME_10MS_SAMPLES) {
91
+ const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
92
+ enqueueFrame(slot, chunk); // always 480
93
+ offset += FRAME_10MS_SAMPLES;
120
94
  }
121
- else {
122
- output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
95
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
96
+ }
97
+ ({
98
+ enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
99
+ enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
100
+ wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
101
+ wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
102
+ lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
103
+ });
104
+ let slots = [];
105
+ let stopPacerFn = null;
106
+ let outputPacerState = null;
107
+ /**
108
+ * Simple linear interpolation resampler to convert audio to 48kHz.
109
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
110
+ */
111
+ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
112
+ const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
113
+ const outputLength = Math.floor(inputFrames * ratio);
114
+ const output = new Int16Array(outputLength);
115
+ for (let i = 0; i < outputLength; i++) {
116
+ const inputIndex = i / ratio;
117
+ const index = Math.floor(inputIndex);
118
+ const fraction = inputIndex - index;
119
+ if (index + 1 < inputSamples.length) {
120
+ const sample1 = inputSamples[index];
121
+ const sample2 = inputSamples[index + 1];
122
+ output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
123
+ }
124
+ else {
125
+ output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
126
+ }
127
+ }
128
+ return output;
129
+ }
130
+ /**
131
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
132
+ */
133
+ function enqueueOutputFrame(samples) {
134
+ if (outputPacerState) {
135
+ outputPacerState.frameQueue.push(samples);
123
136
  }
124
137
  }
125
- return output;
126
- }
127
- /**
128
- * Enqueue an audio frame for paced delivery to the RTCAudioSource.
129
- */
130
- function enqueueOutputFrame(samples) {
131
- if (outputPacerState) {
132
- outputPacerState.frameQueue.push(samples);
133
- }
134
- }
135
- /**
136
- * Start the audio pacer loop for all input slots in an FFmpeg process.
137
- *
138
- * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
139
- * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
140
- * arrive jittery, bursty, or with slightly different clocks.
141
- *
142
- * Key behavior:
143
- * - Writes exactly one frame per period, on a shared wall-clock grid.
144
- * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
145
- * never stalls.
146
- * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
147
- * - Honors Node stream backpressure (`write()` return false) without breaking
148
- * the timing grid.
149
- *
150
- * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
151
- * can mix them without slow-downs or drift.
152
- *
153
- * Call this once right after spawning FFmpeg:
154
- * ```ts
155
- * const ff = spawnFFmpegProcess();
156
- * startPacer(ff, PARTICIPANT_SLOTS);
157
- * ```
158
- *
159
- * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
160
- *
161
- * @param ff Child process handle from spawn("ffmpeg", ...)
162
- * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
163
- */
164
- function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
165
- if (stopPacerFn) {
166
- stopPacerFn();
167
- stopPacerFn = null;
168
- }
169
- const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
170
- const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
171
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
172
- const t0 = nowMs();
173
- slots = Array.from({ length: slotCount }, () => ({
174
- q: [],
175
- lastFrames: FRAME_10MS_SAMPLES, // keep constant
176
- nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
177
- }));
178
- outputPacerState = {
179
- frameQueue: [],
180
- nextDueMs: t0 + outputFrameMs,
181
- rtcAudioSource,
182
- onAudioStreamReady,
183
- didEmitReadyEvent: false,
184
- };
185
- const iv = setInterval(() => {
186
- const t = nowMs();
187
- for (let s = 0; s < slotCount; s++) {
188
- const st = slots[s];
189
- const w = writers[s];
190
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
191
- if (t >= st.nextDueMs) {
192
- const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
193
- if (!w.write(buf)) {
194
- // Just continue without adding drain listener - backpressure will naturally resolve
138
+ /**
139
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
140
+ *
141
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
142
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
143
+ * arrive jittery, bursty, or with slightly different clocks.
144
+ *
145
+ * Key behavior:
146
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
147
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
148
+ * never stalls.
149
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
150
+ * - Honors Node stream backpressure (`write()` return false) without breaking
151
+ * the timing grid.
152
+ *
153
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
154
+ * can mix them without slow-downs or drift.
155
+ *
156
+ * Call this once right after spawning FFmpeg:
157
+ * ```ts
158
+ * const ff = spawnFFmpegProcess();
159
+ * startPacer(ff, PARTICIPANT_SLOTS);
160
+ * ```
161
+ *
162
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
163
+ *
164
+ * @param ff Child process handle from spawn("ffmpeg", ...)
165
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
166
+ */
167
+ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
168
+ if (stopPacerFn) {
169
+ stopPacerFn();
170
+ stopPacerFn = null;
171
+ }
172
+ const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
173
+ const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
174
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
175
+ const t0 = nowMs();
176
+ slots = Array.from({ length: slotCount }, () => ({
177
+ q: [],
178
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
179
+ nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
180
+ }));
181
+ outputPacerState = {
182
+ frameQueue: [],
183
+ nextDueMs: t0 + outputFrameMs,
184
+ rtcAudioSource,
185
+ onAudioStreamReady,
186
+ didEmitReadyEvent: false,
187
+ };
188
+ const iv = setInterval(() => {
189
+ const t = nowMs();
190
+ for (let s = 0; s < slotCount; s++) {
191
+ const st = slots[s];
192
+ const w = writers[s];
193
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
194
+ if (t >= st.nextDueMs) {
195
+ const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
196
+ if (!w.write(buf)) {
197
+ // Just continue without adding drain listener - backpressure will naturally resolve
198
+ const late = t - st.nextDueMs;
199
+ const steps = Math.max(1, Math.ceil(late / frameMs));
200
+ st.nextDueMs += steps * frameMs;
201
+ continue;
202
+ }
195
203
  const late = t - st.nextDueMs;
196
204
  const steps = Math.max(1, Math.ceil(late / frameMs));
197
205
  st.nextDueMs += steps * frameMs;
198
- continue;
199
206
  }
200
- const late = t - st.nextDueMs;
201
- const steps = Math.max(1, Math.ceil(late / frameMs));
202
- st.nextDueMs += steps * frameMs;
203
207
  }
204
- }
205
- if (!outputPacerState)
206
- return;
207
- // Handle output pacer for RTCAudioSource
208
- const state = outputPacerState;
209
- if (t >= state.nextDueMs) {
210
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
211
- if (!state.didEmitReadyEvent) {
212
- state.onAudioStreamReady();
213
- state.didEmitReadyEvent = true;
208
+ if (!outputPacerState)
209
+ return;
210
+ // Handle output pacer for RTCAudioSource
211
+ const state = outputPacerState;
212
+ if (t >= state.nextDueMs) {
213
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
214
+ if (!state.didEmitReadyEvent) {
215
+ state.onAudioStreamReady();
216
+ state.didEmitReadyEvent = true;
217
+ }
218
+ state.rtcAudioSource.onData({
219
+ samples: samples,
220
+ sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
221
+ });
222
+ const late = t - state.nextDueMs;
223
+ const steps = Math.max(1, Math.ceil(late / outputFrameMs));
224
+ state.nextDueMs += steps * outputFrameMs;
214
225
  }
215
- state.rtcAudioSource.onData({
216
- samples: samples,
217
- sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
218
- });
219
- const late = t - state.nextDueMs;
220
- const steps = Math.max(1, Math.ceil(late / outputFrameMs));
221
- state.nextDueMs += steps * outputFrameMs;
226
+ }, 5);
227
+ stopPacerFn = () => clearInterval(iv);
228
+ }
229
+ /**
230
+ * Stop the audio pacer loop and clear all input slots.
231
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
232
+ */
233
+ function stopPacer() {
234
+ if (stopPacerFn)
235
+ stopPacerFn();
236
+ stopPacerFn = null;
237
+ slots = [];
238
+ slotBuffers.clear();
239
+ outputPacerState = null;
240
+ }
241
+ /**
242
+ * Queue a live frame for a given slot (0..N-1).
243
+ * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
244
+ */
245
+ function enqueueFrame(slot, samples, numberOfFrames) {
246
+ const st = slots[slot];
247
+ if (!st)
248
+ return;
249
+ const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
250
+ st.q.push(buf);
251
+ }
252
+ /**
253
+ * Clear the audio queue for a specific slot when a participant leaves.
254
+ * This prevents stale audio data from continuing to play after disconnect.
255
+ */
256
+ function clearSlotQueue(slot) {
257
+ const st = slots[slot];
258
+ if (st) {
259
+ st.q = [];
260
+ slotBuffers.delete(slot);
261
+ const now = Number(process.hrtime.bigint()) / 1e6;
262
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
263
+ st.nextDueMs = now + frameMs;
222
264
  }
223
- }, 5);
224
- stopPacerFn = () => clearInterval(iv);
225
- }
226
- /**
227
- * Stop the audio pacer loop and clear all input slots.
228
- * Call this before killing the FFmpeg process to ensure clean shutdown.
229
- */
230
- function stopPacer() {
231
- if (stopPacerFn)
232
- stopPacerFn();
233
- stopPacerFn = null;
234
- slots = [];
235
- }
236
- /**
237
- * Queue a live frame for a given slot (0..N-1).
238
- * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
239
- */
240
- function enqueueFrame(slot, samples, numberOfFrames) {
241
- const st = slots[slot];
242
- if (!st)
243
- return;
244
- const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
245
- st.q.push(buf);
246
- }
247
- /**
248
- * Clear the audio queue for a specific slot when a participant leaves.
249
- * This prevents stale audio data from continuing to play after disconnect.
250
- */
251
- function clearSlotQueue(slot) {
252
- const st = slots[slot];
253
- if (st) {
254
- st.q = [];
255
- const now = Number(process.hrtime.bigint()) / 1e6;
256
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
257
- st.nextDueMs = now + frameMs;
258
265
  }
259
- }
260
- /**
261
- * Get the FFmpeg arguments for mixing audio from multiple participants.
262
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
263
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
264
- */
265
- function getFFmpegArguments() {
266
- const N = PARTICIPANT_SLOTS;
267
- const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
268
- const ffArgs = [];
269
- for (let i = 0; i < N; i++) {
270
- ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
271
- }
272
- const pre = [];
273
- for (let i = 0; i < N; i++) {
274
- pre.push(`[${i}:a]aresample=async=1:first_pts=0,asetpts=N/SR/TB[a${i}]`);
275
- }
276
- const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
277
- const amix = `${labels}amix=inputs=${N}:duration=longest:dropout_transition=250:normalize=0[mix]`;
278
- const filter = `${pre.join(";")};${amix}`;
279
- ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
280
- return ffArgs;
281
- }
282
- /**
283
- * Spawn a new FFmpeg process for mixing audio from multiple participants.
284
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
285
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
286
- * The process will log its output to stderr.
287
- * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
288
- * @return The spawned FFmpeg process.
289
- */
290
- function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
291
- const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
292
- const args = getFFmpegArguments();
293
- const ffmpegProcess = spawn("ffmpeg", args, { stdio });
294
- startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
295
- ffmpegProcess.stderr.setEncoding("utf8");
296
- ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
297
- ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
298
- let audioBuffer = Buffer.alloc(0);
299
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
300
- ffmpegProcess.stdout.on("data", (chunk) => {
301
- audioBuffer = Buffer.concat([audioBuffer, chunk]);
302
- while (audioBuffer.length >= FRAME_SIZE_BYTES) {
303
- const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
304
- const samples = new Int16Array(FRAME_10MS_SAMPLES);
305
- for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
306
- samples[i] = frameData.readInt16LE(i * 2);
307
- }
308
- enqueueOutputFrame(samples);
309
- audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
266
+ /**
267
+ * Get the FFmpeg arguments for debugging, which writes each participant's audio to a separate WAV file
268
+ * and also mixes them into a single WAV file.
269
+ * This is useful for inspecting the audio quality and timing of each participant.
270
+ */
271
+ function getFFmpegArgumentsDebug() {
272
+ const N = PARTICIPANT_SLOTS;
273
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
274
+ const ffArgs = [];
275
+ for (let i = 0; i < N; i++) {
276
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
310
277
  }
311
- });
312
- return ffmpegProcess;
313
- }
314
- /**
315
- * Write audio data from a MediaStreamTrack to the FFmpeg process.
316
- * This function creates an AudioSink for the track and sets up a data handler
317
- * that enqueues audio frames into the pacer.
318
- *
319
- * @param ffmpegProcess The FFmpeg process to which audio data will be written.
320
- * @param slot The participant slot number (0..N-1) to which this track belongs.
321
- * @param audioTrack The MediaStreamTrack containing the audio data.
322
- * @return An object containing the AudioSink, the writable stream, and a stop function.
323
- */
324
- function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
325
- const writer = ffmpegProcess.stdio[3 + slot];
326
- const sink = new AudioSink(audioTrack);
327
- const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
328
- if (ch !== 1 || bitsPerSample !== 16)
329
- return;
330
- let out = samples;
331
- if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
332
- const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
333
- out = resampled;
278
+ const pre = [];
279
+ for (let i = 0; i < N; i++) {
280
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS,asplit=2[a${i}tap][a${i}mix]`);
334
281
  }
335
- appendAndDrainTo480(slot, out);
336
- });
337
- const stop = () => {
338
- try {
339
- unsubscribe();
340
- sink.stop();
282
+ const mixInputs = Array.from({ length: N }, (_, i) => `[a${i}mix]`).join("");
283
+ const filter = `${pre.join(";")};${mixInputs}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
284
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "info", "-y", "-filter_complex", filter);
285
+ for (let i = 0; i < N; i++) {
286
+ ffArgs.push("-map", `[a${i}tap]`, "-f", "wav", "-c:a", "pcm_s16le", `pre${i}.wav`);
341
287
  }
342
- catch (_a) {
343
- console.error("Failed to stop AudioSink");
344
- }
345
- };
346
- return { sink, writer, stop };
347
- }
348
- /**
349
- * Stop the FFmpeg process and clean up all resources.
350
- * This function will unpipe the stdout, end all writable streams for each participant slot,
351
- * and kill the FFmpeg process.
352
- * @param ffmpegProcess The FFmpeg process to stop.
353
- */
354
- function stopFFmpegProcess(ffmpegProcess) {
355
- stopPacer();
356
- if (ffmpegProcess && !ffmpegProcess.killed) {
357
- try {
358
- ffmpegProcess.stdout.unpipe();
288
+ ffArgs.push("-map", "[mix]", "-f", "wav", "-c:a", "pcm_s16le", "mixed.wav");
289
+ return ffArgs;
290
+ }
291
+ /**
292
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
293
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
294
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
295
+ */
296
+ function getFFmpegArguments() {
297
+ const N = PARTICIPANT_SLOTS;
298
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
299
+ const ffArgs = [];
300
+ for (let i = 0; i < N; i++) {
301
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
359
302
  }
360
- catch (_a) {
361
- console.error("Failed to unpipe ffmpeg stdout");
303
+ const pre = [];
304
+ for (let i = 0; i < N; i++) {
305
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a${i}]`);
362
306
  }
363
- for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
364
- const w = ffmpegProcess.stdio[3 + i];
307
+ const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
308
+ const amix = `${labels}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
309
+ const filter = `${pre.join(";")};${amix}`;
310
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
311
+ return ffArgs;
312
+ }
313
+ /*
314
+ * Spawn a new FFmpeg process for debugging purposes.
315
+ * This will write each participant's audio to a separate WAV file and also mix them into a single WAV file.
316
+ * The output files will be named pre0.wav, pre1.wav, ..., and mixed.wav.
317
+ * The process will log its output to stderr.
318
+ * @return The spawned FFmpeg process.
319
+ */
320
+ function spawnFFmpegProcessDebug(rtcAudioSource, onAudioStreamReady) {
321
+ const stdio = ["ignore", "ignore", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
322
+ const args = getFFmpegArgumentsDebug();
323
+ const ffmpegProcess = spawn("ffmpeg", args, { stdio });
324
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
325
+ ffmpegProcess.stderr.setEncoding("utf8");
326
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
327
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error (debug): is ffmpeg installed?"));
328
+ return ffmpegProcess;
329
+ }
330
+ /**
331
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
332
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
333
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
334
+ * The process will log its output to stderr.
335
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
336
+ * @return The spawned FFmpeg process.
337
+ */
338
+ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
339
+ const stdio = ["pipe", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
340
+ const args = getFFmpegArguments();
341
+ const ffmpegProcess = spawn("ffmpeg", args, { stdio });
342
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
343
+ ffmpegProcess.stderr.setEncoding("utf8");
344
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
345
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
346
+ let audioBuffer = Buffer.alloc(0);
347
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
348
+ ffmpegProcess.stdout.on("data", (chunk) => {
349
+ audioBuffer = Buffer.concat([audioBuffer, chunk]);
350
+ while (audioBuffer.length >= FRAME_SIZE_BYTES) {
351
+ const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
352
+ const samples = new Int16Array(FRAME_10MS_SAMPLES);
353
+ for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
354
+ samples[i] = frameData.readInt16LE(i * 2);
355
+ }
356
+ enqueueOutputFrame(samples);
357
+ audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
358
+ }
359
+ });
360
+ return ffmpegProcess;
361
+ }
362
+ /**
363
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
364
+ * This function creates an AudioSink for the track and sets up a data handler
365
+ * that enqueues audio frames into the pacer.
366
+ *
367
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
368
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
369
+ * @param audioTrack The MediaStreamTrack containing the audio data.
370
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
371
+ */
372
+ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
373
+ const writer = ffmpegProcess.stdio[3 + slot];
374
+ const sink = new AudioSink(audioTrack);
375
+ const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
376
+ if (ch !== 1 || bitsPerSample !== 16)
377
+ return;
378
+ let out = samples;
379
+ if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
380
+ const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
381
+ out = resampled;
382
+ }
383
+ appendAndDrainTo480(slot, out);
384
+ });
385
+ const stop = () => {
386
+ try {
387
+ unsubscribe();
388
+ sink.stop();
389
+ }
390
+ catch (_a) {
391
+ console.error("Failed to stop AudioSink");
392
+ }
393
+ };
394
+ return { sink, writer, stop };
395
+ }
396
+ /**
397
+ * Stop the FFmpeg process and clean up all resources.
398
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
399
+ * and kill the FFmpeg process.
400
+ * @param ffmpegProcess The FFmpeg process to stop.
401
+ */
402
+ function stopFFmpegProcess(ffmpegProcess) {
403
+ var _a, _b;
404
+ stopPacer();
405
+ if (ffmpegProcess && !ffmpegProcess.killed) {
406
+ try {
407
+ ffmpegProcess.stdout.unpipe();
408
+ }
409
+ catch (_c) {
410
+ console.error("Failed to unpipe ffmpeg stdout");
411
+ }
412
+ for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
413
+ const w = ffmpegProcess.stdio[3 + i];
414
+ try {
415
+ w.end();
416
+ }
417
+ catch (_d) {
418
+ console.error("Failed to end ffmpeg writable stream");
419
+ }
420
+ }
365
421
  try {
366
- w.end();
422
+ (_a = ffmpegProcess.stdin) === null || _a === void 0 ? void 0 : _a.write("q\n");
423
+ (_b = ffmpegProcess.stdin) === null || _b === void 0 ? void 0 : _b.end();
367
424
  }
368
- catch (_b) {
369
- console.error("Failed to end ffmpeg writable stream");
425
+ catch (_e) {
426
+ console.error("Failed to end ffmpeg stdin");
370
427
  }
371
428
  }
372
- ffmpegProcess.kill("SIGTERM");
373
429
  }
430
+ return {
431
+ spawnFFmpegProcess,
432
+ spawnFFmpegProcessDebug,
433
+ writeAudioDataToFFmpeg,
434
+ stopFFmpegProcess,
435
+ clearSlotQueue,
436
+ };
374
437
  }
375
438
 
376
439
  class AudioMixer extends EventEmitter {
@@ -381,6 +444,7 @@ class AudioMixer extends EventEmitter {
381
444
  this.rtcAudioSource = null;
382
445
  this.participantSlots = new Map();
383
446
  this.activeSlots = {};
447
+ this.mixer = createFfmpegMixer();
384
448
  this.setupMediaStream();
385
449
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
386
450
  this.onStreamReady = onStreamReady;
@@ -399,7 +463,7 @@ class AudioMixer extends EventEmitter {
399
463
  return;
400
464
  }
401
465
  if (!this.ffmpegProcess && this.rtcAudioSource) {
402
- this.ffmpegProcess = spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
466
+ this.ffmpegProcess = this.mixer.spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
403
467
  }
404
468
  for (const p of participants)
405
469
  this.attachParticipantIfNeeded(p);
@@ -412,7 +476,7 @@ class AudioMixer extends EventEmitter {
412
476
  }
413
477
  stopAudioMixer() {
414
478
  if (this.ffmpegProcess) {
415
- stopFFmpegProcess(this.ffmpegProcess);
479
+ this.mixer.stopFFmpegProcess(this.ffmpegProcess);
416
480
  this.ffmpegProcess = null;
417
481
  }
418
482
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
@@ -465,7 +529,7 @@ class AudioMixer extends EventEmitter {
465
529
  }
466
530
  this.activeSlots[slot] = undefined;
467
531
  }
468
- const { sink, writer, stop } = writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
532
+ const { sink, writer, stop } = this.mixer.writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
469
533
  this.activeSlots[slot] = { sink, writer, stop, trackId: audioTrack.id };
470
534
  (_a = audioTrack.addEventListener) === null || _a === void 0 ? void 0 : _a.call(audioTrack, "ended", () => this.detachParticipant(participantId));
471
535
  }
@@ -484,7 +548,7 @@ class AudioMixer extends EventEmitter {
484
548
  this.activeSlots[slot] = undefined;
485
549
  }
486
550
  // Clear any queued audio data for this slot to prevent stale audio
487
- clearSlotQueue(slot);
551
+ this.mixer.clearSlotQueue(slot);
488
552
  this.participantSlots.set(slot, "");
489
553
  }
490
554
  }
@@ -495,6 +559,15 @@ class Assistant extends EventEmitter$1 {
495
559
  this.mediaStream = null;
496
560
  this.audioSource = null;
497
561
  this.combinedStream = null;
562
+ this.roomUrl = null;
563
+ this.handleConnectionStatusChange = (status) => {
564
+ if (status === "connected") {
565
+ this.emit(ASSISTANT_JOINED_ROOM, { roomUrl: this.roomUrl || "" });
566
+ }
567
+ if (["left", "kicked"].includes(status)) {
568
+ this.emit(ASSISTANT_LEFT_ROOM, { roomUrl: this.roomUrl || "" });
569
+ }
570
+ };
498
571
  this.assistantKey = assistantKey;
499
572
  this.client = new WherebyClient();
500
573
  this.roomConnection = this.client.getRoomConnection();
@@ -519,6 +592,7 @@ class Assistant extends EventEmitter$1 {
519
592
  const audioMixer = new AudioMixer(handleStreamReady);
520
593
  this.combinedStream = audioMixer.getCombinedAudioStream();
521
594
  this.roomConnection.subscribeToRemoteParticipants(audioMixer.handleRemoteParticipants.bind(audioMixer));
595
+ this.roomConnection.subscribeToConnectionStatus(this.handleConnectionStatusChange);
522
596
  }
523
597
  }
524
598
  joinRoom(roomUrl) {
@@ -526,6 +600,7 @@ class Assistant extends EventEmitter$1 {
526
600
  if (this.mediaStream) {
527
601
  yield this.localMedia.startMedia(this.mediaStream);
528
602
  }
603
+ this.roomUrl = roomUrl;
529
604
  this.roomConnection.initialize({
530
605
  localMediaOptions: {
531
606
  audio: false,
@@ -636,10 +711,28 @@ const webhookRouter = (webhookTriggers, emitter) => {
636
711
  res.end();
637
712
  });
638
713
  router.post("/", jsonParser, (req, res) => {
639
- var _a;
714
+ var _a, _b, _c, _d, _e, _f, _g, _h;
640
715
  assert(req.body, "message body is required");
641
716
  assert("type" in req.body, "webhook type is required");
642
- const shouldTriggerOnReceivedWebhook = (_a = webhookTriggers[req.body.type]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body);
717
+ let shouldTriggerOnReceivedWebhook = false;
718
+ switch (req.body.type) {
719
+ case "room.client.joined":
720
+ shouldTriggerOnReceivedWebhook =
721
+ (_b = (_a = webhookTriggers["room.client.joined"]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body)) !== null && _b !== void 0 ? _b : false;
722
+ break;
723
+ case "room.client.left":
724
+ shouldTriggerOnReceivedWebhook =
725
+ (_d = (_c = webhookTriggers["room.client.left"]) === null || _c === void 0 ? void 0 : _c.call(webhookTriggers, req.body)) !== null && _d !== void 0 ? _d : false;
726
+ break;
727
+ case "room.session.started":
728
+ shouldTriggerOnReceivedWebhook =
729
+ (_f = (_e = webhookTriggers["room.session.started"]) === null || _e === void 0 ? void 0 : _e.call(webhookTriggers, req.body)) !== null && _f !== void 0 ? _f : false;
730
+ break;
731
+ case "room.session.ended":
732
+ shouldTriggerOnReceivedWebhook =
733
+ (_h = (_g = webhookTriggers["room.session.ended"]) === null || _g === void 0 ? void 0 : _g.call(webhookTriggers, req.body)) !== null && _h !== void 0 ? _h : false;
734
+ break;
735
+ }
643
736
  if (shouldTriggerOnReceivedWebhook) {
644
737
  const roomUrl = buildRoomUrl(req.body.data.roomName, req.body.data.subdomain);
645
738
  emitter.emit(TRIGGER_EVENT_SUCCESS, { roomUrl, triggerWebhook: req.body });
@@ -668,4 +761,4 @@ class Trigger extends EventEmitter {
668
761
  }
669
762
  }
670
763
 
671
- export { AUDIO_STREAM_READY, Assistant, AudioSink, AudioSource, TRIGGER_EVENT_SUCCESS, Trigger };
764
+ export { ASSISTANT_JOINED_ROOM, ASSISTANT_LEFT_ROOM, AUDIO_STREAM_READY, Assistant, AudioSink, AudioSource, TRIGGER_EVENT_SUCCESS, Trigger };