@whereby.com/assistant-sdk 0.0.0-canary-20250916072551 → 0.0.0-canary-20250917154617

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,6 +33,8 @@ var dotenv__namespace = /*#__PURE__*/_interopNamespaceDefault(dotenv);
  const TRIGGER_EVENT_SUCCESS = "trigger_event_success";

  const AUDIO_STREAM_READY = "AUDIO_STREAM_READY";
+ const ASSISTANT_JOINED_ROOM = "ASSISTANT_JOINED_ROOM";
+ const ASSISTANT_LEFT_ROOM = "ASSISTANT_LEFT_ROOM";

  /******************************************************************************
  Copyright (c) Microsoft Corporation.
@@ -97,301 +99,362 @@ const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
  const BYTES_PER_SAMPLE = 2;
  // 480 samples per 10ms frame at 48kHz
  const FRAME_10MS_SAMPLES = 480;
- const slotBuffers = new Map();
- function appendAndDrainTo480(slot, newSamples) {
- var _a;
- const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
- const merged = new Int16Array(prev.length + newSamples.length);
- merged.set(prev, 0);
- merged.set(newSamples, prev.length);
- let offset = 0;
- while (merged.length - offset >= FRAME_10MS_SAMPLES) {
- const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
- enqueueFrame(slot, chunk); // always 480
- offset += FRAME_10MS_SAMPLES;
- }
- slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
- }
- ({
- enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
- enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
- wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
- wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
- lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
- });
- let slots = [];
- let stopPacerFn = null;
- let outputPacerState = null;
- /**
- * Simple linear interpolation resampler to convert audio to 48kHz.
- * This handles the common case of 16kHz -> 48kHz (3x upsampling).
- */
- function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
- const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
- const outputLength = Math.floor(inputFrames * ratio);
- const output = new Int16Array(outputLength);
- for (let i = 0; i < outputLength; i++) {
- const inputIndex = i / ratio;
- const index = Math.floor(inputIndex);
- const fraction = inputIndex - index;
- if (index + 1 < inputSamples.length) {
- const sample1 = inputSamples[index];
- const sample2 = inputSamples[index + 1];
- output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
+ function createFfmpegMixer() {
+ const slotBuffers = new Map();
+ function appendAndDrainTo480(slot, newSamples) {
+ var _a;
+ const prev = (_a = slotBuffers.get(slot)) !== null && _a !== void 0 ? _a : new Int16Array(0);
+ const merged = new Int16Array(prev.length + newSamples.length);
+ merged.set(prev, 0);
+ merged.set(newSamples, prev.length);
+ let offset = 0;
+ while (merged.length - offset >= FRAME_10MS_SAMPLES) {
+ const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
+ enqueueFrame(slot, chunk); // always 480
+ offset += FRAME_10MS_SAMPLES;
  }
- else {
- output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
+ }
+ ({
+ enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
+ enqSamples: new Array(PARTICIPANT_SLOTS).fill(0),
+ wroteFrames: new Array(PARTICIPANT_SLOTS).fill(0),
+ wroteSamples: new Array(PARTICIPANT_SLOTS).fill(0),
+ lastFramesSeen: new Array(PARTICIPANT_SLOTS).fill(0),
+ });
+ let slots = [];
+ let stopPacerFn = null;
+ let outputPacerState = null;
+ /**
+ * Simple linear interpolation resampler to convert audio to 48kHz.
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
+ */
+ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
+ const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
+ const outputLength = Math.floor(inputFrames * ratio);
+ const output = new Int16Array(outputLength);
+ for (let i = 0; i < outputLength; i++) {
+ const inputIndex = i / ratio;
+ const index = Math.floor(inputIndex);
+ const fraction = inputIndex - index;
+ if (index + 1 < inputSamples.length) {
+ const sample1 = inputSamples[index];
+ const sample2 = inputSamples[index + 1];
+ output[i] = Math.round(sample1 + (sample2 - sample1) * fraction);
+ }
+ else {
+ output[i] = inputSamples[Math.min(index, inputSamples.length - 1)];
+ }
+ }
+ return output;
+ }
+ /**
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
+ */
+ function enqueueOutputFrame(samples) {
+ if (outputPacerState) {
+ outputPacerState.frameQueue.push(samples);
  }
  }
- return output;
- }
- /**
- * Enqueue an audio frame for paced delivery to the RTCAudioSource.
- */
- function enqueueOutputFrame(samples) {
- if (outputPacerState) {
- outputPacerState.frameQueue.push(samples);
- }
- }
- /**
- * Start the audio pacer loop for all input slots in an FFmpeg process.
- *
- * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
- * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
- * arrive jittery, bursty, or with slightly different clocks.
- *
- * Key behavior:
- * - Writes exactly one frame per period, on a shared wall-clock grid.
- * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
- * never stalls.
- * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
- * - Honors Node stream backpressure (`write()` return false) without breaking
- * the timing grid.
- *
- * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
- * can mix them without slow-downs or drift.
- *
- * Call this once right after spawning FFmpeg:
- * ```ts
- * const ff = spawnFFmpegProcess();
- * startPacer(ff, PARTICIPANT_SLOTS);
- * ```
- *
- * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
- *
- * @param ff Child process handle from spawn("ffmpeg", ...)
- * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
- */
- function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
- if (stopPacerFn) {
- stopPacerFn();
- stopPacerFn = null;
- }
- const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
- const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
- const t0 = nowMs();
- slots = Array.from({ length: slotCount }, () => ({
- q: [],
- lastFrames: FRAME_10MS_SAMPLES, // keep constant
- nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
- }));
- outputPacerState = {
- frameQueue: [],
- nextDueMs: t0 + outputFrameMs,
- rtcAudioSource,
- onAudioStreamReady,
- didEmitReadyEvent: false,
- };
- const iv = setInterval(() => {
- const t = nowMs();
- for (let s = 0; s < slotCount; s++) {
- const st = slots[s];
- const w = writers[s];
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
- if (t >= st.nextDueMs) {
- const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
- if (!w.write(buf)) {
- // Just continue without adding drain listener - backpressure will naturally resolve
+ /**
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
+ *
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
+ * arrive jittery, bursty, or with slightly different clocks.
+ *
+ * Key behavior:
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
+ * never stalls.
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
+ * - Honors Node stream backpressure (`write()` return false) without breaking
+ * the timing grid.
+ *
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
+ * can mix them without slow-downs or drift.
+ *
+ * Call this once right after spawning FFmpeg:
+ * ```ts
+ * const ff = spawnFFmpegProcess();
+ * startPacer(ff, PARTICIPANT_SLOTS);
+ * ```
+ *
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
+ *
+ * @param ff Child process handle from spawn("ffmpeg", ...)
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
+ */
+ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
+ if (stopPacerFn) {
+ stopPacerFn();
+ stopPacerFn = null;
+ }
+ const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
+ const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
+ const t0 = nowMs();
+ slots = Array.from({ length: slotCount }, () => ({
+ q: [],
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
+ nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
+ }));
+ outputPacerState = {
+ frameQueue: [],
+ nextDueMs: t0 + outputFrameMs,
+ rtcAudioSource,
+ onAudioStreamReady,
+ didEmitReadyEvent: false,
+ };
+ const iv = setInterval(() => {
+ const t = nowMs();
+ for (let s = 0; s < slotCount; s++) {
+ const st = slots[s];
+ const w = writers[s];
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
+ if (t >= st.nextDueMs) {
+ const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
+ if (!w.write(buf)) {
+ // Just continue without adding drain listener - backpressure will naturally resolve
+ const late = t - st.nextDueMs;
+ const steps = Math.max(1, Math.ceil(late / frameMs));
+ st.nextDueMs += steps * frameMs;
+ continue;
+ }
  const late = t - st.nextDueMs;
  const steps = Math.max(1, Math.ceil(late / frameMs));
  st.nextDueMs += steps * frameMs;
- continue;
  }
- const late = t - st.nextDueMs;
- const steps = Math.max(1, Math.ceil(late / frameMs));
- st.nextDueMs += steps * frameMs;
  }
- }
- if (!outputPacerState)
- return;
- // Handle output pacer for RTCAudioSource
- const state = outputPacerState;
- if (t >= state.nextDueMs) {
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
- if (!state.didEmitReadyEvent) {
- state.onAudioStreamReady();
- state.didEmitReadyEvent = true;
+ if (!outputPacerState)
+ return;
+ // Handle output pacer for RTCAudioSource
+ const state = outputPacerState;
+ if (t >= state.nextDueMs) {
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
+ if (!state.didEmitReadyEvent) {
+ state.onAudioStreamReady();
+ state.didEmitReadyEvent = true;
+ }
+ state.rtcAudioSource.onData({
+ samples: samples,
+ sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
+ });
+ const late = t - state.nextDueMs;
+ const steps = Math.max(1, Math.ceil(late / outputFrameMs));
+ state.nextDueMs += steps * outputFrameMs;
  }
- state.rtcAudioSource.onData({
- samples: samples,
- sampleRate: STREAM_INPUT_SAMPLE_RATE_IN_HZ,
- });
- const late = t - state.nextDueMs;
- const steps = Math.max(1, Math.ceil(late / outputFrameMs));
- state.nextDueMs += steps * outputFrameMs;
+ }, 5);
+ stopPacerFn = () => clearInterval(iv);
+ }
+ /**
+ * Stop the audio pacer loop and clear all input slots.
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
+ */
+ function stopPacer() {
+ if (stopPacerFn)
+ stopPacerFn();
+ stopPacerFn = null;
+ slots = [];
+ slotBuffers.clear();
+ outputPacerState = null;
+ }
+ /**
+ * Queue a live frame for a given slot (0..N-1).
+ * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
+ */
+ function enqueueFrame(slot, samples, numberOfFrames) {
+ const st = slots[slot];
+ if (!st)
+ return;
+ const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
+ st.q.push(buf);
+ }
+ /**
+ * Clear the audio queue for a specific slot when a participant leaves.
+ * This prevents stale audio data from continuing to play after disconnect.
+ */
+ function clearSlotQueue(slot) {
+ const st = slots[slot];
+ if (st) {
+ st.q = [];
+ slotBuffers.delete(slot);
+ const now = Number(process.hrtime.bigint()) / 1e6;
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
+ st.nextDueMs = now + frameMs;
  }
- }, 5);
- stopPacerFn = () => clearInterval(iv);
- }
- /**
- * Stop the audio pacer loop and clear all input slots.
- * Call this before killing the FFmpeg process to ensure clean shutdown.
- */
- function stopPacer() {
- if (stopPacerFn)
- stopPacerFn();
- stopPacerFn = null;
- slots = [];
- }
- /**
- * Queue a live frame for a given slot (0..N-1).
- * Auto-resnaps the slot's schedule if the frame size (480/960) changes.
- */
- function enqueueFrame(slot, samples, numberOfFrames) {
- const st = slots[slot];
- if (!st)
- return;
- const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
- st.q.push(buf);
- }
- /**
- * Clear the audio queue for a specific slot when a participant leaves.
- * This prevents stale audio data from continuing to play after disconnect.
- */
- function clearSlotQueue(slot) {
- const st = slots[slot];
- if (st) {
- st.q = [];
- const now = Number(process.hrtime.bigint()) / 1e6;
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
- st.nextDueMs = now + frameMs;
  }
- }
- /**
- * Get the FFmpeg arguments for mixing audio from multiple participants.
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
- */
- function getFFmpegArguments() {
- const N = PARTICIPANT_SLOTS;
- const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
- const ffArgs = [];
- for (let i = 0; i < N; i++) {
- ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
- }
- const pre = [];
- for (let i = 0; i < N; i++) {
- pre.push(`[${i}:a]aresample=async=1:first_pts=0,asetpts=N/SR/TB[a${i}]`);
- }
- const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
- const amix = `${labels}amix=inputs=${N}:duration=longest:dropout_transition=250:normalize=0[mix]`;
- const filter = `${pre.join(";")};${amix}`;
- ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
- return ffArgs;
- }
- /**
- * Spawn a new FFmpeg process for mixing audio from multiple participants.
- * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
- * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
- * The process will log its output to stderr.
- * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
- * @return The spawned FFmpeg process.
- */
- function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
- const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
- const args = getFFmpegArguments();
- const ffmpegProcess = child_process.spawn("ffmpeg", args, { stdio });
- startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
- ffmpegProcess.stderr.setEncoding("utf8");
- ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
- ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
- let audioBuffer = Buffer.alloc(0);
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
- ffmpegProcess.stdout.on("data", (chunk) => {
- audioBuffer = Buffer.concat([audioBuffer, chunk]);
- while (audioBuffer.length >= FRAME_SIZE_BYTES) {
- const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
- const samples = new Int16Array(FRAME_10MS_SAMPLES);
- for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
- samples[i] = frameData.readInt16LE(i * 2);
- }
- enqueueOutputFrame(samples);
- audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
+ /**
+ * Get the FFmpeg arguments for debugging, which writes each participant's audio to a separate WAV file
+ * and also mixes them into a single WAV file.
+ * This is useful for inspecting the audio quality and timing of each participant.
+ */
+ function getFFmpegArgumentsDebug() {
+ const N = PARTICIPANT_SLOTS;
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
+ const ffArgs = [];
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
  }
- });
- return ffmpegProcess;
- }
- /**
- * Write audio data from a MediaStreamTrack to the FFmpeg process.
- * This function creates an AudioSink for the track and sets up a data handler
- * that enqueues audio frames into the pacer.
- *
- * @param ffmpegProcess The FFmpeg process to which audio data will be written.
- * @param slot The participant slot number (0..N-1) to which this track belongs.
- * @param audioTrack The MediaStreamTrack containing the audio data.
- * @return An object containing the AudioSink, the writable stream, and a stop function.
- */
- function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
- const writer = ffmpegProcess.stdio[3 + slot];
- const sink = new AudioSink(audioTrack);
- const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
- if (ch !== 1 || bitsPerSample !== 16)
- return;
- let out = samples;
- if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
- const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
- out = resampled;
+ const pre = [];
+ for (let i = 0; i < N; i++) {
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS,asplit=2[a${i}tap][a${i}mix]`);
  }
- appendAndDrainTo480(slot, out);
- });
- const stop = () => {
- try {
- unsubscribe();
- sink.stop();
+ const mixInputs = Array.from({ length: N }, (_, i) => `[a${i}mix]`).join("");
+ const filter = `${pre.join(";")};${mixInputs}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "info", "-y", "-filter_complex", filter);
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-map", `[a${i}tap]`, "-f", "wav", "-c:a", "pcm_s16le", `pre${i}.wav`);
  }
- catch (_a) {
- console.error("Failed to stop AudioSink");
- }
- };
- return { sink, writer, stop };
- }
- /**
- * Stop the FFmpeg process and clean up all resources.
- * This function will unpipe the stdout, end all writable streams for each participant slot,
- * and kill the FFmpeg process.
- * @param ffmpegProcess The FFmpeg process to stop.
- */
- function stopFFmpegProcess(ffmpegProcess) {
- stopPacer();
- if (ffmpegProcess && !ffmpegProcess.killed) {
- try {
- ffmpegProcess.stdout.unpipe();
+ ffArgs.push("-map", "[mix]", "-f", "wav", "-c:a", "pcm_s16le", "mixed.wav");
+ return ffArgs;
+ }
+ /**
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
+ */
+ function getFFmpegArguments() {
+ const N = PARTICIPANT_SLOTS;
+ const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
+ const ffArgs = [];
+ for (let i = 0; i < N; i++) {
+ ffArgs.push("-f", "s16le", "-ar", String(SR), "-ac", "1", "-i", `pipe:${3 + i}`);
  }
- catch (_a) {
- console.error("Failed to unpipe ffmpeg stdout");
+ const pre = [];
+ for (let i = 0; i < N; i++) {
+ pre.push(`[${i}:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a${i}]`);
  }
- for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
- const w = ffmpegProcess.stdio[3 + i];
+ const labels = Array.from({ length: N }, (_, i) => `[a${i}]`).join("");
+ const amix = `${labels}amix=inputs=${N}:duration=first:dropout_transition=0:normalize=0[mix]`;
+ const filter = `${pre.join(";")};${amix}`;
+ ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
+ return ffArgs;
+ }
+ /*
+ * Spawn a new FFmpeg process for debugging purposes.
+ * This will write each participant's audio to a separate WAV file and also mix them into a single WAV file.
+ * The output files will be named pre0.wav, pre1.wav, ..., and mixed.wav.
+ * The process will log its output to stderr.
+ * @return The spawned FFmpeg process.
+ */
+ function spawnFFmpegProcessDebug(rtcAudioSource, onAudioStreamReady) {
+ const stdio = ["ignore", "ignore", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
+ const args = getFFmpegArgumentsDebug();
+ const ffmpegProcess = child_process.spawn("ffmpeg", args, { stdio });
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
+ ffmpegProcess.stderr.setEncoding("utf8");
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error (debug): is ffmpeg installed?"));
+ return ffmpegProcess;
+ }
+ /**
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
+ * The process will log its output to stderr.
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
+ * @return The spawned FFmpeg process.
+ */
+ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
+ const stdio = ["pipe", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
+ const args = getFFmpegArguments();
+ const ffmpegProcess = child_process.spawn("ffmpeg", args, { stdio });
+ startPacer(ffmpegProcess, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
+ ffmpegProcess.stderr.setEncoding("utf8");
+ ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
+ ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
+ let audioBuffer = Buffer.alloc(0);
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
+ ffmpegProcess.stdout.on("data", (chunk) => {
+ audioBuffer = Buffer.concat([audioBuffer, chunk]);
+ while (audioBuffer.length >= FRAME_SIZE_BYTES) {
+ const frameData = audioBuffer.subarray(0, FRAME_SIZE_BYTES);
+ const samples = new Int16Array(FRAME_10MS_SAMPLES);
+ for (let i = 0; i < FRAME_10MS_SAMPLES; i++) {
+ samples[i] = frameData.readInt16LE(i * 2);
+ }
+ enqueueOutputFrame(samples);
+ audioBuffer = audioBuffer.subarray(FRAME_SIZE_BYTES);
+ }
+ });
+ return ffmpegProcess;
+ }
+ /**
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
+ * This function creates an AudioSink for the track and sets up a data handler
+ * that enqueues audio frames into the pacer.
+ *
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
+ * @param audioTrack The MediaStreamTrack containing the audio data.
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
+ */
+ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
+ const writer = ffmpegProcess.stdio[3 + slot];
+ const sink = new AudioSink(audioTrack);
+ const unsubscribe = sink.subscribe(({ samples, sampleRate: sr, channelCount: ch, bitsPerSample, numberOfFrames }) => {
+ if (ch !== 1 || bitsPerSample !== 16)
+ return;
+ let out = samples;
+ if (sr !== STREAM_INPUT_SAMPLE_RATE_IN_HZ) {
+ const resampled = resampleTo48kHz(samples, sr, numberOfFrames !== null && numberOfFrames !== void 0 ? numberOfFrames : samples.length);
+ out = resampled;
+ }
+ appendAndDrainTo480(slot, out);
+ });
+ const stop = () => {
+ try {
+ unsubscribe();
+ sink.stop();
+ }
+ catch (_a) {
+ console.error("Failed to stop AudioSink");
+ }
+ };
+ return { sink, writer, stop };
+ }
+ /**
+ * Stop the FFmpeg process and clean up all resources.
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
+ * and kill the FFmpeg process.
+ * @param ffmpegProcess The FFmpeg process to stop.
+ */
+ function stopFFmpegProcess(ffmpegProcess) {
+ var _a, _b;
+ stopPacer();
+ if (ffmpegProcess && !ffmpegProcess.killed) {
+ try {
+ ffmpegProcess.stdout.unpipe();
+ }
+ catch (_c) {
+ console.error("Failed to unpipe ffmpeg stdout");
+ }
+ for (let i = 0; i < PARTICIPANT_SLOTS; i++) {
+ const w = ffmpegProcess.stdio[3 + i];
+ try {
+ w.end();
+ }
+ catch (_d) {
+ console.error("Failed to end ffmpeg writable stream");
+ }
+ }
  try {
- w.end();
+ (_a = ffmpegProcess.stdin) === null || _a === void 0 ? void 0 : _a.write("q\n");
+ (_b = ffmpegProcess.stdin) === null || _b === void 0 ? void 0 : _b.end();
  }
- catch (_b) {
- console.error("Failed to end ffmpeg writable stream");
+ catch (_e) {
+ console.error("Failed to end ffmpeg stdin");
  }
  }
- ffmpegProcess.kill("SIGTERM");
  }
+ return {
+ spawnFFmpegProcess,
+ spawnFFmpegProcessDebug,
+ writeAudioDataToFFmpeg,
+ stopFFmpegProcess,
+ clearSlotQueue,
+ };
  }

  class AudioMixer extends EventEmitter.EventEmitter {
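Note on the hunk above: besides moving the mixer's module-level state into a `createFfmpegMixer()` closure and swapping `SIGTERM` for a graceful `q` on stdin, it changes the filtergraph from `aresample=async=1` with `amix=...:duration=longest:dropout_transition=250` to `async=0,asetpts=PTS-STARTPTS` with `duration=first:dropout_transition=0`, leaning on the pacer (rather than async resampling) to keep inputs aligned. For reference, a hand-expanded sketch of the filter string the new `getFFmpegArguments` builds, assuming a hypothetical `PARTICIPANT_SLOTS` of 2:

```ts
// Sketch only: manual expansion of the template literals in getFFmpegArguments,
// with N = 2 (the real N is PARTICIPANT_SLOTS, defined elsewhere in this bundle).
const filter =
    "[0:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a0];" +
    "[1:a]aresample=async=0:first_pts=0,asetpts=PTS-STARTPTS[a1];" +
    "[a0][a1]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[mix]";
// Passed as: ffmpeg ... -filter_complex <filter> -map [mix] -f s16le -ar 48000 -ac 1 -c:a pcm_s16le pipe:1
```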
@@ -402,6 +465,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  this.rtcAudioSource = null;
  this.participantSlots = new Map();
  this.activeSlots = {};
+ this.mixer = createFfmpegMixer();
  this.setupMediaStream();
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
  this.onStreamReady = onStreamReady;
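Because each `AudioMixer` now constructs its own mixer here, two mixers no longer share `slotBuffers`, `slots`, or the pacer interval. A minimal sketch of the shape the factory returns, inferred from the `return { ... }` at the end of `createFfmpegMixer` above (`createFfmpegMixer` is internal to the bundle, and these type names are assumptions, not published typings):

```ts
import type { ChildProcess } from "node:child_process";

// Assumed shape; parameter types are loosened to `unknown` for the sketch.
interface FfmpegMixer {
    spawnFFmpegProcess(rtcAudioSource: unknown, onAudioStreamReady: () => void): ChildProcess;
    spawnFFmpegProcessDebug(rtcAudioSource: unknown, onAudioStreamReady: () => void): ChildProcess;
    writeAudioDataToFFmpeg(ff: ChildProcess, slot: number, audioTrack: unknown): {
        sink: unknown; writer: unknown; stop: () => void;
    };
    stopFFmpegProcess(ff: ChildProcess): void;
    clearSlotQueue(slot: number): void;
}

declare function createFfmpegMixer(): FfmpegMixer;
const mixerA = createFfmpegMixer(); // owns its own slot buffers and pacer
const mixerB = createFfmpegMixer(); // fully independent of mixerA
```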
@@ -420,7 +484,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  return;
  }
  if (!this.ffmpegProcess && this.rtcAudioSource) {
- this.ffmpegProcess = spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
+ this.ffmpegProcess = this.mixer.spawnFFmpegProcess(this.rtcAudioSource, this.onStreamReady);
  }
  for (const p of participants)
  this.attachParticipantIfNeeded(p);
@@ -433,7 +497,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  }
  stopAudioMixer() {
  if (this.ffmpegProcess) {
- stopFFmpegProcess(this.ffmpegProcess);
+ this.mixer.stopFFmpegProcess(this.ffmpegProcess);
  this.ffmpegProcess = null;
  }
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
@@ -486,7 +550,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  }
  this.activeSlots[slot] = undefined;
  }
- const { sink, writer, stop } = writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
+ const { sink, writer, stop } = this.mixer.writeAudioDataToFFmpeg(this.ffmpegProcess, slot, audioTrack);
  this.activeSlots[slot] = { sink, writer, stop, trackId: audioTrack.id };
  (_a = audioTrack.addEventListener) === null || _a === void 0 ? void 0 : _a.call(audioTrack, "ended", () => this.detachParticipant(participantId));
  }
@@ -505,7 +569,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
  this.activeSlots[slot] = undefined;
  }
  // Clear any queued audio data for this slot to prevent stale audio
- clearSlotQueue(slot);
+ this.mixer.clearSlotQueue(slot);
  this.participantSlots.set(slot, "");
  }
  }
@@ -516,6 +580,15 @@ class Assistant extends EventEmitter {
  this.mediaStream = null;
  this.audioSource = null;
  this.combinedStream = null;
+ this.roomUrl = null;
+ this.handleConnectionStatusChange = (status) => {
+ if (status === "connected") {
+ this.emit(ASSISTANT_JOINED_ROOM, { roomUrl: this.roomUrl || "" });
+ }
+ if (["left", "kicked"].includes(status)) {
+ this.emit(ASSISTANT_LEFT_ROOM, { roomUrl: this.roomUrl || "" });
+ }
+ };
  this.assistantKey = assistantKey;
  this.client = new core.WherebyClient();
  this.roomConnection = this.client.getRoomConnection();
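Downstream, these lifecycle events can be observed on an `Assistant` instance, since the class extends `EventEmitter` and emits a `{ roomUrl }` payload as shown above. A minimal sketch (the constructor argument and env var name are assumptions based on this dist, and the room URL is hypothetical):

```ts
import { Assistant, ASSISTANT_JOINED_ROOM, ASSISTANT_LEFT_ROOM } from "@whereby.com/assistant-sdk";

const assistant = new Assistant(process.env.WHEREBY_ASSISTANT_KEY); // constructor arg assumed from this dist

assistant.on(ASSISTANT_JOINED_ROOM, ({ roomUrl }: { roomUrl: string }) => {
    console.log("assistant joined", roomUrl); // fired when connection status becomes "connected"
});
assistant.on(ASSISTANT_LEFT_ROOM, ({ roomUrl }: { roomUrl: string }) => {
    console.log("assistant left", roomUrl); // fired on "left" or "kicked"
});

void assistant.joinRoom("https://example.whereby.com/room"); // hypothetical room URL
```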
@@ -540,6 +613,7 @@ class Assistant extends EventEmitter {
  const audioMixer = new AudioMixer(handleStreamReady);
  this.combinedStream = audioMixer.getCombinedAudioStream();
  this.roomConnection.subscribeToRemoteParticipants(audioMixer.handleRemoteParticipants.bind(audioMixer));
+ this.roomConnection.subscribeToConnectionStatus(this.handleConnectionStatusChange);
  }
  }
  joinRoom(roomUrl) {
@@ -547,6 +621,7 @@ class Assistant extends EventEmitter {
  if (this.mediaStream) {
  yield this.localMedia.startMedia(this.mediaStream);
  }
+ this.roomUrl = roomUrl;
  this.roomConnection.initialize({
  localMediaOptions: {
  audio: false,
@@ -657,10 +732,28 @@ const webhookRouter = (webhookTriggers, emitter) => {
  res.end();
  });
  router.post("/", jsonParser, (req, res) => {
- var _a;
+ var _a, _b, _c, _d, _e, _f, _g, _h;
  assert(req.body, "message body is required");
  assert("type" in req.body, "webhook type is required");
- const shouldTriggerOnReceivedWebhook = (_a = webhookTriggers[req.body.type]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body);
+ let shouldTriggerOnReceivedWebhook = false;
+ switch (req.body.type) {
+ case "room.client.joined":
+ shouldTriggerOnReceivedWebhook =
+ (_b = (_a = webhookTriggers["room.client.joined"]) === null || _a === void 0 ? void 0 : _a.call(webhookTriggers, req.body)) !== null && _b !== void 0 ? _b : false;
+ break;
+ case "room.client.left":
+ shouldTriggerOnReceivedWebhook =
+ (_d = (_c = webhookTriggers["room.client.left"]) === null || _c === void 0 ? void 0 : _c.call(webhookTriggers, req.body)) !== null && _d !== void 0 ? _d : false;
+ break;
+ case "room.session.started":
+ shouldTriggerOnReceivedWebhook =
+ (_f = (_e = webhookTriggers["room.session.started"]) === null || _e === void 0 ? void 0 : _e.call(webhookTriggers, req.body)) !== null && _f !== void 0 ? _f : false;
+ break;
+ case "room.session.ended":
+ shouldTriggerOnReceivedWebhook =
+ (_h = (_g = webhookTriggers["room.session.ended"]) === null || _g === void 0 ? void 0 : _g.call(webhookTriggers, req.body)) !== null && _h !== void 0 ? _h : false;
+ break;
+ }
  if (shouldTriggerOnReceivedWebhook) {
  const roomUrl = buildRoomUrl(req.body.data.roomName, req.body.data.subdomain);
  emitter.emit(TRIGGER_EVENT_SUCCESS, { roomUrl, triggerWebhook: req.body });
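The move from an untyped `webhookTriggers[req.body.type]` lookup to an explicit `switch` means only the four listed webhook types can fire a trigger; an unrecognized type now leaves `shouldTriggerOnReceivedWebhook` as `false` instead of calling an arbitrary property off the map. A trigger map consumed by `webhookRouter` would look roughly like this (a sketch; predicate bodies and conditions are hypothetical):

```ts
// Sketch: each predicate receives the webhook body and returns whether the
// trigger should fire; a true result emits TRIGGER_EVENT_SUCCESS with a roomUrl
// built from data.roomName and data.subdomain (see the hunk above).
const webhookTriggers = {
    "room.client.joined": (body: { data: { roomName: string; subdomain: string } }) =>
        body.data.roomName === "/standup", // hypothetical condition
    "room.session.started": () => true,
    // "room.client.left" and "room.session.ended" are the other recognized types.
};
```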
@@ -689,6 +782,8 @@ class Trigger extends EventEmitter.EventEmitter {
  }
  }

+ exports.ASSISTANT_JOINED_ROOM = ASSISTANT_JOINED_ROOM;
+ exports.ASSISTANT_LEFT_ROOM = ASSISTANT_LEFT_ROOM;
  exports.AUDIO_STREAM_READY = AUDIO_STREAM_READY;
  exports.Assistant = Assistant;
  exports.AudioSink = AudioSink;
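Finally, the two new event-name constants are exported from the CJS bundle alongside the existing ones, so consumers can subscribe without hard-coding strings (sketch; note each constant's value equals its name, per the first hunk):

```ts
// CJS consumers of dist/index.cjs:
const { ASSISTANT_JOINED_ROOM, ASSISTANT_LEFT_ROOM } = require("@whereby.com/assistant-sdk");
console.log(ASSISTANT_JOINED_ROOM); // "ASSISTANT_JOINED_ROOM"
```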