@whereby.com/assistant-sdk 0.0.0-canary-20250911141956 → 0.0.0-canary-20250912142319

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -9,6 +9,26 @@ var express = require('express');
9
9
  var assert = require('assert');
10
10
  var bodyParser = require('body-parser');
11
11
  var os = require('os');
12
+ var dotenv = require('dotenv');
13
+
14
+ function _interopNamespaceDefault(e) {
15
+ var n = Object.create(null);
16
+ if (e) {
17
+ Object.keys(e).forEach(function (k) {
18
+ if (k !== 'default') {
19
+ var d = Object.getOwnPropertyDescriptor(e, k);
20
+ Object.defineProperty(n, k, d.get ? d : {
21
+ enumerable: true,
22
+ get: function () { return e[k]; }
23
+ });
24
+ }
25
+ });
26
+ }
27
+ n.default = e;
28
+ return Object.freeze(n);
29
+ }
30
+
31
+ var dotenv__namespace = /*#__PURE__*/_interopNamespaceDefault(dotenv);
12
32
 
13
33
  const ASSISTANT_JOIN_SUCCESS = "ASSISTANT_JOIN_SUCCESS";
14
34
 
@@ -68,9 +88,14 @@ class AudioSink extends wrtc.nonstandard.RTCAudioSink {
68
88
  }
69
89
  }
70
90
 
91
+ // Number of pipes in the ffmpeg process. We predefine a fixed number of slots, and then we dynamically assign
92
+ // participants to these slots based on mute/unmute state.
71
93
  const PARTICIPANT_SLOTS = 20;
94
+ // Each sample is 2 bytes (16 bits) for PCM audio - s16le format
95
+ // 48000 Hz is the standard sample rate for WebRTC audio
72
96
  const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
73
97
  const BYTES_PER_SAMPLE = 2;
98
+ // 480 samples per 10ms frame at 48kHz
74
99
  const FRAME_10MS_SAMPLES = 480;
75
100
  const slotBuffers = new Map();
76
101
  function appendAndDrainTo480(slot, newSamples) {
@@ -82,10 +107,10 @@ function appendAndDrainTo480(slot, newSamples) {
82
107
  let offset = 0;
83
108
  while (merged.length - offset >= FRAME_10MS_SAMPLES) {
84
109
  const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
85
- enqueueFrame(slot, chunk);
110
+ enqueueFrame(slot, chunk); // always 480
86
111
  offset += FRAME_10MS_SAMPLES;
87
112
  }
88
- slotBuffers.set(slot, merged.subarray(offset));
113
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
89
114
  }
90
115
  ({
91
116
  enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
@@ -97,6 +122,10 @@ function appendAndDrainTo480(slot, newSamples) {
97
122
  let slots = [];
98
123
  let stopPacerFn = null;
99
124
  let outputPacerState = null;
125
+ /**
126
+ * Simple linear interpolation resampler to convert audio to 48kHz.
127
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
128
+ */
100
129
  function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
101
130
  const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
102
131
  const outputLength = Math.floor(inputFrames * ratio);
@@ -116,11 +145,43 @@ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
116
145
  }
117
146
  return output;
118
147
  }
148
+ /**
149
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
150
+ */
119
151
  function enqueueOutputFrame(samples) {
120
152
  if (outputPacerState) {
121
153
  outputPacerState.frameQueue.push(samples);
122
154
  }
123
155
  }
156
+ /**
157
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
158
+ *
159
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
160
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
161
+ * arrive jittery, bursty, or with slightly different clocks.
162
+ *
163
+ * Key behavior:
164
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
165
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
166
+ * never stalls.
167
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
168
+ * - Honors Node stream backpressure (`write()` return false) without breaking
169
+ * the timing grid.
170
+ *
171
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
172
+ * can mix them without slow-downs or drift.
173
+ *
174
+ * Call this once right after spawning FFmpeg:
175
+ * ```ts
176
+ * const ff = spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady);
177
+ * startPacer(ff, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
178
+ * ```
179
+ *
180
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
181
+ *
182
+ * @param ff Child process handle from spawn("ffmpeg", ...)
183
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
184
+ */
124
185
  function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
125
186
  if (stopPacerFn) {
126
187
  stopPacerFn();
@@ -128,11 +189,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
128
189
  }
129
190
  const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
130
191
  const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
131
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
192
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
132
193
  const t0 = nowMs();
133
194
  slots = Array.from({ length: slotCount }, () => ({
134
195
  q: [],
135
- lastFrames: FRAME_10MS_SAMPLES,
196
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
136
197
  nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
137
198
  }));
138
199
  outputPacerState = {
@@ -147,10 +208,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
147
208
  for (let s = 0; s < slotCount; s++) {
148
209
  const st = slots[s];
149
210
  const w = writers[s];
150
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
211
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
151
212
  if (t >= st.nextDueMs) {
152
213
  const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
153
214
  if (!w.write(buf)) {
215
+ // Just continue without adding drain listener - backpressure will naturally resolve
154
216
  const late = t - st.nextDueMs;
155
217
  const steps = Math.max(1, Math.ceil(late / frameMs));
156
218
  st.nextDueMs += steps * frameMs;
@@ -163,9 +225,10 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
163
225
  }
164
226
  if (!outputPacerState)
165
227
  return;
228
+ // Handle output pacer for RTCAudioSource
166
229
  const state = outputPacerState;
167
230
  if (t >= state.nextDueMs) {
168
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES);
231
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
169
232
  if (!state.didEmitReadyEvent) {
170
233
  state.onAudioStreamReady();
171
234
  state.didEmitReadyEvent = true;
@@ -181,12 +244,20 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
181
244
  }, 5);
182
245
  stopPacerFn = () => clearInterval(iv);
183
246
  }
247
+ /**
248
+ * Stop the audio pacer loop and clear all input slots.
249
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
250
+ */
184
251
  function stopPacer() {
185
252
  if (stopPacerFn)
186
253
  stopPacerFn();
187
254
  stopPacerFn = null;
188
255
  slots = [];
189
256
  }
257
+ /**
258
+ * Queue a live frame for a given slot (0..N-1).
259
+ * The samples are wrapped in a Buffer and appended to that slot's queue; the slot's schedule is not changed here.
260
+ */
190
261
  function enqueueFrame(slot, samples, numberOfFrames) {
191
262
  const st = slots[slot];
192
263
  if (!st)
@@ -194,6 +265,10 @@ function enqueueFrame(slot, samples, numberOfFrames) {
194
265
  const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
195
266
  st.q.push(buf);
196
267
  }
268
+ /**
269
+ * Clear the audio queue for a specific slot when a participant leaves.
270
+ * This prevents stale audio data from continuing to play after disconnect.
271
+ */
197
272
  function clearSlotQueue(slot) {
198
273
  const st = slots[slot];
199
274
  if (st) {
@@ -203,6 +278,11 @@ function clearSlotQueue(slot) {
203
278
  st.nextDueMs = now + frameMs;
204
279
  }
205
280
  }
281
+ /**
282
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
283
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
284
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
285
+ */
206
286
  function getFFmpegArguments() {
207
287
  const N = PARTICIPANT_SLOTS;
208
288
  const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
@@ -220,6 +300,14 @@ function getFFmpegArguments() {
220
300
  ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
221
301
  return ffArgs;
222
302
  }
303
+ /**
304
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
305
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
306
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
307
+ * The process will log its output to stderr.
308
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
309
+ * @return The spawned FFmpeg process.
310
+ */
223
311
  function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
224
312
  const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
225
313
  const args = getFFmpegArguments();
@@ -229,7 +317,7 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
229
317
  ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
230
318
  ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
231
319
  let audioBuffer = Buffer.alloc(0);
232
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE;
320
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
233
321
  ffmpegProcess.stdout.on("data", (chunk) => {
234
322
  audioBuffer = Buffer.concat([audioBuffer, chunk]);
235
323
  while (audioBuffer.length >= FRAME_SIZE_BYTES) {
@@ -244,6 +332,16 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
244
332
  });
245
333
  return ffmpegProcess;
246
334
  }
335
+ /**
336
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
337
+ * This function creates an AudioSink for the track and sets up a data handler
338
+ * that enqueues audio frames into the pacer.
339
+ *
340
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
341
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
342
+ * @param audioTrack The MediaStreamTrack containing the audio data.
343
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
344
+ */
247
345
  function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
248
346
  const writer = ffmpegProcess.stdio[3 + slot];
249
347
  const sink = new AudioSink(audioTrack);
@@ -268,6 +366,12 @@ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
268
366
  };
269
367
  return { sink, writer, stop };
270
368
  }
369
+ /**
370
+ * Stop the FFmpeg process and clean up all resources.
371
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
372
+ * and kill the FFmpeg process.
373
+ * @param ffmpegProcess The FFmpeg process to stop.
374
+ */
271
375
  function stopFFmpegProcess(ffmpegProcess) {
272
376
  stopPacer();
273
377
  if (ffmpegProcess && !ffmpegProcess.killed) {
@@ -321,6 +425,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
321
425
  for (const p of participants)
322
426
  this.attachParticipantIfNeeded(p);
323
427
  const liveIds = new Set(participants.map((p) => p.id).filter(Boolean));
428
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
324
429
  for (const [slot, pid] of this.participantSlots) {
325
430
  if (pid && !liveIds.has(pid))
326
431
  this.detachParticipant(pid);
@@ -333,6 +438,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
333
438
  }
334
439
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
335
440
  this.activeSlots = {};
441
+ // Recreate the media stream to avoid stale references
336
442
  this.setupMediaStream();
337
443
  }
338
444
  slotForParticipant(participantId) {
@@ -398,6 +504,7 @@ class AudioMixer extends EventEmitter.EventEmitter {
398
504
  }
399
505
  this.activeSlots[slot] = undefined;
400
506
  }
507
+ // Clear any queued audio data for this slot to prevent stale audio
401
508
  clearSlotQueue(slot);
402
509
  this.participantSlots.set(slot, "");
403
510
  }
@@ -521,10 +628,11 @@ class Assistant extends EventEmitter {
521
628
  }
522
629
  }
523
630
 
524
- const BIND_INTERFACE = "en0";
631
+ dotenv__namespace.config();
632
+ const { IS_LOCAL = "false", BIND_INTERFACE = "en0" } = process.env;
525
633
  function buildRoomUrl(roomPath, wherebySubdomain, baseDomain = "whereby.com") {
526
634
  let wherebyDomain;
527
- {
635
+ if (IS_LOCAL === "true") {
528
636
  const ifaceAddrs = os.networkInterfaces()[BIND_INTERFACE];
529
637
  if (!ifaceAddrs) {
530
638
  throw new Error(`Unknown interface ${BIND_INTERFACE}`);
@@ -535,6 +643,9 @@ function buildRoomUrl(roomPath, wherebySubdomain, baseDomain = "whereby.com") {
535
643
  }
536
644
  wherebyDomain = `${wherebySubdomain}-ip-${bindAddr.address.replace(/[.]/g, "-")}.hereby.dev:4443`;
537
645
  }
646
+ else {
647
+ wherebyDomain = `${wherebySubdomain}.${baseDomain}`;
648
+ }
538
649
  return `https://${wherebyDomain}${roomPath}`;
539
650
  }
540
651
 
@@ -562,7 +673,7 @@ const webhookRouter = (webhookTriggers, emitter, assistantKey, startCombinedAudi
562
673
  return router;
563
674
  };
564
675
  class Trigger extends EventEmitter.EventEmitter {
565
- constructor({ webhookTriggers = {}, port = 4999, assistantKey, startCombinedAudioStream, startLocalMedia, }) {
676
+ constructor({ webhookTriggers = {}, port = 8080, assistantKey, startCombinedAudioStream, startLocalMedia, }) {
566
677
  super();
567
678
  this.webhookTriggers = webhookTriggers;
568
679
  this.port = port;
@@ -575,6 +686,7 @@ class Trigger extends EventEmitter.EventEmitter {
575
686
  const router = webhookRouter(this.webhookTriggers, this, this.assistantKey, this.startCombinedAudioStream, this.startLocalMedia);
576
687
  app.use(router);
577
688
  const server = app.listen(this.port, () => {
689
+ // console.log(`Bot trigger server now running on port[${this.port}]`);
578
690
  });
579
691
  process.on("SIGTERM", () => {
580
692
  server.close();
package/dist/index.mjs CHANGED
@@ -7,6 +7,7 @@ import express from 'express';
7
7
  import assert from 'assert';
8
8
  import bodyParser from 'body-parser';
9
9
  import { networkInterfaces } from 'os';
10
+ import * as dotenv from 'dotenv';
10
11
 
11
12
  const ASSISTANT_JOIN_SUCCESS = "ASSISTANT_JOIN_SUCCESS";
12
13
 
@@ -66,9 +67,14 @@ class AudioSink extends wrtc.nonstandard.RTCAudioSink {
66
67
  }
67
68
  }
68
69
 
70
+ // Number of pipes in the ffmpeg process. We predefine a fixed number of slots, and then we dynamically assign
71
+ // participants to these slots based on mute/unmute state.
69
72
  const PARTICIPANT_SLOTS = 20;
73
+ // Each sample is 2 bytes (16 bits) for PCM audio - s16le format
74
+ // 48000 Hz is the standard sample rate for WebRTC audio
70
75
  const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
71
76
  const BYTES_PER_SAMPLE = 2;
77
+ // 480 samples per 10ms frame at 48kHz
72
78
  const FRAME_10MS_SAMPLES = 480;
73
79
  const slotBuffers = new Map();
74
80
  function appendAndDrainTo480(slot, newSamples) {
@@ -80,10 +86,10 @@ function appendAndDrainTo480(slot, newSamples) {
80
86
  let offset = 0;
81
87
  while (merged.length - offset >= FRAME_10MS_SAMPLES) {
82
88
  const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
83
- enqueueFrame(slot, chunk);
89
+ enqueueFrame(slot, chunk); // always 480
84
90
  offset += FRAME_10MS_SAMPLES;
85
91
  }
86
- slotBuffers.set(slot, merged.subarray(offset));
92
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
87
93
  }
88
94
  ({
89
95
  enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
@@ -95,6 +101,10 @@ function appendAndDrainTo480(slot, newSamples) {
95
101
  let slots = [];
96
102
  let stopPacerFn = null;
97
103
  let outputPacerState = null;
104
+ /**
105
+ * Simple linear interpolation resampler to convert audio to 48kHz.
106
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
107
+ */
98
108
  function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
99
109
  const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
100
110
  const outputLength = Math.floor(inputFrames * ratio);
@@ -114,11 +124,43 @@ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
114
124
  }
115
125
  return output;
116
126
  }
127
+ /**
128
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
129
+ */
117
130
  function enqueueOutputFrame(samples) {
118
131
  if (outputPacerState) {
119
132
  outputPacerState.frameQueue.push(samples);
120
133
  }
121
134
  }
135
+ /**
136
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
137
+ *
138
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
139
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
140
+ * arrive jittery, bursty, or with slightly different clocks.
141
+ *
142
+ * Key behavior:
143
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
144
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
145
+ * never stalls.
146
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
147
+ * - Honors Node stream backpressure (`write()` return false) without breaking
148
+ * the timing grid.
149
+ *
150
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
151
+ * can mix them without slow-downs or drift.
152
+ *
153
+ * Call this once right after spawning FFmpeg:
154
+ * ```ts
155
+ * const ff = spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady);
156
+ * startPacer(ff, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
157
+ * ```
158
+ *
159
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
160
+ *
161
+ * @param ff Child process handle from spawn("ffmpeg", ...)
162
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
163
+ */
122
164
  function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
123
165
  if (stopPacerFn) {
124
166
  stopPacerFn();
@@ -126,11 +168,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
126
168
  }
127
169
  const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
128
170
  const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
129
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
171
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
130
172
  const t0 = nowMs();
131
173
  slots = Array.from({ length: slotCount }, () => ({
132
174
  q: [],
133
- lastFrames: FRAME_10MS_SAMPLES,
175
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
134
176
  nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
135
177
  }));
136
178
  outputPacerState = {
@@ -145,10 +187,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
145
187
  for (let s = 0; s < slotCount; s++) {
146
188
  const st = slots[s];
147
189
  const w = writers[s];
148
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
190
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
149
191
  if (t >= st.nextDueMs) {
150
192
  const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
151
193
  if (!w.write(buf)) {
194
+ // Just continue without adding drain listener - backpressure will naturally resolve
152
195
  const late = t - st.nextDueMs;
153
196
  const steps = Math.max(1, Math.ceil(late / frameMs));
154
197
  st.nextDueMs += steps * frameMs;
@@ -161,9 +204,10 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
161
204
  }
162
205
  if (!outputPacerState)
163
206
  return;
207
+ // Handle output pacer for RTCAudioSource
164
208
  const state = outputPacerState;
165
209
  if (t >= state.nextDueMs) {
166
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES);
210
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
167
211
  if (!state.didEmitReadyEvent) {
168
212
  state.onAudioStreamReady();
169
213
  state.didEmitReadyEvent = true;
@@ -179,12 +223,20 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
179
223
  }, 5);
180
224
  stopPacerFn = () => clearInterval(iv);
181
225
  }
226
+ /**
227
+ * Stop the audio pacer loop and clear all input slots.
228
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
229
+ */
182
230
  function stopPacer() {
183
231
  if (stopPacerFn)
184
232
  stopPacerFn();
185
233
  stopPacerFn = null;
186
234
  slots = [];
187
235
  }
236
+ /**
237
+ * Queue a live frame for a given slot (0..N-1).
238
+ * The samples are wrapped in a Buffer and appended to that slot's queue; the slot's schedule is not changed here.
239
+ */
188
240
  function enqueueFrame(slot, samples, numberOfFrames) {
189
241
  const st = slots[slot];
190
242
  if (!st)
@@ -192,6 +244,10 @@ function enqueueFrame(slot, samples, numberOfFrames) {
192
244
  const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
193
245
  st.q.push(buf);
194
246
  }
247
+ /**
248
+ * Clear the audio queue for a specific slot when a participant leaves.
249
+ * This prevents stale audio data from continuing to play after disconnect.
250
+ */
195
251
  function clearSlotQueue(slot) {
196
252
  const st = slots[slot];
197
253
  if (st) {
@@ -201,6 +257,11 @@ function clearSlotQueue(slot) {
201
257
  st.nextDueMs = now + frameMs;
202
258
  }
203
259
  }
260
+ /**
261
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
262
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
263
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
264
+ */
204
265
  function getFFmpegArguments() {
205
266
  const N = PARTICIPANT_SLOTS;
206
267
  const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
@@ -218,6 +279,14 @@ function getFFmpegArguments() {
218
279
  ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
219
280
  return ffArgs;
220
281
  }
282
+ /**
283
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
284
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
285
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
286
+ * The process will log its output to stderr.
287
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
288
+ * @return The spawned FFmpeg process.
289
+ */
221
290
  function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
222
291
  const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
223
292
  const args = getFFmpegArguments();
@@ -227,7 +296,7 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
227
296
  ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
228
297
  ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
229
298
  let audioBuffer = Buffer.alloc(0);
230
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE;
299
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
231
300
  ffmpegProcess.stdout.on("data", (chunk) => {
232
301
  audioBuffer = Buffer.concat([audioBuffer, chunk]);
233
302
  while (audioBuffer.length >= FRAME_SIZE_BYTES) {
@@ -242,6 +311,16 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
242
311
  });
243
312
  return ffmpegProcess;
244
313
  }
314
+ /**
315
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
316
+ * This function creates an AudioSink for the track and sets up a data handler
317
+ * that enqueues audio frames into the pacer.
318
+ *
319
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
320
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
321
+ * @param audioTrack The MediaStreamTrack containing the audio data.
322
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
323
+ */
245
324
  function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
246
325
  const writer = ffmpegProcess.stdio[3 + slot];
247
326
  const sink = new AudioSink(audioTrack);
@@ -266,6 +345,12 @@ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
266
345
  };
267
346
  return { sink, writer, stop };
268
347
  }
348
+ /**
349
+ * Stop the FFmpeg process and clean up all resources.
350
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
351
+ * and kill the FFmpeg process.
352
+ * @param ffmpegProcess The FFmpeg process to stop.
353
+ */
269
354
  function stopFFmpegProcess(ffmpegProcess) {
270
355
  stopPacer();
271
356
  if (ffmpegProcess && !ffmpegProcess.killed) {
@@ -319,6 +404,7 @@ class AudioMixer extends EventEmitter {
319
404
  for (const p of participants)
320
405
  this.attachParticipantIfNeeded(p);
321
406
  const liveIds = new Set(participants.map((p) => p.id).filter(Boolean));
407
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
322
408
  for (const [slot, pid] of this.participantSlots) {
323
409
  if (pid && !liveIds.has(pid))
324
410
  this.detachParticipant(pid);
@@ -331,6 +417,7 @@ class AudioMixer extends EventEmitter {
331
417
  }
332
418
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
333
419
  this.activeSlots = {};
420
+ // Recreate the media stream to avoid stale references
334
421
  this.setupMediaStream();
335
422
  }
336
423
  slotForParticipant(participantId) {
@@ -396,6 +483,7 @@ class AudioMixer extends EventEmitter {
396
483
  }
397
484
  this.activeSlots[slot] = undefined;
398
485
  }
486
+ // Clear any queued audio data for this slot to prevent stale audio
399
487
  clearSlotQueue(slot);
400
488
  this.participantSlots.set(slot, "");
401
489
  }
@@ -519,10 +607,11 @@ class Assistant extends EventEmitter$1 {
519
607
  }
520
608
  }
521
609
 
522
- const BIND_INTERFACE = "en0";
610
+ dotenv.config();
611
+ const { IS_LOCAL = "false", BIND_INTERFACE = "en0" } = process.env;
523
612
  function buildRoomUrl(roomPath, wherebySubdomain, baseDomain = "whereby.com") {
524
613
  let wherebyDomain;
525
- {
614
+ if (IS_LOCAL === "true") {
526
615
  const ifaceAddrs = networkInterfaces()[BIND_INTERFACE];
527
616
  if (!ifaceAddrs) {
528
617
  throw new Error(`Unknown interface ${BIND_INTERFACE}`);
@@ -533,6 +622,9 @@ function buildRoomUrl(roomPath, wherebySubdomain, baseDomain = "whereby.com") {
533
622
  }
534
623
  wherebyDomain = `${wherebySubdomain}-ip-${bindAddr.address.replace(/[.]/g, "-")}.hereby.dev:4443`;
535
624
  }
625
+ else {
626
+ wherebyDomain = `${wherebySubdomain}.${baseDomain}`;
627
+ }
536
628
  return `https://${wherebyDomain}${roomPath}`;
537
629
  }
538
630
 
@@ -560,7 +652,7 @@ const webhookRouter = (webhookTriggers, emitter, assistantKey, startCombinedAudi
560
652
  return router;
561
653
  };
562
654
  class Trigger extends EventEmitter {
563
- constructor({ webhookTriggers = {}, port = 4999, assistantKey, startCombinedAudioStream, startLocalMedia, }) {
655
+ constructor({ webhookTriggers = {}, port = 8080, assistantKey, startCombinedAudioStream, startLocalMedia, }) {
564
656
  super();
565
657
  this.webhookTriggers = webhookTriggers;
566
658
  this.port = port;
@@ -573,6 +665,7 @@ class Trigger extends EventEmitter {
573
665
  const router = webhookRouter(this.webhookTriggers, this, this.assistantKey, this.startCombinedAudioStream, this.startLocalMedia);
574
666
  app.use(router);
575
667
  const server = app.listen(this.port, () => {
668
+ // console.log(`Bot trigger server now running on port[${this.port}]`);
576
669
  });
577
670
  process.on("SIGTERM", () => {
578
671
  server.close();
@@ -7,6 +7,7 @@ import express from 'express';
7
7
  import assert from 'assert';
8
8
  import bodyParser from 'body-parser';
9
9
  import { networkInterfaces } from 'os';
10
+ import * as dotenv from 'dotenv';
10
11
 
11
12
  const ASSISTANT_JOIN_SUCCESS = "ASSISTANT_JOIN_SUCCESS";
12
13
 
@@ -66,9 +67,14 @@ class AudioSink extends wrtc.nonstandard.RTCAudioSink {
66
67
  }
67
68
  }
68
69
 
70
+ // Number of pipes in the ffmpeg process. We predefine a fixed number of slots, and then we dynamically assign
71
+ // participants to these slots based on mute/unmute state.
69
72
  const PARTICIPANT_SLOTS = 20;
73
+ // Each sample is 2 bytes (16 bits) for PCM audio - s16le format
74
+ // 48000 Hz is the standard sample rate for WebRTC audio
70
75
  const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
71
76
  const BYTES_PER_SAMPLE = 2;
77
+ // 480 samples per 10ms frame at 48kHz
72
78
  const FRAME_10MS_SAMPLES = 480;
73
79
  const slotBuffers = new Map();
74
80
  function appendAndDrainTo480(slot, newSamples) {
@@ -80,10 +86,10 @@ function appendAndDrainTo480(slot, newSamples) {
80
86
  let offset = 0;
81
87
  while (merged.length - offset >= FRAME_10MS_SAMPLES) {
82
88
  const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
83
- enqueueFrame(slot, chunk);
89
+ enqueueFrame(slot, chunk); // always 480
84
90
  offset += FRAME_10MS_SAMPLES;
85
91
  }
86
- slotBuffers.set(slot, merged.subarray(offset));
92
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
87
93
  }
88
94
  ({
89
95
  enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
@@ -95,6 +101,10 @@ function appendAndDrainTo480(slot, newSamples) {
95
101
  let slots = [];
96
102
  let stopPacerFn = null;
97
103
  let outputPacerState = null;
104
+ /**
105
+ * Simple linear interpolation resampler to convert audio to 48kHz.
106
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
107
+ */
98
108
  function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
99
109
  const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
100
110
  const outputLength = Math.floor(inputFrames * ratio);
@@ -114,11 +124,43 @@ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
114
124
  }
115
125
  return output;
116
126
  }
127
+ /**
128
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
129
+ */
117
130
  function enqueueOutputFrame(samples) {
118
131
  if (outputPacerState) {
119
132
  outputPacerState.frameQueue.push(samples);
120
133
  }
121
134
  }
135
+ /**
136
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
137
+ *
138
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
139
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
140
+ * arrive jittery, bursty, or with slightly different clocks.
141
+ *
142
+ * Key behavior:
143
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
144
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
145
+ * never stalls.
146
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
147
+ * - Honors Node stream backpressure (`write()` return false) without breaking
148
+ * the timing grid.
149
+ *
150
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
151
+ * can mix them without slow-downs or drift.
152
+ *
153
+ * Call this once right after spawning FFmpeg:
154
+ * ```ts
155
+ * const ff = spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady);
156
+ * startPacer(ff, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
157
+ * ```
158
+ *
159
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
160
+ *
161
+ * @param ff Child process handle from spawn("ffmpeg", ...)
162
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
163
+ */
122
164
  function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
123
165
  if (stopPacerFn) {
124
166
  stopPacerFn();
@@ -126,11 +168,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
126
168
  }
127
169
  const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
128
170
  const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
129
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
171
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
130
172
  const t0 = nowMs();
131
173
  slots = Array.from({ length: slotCount }, () => ({
132
174
  q: [],
133
- lastFrames: FRAME_10MS_SAMPLES,
175
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
134
176
  nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
135
177
  }));
136
178
  outputPacerState = {
@@ -145,10 +187,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
145
187
  for (let s = 0; s < slotCount; s++) {
146
188
  const st = slots[s];
147
189
  const w = writers[s];
148
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
190
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
149
191
  if (t >= st.nextDueMs) {
150
192
  const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
151
193
  if (!w.write(buf)) {
194
+ // Just continue without adding drain listener - backpressure will naturally resolve
152
195
  const late = t - st.nextDueMs;
153
196
  const steps = Math.max(1, Math.ceil(late / frameMs));
154
197
  st.nextDueMs += steps * frameMs;
@@ -161,9 +204,10 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
161
204
  }
162
205
  if (!outputPacerState)
163
206
  return;
207
+ // Handle output pacer for RTCAudioSource
164
208
  const state = outputPacerState;
165
209
  if (t >= state.nextDueMs) {
166
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES);
210
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
167
211
  if (!state.didEmitReadyEvent) {
168
212
  state.onAudioStreamReady();
169
213
  state.didEmitReadyEvent = true;
@@ -179,12 +223,20 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
179
223
  }, 5);
180
224
  stopPacerFn = () => clearInterval(iv);
181
225
  }
226
+ /**
227
+ * Stop the audio pacer loop and clear all input slots.
228
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
229
+ */
182
230
  function stopPacer() {
183
231
  if (stopPacerFn)
184
232
  stopPacerFn();
185
233
  stopPacerFn = null;
186
234
  slots = [];
187
235
  }
236
+ /**
237
+ * Queue a live frame for a given slot (0..N-1).
238
+ * The frame is appended to the slot's queue as-is; the pacer writes it out on a later tick.
239
+ */
188
240
  function enqueueFrame(slot, samples, numberOfFrames) {
189
241
  const st = slots[slot];
190
242
  if (!st)
@@ -192,6 +244,10 @@ function enqueueFrame(slot, samples, numberOfFrames) {
192
244
  const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
193
245
  st.q.push(buf);
194
246
  }
247
+ /**
248
+ * Clear the audio queue for a specific slot when a participant leaves.
249
+ * This prevents stale audio data from continuing to play after disconnect.
250
+ */
195
251
  function clearSlotQueue(slot) {
196
252
  const st = slots[slot];
197
253
  if (st) {
@@ -201,6 +257,11 @@ function clearSlotQueue(slot) {
201
257
  st.nextDueMs = now + frameMs;
202
258
  }
203
259
  }
260
+ /**
261
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
262
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
263
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
264
+ */
204
265
  function getFFmpegArguments() {
205
266
  const N = PARTICIPANT_SLOTS;
206
267
  const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
@@ -218,6 +279,14 @@ function getFFmpegArguments() {
218
279
  ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
219
280
  return ffArgs;
220
281
  }
282
+ /**
283
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
284
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
285
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
286
+ * FFmpeg's stderr output is forwarded to console.error with an "[ffmpeg]" prefix.
287
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
288
+ * @return The spawned FFmpeg process.
289
+ */
221
290
  function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
222
291
  const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
223
292
  const args = getFFmpegArguments();
@@ -227,7 +296,7 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
227
296
  ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
228
297
  ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
229
298
  let audioBuffer = Buffer.alloc(0);
230
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE;
299
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
231
300
  ffmpegProcess.stdout.on("data", (chunk) => {
232
301
  audioBuffer = Buffer.concat([audioBuffer, chunk]);
233
302
  while (audioBuffer.length >= FRAME_SIZE_BYTES) {
@@ -242,6 +311,16 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
242
311
  });
243
312
  return ffmpegProcess;
244
313
  }
314
+ /**
315
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
316
+ * This function creates an AudioSink for the track and sets up a data handler
317
+ * that enqueues audio frames into the pacer.
318
+ *
319
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
320
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
321
+ * @param audioTrack The MediaStreamTrack containing the audio data.
322
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
323
+ */
245
324
  function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
246
325
  const writer = ffmpegProcess.stdio[3 + slot];
247
326
  const sink = new AudioSink(audioTrack);
@@ -266,6 +345,12 @@ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
266
345
  };
267
346
  return { sink, writer, stop };
268
347
  }
348
+ /**
349
+ * Stop the FFmpeg process and clean up all resources.
350
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
351
+ * and kill the FFmpeg process.
352
+ * @param ffmpegProcess The FFmpeg process to stop.
353
+ */
269
354
  function stopFFmpegProcess(ffmpegProcess) {
270
355
  stopPacer();
271
356
  if (ffmpegProcess && !ffmpegProcess.killed) {
@@ -319,6 +404,7 @@ class AudioMixer extends EventEmitter {
319
404
  for (const p of participants)
320
405
  this.attachParticipantIfNeeded(p);
321
406
  const liveIds = new Set(participants.map((p) => p.id).filter(Boolean));
407
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
322
408
  for (const [slot, pid] of this.participantSlots) {
323
409
  if (pid && !liveIds.has(pid))
324
410
  this.detachParticipant(pid);
@@ -331,6 +417,7 @@ class AudioMixer extends EventEmitter {
331
417
  }
332
418
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
333
419
  this.activeSlots = {};
420
+ // Recreate the media stream to avoid stale references
334
421
  this.setupMediaStream();
335
422
  }
336
423
  slotForParticipant(participantId) {
@@ -396,6 +483,7 @@ class AudioMixer extends EventEmitter {
396
483
  }
397
484
  this.activeSlots[slot] = undefined;
398
485
  }
486
+ // Clear any queued audio data for this slot to prevent stale audio
399
487
  clearSlotQueue(slot);
400
488
  this.participantSlots.set(slot, "");
401
489
  }
@@ -519,10 +607,11 @@ class Assistant extends EventEmitter$1 {
519
607
  }
520
608
  }
521
609
 
522
- const BIND_INTERFACE = "en0";
610
+ dotenv.config();
611
+ const { IS_LOCAL = "false", BIND_INTERFACE = "en0" } = process.env;
523
612
  function buildRoomUrl(roomPath, wherebySubdomain, baseDomain = "whereby.com") {
524
613
  let wherebyDomain;
525
- {
614
+ if (IS_LOCAL === "true") {
526
615
  const ifaceAddrs = networkInterfaces()[BIND_INTERFACE];
527
616
  if (!ifaceAddrs) {
528
617
  throw new Error(`Unknown interface ${BIND_INTERFACE}`);
@@ -533,6 +622,9 @@ function buildRoomUrl(roomPath, wherebySubdomain, baseDomain = "whereby.com") {
533
622
  }
534
623
  wherebyDomain = `${wherebySubdomain}-ip-${bindAddr.address.replace(/[.]/g, "-")}.hereby.dev:4443`;
535
624
  }
625
+ else {
626
+ wherebyDomain = `${wherebySubdomain}.${baseDomain}`;
627
+ }
536
628
  return `https://${wherebyDomain}${roomPath}`;
537
629
  }
538
630
 
@@ -560,7 +652,7 @@ const webhookRouter = (webhookTriggers, emitter, assistantKey, startCombinedAudi
560
652
  return router;
561
653
  };
562
654
  class Trigger extends EventEmitter {
563
- constructor({ webhookTriggers = {}, port = 4999, assistantKey, startCombinedAudioStream, startLocalMedia, }) {
655
+ constructor({ webhookTriggers = {}, port = 8080, assistantKey, startCombinedAudioStream, startLocalMedia, }) {
564
656
  super();
565
657
  this.webhookTriggers = webhookTriggers;
566
658
  this.port = port;
@@ -573,6 +665,7 @@ class Trigger extends EventEmitter {
573
665
  const router = webhookRouter(this.webhookTriggers, this, this.assistantKey, this.startCombinedAudioStream, this.startLocalMedia);
574
666
  app.use(router);
575
667
  const server = app.listen(this.port, () => {
668
+ // console.log(`Bot trigger server now running on port[${this.port}]`);
576
669
  });
577
670
  process.on("SIGTERM", () => {
578
671
  server.close();
@@ -38,8 +38,10 @@ typeof SuppressedError === "function" ? SuppressedError : function (error, suppr
38
38
 
39
39
  function setWebsocketOrigin(roomUrl) {
40
40
  try {
41
+ // add pathname needed for parsing in rtcstats-server.
41
42
  const url = new URL(roomUrl);
42
43
  global.window.location.pathname = url.pathname;
44
+ // fix origin header needed for parsing in rtcstats-server.
43
45
  const defaultClientOptions = {
44
46
  origin: url.origin,
45
47
  };
@@ -55,24 +57,29 @@ function setWebsocketOrigin(roomUrl) {
55
57
  }
56
58
  }
57
59
  const wrtcMediaDevices = wrtc.mediaDevices;
58
- global.navigator = {
59
- userAgent: "Node.js/20",
60
- mediaDevices: {
61
- getUserMedia: wrtc.getUserMedia,
62
- addEventListener: wrtcMediaDevices.addEventListener,
63
- removeEventListener: wrtcMediaDevices.removeEventListener,
64
- enumerateDevices: () => __awaiter(void 0, void 0, void 0, function* () {
65
- return new Promise((resolve) => resolve([
66
- {
67
- deviceId: "default",
68
- groupId: uuid.v4(),
69
- kind: "audioinput",
70
- label: "Dummy audio device",
71
- },
72
- ]));
73
- }),
60
+ Object.defineProperty(global, "navigator", {
61
+ value: {
62
+ userAgent: "Node.js/20",
63
+ mediaDevices: {
64
+ getUserMedia: wrtc.getUserMedia,
65
+ addEventListener: wrtcMediaDevices.addEventListener,
66
+ removeEventListener: wrtcMediaDevices.removeEventListener,
67
+ enumerateDevices: () => __awaiter(void 0, void 0, void 0, function* () {
68
+ return new Promise((resolve) => resolve([
69
+ {
70
+ deviceId: "default",
71
+ groupId: uuid.v4(),
72
+ kind: "audioinput",
73
+ label: "Dummy audio device",
74
+ },
75
+ ]));
76
+ }),
77
+ },
74
78
  },
75
- };
79
+ writable: false,
80
+ enumerable: true,
81
+ configurable: true,
82
+ });
76
83
  class DOMException {
77
84
  constructor(...args) {
78
85
  console.error("DOMException", args);
@@ -85,6 +92,10 @@ class RTCPeerConnection extends wrtc.RTCPeerConnection {
85
92
  }
86
93
  getStats(arg) {
87
94
  return __awaiter(this, void 0, void 0, function* () {
95
+ /**
96
+ * node-wrtc seems to expect an Object argument, and doesn't handle the null arg we pass, so we
97
+ * wrap the call and filter the arg
98
+ **/
88
99
  arg = arg instanceof Object ? arg : undefined;
89
100
  const stats = yield this.wrappedGetStats(arg);
90
101
  return stats;
@@ -107,6 +118,6 @@ global.RTCRtpSender = wrtc.RTCRtpSender;
107
118
  global.RTCRtpTransceiver = wrtc.RTCRtpTransceiver;
108
119
  global.RTCSctpTransport = wrtc.RTCSctpTransport;
109
120
  global.RTCSessionDescription = wrtc.RTCSessionDescription;
110
- global.window = Object.assign(Object.assign({}, global), { location: { pathname: "" }, screen: { width: 0 }, setInterval: global.setInterval });
121
+ global.window = Object.assign(Object.assign({}, global), { location: { pathname: "" }, screen: { width: 0 }, setInterval: global.setInterval }); // make sure all the classes / setInterval are available on window for rtcstats
111
122
 
112
123
  exports.setWebsocketOrigin = setWebsocketOrigin;
package/dist/tools.cjs CHANGED
@@ -19,9 +19,14 @@ class AudioSink extends wrtc.nonstandard.RTCAudioSink {
19
19
  }
20
20
  }
21
21
 
22
+ // Number of pipes in the ffmpeg process. We predefine a fixed number of slots, and then we dynamically assign
23
+ // participants to these slots based on mute/unmute state.
22
24
  const PARTICIPANT_SLOTS = 20;
25
+ // Each sample is 2 bytes (16 bits) for PCM audio - s16le format
26
+ // 48000 Hz is the standard sample rate for WebRTC audio
23
27
  const STREAM_INPUT_SAMPLE_RATE_IN_HZ = 48000;
24
28
  const BYTES_PER_SAMPLE = 2;
29
+ // 480 samples per 10ms frame at 48kHz
25
30
  const FRAME_10MS_SAMPLES = 480;
26
31
  const slotBuffers = new Map();
27
32
  function appendAndDrainTo480(slot, newSamples) {
@@ -33,10 +38,10 @@ function appendAndDrainTo480(slot, newSamples) {
33
38
  let offset = 0;
34
39
  while (merged.length - offset >= FRAME_10MS_SAMPLES) {
35
40
  const chunk = merged.subarray(offset, offset + FRAME_10MS_SAMPLES);
36
- enqueueFrame(slot, chunk);
41
+ enqueueFrame(slot, chunk); // always 480
37
42
  offset += FRAME_10MS_SAMPLES;
38
43
  }
39
- slotBuffers.set(slot, merged.subarray(offset));
44
+ slotBuffers.set(slot, merged.subarray(offset)); // keep remainder
40
45
  }
41
46
  ({
42
47
  enqFrames: new Array(PARTICIPANT_SLOTS).fill(0),
@@ -48,6 +53,10 @@ function appendAndDrainTo480(slot, newSamples) {
48
53
  let slots = [];
49
54
  let stopPacerFn = null;
50
55
  let outputPacerState = null;
56
+ /**
57
+ * Simple linear interpolation resampler to convert audio to 48kHz.
58
+ * This handles the common case of 16kHz -> 48kHz (3x upsampling).
59
+ */
51
60
  function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
52
61
  const ratio = STREAM_INPUT_SAMPLE_RATE_IN_HZ / inputSampleRate;
53
62
  const outputLength = Math.floor(inputFrames * ratio);
@@ -67,11 +76,43 @@ function resampleTo48kHz(inputSamples, inputSampleRate, inputFrames) {
67
76
  }
68
77
  return output;
69
78
  }
79
+ /**
80
+ * Enqueue an audio frame for paced delivery to the RTCAudioSource.
81
+ */
70
82
  function enqueueOutputFrame(samples) {
71
83
  if (outputPacerState) {
72
84
  outputPacerState.frameQueue.push(samples);
73
85
  }
74
86
  }
87
+ /**
88
+ * Start the audio pacer loop for all input slots in an FFmpeg process.
89
+ *
90
+ * The pacer ensures each slot (pipe:3..3+N-1) is written to at a steady
91
+ * real-time rate (e.g. 10 ms = 480 samples @ 48kHz), even if WebRTC frames
92
+ * arrive jittery, bursty, or with slightly different clocks.
93
+ *
94
+ * Key behavior:
95
+ * - Writes exactly one frame per period, on a shared wall-clock grid.
96
+ * - Uses silence (zero-filled frame) if a slot's queue is empty, so timing
97
+ * never stalls.
98
+ * - Resnaps the schedule if a slot switches between 10 ms / 20 ms frames.
99
+ * - Honors Node stream backpressure (`write()` return false) without breaking
100
+ * the timing grid.
101
+ *
102
+ * This keeps all FFmpeg inputs phase-aligned and stable, so aresample/amix
103
+ * can mix them without slow-downs or drift.
104
+ *
105
+ * Call this once right after spawning FFmpeg:
106
+ * ```ts
107
+ * const ff = spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady);
108
+ * startPacer(ff, PARTICIPANT_SLOTS, rtcAudioSource, onAudioStreamReady);
109
+ * ```
110
+ *
111
+ * When tearing down the mixer, always call `stopPacer()` before killing FFmpeg.
112
+ *
113
+ * @param ff Child process handle from spawn("ffmpeg", ...)
114
+ * @param slotCount Number of participant input slots (0..N-1 → fd 3..3+N-1)
115
+ */
75
116
  function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
76
117
  if (stopPacerFn) {
77
118
  stopPacerFn();
@@ -79,11 +120,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
79
120
  }
80
121
  const writers = Array.from({ length: slotCount }, (_, i) => ff.stdio[3 + i]);
81
122
  const nowMs = () => Number(process.hrtime.bigint()) / 1e6;
82
- const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
123
+ const outputFrameMs = (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms
83
124
  const t0 = nowMs();
84
125
  slots = Array.from({ length: slotCount }, () => ({
85
126
  q: [],
86
- lastFrames: FRAME_10MS_SAMPLES,
127
+ lastFrames: FRAME_10MS_SAMPLES, // keep constant
87
128
  nextDueMs: t0 + (FRAME_10MS_SAMPLES / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000,
88
129
  }));
89
130
  outputPacerState = {
@@ -98,10 +139,11 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
98
139
  for (let s = 0; s < slotCount; s++) {
99
140
  const st = slots[s];
100
141
  const w = writers[s];
101
- const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000;
142
+ const frameMs = (st.lastFrames / STREAM_INPUT_SAMPLE_RATE_IN_HZ) * 1000; // 10ms if 480, 20ms if 960
102
143
  if (t >= st.nextDueMs) {
103
144
  const buf = st.q.length ? st.q.shift() : Buffer.alloc(st.lastFrames * BYTES_PER_SAMPLE);
104
145
  if (!w.write(buf)) {
146
+ // Just continue without adding drain listener - backpressure will naturally resolve
105
147
  const late = t - st.nextDueMs;
106
148
  const steps = Math.max(1, Math.ceil(late / frameMs));
107
149
  st.nextDueMs += steps * frameMs;
@@ -114,9 +156,10 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
114
156
  }
115
157
  if (!outputPacerState)
116
158
  return;
159
+ // Handle output pacer for RTCAudioSource
117
160
  const state = outputPacerState;
118
161
  if (t >= state.nextDueMs) {
119
- const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES);
162
+ const samples = state.frameQueue.length > 0 ? state.frameQueue.shift() : new Int16Array(FRAME_10MS_SAMPLES); // silence
120
163
  if (!state.didEmitReadyEvent) {
121
164
  state.onAudioStreamReady();
122
165
  state.didEmitReadyEvent = true;
@@ -132,12 +175,20 @@ function startPacer(ff, slotCount, rtcAudioSource, onAudioStreamReady) {
132
175
  }, 5);
133
176
  stopPacerFn = () => clearInterval(iv);
134
177
  }
178
+ /**
179
+ * Stop the audio pacer loop and clear all input slots.
180
+ * Call this before killing the FFmpeg process to ensure clean shutdown.
181
+ */
135
182
  function stopPacer() {
136
183
  if (stopPacerFn)
137
184
  stopPacerFn();
138
185
  stopPacerFn = null;
139
186
  slots = [];
140
187
  }
188
+ /**
189
+ * Queue a live frame for a given slot (0..N-1).
190
+ * The frame is appended to the slot's queue as-is; the pacer writes it out on a later tick.
191
+ */
141
192
  function enqueueFrame(slot, samples, numberOfFrames) {
142
193
  const st = slots[slot];
143
194
  if (!st)
@@ -145,6 +196,10 @@ function enqueueFrame(slot, samples, numberOfFrames) {
145
196
  const buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength);
146
197
  st.q.push(buf);
147
198
  }
199
+ /**
200
+ * Clear the audio queue for a specific slot when a participant leaves.
201
+ * This prevents stale audio data from continuing to play after disconnect.
202
+ */
148
203
  function clearSlotQueue(slot) {
149
204
  const st = slots[slot];
150
205
  if (st) {
@@ -154,6 +209,11 @@ function clearSlotQueue(slot) {
154
209
  st.nextDueMs = now + frameMs;
155
210
  }
156
211
  }
212
+ /**
213
+ * Get the FFmpeg arguments for mixing audio from multiple participants.
214
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
215
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
216
+ */
157
217
  function getFFmpegArguments() {
158
218
  const N = PARTICIPANT_SLOTS;
159
219
  const SR = STREAM_INPUT_SAMPLE_RATE_IN_HZ;
@@ -171,6 +231,14 @@ function getFFmpegArguments() {
171
231
  ffArgs.push("-hide_banner", "-nostats", "-loglevel", "error", "-filter_complex", filter, "-map", "[mix]", "-f", "s16le", "-ar", String(SR), "-ac", "1", "-c:a", "pcm_s16le", "pipe:1");
172
232
  return ffArgs;
173
233
  }
234
+ /**
235
+ * Spawn a new FFmpeg process for mixing audio from multiple participants.
236
+ * This will read from the input pipes (3..3+N-1) and output a single mixed audio stream.
237
+ * The output is in PCM 16-bit little-endian format at 48kHz sample rate.
238
+ * FFmpeg's stderr output is forwarded to console.error with an "[ffmpeg]" prefix.
239
+ * @param rtcAudioSource The RTCAudioSource to which the mixed audio will be sent.
240
+ * @return The spawned FFmpeg process.
241
+ */
174
242
  function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
175
243
  const stdio = ["ignore", "pipe", "pipe", ...Array(PARTICIPANT_SLOTS).fill("pipe")];
176
244
  const args = getFFmpegArguments();
@@ -180,7 +248,7 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
180
248
  ffmpegProcess.stderr.on("data", (d) => console.error("[ffmpeg]", String(d).trim()));
181
249
  ffmpegProcess.on("error", () => console.error("FFmpeg process error: is ffmpeg installed?"));
182
250
  let audioBuffer = Buffer.alloc(0);
183
- const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE;
251
+ const FRAME_SIZE_BYTES = FRAME_10MS_SAMPLES * BYTES_PER_SAMPLE; // 480 samples * 2 bytes = 960 bytes
184
252
  ffmpegProcess.stdout.on("data", (chunk) => {
185
253
  audioBuffer = Buffer.concat([audioBuffer, chunk]);
186
254
  while (audioBuffer.length >= FRAME_SIZE_BYTES) {
@@ -195,6 +263,16 @@ function spawnFFmpegProcess(rtcAudioSource, onAudioStreamReady) {
195
263
  });
196
264
  return ffmpegProcess;
197
265
  }
266
+ /**
267
+ * Write audio data from a MediaStreamTrack to the FFmpeg process.
268
+ * This function creates an AudioSink for the track and sets up a data handler
269
+ * that enqueues audio frames into the pacer.
270
+ *
271
+ * @param ffmpegProcess The FFmpeg process to which audio data will be written.
272
+ * @param slot The participant slot number (0..N-1) to which this track belongs.
273
+ * @param audioTrack The MediaStreamTrack containing the audio data.
274
+ * @return An object containing the AudioSink, the writable stream, and a stop function.
275
+ */
198
276
  function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
199
277
  const writer = ffmpegProcess.stdio[3 + slot];
200
278
  const sink = new AudioSink(audioTrack);
@@ -219,6 +297,12 @@ function writeAudioDataToFFmpeg(ffmpegProcess, slot, audioTrack) {
219
297
  };
220
298
  return { sink, writer, stop };
221
299
  }
300
+ /**
301
+ * Stop the FFmpeg process and clean up all resources.
302
+ * This function will unpipe the stdout, end all writable streams for each participant slot,
303
+ * and kill the FFmpeg process.
304
+ * @param ffmpegProcess The FFmpeg process to stop.
305
+ */
222
306
  function stopFFmpegProcess(ffmpegProcess) {
223
307
  stopPacer();
224
308
  if (ffmpegProcess && !ffmpegProcess.killed) {
@@ -272,6 +356,7 @@ class AudioMixer extends events.EventEmitter {
272
356
  for (const p of participants)
273
357
  this.attachParticipantIfNeeded(p);
274
358
  const liveIds = new Set(participants.map((p) => p.id).filter(Boolean));
359
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
275
360
  for (const [slot, pid] of this.participantSlots) {
276
361
  if (pid && !liveIds.has(pid))
277
362
  this.detachParticipant(pid);
@@ -284,6 +369,7 @@ class AudioMixer extends events.EventEmitter {
284
369
  }
285
370
  this.participantSlots = new Map(Array.from({ length: PARTICIPANT_SLOTS }, (_, i) => [i, ""]));
286
371
  this.activeSlots = {};
372
+ // Recreate the media stream to avoid stale references
287
373
  this.setupMediaStream();
288
374
  }
289
375
  slotForParticipant(participantId) {
@@ -349,6 +435,7 @@ class AudioMixer extends events.EventEmitter {
349
435
  }
350
436
  this.activeSlots[slot] = undefined;
351
437
  }
438
+ // Clear any queued audio data for this slot to prevent stale audio
352
439
  clearSlotQueue(slot);
353
440
  this.participantSlots.set(slot, "");
354
441
  }
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "@whereby.com/assistant-sdk",
3
3
  "description": "Assistant SDK for whereby.com",
4
4
  "author": "Whereby AS",
5
- "version": "0.0.0-canary-20250911141956",
5
+ "version": "0.0.0-canary-20250912142319",
6
6
  "license": "MIT",
7
7
  "files": [
8
8
  "dist",
@@ -47,6 +47,8 @@
47
47
  }
48
48
  },
49
49
  "devDependencies": {
50
+ "body-parser": "2.2.0",
51
+ "express": "5.1.0",
50
52
  "eslint": "^9.29.0",
51
53
  "prettier": "^3.5.3",
52
54
  "typescript": "^5.8.3",
@@ -58,9 +60,10 @@
58
60
  },
59
61
  "dependencies": {
60
62
  "@roamhq/wrtc": "github:whereby/node-webrtc#patch/rtc_audio_source",
63
+ "dotenv": "^16.4.5",
61
64
  "uuid": "^11.0.3",
62
65
  "ws": "^8.18.0",
63
- "@whereby.com/core": "0.0.0-canary-20250911141956"
66
+ "@whereby.com/core": "0.0.0-canary-20250912142319"
64
67
  },
65
68
  "prettier": "@whereby.com/prettier-config",
66
69
  "scripts": {