browser-pilot 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
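
The headline change in this release is a new `src/audio` module bundled into the main chunk: WAV/PCM helpers (`pcmToWav`, `parseWavHeader`, `generateTone`, `generateSilence`, `calculateRMS`), a fake-microphone injector (`AudioInput`), a multi-strategy output capture (`AudioOutput`), and new `Page.setupAudio()` / `Page.audioRoundTrip()` methods. A minimal usage sketch follows, based on the JSDoc example inside the diff; the `connect()` options, the `browser.page()` call, and the target URL are illustrative placeholders, not taken from this diff:

```typescript
import { connect, generateTone, pcmToWav } from "browser-pilot";
import { writeFileSync } from "node:fs";

// Hypothetical connection options and page accessor; only the audio calls
// below come from the API surface added in this diff.
const browser = await connect({ port: 9222 });
const page = await browser.page();

await page.setupAudio(); // install fake mic + output capture before navigating
await page.goto("https://voice-agent.example.com");

// Synthesize a 1 s, 440 Hz test utterance and wrap it as a 16-bit WAV.
const tone = generateTone(440, 1000); // Float32Array at 48 kHz
const wav = pcmToWav({ left: tone, sampleRate: 48000 }); // ArrayBuffer

// Feed it through the fake microphone, then capture the page's audio response.
const result = await page.audioRoundTrip({
  input: new Uint8Array(wav),
  silenceTimeout: 3000, // stop after 3 s of consecutive silence
});
console.log(`response: ${result.audio.durationMs}ms, latency: ${result.latencyMs}ms`);

// Persist the captured response as a stereo WAV.
writeFileSync(
  "response.wav",
  Buffer.from(pcmToWav({
    left: result.audio.left,
    right: result.audio.right,
    sampleRate: result.audio.sampleRate,
  }))
);
```

Per the implementation below, `pcmToWav` clamps samples to [-1, 1] and always emits 16-bit PCM, and `audioRoundTrip` resolves once `captureUntilSilence` observes `silenceTimeout` ms of silence after the first audible chunk.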
@@ -8,7 +8,1312 @@ import {
  BatchExecutor,
  ElementNotFoundError,
  TimeoutError
- } from "./chunk-ZTQ37YQT.mjs";
+ } from "./chunk-KKW2SZLV.mjs";
+
+ // src/audio/encoding.ts
+ function bufferToBase64(data) {
+ const bytes = data instanceof Uint8Array ? data : new Uint8Array(data);
+ let binary = "";
+ for (let i = 0; i < bytes.length; i++) {
+ binary += String.fromCharCode(bytes[i]);
+ }
+ return btoa(binary);
+ }
+ function base64ToBuffer(b64) {
+ const binary = atob(b64);
+ const bytes = new Uint8Array(binary.length);
+ for (let i = 0; i < binary.length; i++) {
+ bytes[i] = binary.charCodeAt(i);
+ }
+ return bytes;
+ }
+ function calculateRMS(samples) {
+ if (samples.length === 0) return 0;
+ let sum = 0;
+ for (let i = 0; i < samples.length; i++) {
+ sum += samples[i] * samples[i];
+ }
+ return Math.sqrt(sum / samples.length);
+ }
+ function pcmToWav(options) {
+ const { left, right, sampleRate } = options;
+ const numChannels = right ? 2 : 1;
+ const numSamples = left.length;
+ const bitsPerSample = 16;
+ const bytesPerSample = bitsPerSample / 8;
+ const blockAlign = numChannels * bytesPerSample;
+ const dataLength = numSamples * blockAlign;
+ const headerLength = 44;
+ const buffer = new ArrayBuffer(headerLength + dataLength);
+ const view = new DataView(buffer);
+ writeString(view, 0, "RIFF");
+ view.setUint32(4, 36 + dataLength, true);
+ writeString(view, 8, "WAVE");
+ writeString(view, 12, "fmt ");
+ view.setUint32(16, 16, true);
+ view.setUint16(20, 1, true);
+ view.setUint16(22, numChannels, true);
+ view.setUint32(24, sampleRate, true);
+ view.setUint32(28, sampleRate * blockAlign, true);
+ view.setUint16(32, blockAlign, true);
+ view.setUint16(34, bitsPerSample, true);
+ writeString(view, 36, "data");
+ view.setUint32(40, dataLength, true);
+ let offset = 44;
+ for (let i = 0; i < numSamples; i++) {
+ const leftSample = Math.max(-1, Math.min(1, left[i]));
+ view.setInt16(offset, leftSample < 0 ? leftSample * 32768 : leftSample * 32767, true);
+ offset += 2;
+ if (right) {
+ const rightSample = Math.max(-1, Math.min(1, right[i]));
+ view.setInt16(offset, rightSample < 0 ? rightSample * 32768 : rightSample * 32767, true);
+ offset += 2;
+ }
+ }
+ return buffer;
+ }
+ function parseWavHeader(data) {
+ const view = new DataView(data);
+ if (data.byteLength < 44) {
+ throw new Error("Invalid WAV: file too small");
+ }
+ const riff = readString(view, 0, 4);
+ const wave = readString(view, 8, 4);
+ if (riff !== "RIFF" || wave !== "WAVE") {
+ throw new Error("Invalid WAV: missing RIFF/WAVE header");
+ }
+ const fmt = readString(view, 12, 4);
+ if (fmt !== "fmt ") {
+ throw new Error("Invalid WAV: missing fmt chunk");
+ }
+ const channels = view.getUint16(22, true);
+ const sampleRate = view.getUint32(24, true);
+ const bitsPerSample = view.getUint16(34, true);
+ let dataOffset = 36;
+ while (dataOffset < data.byteLength - 8) {
+ const chunkId = readString(view, dataOffset, 4);
+ const chunkSize = view.getUint32(dataOffset + 4, true);
+ if (chunkId === "data") {
+ return {
+ sampleRate,
+ channels,
+ bitsPerSample,
+ dataOffset: dataOffset + 8,
+ dataLength: chunkSize
+ };
+ }
+ dataOffset += 8 + chunkSize;
+ }
+ throw new Error("Invalid WAV: missing data chunk");
+ }
+ function generateSilence(durationMs, sampleRate = 48e3) {
+ return new Float32Array(Math.ceil(sampleRate * durationMs / 1e3));
+ }
+ function generateTone(frequency, durationMs, sampleRate = 48e3, amplitude = 0.5) {
+ const numSamples = Math.ceil(sampleRate * durationMs / 1e3);
+ const samples = new Float32Array(numSamples);
+ for (let i = 0; i < numSamples; i++) {
+ samples[i] = amplitude * Math.sin(2 * Math.PI * frequency * i / sampleRate);
+ }
+ return samples;
+ }
+ function writeString(view, offset, str) {
+ for (let i = 0; i < str.length; i++) {
+ view.setUint8(offset + i, str.charCodeAt(i));
+ }
+ }
+ function readString(view, offset, length) {
+ let str = "";
+ for (let i = 0; i < length; i++) {
+ str += String.fromCharCode(view.getUint8(offset + i));
+ }
+ return str;
+ }
+
+ // src/audio/permissions.ts
+ async function grantAudioPermissions(cdp, origin) {
+ await cdp.send("Browser.grantPermissions", {
+ permissions: ["audioCapture"],
+ origin: origin ?? ""
+ });
+ await cdp.send("Page.addScriptToEvaluateOnNewDocument", {
+ source: PERMISSIONS_OVERRIDE_SCRIPT
+ });
+ }
+ var PERMISSIONS_OVERRIDE_SCRIPT = `
+ (function() {
+ if (window.__bpPermissionsPatched) return;
+ window.__bpPermissionsPatched = true;
+
+ var origQuery = navigator.permissions.query.bind(navigator.permissions);
+ navigator.permissions.query = function(desc) {
+ if (desc && (desc.name === 'microphone' || desc.name === 'audio-capture')) {
+ return Promise.resolve({
+ state: 'granted',
+ onchange: null,
+ addEventListener: function() {},
+ removeEventListener: function() {},
+ dispatchEvent: function() { return true; }
+ });
+ }
+ return origQuery(desc);
+ };
+ })();
+ `;
+
+ // src/audio/input.ts
+ var INPUT_BINDING = "__bpAudioInputDone";
+ var AUDIO_INPUT_SCRIPT = `
+ (function() {
+ if (window.__bpAudioInput) return;
+
+ var audioCtx = null;
+ var sourceNode = null;
+ var destinationNode = null;
+ var fakeStream = null;
+ var silenceGain = null;
+ var silenceOsc = null;
+ var isPlaying = false;
+
+ function ensureFakeStream() {
+ if (fakeStream) return fakeStream;
+ // Use the original AudioContext to avoid being tracked by our output override
+ var CtorToUse = window.__bpOrigAudioContext || window.AudioContext || window.webkitAudioContext;
+ audioCtx = new CtorToUse({ sampleRate: 48000 });
+ // Auto-resume if suspended (CDP automation has no user gesture)
+ if (audioCtx.state === 'suspended') {
+ console.log('[bp:input] AudioContext suspended, auto-resuming...');
+ audioCtx.resume().then(function() {
+ console.log('[bp:input] AudioContext resumed (' + audioCtx.state + ')');
+ }).catch(function(e) {
+ console.warn('[bp:input] AudioContext resume failed:', e);
+ });
+ }
+ destinationNode = audioCtx.createMediaStreamDestination();
+
+ // Start with silence so the stream always has active tracks
+ silenceGain = audioCtx.createGain();
+ silenceGain.gain.value = 0;
+ silenceOsc = audioCtx.createOscillator();
+ silenceOsc.connect(silenceGain);
+ silenceGain.connect(destinationNode);
+ silenceOsc.start();
+
+ fakeStream = destinationNode.stream;
+ console.log('[bp:input] Fake mic stream created (48kHz, ' + fakeStream.getAudioTracks().length + ' tracks)');
+ return fakeStream;
+ }
+
+ function playAudio(base64Data) {
+ ensureFakeStream();
+
+ var resumePromise = audioCtx.state === 'suspended'
+ ? audioCtx.resume()
+ : Promise.resolve();
+
+ return resumePromise.then(function() {
+ if (sourceNode) {
+ try { sourceNode.stop(); } catch(e) {}
+ sourceNode.disconnect();
+ sourceNode = null;
+ }
+
+ var binaryStr = atob(base64Data);
+ var bytes = new Uint8Array(binaryStr.length);
+ for (var i = 0; i < binaryStr.length; i++) {
+ bytes[i] = binaryStr.charCodeAt(i);
+ }
+ console.log('[bp:input] Decoding audio (' + bytes.length + ' bytes)...');
+
+ return audioCtx.decodeAudioData(bytes.buffer.slice(0));
+ }).then(function(audioBuffer) {
+ sourceNode = audioCtx.createBufferSource();
+ sourceNode.buffer = audioBuffer;
+ sourceNode.connect(destinationNode);
+
+ var durationMs = Math.round(audioBuffer.duration * 1000);
+ console.log('[bp:input] Playing ' + durationMs + 'ms audio (' + audioBuffer.sampleRate + 'Hz, ' + audioBuffer.numberOfChannels + 'ch)');
+
+ return new Promise(function(resolve) {
+ sourceNode.onended = function() {
+ isPlaying = false;
+ console.log('[bp:input] Playback ended');
+ resolve(true);
+ try {
+ if (typeof window.__bpAudioInputDone === 'function') {
+ window.__bpAudioInputDone('done');
+ }
+ } catch(e) {}
+ };
+ isPlaying = true;
+ sourceNode.start();
+ });
+ });
+ }
+
+ function stopAudio() {
+ if (sourceNode) {
+ try { sourceNode.stop(); } catch(e) {}
+ sourceNode.disconnect();
+ sourceNode = null;
+ }
+ isPlaying = false;
+ console.log('[bp:input] Stopped');
+ }
+
+ var origGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
+
+ navigator.mediaDevices.getUserMedia = function(constraints) {
+ if (constraints && constraints.audio) {
+ var stream = ensureFakeStream();
+ console.log('[bp:input] getUserMedia intercepted \u2014 returning fake mic' + (constraints.video ? ' + real video' : ''));
+
+ if (constraints.video) {
+ // Get real video + our fake audio
+ return origGetUserMedia({ video: constraints.video }).then(function(realStream) {
+ var combined = new MediaStream(
+ stream.getAudioTracks().concat(realStream.getVideoTracks())
+ );
+ return combined;
+ });
+ }
+
+ // Return a clone so consumers can't stop our source track
+ return Promise.resolve(stream.clone());
+ }
+ return origGetUserMedia(constraints);
+ };
+
+ var origEnumerate = navigator.mediaDevices.enumerateDevices.bind(navigator.mediaDevices);
+ navigator.mediaDevices.enumerateDevices = function() {
+ return origEnumerate().then(function(devices) {
+ var hasMic = devices.some(function(d) { return d.kind === 'audioinput'; });
+ if (!hasMic) {
+ devices.push({
+ deviceId: 'bp-fake-mic',
+ kind: 'audioinput',
+ label: 'Default Audio Input',
+ groupId: 'bp-audio',
+ toJSON: function() {
+ return { deviceId: this.deviceId, kind: this.kind, label: this.label, groupId: this.groupId };
+ }
+ });
+ }
+ return devices;
+ });
+ };
+
+ window.__bpAudioInput = {
+ play: playAudio,
+ stop: stopAudio,
+ isPlaying: function() { return isPlaying; },
+ getState: function() {
+ return {
+ contextState: audioCtx ? audioCtx.state : 'not-created',
+ isPlaying: isPlaying,
+ sampleRate: audioCtx ? audioCtx.sampleRate : 0
+ };
+ },
+ getContext: function() { return audioCtx; }
+ };
+
+ console.log('[bp:input] Audio input override installed (getUserMedia + enumerateDevices)');
+ })();
+ `;
+ var AudioInput = class {
+ cdp;
+ injected = false;
+ bindingRegistered = false;
+ bindingHandler = null;
+ constructor(cdp) {
+ this.cdp = cdp;
+ }
+ /** Whether the audio input system has been set up */
+ get isSetup() {
+ return this.injected;
+ }
+ /**
+ * Set up audio input injection.
+ * Must be called before navigating to the page that will use getUserMedia.
+ * Grants permissions and injects the getUserMedia override.
+ */
+ async setup() {
+ if (this.injected) return;
+ try {
+ const resp = await this.cdp.send("Runtime.evaluate", {
+ expression: "location.href",
+ returnByValue: true
+ });
+ const href = resp.result?.value;
+ if (typeof href === "string" && (href === "about:blank" || href === "about:srcdoc")) {
+ throw new Error(
+ 'Cannot set up audio on about:blank. Navigate to a page first.\nExample: await page.goto("https://your-voice-app.com")'
+ );
+ }
+ } catch (e) {
+ if (e instanceof Error && e.message.includes("Cannot set up audio")) throw e;
+ }
+ let origin;
+ try {
+ const resp = await this.cdp.send("Runtime.evaluate", {
+ expression: "location.origin",
+ returnByValue: true
+ });
+ const val = resp.result?.value;
+ if (typeof val === "string" && val !== "null") {
+ origin = val;
+ }
+ } catch {
+ }
+ await grantAudioPermissions(this.cdp, origin);
+ if (!this.bindingRegistered) {
+ await this.cdp.send("Runtime.addBinding", { name: INPUT_BINDING });
+ this.bindingRegistered = true;
+ }
+ await this.cdp.send("Page.addScriptToEvaluateOnNewDocument", {
+ source: AUDIO_INPUT_SCRIPT
+ });
+ await this.cdp.send("Runtime.evaluate", {
+ expression: AUDIO_INPUT_SCRIPT,
+ awaitPromise: false,
+ userGesture: true
+ });
+ this.injected = true;
+ }
+ /**
+ * Play audio bytes into the page's fake microphone.
+ * Accepts any format that Web Audio API can decode (WAV, MP3, OGG, etc.).
+ *
+ * @param audioData - Raw audio file bytes
+ * @param options - Playback options
+ */
+ async play(audioData, options) {
+ if (!this.injected) {
+ await this.setup();
+ }
+ await this.cdp.send("Runtime.evaluate", {
+ expression: `(function() {
+ var resumed = [];
+ (window.__bpTrackedAudioContexts || []).forEach(function(ctx) {
+ if (ctx.state === 'suspended') {
+ ctx.resume().then(function() {
+ console.log('[bp:input] Resumed suspended AudioContext (' + ctx.sampleRate + 'Hz)');
+ });
+ resumed.push(ctx.sampleRate);
+ }
+ });
+ // Also resume the input context itself
+ if (window.__bpAudioInput && window.__bpAudioInput.getContext) {
+ var inputCtx = window.__bpAudioInput.getContext();
+ if (inputCtx && inputCtx.state === 'suspended') {
+ inputCtx.resume().then(function() {
+ console.log('[bp:input] Resumed input AudioContext (' + inputCtx.sampleRate + 'Hz)');
+ });
+ resumed.push('input-' + inputCtx.sampleRate);
+ }
+ }
+ return resumed.length > 0 ? 'resumed: ' + resumed.join(',') : 'all running';
+ })()`,
+ awaitPromise: false,
+ userGesture: true
+ });
+ const base64 = bufferToBase64(audioData);
+ const waitForEnd = options?.waitForEnd ?? true;
+ const timeout = options?.timeout ?? 6e4;
+ if (waitForEnd) {
+ const donePromise = this.waitForBinding(timeout);
+ await this.cdp.send("Runtime.evaluate", {
+ expression: `window.__bpAudioInput.play('${base64}')`,
+ awaitPromise: false
+ });
+ await donePromise;
+ } else {
+ await this.cdp.send("Runtime.evaluate", {
+ expression: `window.__bpAudioInput.play('${base64}')`,
+ awaitPromise: false
+ });
+ }
+ }
+ /**
+ * Stop any currently playing audio.
+ */
+ async stop() {
+ if (!this.injected) return;
+ await this.cdp.send("Runtime.evaluate", {
+ expression: "window.__bpAudioInput && window.__bpAudioInput.stop()",
+ awaitPromise: false
+ });
+ }
+ /**
+ * Get current state of the injected audio input system.
+ */
+ async getState() {
+ if (!this.injected) {
+ return { contextState: "not-created", isPlaying: false, sampleRate: 0 };
+ }
+ const result = await this.cdp.send("Runtime.evaluate", {
+ expression: "window.__bpAudioInput ? window.__bpAudioInput.getState() : null",
+ returnByValue: true
+ });
+ return result.result.value ?? { contextState: "not-created", isPlaying: false, sampleRate: 0 };
+ }
+ /**
+ * Clean up: remove binding handler.
+ */
+ async teardown() {
+ if (this.bindingHandler) {
+ this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
+ this.bindingHandler = null;
+ }
+ await this.stop();
+ this.injected = false;
+ this.bindingRegistered = false;
+ }
+ /**
+ * Wait for the playback-complete binding to fire.
+ */
+ waitForBinding(timeout) {
+ return new Promise((resolve, reject) => {
+ const timer = setTimeout(() => {
+ if (this.bindingHandler) {
+ this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
+ this.bindingHandler = null;
+ }
+ reject(new Error(`AudioInput: playback timed out after ${timeout}ms`));
+ }, timeout);
+ if (this.bindingHandler) {
+ this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
+ }
+ this.bindingHandler = (params) => {
+ if (params["name"] === INPUT_BINDING) {
+ clearTimeout(timer);
+ if (this.bindingHandler) {
+ this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
+ this.bindingHandler = null;
+ }
+ resolve();
+ }
+ };
+ this.cdp.on("Runtime.bindingCalled", this.bindingHandler);
+ });
+ }
+ };
+
+ // src/audio/output.ts
+ var OUTPUT_BINDING = "__bpAudioOutputData";
+ var AUDIO_OUTPUT_SCRIPT = `
+ (function() {
+ // If already installed, stop any active capture but allow re-initialization
+ // so that updated scripts (e.g. with new capture strategies) take effect.
+ if (window.__bpAudioOutput) {
+ if (window.__bpAudioOutput.isCapturing()) window.__bpAudioOutput.stop();
+ // Keep existing allAudioContexts if available (preserves pre-override tracking)
+ }
+
+ var BUFFER_SIZE = 4096;
+ var FLUSH_SAMPLES = 48000; // flush every ~1s at 48kHz (scales with sample rate)
+ var capturing = false;
+ var capturedChunks = [];
+ var totalSamples = 0;
+ var flushCount = 0;
+ var pendingTracks = [];
+ var tappedTrackIds = {};
+
+ // --- Per-context tap infrastructure ---
+ // Preserve any AudioContexts tracked by a previous script version
+ var allAudioContexts = window.__bpTrackedAudioContexts || [];
+ // Use a WeakMap to associate taps with AudioContext instances
+ // (native objects like AudioContext may not support custom properties)
+ var contextTapMap = typeof WeakMap !== 'undefined' ? new WeakMap() : null;
+ var contextTapList = []; // fallback: [{ctx, proc}]
+
+ var OrigAudioContext = window.__bpOrigAudioContext || window.AudioContext || window.webkitAudioContext;
+ // Save the native connect function once; on re-injection, reuse it to avoid double-wrapping
+ var origConnect = window.__bpOrigConnect || AudioNode.prototype.connect;
+ window.__bpOrigConnect = origConnect;
+
+ // Our own capture context (48kHz) for WebRTC tracks and media elements
+ var captureCtx = null;
+ var captureProcessor = null;
+
+ // Save original AudioContext constructor once
+ if (!window.__bpOrigAudioContext) {
+ window.__bpOrigAudioContext = OrigAudioContext;
+ }
+
+ // Override AudioContext constructor to track all instances (skip if already overridden)
+ if (OrigAudioContext && !window.__bpAudioContextOverridden) {
+ window.__bpAudioContextOverridden = true;
+ window.AudioContext = function() {
+ var ctx = new (Function.prototype.bind.apply(OrigAudioContext, [null].concat(Array.prototype.slice.call(arguments))))();
+ allAudioContexts.push(ctx);
+ // Auto-resume suspended contexts \u2014 CDP automation has no user gesture,
+ // so Chrome suspends new AudioContexts by default. Without this, voice
+ // agents' ScriptProcessorNodes never fire and no audio flows.
+ if (ctx.state === 'suspended') {
+ console.log('[bp:output] AudioContext created suspended (' + ctx.sampleRate + 'Hz), auto-resuming...');
+ ctx.resume().then(function() {
+ console.log('[bp:output] AudioContext resumed successfully (' + ctx.sampleRate + 'Hz, state: ' + ctx.state + ')');
+ }).catch(function(e) {
+ console.warn('[bp:output] AudioContext resume failed (' + ctx.sampleRate + 'Hz):', e);
+ });
+ } else {
+ console.log('[bp:output] AudioContext created (' + ctx.sampleRate + 'Hz, state: ' + ctx.state + ')');
+ }
+ return ctx;
+ };
+ window.AudioContext.prototype = OrigAudioContext.prototype;
+ Object.keys(OrigAudioContext).forEach(function(k) {
+ try { window.AudioContext[k] = OrigAudioContext[k]; } catch(e) {}
+ });
+ if (window.webkitAudioContext) {
+ window.webkitAudioContext = window.AudioContext;
+ }
+ }
+
+ // Expose tracked contexts on window so re-injections preserve them
+ window.__bpTrackedAudioContexts = allAudioContexts;
+
+ // Look up an existing tap for a given AudioContext
+ function findTap(ctx) {
+ if (contextTapMap) return contextTapMap.get(ctx) || null;
+ for (var i = 0; i < contextTapList.length; i++) {
+ if (contextTapList[i].ctx === ctx) return contextTapList[i].proc;
+ }
+ return null;
+ }
+
+ // Store a tap for a given AudioContext
+ function storeTap(ctx, proc) {
+ if (contextTapMap) { contextTapMap.set(ctx, proc); }
+ else { contextTapList.push({ ctx: ctx, proc: proc }); }
+ }
+
+ // Count stored taps
+ function tapCount() {
+ if (contextTapMap) {
+ var count = 0;
+ for (var i = 0; i < allAudioContexts.length; i++) {
+ if (contextTapMap.has(allAudioContexts[i])) count++;
+ }
+ return count;
+ }
+ return contextTapList.length;
+ }
+
+ // Create or retrieve a ScriptProcessorNode tap for a specific AudioContext.
+ // The tap lives in the SAME context as the source, avoiding cross-context errors.
+ function getOrCreateTap(ctx) {
+ var existing = findTap(ctx);
+ if (existing) return existing;
+
+ try {
+ if (ctx.state === 'closed') return null;
+ var channels = Math.min(ctx.destination.channelCount || 2, 2);
+ if (channels < 1) channels = 1;
+ var proc = ctx.createScriptProcessor(BUFFER_SIZE, channels, channels);
+ proc.onaudioprocess = function(e) {
+ if (!capturing) return;
+ var left = new Float32Array(e.inputBuffer.getChannelData(0));
+ var right = e.inputBuffer.numberOfChannels > 1
+ ? new Float32Array(e.inputBuffer.getChannelData(1))
+ : new Float32Array(left.length);
+ capturedChunks.push({ left: left, right: right, sampleRate: ctx.sampleRate });
+ totalSamples += left.length;
+ if (totalSamples >= FLUSH_SAMPLES) {
+ flushToNodeJs();
+ }
+ };
+ // Must connect to destination to keep ScriptProcessorNode alive
+ origConnect.call(proc, ctx.destination);
+ storeTap(ctx, proc);
+ return proc;
+ } catch(e) {
+ return null;
+ }
+ }
+
+ // Override AudioNode.prototype.connect to tap connections to any AudioDestinationNode
+ AudioNode.prototype.connect = function(destination) {
+ var result = origConnect.apply(this, arguments);
+
+ if (capturing && destination instanceof AudioDestinationNode) {
+ try {
+ var tap = getOrCreateTap(destination.context);
+ // Don't connect the tap to itself
+ if (tap && tap !== this) {
+ origConnect.call(this, tap);
+ }
+ } catch(e) {}
+ }
+ return result;
+ };
+
+ var origPlay = window.__bpOrigPlay || HTMLMediaElement.prototype.play;
+ window.__bpOrigPlay = origPlay;
+ HTMLMediaElement.prototype.play = function() {
+ if (capturing && !this.__bpCaptured) {
+ this.__bpCaptured = true;
+ try {
+ if (!captureCtx) initCaptureCtx();
+ var stream = this.captureStream ? this.captureStream() : null;
+ if (stream && captureCtx) {
+ var source = captureCtx.createMediaStreamSource(stream);
+ origConnect.call(source, captureProcessor);
+ }
+ } catch(e) {}
+ }
+ return origPlay.apply(this, arguments);
+ };
+
+ // Intercept srcObject assignment to catch WebRTC streams attached to media elements
+ var origSrcObjectDesc = Object.getOwnPropertyDescriptor(HTMLMediaElement.prototype, 'srcObject');
+ if (origSrcObjectDesc && origSrcObjectDesc.set) {
+ Object.defineProperty(HTMLMediaElement.prototype, 'srcObject', {
+ set: function(stream) {
+ origSrcObjectDesc.set.call(this, stream);
+ if (stream && stream.getAudioTracks) {
+ var tracks = stream.getAudioTracks();
+ for (var i = 0; i < tracks.length; i++) {
+ if (capturing) {
+ tapAudioTrack(tracks[i]);
+ } else {
+ pendingTracks.push(tracks[i]);
+ }
+ }
+ }
+ },
+ get: origSrcObjectDesc.get,
+ configurable: true
+ });
+ }
+
+ // Initialize our own 48kHz capture context for WebRTC and media element tapping
+ function initCaptureCtx() {
+ captureCtx = new OrigAudioContext({ sampleRate: 48000 });
+ captureProcessor = captureCtx.createScriptProcessor(BUFFER_SIZE, 2, 2);
+ captureProcessor.onaudioprocess = function(e) {
+ if (!capturing) return;
+ var left = new Float32Array(e.inputBuffer.getChannelData(0));
+ var right = new Float32Array(e.inputBuffer.getChannelData(1));
+ capturedChunks.push({ left: left, right: right, sampleRate: 48000 });
+ totalSamples += left.length;
+ if (totalSamples >= FLUSH_SAMPLES) {
+ flushToNodeJs();
+ }
+ };
+ origConnect.call(captureProcessor, captureCtx.destination);
+ }
+
+ function uint8ToBase64(bytes) {
+ var CHUNK = 8192;
+ var parts = [];
+ for (var i = 0; i < bytes.length; i += CHUNK) {
+ var slice = bytes.subarray(i, Math.min(i + CHUNK, bytes.length));
+ var binary = '';
+ for (var j = 0; j < slice.length; j++) {
+ binary += String.fromCharCode(slice[j]);
+ }
+ parts.push(binary);
+ }
+ return btoa(parts.join(''));
+ }
+
+ function flushGroup(chunks, rate) {
+ var totalLen = 0;
+ for (var i = 0; i < chunks.length; i++) {
+ totalLen += chunks[i].left.length;
+ }
+ if (totalLen === 0) return;
+
+ var left = new Float32Array(totalLen);
+ var right = new Float32Array(totalLen);
+ var offset = 0;
+ for (var i = 0; i < chunks.length; i++) {
+ left.set(chunks[i].left, offset);
+ right.set(chunks[i].right, offset);
+ offset += chunks[i].left.length;
+ }
+
+ var leftB64 = uint8ToBase64(new Uint8Array(left.buffer));
+ var rightB64 = uint8ToBase64(new Uint8Array(right.buffer));
+
+ flushCount++;
+
+ try {
+ if (typeof window.__bpAudioOutputData === 'function') {
+ window.__bpAudioOutputData(JSON.stringify({
+ left: leftB64,
+ right: rightB64,
+ sampleRate: rate,
+ samples: totalLen
+ }));
+ }
+ } catch(e) {}
+ }
+
+ function flushToNodeJs() {
+ if (capturedChunks.length === 0) return;
+
+ // Group chunks by sample rate to avoid mixing different-rate audio
+ var byRate = {};
+ for (var i = 0; i < capturedChunks.length; i++) {
+ var rate = capturedChunks[i].sampleRate || 48000;
+ if (!byRate[rate]) byRate[rate] = [];
+ byRate[rate].push(capturedChunks[i]);
+ }
+
+ // Flush each sample rate group separately
+ for (var rateKey in byRate) {
+ if (byRate.hasOwnProperty(rateKey)) {
+ flushGroup(byRate[rateKey], Number(rateKey));
+ }
+ }
+
+ capturedChunks = [];
+ totalSamples = 0;
+ }
+
+ // --- WebRTC interception (for apps that use RTCPeerConnection) ---
+ var rtcTrackedStreams = [];
+ var rtcPeerConnections = [];
+
+ function tapAudioTrack(track) {
+ try {
+ if (tappedTrackIds[track.id]) return;
+ tappedTrackIds[track.id] = true;
+ if (!captureCtx) initCaptureCtx();
+ var stream = new MediaStream([track]);
+ var source = captureCtx.createMediaStreamSource(stream);
+ origConnect.call(source, captureProcessor);
+ rtcTrackedStreams.push(source);
+ } catch(e) {}
+ }
+
+ function tapExistingPeerConnection(pc) {
+ try {
+ var receivers = pc.getReceivers ? pc.getReceivers() : [];
+ for (var i = 0; i < receivers.length; i++) {
+ if (receivers[i].track && receivers[i].track.kind === 'audio') {
+ tapAudioTrack(receivers[i].track);
+ }
+ }
+ } catch(e) {}
+ }
+
+ if (typeof RTCPeerConnection !== 'undefined') {
+ var OrigRTC = RTCPeerConnection;
+
+ window.RTCPeerConnection = function() {
+ var pc = new (Function.prototype.bind.apply(OrigRTC, [null].concat(Array.prototype.slice.call(arguments))))();
+ rtcPeerConnections.push(pc);
+
+ pc.addEventListener('track', function(event) {
+ if (event.track && event.track.kind === 'audio') {
+ if (capturing) {
+ tapAudioTrack(event.track);
+ } else {
+ pendingTracks.push(event.track);
+ }
+ }
+ });
+
+ return pc;
+ };
+ window.RTCPeerConnection.prototype = OrigRTC.prototype;
+ Object.keys(OrigRTC).forEach(function(k) {
+ try { window.RTCPeerConnection[k] = OrigRTC[k]; } catch(e) {}
+ });
+
+ window.__bpTrackedPCs = rtcPeerConnections;
+ }
+
+ window.__bpAudioOutput = {
+ start: function() {
+ capturing = true;
+ capturedChunks = [];
+ totalSamples = 0;
+ flushCount = 0;
+ tappedTrackIds = {};
+
+ // Resume any suspended capture context
+ if (captureCtx && captureCtx.state === 'suspended') captureCtx.resume();
+
+ // Create taps for all tracked AudioContexts (catches contexts created before capture)
+ for (var i = 0; i < allAudioContexts.length; i++) {
+ var ctx = allAudioContexts[i];
+ if (ctx.state !== 'closed') {
+ getOrCreateTap(ctx);
+ }
+ }
+
+ // Drain pending WebRTC tracks
+ for (var j = 0; j < pendingTracks.length; j++) {
+ tapAudioTrack(pendingTracks[j]);
+ }
+ pendingTracks = [];
+
+ // Tap existing peer connections
+ for (var k = 0; k < rtcPeerConnections.length; k++) {
+ tapExistingPeerConnection(rtcPeerConnections[k]);
+ }
+
+ // Scan existing media elements for srcObject with audio tracks
+ var mediaEls = document.querySelectorAll('audio, video');
+ for (var i = 0; i < mediaEls.length; i++) {
+ var el = mediaEls[i];
+ if (el.srcObject && el.srcObject.getAudioTracks && !el.__bpCaptured) {
+ el.__bpCaptured = true;
+ var tracks = el.srcObject.getAudioTracks();
+ for (var j = 0; j < tracks.length; j++) {
+ tapAudioTrack(tracks[j]);
+ }
+ }
+ }
+
+ // Watch for dynamically added media elements with srcObject
+ if (typeof MutationObserver !== 'undefined') {
+ if (window.__bpMediaObserver) {
+ window.__bpMediaObserver.disconnect();
+ }
+ window.__bpMediaObserver = new MutationObserver(function(mutations) {
+ for (var i = 0; i < mutations.length; i++) {
+ var added = mutations[i].addedNodes;
+ for (var j = 0; j < added.length; j++) {
+ var node = added[j];
+ if (node.nodeType !== 1) continue;
+ var els = [];
+ if (node.tagName === 'AUDIO' || node.tagName === 'VIDEO') els.push(node);
+ else if (node.querySelectorAll) {
+ var nested = node.querySelectorAll('audio, video');
+ for (var k = 0; k < nested.length; k++) els.push(nested[k]);
+ }
+ for (var m = 0; m < els.length; m++) {
+ var el = els[m];
+ if (el.srcObject && el.srcObject.getAudioTracks && !el.__bpCaptured) {
+ el.__bpCaptured = true;
+ var tracks = el.srcObject.getAudioTracks();
+ for (var t = 0; t < tracks.length; t++) tapAudioTrack(tracks[t]);
+ }
+ }
+ }
+ }
+ });
+ window.__bpMediaObserver.observe(document, { childList: true, subtree: true });
+ }
+ },
+ stop: function() {
+ capturing = false;
+ flushToNodeJs();
+ // Disconnect MutationObserver
+ if (window.__bpMediaObserver) {
+ window.__bpMediaObserver.disconnect();
+ window.__bpMediaObserver = null;
+ }
+ },
+ isCapturing: function() { return capturing; },
+ getBufferedSamples: function() { return totalSamples; },
+ tapPC: function(pc) {
+ if (!pc || typeof pc.getReceivers !== 'function') return false;
+ if (rtcPeerConnections.indexOf(pc) === -1) {
+ rtcPeerConnections.push(pc);
+ }
+ if (capturing) {
+ tapExistingPeerConnection(pc);
+ }
+ pc.addEventListener('track', function(event) {
+ if (event.track && event.track.kind === 'audio') {
+ if (capturing) {
+ tapAudioTrack(event.track);
+ } else {
+ pendingTracks.push(event.track);
+ }
+ }
+ });
+ return true;
+ },
+ getStats: function() {
+ return {
+ audioContexts: allAudioContexts.filter(function(c) { return c.state !== 'closed'; }).length,
+ contextTaps: tapCount(),
+ audioNodes: captureCtx ? captureCtx.destination.numberOfInputs : 0,
+ rtcConnections: rtcPeerConnections.length,
+ mediaElements: document.querySelectorAll('audio, video').length,
+ pendingTracks: pendingTracks.length,
+ tappedTracks: Object.keys(tappedTrackIds).length,
+ capturing: capturing,
+ bufferedSamples: totalSamples,
+ rtcDetails: rtcPeerConnections.map(function(pc) {
+ try {
+ var receivers = pc.getReceivers ? pc.getReceivers() : [];
+ var senders = pc.getSenders ? pc.getSenders() : [];
+ var audioReceivers = receivers.filter(function(r) { return r.track && r.track.kind === 'audio'; }).length;
+ var audioSenders = senders.filter(function(s) { return s.track && s.track.kind === 'audio'; }).length;
+ return {
+ state: pc.connectionState || pc.iceConnectionState || 'unknown',
+ audioReceivers: audioReceivers,
+ audioSenders: audioSenders,
+ tapped: receivers.some(function(r) { return r.track && tappedTrackIds[r.track.id]; })
+ };
+ } catch(e) { return { state: 'error', audioReceivers: 0, audioSenders: 0, tapped: false }; }
+ }),
+ mediaElementDetails: (function() {
+ try {
+ var els = document.querySelectorAll('audio, video');
+ var details = [];
+ for (var i = 0; i < els.length; i++) {
+ var el = els[i];
+ var hasSrcObject = !!(el.srcObject);
+ var audioTracks = 0;
+ if (el.srcObject && el.srcObject.getAudioTracks) {
+ audioTracks = el.srcObject.getAudioTracks().length;
+ }
+ details.push({
+ tag: el.tagName.toLowerCase(),
+ hasSrcObject: hasSrcObject,
+ hasSrc: !!(el.src || el.currentSrc),
+ audioTracks: audioTracks,
+ tapped: !!(el.__bpCaptured)
+ });
+ }
+ return details;
+ } catch(e) { return []; }
+ })()
+ };
+ }
+ };
+ })();
+ `;
+ var AudioOutput = class {
+ cdp;
+ chunks = [];
+ injected = false;
+ capturing = false;
+ bindingHandler = null;
+ onChunkHandler;
+ onDiagHandler;
+ /** Timestamp of the first non-silent chunk received */
+ firstChunkTime = null;
+ constructor(cdp) {
+ this.cdp = cdp;
+ }
+ /** Whether the audio output system has been set up */
+ get isSetup() {
+ return this.injected;
+ }
+ /** Whether audio is currently being captured */
+ get isCapturing() {
+ return this.capturing;
+ }
+ /**
+ * Set up audio output capture.
+ * Registers bindings and injects the capture script.
+ */
+ async setup() {
+ if (this.injected) return;
+ await this.cdp.send("Runtime.addBinding", { name: OUTPUT_BINDING });
+ this.bindingHandler = (params) => {
+ if (params["name"] === OUTPUT_BINDING) {
+ this.handleAudioData(params["payload"]);
+ }
+ };
+ this.cdp.on("Runtime.bindingCalled", this.bindingHandler);
+ await this.cdp.send("Page.addScriptToEvaluateOnNewDocument", {
+ source: AUDIO_OUTPUT_SCRIPT
+ });
+ await this.cdp.send("Runtime.evaluate", {
+ expression: AUDIO_OUTPUT_SCRIPT,
+ awaitPromise: false,
+ userGesture: true
+ });
+ this.injected = true;
+ }
+ /**
+ * Start capturing audio output.
+ */
+ async start() {
+ if (!this.injected) {
+ await this.setup();
+ }
+ this.chunks = [];
+ this.firstChunkTime = null;
+ this.capturing = true;
+ await this.cdp.send("Runtime.evaluate", {
+ expression: `(function() {
+ var resumed = [];
+ (window.__bpTrackedAudioContexts || []).forEach(function(ctx) {
+ if (ctx.state === 'suspended') {
+ ctx.resume().then(function() {
+ console.log('[bp:output] Resumed AudioContext (' + ctx.sampleRate + 'Hz) before capture');
+ });
+ resumed.push(ctx.sampleRate);
+ }
+ });
+ if (window.__bpAudioInput && window.__bpAudioInput.getContext) {
+ var inputCtx = window.__bpAudioInput.getContext();
+ if (inputCtx && inputCtx.state === 'suspended') {
+ inputCtx.resume();
+ resumed.push('input-' + inputCtx.sampleRate);
+ }
+ }
+ if (resumed.length) console.log('[bp:output] Resumed ' + resumed.length + ' contexts: ' + resumed.join(', '));
+ })()`,
+ awaitPromise: false,
+ userGesture: true
+ });
+ await this.cdp.send("Runtime.evaluate", {
+ expression: "window.__bpAudioOutput && window.__bpAudioOutput.start()",
+ awaitPromise: false
+ });
+ await this.discoverExistingPeerConnections();
+ if (this.onDiagHandler) {
+ try {
+ const statsResult = await this.cdp.send(
+ "Runtime.evaluate",
+ {
+ expression: "window.__bpAudioOutput && window.__bpAudioOutput.getStats()",
+ returnByValue: true
+ }
+ );
+ const stats = statsResult.result.value;
+ if (stats) {
+ this.onDiagHandler(
+ `started \u2014 ${stats["audioContexts"]} AudioContexts, ${stats["contextTaps"]} taps, ${stats["rtcConnections"]} RTCPeerConnections, ${stats["mediaElements"]} MediaElements, ${stats["tappedTracks"]} tapped tracks`
+ );
+ }
+ } catch {
+ }
+ }
+ }
+ /**
+ * Stop capturing and return all collected audio.
+ */
+ async stop() {
+ if (!this.injected) {
+ return emptyCaptureResult();
+ }
+ await this.cdp.send("Runtime.evaluate", {
+ expression: "window.__bpAudioOutput && window.__bpAudioOutput.stop()",
+ awaitPromise: false
+ });
+ this.capturing = false;
+ await sleep(250);
+ return this.mergeChunks();
+ }
+ /**
+ * Capture audio until silence is detected.
+ *
+ * Two-phase approach:
+ * 1. **Wait phase**: Wait up to `maxDuration` for the first non-silent chunk.
+ * The silence countdown does NOT tick during this phase, so slow voice agents
+ * (STT → LLM → TTS can take 5-15s) don't cause premature timeout.
+ * 2. **Capture phase**: Once audio is detected, capture until `silenceTimeout` ms
+ * of consecutive silence pass, then stop.
+ */
+ async captureUntilSilence(options) {
+ const silenceTimeout = options?.silenceTimeout ?? 1500;
+ const silenceThreshold = options?.silenceThreshold ?? 0.01;
+ const maxDuration = options?.maxDuration ?? 3e5;
+ const noAudioTimeout = options?.noAudioTimeout ?? 15e3;
+ if (!this.capturing) {
+ await this.start();
+ }
+ return new Promise((resolve) => {
+ let heardAudio = false;
+ let lastSoundTime = 0;
+ const startTime = Date.now();
+ const checkInterval = setInterval(async () => {
+ const elapsed = Date.now() - startTime;
+ if (elapsed > maxDuration) {
+ clearInterval(checkInterval);
+ this.onDiagHandler?.(`max duration reached (${maxDuration}ms), stopping`);
+ resolve(await this.stop());
+ return;
+ }
+ const latest = this.chunks[this.chunks.length - 1];
+ if (latest) {
+ const rms = calculateRMS(latest.left);
+ if (rms > silenceThreshold) {
+ if (!heardAudio) {
+ heardAudio = true;
+ this.onDiagHandler?.("first audio detected \u2014 silence countdown begins");
+ }
+ lastSoundTime = Date.now();
+ }
+ }
+ if (!heardAudio && elapsed > noAudioTimeout) {
+ clearInterval(checkInterval);
+ this.onDiagHandler?.(`no audio detected after ${noAudioTimeout}ms, stopping early`);
+ resolve(await this.stop());
+ return;
+ }
+ if (heardAudio && Date.now() - lastSoundTime > silenceTimeout) {
+ clearInterval(checkInterval);
+ resolve(await this.stop());
+ }
+ }, 200);
+ });
+ }
+ /**
+ * Subscribe to real-time audio chunks as they arrive.
+ */
+ onData(handler) {
+ this.onChunkHandler = handler;
+ }
+ /**
+ * Subscribe to diagnostic messages (for --verbose).
+ */
+ onDiag(handler) {
+ this.onDiagHandler = handler;
+ }
+ /**
+ * Clean up: remove binding handler.
+ */
+ async teardown() {
+ if (this.capturing) {
+ await this.stop();
+ }
+ if (this.bindingHandler) {
+ this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
+ this.bindingHandler = null;
+ }
+ this.onChunkHandler = void 0;
+ this.onDiagHandler = void 0;
+ this.injected = false;
+ }
+ /**
+ * Use CDP Runtime.queryObjects to find RTCPeerConnection instances
+ * that were created before our override was injected, and tap their audio tracks.
+ */
+ async discoverExistingPeerConnections() {
+ try {
+ const protoResult = await this.cdp.send("Runtime.evaluate", {
+ expression: 'typeof RTCPeerConnection !== "undefined" ? RTCPeerConnection.prototype : null',
+ returnByValue: false
+ });
+ const protoId = protoResult.result.objectId;
+ if (!protoId) return;
+ const queryResult = await this.cdp.send("Runtime.queryObjects", {
+ prototypeObjectId: protoId
+ });
+ const arrayId = queryResult.objects.objectId;
+ if (!arrayId) return;
+ const propsResult = await this.cdp.send("Runtime.getProperties", {
+ objectId: arrayId,
+ ownProperties: true
+ });
+ let tapped = 0;
+ for (const prop of propsResult.result) {
+ if (prop.name === "length" || prop.name === "__proto__") continue;
+ const pcObjectId = prop.value?.objectId;
+ if (!pcObjectId) continue;
+ await this.cdp.send("Runtime.callFunctionOn", {
+ objectId: pcObjectId,
+ functionDeclaration: "function() { if (window.__bpAudioOutput && window.__bpAudioOutput.tapPC) { return window.__bpAudioOutput.tapPC(this); } return false; }",
+ returnByValue: true
+ });
+ tapped++;
+ }
+ if (tapped > 0) {
+ this.onDiagHandler?.(`retroactively discovered ${tapped} existing RTCPeerConnection(s)`);
+ }
+ await this.cdp.send("Runtime.releaseObject", { objectId: arrayId });
+ await this.cdp.send("Runtime.releaseObject", { objectId: protoId });
+ } catch {
+ }
+ }
+ handleAudioData(payload) {
+ try {
+ const data = JSON.parse(payload);
+ const leftBytes = base64ToBuffer(data.left);
+ const rightBytes = base64ToBuffer(data.right);
+ const chunk = {
+ left: new Float32Array(leftBytes.buffer),
+ right: new Float32Array(rightBytes.buffer),
+ sampleRate: data.sampleRate,
+ samples: data.samples,
+ timestamp: Date.now()
+ };
+ this.chunks.push(chunk);
+ if (this.onDiagHandler) {
+ const rms = calculateRMS(chunk.left);
+ const label = rms > 0.01 ? "audio" : "silence";
+ this.onDiagHandler(`chunk: ${chunk.samples} samples, RMS=${rms.toFixed(4)} (${label})`);
+ }
+ if (this.firstChunkTime === null) {
+ const rms = calculateRMS(chunk.left);
+ if (rms > 1e-3) {
+ this.firstChunkTime = Date.now();
+ }
+ }
+ this.onChunkHandler?.(chunk);
+ } catch {
+ }
+ }
+ mergeChunks() {
+ if (this.chunks.length === 0) {
+ return emptyCaptureResult();
+ }
+ const byRate = /* @__PURE__ */ new Map();
+ for (const chunk of this.chunks) {
+ const rate = chunk.sampleRate;
+ if (!byRate.has(rate)) byRate.set(rate, []);
+ byRate.get(rate).push(chunk);
+ }
+ let bestRate = this.chunks[0].sampleRate;
+ let bestNonSilentSamples = 0;
+ for (const [rate, chunks] of byRate) {
+ let nonSilentSamples = 0;
+ for (const chunk of chunks) {
+ const rms = calculateRMS(chunk.left);
+ if (rms > 0.01) {
+ nonSilentSamples += chunk.left.length;
+ }
+ }
+ if (nonSilentSamples > bestNonSilentSamples) {
+ bestNonSilentSamples = nonSilentSamples;
+ bestRate = rate;
+ }
+ }
+ const bestChunks = byRate.get(bestRate);
+ let totalLen = 0;
+ for (const chunk of bestChunks) {
+ totalLen += chunk.left.length;
+ }
+ const left = new Float32Array(totalLen);
+ const right = new Float32Array(totalLen);
+ let offset = 0;
+ for (const chunk of bestChunks) {
+ left.set(chunk.left, offset);
+ right.set(chunk.right, offset);
+ offset += chunk.left.length;
+ }
+ if (byRate.size > 1) {
+ this.onDiagHandler?.(
+ `mergeChunks: ${byRate.size} sample rates detected, using ${bestRate}Hz (${bestNonSilentSamples} non-silent samples)`
+ );
+ }
+ return {
+ left,
+ right,
+ sampleRate: bestRate,
+ durationMs: totalLen / bestRate * 1e3,
+ chunkCount: bestChunks.length
+ };
+ }
+ };
+ function emptyCaptureResult() {
+ return {
+ left: new Float32Array(0),
+ right: new Float32Array(0),
+ sampleRate: 48e3,
+ durationMs: 0,
+ chunkCount: 0
+ };
+ }
+ function sleep(ms) {
+ return new Promise((resolve) => setTimeout(resolve, ms));
+ }
 
  // src/network/interceptor.ts
  var RequestInterceptor = class {
@@ -260,7 +1565,7 @@ async function isElementAttached(cdp, selector, contextId) {
  const result = await cdp.send("Runtime.evaluate", params);
  return result.result.value === true;
  }
- function sleep(ms) {
+ function sleep2(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
  }
  async function waitForElement(cdp, selector, options = {}) {
@@ -286,7 +1591,7 @@ async function waitForElement(cdp, selector, options = {}) {
  if (conditionMet) {
  return { success: true, waitedMs: Date.now() - startTime };
  }
- await sleep(pollInterval);
+ await sleep2(pollInterval);
  }
  return { success: false, waitedMs: Date.now() - startTime };
  }
@@ -315,7 +1620,7 @@ async function waitForAnyElement(cdp, selectors, options = {}) {
  return { success: true, selector, waitedMs: Date.now() - startTime };
  }
  }
- await sleep(pollInterval);
+ await sleep2(pollInterval);
  }
  return { success: false, waitedMs: Date.now() - startTime };
  }
@@ -364,7 +1669,7 @@ async function waitForNavigation(cdp, options = {}) {
  }
  const pollUrl = async () => {
  while (!resolved && Date.now() < startTime + timeout) {
- await sleep(100);
+ await sleep2(100);
  if (resolved) return;
  try {
  const currentUrl = await getCurrentUrl(cdp);
@@ -694,6 +1999,10 @@ var Page = class {
  currentFrameContextId = null;
  /** Last matched selector from findElement (for selectorUsed tracking) */
  _lastMatchedSelector;
+ /** Audio input controller (lazy-initialized) */
+ _audioInput;
+ /** Audio output controller (lazy-initialized) */
+ _audioOutput;
  constructor(cdp, targetId) {
  this.cdp = cdp;
  this._targetId = targetId;
@@ -941,7 +2250,7 @@ var Page = class {
  key: char
  });
  if (delay > 0) {
- await sleep2(delay);
+ await sleep3(delay);
  }
  }
  return true;
@@ -982,7 +2291,7 @@ var Page = class {
  async selectCustom(config, options = {}) {
  const { trigger, option, value, match = "text" } = config;
  await this.click(trigger, options);
- await sleep2(100);
+ await sleep3(100);
  let optionSelector;
  const optionSelectors = Array.isArray(option) ? option : [option];
  if (match === "contains") {
@@ -1094,7 +2403,7 @@ var Page = class {
  if (shouldWait === true) {
  await this.waitForNavigation({ timeout: options.timeout ?? DEFAULT_TIMEOUT });
  } else if (shouldWait === "auto") {
- await Promise.race([this.waitForNavigation({ timeout: 1e3, optional: true }), sleep2(500)]);
+ await Promise.race([this.waitForNavigation({ timeout: 1e3, optional: true }), sleep3(500)]);
  }
  return true;
  }
@@ -1112,7 +2421,7 @@ var Page = class {
  this.waitForNavigation({ timeout: 1e3, optional: true }).then(
  (success) => success ? "nav" : null
  ),
- sleep2(500).then(() => "timeout")
+ sleep3(500).then(() => "timeout")
  ]);
  if (navigationDetected === "nav") {
  return true;
@@ -1126,7 +2435,7 @@ var Page = class {
  if (shouldWait === true) {
  await this.waitForNavigation({ timeout: options.timeout ?? DEFAULT_TIMEOUT });
  } else if (shouldWait === "auto") {
- await sleep2(100);
+ await sleep3(100);
  }
  }
  return true;
@@ -2123,7 +3432,7 @@ var Page = class {
  lastError = e;
  if (attempt < retries) {
  this.rootNodeId = null;
- await sleep2(delay);
+ await sleep3(delay);
  continue;
  }
  }
@@ -2295,8 +3604,107 @@ var Page = class {
  clickCount: 1
  });
  }
+ // ============ Audio I/O ============
+ /**
+ * Audio input controller (fake microphone).
+ * Lazy-initialized on first access.
+ */
+ get audioInput() {
+ if (!this._audioInput) {
+ this._audioInput = new AudioInput(this.cdp);
+ }
+ return this._audioInput;
+ }
+ /**
+ * Audio output capture controller.
+ * Lazy-initialized on first access.
+ */
+ get audioOutput() {
+ if (!this._audioOutput) {
+ this._audioOutput = new AudioOutput(this.cdp);
+ }
+ return this._audioOutput;
+ }
+ /**
+ * Set up both audio input (fake microphone) and output (capture).
+ * Must be called before navigating to the page that will use audio.
+ */
+ async setupAudio() {
+ try {
+ await this.cdp.send("Input.dispatchMouseEvent", {
+ type: "mousePressed",
+ x: 0,
+ y: 0,
+ button: "left",
+ clickCount: 1
+ });
+ await this.cdp.send("Input.dispatchMouseEvent", {
+ type: "mouseReleased",
+ x: 0,
+ y: 0,
+ button: "left",
+ clickCount: 1
+ });
+ } catch {
+ }
+ await this.audioInput.setup();
+ await this.audioOutput.setup();
+ }
+ /**
+ * Full audio round-trip: feed input audio, capture the response.
+ *
+ * 1. Starts capturing output
+ * 2. Feeds input audio as microphone data
+ * 3. Waits for the page to respond and then go silent
+ * 4. Returns the captured response audio with latency metrics
+ *
+ * @example
+ * ```typescript
+ * await page.setupAudio();
+ * await page.goto('https://voice-agent.example.com');
+ * const result = await page.audioRoundTrip({
+ * input: wavFileBytes,
+ * silenceTimeout: 3000,
+ * });
+ * console.log(`Response: ${result.audio.durationMs}ms, latency: ${result.latencyMs}ms`);
+ * ```
+ */
+ async audioRoundTrip(options) {
+ if (!this.audioInput.isSetup || !this.audioOutput.isSetup) {
+ await this.setupAudio();
+ }
+ const start = Date.now();
+ await this.audioOutput.start();
+ if (options.preDelay && options.preDelay > 0) {
+ await sleep3(options.preDelay);
+ }
+ const inputDone = this.audioInput.play(options.input, {
+ waitForEnd: !!options.sendSelector
+ });
+ if (options.sendSelector) {
+ await inputDone.catch(() => {
+ });
+ await this.click(options.sendSelector);
+ }
+ const audio = await this.audioOutput.captureUntilSilence({
+ silenceTimeout: options.silenceTimeout ?? 1500,
+ silenceThreshold: options.silenceThreshold ?? 0.01,
+ maxDuration: options.timeout ?? 12e4
+ });
+ await this.audioInput.stop();
+ if (!options.sendSelector) {
+ await inputDone.catch(() => {
+ });
+ }
+ const firstChunkTime = this.audioOutput.firstChunkTime;
+ return {
+ audio,
+ latencyMs: firstChunkTime !== null ? firstChunkTime - start : -1,
+ totalMs: Date.now() - start
+ };
+ }
  };
- function sleep2(ms) {
+ function sleep3(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
  }
 
@@ -2436,13 +3844,20 @@ function connect(options) {
  }
 
  export {
+ bufferToBase64,
+ calculateRMS,
+ pcmToWav,
+ parseWavHeader,
+ generateSilence,
+ generateTone,
+ grantAudioPermissions,
+ AudioInput,
+ AudioOutput,
  RequestInterceptor,
- DEEP_QUERY_SCRIPT,
  waitForElement,
  waitForAnyElement,
  waitForNavigation,
  waitForNetworkIdle,
- fuzzyMatchElements,
  Page,
  Browser,
  connect
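
For finer-grained control than `audioRoundTrip`, the newly exported classes can also be driven directly against a CDP session. A sketch under stated assumptions: the `cdp` handle below stands in for whatever object with `send`/`on`/`off` methods the `Browser`/`Page` internals pass to these constructors (its acquisition is not shown in this diff, so it is left abstract here); the chunk-cadence comment comes from the `FLUSH_SAMPLES` constant in the capture script above.

```typescript
import { AudioInput, AudioOutput, calculateRMS, generateTone, pcmToWav } from "browser-pilot";

// Hypothetical CDP session handle with send/on/off, as used by both classes.
declare const cdp: { send: Function; on: Function; off: Function };

const input = new AudioInput(cdp);
const output = new AudioOutput(cdp);

output.onDiag((msg: string) => console.error("[audio]", msg)); // verbose diagnostics
output.onData((chunk: { left: Float32Array }) => {
  // Chunks arrive roughly once per second of audio (FLUSH_SAMPLES = 48000).
  if (calculateRMS(chunk.left) > 0.01) process.stdout.write("*");
});

await input.setup();  // grants "audioCapture" and installs the getUserMedia override
await output.start(); // taps AudioContexts, WebRTC receivers, and media elements

// Play a short tone into the fake microphone; resolves when in-page playback ends.
await input.play(new Uint8Array(pcmToWav({ left: generateTone(440, 500), sampleRate: 48000 })));

const captured = await output.captureUntilSilence({ silenceTimeout: 1500 });
console.log(`captured ${captured.durationMs}ms at ${captured.sampleRate}Hz`);

await input.teardown();
await output.teardown();
```

Note that `captureUntilSilence` merges chunks per sample rate and returns only the rate group with the most non-silent samples (see `mergeChunks` above), so mixed-rate pages yield a single coherent buffer rather than interleaved audio.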