browser-pilot 0.0.8 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -20,6 +20,8 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
20
20
  // src/index.ts
21
21
  var src_exports = {};
22
22
  __export(src_exports, {
23
+ AudioInput: () => AudioInput,
24
+ AudioOutput: () => AudioOutput,
23
25
  BatchExecutor: () => BatchExecutor,
24
26
  Browser: () => Browser,
25
27
  BrowserBaseProvider: () => BrowserBaseProvider,
@@ -33,6 +35,8 @@ __export(src_exports, {
33
35
  TimeoutError: () => TimeoutError,
34
36
  Tracer: () => Tracer,
35
37
  addBatchToPage: () => addBatchToPage,
38
+ bufferToBase64: () => bufferToBase64,
39
+ calculateRMS: () => calculateRMS,
36
40
  connect: () => connect,
37
41
  createCDPClient: () => createCDPClient,
38
42
  createProvider: () => createProvider,
@@ -40,8 +44,17 @@ __export(src_exports, {
40
44
  disableTracing: () => disableTracing,
41
45
  discoverTargets: () => discoverTargets,
42
46
  enableTracing: () => enableTracing,
47
+ generateSilence: () => generateSilence,
48
+ generateTone: () => generateTone,
49
+ getAudioChromeFlags: () => getAudioChromeFlags,
43
50
  getBrowserWebSocketUrl: () => getBrowserWebSocketUrl,
44
51
  getTracer: () => getTracer,
52
+ grantAudioPermissions: () => grantAudioPermissions,
53
+ isTranscriptionAvailable: () => isTranscriptionAvailable,
54
+ parseWavHeader: () => parseWavHeader,
55
+ pcmToWav: () => pcmToWav,
56
+ transcribe: () => transcribe,
57
+ validateSteps: () => validateSteps,
45
58
  waitForAnyElement: () => waitForAnyElement,
46
59
  waitForElement: () => waitForElement,
47
60
  waitForNavigation: () => waitForNavigation,
@@ -302,27 +315,1901 @@ var BatchExecutor = class {
302
315
  await this.page.switchToMain();
303
316
  return {};
304
317
  }
305
- default:
318
+ default: {
319
+ const action = step.action;
320
+ const aliases = {
321
+ execute: "evaluate",
322
+ navigate: "goto",
323
+ input: "fill",
324
+ tap: "click",
325
+ go: "goto",
326
+ run: "evaluate",
327
+ capture: "screenshot",
328
+ inspect: "snapshot",
329
+ enter: "press",
330
+ open: "goto",
331
+ visit: "goto",
332
+ eval: "evaluate",
333
+ js: "evaluate",
334
+ snap: "snapshot",
335
+ frame: "switchFrame"
336
+ };
337
+ const suggestion = aliases[action.toLowerCase()];
338
+ const hint = suggestion ? ` Did you mean "${suggestion}"?` : "";
339
+ const valid = "goto, click, fill, type, select, check, uncheck, submit, press, focus, hover, scroll, wait, snapshot, screenshot, evaluate, text, switchFrame, switchToMain";
340
+ throw new Error(`Unknown action "${action}".${hint}
341
+
342
+ Valid actions: ${valid}`);
343
+ }
344
+ }
345
+ }
346
+ /**
347
+ * Get the actual selector that matched the element.
348
+ * Uses the last matched selector tracked by Page, falls back to first selector if unavailable.
349
+ */
350
+ getUsedSelector(selector) {
351
+ const matched = this.page.getLastMatchedSelector();
352
+ if (matched) return matched;
353
+ return Array.isArray(selector) ? selector[0] : selector;
354
+ }
355
+ };
356
/**
 * Extend a page object in place with a `batch(steps, options)` method.
 *
 * A single BatchExecutor is created for the page and reused by every
 * `batch` call.
 *
 * @param page - The page to extend; it is mutated and returned.
 * @returns The same `page` object, now carrying a `batch` method.
 */
function addBatchToPage(page) {
  const runner = new BatchExecutor(page);
  const batch = (steps, options) => runner.execute(steps, options);
  return Object.assign(page, { batch });
}
362
+
363
+ // src/actions/validate.ts
364
/**
 * Compute the Levenshtein edit distance between two strings.
 *
 * Insertions, deletions and substitutions each cost 1. Strings are compared
 * by UTF-16 code unit. Only the previous row of the distance table is kept.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns The minimum number of single-unit edits turning `a` into `b`.
 */
function levenshtein(a, b) {
  const rows = a.length;
  const cols = b.length;
  // Row 0: turning the empty prefix of `a` into b[0..j) takes j insertions.
  let previous = Array.from({ length: cols + 1 }, (_, j) => j);
  for (let i = 1; i <= rows; i++) {
    // Column 0: turning a[0..i) into the empty string takes i deletions.
    const current = [i];
    for (let j = 1; j <= cols; j++) {
      if (a[i - 1] === b[j - 1]) {
        current[j] = previous[j - 1];
      } else {
        current[j] = 1 + Math.min(previous[j], current[j - 1], previous[j - 1]);
      }
    }
    previous = current;
  }
  return previous[cols];
}
377
// Common misnamings of actions, keyed by lower-cased input, mapped to the
// canonical action name. Created with a null prototype so that a lookup such
// as ACTION_ALIASES["constructor"] yields undefined instead of an inherited
// Object.prototype member.
var ACTION_ALIASES = {
  __proto__: null,
  execute: "evaluate",
  navigate: "goto",
  input: "fill",
  tap: "click",
  go: "goto",
  run: "evaluate",
  capture: "screenshot",
  inspect: "snapshot",
  enter: "press",
  keypress: "press",
  nav: "goto",
  open: "goto",
  visit: "goto",
  browse: "goto",
  load: "goto",
  write: "fill",
  set: "fill",
  pick: "select",
  choose: "select",
  send: "press",
  eval: "evaluate",
  js: "evaluate",
  script: "evaluate",
  snap: "snapshot",
  accessibility: "snapshot",
  a11y: "snapshot",
  image: "screenshot",
  pic: "screenshot",
  frame: "switchFrame",
  iframe: "switchFrame"
};
// Common misnamings of step properties mapped to the canonical field name.
// Null prototype for the same reason as ACTION_ALIASES: keys such as
// "toString" or "valueOf" must not resolve to inherited functions.
var PROPERTY_ALIASES = {
  __proto__: null,
  expression: "value",
  href: "url",
  target: "selector",
  element: "selector",
  code: "value",
  script: "value",
  src: "url",
  link: "url",
  char: "key",
  text: "value",
  query: "selector",
  el: "selector",
  elem: "selector",
  css: "selector",
  xpath: "selector",
  input: "value",
  content: "value",
  keys: "key",
  button: "key",
  address: "url",
  page: "url",
  path: "url"
};
433
// Per-action validation schema: which step fields each action requires and
// which it optionally accepts, with the expected type (and allowed values)
// of each. Key order matters: it defines the order of VALID_ACTIONS and of
// the "Valid actions" list shown in error messages.
var ACTION_RULES = (() => {
  // Each factory returns a fresh descriptor so no rule shares an object.
  const text = () => ({ type: "string" });
  const num = () => ({ type: "number" });
  const flag = () => ({ type: "boolean" });
  const textOrList = () => ({ type: "string|string[]" });
  const oneOf = (...choices) => ({ type: "string", enum: choices });
  const rule = (required = {}, optional = {}) => ({ required, optional });
  const needsSelector = (optional = {}) => rule({ selector: textOrList() }, optional);

  return {
    goto: rule({ url: text() }),
    click: needsSelector({ waitForNavigation: flag() }),
    fill: rule(
      { selector: textOrList(), value: text() },
      { clear: flag(), blur: flag() }
    ),
    type: rule({ selector: textOrList(), value: text() }, { delay: num() }),
    select: rule(
      {},
      {
        selector: textOrList(),
        value: textOrList(),
        trigger: textOrList(),
        option: textOrList(),
        match: oneOf("text", "value", "contains")
      }
    ),
    check: needsSelector(),
    uncheck: needsSelector(),
    submit: needsSelector({ method: oneOf("enter", "click", "enter+click") }),
    press: rule({ key: text() }),
    focus: needsSelector(),
    hover: needsSelector(),
    scroll: rule(
      {},
      {
        selector: textOrList(),
        x: num(),
        y: num(),
        direction: oneOf("up", "down", "left", "right"),
        amount: num()
      }
    ),
    wait: rule(
      {},
      {
        selector: textOrList(),
        waitFor: oneOf("visible", "hidden", "attached", "detached", "navigation", "networkIdle")
      }
    ),
    snapshot: rule(),
    screenshot: rule(
      {},
      { format: oneOf("png", "jpeg", "webp"), quality: num(), fullPage: flag() }
    ),
    evaluate: rule({ value: text() }),
    text: rule({}, { selector: textOrList() }),
    switchFrame: needsSelector(),
    switchToMain: rule()
  };
})();
// Canonical action names, in ACTION_RULES declaration order.
var VALID_ACTIONS = Object.keys(ACTION_RULES);
// Human-readable form of VALID_ACTIONS used in validation messages.
var VALID_ACTIONS_LIST = VALID_ACTIONS.join(", ");
// Every property name a step may carry, across all actions. Insertion order
// is significant: suggestProperty keeps the first closest match it iterates.
var KNOWN_STEP_FIELDS = /* @__PURE__ */ new Set([
  "action", "selector", "url", "value", "key", "waitFor", "timeout", "optional",
  "method", "clear", "blur", "delay", "waitForNavigation",
  "trigger", "option", "match",
  "x", "y", "direction", "amount",
  "format", "quality", "fullPage"
]);
571
/**
 * Resolve a user-supplied action name to a canonical action.
 *
 * Resolution order:
 *   1. exact match against VALID_ACTIONS (no suggestion attached);
 *   2. case-insensitive alias lookup in ACTION_ALIASES;
 *   3. closest valid action by edit distance, accepted when distance <= 2.
 *
 * @param name - Action name as written by the user.
 * @returns `{ action }` for an exact match, `{ action, suggestion }` when the
 *   name was only recognised through an alias or near-match, or `null` when
 *   nothing plausible was found.
 */
function resolveAction(name) {
  if (VALID_ACTIONS.includes(name)) {
    return { action: name };
  }
  const lower = name.toLowerCase();
  // Own-property check: a bare ACTION_ALIASES[lower] would also find inherited
  // members (e.g. "constructor"), yielding a function instead of an action name.
  if (Object.prototype.hasOwnProperty.call(ACTION_ALIASES, lower)) {
    const aliased = ACTION_ALIASES[lower];
    return {
      action: aliased,
      suggestion: `Did you mean "${aliased}"?`
    };
  }
  let best = null;
  let bestDist = Infinity;
  for (const valid of VALID_ACTIONS) {
    const dist = levenshtein(lower, valid);
    // Strict comparison keeps the first of several equally close candidates.
    if (dist < bestDist) {
      bestDist = dist;
      best = valid;
    }
  }
  if (best && bestDist <= 2) {
    return { action: best, suggestion: `Did you mean "${best}"?` };
  }
  return null;
}
596
/**
 * Suggest the canonical step property for an unrecognised property name.
 *
 * Checks PROPERTY_ALIASES first (exact, case-sensitive), then falls back to
 * the closest entry of KNOWN_STEP_FIELDS by edit distance (accepted when the
 * distance is <= 2). "action" is never suggested.
 *
 * @param name - The unknown property name found on a step.
 * @returns The suggested property name, or `undefined` when none is close.
 */
function suggestProperty(name) {
  // Own-property check: names like "toString" or "constructor" exist on
  // Object.prototype and must not be returned as a suggestion.
  if (Object.prototype.hasOwnProperty.call(PROPERTY_ALIASES, name)) {
    return PROPERTY_ALIASES[name];
  }
  let best = null;
  let bestDist = Infinity;
  for (const known of KNOWN_STEP_FIELDS) {
    if (known === "action") continue;
    const dist = levenshtein(name, known);
    // Strict comparison keeps the first of several equally close candidates.
    if (dist < bestDist) {
      bestDist = dist;
      best = known;
    }
  }
  if (best && bestDist <= 2) {
    return best;
  }
  return void 0;
}
615
/**
 * Check one step field value against its rule.
 *
 * @param value - The value found on the step.
 * @param rule - Descriptor with a `type` ("string", "string|string[]",
 *   "number" or "boolean") and, for plain strings, an optional `enum` list.
 * @returns `null` when the value satisfies the rule, otherwise a message
 *   fragment describing the mismatch. Returns `undefined` for a rule type
 *   not listed above.
 */
function checkFieldType(value, rule) {
  const actual = typeof value;
  const kind = rule.type;

  if (kind === "string") {
    if (actual !== "string") {
      return `expected string, got ${actual}`;
    }
    const allowed = rule.enum;
    if (allowed && !allowed.includes(value)) {
      return `must be one of: ${allowed.join(", ")}`;
    }
    return null;
  }

  if (kind === "string|string[]") {
    const isList = Array.isArray(value);
    if (actual !== "string" && !isList) {
      return `expected string or string[], got ${actual}`;
    }
    if (isList && value.some((entry) => typeof entry !== "string")) {
      return "array elements must be strings";
    }
    return null;
  }

  if (kind === "number" || kind === "boolean") {
    return actual === kind ? null : `expected ${kind}, got ${actual}`;
  }
}
639
/**
 * Validate a list of batch steps without executing them.
 *
 * For every step this checks, in order: that it is a plain object, that it
 * has a string `action` that is an exact valid action name (aliases and
 * near-misses are reported as errors carrying a suggestion), that it has no
 * unknown properties, that required fields are present and well-typed, that
 * supplied optional fields are well-typed, that `timeout`/`optional` have the
 * right primitive type, and that `select` has one of its two field sets.
 *
 * A step whose shape or action is invalid gets exactly one error and its
 * remaining checks are skipped.
 *
 * @param steps - Array of step objects. Any non-array input yields a single
 *   error with `stepIndex: -1` and `field: "steps"` instead of being treated
 *   as an empty, valid list.
 * @returns `{ valid, errors, formatted() }`, where `errors` is a list of
 *   `{ stepIndex, field, message, suggestion? }` and `formatted()` renders
 *   them as a multi-line report ("" when there are no errors).
 */
function validateSteps(steps) {
  const errors = [];
  const inputIsArray = Array.isArray(steps);
  const stepList = inputIsArray ? steps : [];
  if (!inputIsArray) {
    errors.push({
      stepIndex: -1,
      field: "steps",
      message: `steps must be an array of step objects, got ${steps === null ? "null" : typeof steps}.`
    });
  }
  // A field explicitly set to undefined counts as absent everywhere.
  const hasValue = (obj, field) => field in obj && obj[field] !== void 0;

  for (let i = 0; i < stepList.length; i++) {
    const step = stepList[i];
    if (!step || typeof step !== "object" || Array.isArray(step)) {
      errors.push({
        stepIndex: i,
        field: "step",
        message: "step must be a JSON object."
      });
      continue;
    }
    const obj = step;
    if (!("action" in obj)) {
      errors.push({
        stepIndex: i,
        field: "action",
        message: 'missing required "action" field.'
      });
      continue;
    }
    const actionName = obj["action"];
    if (typeof actionName !== "string") {
      errors.push({
        stepIndex: i,
        field: "action",
        message: `"action" must be a string, got ${typeof actionName}.`
      });
      continue;
    }
    const resolved = resolveAction(actionName);
    if (!resolved) {
      errors.push({
        stepIndex: i,
        field: "action",
        message: `unknown action "${actionName}".`,
        suggestion: `Valid actions: ${VALID_ACTIONS_LIST}`
      });
      continue;
    }
    // A suggestion means the name only matched via alias or typo distance:
    // report it rather than silently accepting the corrected action.
    if (resolved.suggestion) {
      errors.push({
        stepIndex: i,
        field: "action",
        message: `unknown action "${actionName}". ${resolved.suggestion}`,
        suggestion: resolved.suggestion
      });
      continue;
    }
    const action = resolved.action;
    const rule = ACTION_RULES[action];

    // Unknown properties (checked against the union of all step fields).
    for (const key of Object.keys(obj)) {
      if (key === "action") continue;
      if (!KNOWN_STEP_FIELDS.has(key)) {
        const suggestion = suggestProperty(key);
        errors.push({
          stepIndex: i,
          field: key,
          message: suggestion ? `unknown property "${key}". Did you mean "${suggestion}"?` : `unknown property "${key}".`,
          suggestion: suggestion ? `Did you mean "${suggestion}"?` : void 0
        });
      }
    }

    // Required fields: must be present and of the right type.
    for (const [field, fieldRule] of Object.entries(rule.required)) {
      if (!hasValue(obj, field)) {
        errors.push({
          stepIndex: i,
          field,
          message: `missing required "${field}" (${fieldRule.type}).`
        });
        continue;
      }
      const typeErr = checkFieldType(obj[field], fieldRule);
      if (typeErr) {
        errors.push({ stepIndex: i, field, message: `"${field}" ${typeErr}.` });
      }
    }

    // Optional fields: only type-checked when supplied.
    for (const [field, fieldRule] of Object.entries(rule.optional)) {
      if (!hasValue(obj, field)) continue;
      const typeErr = checkFieldType(obj[field], fieldRule);
      if (typeErr) {
        errors.push({ stepIndex: i, field, message: `"${field}" ${typeErr}.` });
      }
    }

    // Fields common to every action.
    if (hasValue(obj, "timeout") && typeof obj["timeout"] !== "number") {
      errors.push({
        stepIndex: i,
        field: "timeout",
        message: `"timeout" expected number, got ${typeof obj["timeout"]}.`
      });
    }
    if (hasValue(obj, "optional") && typeof obj["optional"] !== "boolean") {
      errors.push({
        stepIndex: i,
        field: "optional",
        message: `"optional" expected boolean, got ${typeof obj["optional"]}.`
      });
    }

    // `select` has no unconditionally required field; it needs one of two sets.
    if (action === "select") {
      const hasNative = hasValue(obj, "selector") && hasValue(obj, "value");
      const hasCustom = hasValue(obj, "trigger") && hasValue(obj, "option") && hasValue(obj, "value");
      if (!hasNative && !hasCustom) {
        errors.push({
          stepIndex: i,
          field: "selector",
          message: "select requires either (selector + value) for native select, or (trigger + option + value) for custom select."
        });
      }
    }
  }

  return {
    valid: errors.length === 0,
    errors,
    formatted() {
      if (errors.length === 0) return "";
      const lines = [`Validation failed (${errors.length} error${errors.length > 1 ? "s" : ""}):`];
      for (const err of errors) {
        // stepIndex -1 marks an error about the input as a whole.
        const stepLabel = err.stepIndex < 0 ? "Input" : `Step ${err.stepIndex}`;
        lines.push("");
        lines.push(` ${stepLabel}: ${err.message}`);
        if (err.suggestion && !err.message.includes(err.suggestion)) {
          lines.push(` ${err.suggestion}`);
        }
        const step = stepList[err.stepIndex];
        if (step && typeof step === "object") {
          lines.push(` Got: ${JSON.stringify(step)}`);
        }
      }
      const hasEvaluateError = errors.some((err) => {
        const step = stepList[err.stepIndex];
        return step && typeof step === "object" && step["action"] === "evaluate";
      });
      if (hasEvaluateError) {
        lines.push("");
        lines.push(
          "Tip: For JavaScript evaluation, use 'bp eval' instead \u2014 no JSON wrapping needed:"
        );
        lines.push(" bp eval 'your.expression.here'");
      }
      lines.push("");
      lines.push(`Valid actions: ${VALID_ACTIONS_LIST}`);
      return lines.join("\n");
    }
  };
}
797
+
798
+ // src/audio/encoding.ts
799
/**
 * Encode raw bytes as a base64 string.
 *
 * @param data - A Uint8Array, an ArrayBuffer, any other ArrayBuffer view
 *   (DataView, Int16Array, Float32Array, ...) or an array of byte values.
 *   Views are encoded from their underlying bytes, not converted
 *   element by element.
 * @returns The base64 encoding of the bytes.
 */
function bufferToBase64(data) {
  let bytes;
  if (data instanceof Uint8Array) {
    bytes = data;
  } else if (ArrayBuffer.isView(data)) {
    // Reinterpret the view's memory; `new Uint8Array(view)` would instead
    // coerce each element (or, for a DataView, produce an empty array).
    bytes = new Uint8Array(data.buffer, data.byteOffset, data.byteLength);
  } else {
    bytes = new Uint8Array(data);
  }
  // Convert in chunks: one fromCharCode call per byte is slow for audio-sized
  // payloads, and a single call over everything can exceed the engine's
  // argument-count limit.
  const CHUNK_SIZE = 0x8000;
  let binary = "";
  for (let start = 0; start < bytes.length; start += CHUNK_SIZE) {
    binary += String.fromCharCode.apply(null, bytes.subarray(start, start + CHUNK_SIZE));
  }
  return btoa(binary);
}
807
/**
 * Decode a base64 string into its raw bytes.
 *
 * @param b64 - Base64-encoded text.
 * @returns A Uint8Array holding the decoded bytes.
 */
function base64ToBuffer(b64) {
  // atob yields one character (code 0-255) per decoded byte.
  return Uint8Array.from(atob(b64), (ch) => ch.charCodeAt(0));
}
815
/**
 * Compute the root-mean-square level of a block of samples.
 *
 * @param samples - Indexable sequence of numeric samples with a `length`.
 * @returns sqrt(mean(sample^2)), or 0 for an empty sequence.
 */
function calculateRMS(samples) {
  const count = samples.length;
  if (count === 0) {
    return 0;
  }
  let energy = 0;
  let index = 0;
  while (index < count) {
    const sample = samples[index];
    energy += sample * sample;
    index += 1;
  }
  return Math.sqrt(energy / count);
}
823
/**
 * Encode floating-point PCM samples as a 16-bit PCM WAV file.
 *
 * @param options - `{ left, right?, sampleRate }`. `left` (and `right`, when
 *   given) hold samples nominally in [-1, 1]; values outside that range are
 *   clamped. Supplying `right` produces interleaved stereo, otherwise mono.
 *   The number of frames is taken from `left.length`.
 * @returns An ArrayBuffer with a 44-byte RIFF/WAVE header followed by
 *   little-endian 16-bit samples.
 */
function pcmToWav(options) {
  const { left, right, sampleRate } = options;
  const HEADER_BYTES = 44;
  const BITS_PER_SAMPLE = 16;
  const channelCount = right ? 2 : 1;
  const frameCount = left.length;
  const frameBytes = channelCount * (BITS_PER_SAMPLE / 8);
  const payloadBytes = frameCount * frameBytes;

  const buffer = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const view = new DataView(buffer);

  // RIFF container header.
  writeString(view, 0, "RIFF");
  view.setUint32(4, 36 + payloadBytes, true);
  writeString(view, 8, "WAVE");

  // "fmt " chunk: 16-byte body, format tag 1 (PCM).
  writeString(view, 12, "fmt ");
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true);
  view.setUint16(22, channelCount, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * frameBytes, true);
  view.setUint16(32, frameBytes, true);
  view.setUint16(34, BITS_PER_SAMPLE, true);

  // "data" chunk header.
  writeString(view, 36, "data");
  view.setUint32(40, payloadBytes, true);

  // Clamp to [-1, 1] and scale asymmetrically so -1 maps to -32768 and
  // +1 maps to 32767.
  const toPcm16 = (sample) => {
    const clamped = Math.max(-1, Math.min(1, sample));
    return clamped < 0 ? clamped * 32768 : clamped * 32767;
  };

  for (let frame = 0; frame < frameCount; frame++) {
    const at = HEADER_BYTES + frame * frameBytes;
    view.setInt16(at, toPcm16(left[frame]), true);
    if (right) {
      view.setInt16(at + 2, toPcm16(right[frame]), true);
    }
  }
  return buffer;
}
860
/**
 * Parse the RIFF/WAVE header of a WAV file and locate its sample data.
 *
 * Walks the RIFF chunk list starting after the 12-byte RIFF/WAVE preamble, so
 * it copes with fmt chunks larger than 16 bytes, extra chunks before "data",
 * and the pad byte that follows odd-sized chunks.
 *
 * @param data - The file contents as an ArrayBuffer, or any ArrayBuffer view
 *   (Uint8Array, Node Buffer, ...). For a view, `dataOffset` is relative to
 *   the start of the view.
 * @returns `{ sampleRate, channels, bitsPerSample, dataOffset, dataLength }`,
 *   where `dataOffset` is the byte offset of the first sample and
 *   `dataLength` is the size declared by the data chunk header.
 * @throws Error when the input is shorter than 44 bytes, lacks the RIFF/WAVE
 *   signature, or has no usable "fmt " chunk followed by a "data" chunk.
 */
function parseWavHeader(data) {
  const view = ArrayBuffer.isView(data)
    ? new DataView(data.buffer, data.byteOffset, data.byteLength)
    : new DataView(data);
  const totalBytes = view.byteLength;
  if (totalBytes < 44) {
    throw new Error("Invalid WAV: file too small");
  }
  const riff = readString(view, 0, 4);
  const wave = readString(view, 8, 4);
  if (riff !== "RIFF" || wave !== "WAVE") {
    throw new Error("Invalid WAV: missing RIFF/WAVE header");
  }

  let format = null;
  let chunkStart = 12;
  // `<=` so that a chunk header ending exactly at end-of-file (an empty data
  // chunk) is still visited.
  while (chunkStart + 8 <= totalBytes) {
    const chunkId = readString(view, chunkStart, 4);
    const chunkSize = view.getUint32(chunkStart + 4, true);
    const bodyStart = chunkStart + 8;

    if (chunkId === "fmt ") {
      if (chunkSize < 16 || bodyStart + 16 > totalBytes) {
        throw new Error("Invalid WAV: malformed fmt chunk");
      }
      format = {
        channels: view.getUint16(bodyStart + 2, true),
        sampleRate: view.getUint32(bodyStart + 4, true),
        bitsPerSample: view.getUint16(bodyStart + 14, true)
      };
    } else if (chunkId === "data") {
      if (!format) {
        throw new Error("Invalid WAV: missing fmt chunk");
      }
      return {
        sampleRate: format.sampleRate,
        channels: format.channels,
        bitsPerSample: format.bitsPerSample,
        dataOffset: bodyStart,
        dataLength: chunkSize
      };
    }
    // RIFF chunks are word-aligned: an odd-sized body is followed by one pad byte.
    chunkStart = bodyStart + chunkSize + (chunkSize % 2);
  }
  throw new Error(format ? "Invalid WAV: missing data chunk" : "Invalid WAV: missing fmt chunk");
}
894
/**
 * Create a zero-filled sample buffer of the requested duration.
 *
 * @param durationMs - Duration in milliseconds.
 * @param sampleRate - Samples per second (default 48000).
 * @returns A Float32Array of ceil(sampleRate * durationMs / 1000) zeros.
 */
function generateSilence(durationMs, sampleRate = 48e3) {
  const sampleCount = Math.ceil(sampleRate * durationMs / 1e3);
  const silence = new Float32Array(sampleCount);
  return silence;
}
897
/**
 * Synthesize a sine tone.
 *
 * @param frequency - Tone frequency in Hz.
 * @param durationMs - Duration in milliseconds.
 * @param sampleRate - Samples per second (default 48000).
 * @param amplitude - Peak amplitude (default 0.5).
 * @returns A Float32Array of ceil(sampleRate * durationMs / 1000) samples,
 *   starting at phase 0.
 */
function generateTone(frequency, durationMs, sampleRate = 48e3, amplitude = 0.5) {
  const sampleCount = Math.ceil(sampleRate * durationMs / 1e3);
  const tone = new Float32Array(sampleCount);
  tone.forEach((_, n) => {
    tone[n] = amplitude * Math.sin(2 * Math.PI * frequency * n / sampleRate);
  });
  return tone;
}
905
/**
 * Write a string into a DataView, one byte per UTF-16 code unit.
 *
 * @param view - Destination DataView.
 * @param offset - Byte offset of the first character.
 * @param str - Text to write; each code unit is stored via setUint8.
 */
function writeString(view, offset, str) {
  str.split("").forEach((unit, position) => {
    view.setUint8(offset + position, unit.charCodeAt(0));
  });
}
910
/**
 * Read `length` bytes from a DataView as a string, one character per byte.
 *
 * @param view - Source DataView.
 * @param offset - Byte offset of the first character.
 * @param length - Number of bytes to read.
 * @returns The decoded string.
 */
function readString(view, offset, length) {
  const codes = [];
  let position = 0;
  while (position < length) {
    codes.push(view.getUint8(offset + position));
    position += 1;
  }
  return String.fromCharCode(...codes);
}
917
+
918
+ // src/audio/flags.ts
919
/**
 * Build the Chrome command-line flags used for audio automation.
 *
 * Always includes the fake media device, fake media permission UI and
 * no-gesture autoplay flags. When `options.inputWavPath` is set, a
 * fake-audio-capture flag pointing at that file is appended, with a
 * "%noloop" suffix when `options.noLoop` is truthy.
 *
 * @param options - Optional `{ inputWavPath?, noLoop? }`.
 * @returns A new array of flag strings.
 */
function getAudioChromeFlags(options) {
  const baseFlags = [
    "--use-fake-device-for-media-stream",
    "--use-fake-ui-for-media-stream",
    "--autoplay-policy=no-user-gesture-required"
  ];
  const wavPath = options?.inputWavPath;
  if (!wavPath) {
    return baseFlags;
  }
  const loopSuffix = options.noLoop ? "%noloop" : "";
  return [...baseFlags, `--use-file-for-fake-audio-capture=${wavPath}${loopSuffix}`];
}
934
+
935
+ // src/audio/permissions.ts
936
/**
 * Grant microphone access over CDP and install the permissions-query
 * override for future documents.
 *
 * @param cdp - CDP client exposing `send(method, params)`.
 * @param origin - Origin to grant the permission to. When omitted (or empty)
 *   the `origin` parameter is left out of the request, which the protocol
 *   defines as "all origins"; an empty string is not sent because it does
 *   not name an origin.
 * @returns A promise that resolves once both CDP commands have completed.
 */
async function grantAudioPermissions(cdp, origin) {
  const grant = { permissions: ["audioCapture"] };
  if (origin) {
    grant.origin = origin;
  }
  await cdp.send("Browser.grantPermissions", grant);
  // Runs in every new document, so pages that query the Permissions API
  // also see the microphone as granted.
  await cdp.send("Page.addScriptToEvaluateOnNewDocument", {
    source: PERMISSIONS_OVERRIDE_SCRIPT
  });
}
945
+ var PERMISSIONS_OVERRIDE_SCRIPT = `
946
+ (function() {
947
+ if (window.__bpPermissionsPatched) return;
948
+ window.__bpPermissionsPatched = true;
949
+
950
+ var origQuery = navigator.permissions.query.bind(navigator.permissions);
951
+ navigator.permissions.query = function(desc) {
952
+ if (desc && (desc.name === 'microphone' || desc.name === 'audio-capture')) {
953
+ return Promise.resolve({
954
+ state: 'granted',
955
+ onchange: null,
956
+ addEventListener: function() {},
957
+ removeEventListener: function() {},
958
+ dispatchEvent: function() { return true; }
959
+ });
960
+ }
961
+ return origQuery(desc);
962
+ };
963
+ })();
964
+ `;
965
+
966
+ // src/audio/input.ts
967
+ var INPUT_BINDING = "__bpAudioInputDone";
968
+ var AUDIO_INPUT_SCRIPT = `
969
+ (function() {
970
+ if (window.__bpAudioInput) return;
971
+
972
+ var audioCtx = null;
973
+ var sourceNode = null;
974
+ var destinationNode = null;
975
+ var fakeStream = null;
976
+ var silenceGain = null;
977
+ var silenceOsc = null;
978
+ var isPlaying = false;
979
+
980
+ function ensureFakeStream() {
981
+ if (fakeStream) return fakeStream;
982
+ // Use the original AudioContext to avoid being tracked by our output override
983
+ var CtorToUse = window.__bpOrigAudioContext || window.AudioContext || window.webkitAudioContext;
984
+ audioCtx = new CtorToUse({ sampleRate: 48000 });
985
+ // Auto-resume if suspended (CDP automation has no user gesture)
986
+ if (audioCtx.state === 'suspended') {
987
+ console.log('[bp:input] AudioContext suspended, auto-resuming...');
988
+ audioCtx.resume().then(function() {
989
+ console.log('[bp:input] AudioContext resumed (' + audioCtx.state + ')');
990
+ }).catch(function(e) {
991
+ console.warn('[bp:input] AudioContext resume failed:', e);
992
+ });
993
+ }
994
+ destinationNode = audioCtx.createMediaStreamDestination();
995
+
996
+ // Start with silence so the stream always has active tracks
997
+ silenceGain = audioCtx.createGain();
998
+ silenceGain.gain.value = 0;
999
+ silenceOsc = audioCtx.createOscillator();
1000
+ silenceOsc.connect(silenceGain);
1001
+ silenceGain.connect(destinationNode);
1002
+ silenceOsc.start();
1003
+
1004
+ fakeStream = destinationNode.stream;
1005
+ console.log('[bp:input] Fake mic stream created (48kHz, ' + fakeStream.getAudioTracks().length + ' tracks)');
1006
+ return fakeStream;
1007
+ }
1008
+
1009
+ function playAudio(base64Data) {
1010
+ ensureFakeStream();
1011
+
1012
+ var resumePromise = audioCtx.state === 'suspended'
1013
+ ? audioCtx.resume()
1014
+ : Promise.resolve();
1015
+
1016
+ return resumePromise.then(function() {
1017
+ if (sourceNode) {
1018
+ try { sourceNode.stop(); } catch(e) {}
1019
+ sourceNode.disconnect();
1020
+ sourceNode = null;
1021
+ }
1022
+
1023
+ var binaryStr = atob(base64Data);
1024
+ var bytes = new Uint8Array(binaryStr.length);
1025
+ for (var i = 0; i < binaryStr.length; i++) {
1026
+ bytes[i] = binaryStr.charCodeAt(i);
1027
+ }
1028
+ console.log('[bp:input] Decoding audio (' + bytes.length + ' bytes)...');
1029
+
1030
+ return audioCtx.decodeAudioData(bytes.buffer.slice(0));
1031
+ }).then(function(audioBuffer) {
1032
+ sourceNode = audioCtx.createBufferSource();
1033
+ sourceNode.buffer = audioBuffer;
1034
+ sourceNode.connect(destinationNode);
1035
+
1036
+ var durationMs = Math.round(audioBuffer.duration * 1000);
1037
+ console.log('[bp:input] Playing ' + durationMs + 'ms audio (' + audioBuffer.sampleRate + 'Hz, ' + audioBuffer.numberOfChannels + 'ch)');
1038
+
1039
+ return new Promise(function(resolve) {
1040
+ sourceNode.onended = function() {
1041
+ isPlaying = false;
1042
+ console.log('[bp:input] Playback ended');
1043
+ resolve(true);
1044
+ try {
1045
+ if (typeof window.__bpAudioInputDone === 'function') {
1046
+ window.__bpAudioInputDone('done');
1047
+ }
1048
+ } catch(e) {}
1049
+ };
1050
+ isPlaying = true;
1051
+ sourceNode.start();
1052
+ });
1053
+ });
1054
+ }
1055
+
1056
+ function stopAudio() {
1057
+ if (sourceNode) {
1058
+ try { sourceNode.stop(); } catch(e) {}
1059
+ sourceNode.disconnect();
1060
+ sourceNode = null;
1061
+ }
1062
+ isPlaying = false;
1063
+ console.log('[bp:input] Stopped');
1064
+ }
1065
+
1066
+ var origGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
1067
+
1068
+ navigator.mediaDevices.getUserMedia = function(constraints) {
1069
+ if (constraints && constraints.audio) {
1070
+ var stream = ensureFakeStream();
1071
+ console.log('[bp:input] getUserMedia intercepted \u2014 returning fake mic' + (constraints.video ? ' + real video' : ''));
1072
+
1073
+ if (constraints.video) {
1074
+ // Get real video + our fake audio
1075
+ return origGetUserMedia({ video: constraints.video }).then(function(realStream) {
1076
+ var combined = new MediaStream(
1077
+ stream.getAudioTracks().concat(realStream.getVideoTracks())
1078
+ );
1079
+ return combined;
1080
+ });
1081
+ }
1082
+
1083
+ // Return a clone so consumers can't stop our source track
1084
+ return Promise.resolve(stream.clone());
1085
+ }
1086
+ return origGetUserMedia(constraints);
1087
+ };
1088
+
1089
+ var origEnumerate = navigator.mediaDevices.enumerateDevices.bind(navigator.mediaDevices);
1090
+ navigator.mediaDevices.enumerateDevices = function() {
1091
+ return origEnumerate().then(function(devices) {
1092
+ var hasMic = devices.some(function(d) { return d.kind === 'audioinput'; });
1093
+ if (!hasMic) {
1094
+ devices.push({
1095
+ deviceId: 'bp-fake-mic',
1096
+ kind: 'audioinput',
1097
+ label: 'Default Audio Input',
1098
+ groupId: 'bp-audio',
1099
+ toJSON: function() {
1100
+ return { deviceId: this.deviceId, kind: this.kind, label: this.label, groupId: this.groupId };
1101
+ }
1102
+ });
1103
+ }
1104
+ return devices;
1105
+ });
1106
+ };
1107
+
1108
+ window.__bpAudioInput = {
1109
+ play: playAudio,
1110
+ stop: stopAudio,
1111
+ isPlaying: function() { return isPlaying; },
1112
+ getState: function() {
1113
+ return {
1114
+ contextState: audioCtx ? audioCtx.state : 'not-created',
1115
+ isPlaying: isPlaying,
1116
+ sampleRate: audioCtx ? audioCtx.sampleRate : 0
1117
+ };
1118
+ },
1119
+ getContext: function() { return audioCtx; }
1120
+ };
1121
+
1122
+ console.log('[bp:input] Audio input override installed (getUserMedia + enumerateDevices)');
1123
+ })();
1124
+ `;
1125
/**
 * Drives the page-side fake microphone (`window.__bpAudioInput`) over CDP.
 *
 * `setup()` grants audio permissions, registers the completion binding and
 * injects the input script; `play()` sends audio bytes to the page and can
 * wait for the page to signal that playback ended.
 */
var AudioInput = class {
  cdp;
  injected = false;
  bindingRegistered = false;
  bindingHandler = null;
  /** Rejects the in-flight waitForBinding() promise; null when none is pending. */
  abortPendingWait = null;
  /**
   * @param cdp - CDP client exposing `send(method, params)`, `on(event, fn)`
   *   and `off(event, fn)`.
   */
  constructor(cdp) {
    this.cdp = cdp;
  }
  /** Whether the audio input system has been set up */
  get isSetup() {
    return this.injected;
  }
  /**
   * Set up audio input injection: grant permissions, register the
   * playback-complete binding, and inject the getUserMedia override both for
   * future documents and into the current one. No-op when already set up.
   *
   * @throws Error when the current page is about:blank / about:srcdoc.
   */
  async setup() {
    if (this.injected) return;
    try {
      const resp = await this.cdp.send("Runtime.evaluate", {
        expression: "location.href",
        returnByValue: true
      });
      const href = resp.result?.value;
      if (href === "about:blank" || href === "about:srcdoc") {
        throw new Error(
          'Cannot set up audio on about:blank. Navigate to a page first.\nExample: await page.goto("https://your-voice-app.com")'
        );
      }
    } catch (e) {
      // Best-effort probe: only the explicit about:blank refusal is fatal;
      // a failed probe must not block setup.
      if (e instanceof Error && e.message.includes("Cannot set up audio")) throw e;
    }
    let origin;
    try {
      const resp = await this.cdp.send("Runtime.evaluate", {
        expression: "location.origin",
        returnByValue: true
      });
      const val = resp.result?.value;
      // Opaque origins serialize as the string "null"; treat them as unknown.
      if (typeof val === "string" && val !== "null") {
        origin = val;
      }
    } catch {
      // Best-effort: without an origin, permissions are granted unscoped.
    }
    await grantAudioPermissions(this.cdp, origin);
    if (!this.bindingRegistered) {
      await this.cdp.send("Runtime.addBinding", { name: INPUT_BINDING });
      this.bindingRegistered = true;
    }
    await this.cdp.send("Page.addScriptToEvaluateOnNewDocument", {
      source: AUDIO_INPUT_SCRIPT
    });
    await this.cdp.send("Runtime.evaluate", {
      expression: AUDIO_INPUT_SCRIPT,
      awaitPromise: false,
      userGesture: true
    });
    this.injected = true;
  }
  /**
   * Play audio bytes into the page's fake microphone.
   * The bytes are base64-encoded and decoded by the page's Web Audio
   * `decodeAudioData`, so any format the browser can decode is accepted.
   *
   * @param audioData - Raw audio file bytes
   * @param options - `{ waitForEnd?: boolean, timeout?: number }`. With
   *   `waitForEnd` (default true) the promise resolves when the page reports
   *   that playback ended, or rejects after `timeout` ms (default 60000).
   * @throws Error when the page-side play call throws, when the CDP command
   *   fails, when the wait times out, or when the wait is cancelled by
   *   `teardown()` or by a newer `play()` call.
   */
  async play(audioData, options) {
    if (!this.injected) {
      await this.setup();
    }
    // Resume suspended AudioContexts with a synthetic user gesture, since
    // automation has no real one.
    await this.cdp.send("Runtime.evaluate", {
      expression: `(function() {
 var resumed = [];
 (window.__bpTrackedAudioContexts || []).forEach(function(ctx) {
 if (ctx.state === 'suspended') {
 ctx.resume().then(function() {
 console.log('[bp:input] Resumed suspended AudioContext (' + ctx.sampleRate + 'Hz)');
 });
 resumed.push(ctx.sampleRate);
 }
 });
 // Also resume the input context itself
 if (window.__bpAudioInput && window.__bpAudioInput.getContext) {
 var inputCtx = window.__bpAudioInput.getContext();
 if (inputCtx && inputCtx.state === 'suspended') {
 inputCtx.resume().then(function() {
 console.log('[bp:input] Resumed input AudioContext (' + inputCtx.sampleRate + 'Hz)');
 });
 resumed.push('input-' + inputCtx.sampleRate);
 }
 }
 return resumed.length > 0 ? 'resumed: ' + resumed.join(',') : 'all running';
 })()`,
      awaitPromise: false,
      userGesture: true
    });
    const base64 = bufferToBase64(audioData);
    const waitForEnd = options?.waitForEnd ?? true;
    const timeout = options?.timeout ?? 6e4;
    // Start listening before triggering playback so a fast completion signal
    // cannot be missed.
    const donePromise = waitForEnd ? this.waitForBinding(timeout) : null;
    try {
      const resp = await this.cdp.send("Runtime.evaluate", {
        expression: `window.__bpAudioInput.play('${base64}')`,
        awaitPromise: false
      });
      // Runtime.evaluate reports page-side exceptions in the result rather
      // than rejecting; surface them instead of waiting for the timeout.
      const details = resp?.exceptionDetails;
      if (details) {
        const reason = details.exception?.description ?? details.text ?? "unknown error";
        throw new Error(`AudioInput: failed to start playback: ${reason}`);
      }
    } catch (err) {
      if (donePromise) {
        // Playback never started: cancel the wait so its timer and listener
        // do not linger, and swallow its rejection (the real error is `err`).
        donePromise.catch(() => {});
        this.abortPendingWait?.(err);
      }
      throw err;
    }
    if (donePromise) {
      await donePromise;
    }
  }
  /**
   * Stop any currently playing audio. No-op when not set up.
   */
  async stop() {
    if (!this.injected) return;
    await this.cdp.send("Runtime.evaluate", {
      expression: "window.__bpAudioInput && window.__bpAudioInput.stop()",
      awaitPromise: false
    });
  }
  /**
   * Get current state of the injected audio input system.
   *
   * @returns `{ contextState, isPlaying, sampleRate }`; a "not-created"
   *   placeholder when not set up or when the page object is missing.
   */
  async getState() {
    if (!this.injected) {
      return { contextState: "not-created", isPlaying: false, sampleRate: 0 };
    }
    const result = await this.cdp.send("Runtime.evaluate", {
      expression: "window.__bpAudioInput ? window.__bpAudioInput.getState() : null",
      returnByValue: true
    });
    return result.result.value ?? { contextState: "not-created", isPlaying: false, sampleRate: 0 };
  }
  /**
   * Clean up: cancel any pending playback wait, remove the binding handler,
   * stop playback and reset the setup flags.
   */
  async teardown() {
    this.abortPendingWait?.(new Error("AudioInput: torn down before playback finished"));
    if (this.bindingHandler) {
      this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
      this.bindingHandler = null;
    }
    try {
      await this.stop();
    } finally {
      // Reset even if stop() fails so a later setup() starts from scratch.
      this.injected = false;
      this.bindingRegistered = false;
    }
  }
  /**
   * Wait for the playback-complete binding to fire.
   *
   * Only one wait is active at a time: starting a new one rejects the
   * previous one, since its completion signal can no longer be told apart.
   *
   * @param timeout - Milliseconds to wait before rejecting.
   * @returns A promise resolved when the binding fires.
   */
  waitForBinding(timeout) {
    this.abortPendingWait?.(new Error("AudioInput: playback superseded by a newer play() call"));
    return new Promise((resolve, reject) => {
      let timer = null;
      const handler = (params) => {
        if (params["name"] === INPUT_BINDING) {
          release();
          resolve();
        }
      };
      // Detach only this wait's own listener and timer; never touch state
      // that belongs to a newer wait.
      const release = () => {
        clearTimeout(timer);
        this.cdp.off("Runtime.bindingCalled", handler);
        if (this.bindingHandler === handler) {
          this.bindingHandler = null;
        }
        if (this.abortPendingWait === abort) {
          this.abortPendingWait = null;
        }
      };
      const abort = (reason) => {
        release();
        reject(reason);
      };
      timer = setTimeout(() => {
        abort(new Error(`AudioInput: playback timed out after ${timeout}ms`));
      }, timeout);
      if (this.bindingHandler) {
        this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
      }
      this.bindingHandler = handler;
      this.abortPendingWait = abort;
      this.cdp.on("Runtime.bindingCalled", handler);
    });
  }
};
1303
+
1304
+ // src/audio/output.ts
1305
// Name of the CDP binding the page-side script calls to hand captured audio to Node.
var OUTPUT_BINDING = "__bpAudioOutputData";
// Page-side capture script. It is evaluated in the page (and registered for new
// documents) and installs `window.__bpAudioOutput`. The script may be injected
// more than once into the same document, so every native API it wraps is saved
// on `window` the first time and reused afterwards.
var AUDIO_OUTPUT_SCRIPT = `
(function() {
  // If already installed, stop any active capture but allow re-initialization
  // so that updated scripts (e.g. with new capture strategies) take effect.
  if (window.__bpAudioOutput) {
    if (window.__bpAudioOutput.isCapturing()) window.__bpAudioOutput.stop();
    // Keep existing allAudioContexts if available (preserves pre-override tracking)
  }

  var BUFFER_SIZE = 4096;
  // Flush once this many samples are buffered across ALL taps
  // (about 1s of audio for a single 48kHz source).
  var FLUSH_SAMPLES = 48000;
  var capturing = false;
  var capturedChunks = [];
  var totalSamples = 0;
  var flushCount = 0;
  var pendingTracks = [];
  var tappedTrackIds = {};

  // --- Per-context tap infrastructure ---
  // Preserve any AudioContexts tracked by a previous script version
  var allAudioContexts = window.__bpTrackedAudioContexts || [];
  // Use a WeakMap to associate taps with AudioContext instances
  // (native objects like AudioContext may not support custom properties)
  var contextTapMap = typeof WeakMap !== 'undefined' ? new WeakMap() : null;
  var contextTapList = []; // fallback: [{ctx, proc}]

  var OrigAudioContext = window.__bpOrigAudioContext || window.AudioContext || window.webkitAudioContext;
  // Save the native connect function once; on re-injection, reuse it to avoid double-wrapping
  var origConnect = window.__bpOrigConnect || AudioNode.prototype.connect;
  window.__bpOrigConnect = origConnect;

  // Our own capture context (48kHz) for WebRTC tracks and media elements
  var captureCtx = null;
  var captureProcessor = null;

  // Save original AudioContext constructor once
  if (!window.__bpOrigAudioContext) {
    window.__bpOrigAudioContext = OrigAudioContext;
  }

  // Override AudioContext constructor to track all instances (skip if already overridden)
  if (OrigAudioContext && !window.__bpAudioContextOverridden) {
    window.__bpAudioContextOverridden = true;
    window.AudioContext = function() {
      var ctx = new (Function.prototype.bind.apply(OrigAudioContext, [null].concat(Array.prototype.slice.call(arguments))))();
      allAudioContexts.push(ctx);
      // Auto-resume suspended contexts \u2014 CDP automation has no user gesture,
      // so Chrome suspends new AudioContexts by default. Without this, voice
      // agents' ScriptProcessorNodes never fire and no audio flows.
      if (ctx.state === 'suspended') {
        console.log('[bp:output] AudioContext created suspended (' + ctx.sampleRate + 'Hz), auto-resuming...');
        ctx.resume().then(function() {
          console.log('[bp:output] AudioContext resumed successfully (' + ctx.sampleRate + 'Hz, state: ' + ctx.state + ')');
        }).catch(function(e) {
          console.warn('[bp:output] AudioContext resume failed (' + ctx.sampleRate + 'Hz):', e);
        });
      } else {
        console.log('[bp:output] AudioContext created (' + ctx.sampleRate + 'Hz, state: ' + ctx.state + ')');
      }
      return ctx;
    };
    window.AudioContext.prototype = OrigAudioContext.prototype;
    Object.keys(OrigAudioContext).forEach(function(k) {
      try { window.AudioContext[k] = OrigAudioContext[k]; } catch(e) {}
    });
    if (window.webkitAudioContext) {
      window.webkitAudioContext = window.AudioContext;
    }
  }

  // Expose tracked contexts on window so re-injections preserve them
  window.__bpTrackedAudioContexts = allAudioContexts;

  // Look up an existing tap for a given AudioContext
  function findTap(ctx) {
    if (contextTapMap) return contextTapMap.get(ctx) || null;
    for (var i = 0; i < contextTapList.length; i++) {
      if (contextTapList[i].ctx === ctx) return contextTapList[i].proc;
    }
    return null;
  }

  // Store a tap for a given AudioContext
  function storeTap(ctx, proc) {
    if (contextTapMap) { contextTapMap.set(ctx, proc); }
    else { contextTapList.push({ ctx: ctx, proc: proc }); }
  }

  // Count stored taps
  function tapCount() {
    if (contextTapMap) {
      var count = 0;
      for (var i = 0; i < allAudioContexts.length; i++) {
        if (contextTapMap.has(allAudioContexts[i])) count++;
      }
      return count;
    }
    return contextTapList.length;
  }

  // Create or retrieve a ScriptProcessorNode tap for a specific AudioContext.
  // The tap lives in the SAME context as the source, avoiding cross-context errors.
  function getOrCreateTap(ctx) {
    var existing = findTap(ctx);
    if (existing) return existing;

    try {
      if (ctx.state === 'closed') return null;
      var channels = Math.min(ctx.destination.channelCount || 2, 2);
      if (channels < 1) channels = 1;
      var proc = ctx.createScriptProcessor(BUFFER_SIZE, channels, channels);
      proc.onaudioprocess = function(e) {
        if (!capturing) return;
        var left = new Float32Array(e.inputBuffer.getChannelData(0));
        var right = e.inputBuffer.numberOfChannels > 1
          ? new Float32Array(e.inputBuffer.getChannelData(1))
          : new Float32Array(left.length);
        capturedChunks.push({ left: left, right: right, sampleRate: ctx.sampleRate });
        totalSamples += left.length;
        if (totalSamples >= FLUSH_SAMPLES) {
          flushToNodeJs();
        }
      };
      // Must connect to destination to keep ScriptProcessorNode alive
      origConnect.call(proc, ctx.destination);
      storeTap(ctx, proc);
      return proc;
    } catch(e) {
      return null;
    }
  }

  // Override AudioNode.prototype.connect to tap connections to any AudioDestinationNode
  AudioNode.prototype.connect = function(destination) {
    var result = origConnect.apply(this, arguments);

    if (capturing && destination instanceof AudioDestinationNode) {
      try {
        var tap = getOrCreateTap(destination.context);
        // Don't connect the tap to itself
        if (tap && tap !== this) {
          origConnect.call(this, tap);
        }
      } catch(e) {}
    }
    return result;
  };

  var origPlay = window.__bpOrigPlay || HTMLMediaElement.prototype.play;
  window.__bpOrigPlay = origPlay;
  HTMLMediaElement.prototype.play = function() {
    if (capturing && !this.__bpCaptured) {
      this.__bpCaptured = true;
      try {
        if (!captureCtx) initCaptureCtx();
        var stream = this.captureStream ? this.captureStream() : null;
        if (stream && captureCtx) {
          var source = captureCtx.createMediaStreamSource(stream);
          origConnect.call(source, captureProcessor);
        }
      } catch(e) {}
    }
    return origPlay.apply(this, arguments);
  };

  // Intercept srcObject assignment to catch WebRTC streams attached to media elements.
  // Save the native descriptor once; on re-injection, reuse it so setters are not nested.
  var origSrcObjectDesc = window.__bpOrigSrcObjectDesc || Object.getOwnPropertyDescriptor(HTMLMediaElement.prototype, 'srcObject');
  if (origSrcObjectDesc && origSrcObjectDesc.set) {
    window.__bpOrigSrcObjectDesc = origSrcObjectDesc;
    Object.defineProperty(HTMLMediaElement.prototype, 'srcObject', {
      set: function(stream) {
        origSrcObjectDesc.set.call(this, stream);
        if (stream && stream.getAudioTracks) {
          var tracks = stream.getAudioTracks();
          for (var i = 0; i < tracks.length; i++) {
            if (capturing) {
              tapAudioTrack(tracks[i]);
            } else {
              pendingTracks.push(tracks[i]);
            }
          }
        }
      },
      get: origSrcObjectDesc.get,
      configurable: true
    });
  }

  // Initialize our own 48kHz capture context for WebRTC and media element tapping
  function initCaptureCtx() {
    captureCtx = new OrigAudioContext({ sampleRate: 48000 });
    captureProcessor = captureCtx.createScriptProcessor(BUFFER_SIZE, 2, 2);
    captureProcessor.onaudioprocess = function(e) {
      if (!capturing) return;
      var left = new Float32Array(e.inputBuffer.getChannelData(0));
      var right = new Float32Array(e.inputBuffer.getChannelData(1));
      capturedChunks.push({ left: left, right: right, sampleRate: 48000 });
      totalSamples += left.length;
      if (totalSamples >= FLUSH_SAMPLES) {
        flushToNodeJs();
      }
    };
    origConnect.call(captureProcessor, captureCtx.destination);
    // This context is created lazily (often mid-capture, without a user gesture),
    // so it can start suspended; a suspended context never runs its processor.
    if (captureCtx.state === 'suspended') {
      captureCtx.resume().catch(function() {});
    }
  }

  function uint8ToBase64(bytes) {
    var CHUNK = 8192;
    var parts = [];
    for (var i = 0; i < bytes.length; i += CHUNK) {
      var slice = bytes.subarray(i, Math.min(i + CHUNK, bytes.length));
      var binary = '';
      for (var j = 0; j < slice.length; j++) {
        binary += String.fromCharCode(slice[j]);
      }
      parts.push(binary);
    }
    return btoa(parts.join(''));
  }

  function flushGroup(chunks, rate) {
    var totalLen = 0;
    for (var i = 0; i < chunks.length; i++) {
      totalLen += chunks[i].left.length;
    }
    if (totalLen === 0) return;

    var left = new Float32Array(totalLen);
    var right = new Float32Array(totalLen);
    var offset = 0;
    for (var i = 0; i < chunks.length; i++) {
      left.set(chunks[i].left, offset);
      right.set(chunks[i].right, offset);
      offset += chunks[i].left.length;
    }

    var leftB64 = uint8ToBase64(new Uint8Array(left.buffer));
    var rightB64 = uint8ToBase64(new Uint8Array(right.buffer));

    flushCount++;

    try {
      if (typeof window.__bpAudioOutputData === 'function') {
        window.__bpAudioOutputData(JSON.stringify({
          left: leftB64,
          right: rightB64,
          sampleRate: rate,
          samples: totalLen
        }));
      }
    } catch(e) {}
  }

  function flushToNodeJs() {
    if (capturedChunks.length === 0) return;

    // Group chunks by sample rate to avoid mixing different-rate audio
    var byRate = {};
    for (var i = 0; i < capturedChunks.length; i++) {
      var rate = capturedChunks[i].sampleRate || 48000;
      if (!byRate[rate]) byRate[rate] = [];
      byRate[rate].push(capturedChunks[i]);
    }

    // Flush each sample rate group separately
    for (var rateKey in byRate) {
      if (byRate.hasOwnProperty(rateKey)) {
        flushGroup(byRate[rateKey], Number(rateKey));
      }
    }

    capturedChunks = [];
    totalSamples = 0;
  }

  // --- WebRTC interception (for apps that use RTCPeerConnection) ---
  var rtcTrackedStreams = [];
  var rtcPeerConnections = [];

  function tapAudioTrack(track) {
    try {
      if (tappedTrackIds[track.id]) return;
      tappedTrackIds[track.id] = true;
      if (!captureCtx) initCaptureCtx();
      var stream = new MediaStream([track]);
      var source = captureCtx.createMediaStreamSource(stream);
      origConnect.call(source, captureProcessor);
      rtcTrackedStreams.push(source);
    } catch(e) {}
  }

  function tapExistingPeerConnection(pc) {
    try {
      var receivers = pc.getReceivers ? pc.getReceivers() : [];
      for (var i = 0; i < receivers.length; i++) {
        if (receivers[i].track && receivers[i].track.kind === 'audio') {
          tapAudioTrack(receivers[i].track);
        }
      }
    } catch(e) {}
  }

  // Shared 'track' listener: tap right away while capturing, otherwise queue for start()
  function onPeerTrack(event) {
    if (event.track && event.track.kind === 'audio') {
      if (capturing) {
        tapAudioTrack(event.track);
      } else {
        pendingTracks.push(event.track);
      }
    }
  }

  // Save the native RTCPeerConnection once; on re-injection, reuse it to avoid double-wrapping
  var OrigRTC = window.__bpOrigRTC || (typeof RTCPeerConnection !== 'undefined' ? RTCPeerConnection : null);
  if (OrigRTC) {
    window.__bpOrigRTC = OrigRTC;

    window.RTCPeerConnection = function() {
      var pc = new (Function.prototype.bind.apply(OrigRTC, [null].concat(Array.prototype.slice.call(arguments))))();
      rtcPeerConnections.push(pc);
      pc.addEventListener('track', onPeerTrack);
      return pc;
    };
    window.RTCPeerConnection.prototype = OrigRTC.prototype;
    Object.keys(OrigRTC).forEach(function(k) {
      try { window.RTCPeerConnection[k] = OrigRTC[k]; } catch(e) {}
    });

    window.__bpTrackedPCs = rtcPeerConnections;
  }

  window.__bpAudioOutput = {
    start: function() {
      capturing = true;
      capturedChunks = [];
      totalSamples = 0;
      flushCount = 0;
      // tappedTrackIds is intentionally NOT reset: sources created for earlier
      // captures stay connected to the capture processor, so tapping the same
      // track again would feed its audio into the processor twice.

      // Resume any suspended capture context
      if (captureCtx && captureCtx.state === 'suspended') captureCtx.resume();

      // Create taps for all tracked AudioContexts (catches contexts created before capture)
      for (var i = 0; i < allAudioContexts.length; i++) {
        var ctx = allAudioContexts[i];
        if (ctx.state !== 'closed') {
          getOrCreateTap(ctx);
        }
      }

      // Drain pending WebRTC tracks
      for (var j = 0; j < pendingTracks.length; j++) {
        tapAudioTrack(pendingTracks[j]);
      }
      pendingTracks = [];

      // Tap existing peer connections
      for (var k = 0; k < rtcPeerConnections.length; k++) {
        tapExistingPeerConnection(rtcPeerConnections[k]);
      }

      // Scan existing media elements for srcObject with audio tracks
      var mediaEls = document.querySelectorAll('audio, video');
      for (var i = 0; i < mediaEls.length; i++) {
        var el = mediaEls[i];
        if (el.srcObject && el.srcObject.getAudioTracks && !el.__bpCaptured) {
          el.__bpCaptured = true;
          var tracks = el.srcObject.getAudioTracks();
          for (var j = 0; j < tracks.length; j++) {
            tapAudioTrack(tracks[j]);
          }
        }
      }

      // Watch for dynamically added media elements with srcObject
      if (typeof MutationObserver !== 'undefined') {
        if (window.__bpMediaObserver) {
          window.__bpMediaObserver.disconnect();
        }
        window.__bpMediaObserver = new MutationObserver(function(mutations) {
          for (var i = 0; i < mutations.length; i++) {
            var added = mutations[i].addedNodes;
            for (var j = 0; j < added.length; j++) {
              var node = added[j];
              if (node.nodeType !== 1) continue;
              var els = [];
              if (node.tagName === 'AUDIO' || node.tagName === 'VIDEO') els.push(node);
              else if (node.querySelectorAll) {
                var nested = node.querySelectorAll('audio, video');
                for (var k = 0; k < nested.length; k++) els.push(nested[k]);
              }
              for (var m = 0; m < els.length; m++) {
                var el = els[m];
                if (el.srcObject && el.srcObject.getAudioTracks && !el.__bpCaptured) {
                  el.__bpCaptured = true;
                  var tracks = el.srcObject.getAudioTracks();
                  for (var t = 0; t < tracks.length; t++) tapAudioTrack(tracks[t]);
                }
              }
            }
          }
        });
        window.__bpMediaObserver.observe(document, { childList: true, subtree: true });
      }
    },
    stop: function() {
      capturing = false;
      flushToNodeJs();
      // Disconnect MutationObserver
      if (window.__bpMediaObserver) {
        window.__bpMediaObserver.disconnect();
        window.__bpMediaObserver = null;
      }
    },
    isCapturing: function() { return capturing; },
    getBufferedSamples: function() { return totalSamples; },
    tapPC: function(pc) {
      if (!pc || typeof pc.getReceivers !== 'function') return false;
      if (rtcPeerConnections.indexOf(pc) === -1) {
        rtcPeerConnections.push(pc);
        // Attach the listener only on first registration. Connections created
        // through the wrapper already have one, and tapPC may be called for the
        // same connection on every capture start.
        pc.addEventListener('track', onPeerTrack);
      }
      if (capturing) {
        tapExistingPeerConnection(pc);
      }
      return true;
    },
    getStats: function() {
      return {
        audioContexts: allAudioContexts.filter(function(c) { return c.state !== 'closed'; }).length,
        contextTaps: tapCount(),
        audioNodes: captureCtx ? captureCtx.destination.numberOfInputs : 0,
        rtcConnections: rtcPeerConnections.length,
        mediaElements: document.querySelectorAll('audio, video').length,
        pendingTracks: pendingTracks.length,
        tappedTracks: Object.keys(tappedTrackIds).length,
        capturing: capturing,
        bufferedSamples: totalSamples,
        rtcDetails: rtcPeerConnections.map(function(pc) {
          try {
            var receivers = pc.getReceivers ? pc.getReceivers() : [];
            var senders = pc.getSenders ? pc.getSenders() : [];
            var audioReceivers = receivers.filter(function(r) { return r.track && r.track.kind === 'audio'; }).length;
            var audioSenders = senders.filter(function(s) { return s.track && s.track.kind === 'audio'; }).length;
            return {
              state: pc.connectionState || pc.iceConnectionState || 'unknown',
              audioReceivers: audioReceivers,
              audioSenders: audioSenders,
              tapped: receivers.some(function(r) { return r.track && tappedTrackIds[r.track.id]; })
            };
          } catch(e) { return { state: 'error', audioReceivers: 0, audioSenders: 0, tapped: false }; }
        }),
        mediaElementDetails: (function() {
          try {
            var els = document.querySelectorAll('audio, video');
            var details = [];
            for (var i = 0; i < els.length; i++) {
              var el = els[i];
              var hasSrcObject = !!(el.srcObject);
              var audioTracks = 0;
              if (el.srcObject && el.srcObject.getAudioTracks) {
                audioTracks = el.srcObject.getAudioTracks().length;
              }
              details.push({
                tag: el.tagName.toLowerCase(),
                hasSrcObject: hasSrcObject,
                hasSrc: !!(el.src || el.currentSrc),
                audioTracks: audioTracks,
                tapped: !!(el.__bpCaptured)
              });
            }
            return details;
          } catch(e) { return []; }
        })()
      };
    }
  };
})();
`;
1789
/**
 * Node-side controller for capturing a page's audio output over CDP.
 * Injects AUDIO_OUTPUT_SCRIPT into the page and collects the PCM chunks the
 * page sends back through the OUTPUT_BINDING binding.
 */
var AudioOutput = class {
  cdp;
  chunks = [];
  injected = false;
  capturing = false;
  bindingHandler = null;
  /** Identifier returned when the capture script was registered for new documents */
  scriptIdentifier = null;
  onChunkHandler;
  onDiagHandler;
  /** Timestamp of the first non-silent chunk received */
  firstChunkTime = null;
  constructor(cdp) {
    this.cdp = cdp;
  }
  /** Whether the audio output system has been set up */
  get isSetup() {
    return this.injected;
  }
  /** Whether audio is currently being captured */
  get isCapturing() {
    return this.capturing;
  }
  /**
   * Set up audio output capture.
   * Registers bindings and injects the capture script.
   */
  async setup() {
    if (this.injected) return;
    await this.cdp.send("Runtime.addBinding", { name: OUTPUT_BINDING });
    // A previous setup() that failed part-way may have left a listener behind.
    if (this.bindingHandler) {
      this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
    }
    this.bindingHandler = (params) => {
      if (params["name"] === OUTPUT_BINDING) {
        this.handleAudioData(params["payload"]);
      }
    };
    this.cdp.on("Runtime.bindingCalled", this.bindingHandler);
    const registration = await this.cdp.send("Page.addScriptToEvaluateOnNewDocument", {
      source: AUDIO_OUTPUT_SCRIPT
    });
    // Remembered so teardown() can unregister the script again.
    this.scriptIdentifier = registration?.identifier ?? null;
    await this.cdp.send("Runtime.evaluate", {
      expression: AUDIO_OUTPUT_SCRIPT,
      awaitPromise: false,
      userGesture: true
    });
    this.injected = true;
  }
  /**
   * Start capturing audio output.
   */
  async start() {
    if (!this.injected) {
      await this.setup();
    }
    this.chunks = [];
    this.firstChunkTime = null;
    this.capturing = true;
    // Resume suspended contexts first; evaluated with userGesture so the
    // page's autoplay policy allows it.
    await this.cdp.send("Runtime.evaluate", {
      expression: `(function() {
var resumed = [];
(window.__bpTrackedAudioContexts || []).forEach(function(ctx) {
if (ctx.state === 'suspended') {
ctx.resume().then(function() {
console.log('[bp:output] Resumed AudioContext (' + ctx.sampleRate + 'Hz) before capture');
});
resumed.push(ctx.sampleRate);
}
});
if (window.__bpAudioInput && window.__bpAudioInput.getContext) {
var inputCtx = window.__bpAudioInput.getContext();
if (inputCtx && inputCtx.state === 'suspended') {
inputCtx.resume();
resumed.push('input-' + inputCtx.sampleRate);
}
}
if (resumed.length) console.log('[bp:output] Resumed ' + resumed.length + ' contexts: ' + resumed.join(', '));
})()`,
      awaitPromise: false,
      userGesture: true
    });
    await this.cdp.send("Runtime.evaluate", {
      expression: "window.__bpAudioOutput && window.__bpAudioOutput.start()",
      awaitPromise: false
    });
    await this.discoverExistingPeerConnections();
    if (this.onDiagHandler) {
      try {
        const statsResult = await this.cdp.send(
          "Runtime.evaluate",
          {
            expression: "window.__bpAudioOutput && window.__bpAudioOutput.getStats()",
            returnByValue: true
          }
        );
        const stats = statsResult.result.value;
        if (stats) {
          this.onDiagHandler(
            `started \u2014 ${stats["audioContexts"]} AudioContexts, ${stats["contextTaps"]} taps, ${stats["rtcConnections"]} RTCPeerConnections, ${stats["mediaElements"]} MediaElements, ${stats["tappedTracks"]} tapped tracks`
          );
        }
      } catch {
        // Diagnostics are optional; a failed stats query must not fail start().
      }
    }
  }
  /**
   * Stop capturing and return all collected audio.
   */
  async stop() {
    if (!this.injected) {
      return emptyCaptureResult();
    }
    await this.cdp.send("Runtime.evaluate", {
      expression: "window.__bpAudioOutput && window.__bpAudioOutput.stop()",
      awaitPromise: false
    });
    this.capturing = false;
    // The page flushes its remaining buffer on stop; give that final binding
    // call time to arrive before merging.
    await sleep(250);
    return this.mergeChunks();
  }
  /**
   * Capture audio until silence is detected.
   *
   * Two-phase approach:
   * 1. **Wait phase**: Wait up to `noAudioTimeout` for the first non-silent chunk.
   *    The silence countdown does NOT tick during this phase, so slow voice agents
   *    (STT → LLM → TTS can take 5-15s) don't cause premature timeout.
   * 2. **Capture phase**: Once audio is detected, capture until `silenceTimeout` ms
   *    pass without sound, then stop.
   *
   * `maxDuration` caps the whole call. `pollInterval` (default 200 ms) sets how
   * often newly received chunks are inspected.
   *
   * Every chunk received since the previous poll is inspected, not only the most
   * recent one, so a loud chunk followed by a silent chunk in the same poll window
   * still counts as sound. A loud chunk is assumed to cover sound for its own
   * duration after arrival; if chunks stop arriving altogether, the silence
   * countdown still runs out.
   *
   * @returns Promise of the merged capture; rejects if stopping the capture fails.
   */
  async captureUntilSilence(options) {
    const silenceTimeout = options?.silenceTimeout ?? 1500;
    const silenceThreshold = options?.silenceThreshold ?? 0.01;
    const maxDuration = options?.maxDuration ?? 3e5;
    const noAudioTimeout = options?.noAudioTimeout ?? 15e3;
    const pollInterval = options?.pollInterval ?? 200;
    if (!this.capturing) {
      await this.start();
    }
    return new Promise((resolve, reject) => {
      let heardAudio = false;
      let soundUntil = 0;
      // When capture was already running, begin with the newest chunk received so far.
      let nextIndex = Math.max(0, this.chunks.length - 1);
      const startTime = Date.now();
      const finish = async (diagMessage) => {
        clearInterval(checkInterval);
        if (diagMessage) this.onDiagHandler?.(diagMessage);
        try {
          resolve(await this.stop());
        } catch (err) {
          reject(err);
        }
      };
      const checkInterval = setInterval(() => {
        const now = Date.now();
        const elapsed = now - startTime;
        if (elapsed > maxDuration) {
          void finish(`max duration reached (${maxDuration}ms), stopping`);
          return;
        }
        // The chunk list is replaced when capture is restarted; rescan from the beginning then.
        if (nextIndex > this.chunks.length) nextIndex = 0;
        for (; nextIndex < this.chunks.length; nextIndex++) {
          const chunk = this.chunks[nextIndex];
          if (!(calculateRMS(chunk.left) > silenceThreshold)) continue;
          if (!heardAudio) {
            heardAudio = true;
            this.onDiagHandler?.("first audio detected \u2014 silence countdown begins");
          }
          const chunkMs = chunk.samples / chunk.sampleRate * 1e3;
          soundUntil = Math.max(soundUntil, chunk.timestamp + (Number.isFinite(chunkMs) ? chunkMs : 0));
        }
        if (!heardAudio && elapsed > noAudioTimeout) {
          void finish(`no audio detected after ${noAudioTimeout}ms, stopping early`);
          return;
        }
        if (heardAudio && now - soundUntil > silenceTimeout) {
          void finish();
        }
      }, pollInterval);
    });
  }
  /**
   * Subscribe to real-time audio chunks as they arrive.
   */
  onData(handler) {
    this.onChunkHandler = handler;
  }
  /**
   * Subscribe to diagnostic messages (for --verbose).
   */
  onDiag(handler) {
    this.onDiagHandler = handler;
  }
  /**
   * Clean up: stop any active capture, remove the binding handler, and
   * unregister the new-document script. Local state is reset even when the
   * final stop fails (the failure is still propagated).
   */
  async teardown() {
    try {
      if (this.capturing) {
        await this.stop();
      }
    } finally {
      if (this.bindingHandler) {
        this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
        this.bindingHandler = null;
      }
      if (this.scriptIdentifier) {
        const identifier = this.scriptIdentifier;
        this.scriptIdentifier = null;
        try {
          await this.cdp.send("Page.removeScriptToEvaluateOnNewDocument", { identifier });
        } catch {
          // Best effort: the target may already be gone.
        }
      }
      this.onChunkHandler = void 0;
      this.onDiagHandler = void 0;
      this.capturing = false;
      this.injected = false;
    }
  }
  /**
   * Use CDP Runtime.queryObjects to find RTCPeerConnection instances
   * that were created before our override was injected, and tap their audio tracks.
   * Best effort: failures are ignored, and remote object handles are released
   * whether or not the discovery succeeds.
   */
  async discoverExistingPeerConnections() {
    let protoId;
    let arrayId;
    try {
      const protoResult = await this.cdp.send("Runtime.evaluate", {
        expression: 'typeof RTCPeerConnection !== "undefined" ? RTCPeerConnection.prototype : null',
        returnByValue: false
      });
      protoId = protoResult.result.objectId;
      if (!protoId) return;
      const queryResult = await this.cdp.send("Runtime.queryObjects", {
        prototypeObjectId: protoId
      });
      arrayId = queryResult.objects.objectId;
      if (!arrayId) return;
      const propsResult = await this.cdp.send("Runtime.getProperties", {
        objectId: arrayId,
        ownProperties: true
      });
      let tapped = 0;
      for (const prop of propsResult.result) {
        if (prop.name === "length" || prop.name === "__proto__") continue;
        const pcObjectId = prop.value?.objectId;
        if (!pcObjectId) continue;
        try {
          const outcome = await this.cdp.send("Runtime.callFunctionOn", {
            objectId: pcObjectId,
            functionDeclaration: "function() { if (window.__bpAudioOutput && window.__bpAudioOutput.tapPC) { return window.__bpAudioOutput.tapPC(this); } return false; }",
            returnByValue: true
          });
          // Count only connections the page-side script actually accepted.
          if (outcome?.result?.value === true) tapped++;
        } finally {
          await this.releaseRemoteObject(pcObjectId);
        }
      }
      if (tapped > 0) {
        this.onDiagHandler?.(`retroactively discovered ${tapped} existing RTCPeerConnection(s)`);
      }
    } catch {
      // Best effort: discovery is an optional enhancement on top of the page-side hooks.
    } finally {
      await this.releaseRemoteObject(arrayId);
      await this.releaseRemoteObject(protoId);
    }
  }
  /** Release a remote object handle, ignoring failures. */
  async releaseRemoteObject(objectId) {
    if (!objectId) return;
    try {
      await this.cdp.send("Runtime.releaseObject", { objectId });
    } catch {
      // Nothing useful can be done if the handle is already invalid.
    }
  }
  /** Decode one binding payload into a chunk, store it, and notify subscribers. */
  handleAudioData(payload) {
    try {
      const data = JSON.parse(payload);
      const chunk = {
        left: this.bytesToFloat32(base64ToBuffer(data.left)),
        right: this.bytesToFloat32(base64ToBuffer(data.right)),
        sampleRate: data.sampleRate,
        samples: data.samples,
        timestamp: Date.now()
      };
      this.chunks.push(chunk);
      const rms = calculateRMS(chunk.left);
      if (this.onDiagHandler) {
        const label = rms > 0.01 ? "audio" : "silence";
        this.onDiagHandler(`chunk: ${chunk.samples} samples, RMS=${rms.toFixed(4)} (${label})`);
      }
      if (this.firstChunkTime === null && rms > 1e-3) {
        this.firstChunkTime = Date.now();
      }
      this.onChunkHandler?.(chunk);
    } catch (err) {
      // A bad payload or a throwing subscriber must not break the CDP event
      // loop; surface it through diagnostics instead of dropping it silently.
      try {
        this.onDiagHandler?.(`failed to process audio chunk: ${err?.message ?? err}`);
      } catch {
        // Diagnostics must never break capture.
      }
    }
  }
  /**
   * View decoded bytes as 32-bit float samples.
   * The bytes are copied using the view's own offset and length, because a byte
   * view (for example a pooled Node Buffer) can sit inside a larger ArrayBuffer.
   */
  bytesToFloat32(bytes) {
    if (bytes instanceof ArrayBuffer) return new Float32Array(bytes);
    const { buffer, byteOffset, byteLength } = bytes;
    return new Float32Array(buffer.slice(byteOffset, byteOffset + byteLength));
  }
  /**
   * Concatenate the collected chunks into one capture result.
   * When chunks arrived at several sample rates, only the rate with the most
   * non-silent samples is kept.
   */
  mergeChunks() {
    if (this.chunks.length === 0) {
      return emptyCaptureResult();
    }
    const byRate = /* @__PURE__ */ new Map();
    for (const chunk of this.chunks) {
      const rate = chunk.sampleRate;
      if (!byRate.has(rate)) byRate.set(rate, []);
      byRate.get(rate).push(chunk);
    }
    let bestRate = this.chunks[0].sampleRate;
    let bestNonSilentSamples = 0;
    for (const [rate, chunks] of byRate) {
      let nonSilentSamples = 0;
      for (const chunk of chunks) {
        if (calculateRMS(chunk.left) > 0.01) {
          nonSilentSamples += chunk.left.length;
        }
      }
      if (nonSilentSamples > bestNonSilentSamples) {
        bestNonSilentSamples = nonSilentSamples;
        bestRate = rate;
      }
    }
    const bestChunks = byRate.get(bestRate);
    let totalLen = 0;
    for (const chunk of bestChunks) {
      totalLen += chunk.left.length;
    }
    const left = new Float32Array(totalLen);
    const right = new Float32Array(totalLen);
    let offset = 0;
    for (const chunk of bestChunks) {
      left.set(chunk.left, offset);
      right.set(chunk.right, offset);
      offset += chunk.left.length;
    }
    if (byRate.size > 1) {
      this.onDiagHandler?.(
        `mergeChunks: ${byRate.size} sample rates detected, using ${bestRate}Hz (${bestNonSilentSamples} non-silent samples)`
      );
    }
    return {
      left,
      right,
      sampleRate: bestRate,
      durationMs: totalLen / bestRate * 1e3,
      chunkCount: bestChunks.length
    };
  }
};
321
- function addBatchToPage(page) {
322
- const executor = new BatchExecutor(page);
323
- return Object.assign(page, {
324
- batch: (steps, options) => executor.execute(steps, options)
2107
/**
 * Build the capture result used when no audio was collected:
 * two distinct zero-length channels at the default 48 kHz rate.
 */
function emptyCaptureResult() {
  const noSamples = () => new Float32Array(0);
  const result = { left: noSamples(), right: noSamples(), sampleRate: 48e3 };
  result.durationMs = 0;
  result.chunkCount = 0;
  return result;
}
2116
/** Return a promise that fulfils (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
2119
+
2120
+ // src/audio/transcribe.ts
2121
/**
 * Transcribe captured audio with the OpenAI transcription endpoint.
 *
 * @param audio Capture result; reads `left`, `right`, `sampleRate` and `durationMs`.
 * @param options Optional `apiKey`, `model` (default "whisper-1"),
 *   `responseFormat` (default "text"), `language` and `prompt`.
 * @returns `{ text, audioDurationMs, apiDurationMs }`. Empty audio yields an
 *   empty text without calling the API.
 * @throws Error when no API key is available or the API responds with a
 *   non-OK status.
 */
async function transcribe(audio, options) {
  const apiKey = options?.apiKey ?? getEnvVar("OPENAI_API_KEY");
  if (!apiKey) {
    throw new Error(
      "OpenAI API key required for transcription. Set OPENAI_API_KEY environment variable or pass apiKey option."
    );
  }
  if (audio.left.length === 0) {
    return { text: "", audioDurationMs: 0, apiDurationMs: 0 };
  }
  const model = options?.model ?? "whisper-1";
  const responseFormat = options?.responseFormat ?? "text";
  const wavBuffer = pcmToWav({
    left: audio.left,
    right: audio.right.length > 0 ? audio.right : void 0,
    sampleRate: audio.sampleRate
  });
  // Hand-assembled multipart/form-data body: file part, text fields, closing boundary.
  const boundary = `----bpAudio${Date.now()}`;
  const parts = [];
  appendFormField(parts, boundary, "file", new Uint8Array(wavBuffer), "audio.wav", "audio/wav");
  appendFormTextField(parts, boundary, "model", model);
  appendFormTextField(parts, boundary, "response_format", responseFormat);
  if (options?.language) {
    appendFormTextField(parts, boundary, "language", options.language);
  }
  if (options?.prompt) {
    appendFormTextField(parts, boundary, "prompt", options.prompt);
  }
  parts.push(new TextEncoder().encode(`\r\n--${boundary}--\r\n`));
  const totalLength = parts.reduce((sum, p) => sum + p.length, 0);
  const body = new Uint8Array(totalLength);
  let offset = 0;
  for (const part of parts) {
    body.set(part, offset);
    offset += part.length;
  }
  const start = Date.now();
  const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": `multipart/form-data; boundary=${boundary}`
    },
    body
  });
  const apiDurationMs = Date.now() - start;
  if (!response.ok) {
    const errorBody = await response.text().catch(() => "");
    throw new Error(`Whisper API error (${response.status}): ${errorBody}`);
  }
  let text;
  if (responseFormat === "text") {
    text = (await response.text()).trim();
  } else if (responseFormat === "srt" || responseFormat === "vtt") {
    // Subtitle formats are returned as plain text, not JSON; parsing them as
    // JSON would throw. They are passed through unmodified.
    text = await response.text();
  } else {
    const json = await response.json();
    text = json.text ?? "";
  }
  return {
    text,
    audioDurationMs: audio.durationMs,
    apiDurationMs
  };
}
2187
/** Report whether a non-empty OPENAI_API_KEY can be read from the environment. */
function isTranscriptionAvailable() {
  const configuredKey = getEnvVar("OPENAI_API_KEY");
  return Boolean(configuredKey);
}
2190
/**
 * Read an environment variable through `globalThis.process.env`.
 * Returns undefined when there is no `process` global or it has no `env`.
 */
function getEnvVar(name) {
  const proc = globalThis.process;
  if (typeof proc === "undefined" || !proc.env) {
    return void 0;
  }
  return proc.env[name];
}
2196
/**
 * Append one plain-text multipart field to `parts` as UTF-8 bytes:
 * boundary line, Content-Disposition header, blank line, then the value.
 */
function appendFormTextField(parts, boundary, name, value) {
  const lines = [
    "",
    `--${boundary}`,
    `Content-Disposition: form-data; name="${name}"`,
    "",
    `${value}`
  ];
  const encoded = new TextEncoder().encode(lines.join("\r\n"));
  parts.push(encoded);
}
2204
/**
 * Append one file multipart field to `parts`: first the UTF-8 encoded part
 * header (boundary, Content-Disposition with filename, Content-Type, blank
 * line), then the raw `data` bytes as a separate entry.
 */
function appendFormField(parts, boundary, name, data, filename, contentType) {
  const headerLines = [
    "",
    `--${boundary}`,
    `Content-Disposition: form-data; name="${name}"; filename="${filename}"`,
    `Content-Type: ${contentType}`,
    "",
    ""
  ];
  const encodedHeader = new TextEncoder().encode(headerLines.join("\r\n"));
  parts.push(encodedHeader, data);
}
327
2214
 
328
2215
  // src/cdp/protocol.ts
@@ -1020,7 +2907,7 @@ async function isElementAttached(cdp, selector, contextId) {
1020
2907
  const result = await cdp.send("Runtime.evaluate", params);
1021
2908
  return result.result.value === true;
1022
2909
  }
1023
- function sleep(ms) {
2910
/**
 * Returns a promise that resolves (with no value) after a timer delay.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>}
 */
function sleep2(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
1026
2913
  async function waitForElement(cdp, selector, options = {}) {
@@ -1046,7 +2933,7 @@ async function waitForElement(cdp, selector, options = {}) {
1046
2933
  if (conditionMet) {
1047
2934
  return { success: true, waitedMs: Date.now() - startTime };
1048
2935
  }
1049
- await sleep(pollInterval);
2936
+ await sleep2(pollInterval);
1050
2937
  }
1051
2938
  return { success: false, waitedMs: Date.now() - startTime };
1052
2939
  }
@@ -1075,7 +2962,7 @@ async function waitForAnyElement(cdp, selectors, options = {}) {
1075
2962
  return { success: true, selector, waitedMs: Date.now() - startTime };
1076
2963
  }
1077
2964
  }
1078
- await sleep(pollInterval);
2965
+ await sleep2(pollInterval);
1079
2966
  }
1080
2967
  return { success: false, waitedMs: Date.now() - startTime };
1081
2968
  }
@@ -1124,7 +3011,7 @@ async function waitForNavigation(cdp, options = {}) {
1124
3011
  }
1125
3012
  const pollUrl = async () => {
1126
3013
  while (!resolved && Date.now() < startTime + timeout) {
1127
- await sleep(100);
3014
+ await sleep2(100);
1128
3015
  if (resolved) return;
1129
3016
  try {
1130
3017
  const currentUrl = await getCurrentUrl(cdp);
@@ -1454,6 +3341,10 @@ var Page = class {
1454
3341
  currentFrameContextId = null;
1455
3342
  /** Last matched selector from findElement (for selectorUsed tracking) */
1456
3343
  _lastMatchedSelector;
3344
+ /** Audio input controller (lazy-initialized) */
3345
+ _audioInput;
3346
+ /** Audio output controller (lazy-initialized) */
3347
+ _audioOutput;
1457
3348
  constructor(cdp, targetId) {
1458
3349
  this.cdp = cdp;
1459
3350
  this._targetId = targetId;
@@ -1701,7 +3592,7 @@ var Page = class {
1701
3592
  key: char
1702
3593
  });
1703
3594
  if (delay > 0) {
1704
- await sleep2(delay);
3595
+ await sleep3(delay);
1705
3596
  }
1706
3597
  }
1707
3598
  return true;
@@ -1742,7 +3633,7 @@ var Page = class {
1742
3633
  async selectCustom(config, options = {}) {
1743
3634
  const { trigger, option, value, match = "text" } = config;
1744
3635
  await this.click(trigger, options);
1745
- await sleep2(100);
3636
+ await sleep3(100);
1746
3637
  let optionSelector;
1747
3638
  const optionSelectors = Array.isArray(option) ? option : [option];
1748
3639
  if (match === "contains") {
@@ -1854,7 +3745,7 @@ var Page = class {
1854
3745
  if (shouldWait === true) {
1855
3746
  await this.waitForNavigation({ timeout: options.timeout ?? DEFAULT_TIMEOUT2 });
1856
3747
  } else if (shouldWait === "auto") {
1857
- await Promise.race([this.waitForNavigation({ timeout: 1e3, optional: true }), sleep2(500)]);
3748
+ await Promise.race([this.waitForNavigation({ timeout: 1e3, optional: true }), sleep3(500)]);
1858
3749
  }
1859
3750
  return true;
1860
3751
  }
@@ -1872,7 +3763,7 @@ var Page = class {
1872
3763
  this.waitForNavigation({ timeout: 1e3, optional: true }).then(
1873
3764
  (success) => success ? "nav" : null
1874
3765
  ),
1875
- sleep2(500).then(() => "timeout")
3766
+ sleep3(500).then(() => "timeout")
1876
3767
  ]);
1877
3768
  if (navigationDetected === "nav") {
1878
3769
  return true;
@@ -1886,7 +3777,7 @@ var Page = class {
1886
3777
  if (shouldWait === true) {
1887
3778
  await this.waitForNavigation({ timeout: options.timeout ?? DEFAULT_TIMEOUT2 });
1888
3779
  } else if (shouldWait === "auto") {
1889
- await sleep2(100);
3780
+ await sleep3(100);
1890
3781
  }
1891
3782
  }
1892
3783
  return true;
@@ -2883,7 +4774,7 @@ var Page = class {
2883
4774
  lastError = e;
2884
4775
  if (attempt < retries) {
2885
4776
  this.rootNodeId = null;
2886
- await sleep2(delay);
4777
+ await sleep3(delay);
2887
4778
  continue;
2888
4779
  }
2889
4780
  }
@@ -3055,8 +4946,107 @@ var Page = class {
3055
4946
  clickCount: 1
3056
4947
  });
3057
4948
  }
4949
+ // ============ Audio I/O ============
4950
+ /**
4951
+ * Audio input controller (fake microphone).
4952
+ * Lazy-initialized on first access.
4953
+ */
4954
+ get audioInput() {
4955
+ if (!this._audioInput) {
4956
+ this._audioInput = new AudioInput(this.cdp);
4957
+ }
4958
+ return this._audioInput;
4959
+ }
4960
+ /**
4961
+ * Audio output capture controller.
4962
+ * Lazy-initialized on first access.
4963
+ */
4964
+ get audioOutput() {
4965
+ if (!this._audioOutput) {
4966
+ this._audioOutput = new AudioOutput(this.cdp);
4967
+ }
4968
+ return this._audioOutput;
4969
+ }
4970
+ /**
4971
+ * Set up both audio input (fake microphone) and output (capture).
4972
+ * Must be called before navigating to the page that will use audio.
4973
+ */
4974
+ async setupAudio() {
4975
+ try {
4976
+ await this.cdp.send("Input.dispatchMouseEvent", {
4977
+ type: "mousePressed",
4978
+ x: 0,
4979
+ y: 0,
4980
+ button: "left",
4981
+ clickCount: 1
4982
+ });
4983
+ await this.cdp.send("Input.dispatchMouseEvent", {
4984
+ type: "mouseReleased",
4985
+ x: 0,
4986
+ y: 0,
4987
+ button: "left",
4988
+ clickCount: 1
4989
+ });
4990
+ } catch {
4991
+ }
4992
+ await this.audioInput.setup();
4993
+ await this.audioOutput.setup();
4994
+ }
4995
+ /**
4996
+ * Full audio round-trip: feed input audio, capture the response.
4997
+ *
4998
+ * 1. Starts capturing output
4999
+ * 2. Feeds input audio as microphone data
5000
+ * 3. Waits for the page to respond and then go silent
5001
+ * 4. Returns the captured response audio with latency metrics
5002
+ *
5003
+ * @example
5004
+ * ```typescript
5005
+ * await page.setupAudio();
5006
+ * await page.goto('https://voice-agent.example.com');
5007
+ * const result = await page.audioRoundTrip({
5008
+ * input: wavFileBytes,
5009
+ * silenceTimeout: 3000,
5010
+ * });
5011
+ * console.log(`Response: ${result.audio.durationMs}ms, latency: ${result.latencyMs}ms`);
5012
+ * ```
5013
+ */
5014
+ async audioRoundTrip(options) {
5015
+ if (!this.audioInput.isSetup || !this.audioOutput.isSetup) {
5016
+ await this.setupAudio();
5017
+ }
5018
+ const start = Date.now();
5019
+ await this.audioOutput.start();
5020
+ if (options.preDelay && options.preDelay > 0) {
5021
+ await sleep3(options.preDelay);
5022
+ }
5023
+ const inputDone = this.audioInput.play(options.input, {
5024
+ waitForEnd: !!options.sendSelector
5025
+ });
5026
+ if (options.sendSelector) {
5027
+ await inputDone.catch(() => {
5028
+ });
5029
+ await this.click(options.sendSelector);
5030
+ }
5031
+ const audio = await this.audioOutput.captureUntilSilence({
5032
+ silenceTimeout: options.silenceTimeout ?? 1500,
5033
+ silenceThreshold: options.silenceThreshold ?? 0.01,
5034
+ maxDuration: options.timeout ?? 12e4
5035
+ });
5036
+ await this.audioInput.stop();
5037
+ if (!options.sendSelector) {
5038
+ await inputDone.catch(() => {
5039
+ });
5040
+ }
5041
+ const firstChunkTime = this.audioOutput.firstChunkTime;
5042
+ return {
5043
+ audio,
5044
+ latencyMs: firstChunkTime !== null ? firstChunkTime - start : -1,
5045
+ totalMs: Date.now() - start
5046
+ };
5047
+ }
3058
5048
  };
3059
- function sleep2(ms) {
5049
/**
 * Returns a promise that resolves (with no value) after a timer delay.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>}
 */
function sleep3(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
3062
5052
 
@@ -3415,6 +5405,8 @@ function disableTracing() {
3415
5405
  }
3416
5406
  // Annotate the CommonJS export names for ESM import in node:
3417
5407
  0 && (module.exports = {
5408
+ AudioInput,
5409
+ AudioOutput,
3418
5410
  BatchExecutor,
3419
5411
  Browser,
3420
5412
  BrowserBaseProvider,
@@ -3428,6 +5420,8 @@ function disableTracing() {
3428
5420
  TimeoutError,
3429
5421
  Tracer,
3430
5422
  addBatchToPage,
5423
+ bufferToBase64,
5424
+ calculateRMS,
3431
5425
  connect,
3432
5426
  createCDPClient,
3433
5427
  createProvider,
@@ -3435,8 +5429,17 @@ function disableTracing() {
3435
5429
  disableTracing,
3436
5430
  discoverTargets,
3437
5431
  enableTracing,
5432
+ generateSilence,
5433
+ generateTone,
5434
+ getAudioChromeFlags,
3438
5435
  getBrowserWebSocketUrl,
3439
5436
  getTracer,
5437
+ grantAudioPermissions,
5438
+ isTranscriptionAvailable,
5439
+ parseWavHeader,
5440
+ pcmToWav,
5441
+ transcribe,
5442
+ validateSteps,
3440
5443
  waitForAnyElement,
3441
5444
  waitForElement,
3442
5445
  waitForNavigation,