browser-pilot 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -1
- package/dist/actions.cjs +465 -6
- package/dist/actions.d.cts +22 -3
- package/dist/actions.d.ts +22 -3
- package/dist/actions.mjs +5 -3
- package/dist/browser.cjs +1350 -14
- package/dist/browser.d.cts +3 -3
- package/dist/browser.d.ts +3 -3
- package/dist/browser.mjs +2 -2
- package/dist/{chunk-JN44FHTK.mjs → chunk-7OSR2CAE.mjs} +1429 -14
- package/dist/chunk-KKW2SZLV.mjs +741 -0
- package/dist/cli.mjs +6150 -103
- package/dist/index.cjs +2026 -23
- package/dist/index.d.cts +142 -6
- package/dist/index.d.ts +142 -6
- package/dist/index.mjs +357 -10
- package/dist/providers.d.cts +2 -2
- package/dist/providers.d.ts +2 -2
- package/dist/{types-D_uDqh0Z.d.cts → types--wXNHUwt.d.cts} +1 -1
- package/dist/{types-D_uDqh0Z.d.ts → types--wXNHUwt.d.ts} +1 -1
- package/dist/{types-DklIxnbO.d.cts → types-CYw-7vx1.d.cts} +244 -1
- package/dist/{types-Pv8KzZ6l.d.ts → types-DOGsEYQa.d.ts} +244 -1
- package/package.json +3 -3
- package/dist/chunk-ZIQA4JOT.mjs +0 -226
- package/dist/chunk-ZTQ37YQT.mjs +0 -283
- package/dist/cli.cjs +0 -6377
- package/dist/cli.d.cts +0 -25
- package/dist/cli.d.ts +0 -25
package/dist/browser.cjs
CHANGED
|
@@ -710,10 +710,32 @@ var BatchExecutor = class {
|
|
|
710
710
|
await this.page.switchToMain();
|
|
711
711
|
return {};
|
|
712
712
|
}
|
|
713
|
-
default:
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
713
|
+
default: {
|
|
714
|
+
const action = step.action;
|
|
715
|
+
const aliases = {
|
|
716
|
+
execute: "evaluate",
|
|
717
|
+
navigate: "goto",
|
|
718
|
+
input: "fill",
|
|
719
|
+
tap: "click",
|
|
720
|
+
go: "goto",
|
|
721
|
+
run: "evaluate",
|
|
722
|
+
capture: "screenshot",
|
|
723
|
+
inspect: "snapshot",
|
|
724
|
+
enter: "press",
|
|
725
|
+
open: "goto",
|
|
726
|
+
visit: "goto",
|
|
727
|
+
eval: "evaluate",
|
|
728
|
+
js: "evaluate",
|
|
729
|
+
snap: "snapshot",
|
|
730
|
+
frame: "switchFrame"
|
|
731
|
+
};
|
|
732
|
+
const suggestion = aliases[action.toLowerCase()];
|
|
733
|
+
const hint = suggestion ? ` Did you mean "${suggestion}"?` : "";
|
|
734
|
+
const valid = "goto, click, fill, type, select, check, uncheck, submit, press, focus, hover, scroll, wait, snapshot, screenshot, evaluate, text, switchFrame, switchToMain";
|
|
735
|
+
throw new Error(`Unknown action "${action}".${hint}
|
|
736
|
+
|
|
737
|
+
Valid actions: ${valid}`);
|
|
738
|
+
}
|
|
717
739
|
}
|
|
718
740
|
}
|
|
719
741
|
/**
|
|
@@ -727,6 +749,1217 @@ var BatchExecutor = class {
|
|
|
727
749
|
}
|
|
728
750
|
};
|
|
729
751
|
|
|
752
|
+
// src/audio/encoding.ts
|
|
753
|
+
function bufferToBase64(data) {
|
|
754
|
+
const bytes = data instanceof Uint8Array ? data : new Uint8Array(data);
|
|
755
|
+
let binary = "";
|
|
756
|
+
for (let i = 0; i < bytes.length; i++) {
|
|
757
|
+
binary += String.fromCharCode(bytes[i]);
|
|
758
|
+
}
|
|
759
|
+
return btoa(binary);
|
|
760
|
+
}
|
|
761
|
+
function base64ToBuffer(b64) {
|
|
762
|
+
const binary = atob(b64);
|
|
763
|
+
const bytes = new Uint8Array(binary.length);
|
|
764
|
+
for (let i = 0; i < binary.length; i++) {
|
|
765
|
+
bytes[i] = binary.charCodeAt(i);
|
|
766
|
+
}
|
|
767
|
+
return bytes;
|
|
768
|
+
}
|
|
769
|
+
function calculateRMS(samples) {
|
|
770
|
+
if (samples.length === 0) return 0;
|
|
771
|
+
let sum = 0;
|
|
772
|
+
for (let i = 0; i < samples.length; i++) {
|
|
773
|
+
sum += samples[i] * samples[i];
|
|
774
|
+
}
|
|
775
|
+
return Math.sqrt(sum / samples.length);
|
|
776
|
+
}
|
|
777
|
+
|
|
778
|
+
// src/audio/permissions.ts
|
|
779
|
+
async function grantAudioPermissions(cdp, origin) {
|
|
780
|
+
await cdp.send("Browser.grantPermissions", {
|
|
781
|
+
permissions: ["audioCapture"],
|
|
782
|
+
origin: origin ?? ""
|
|
783
|
+
});
|
|
784
|
+
await cdp.send("Page.addScriptToEvaluateOnNewDocument", {
|
|
785
|
+
source: PERMISSIONS_OVERRIDE_SCRIPT
|
|
786
|
+
});
|
|
787
|
+
}
|
|
788
|
+
var PERMISSIONS_OVERRIDE_SCRIPT = `
|
|
789
|
+
(function() {
|
|
790
|
+
if (window.__bpPermissionsPatched) return;
|
|
791
|
+
window.__bpPermissionsPatched = true;
|
|
792
|
+
|
|
793
|
+
var origQuery = navigator.permissions.query.bind(navigator.permissions);
|
|
794
|
+
navigator.permissions.query = function(desc) {
|
|
795
|
+
if (desc && (desc.name === 'microphone' || desc.name === 'audio-capture')) {
|
|
796
|
+
return Promise.resolve({
|
|
797
|
+
state: 'granted',
|
|
798
|
+
onchange: null,
|
|
799
|
+
addEventListener: function() {},
|
|
800
|
+
removeEventListener: function() {},
|
|
801
|
+
dispatchEvent: function() { return true; }
|
|
802
|
+
});
|
|
803
|
+
}
|
|
804
|
+
return origQuery(desc);
|
|
805
|
+
};
|
|
806
|
+
})();
|
|
807
|
+
`;
|
|
808
|
+
|
|
809
|
+
// src/audio/input.ts
|
|
810
|
+
var INPUT_BINDING = "__bpAudioInputDone";
|
|
811
|
+
var AUDIO_INPUT_SCRIPT = `
|
|
812
|
+
(function() {
|
|
813
|
+
if (window.__bpAudioInput) return;
|
|
814
|
+
|
|
815
|
+
var audioCtx = null;
|
|
816
|
+
var sourceNode = null;
|
|
817
|
+
var destinationNode = null;
|
|
818
|
+
var fakeStream = null;
|
|
819
|
+
var silenceGain = null;
|
|
820
|
+
var silenceOsc = null;
|
|
821
|
+
var isPlaying = false;
|
|
822
|
+
|
|
823
|
+
function ensureFakeStream() {
|
|
824
|
+
if (fakeStream) return fakeStream;
|
|
825
|
+
// Use the original AudioContext to avoid being tracked by our output override
|
|
826
|
+
var CtorToUse = window.__bpOrigAudioContext || window.AudioContext || window.webkitAudioContext;
|
|
827
|
+
audioCtx = new CtorToUse({ sampleRate: 48000 });
|
|
828
|
+
// Auto-resume if suspended (CDP automation has no user gesture)
|
|
829
|
+
if (audioCtx.state === 'suspended') {
|
|
830
|
+
console.log('[bp:input] AudioContext suspended, auto-resuming...');
|
|
831
|
+
audioCtx.resume().then(function() {
|
|
832
|
+
console.log('[bp:input] AudioContext resumed (' + audioCtx.state + ')');
|
|
833
|
+
}).catch(function(e) {
|
|
834
|
+
console.warn('[bp:input] AudioContext resume failed:', e);
|
|
835
|
+
});
|
|
836
|
+
}
|
|
837
|
+
destinationNode = audioCtx.createMediaStreamDestination();
|
|
838
|
+
|
|
839
|
+
// Start with silence so the stream always has active tracks
|
|
840
|
+
silenceGain = audioCtx.createGain();
|
|
841
|
+
silenceGain.gain.value = 0;
|
|
842
|
+
silenceOsc = audioCtx.createOscillator();
|
|
843
|
+
silenceOsc.connect(silenceGain);
|
|
844
|
+
silenceGain.connect(destinationNode);
|
|
845
|
+
silenceOsc.start();
|
|
846
|
+
|
|
847
|
+
fakeStream = destinationNode.stream;
|
|
848
|
+
console.log('[bp:input] Fake mic stream created (48kHz, ' + fakeStream.getAudioTracks().length + ' tracks)');
|
|
849
|
+
return fakeStream;
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
function playAudio(base64Data) {
|
|
853
|
+
ensureFakeStream();
|
|
854
|
+
|
|
855
|
+
var resumePromise = audioCtx.state === 'suspended'
|
|
856
|
+
? audioCtx.resume()
|
|
857
|
+
: Promise.resolve();
|
|
858
|
+
|
|
859
|
+
return resumePromise.then(function() {
|
|
860
|
+
if (sourceNode) {
|
|
861
|
+
try { sourceNode.stop(); } catch(e) {}
|
|
862
|
+
sourceNode.disconnect();
|
|
863
|
+
sourceNode = null;
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
var binaryStr = atob(base64Data);
|
|
867
|
+
var bytes = new Uint8Array(binaryStr.length);
|
|
868
|
+
for (var i = 0; i < binaryStr.length; i++) {
|
|
869
|
+
bytes[i] = binaryStr.charCodeAt(i);
|
|
870
|
+
}
|
|
871
|
+
console.log('[bp:input] Decoding audio (' + bytes.length + ' bytes)...');
|
|
872
|
+
|
|
873
|
+
return audioCtx.decodeAudioData(bytes.buffer.slice(0));
|
|
874
|
+
}).then(function(audioBuffer) {
|
|
875
|
+
sourceNode = audioCtx.createBufferSource();
|
|
876
|
+
sourceNode.buffer = audioBuffer;
|
|
877
|
+
sourceNode.connect(destinationNode);
|
|
878
|
+
|
|
879
|
+
var durationMs = Math.round(audioBuffer.duration * 1000);
|
|
880
|
+
console.log('[bp:input] Playing ' + durationMs + 'ms audio (' + audioBuffer.sampleRate + 'Hz, ' + audioBuffer.numberOfChannels + 'ch)');
|
|
881
|
+
|
|
882
|
+
return new Promise(function(resolve) {
|
|
883
|
+
sourceNode.onended = function() {
|
|
884
|
+
isPlaying = false;
|
|
885
|
+
console.log('[bp:input] Playback ended');
|
|
886
|
+
resolve(true);
|
|
887
|
+
try {
|
|
888
|
+
if (typeof window.__bpAudioInputDone === 'function') {
|
|
889
|
+
window.__bpAudioInputDone('done');
|
|
890
|
+
}
|
|
891
|
+
} catch(e) {}
|
|
892
|
+
};
|
|
893
|
+
isPlaying = true;
|
|
894
|
+
sourceNode.start();
|
|
895
|
+
});
|
|
896
|
+
});
|
|
897
|
+
}
|
|
898
|
+
|
|
899
|
+
function stopAudio() {
|
|
900
|
+
if (sourceNode) {
|
|
901
|
+
try { sourceNode.stop(); } catch(e) {}
|
|
902
|
+
sourceNode.disconnect();
|
|
903
|
+
sourceNode = null;
|
|
904
|
+
}
|
|
905
|
+
isPlaying = false;
|
|
906
|
+
console.log('[bp:input] Stopped');
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
var origGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
|
|
910
|
+
|
|
911
|
+
navigator.mediaDevices.getUserMedia = function(constraints) {
|
|
912
|
+
if (constraints && constraints.audio) {
|
|
913
|
+
var stream = ensureFakeStream();
|
|
914
|
+
console.log('[bp:input] getUserMedia intercepted \u2014 returning fake mic' + (constraints.video ? ' + real video' : ''));
|
|
915
|
+
|
|
916
|
+
if (constraints.video) {
|
|
917
|
+
// Get real video + our fake audio
|
|
918
|
+
return origGetUserMedia({ video: constraints.video }).then(function(realStream) {
|
|
919
|
+
var combined = new MediaStream(
|
|
920
|
+
stream.getAudioTracks().concat(realStream.getVideoTracks())
|
|
921
|
+
);
|
|
922
|
+
return combined;
|
|
923
|
+
});
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
// Return a clone so consumers can't stop our source track
|
|
927
|
+
return Promise.resolve(stream.clone());
|
|
928
|
+
}
|
|
929
|
+
return origGetUserMedia(constraints);
|
|
930
|
+
};
|
|
931
|
+
|
|
932
|
+
var origEnumerate = navigator.mediaDevices.enumerateDevices.bind(navigator.mediaDevices);
|
|
933
|
+
navigator.mediaDevices.enumerateDevices = function() {
|
|
934
|
+
return origEnumerate().then(function(devices) {
|
|
935
|
+
var hasMic = devices.some(function(d) { return d.kind === 'audioinput'; });
|
|
936
|
+
if (!hasMic) {
|
|
937
|
+
devices.push({
|
|
938
|
+
deviceId: 'bp-fake-mic',
|
|
939
|
+
kind: 'audioinput',
|
|
940
|
+
label: 'Default Audio Input',
|
|
941
|
+
groupId: 'bp-audio',
|
|
942
|
+
toJSON: function() {
|
|
943
|
+
return { deviceId: this.deviceId, kind: this.kind, label: this.label, groupId: this.groupId };
|
|
944
|
+
}
|
|
945
|
+
});
|
|
946
|
+
}
|
|
947
|
+
return devices;
|
|
948
|
+
});
|
|
949
|
+
};
|
|
950
|
+
|
|
951
|
+
window.__bpAudioInput = {
|
|
952
|
+
play: playAudio,
|
|
953
|
+
stop: stopAudio,
|
|
954
|
+
isPlaying: function() { return isPlaying; },
|
|
955
|
+
getState: function() {
|
|
956
|
+
return {
|
|
957
|
+
contextState: audioCtx ? audioCtx.state : 'not-created',
|
|
958
|
+
isPlaying: isPlaying,
|
|
959
|
+
sampleRate: audioCtx ? audioCtx.sampleRate : 0
|
|
960
|
+
};
|
|
961
|
+
},
|
|
962
|
+
getContext: function() { return audioCtx; }
|
|
963
|
+
};
|
|
964
|
+
|
|
965
|
+
console.log('[bp:input] Audio input override installed (getUserMedia + enumerateDevices)');
|
|
966
|
+
})();
|
|
967
|
+
`;
|
|
968
|
+
var AudioInput = class {
|
|
969
|
+
cdp;
|
|
970
|
+
injected = false;
|
|
971
|
+
bindingRegistered = false;
|
|
972
|
+
bindingHandler = null;
|
|
973
|
+
constructor(cdp) {
|
|
974
|
+
this.cdp = cdp;
|
|
975
|
+
}
|
|
976
|
+
/** Whether the audio input system has been set up */
|
|
977
|
+
get isSetup() {
|
|
978
|
+
return this.injected;
|
|
979
|
+
}
|
|
980
|
+
/**
|
|
981
|
+
* Set up audio input injection.
|
|
982
|
+
* Must be called before navigating to the page that will use getUserMedia.
|
|
983
|
+
* Grants permissions and injects the getUserMedia override.
|
|
984
|
+
*/
|
|
985
|
+
async setup() {
|
|
986
|
+
if (this.injected) return;
|
|
987
|
+
try {
|
|
988
|
+
const resp = await this.cdp.send("Runtime.evaluate", {
|
|
989
|
+
expression: "location.href",
|
|
990
|
+
returnByValue: true
|
|
991
|
+
});
|
|
992
|
+
const href = resp.result?.value;
|
|
993
|
+
if (typeof href === "string" && (href === "about:blank" || href === "about:srcdoc")) {
|
|
994
|
+
throw new Error(
|
|
995
|
+
'Cannot set up audio on about:blank. Navigate to a page first.\nExample: await page.goto("https://your-voice-app.com")'
|
|
996
|
+
);
|
|
997
|
+
}
|
|
998
|
+
} catch (e) {
|
|
999
|
+
if (e instanceof Error && e.message.includes("Cannot set up audio")) throw e;
|
|
1000
|
+
}
|
|
1001
|
+
let origin;
|
|
1002
|
+
try {
|
|
1003
|
+
const resp = await this.cdp.send("Runtime.evaluate", {
|
|
1004
|
+
expression: "location.origin",
|
|
1005
|
+
returnByValue: true
|
|
1006
|
+
});
|
|
1007
|
+
const val = resp.result?.value;
|
|
1008
|
+
if (typeof val === "string" && val !== "null") {
|
|
1009
|
+
origin = val;
|
|
1010
|
+
}
|
|
1011
|
+
} catch {
|
|
1012
|
+
}
|
|
1013
|
+
await grantAudioPermissions(this.cdp, origin);
|
|
1014
|
+
if (!this.bindingRegistered) {
|
|
1015
|
+
await this.cdp.send("Runtime.addBinding", { name: INPUT_BINDING });
|
|
1016
|
+
this.bindingRegistered = true;
|
|
1017
|
+
}
|
|
1018
|
+
await this.cdp.send("Page.addScriptToEvaluateOnNewDocument", {
|
|
1019
|
+
source: AUDIO_INPUT_SCRIPT
|
|
1020
|
+
});
|
|
1021
|
+
await this.cdp.send("Runtime.evaluate", {
|
|
1022
|
+
expression: AUDIO_INPUT_SCRIPT,
|
|
1023
|
+
awaitPromise: false,
|
|
1024
|
+
userGesture: true
|
|
1025
|
+
});
|
|
1026
|
+
this.injected = true;
|
|
1027
|
+
}
|
|
1028
|
+
/**
|
|
1029
|
+
* Play audio bytes into the page's fake microphone.
|
|
1030
|
+
* Accepts any format that Web Audio API can decode (WAV, MP3, OGG, etc.).
|
|
1031
|
+
*
|
|
1032
|
+
* @param audioData - Raw audio file bytes
|
|
1033
|
+
* @param options - Playback options
|
|
1034
|
+
*/
|
|
1035
|
+
async play(audioData, options) {
|
|
1036
|
+
if (!this.injected) {
|
|
1037
|
+
await this.setup();
|
|
1038
|
+
}
|
|
1039
|
+
await this.cdp.send("Runtime.evaluate", {
|
|
1040
|
+
expression: `(function() {
|
|
1041
|
+
var resumed = [];
|
|
1042
|
+
(window.__bpTrackedAudioContexts || []).forEach(function(ctx) {
|
|
1043
|
+
if (ctx.state === 'suspended') {
|
|
1044
|
+
ctx.resume().then(function() {
|
|
1045
|
+
console.log('[bp:input] Resumed suspended AudioContext (' + ctx.sampleRate + 'Hz)');
|
|
1046
|
+
});
|
|
1047
|
+
resumed.push(ctx.sampleRate);
|
|
1048
|
+
}
|
|
1049
|
+
});
|
|
1050
|
+
// Also resume the input context itself
|
|
1051
|
+
if (window.__bpAudioInput && window.__bpAudioInput.getContext) {
|
|
1052
|
+
var inputCtx = window.__bpAudioInput.getContext();
|
|
1053
|
+
if (inputCtx && inputCtx.state === 'suspended') {
|
|
1054
|
+
inputCtx.resume().then(function() {
|
|
1055
|
+
console.log('[bp:input] Resumed input AudioContext (' + inputCtx.sampleRate + 'Hz)');
|
|
1056
|
+
});
|
|
1057
|
+
resumed.push('input-' + inputCtx.sampleRate);
|
|
1058
|
+
}
|
|
1059
|
+
}
|
|
1060
|
+
return resumed.length > 0 ? 'resumed: ' + resumed.join(',') : 'all running';
|
|
1061
|
+
})()`,
|
|
1062
|
+
awaitPromise: false,
|
|
1063
|
+
userGesture: true
|
|
1064
|
+
});
|
|
1065
|
+
const base64 = bufferToBase64(audioData);
|
|
1066
|
+
const waitForEnd = options?.waitForEnd ?? true;
|
|
1067
|
+
const timeout = options?.timeout ?? 6e4;
|
|
1068
|
+
if (waitForEnd) {
|
|
1069
|
+
const donePromise = this.waitForBinding(timeout);
|
|
1070
|
+
await this.cdp.send("Runtime.evaluate", {
|
|
1071
|
+
expression: `window.__bpAudioInput.play('${base64}')`,
|
|
1072
|
+
awaitPromise: false
|
|
1073
|
+
});
|
|
1074
|
+
await donePromise;
|
|
1075
|
+
} else {
|
|
1076
|
+
await this.cdp.send("Runtime.evaluate", {
|
|
1077
|
+
expression: `window.__bpAudioInput.play('${base64}')`,
|
|
1078
|
+
awaitPromise: false
|
|
1079
|
+
});
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
/**
|
|
1083
|
+
* Stop any currently playing audio.
|
|
1084
|
+
*/
|
|
1085
|
+
async stop() {
|
|
1086
|
+
if (!this.injected) return;
|
|
1087
|
+
await this.cdp.send("Runtime.evaluate", {
|
|
1088
|
+
expression: "window.__bpAudioInput && window.__bpAudioInput.stop()",
|
|
1089
|
+
awaitPromise: false
|
|
1090
|
+
});
|
|
1091
|
+
}
|
|
1092
|
+
/**
|
|
1093
|
+
* Get current state of the injected audio input system.
|
|
1094
|
+
*/
|
|
1095
|
+
async getState() {
|
|
1096
|
+
if (!this.injected) {
|
|
1097
|
+
return { contextState: "not-created", isPlaying: false, sampleRate: 0 };
|
|
1098
|
+
}
|
|
1099
|
+
const result = await this.cdp.send("Runtime.evaluate", {
|
|
1100
|
+
expression: "window.__bpAudioInput ? window.__bpAudioInput.getState() : null",
|
|
1101
|
+
returnByValue: true
|
|
1102
|
+
});
|
|
1103
|
+
return result.result.value ?? { contextState: "not-created", isPlaying: false, sampleRate: 0 };
|
|
1104
|
+
}
|
|
1105
|
+
/**
|
|
1106
|
+
* Clean up: remove binding handler.
|
|
1107
|
+
*/
|
|
1108
|
+
async teardown() {
|
|
1109
|
+
if (this.bindingHandler) {
|
|
1110
|
+
this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
|
|
1111
|
+
this.bindingHandler = null;
|
|
1112
|
+
}
|
|
1113
|
+
await this.stop();
|
|
1114
|
+
this.injected = false;
|
|
1115
|
+
this.bindingRegistered = false;
|
|
1116
|
+
}
|
|
1117
|
+
/**
|
|
1118
|
+
* Wait for the playback-complete binding to fire.
|
|
1119
|
+
*/
|
|
1120
|
+
waitForBinding(timeout) {
|
|
1121
|
+
return new Promise((resolve, reject) => {
|
|
1122
|
+
const timer = setTimeout(() => {
|
|
1123
|
+
if (this.bindingHandler) {
|
|
1124
|
+
this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
|
|
1125
|
+
this.bindingHandler = null;
|
|
1126
|
+
}
|
|
1127
|
+
reject(new Error(`AudioInput: playback timed out after ${timeout}ms`));
|
|
1128
|
+
}, timeout);
|
|
1129
|
+
if (this.bindingHandler) {
|
|
1130
|
+
this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
|
|
1131
|
+
}
|
|
1132
|
+
this.bindingHandler = (params) => {
|
|
1133
|
+
if (params["name"] === INPUT_BINDING) {
|
|
1134
|
+
clearTimeout(timer);
|
|
1135
|
+
if (this.bindingHandler) {
|
|
1136
|
+
this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
|
|
1137
|
+
this.bindingHandler = null;
|
|
1138
|
+
}
|
|
1139
|
+
resolve();
|
|
1140
|
+
}
|
|
1141
|
+
};
|
|
1142
|
+
this.cdp.on("Runtime.bindingCalled", this.bindingHandler);
|
|
1143
|
+
});
|
|
1144
|
+
}
|
|
1145
|
+
};
|
|
1146
|
+
|
|
1147
|
+
// src/audio/output.ts
|
|
1148
|
+
var OUTPUT_BINDING = "__bpAudioOutputData";
|
|
1149
|
+
var AUDIO_OUTPUT_SCRIPT = `
|
|
1150
|
+
(function() {
|
|
1151
|
+
// If already installed, stop any active capture but allow re-initialization
|
|
1152
|
+
// so that updated scripts (e.g. with new capture strategies) take effect.
|
|
1153
|
+
if (window.__bpAudioOutput) {
|
|
1154
|
+
if (window.__bpAudioOutput.isCapturing()) window.__bpAudioOutput.stop();
|
|
1155
|
+
// Keep existing allAudioContexts if available (preserves pre-override tracking)
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
var BUFFER_SIZE = 4096;
|
|
1159
|
+
var FLUSH_SAMPLES = 48000; // flush every ~1s at 48kHz (scales with sample rate)
|
|
1160
|
+
var capturing = false;
|
|
1161
|
+
var capturedChunks = [];
|
|
1162
|
+
var totalSamples = 0;
|
|
1163
|
+
var flushCount = 0;
|
|
1164
|
+
var pendingTracks = [];
|
|
1165
|
+
var tappedTrackIds = {};
|
|
1166
|
+
|
|
1167
|
+
// --- Per-context tap infrastructure ---
|
|
1168
|
+
// Preserve any AudioContexts tracked by a previous script version
|
|
1169
|
+
var allAudioContexts = window.__bpTrackedAudioContexts || [];
|
|
1170
|
+
// Use a WeakMap to associate taps with AudioContext instances
|
|
1171
|
+
// (native objects like AudioContext may not support custom properties)
|
|
1172
|
+
var contextTapMap = typeof WeakMap !== 'undefined' ? new WeakMap() : null;
|
|
1173
|
+
var contextTapList = []; // fallback: [{ctx, proc}]
|
|
1174
|
+
|
|
1175
|
+
var OrigAudioContext = window.__bpOrigAudioContext || window.AudioContext || window.webkitAudioContext;
|
|
1176
|
+
// Save the native connect function once; on re-injection, reuse it to avoid double-wrapping
|
|
1177
|
+
var origConnect = window.__bpOrigConnect || AudioNode.prototype.connect;
|
|
1178
|
+
window.__bpOrigConnect = origConnect;
|
|
1179
|
+
|
|
1180
|
+
// Our own capture context (48kHz) for WebRTC tracks and media elements
|
|
1181
|
+
var captureCtx = null;
|
|
1182
|
+
var captureProcessor = null;
|
|
1183
|
+
|
|
1184
|
+
// Save original AudioContext constructor once
|
|
1185
|
+
if (!window.__bpOrigAudioContext) {
|
|
1186
|
+
window.__bpOrigAudioContext = OrigAudioContext;
|
|
1187
|
+
}
|
|
1188
|
+
|
|
1189
|
+
// Override AudioContext constructor to track all instances (skip if already overridden)
|
|
1190
|
+
if (OrigAudioContext && !window.__bpAudioContextOverridden) {
|
|
1191
|
+
window.__bpAudioContextOverridden = true;
|
|
1192
|
+
window.AudioContext = function() {
|
|
1193
|
+
var ctx = new (Function.prototype.bind.apply(OrigAudioContext, [null].concat(Array.prototype.slice.call(arguments))))();
|
|
1194
|
+
allAudioContexts.push(ctx);
|
|
1195
|
+
// Auto-resume suspended contexts \u2014 CDP automation has no user gesture,
|
|
1196
|
+
// so Chrome suspends new AudioContexts by default. Without this, voice
|
|
1197
|
+
// agents' ScriptProcessorNodes never fire and no audio flows.
|
|
1198
|
+
if (ctx.state === 'suspended') {
|
|
1199
|
+
console.log('[bp:output] AudioContext created suspended (' + ctx.sampleRate + 'Hz), auto-resuming...');
|
|
1200
|
+
ctx.resume().then(function() {
|
|
1201
|
+
console.log('[bp:output] AudioContext resumed successfully (' + ctx.sampleRate + 'Hz, state: ' + ctx.state + ')');
|
|
1202
|
+
}).catch(function(e) {
|
|
1203
|
+
console.warn('[bp:output] AudioContext resume failed (' + ctx.sampleRate + 'Hz):', e);
|
|
1204
|
+
});
|
|
1205
|
+
} else {
|
|
1206
|
+
console.log('[bp:output] AudioContext created (' + ctx.sampleRate + 'Hz, state: ' + ctx.state + ')');
|
|
1207
|
+
}
|
|
1208
|
+
return ctx;
|
|
1209
|
+
};
|
|
1210
|
+
window.AudioContext.prototype = OrigAudioContext.prototype;
|
|
1211
|
+
Object.keys(OrigAudioContext).forEach(function(k) {
|
|
1212
|
+
try { window.AudioContext[k] = OrigAudioContext[k]; } catch(e) {}
|
|
1213
|
+
});
|
|
1214
|
+
if (window.webkitAudioContext) {
|
|
1215
|
+
window.webkitAudioContext = window.AudioContext;
|
|
1216
|
+
}
|
|
1217
|
+
}
|
|
1218
|
+
|
|
1219
|
+
// Expose tracked contexts on window so re-injections preserve them
|
|
1220
|
+
window.__bpTrackedAudioContexts = allAudioContexts;
|
|
1221
|
+
|
|
1222
|
+
// Look up an existing tap for a given AudioContext
|
|
1223
|
+
function findTap(ctx) {
|
|
1224
|
+
if (contextTapMap) return contextTapMap.get(ctx) || null;
|
|
1225
|
+
for (var i = 0; i < contextTapList.length; i++) {
|
|
1226
|
+
if (contextTapList[i].ctx === ctx) return contextTapList[i].proc;
|
|
1227
|
+
}
|
|
1228
|
+
return null;
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
// Store a tap for a given AudioContext
|
|
1232
|
+
function storeTap(ctx, proc) {
|
|
1233
|
+
if (contextTapMap) { contextTapMap.set(ctx, proc); }
|
|
1234
|
+
else { contextTapList.push({ ctx: ctx, proc: proc }); }
|
|
1235
|
+
}
|
|
1236
|
+
|
|
1237
|
+
// Count stored taps
|
|
1238
|
+
function tapCount() {
|
|
1239
|
+
if (contextTapMap) {
|
|
1240
|
+
var count = 0;
|
|
1241
|
+
for (var i = 0; i < allAudioContexts.length; i++) {
|
|
1242
|
+
if (contextTapMap.has(allAudioContexts[i])) count++;
|
|
1243
|
+
}
|
|
1244
|
+
return count;
|
|
1245
|
+
}
|
|
1246
|
+
return contextTapList.length;
|
|
1247
|
+
}
|
|
1248
|
+
|
|
1249
|
+
// Create or retrieve a ScriptProcessorNode tap for a specific AudioContext.
|
|
1250
|
+
// The tap lives in the SAME context as the source, avoiding cross-context errors.
|
|
1251
|
+
function getOrCreateTap(ctx) {
|
|
1252
|
+
var existing = findTap(ctx);
|
|
1253
|
+
if (existing) return existing;
|
|
1254
|
+
|
|
1255
|
+
try {
|
|
1256
|
+
if (ctx.state === 'closed') return null;
|
|
1257
|
+
var channels = Math.min(ctx.destination.channelCount || 2, 2);
|
|
1258
|
+
if (channels < 1) channels = 1;
|
|
1259
|
+
var proc = ctx.createScriptProcessor(BUFFER_SIZE, channels, channels);
|
|
1260
|
+
proc.onaudioprocess = function(e) {
|
|
1261
|
+
if (!capturing) return;
|
|
1262
|
+
var left = new Float32Array(e.inputBuffer.getChannelData(0));
|
|
1263
|
+
var right = e.inputBuffer.numberOfChannels > 1
|
|
1264
|
+
? new Float32Array(e.inputBuffer.getChannelData(1))
|
|
1265
|
+
: new Float32Array(left.length);
|
|
1266
|
+
capturedChunks.push({ left: left, right: right, sampleRate: ctx.sampleRate });
|
|
1267
|
+
totalSamples += left.length;
|
|
1268
|
+
if (totalSamples >= FLUSH_SAMPLES) {
|
|
1269
|
+
flushToNodeJs();
|
|
1270
|
+
}
|
|
1271
|
+
};
|
|
1272
|
+
// Must connect to destination to keep ScriptProcessorNode alive
|
|
1273
|
+
origConnect.call(proc, ctx.destination);
|
|
1274
|
+
storeTap(ctx, proc);
|
|
1275
|
+
return proc;
|
|
1276
|
+
} catch(e) {
|
|
1277
|
+
return null;
|
|
1278
|
+
}
|
|
1279
|
+
}
|
|
1280
|
+
|
|
1281
|
+
// Override AudioNode.prototype.connect to tap connections to any AudioDestinationNode
|
|
1282
|
+
AudioNode.prototype.connect = function(destination) {
|
|
1283
|
+
var result = origConnect.apply(this, arguments);
|
|
1284
|
+
|
|
1285
|
+
if (capturing && destination instanceof AudioDestinationNode) {
|
|
1286
|
+
try {
|
|
1287
|
+
var tap = getOrCreateTap(destination.context);
|
|
1288
|
+
// Don't connect the tap to itself
|
|
1289
|
+
if (tap && tap !== this) {
|
|
1290
|
+
origConnect.call(this, tap);
|
|
1291
|
+
}
|
|
1292
|
+
} catch(e) {}
|
|
1293
|
+
}
|
|
1294
|
+
return result;
|
|
1295
|
+
};
|
|
1296
|
+
|
|
1297
|
+
var origPlay = window.__bpOrigPlay || HTMLMediaElement.prototype.play;
|
|
1298
|
+
window.__bpOrigPlay = origPlay;
|
|
1299
|
+
HTMLMediaElement.prototype.play = function() {
|
|
1300
|
+
if (capturing && !this.__bpCaptured) {
|
|
1301
|
+
this.__bpCaptured = true;
|
|
1302
|
+
try {
|
|
1303
|
+
if (!captureCtx) initCaptureCtx();
|
|
1304
|
+
var stream = this.captureStream ? this.captureStream() : null;
|
|
1305
|
+
if (stream && captureCtx) {
|
|
1306
|
+
var source = captureCtx.createMediaStreamSource(stream);
|
|
1307
|
+
origConnect.call(source, captureProcessor);
|
|
1308
|
+
}
|
|
1309
|
+
} catch(e) {}
|
|
1310
|
+
}
|
|
1311
|
+
return origPlay.apply(this, arguments);
|
|
1312
|
+
};
|
|
1313
|
+
|
|
1314
|
+
// Intercept srcObject assignment to catch WebRTC streams attached to media elements
|
|
1315
|
+
var origSrcObjectDesc = Object.getOwnPropertyDescriptor(HTMLMediaElement.prototype, 'srcObject');
|
|
1316
|
+
if (origSrcObjectDesc && origSrcObjectDesc.set) {
|
|
1317
|
+
Object.defineProperty(HTMLMediaElement.prototype, 'srcObject', {
|
|
1318
|
+
set: function(stream) {
|
|
1319
|
+
origSrcObjectDesc.set.call(this, stream);
|
|
1320
|
+
if (stream && stream.getAudioTracks) {
|
|
1321
|
+
var tracks = stream.getAudioTracks();
|
|
1322
|
+
for (var i = 0; i < tracks.length; i++) {
|
|
1323
|
+
if (capturing) {
|
|
1324
|
+
tapAudioTrack(tracks[i]);
|
|
1325
|
+
} else {
|
|
1326
|
+
pendingTracks.push(tracks[i]);
|
|
1327
|
+
}
|
|
1328
|
+
}
|
|
1329
|
+
}
|
|
1330
|
+
},
|
|
1331
|
+
get: origSrcObjectDesc.get,
|
|
1332
|
+
configurable: true
|
|
1333
|
+
});
|
|
1334
|
+
}
|
|
1335
|
+
|
|
1336
|
+
// Initialize our own 48kHz capture context for WebRTC and media element tapping
|
|
1337
|
+
function initCaptureCtx() {
|
|
1338
|
+
captureCtx = new OrigAudioContext({ sampleRate: 48000 });
|
|
1339
|
+
captureProcessor = captureCtx.createScriptProcessor(BUFFER_SIZE, 2, 2);
|
|
1340
|
+
captureProcessor.onaudioprocess = function(e) {
|
|
1341
|
+
if (!capturing) return;
|
|
1342
|
+
var left = new Float32Array(e.inputBuffer.getChannelData(0));
|
|
1343
|
+
var right = new Float32Array(e.inputBuffer.getChannelData(1));
|
|
1344
|
+
capturedChunks.push({ left: left, right: right, sampleRate: 48000 });
|
|
1345
|
+
totalSamples += left.length;
|
|
1346
|
+
if (totalSamples >= FLUSH_SAMPLES) {
|
|
1347
|
+
flushToNodeJs();
|
|
1348
|
+
}
|
|
1349
|
+
};
|
|
1350
|
+
origConnect.call(captureProcessor, captureCtx.destination);
|
|
1351
|
+
}
|
|
1352
|
+
|
|
1353
|
+
function uint8ToBase64(bytes) {
|
|
1354
|
+
var CHUNK = 8192;
|
|
1355
|
+
var parts = [];
|
|
1356
|
+
for (var i = 0; i < bytes.length; i += CHUNK) {
|
|
1357
|
+
var slice = bytes.subarray(i, Math.min(i + CHUNK, bytes.length));
|
|
1358
|
+
var binary = '';
|
|
1359
|
+
for (var j = 0; j < slice.length; j++) {
|
|
1360
|
+
binary += String.fromCharCode(slice[j]);
|
|
1361
|
+
}
|
|
1362
|
+
parts.push(binary);
|
|
1363
|
+
}
|
|
1364
|
+
return btoa(parts.join(''));
|
|
1365
|
+
}
|
|
1366
|
+
|
|
1367
|
+
function flushGroup(chunks, rate) {
|
|
1368
|
+
var totalLen = 0;
|
|
1369
|
+
for (var i = 0; i < chunks.length; i++) {
|
|
1370
|
+
totalLen += chunks[i].left.length;
|
|
1371
|
+
}
|
|
1372
|
+
if (totalLen === 0) return;
|
|
1373
|
+
|
|
1374
|
+
var left = new Float32Array(totalLen);
|
|
1375
|
+
var right = new Float32Array(totalLen);
|
|
1376
|
+
var offset = 0;
|
|
1377
|
+
for (var i = 0; i < chunks.length; i++) {
|
|
1378
|
+
left.set(chunks[i].left, offset);
|
|
1379
|
+
right.set(chunks[i].right, offset);
|
|
1380
|
+
offset += chunks[i].left.length;
|
|
1381
|
+
}
|
|
1382
|
+
|
|
1383
|
+
var leftB64 = uint8ToBase64(new Uint8Array(left.buffer));
|
|
1384
|
+
var rightB64 = uint8ToBase64(new Uint8Array(right.buffer));
|
|
1385
|
+
|
|
1386
|
+
flushCount++;
|
|
1387
|
+
|
|
1388
|
+
try {
|
|
1389
|
+
if (typeof window.__bpAudioOutputData === 'function') {
|
|
1390
|
+
window.__bpAudioOutputData(JSON.stringify({
|
|
1391
|
+
left: leftB64,
|
|
1392
|
+
right: rightB64,
|
|
1393
|
+
sampleRate: rate,
|
|
1394
|
+
samples: totalLen
|
|
1395
|
+
}));
|
|
1396
|
+
}
|
|
1397
|
+
} catch(e) {}
|
|
1398
|
+
}
|
|
1399
|
+
|
|
1400
|
+
function flushToNodeJs() {
|
|
1401
|
+
if (capturedChunks.length === 0) return;
|
|
1402
|
+
|
|
1403
|
+
// Group chunks by sample rate to avoid mixing different-rate audio
|
|
1404
|
+
var byRate = {};
|
|
1405
|
+
for (var i = 0; i < capturedChunks.length; i++) {
|
|
1406
|
+
var rate = capturedChunks[i].sampleRate || 48000;
|
|
1407
|
+
if (!byRate[rate]) byRate[rate] = [];
|
|
1408
|
+
byRate[rate].push(capturedChunks[i]);
|
|
1409
|
+
}
|
|
1410
|
+
|
|
1411
|
+
// Flush each sample rate group separately
|
|
1412
|
+
for (var rateKey in byRate) {
|
|
1413
|
+
if (byRate.hasOwnProperty(rateKey)) {
|
|
1414
|
+
flushGroup(byRate[rateKey], Number(rateKey));
|
|
1415
|
+
}
|
|
1416
|
+
}
|
|
1417
|
+
|
|
1418
|
+
capturedChunks = [];
|
|
1419
|
+
totalSamples = 0;
|
|
1420
|
+
}
|
|
1421
|
+
|
|
1422
|
+
// --- WebRTC interception (for apps that use RTCPeerConnection) ---
|
|
1423
|
+
var rtcTrackedStreams = [];
|
|
1424
|
+
var rtcPeerConnections = [];
|
|
1425
|
+
|
|
1426
|
+
function tapAudioTrack(track) {
|
|
1427
|
+
try {
|
|
1428
|
+
if (tappedTrackIds[track.id]) return;
|
|
1429
|
+
tappedTrackIds[track.id] = true;
|
|
1430
|
+
if (!captureCtx) initCaptureCtx();
|
|
1431
|
+
var stream = new MediaStream([track]);
|
|
1432
|
+
var source = captureCtx.createMediaStreamSource(stream);
|
|
1433
|
+
origConnect.call(source, captureProcessor);
|
|
1434
|
+
rtcTrackedStreams.push(source);
|
|
1435
|
+
} catch(e) {}
|
|
1436
|
+
}
|
|
1437
|
+
|
|
1438
|
+
function tapExistingPeerConnection(pc) {
|
|
1439
|
+
try {
|
|
1440
|
+
var receivers = pc.getReceivers ? pc.getReceivers() : [];
|
|
1441
|
+
for (var i = 0; i < receivers.length; i++) {
|
|
1442
|
+
if (receivers[i].track && receivers[i].track.kind === 'audio') {
|
|
1443
|
+
tapAudioTrack(receivers[i].track);
|
|
1444
|
+
}
|
|
1445
|
+
}
|
|
1446
|
+
} catch(e) {}
|
|
1447
|
+
}
|
|
1448
|
+
|
|
1449
|
+
if (typeof RTCPeerConnection !== 'undefined') {
|
|
1450
|
+
var OrigRTC = RTCPeerConnection;
|
|
1451
|
+
|
|
1452
|
+
window.RTCPeerConnection = function() {
|
|
1453
|
+
var pc = new (Function.prototype.bind.apply(OrigRTC, [null].concat(Array.prototype.slice.call(arguments))))();
|
|
1454
|
+
rtcPeerConnections.push(pc);
|
|
1455
|
+
|
|
1456
|
+
pc.addEventListener('track', function(event) {
|
|
1457
|
+
if (event.track && event.track.kind === 'audio') {
|
|
1458
|
+
if (capturing) {
|
|
1459
|
+
tapAudioTrack(event.track);
|
|
1460
|
+
} else {
|
|
1461
|
+
pendingTracks.push(event.track);
|
|
1462
|
+
}
|
|
1463
|
+
}
|
|
1464
|
+
});
|
|
1465
|
+
|
|
1466
|
+
return pc;
|
|
1467
|
+
};
|
|
1468
|
+
window.RTCPeerConnection.prototype = OrigRTC.prototype;
|
|
1469
|
+
Object.keys(OrigRTC).forEach(function(k) {
|
|
1470
|
+
try { window.RTCPeerConnection[k] = OrigRTC[k]; } catch(e) {}
|
|
1471
|
+
});
|
|
1472
|
+
|
|
1473
|
+
window.__bpTrackedPCs = rtcPeerConnections;
|
|
1474
|
+
}
|
|
1475
|
+
|
|
1476
|
+
window.__bpAudioOutput = {
|
|
1477
|
+
start: function() {
|
|
1478
|
+
capturing = true;
|
|
1479
|
+
capturedChunks = [];
|
|
1480
|
+
totalSamples = 0;
|
|
1481
|
+
flushCount = 0;
|
|
1482
|
+
tappedTrackIds = {};
|
|
1483
|
+
|
|
1484
|
+
// Resume any suspended capture context
|
|
1485
|
+
if (captureCtx && captureCtx.state === 'suspended') captureCtx.resume();
|
|
1486
|
+
|
|
1487
|
+
// Create taps for all tracked AudioContexts (catches contexts created before capture)
|
|
1488
|
+
for (var i = 0; i < allAudioContexts.length; i++) {
|
|
1489
|
+
var ctx = allAudioContexts[i];
|
|
1490
|
+
if (ctx.state !== 'closed') {
|
|
1491
|
+
getOrCreateTap(ctx);
|
|
1492
|
+
}
|
|
1493
|
+
}
|
|
1494
|
+
|
|
1495
|
+
// Drain pending WebRTC tracks
|
|
1496
|
+
for (var j = 0; j < pendingTracks.length; j++) {
|
|
1497
|
+
tapAudioTrack(pendingTracks[j]);
|
|
1498
|
+
}
|
|
1499
|
+
pendingTracks = [];
|
|
1500
|
+
|
|
1501
|
+
// Tap existing peer connections
|
|
1502
|
+
for (var k = 0; k < rtcPeerConnections.length; k++) {
|
|
1503
|
+
tapExistingPeerConnection(rtcPeerConnections[k]);
|
|
1504
|
+
}
|
|
1505
|
+
|
|
1506
|
+
// Scan existing media elements for srcObject with audio tracks
|
|
1507
|
+
var mediaEls = document.querySelectorAll('audio, video');
|
|
1508
|
+
for (var i = 0; i < mediaEls.length; i++) {
|
|
1509
|
+
var el = mediaEls[i];
|
|
1510
|
+
if (el.srcObject && el.srcObject.getAudioTracks && !el.__bpCaptured) {
|
|
1511
|
+
el.__bpCaptured = true;
|
|
1512
|
+
var tracks = el.srcObject.getAudioTracks();
|
|
1513
|
+
for (var j = 0; j < tracks.length; j++) {
|
|
1514
|
+
tapAudioTrack(tracks[j]);
|
|
1515
|
+
}
|
|
1516
|
+
}
|
|
1517
|
+
}
|
|
1518
|
+
|
|
1519
|
+
// Watch for dynamically added media elements with srcObject
|
|
1520
|
+
if (typeof MutationObserver !== 'undefined') {
|
|
1521
|
+
if (window.__bpMediaObserver) {
|
|
1522
|
+
window.__bpMediaObserver.disconnect();
|
|
1523
|
+
}
|
|
1524
|
+
window.__bpMediaObserver = new MutationObserver(function(mutations) {
|
|
1525
|
+
for (var i = 0; i < mutations.length; i++) {
|
|
1526
|
+
var added = mutations[i].addedNodes;
|
|
1527
|
+
for (var j = 0; j < added.length; j++) {
|
|
1528
|
+
var node = added[j];
|
|
1529
|
+
if (node.nodeType !== 1) continue;
|
|
1530
|
+
var els = [];
|
|
1531
|
+
if (node.tagName === 'AUDIO' || node.tagName === 'VIDEO') els.push(node);
|
|
1532
|
+
else if (node.querySelectorAll) {
|
|
1533
|
+
var nested = node.querySelectorAll('audio, video');
|
|
1534
|
+
for (var k = 0; k < nested.length; k++) els.push(nested[k]);
|
|
1535
|
+
}
|
|
1536
|
+
for (var m = 0; m < els.length; m++) {
|
|
1537
|
+
var el = els[m];
|
|
1538
|
+
if (el.srcObject && el.srcObject.getAudioTracks && !el.__bpCaptured) {
|
|
1539
|
+
el.__bpCaptured = true;
|
|
1540
|
+
var tracks = el.srcObject.getAudioTracks();
|
|
1541
|
+
for (var t = 0; t < tracks.length; t++) tapAudioTrack(tracks[t]);
|
|
1542
|
+
}
|
|
1543
|
+
}
|
|
1544
|
+
}
|
|
1545
|
+
}
|
|
1546
|
+
});
|
|
1547
|
+
window.__bpMediaObserver.observe(document, { childList: true, subtree: true });
|
|
1548
|
+
}
|
|
1549
|
+
},
|
|
1550
|
+
stop: function() {
|
|
1551
|
+
capturing = false;
|
|
1552
|
+
flushToNodeJs();
|
|
1553
|
+
// Disconnect MutationObserver
|
|
1554
|
+
if (window.__bpMediaObserver) {
|
|
1555
|
+
window.__bpMediaObserver.disconnect();
|
|
1556
|
+
window.__bpMediaObserver = null;
|
|
1557
|
+
}
|
|
1558
|
+
},
|
|
1559
|
+
isCapturing: function() { return capturing; },
|
|
1560
|
+
getBufferedSamples: function() { return totalSamples; },
|
|
1561
|
+
tapPC: function(pc) {
|
|
1562
|
+
if (!pc || typeof pc.getReceivers !== 'function') return false;
|
|
1563
|
+
if (rtcPeerConnections.indexOf(pc) === -1) {
|
|
1564
|
+
rtcPeerConnections.push(pc);
|
|
1565
|
+
}
|
|
1566
|
+
if (capturing) {
|
|
1567
|
+
tapExistingPeerConnection(pc);
|
|
1568
|
+
}
|
|
1569
|
+
pc.addEventListener('track', function(event) {
|
|
1570
|
+
if (event.track && event.track.kind === 'audio') {
|
|
1571
|
+
if (capturing) {
|
|
1572
|
+
tapAudioTrack(event.track);
|
|
1573
|
+
} else {
|
|
1574
|
+
pendingTracks.push(event.track);
|
|
1575
|
+
}
|
|
1576
|
+
}
|
|
1577
|
+
});
|
|
1578
|
+
return true;
|
|
1579
|
+
},
|
|
1580
|
+
getStats: function() {
|
|
1581
|
+
return {
|
|
1582
|
+
audioContexts: allAudioContexts.filter(function(c) { return c.state !== 'closed'; }).length,
|
|
1583
|
+
contextTaps: tapCount(),
|
|
1584
|
+
audioNodes: captureCtx ? captureCtx.destination.numberOfInputs : 0,
|
|
1585
|
+
rtcConnections: rtcPeerConnections.length,
|
|
1586
|
+
mediaElements: document.querySelectorAll('audio, video').length,
|
|
1587
|
+
pendingTracks: pendingTracks.length,
|
|
1588
|
+
tappedTracks: Object.keys(tappedTrackIds).length,
|
|
1589
|
+
capturing: capturing,
|
|
1590
|
+
bufferedSamples: totalSamples,
|
|
1591
|
+
rtcDetails: rtcPeerConnections.map(function(pc) {
|
|
1592
|
+
try {
|
|
1593
|
+
var receivers = pc.getReceivers ? pc.getReceivers() : [];
|
|
1594
|
+
var senders = pc.getSenders ? pc.getSenders() : [];
|
|
1595
|
+
var audioReceivers = receivers.filter(function(r) { return r.track && r.track.kind === 'audio'; }).length;
|
|
1596
|
+
var audioSenders = senders.filter(function(s) { return s.track && s.track.kind === 'audio'; }).length;
|
|
1597
|
+
return {
|
|
1598
|
+
state: pc.connectionState || pc.iceConnectionState || 'unknown',
|
|
1599
|
+
audioReceivers: audioReceivers,
|
|
1600
|
+
audioSenders: audioSenders,
|
|
1601
|
+
tapped: receivers.some(function(r) { return r.track && tappedTrackIds[r.track.id]; })
|
|
1602
|
+
};
|
|
1603
|
+
} catch(e) { return { state: 'error', audioReceivers: 0, audioSenders: 0, tapped: false }; }
|
|
1604
|
+
}),
|
|
1605
|
+
mediaElementDetails: (function() {
|
|
1606
|
+
try {
|
|
1607
|
+
var els = document.querySelectorAll('audio, video');
|
|
1608
|
+
var details = [];
|
|
1609
|
+
for (var i = 0; i < els.length; i++) {
|
|
1610
|
+
var el = els[i];
|
|
1611
|
+
var hasSrcObject = !!(el.srcObject);
|
|
1612
|
+
var audioTracks = 0;
|
|
1613
|
+
if (el.srcObject && el.srcObject.getAudioTracks) {
|
|
1614
|
+
audioTracks = el.srcObject.getAudioTracks().length;
|
|
1615
|
+
}
|
|
1616
|
+
details.push({
|
|
1617
|
+
tag: el.tagName.toLowerCase(),
|
|
1618
|
+
hasSrcObject: hasSrcObject,
|
|
1619
|
+
hasSrc: !!(el.src || el.currentSrc),
|
|
1620
|
+
audioTracks: audioTracks,
|
|
1621
|
+
tapped: !!(el.__bpCaptured)
|
|
1622
|
+
});
|
|
1623
|
+
}
|
|
1624
|
+
return details;
|
|
1625
|
+
} catch(e) { return []; }
|
|
1626
|
+
})()
|
|
1627
|
+
};
|
|
1628
|
+
}
|
|
1629
|
+
};
|
|
1630
|
+
})();
|
|
1631
|
+
`;
|
|
1632
|
+
// Node-side controller for capturing the page's audio output via CDP.
// Pairs with the injected AUDIO_OUTPUT_SCRIPT (window.__bpAudioOutput):
// the in-page script taps AudioContexts / WebRTC tracks / media elements
// and streams Float32 PCM chunks back through a Runtime binding.
var AudioOutput = class {
  // cdp: CDP session wrapper exposing send/on/off — assumed from usage below.
  cdp;
  // Raw chunks received from the page, in arrival order (unbounded until stop()).
  chunks = [];
  // True once the capture script + binding have been installed.
  injected = false;
  // True between start() and stop().
  capturing = false;
  // Listener registered on "Runtime.bindingCalled"; kept so teardown() can remove it.
  bindingHandler = null;
  // Optional subscriber for each incoming chunk (see onData).
  onChunkHandler;
  // Optional subscriber for diagnostic strings (see onDiag / --verbose).
  onDiagHandler;
  /** Timestamp of the first non-silent chunk received */
  firstChunkTime = null;
  constructor(cdp) {
    this.cdp = cdp;
  }
  /** Whether the audio output system has been set up */
  get isSetup() {
    return this.injected;
  }
  /** Whether audio is currently being captured */
  get isCapturing() {
    return this.capturing;
  }
  /**
   * Set up audio output capture.
   * Registers bindings and injects the capture script.
   */
  async setup() {
    if (this.injected) return;
    // Binding the page script calls to deliver base64-encoded PCM payloads.
    await this.cdp.send("Runtime.addBinding", { name: OUTPUT_BINDING });
    this.bindingHandler = (params) => {
      if (params["name"] === OUTPUT_BINDING) {
        this.handleAudioData(params["payload"]);
      }
    };
    this.cdp.on("Runtime.bindingCalled", this.bindingHandler);
    // Install the capture script for future navigations AND evaluate it in the
    // current document, so both existing and new pages are covered.
    await this.cdp.send("Page.addScriptToEvaluateOnNewDocument", {
      source: AUDIO_OUTPUT_SCRIPT
    });
    await this.cdp.send("Runtime.evaluate", {
      expression: AUDIO_OUTPUT_SCRIPT,
      awaitPromise: false,
      userGesture: true
    });
    this.injected = true;
  }
  /**
   * Start capturing audio output.
   */
  async start() {
    if (!this.injected) {
      await this.setup();
    }
    // Reset per-capture state before asking the page to start.
    this.chunks = [];
    this.firstChunkTime = null;
    this.capturing = true;
    // Resume any suspended AudioContexts first (autoplay policy may have
    // suspended them); done with userGesture so Chrome permits resume().
    await this.cdp.send("Runtime.evaluate", {
      expression: `(function() {
        var resumed = [];
        (window.__bpTrackedAudioContexts || []).forEach(function(ctx) {
          if (ctx.state === 'suspended') {
            ctx.resume().then(function() {
              console.log('[bp:output] Resumed AudioContext (' + ctx.sampleRate + 'Hz) before capture');
            });
            resumed.push(ctx.sampleRate);
          }
        });
        if (window.__bpAudioInput && window.__bpAudioInput.getContext) {
          var inputCtx = window.__bpAudioInput.getContext();
          if (inputCtx && inputCtx.state === 'suspended') {
            inputCtx.resume();
            resumed.push('input-' + inputCtx.sampleRate);
          }
        }
        if (resumed.length) console.log('[bp:output] Resumed ' + resumed.length + ' contexts: ' + resumed.join(', '));
      })()`,
      awaitPromise: false,
      userGesture: true
    });
    await this.cdp.send("Runtime.evaluate", {
      expression: "window.__bpAudioOutput && window.__bpAudioOutput.start()",
      awaitPromise: false
    });
    // Tap RTCPeerConnections created before our override was injected.
    await this.discoverExistingPeerConnections();
    if (this.onDiagHandler) {
      try {
        const statsResult = await this.cdp.send(
          "Runtime.evaluate",
          {
            expression: "window.__bpAudioOutput && window.__bpAudioOutput.getStats()",
            returnByValue: true
          }
        );
        const stats = statsResult.result.value;
        if (stats) {
          this.onDiagHandler(
            `started \u2014 ${stats["audioContexts"]} AudioContexts, ${stats["contextTaps"]} taps, ${stats["rtcConnections"]} RTCPeerConnections, ${stats["mediaElements"]} MediaElements, ${stats["tappedTracks"]} tapped tracks`
          );
        }
      } catch {
      }
    }
  }
  /**
   * Stop capturing and return all collected audio.
   */
  async stop() {
    if (!this.injected) {
      return emptyCaptureResult();
    }
    await this.cdp.send("Runtime.evaluate", {
      expression: "window.__bpAudioOutput && window.__bpAudioOutput.stop()",
      awaitPromise: false
    });
    this.capturing = false;
    // Grace period so the final flushed chunks arrive over the binding
    // before we merge.
    await sleep(250);
    return this.mergeChunks();
  }
  /**
   * Capture audio until silence is detected.
   *
   * Two-phase approach:
   * 1. **Wait phase**: Wait up to `maxDuration` for the first non-silent chunk.
   *    The silence countdown does NOT tick during this phase, so slow voice agents
   *    (STT → LLM → TTS can take 5-15s) don't cause premature timeout.
   * 2. **Capture phase**: Once audio is detected, capture until `silenceTimeout` ms
   *    of consecutive silence pass, then stop.
   */
  async captureUntilSilence(options) {
    const silenceTimeout = options?.silenceTimeout ?? 1500;
    const silenceThreshold = options?.silenceThreshold ?? 0.01;
    const maxDuration = options?.maxDuration ?? 3e5;
    const noAudioTimeout = options?.noAudioTimeout ?? 15e3;
    if (!this.capturing) {
      await this.start();
    }
    return new Promise((resolve) => {
      let heardAudio = false;
      let lastSoundTime = 0;
      const startTime = Date.now();
      // Poll every 200ms; each guard clears the interval before resolving so
      // the callback cannot fire again after a stop.
      const checkInterval = setInterval(async () => {
        const elapsed = Date.now() - startTime;
        if (elapsed > maxDuration) {
          clearInterval(checkInterval);
          this.onDiagHandler?.(`max duration reached (${maxDuration}ms), stopping`);
          resolve(await this.stop());
          return;
        }
        // Only the most recent chunk is inspected per tick; with 200ms polling
        // this samples rather than exhaustively scans the stream.
        const latest = this.chunks[this.chunks.length - 1];
        if (latest) {
          const rms = calculateRMS(latest.left);
          if (rms > silenceThreshold) {
            if (!heardAudio) {
              heardAudio = true;
              this.onDiagHandler?.("first audio detected \u2014 silence countdown begins");
            }
            lastSoundTime = Date.now();
          }
        }
        if (!heardAudio && elapsed > noAudioTimeout) {
          clearInterval(checkInterval);
          this.onDiagHandler?.(`no audio detected after ${noAudioTimeout}ms, stopping early`);
          resolve(await this.stop());
          return;
        }
        if (heardAudio && Date.now() - lastSoundTime > silenceTimeout) {
          clearInterval(checkInterval);
          resolve(await this.stop());
        }
      }, 200);
    });
  }
  /**
   * Subscribe to real-time audio chunks as they arrive.
   */
  onData(handler) {
    this.onChunkHandler = handler;
  }
  /**
   * Subscribe to diagnostic messages (for --verbose).
   */
  onDiag(handler) {
    this.onDiagHandler = handler;
  }
  /**
   * Clean up: remove binding handler.
   */
  async teardown() {
    if (this.capturing) {
      await this.stop();
    }
    if (this.bindingHandler) {
      this.cdp.off("Runtime.bindingCalled", this.bindingHandler);
      this.bindingHandler = null;
    }
    this.onChunkHandler = void 0;
    this.onDiagHandler = void 0;
    this.injected = false;
  }
  /**
   * Use CDP Runtime.queryObjects to find RTCPeerConnection instances
   * that were created before our override was injected, and tap their audio tracks.
   */
  async discoverExistingPeerConnections() {
    try {
      const protoResult = await this.cdp.send("Runtime.evaluate", {
        expression: 'typeof RTCPeerConnection !== "undefined" ? RTCPeerConnection.prototype : null',
        returnByValue: false
      });
      const protoId = protoResult.result.objectId;
      if (!protoId) return;
      // queryObjects returns a remote array of every live object whose
      // prototype chain includes the given prototype.
      const queryResult = await this.cdp.send("Runtime.queryObjects", {
        prototypeObjectId: protoId
      });
      const arrayId = queryResult.objects.objectId;
      if (!arrayId) return;
      const propsResult = await this.cdp.send("Runtime.getProperties", {
        objectId: arrayId,
        ownProperties: true
      });
      let tapped = 0;
      for (const prop of propsResult.result) {
        if (prop.name === "length" || prop.name === "__proto__") continue;
        const pcObjectId = prop.value?.objectId;
        if (!pcObjectId) continue;
        // Hand each discovered peer connection to the in-page tap helper.
        await this.cdp.send("Runtime.callFunctionOn", {
          objectId: pcObjectId,
          functionDeclaration: "function() { if (window.__bpAudioOutput && window.__bpAudioOutput.tapPC) { return window.__bpAudioOutput.tapPC(this); } return false; }",
          returnByValue: true
        });
        tapped++;
      }
      if (tapped > 0) {
        this.onDiagHandler?.(`retroactively discovered ${tapped} existing RTCPeerConnection(s)`);
      }
      // Release the remote handles so the page's GC isn't blocked.
      await this.cdp.send("Runtime.releaseObject", { objectId: arrayId });
      await this.cdp.send("Runtime.releaseObject", { objectId: protoId });
    } catch {
    }
  }
  // Decode one binding payload (JSON with base64 Float32 PCM for left/right)
  // into a chunk, record it, and notify subscribers. Malformed payloads are
  // dropped silently (best-effort stream).
  handleAudioData(payload) {
    try {
      const data = JSON.parse(payload);
      const leftBytes = base64ToBuffer(data.left);
      const rightBytes = base64ToBuffer(data.right);
      const chunk = {
        left: new Float32Array(leftBytes.buffer),
        right: new Float32Array(rightBytes.buffer),
        sampleRate: data.sampleRate,
        samples: data.samples,
        timestamp: Date.now()
      };
      this.chunks.push(chunk);
      if (this.onDiagHandler) {
        const rms = calculateRMS(chunk.left);
        const label = rms > 0.01 ? "audio" : "silence";
        this.onDiagHandler(`chunk: ${chunk.samples} samples, RMS=${rms.toFixed(4)} (${label})`);
      }
      // Latency marker: first chunk whose RMS exceeds a low noise floor
      // (1e-3, below the 0.01 "silence" threshold used elsewhere).
      if (this.firstChunkTime === null) {
        const rms = calculateRMS(chunk.left);
        if (rms > 1e-3) {
          this.firstChunkTime = Date.now();
        }
      }
      this.onChunkHandler?.(chunk);
    } catch {
    }
  }
  // Merge captured chunks into one stereo buffer. Chunks are grouped by
  // sample rate (mixing rates would corrupt playback); the group with the
  // most non-silent samples wins and the others are discarded.
  mergeChunks() {
    if (this.chunks.length === 0) {
      return emptyCaptureResult();
    }
    const byRate = /* @__PURE__ */ new Map();
    for (const chunk of this.chunks) {
      const rate = chunk.sampleRate;
      if (!byRate.has(rate)) byRate.set(rate, []);
      byRate.get(rate).push(chunk);
    }
    let bestRate = this.chunks[0].sampleRate;
    let bestNonSilentSamples = 0;
    for (const [rate, chunks] of byRate) {
      let nonSilentSamples = 0;
      for (const chunk of chunks) {
        const rms = calculateRMS(chunk.left);
        if (rms > 0.01) {
          nonSilentSamples += chunk.left.length;
        }
      }
      if (nonSilentSamples > bestNonSilentSamples) {
        bestNonSilentSamples = nonSilentSamples;
        bestRate = rate;
      }
    }
    const bestChunks = byRate.get(bestRate);
    let totalLen = 0;
    for (const chunk of bestChunks) {
      totalLen += chunk.left.length;
    }
    const left = new Float32Array(totalLen);
    const right = new Float32Array(totalLen);
    let offset = 0;
    for (const chunk of bestChunks) {
      left.set(chunk.left, offset);
      right.set(chunk.right, offset);
      offset += chunk.left.length;
    }
    if (byRate.size > 1) {
      this.onDiagHandler?.(
        `mergeChunks: ${byRate.size} sample rates detected, using ${bestRate}Hz (${bestNonSilentSamples} non-silent samples)`
      );
    }
    return {
      left,
      right,
      sampleRate: bestRate,
      durationMs: totalLen / bestRate * 1e3,
      chunkCount: bestChunks.length
    };
  }
};
|
|
1950
|
+
/**
 * Build the zero-audio capture result used when nothing was recorded:
 * empty stereo buffers, a default 48 kHz sample rate, zero duration and
 * zero chunks. Shape matches AudioOutput.mergeChunks() output.
 */
function emptyCaptureResult() {
  const result = {
    left: new Float32Array(0),
    right: new Float32Array(0),
    sampleRate: 48000,
    durationMs: 0,
    chunkCount: 0
  };
  return result;
}
|
|
1959
|
+
/** Resolve after `ms` milliseconds (Promise-based delay). */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
1962
|
+
|
|
730
1963
|
// src/network/interceptor.ts
|
|
731
1964
|
var RequestInterceptor = class {
|
|
732
1965
|
cdp;
|
|
@@ -977,7 +2210,7 @@ async function isElementAttached(cdp, selector, contextId) {
|
|
|
977
2210
|
const result = await cdp.send("Runtime.evaluate", params);
|
|
978
2211
|
return result.result.value === true;
|
|
979
2212
|
}
|
|
980
|
-
function
|
|
2213
|
+
/** Resolve after `ms` milliseconds (Promise-based delay). */
function sleep2(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
983
2216
|
async function waitForAnyElement(cdp, selectors, options = {}) {
|
|
@@ -1005,7 +2238,7 @@ async function waitForAnyElement(cdp, selectors, options = {}) {
|
|
|
1005
2238
|
return { success: true, selector, waitedMs: Date.now() - startTime };
|
|
1006
2239
|
}
|
|
1007
2240
|
}
|
|
1008
|
-
await
|
|
2241
|
+
await sleep2(pollInterval);
|
|
1009
2242
|
}
|
|
1010
2243
|
return { success: false, waitedMs: Date.now() - startTime };
|
|
1011
2244
|
}
|
|
@@ -1054,7 +2287,7 @@ async function waitForNavigation(cdp, options = {}) {
|
|
|
1054
2287
|
}
|
|
1055
2288
|
const pollUrl = async () => {
|
|
1056
2289
|
while (!resolved && Date.now() < startTime + timeout) {
|
|
1057
|
-
await
|
|
2290
|
+
await sleep2(100);
|
|
1058
2291
|
if (resolved) return;
|
|
1059
2292
|
try {
|
|
1060
2293
|
const currentUrl = await getCurrentUrl(cdp);
|
|
@@ -1384,6 +2617,10 @@ var Page = class {
|
|
|
1384
2617
|
currentFrameContextId = null;
|
|
1385
2618
|
/** Last matched selector from findElement (for selectorUsed tracking) */
|
|
1386
2619
|
_lastMatchedSelector;
|
|
2620
|
+
/** Audio input controller (lazy-initialized) */
|
|
2621
|
+
_audioInput;
|
|
2622
|
+
/** Audio output controller (lazy-initialized) */
|
|
2623
|
+
_audioOutput;
|
|
1387
2624
|
constructor(cdp, targetId) {
|
|
1388
2625
|
this.cdp = cdp;
|
|
1389
2626
|
this._targetId = targetId;
|
|
@@ -1631,7 +2868,7 @@ var Page = class {
|
|
|
1631
2868
|
key: char
|
|
1632
2869
|
});
|
|
1633
2870
|
if (delay > 0) {
|
|
1634
|
-
await
|
|
2871
|
+
await sleep3(delay);
|
|
1635
2872
|
}
|
|
1636
2873
|
}
|
|
1637
2874
|
return true;
|
|
@@ -1672,7 +2909,7 @@ var Page = class {
|
|
|
1672
2909
|
async selectCustom(config, options = {}) {
|
|
1673
2910
|
const { trigger, option, value, match = "text" } = config;
|
|
1674
2911
|
await this.click(trigger, options);
|
|
1675
|
-
await
|
|
2912
|
+
await sleep3(100);
|
|
1676
2913
|
let optionSelector;
|
|
1677
2914
|
const optionSelectors = Array.isArray(option) ? option : [option];
|
|
1678
2915
|
if (match === "contains") {
|
|
@@ -1784,7 +3021,7 @@ var Page = class {
|
|
|
1784
3021
|
if (shouldWait === true) {
|
|
1785
3022
|
await this.waitForNavigation({ timeout: options.timeout ?? DEFAULT_TIMEOUT2 });
|
|
1786
3023
|
} else if (shouldWait === "auto") {
|
|
1787
|
-
await Promise.race([this.waitForNavigation({ timeout: 1e3, optional: true }),
|
|
3024
|
+
await Promise.race([this.waitForNavigation({ timeout: 1e3, optional: true }), sleep3(500)]);
|
|
1788
3025
|
}
|
|
1789
3026
|
return true;
|
|
1790
3027
|
}
|
|
@@ -1802,7 +3039,7 @@ var Page = class {
|
|
|
1802
3039
|
this.waitForNavigation({ timeout: 1e3, optional: true }).then(
|
|
1803
3040
|
(success) => success ? "nav" : null
|
|
1804
3041
|
),
|
|
1805
|
-
|
|
3042
|
+
sleep3(500).then(() => "timeout")
|
|
1806
3043
|
]);
|
|
1807
3044
|
if (navigationDetected === "nav") {
|
|
1808
3045
|
return true;
|
|
@@ -1816,7 +3053,7 @@ var Page = class {
|
|
|
1816
3053
|
if (shouldWait === true) {
|
|
1817
3054
|
await this.waitForNavigation({ timeout: options.timeout ?? DEFAULT_TIMEOUT2 });
|
|
1818
3055
|
} else if (shouldWait === "auto") {
|
|
1819
|
-
await
|
|
3056
|
+
await sleep3(100);
|
|
1820
3057
|
}
|
|
1821
3058
|
}
|
|
1822
3059
|
return true;
|
|
@@ -2813,7 +4050,7 @@ var Page = class {
|
|
|
2813
4050
|
lastError = e;
|
|
2814
4051
|
if (attempt < retries) {
|
|
2815
4052
|
this.rootNodeId = null;
|
|
2816
|
-
await
|
|
4053
|
+
await sleep3(delay);
|
|
2817
4054
|
continue;
|
|
2818
4055
|
}
|
|
2819
4056
|
}
|
|
@@ -2985,8 +4222,107 @@ var Page = class {
|
|
|
2985
4222
|
clickCount: 1
|
|
2986
4223
|
});
|
|
2987
4224
|
}
|
|
4225
|
+
// ============ Audio I/O ============
|
|
4226
|
+
/**
|
|
4227
|
+
* Audio input controller (fake microphone).
|
|
4228
|
+
* Lazy-initialized on first access.
|
|
4229
|
+
*/
|
|
4230
|
+
get audioInput() {
|
|
4231
|
+
if (!this._audioInput) {
|
|
4232
|
+
this._audioInput = new AudioInput(this.cdp);
|
|
4233
|
+
}
|
|
4234
|
+
return this._audioInput;
|
|
4235
|
+
}
|
|
4236
|
+
/**
|
|
4237
|
+
* Audio output capture controller.
|
|
4238
|
+
* Lazy-initialized on first access.
|
|
4239
|
+
*/
|
|
4240
|
+
get audioOutput() {
|
|
4241
|
+
if (!this._audioOutput) {
|
|
4242
|
+
this._audioOutput = new AudioOutput(this.cdp);
|
|
4243
|
+
}
|
|
4244
|
+
return this._audioOutput;
|
|
4245
|
+
}
|
|
4246
|
+
/**
|
|
4247
|
+
* Set up both audio input (fake microphone) and output (capture).
|
|
4248
|
+
* Must be called before navigating to the page that will use audio.
|
|
4249
|
+
*/
|
|
4250
|
+
  async setupAudio() {
    // Synthesize a click at (0,0) via CDP. Chrome's autoplay policy requires
    // a user gesture before AudioContexts may run; best-effort — failures
    // (e.g. target not ready) are deliberately ignored.
    try {
      await this.cdp.send("Input.dispatchMouseEvent", {
        type: "mousePressed",
        x: 0,
        y: 0,
        button: "left",
        clickCount: 1
      });
      await this.cdp.send("Input.dispatchMouseEvent", {
        type: "mouseReleased",
        x: 0,
        y: 0,
        button: "left",
        clickCount: 1
      });
    } catch {
    }
    // Install both sides: fake microphone first, then output capture.
    await this.audioInput.setup();
    await this.audioOutput.setup();
  }
|
|
4271
|
+
/**
|
|
4272
|
+
* Full audio round-trip: feed input audio, capture the response.
|
|
4273
|
+
*
|
|
4274
|
+
* 1. Starts capturing output
|
|
4275
|
+
* 2. Feeds input audio as microphone data
|
|
4276
|
+
* 3. Waits for the page to respond and then go silent
|
|
4277
|
+
* 4. Returns the captured response audio with latency metrics
|
|
4278
|
+
*
|
|
4279
|
+
* @example
|
|
4280
|
+
* ```typescript
|
|
4281
|
+
* await page.setupAudio();
|
|
4282
|
+
* await page.goto('https://voice-agent.example.com');
|
|
4283
|
+
* const result = await page.audioRoundTrip({
|
|
4284
|
+
* input: wavFileBytes,
|
|
4285
|
+
* silenceTimeout: 3000,
|
|
4286
|
+
* });
|
|
4287
|
+
* console.log(`Response: ${result.audio.durationMs}ms, latency: ${result.latencyMs}ms`);
|
|
4288
|
+
* ```
|
|
4289
|
+
*/
|
|
4290
|
+
  async audioRoundTrip(options) {
    // Ensure both audio subsystems are installed before doing anything.
    if (!this.audioInput.isSetup || !this.audioOutput.isSetup) {
      await this.setupAudio();
    }
    const start = Date.now();
    // Begin capturing output BEFORE feeding input, so an immediate page
    // response is not missed.
    await this.audioOutput.start();
    if (options.preDelay && options.preDelay > 0) {
      await sleep3(options.preDelay);
    }
    // When a sendSelector is given, the input must finish playing before the
    // send button is clicked, hence waitForEnd.
    const inputDone = this.audioInput.play(options.input, {
      waitForEnd: !!options.sendSelector
    });
    if (options.sendSelector) {
      // Playback errors are swallowed: the click should happen regardless.
      await inputDone.catch(() => {
      });
      await this.click(options.sendSelector);
    }
    const audio = await this.audioOutput.captureUntilSilence({
      silenceTimeout: options.silenceTimeout ?? 1500,
      silenceThreshold: options.silenceThreshold ?? 0.01,
      maxDuration: options.timeout ?? 12e4
    });
    await this.audioInput.stop();
    if (!options.sendSelector) {
      // Without a send button the input promise may still be pending; settle
      // it (errors ignored) so nothing is left floating.
      await inputDone.catch(() => {
      });
    }
    // Latency = time from round-trip start to the first non-silent output
    // chunk; -1 when the page never produced audio.
    const firstChunkTime = this.audioOutput.firstChunkTime;
    return {
      audio,
      latencyMs: firstChunkTime !== null ? firstChunkTime - start : -1,
      totalMs: Date.now() - start
    };
  }
|
|
2988
4324
|
};
|
|
2989
|
-
function
|
|
4325
|
+
/** Resolve after `ms` milliseconds (Promise-based delay). */
function sleep3(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
2992
4328
|
|