@craftedxp/voice-js 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONSUMING.md +1 -1
- package/README.md +8 -7
- package/dist/browser.d.mts +20 -4
- package/dist/browser.d.ts +334 -250
- package/dist/browser.js +818 -540
- package/dist/browser.js.map +1 -1
- package/dist/browser.mjs +278 -8
- package/dist/browser.mjs.map +1 -1
- package/dist/embed.iife.js +1094 -4
- package/dist/node.d.mts +20 -4
- package/dist/node.d.ts +324 -247
- package/dist/node.js +480 -368
- package/dist/node.js.map +1 -1
- package/dist/node.mjs +103 -5
- package/dist/node.mjs.map +1 -1
- package/package.json +1 -1
package/dist/browser.mjs
CHANGED
|
@@ -354,7 +354,8 @@ function handleServerMessage(raw, state, cb) {
|
|
|
354
354
|
state.agentBubbleId = id;
|
|
355
355
|
state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
|
|
356
356
|
cb.onTranscript(state.transcript);
|
|
357
|
-
|
|
357
|
+
const seq = typeof msg.seq === "number" ? msg.seq : void 0;
|
|
358
|
+
cb.onAgentTurnStart(seq);
|
|
358
359
|
setState(state, "agent_speaking", cb);
|
|
359
360
|
return;
|
|
360
361
|
}
|
|
@@ -368,10 +369,13 @@ function handleServerMessage(raw, state, cb) {
|
|
|
368
369
|
cb.onTranscript(state.transcript);
|
|
369
370
|
return;
|
|
370
371
|
}
|
|
371
|
-
case "agent_turn_end":
|
|
372
|
+
case "agent_turn_end": {
|
|
372
373
|
state.agentBubbleId = null;
|
|
374
|
+
const seq = typeof msg.seq === "number" ? msg.seq : void 0;
|
|
375
|
+
cb.onAgentTurnEnd(seq);
|
|
373
376
|
setState(state, "listening", cb);
|
|
374
377
|
return;
|
|
378
|
+
}
|
|
375
379
|
case "interrupt":
|
|
376
380
|
cb.onInterrupt();
|
|
377
381
|
return;
|
|
@@ -558,6 +562,76 @@ var dispatchClientToolCall = (send, tools, frame) => {
|
|
|
558
562
|
})();
|
|
559
563
|
};
|
|
560
564
|
|
|
565
|
+
// src/ClientMarksBuffer.ts
|
|
566
|
+
/**
 * Creates a buffer that collects per-agent-turn client timing marks and
 * reports them through `args.send` as `client_marks` frames.
 *
 * A turn is identified by its `seq`. The only mark computed here is
 * `client_mic_to_first_audible_ms`: the time between the first outbound audio
 * chunk recorded before the turn started and the first audible output
 * recorded while the turn was open.
 *
 * @param {object} args
 * @param {(frame: {type: "client_marks", seq: number, marks: object, clientNow: number}) => void} args.send
 *   Sink for finished frames.
 * @param {() => number} [args.now] Clock used for the marks; defaults to
 *   `performance.now()`.
 * @returns {{
 *   markFirstOutboundAudio: () => void,
 *   markFirstAudibleOutput: () => void,
 *   onAgentTurnStart: (seq: number) => void,
 *   onAgentTurnEnd: (seq: number) => void,
 *   flush: () => void
 * }}
 */
var createClientMarksBuffer = (args) => {
  const now = args.now ?? (() => performance.now());
  // Timestamp of the first outbound chunk seen since the last turn start
  // (or flush); handed to the next turn that starts.
  let pendingFirstOutboundAt = null;
  // seq -> { firstOutboundAt, firstAudibleAt } for turns that started but
  // have not been reported yet. Map iteration order is insertion order.
  const inFlight = /* @__PURE__ */ new Map();

  const sendMarks = (seq, marks) => {
    args.send({
      type: "client_marks",
      seq,
      marks,
      clientNow: Date.now()
    });
  };

  // Reports one open turn. The slot is removed BEFORE sending so that a
  // throwing `send` cannot leave a finished slot behind to be re-sent later.
  const emit = (seq, slot) => {
    inFlight.delete(seq);
    const marks = {};
    if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
      marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt;
    }
    sendMarks(seq, marks);
  };

  /** Records the time of the first outbound audio chunk; later calls are ignored until it is consumed. */
  const markFirstOutboundAudio = () => {
    if (pendingFirstOutboundAt !== null) return;
    pendingFirstOutboundAt = now();
  };

  /** Stamps the most recently started open turn with its first audible output time (once per turn). */
  const markFirstAudibleOutput = () => {
    let target;
    for (const slot of inFlight.values()) {
      target = slot;
    }
    if (!target || target.firstAudibleAt !== null) return;
    target.firstAudibleAt = now();
  };

  /** Opens a slot for `seq`, consuming the pending first-outbound timestamp. */
  const onAgentTurnStart = (seq) => {
    inFlight.set(seq, {
      firstOutboundAt: pendingFirstOutboundAt,
      firstAudibleAt: null
    });
    pendingFirstOutboundAt = null;
  };

  /** Reports the turn `seq`; a turn that was never started is reported with empty marks. */
  const onAgentTurnEnd = (seq) => {
    const slot = inFlight.get(seq);
    if (!slot) {
      sendMarks(seq, {});
      return;
    }
    emit(seq, slot);
  };

  /**
   * Reports every open turn and clears all buffered state. Every slot is
   * attempted even if `send` throws for one of them; the first error is
   * rethrown once the buffer has been fully drained.
   */
  const flush = () => {
    let failed = false;
    let firstError;
    for (const [seq, slot] of [...inFlight]) {
      try {
        emit(seq, slot);
      } catch (err) {
        if (!failed) {
          failed = true;
          firstError = err;
        }
      }
    }
    pendingFirstOutboundAt = null;
    if (failed) throw firstError;
  };

  return {
    markFirstOutboundAudio,
    markFirstAudibleOutput,
    onAgentTurnStart,
    onAgentTurnEnd,
    flush
  };
};
|
|
634
|
+
|
|
561
635
|
// src/VoiceClient.ts
|
|
562
636
|
var BrowserVoiceClient = class {
|
|
563
637
|
constructor(args) {
|
|
@@ -621,7 +695,13 @@ var BrowserVoiceClient = class {
|
|
|
621
695
|
this.playback?.flush();
|
|
622
696
|
this.args.options.onInterrupt?.();
|
|
623
697
|
},
|
|
624
|
-
onAgentTurnStart: () =>
|
|
698
|
+
onAgentTurnStart: (seq) => {
|
|
699
|
+
if (typeof seq === "number") this.marks.onAgentTurnStart(seq);
|
|
700
|
+
this.args.options.onAgentTurnStart?.();
|
|
701
|
+
},
|
|
702
|
+
onAgentTurnEnd: (seq) => {
|
|
703
|
+
if (typeof seq === "number") this.marks.onAgentTurnEnd(seq);
|
|
704
|
+
},
|
|
625
705
|
onCallEnd: (reason) => this.teardown(reason),
|
|
626
706
|
onConnected: () => this.sendClientToolsRegister(),
|
|
627
707
|
onClientToolCall: (frame) => dispatchClientToolCall(
|
|
@@ -631,6 +711,7 @@ var BrowserVoiceClient = class {
|
|
|
631
711
|
)
|
|
632
712
|
});
|
|
633
713
|
} else {
|
|
714
|
+
this.marks.markFirstAudibleOutput();
|
|
634
715
|
this.playback?.enqueue(ev.data);
|
|
635
716
|
}
|
|
636
717
|
break;
|
|
@@ -649,6 +730,7 @@ var BrowserVoiceClient = class {
|
|
|
649
730
|
if (this.capture?.isCapturing()) return;
|
|
650
731
|
this.capture = createAudioCapture({
|
|
651
732
|
onChunk: (pcm) => {
|
|
733
|
+
this.marks.markFirstOutboundAudio();
|
|
652
734
|
this.rws?.send(pcm);
|
|
653
735
|
},
|
|
654
736
|
onVolume: (v) => {
|
|
@@ -669,6 +751,10 @@ var BrowserVoiceClient = class {
|
|
|
669
751
|
}
|
|
670
752
|
};
|
|
671
753
|
this.teardown = (reason) => {
|
|
754
|
+
try {
|
|
755
|
+
this.marks.flush();
|
|
756
|
+
} catch {
|
|
757
|
+
}
|
|
672
758
|
this.capture?.stop();
|
|
673
759
|
this.capture = null;
|
|
674
760
|
this.playback?.close();
|
|
@@ -694,6 +780,14 @@ var BrowserVoiceClient = class {
|
|
|
694
780
|
this.args = args;
|
|
695
781
|
this.proto = createProtocolState();
|
|
696
782
|
validateClientToolMap(args.options.clientTools);
|
|
783
|
+
this.marks = createClientMarksBuffer({
|
|
784
|
+
send: (frame) => {
|
|
785
|
+
try {
|
|
786
|
+
this.rws?.send(JSON.stringify(frame));
|
|
787
|
+
} catch {
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
});
|
|
697
791
|
}
|
|
698
792
|
// ---------------------------------------------------------------
|
|
699
793
|
// Call interface
|
|
@@ -742,6 +836,161 @@ var BrowserVoiceClient = class {
|
|
|
742
836
|
}
|
|
743
837
|
};
|
|
744
838
|
|
|
839
|
+
// src/webrtc/createWebRtcCall.ts
|
|
840
|
+
/**
 * Starts a voice call over WebRTC and resolves with a call handle once the
 * SDP offer/answer exchange has completed.
 *
 * Steps: create the peer connection and a hidden autoplay <audio> element,
 * acquire the microphone, open an ordered "control" data channel whose string
 * messages are fed to `handleServerMessage`, POST the SDP offer to the
 * signaling endpoint, apply the answer, then trickle ICE candidates to the
 * ICE endpoint.
 *
 * @param {object} opts
 * @param {string} opts.agentId Agent to call; also sent in the offer body.
 * @param {string} opts.apiBase Base URL used when no gateway is given.
 * @param {string} opts.token Auth token, sent as the `token` query parameter.
 * @param {string} [opts.webrtcGatewayBase] When non-empty, signaling goes to
 *   `${webrtcGatewayBase}/webrtc/{offer,ice}` instead of the agent route.
 * @param {(state: string) => void} [opts.onStateChange]
 * @param {(entries: Array) => void} [opts.onTranscript]
 * @param {(err: {code: string, message: string}) => void} [opts.onError]
 * @param {() => void} [opts.onInterrupt]
 * @param {() => void} [opts.onAgentTurnStart]
 * @param {() => void} [opts.onEnd]
 * @returns {Promise<{state: string, transcript: Array, isMuted: boolean,
 *   end: () => void, mute: () => void, unmute: () => void}>}
 * @throws Rethrows the microphone error, or the signaling error, after
 *   reporting it once through `opts.onError` and releasing all resources.
 */
async function createWebRtcCall(opts) {
  const proto = createProtocolState();
  let muted = false;
  let ended = false;
  let mic = null;
  let callId;
  // ICE candidates can be produced as soon as the local description is set,
  // i.e. before the server has answered. They are parked here and posted once
  // signaling has completed so none are lost.
  let signalingDone = false;
  const pendingCandidates = [];

  const fireState = (next) => {
    if (proto.state === next) return;
    proto.state = next;
    opts.onStateChange?.(next);
  };

  fireState("connecting");
  const pc = new RTCPeerConnection({
    iceServers: [{ urls: "stun:stun.l.google.com:19302" }]
  });
  const audioEl = document.createElement("audio");
  audioEl.autoplay = true;
  audioEl.style.display = "none";
  document.body.appendChild(audioEl);
  pc.ontrack = (event) => {
    audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track]);
  };

  // Best-effort release of everything this call owns; each step is isolated
  // so one failure cannot prevent the others from running.
  const releaseResources = () => {
    try {
      mic?.getTracks().forEach((t) => t.stop());
    } catch {
      // ignore: cleanup is best-effort
    }
    try {
      pc.close();
    } catch {
      // ignore: cleanup is best-effort
    }
    try {
      audioEl.remove();
    } catch {
      // ignore: cleanup is best-effort
    }
  };

  // Idempotent end-of-call path shared by end(), server call-end and
  // connection loss.
  const teardown = () => {
    if (ended) return;
    ended = true;
    releaseResources();
    fireState("ended");
    opts.onEnd?.();
  };

  const dispatch = (raw) => {
    handleServerMessage(raw, proto, {
      onState: fireState,
      onTranscript: (entries) => opts.onTranscript?.(entries),
      onError: (err) => opts.onError?.(err),
      onInterrupt: () => opts.onInterrupt?.(),
      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
      onAgentTurnEnd: () => {
      },
      onCallEnd: () => teardown(),
      onConnected: () => {
      },
      onClientToolCall: () => {
      }
    });
  };

  try {
    mic = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    const code = err instanceof DOMException && err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed";
    opts.onError?.({
      code,
      message: err instanceof Error ? err.message : "getUserMedia failed"
    });
    fireState("error");
    releaseResources();
    throw err;
  }
  for (const track of mic.getAudioTracks()) pc.addTrack(track, mic);

  const dc = pc.createDataChannel("control", { ordered: true });
  dc.onmessage = (e) => {
    if (typeof e.data === "string") dispatch(e.data);
  };
  dc.onerror = () => {
    opts.onError?.({ code: "socket_error", message: "control channel error" });
  };

  const gateway = opts.webrtcGatewayBase || "";
  const tokenQuery = `token=${encodeURIComponent(opts.token)}`;
  const signalingUrl = (leaf) => gateway ? `${gateway}/webrtc/${leaf}?${tokenQuery}` : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/${leaf}?${tokenQuery}`;
  const offerUrl = signalingUrl("offer");
  const iceUrl = signalingUrl("ice");

  const postCandidate = (candidate) => {
    // Fire-and-forget: a lost candidate POST must not fail the call.
    void fetch(iceUrl, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ callId, candidate })
    }).catch(() => {
    });
  };
  // Attached before setLocalDescription so early candidates are captured.
  pc.onicecandidate = (e) => {
    if (!e.candidate) return;
    if (!signalingDone) {
      pendingCandidates.push(e.candidate);
      return;
    }
    postCandidate(e.candidate);
  };

  // Reports a signaling failure and releases everything acquired so far.
  const failSignaling = (code, message) => {
    opts.onError?.({ code, message });
    fireState("error");
    releaseResources();
  };

  // Set when an HTTP-level failure was already reported, so the catch below
  // does not report the same failure a second time as "network_unreachable".
  let failureReported = false;
  try {
    await pc.setLocalDescription(await pc.createOffer());
    const offerRes = await fetch(offerUrl, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ sdp: pc.localDescription.sdp, type: "offer", agentId: opts.agentId })
    });
    if (!offerRes.ok) {
      const code = offerRes.status === 401 ? "unauthorized" : "server_error";
      failureReported = true;
      failSignaling(code, `signaling failed: HTTP ${offerRes.status}`);
      throw new Error(`webrtc offer failed: ${offerRes.status}`);
    }
    const body = await offerRes.json();
    callId = body.callId;
    await pc.setRemoteDescription({ type: "answer", sdp: body.sdp });
  } catch (err) {
    if (!ended && !failureReported) {
      failSignaling("network_unreachable", err instanceof Error ? err.message : "signaling failed");
    }
    throw err;
  }

  signalingDone = true;
  for (const candidate of pendingCandidates.splice(0)) postCandidate(candidate);

  pc.onconnectionstatechange = () => {
    const s = pc.connectionState;
    if (s === "connected") fireState("listening");
    if (s === "failed" || s === "disconnected") {
      opts.onError?.({ code: "socket_error", message: `webrtc connection ${s}` });
      teardown();
    }
    if (s === "closed" && !ended) teardown();
  };

  return {
    get state() {
      return proto.state;
    },
    get transcript() {
      return proto.transcript.slice();
    },
    get isMuted() {
      return muted;
    },
    end: () => teardown(),
    mute: () => {
      if (muted) return;
      muted = true;
      mic.getAudioTracks().forEach((t) => t.enabled = false);
    },
    unmute: () => {
      if (!muted) return;
      muted = false;
      mic.getAudioTracks().forEach((t) => t.enabled = true);
    }
  };
}
|
|
993
|
+
|
|
745
994
|
// src/browser.ts
|
|
746
995
|
var browserWsFactory = (url) => new globalThis.WebSocket(url);
|
|
747
996
|
var BrowserVoiceFactory = class {
|
|
@@ -757,21 +1006,42 @@ var BrowserVoiceFactory = class {
|
|
|
757
1006
|
context,
|
|
758
1007
|
metadata
|
|
759
1008
|
};
|
|
760
|
-
let
|
|
1009
|
+
let resolved;
|
|
761
1010
|
if (options.token) {
|
|
762
|
-
|
|
1011
|
+
resolved = { token: options.token, transport: "ws" };
|
|
763
1012
|
} else {
|
|
764
|
-
|
|
765
|
-
if (!
|
|
1013
|
+
const r = await this.config.fetchToken(fetchArgs);
|
|
1014
|
+
if (!r) {
|
|
766
1015
|
throw new Error("configureVoiceClient.fetchToken returned empty token");
|
|
767
1016
|
}
|
|
1017
|
+
resolved = typeof r === "string" ? { token: r, transport: "ws" } : r;
|
|
1018
|
+
if (!resolved.token) {
|
|
1019
|
+
throw new Error("configureVoiceClient.fetchToken returned an object without `token`");
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
if (resolved.transport === "webrtc") {
|
|
1023
|
+
return createWebRtcCall({
|
|
1024
|
+
agentId: options.agentId,
|
|
1025
|
+
apiBase: this.config.apiBase,
|
|
1026
|
+
token: resolved.token,
|
|
1027
|
+
webrtcGatewayBase: resolved.webrtcGatewayBase,
|
|
1028
|
+
onStateChange: options.onStateChange,
|
|
1029
|
+
onTranscript: options.onTranscript,
|
|
1030
|
+
onError: options.onError,
|
|
1031
|
+
// Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
|
|
1032
|
+
// from the server yet — use 'agent_ended' as placeholder. durationMs is
|
|
1033
|
+
// tracked at 0 until the followup lands (see spec Followups section).
|
|
1034
|
+
onEnd: options.onEnd ? () => options.onEnd({ reason: "agent_ended", durationMs: 0 }) : void 0,
|
|
1035
|
+
onInterrupt: options.onInterrupt,
|
|
1036
|
+
onAgentTurnStart: options.onAgentTurnStart
|
|
1037
|
+
});
|
|
768
1038
|
}
|
|
769
1039
|
const client = new BrowserVoiceClient({
|
|
770
1040
|
config: this.config,
|
|
771
1041
|
// Carry merged context/metadata through to startCall so server can
|
|
772
1042
|
// see what the SDK saw.
|
|
773
1043
|
options: { ...options, context, metadata },
|
|
774
|
-
token,
|
|
1044
|
+
token: resolved.token,
|
|
775
1045
|
wsFactory: browserWsFactory
|
|
776
1046
|
});
|
|
777
1047
|
await client.start();
|