@craftedxp/voice-js 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONSUMING.md +1 -1
- package/README.md +8 -7
- package/dist/browser.d.mts +20 -4
- package/dist/browser.d.ts +334 -250
- package/dist/browser.js +818 -541
- package/dist/browser.js.map +1 -1
- package/dist/browser.mjs +278 -9
- package/dist/browser.mjs.map +1 -1
- package/dist/embed.iife.js +1094 -4
- package/dist/node.d.mts +20 -4
- package/dist/node.d.ts +324 -247
- package/dist/node.js +480 -369
- package/dist/node.js.map +1 -1
- package/dist/node.mjs +103 -6
- package/dist/node.mjs.map +1 -1
- package/package.json +1 -1
package/dist/browser.mjs CHANGED
```diff
@@ -354,7 +354,8 @@ function handleServerMessage(raw, state, cb) {
       state.agentBubbleId = id;
       state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
       cb.onTranscript(state.transcript);
-
+      const seq = typeof msg.seq === "number" ? msg.seq : void 0;
+      cb.onAgentTurnStart(seq);
       setState(state, "agent_speaking", cb);
       return;
     }
@@ -368,10 +369,13 @@ function handleServerMessage(raw, state, cb) {
       cb.onTranscript(state.transcript);
       return;
     }
-    case "agent_turn_end":
+    case "agent_turn_end": {
       state.agentBubbleId = null;
+      const seq = typeof msg.seq === "number" ? msg.seq : void 0;
+      cb.onAgentTurnEnd(seq);
       setState(state, "listening", cb);
       return;
+    }
     case "interrupt":
       cb.onInterrupt();
       return;
```
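Both turn frames now carry an optional numeric `seq` so the client can correlate a turn's start with its end (and, further down, with latency marks). A sketch of the shapes the parser now accepts; nothing beyond `type` and `seq` is visible in this diff, so treat any other fields as unknown:

```ts
// Frame shapes inferred from the parsing code above; only `type` and the
// optional numeric `seq` are actually read here.
type AgentTurnStartFrame = { type: "agent_turn_start"; seq?: number };
type AgentTurnEndFrame = { type: "agent_turn_end"; seq?: number };

// The handler coerces anything non-numeric to undefined before invoking the
// callback, so consumers can rely on receiving `number | undefined`:
const seqOf = (msg: { seq?: unknown }): number | undefined =>
  typeof msg.seq === "number" ? msg.seq : undefined;
```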
```diff
@@ -447,7 +451,6 @@ function handleServerMessage(raw, state, cb) {
 }
 var setState = (state, next, cb) => {
   if (state.state === next) return;
-  state.state = next;
   cb.onState(next);
 };
 var upsertUserPartial = (state, text, isFinal) => {
```
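Note the contract change here: `setState` still de-duplicates transitions, but the `state.state = next` assignment is gone, so the `onState` consumer now owns the mutation. The new WebRTC path later in this diff does exactly that in `fireState`, and presumably the WS client's `onState` handler was given the same responsibility in code outside these hunks. A minimal sketch of the pattern:

```ts
// Sketch of the new division of labour: setState only guards and notifies,
// and the onState consumer records the transition (as fireState does in the
// createWebRtcCall hunk further down).
type ProtocolState = { state: string };
const makeOnState = (proto: ProtocolState, notify?: (s: string) => void) =>
  (next: string) => {
    proto.state = next; // the mutation that setState no longer performs
    notify?.(next);
  };
```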
```diff
@@ -559,6 +562,76 @@ var dispatchClientToolCall = (send, tools, frame) => {
   })();
 };
 
+// src/ClientMarksBuffer.ts
+var createClientMarksBuffer = (args) => {
+  const now = args.now ?? (() => performance.now());
+  let pendingFirstOutboundAt = null;
+  const inFlight = /* @__PURE__ */ new Map();
+  const tryEmit = (seq) => {
+    const slot = inFlight.get(seq);
+    if (!slot) return;
+    if (!slot.ended) return;
+    const marks = {};
+    if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
+      marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt;
+    }
+    args.send({
+      type: "client_marks",
+      seq,
+      marks,
+      clientNow: Date.now()
+    });
+    inFlight.delete(seq);
+  };
+  const markFirstOutboundAudio = () => {
+    if (pendingFirstOutboundAt !== null) return;
+    pendingFirstOutboundAt = now();
+  };
+  const markFirstAudibleOutput = () => {
+    let target;
+    for (const slot of inFlight.values()) {
+      if (!slot.ended) {
+        target = slot;
+      }
+    }
+    if (!target) return;
+    if (target.firstAudibleAt !== null) return;
+    target.firstAudibleAt = now();
+  };
+  const onAgentTurnStart = (seq) => {
+    inFlight.set(seq, {
+      firstOutboundAt: pendingFirstOutboundAt,
+      firstAudibleAt: null,
+      ended: false
+    });
+    pendingFirstOutboundAt = null;
+  };
+  const onAgentTurnEnd = (seq) => {
+    const slot = inFlight.get(seq);
+    if (!slot) {
+      args.send({ type: "client_marks", seq, marks: {}, clientNow: Date.now() });
+      return;
+    }
+    slot.ended = true;
+    tryEmit(seq);
+  };
+  const flush = () => {
+    for (const seq of [...inFlight.keys()]) {
+      const slot = inFlight.get(seq);
+      slot.ended = true;
+      tryEmit(seq);
+    }
+    pendingFirstOutboundAt = null;
+  };
+  return {
+    markFirstOutboundAudio,
+    markFirstAudibleOutput,
+    onAgentTurnStart,
+    onAgentTurnEnd,
+    flush
+  };
+};
+
 // src/VoiceClient.ts
 var BrowserVoiceClient = class {
   constructor(args) {
```
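The new `ClientMarksBuffer` is a small per-turn latency meter: `markFirstOutboundAudio` stamps the first mic chunk sent since the last turn, `onAgentTurnStart(seq)` claims that stamp for turn `seq`, `markFirstAudibleOutput` stamps the first agent chunk handed to playback for the newest open turn, and `onAgentTurnEnd(seq)` (or `flush()` during teardown) emits the measurement upstream. What goes over the wire, read directly off `args.send(...)` above; how the server consumes it is not visible in this diff:

```ts
// Shape of the frame the buffer emits.
interface ClientMarksFrame {
  type: "client_marks";
  seq: number; // matches the seq from agent_turn_start / agent_turn_end
  marks: {
    // present only when both a first outbound mic chunk and a first audible
    // agent chunk were stamped for this turn
    client_mic_to_first_audible_ms?: number;
  };
  clientNow: number; // Date.now() at emit time
}
```

A hedged unit-test style sketch using the injectable clock (`args.now`); the timings and the `send` sink are made up, and it assumes the factory is importable in a test context:

```ts
const sent: ClientMarksFrame[] = [];
let t = 0;
const marks = createClientMarksBuffer({ send: (f: ClientMarksFrame) => sent.push(f), now: () => t });

t = 100; marks.markFirstOutboundAudio();  // first mic chunk leaves the client
t = 350; marks.onAgentTurnStart(1);       // turn 1 opens and claims the pending stamp
t = 420; marks.markFirstAudibleOutput();  // first agent audio chunk reaches playback
t = 900; marks.onAgentTurnEnd(1);         // turn 1 closes -> frame emitted
// sent[0].marks.client_mic_to_first_audible_ms === 320  (420 - 100)
```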
```diff
@@ -622,7 +695,13 @@ var BrowserVoiceClient = class {
         this.playback?.flush();
         this.args.options.onInterrupt?.();
       },
-      onAgentTurnStart: () =>
+      onAgentTurnStart: (seq) => {
+        if (typeof seq === "number") this.marks.onAgentTurnStart(seq);
+        this.args.options.onAgentTurnStart?.();
+      },
+      onAgentTurnEnd: (seq) => {
+        if (typeof seq === "number") this.marks.onAgentTurnEnd(seq);
+      },
       onCallEnd: (reason) => this.teardown(reason),
       onConnected: () => this.sendClientToolsRegister(),
       onClientToolCall: (frame) => dispatchClientToolCall(
@@ -632,6 +711,7 @@ var BrowserVoiceClient = class {
         )
       });
     } else {
+      this.marks.markFirstAudibleOutput();
       this.playback?.enqueue(ev.data);
     }
     break;
@@ -650,6 +730,7 @@ var BrowserVoiceClient = class {
     if (this.capture?.isCapturing()) return;
     this.capture = createAudioCapture({
       onChunk: (pcm) => {
+        this.marks.markFirstOutboundAudio();
         this.rws?.send(pcm);
       },
       onVolume: (v) => {
@@ -670,6 +751,10 @@ var BrowserVoiceClient = class {
       }
     };
     this.teardown = (reason) => {
+      try {
+        this.marks.flush();
+      } catch {
+      }
       this.capture?.stop();
       this.capture = null;
       this.playback?.close();
@@ -695,6 +780,14 @@ var BrowserVoiceClient = class {
     this.args = args;
     this.proto = createProtocolState();
     validateClientToolMap(args.options.clientTools);
+    this.marks = createClientMarksBuffer({
+      send: (frame) => {
+        try {
+          this.rws?.send(JSON.stringify(frame));
+        } catch {
+        }
+      }
+    });
   }
   // ---------------------------------------------------------------
   // Call interface
```
```diff
@@ -743,6 +836,161 @@ var BrowserVoiceClient = class {
   }
 };
 
+// src/webrtc/createWebRtcCall.ts
+async function createWebRtcCall(opts) {
+  const proto = createProtocolState();
+  let muted = false;
+  let ended = false;
+  const fireState = (next) => {
+    if (proto.state === next) return;
+    proto.state = next;
+    opts.onStateChange?.(next);
+  };
+  const dispatch = (raw) => {
+    handleServerMessage(raw, proto, {
+      onState: fireState,
+      onTranscript: (entries) => opts.onTranscript?.(entries),
+      onError: (err) => opts.onError?.(err),
+      onInterrupt: () => opts.onInterrupt?.(),
+      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
+      onAgentTurnEnd: () => {
+      },
+      onCallEnd: () => teardown(),
+      onConnected: () => {
+      },
+      onClientToolCall: () => {
+      }
+    });
+  };
+  fireState("connecting");
+  const pc = new RTCPeerConnection({
+    iceServers: [{ urls: "stun:stun.l.google.com:19302" }]
+  });
+  const audioEl = document.createElement("audio");
+  audioEl.autoplay = true;
+  audioEl.style.display = "none";
+  document.body.appendChild(audioEl);
+  pc.ontrack = (event) => {
+    audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track]);
+  };
+  let mic;
+  try {
+    mic = await navigator.mediaDevices.getUserMedia({ audio: true });
+  } catch (err) {
+    const code = err instanceof DOMException && err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed";
+    opts.onError?.({
+      code,
+      message: err instanceof Error ? err.message : "getUserMedia failed"
+    });
+    fireState("error");
+    pc.close();
+    audioEl.remove();
+    throw err;
+  }
+  for (const track of mic.getAudioTracks()) pc.addTrack(track, mic);
+  const dc = pc.createDataChannel("control", { ordered: true });
+  dc.onmessage = (e) => {
+    if (typeof e.data === "string") dispatch(e.data);
+  };
+  dc.onerror = () => {
+    opts.onError?.({ code: "socket_error", message: "control channel error" });
+  };
+  const gateway = opts.webrtcGatewayBase || "";
+  const offerUrl = gateway ? `${gateway}/webrtc/offer?token=${encodeURIComponent(opts.token)}` : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/offer?token=${encodeURIComponent(opts.token)}`;
+  const iceUrl = gateway ? `${gateway}/webrtc/ice?token=${encodeURIComponent(opts.token)}` : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/ice?token=${encodeURIComponent(opts.token)}`;
+  await pc.setLocalDescription(await pc.createOffer());
+  let callId;
+  try {
+    const offerRes = await fetch(offerUrl, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ sdp: pc.localDescription.sdp, type: "offer", agentId: opts.agentId })
+    });
+    if (!offerRes.ok) {
+      const code = offerRes.status === 401 ? "unauthorized" : "server_error";
+      opts.onError?.({ code, message: `signaling failed: HTTP ${offerRes.status}` });
+      fireState("error");
+      mic.getTracks().forEach((t) => t.stop());
+      pc.close();
+      audioEl.remove();
+      throw new Error(`webrtc offer failed: ${offerRes.status}`);
+    }
+    const body = await offerRes.json();
+    callId = body.callId;
+    await pc.setRemoteDescription({ type: "answer", sdp: body.sdp });
+  } catch (err) {
+    if (!ended) {
+      opts.onError?.({
+        code: "network_unreachable",
+        message: err instanceof Error ? err.message : "signaling failed"
+      });
+      fireState("error");
+      mic.getTracks().forEach((t) => t.stop());
+      pc.close();
+      audioEl.remove();
+    }
+    throw err;
+  }
+  pc.onicecandidate = (e) => {
+    if (!e.candidate) return;
+    void fetch(iceUrl, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ callId, candidate: e.candidate })
+    }).catch(() => {
+    });
+  };
+  pc.onconnectionstatechange = () => {
+    const s = pc.connectionState;
+    if (s === "connected") fireState("listening");
+    if (s === "failed" || s === "disconnected") {
+      opts.onError?.({ code: "socket_error", message: `webrtc connection ${s}` });
+      teardown();
+    }
+    if (s === "closed" && !ended) teardown();
+  };
+  const teardown = () => {
+    if (ended) return;
+    ended = true;
+    try {
+      mic.getTracks().forEach((t) => t.stop());
+    } catch {
+    }
+    try {
+      pc.close();
+    } catch {
+    }
+    try {
+      audioEl.remove();
+    } catch {
+    }
+    fireState("ended");
+    opts.onEnd?.();
+  };
+  return {
+    get state() {
+      return proto.state;
+    },
+    get transcript() {
+      return proto.transcript.slice();
+    },
+    get isMuted() {
+      return muted;
+    },
+    end: () => teardown(),
+    mute: () => {
+      if (muted) return;
+      muted = true;
+      mic.getAudioTracks().forEach((t) => t.enabled = false);
+    },
+    unmute: () => {
+      if (!muted) return;
+      muted = false;
+      mic.getAudioTracks().forEach((t) => t.enabled = true);
+    }
+  };
+}
+
 // src/browser.ts
 var browserWsFactory = (url) => new globalThis.WebSocket(url);
 var BrowserVoiceFactory = class {
```
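The 0.4.0 build adds a second transport: `createWebRtcCall` sends the mic as an `RTCPeerConnection` audio track, plays agent audio through a hidden autoplaying `<audio>` element, and reuses `handleServerMessage` for control frames arriving over an ordered "control" data channel. Note that this path wires `onAgentTurnEnd` to a no-op, so the `client_marks` reporting above is effectively WS-only for now. Signaling is two plain HTTP POSTs; the hypothetical helper below mirrors the offer call, where the URL and body come straight from the code and the response fields (`callId`, `sdp`) are exactly what the code destructures. Everything else about the server is an assumption:

```ts
interface OfferResponse { callId: string; sdp: string }

// Hedged sketch of the offer leg of the signaling exchange.
async function postOffer(apiBase: string, agentId: string, token: string, sdp: string): Promise<OfferResponse> {
  const url = `${apiBase}/v1/agents/${encodeURIComponent(agentId)}/webrtc/offer?token=${encodeURIComponent(token)}`;
  const res = await fetch(url, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({ sdp, type: "offer", agentId })
  });
  // The client maps 401 to "unauthorized" and any other non-OK status to "server_error".
  if (!res.ok) throw new Error(`webrtc offer failed: ${res.status}`);
  return res.json();
}
```

ICE candidates are trickled the same way to `/webrtc/ice` with `{ callId, candidate }`, and failures there are deliberately swallowed (`.catch(() => {})`), so candidate delivery is best-effort.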
```diff
@@ -758,21 +1006,42 @@ var BrowserVoiceFactory = class {
       context,
       metadata
     };
-    let
+    let resolved;
     if (options.token) {
-
+      resolved = { token: options.token, transport: "ws" };
     } else {
-
-      if (!
+      const r = await this.config.fetchToken(fetchArgs);
+      if (!r) {
         throw new Error("configureVoiceClient.fetchToken returned empty token");
       }
+      resolved = typeof r === "string" ? { token: r, transport: "ws" } : r;
+      if (!resolved.token) {
+        throw new Error("configureVoiceClient.fetchToken returned an object without `token`");
+      }
+    }
+    if (resolved.transport === "webrtc") {
+      return createWebRtcCall({
+        agentId: options.agentId,
+        apiBase: this.config.apiBase,
+        token: resolved.token,
+        webrtcGatewayBase: resolved.webrtcGatewayBase,
+        onStateChange: options.onStateChange,
+        onTranscript: options.onTranscript,
+        onError: options.onError,
+        // Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
+        // from the server yet — use 'agent_ended' as placeholder. durationMs is
+        // tracked at 0 until the followup lands (see spec Followups section).
+        onEnd: options.onEnd ? () => options.onEnd({ reason: "agent_ended", durationMs: 0 }) : void 0,
+        onInterrupt: options.onInterrupt,
+        onAgentTurnStart: options.onAgentTurnStart
+      });
     }
     const client = new BrowserVoiceClient({
       config: this.config,
       // Carry merged context/metadata through to startCall so server can
       // see what the SDK saw.
       options: { ...options, context, metadata },
-      token,
+      token: resolved.token,
       wsFactory: browserWsFactory
     });
     await client.start();
```