@craftedxp/voice-js 0.3.2 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/browser.mjs CHANGED
@@ -354,7 +354,8 @@ function handleServerMessage(raw, state, cb) {
  state.agentBubbleId = id;
  state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
  cb.onTranscript(state.transcript);
- cb.onAgentTurnStart();
+ const seq = typeof msg.seq === "number" ? msg.seq : void 0;
+ cb.onAgentTurnStart(seq);
  setState(state, "agent_speaking", cb);
  return;
  }
@@ -368,10 +369,13 @@ function handleServerMessage(raw, state, cb) {
  cb.onTranscript(state.transcript);
  return;
  }
- case "agent_turn_end":
+ case "agent_turn_end": {
  state.agentBubbleId = null;
+ const seq = typeof msg.seq === "number" ? msg.seq : void 0;
+ cb.onAgentTurnEnd(seq);
  setState(state, "listening", cb);
  return;
+ }
  case "interrupt":
  cb.onInterrupt();
  return;
@@ -558,6 +562,76 @@ var dispatchClientToolCall = (send, tools, frame) => {
  })();
  };
 
+ // src/ClientMarksBuffer.ts
+ var createClientMarksBuffer = (args) => {
+ const now = args.now ?? (() => performance.now());
+ let pendingFirstOutboundAt = null;
+ const inFlight = /* @__PURE__ */ new Map();
+ const tryEmit = (seq) => {
+ const slot = inFlight.get(seq);
+ if (!slot) return;
+ if (!slot.ended) return;
+ const marks = {};
+ if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
+ marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt;
+ }
+ args.send({
+ type: "client_marks",
+ seq,
+ marks,
+ clientNow: Date.now()
+ });
+ inFlight.delete(seq);
+ };
+ const markFirstOutboundAudio = () => {
+ if (pendingFirstOutboundAt !== null) return;
+ pendingFirstOutboundAt = now();
+ };
+ const markFirstAudibleOutput = () => {
+ let target;
+ for (const slot of inFlight.values()) {
+ if (!slot.ended) {
+ target = slot;
+ }
+ }
+ if (!target) return;
+ if (target.firstAudibleAt !== null) return;
+ target.firstAudibleAt = now();
+ };
+ const onAgentTurnStart = (seq) => {
+ inFlight.set(seq, {
+ firstOutboundAt: pendingFirstOutboundAt,
+ firstAudibleAt: null,
+ ended: false
+ });
+ pendingFirstOutboundAt = null;
+ };
+ const onAgentTurnEnd = (seq) => {
+ const slot = inFlight.get(seq);
+ if (!slot) {
+ args.send({ type: "client_marks", seq, marks: {}, clientNow: Date.now() });
+ return;
+ }
+ slot.ended = true;
+ tryEmit(seq);
+ };
+ const flush = () => {
+ for (const seq of [...inFlight.keys()]) {
+ const slot = inFlight.get(seq);
+ slot.ended = true;
+ tryEmit(seq);
+ }
+ pendingFirstOutboundAt = null;
+ };
+ return {
+ markFirstOutboundAudio,
+ markFirstAudibleOutput,
+ onAgentTurnStart,
+ onAgentTurnEnd,
+ flush
+ };
+ };
+
  // src/VoiceClient.ts
  var BrowserVoiceClient = class {
  constructor(args) {
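
A minimal sketch of how the new marks buffer behaves when driven by hand. The `send` stub, the turn sequence, and the literal seq value below are illustrative only, not part of the package:

// Sketch only: driving createClientMarksBuffer manually.
const sent = [];
const marks = createClientMarksBuffer({ send: (frame) => sent.push(frame) });

marks.markFirstOutboundAudio();   // first mic chunk leaves the client
marks.onAgentTurnStart(1);        // server opens agent turn seq=1
marks.markFirstAudibleOutput();   // first agent audio chunk reaches playback
marks.onAgentTurnEnd(1);          // turn closes, one client_marks frame is emitted

// sent[0] has the shape:
// { type: "client_marks", seq: 1,
//   marks: { client_mic_to_first_audible_ms: <number> }, clientNow: <epoch ms> }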
@@ -621,7 +695,13 @@ var BrowserVoiceClient = class {
  this.playback?.flush();
  this.args.options.onInterrupt?.();
  },
- onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
+ onAgentTurnStart: (seq) => {
+ if (typeof seq === "number") this.marks.onAgentTurnStart(seq);
+ this.args.options.onAgentTurnStart?.();
+ },
+ onAgentTurnEnd: (seq) => {
+ if (typeof seq === "number") this.marks.onAgentTurnEnd(seq);
+ },
  onCallEnd: (reason) => this.teardown(reason),
  onConnected: () => this.sendClientToolsRegister(),
  onClientToolCall: (frame) => dispatchClientToolCall(
@@ -631,6 +711,7 @@ var BrowserVoiceClient = class {
  )
  });
  } else {
+ this.marks.markFirstAudibleOutput();
  this.playback?.enqueue(ev.data);
  }
  break;
@@ -649,6 +730,7 @@ var BrowserVoiceClient = class {
  if (this.capture?.isCapturing()) return;
  this.capture = createAudioCapture({
  onChunk: (pcm) => {
+ this.marks.markFirstOutboundAudio();
  this.rws?.send(pcm);
  },
  onVolume: (v) => {
@@ -669,6 +751,10 @@ var BrowserVoiceClient = class {
  }
  };
  this.teardown = (reason) => {
+ try {
+ this.marks.flush();
+ } catch {
+ }
  this.capture?.stop();
  this.capture = null;
  this.playback?.close();
@@ -694,6 +780,14 @@ var BrowserVoiceClient = class {
  this.args = args;
  this.proto = createProtocolState();
  validateClientToolMap(args.options.clientTools);
+ this.marks = createClientMarksBuffer({
+ send: (frame) => {
+ try {
+ this.rws?.send(JSON.stringify(frame));
+ } catch {
+ }
+ }
+ });
  }
  // ---------------------------------------------------------------
  // Call interface
@@ -742,6 +836,179 @@ var BrowserVoiceClient = class {
  }
  };
 
+ // src/webrtc/createWebRtcCall.ts
+ async function createWebRtcCall(opts) {
+ validateClientToolMap(opts.clientTools);
+ const proto = createProtocolState();
+ let muted = false;
+ let ended = false;
+ const tools = opts.clientTools ?? {};
+ const sendControl = (frame) => {
+ if (dc?.readyState !== "open") return;
+ try {
+ dc.send(JSON.stringify(frame));
+ } catch {
+ }
+ };
+ const fireState = (next) => {
+ if (proto.state === next) return;
+ proto.state = next;
+ opts.onStateChange?.(next);
+ };
+ const dispatch = (raw) => {
+ handleServerMessage(raw, proto, {
+ onState: fireState,
+ onTranscript: (entries) => opts.onTranscript?.(entries),
+ onError: (err) => opts.onError?.(err),
+ onInterrupt: () => opts.onInterrupt?.(),
+ onAgentTurnStart: () => opts.onAgentTurnStart?.(),
+ onAgentTurnEnd: () => {
+ },
+ onCallEnd: () => teardown(),
+ onConnected: () => {
+ if (Object.keys(tools).length > 0) {
+ sendControl(buildRegisterFrame(tools));
+ }
+ },
+ onClientToolCall: (frame) => {
+ dispatchClientToolCall(sendControl, tools, frame);
+ }
+ });
+ };
+ fireState("connecting");
+ const pc = new RTCPeerConnection({
+ iceServers: [{ urls: "stun:stun.l.google.com:19302" }]
+ });
+ const audioEl = document.createElement("audio");
+ audioEl.autoplay = true;
+ audioEl.style.display = "none";
+ document.body.appendChild(audioEl);
+ pc.ontrack = (event) => {
+ audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track]);
+ };
+ let mic;
+ try {
+ mic = await navigator.mediaDevices.getUserMedia({ audio: true });
+ } catch (err) {
+ const code = err instanceof DOMException && err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed";
+ opts.onError?.({
+ code,
+ message: err instanceof Error ? err.message : "getUserMedia failed"
+ });
+ fireState("error");
+ pc.close();
+ audioEl.remove();
+ throw err;
+ }
+ for (const track of mic.getAudioTracks()) pc.addTrack(track, mic);
+ const dc = pc.createDataChannel("control", { ordered: true });
+ dc.onmessage = (e) => {
+ if (typeof e.data === "string") dispatch(e.data);
+ };
+ dc.onerror = () => {
+ opts.onError?.({ code: "socket_error", message: "control channel error" });
+ };
+ dc.onopen = () => {
+ if (Object.keys(tools).length > 0) {
+ sendControl(buildRegisterFrame(tools));
+ }
+ };
+ const gateway = opts.webrtcGatewayBase || "";
+ const offerUrl = gateway ? `${gateway}/webrtc/offer?token=${encodeURIComponent(opts.token)}` : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/offer?token=${encodeURIComponent(opts.token)}`;
+ const iceUrl = gateway ? `${gateway}/webrtc/ice?token=${encodeURIComponent(opts.token)}` : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/ice?token=${encodeURIComponent(opts.token)}`;
+ await pc.setLocalDescription(await pc.createOffer());
+ let callId;
+ try {
+ const offerRes = await fetch(offerUrl, {
+ method: "POST",
+ headers: { "content-type": "application/json" },
+ body: JSON.stringify({ sdp: pc.localDescription.sdp, type: "offer", agentId: opts.agentId })
+ });
+ if (!offerRes.ok) {
+ const code = offerRes.status === 401 ? "unauthorized" : "server_error";
+ opts.onError?.({ code, message: `signaling failed: HTTP ${offerRes.status}` });
+ fireState("error");
+ mic.getTracks().forEach((t) => t.stop());
+ pc.close();
+ audioEl.remove();
+ throw new Error(`webrtc offer failed: ${offerRes.status}`);
+ }
+ const body = await offerRes.json();
+ callId = body.callId;
+ await pc.setRemoteDescription({ type: "answer", sdp: body.sdp });
+ } catch (err) {
+ if (!ended) {
+ opts.onError?.({
+ code: "network_unreachable",
+ message: err instanceof Error ? err.message : "signaling failed"
+ });
+ fireState("error");
+ mic.getTracks().forEach((t) => t.stop());
+ pc.close();
+ audioEl.remove();
+ }
+ throw err;
+ }
+ pc.onicecandidate = (e) => {
+ if (!e.candidate) return;
+ void fetch(iceUrl, {
+ method: "POST",
+ headers: { "content-type": "application/json" },
+ body: JSON.stringify({ callId, candidate: e.candidate })
+ }).catch(() => {
+ });
+ };
+ pc.onconnectionstatechange = () => {
+ const s = pc.connectionState;
+ if (s === "connected") fireState("listening");
+ if (s === "failed" || s === "disconnected") {
+ opts.onError?.({ code: "socket_error", message: `webrtc connection ${s}` });
+ teardown();
+ }
+ if (s === "closed" && !ended) teardown();
+ };
+ const teardown = () => {
+ if (ended) return;
+ ended = true;
+ try {
+ mic.getTracks().forEach((t) => t.stop());
+ } catch {
+ }
+ try {
+ pc.close();
+ } catch {
+ }
+ try {
+ audioEl.remove();
+ } catch {
+ }
+ fireState("ended");
+ opts.onEnd?.();
+ };
+ return {
+ get state() {
+ return proto.state;
+ },
+ get transcript() {
+ return proto.transcript.slice();
+ },
+ get isMuted() {
+ return muted;
+ },
+ end: () => teardown(),
+ mute: () => {
+ if (muted) return;
+ muted = true;
+ mic.getAudioTracks().forEach((t) => t.enabled = false);
+ },
+ unmute: () => {
+ if (!muted) return;
+ muted = false;
+ mic.getAudioTracks().forEach((t) => t.enabled = true);
+ }
+ };
+ }
+
  // src/browser.ts
  var browserWsFactory = (url) => new globalThis.WebSocket(url);
  var BrowserVoiceFactory = class {
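
For reference, a hedged sketch of the handle that createWebRtcCall resolves to. All option values are placeholders, and whether the function is exported for direct use (rather than only reached through the factory below) is not shown in this diff:

// Sketch only: shape of the WebRTC call handle.
const call = await createWebRtcCall({
  agentId: "agent_123",             // placeholder
  apiBase: "https://api.example",   // placeholder
  token: "ephemeral-token",         // placeholder
  onTranscript: (entries) => console.log(entries.length, "entries"),
  onError: (err) => console.error(err.code, err.message)
});

console.log(call.state);     // protocol state, "listening" once the connection is up
call.mute();                 // disables local audio tracks
call.unmute();               // re-enables them
console.log(call.isMuted);   // false
call.end();                  // stops mic, closes the peer connection, fires onEnd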
@@ -757,21 +1024,43 @@ var BrowserVoiceFactory = class {
  context,
  metadata
  };
- let token;
+ let resolved;
  if (options.token) {
- token = options.token;
+ resolved = { token: options.token, transport: "ws" };
  } else {
- token = await this.config.fetchToken(fetchArgs);
- if (!token) {
+ const r = await this.config.fetchToken(fetchArgs);
+ if (!r) {
  throw new Error("configureVoiceClient.fetchToken returned empty token");
  }
+ resolved = typeof r === "string" ? { token: r, transport: "ws" } : r;
+ if (!resolved.token) {
+ throw new Error("configureVoiceClient.fetchToken returned an object without `token`");
+ }
+ }
+ if (resolved.transport === "webrtc") {
+ return createWebRtcCall({
+ agentId: options.agentId,
+ apiBase: this.config.apiBase,
+ token: resolved.token,
+ webrtcGatewayBase: resolved.webrtcGatewayBase,
+ onStateChange: options.onStateChange,
+ onTranscript: options.onTranscript,
+ onError: options.onError,
+ // Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
+ // from the server yet — use 'agent_ended' as placeholder. durationMs is
+ // tracked at 0 until the followup lands (see spec Followups section).
+ onEnd: options.onEnd ? () => options.onEnd({ reason: "agent_ended", durationMs: 0 }) : void 0,
+ onInterrupt: options.onInterrupt,
+ onAgentTurnStart: options.onAgentTurnStart,
+ clientTools: options.clientTools
+ });
  }
  const client = new BrowserVoiceClient({
  config: this.config,
  // Carry merged context/metadata through to startCall so server can
  // see what the SDK saw.
  options: { ...options, context, metadata },
- token,
+ token: resolved.token,
  wsFactory: browserWsFactory
  });
  await client.start();
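
The practical upshot of the last hunk: fetchToken may now resolve to an object instead of a bare string, and an object with transport "webrtc" routes the call through createWebRtcCall. A hedged sketch of a consumer opting in; the exact configureVoiceClient signature is not shown in this diff, and the backend URL and response field names are placeholders:

// Sketch only: a fetchToken that opts calls into the WebRTC path.
configureVoiceClient({
  apiBase: "https://api.example",
  fetchToken: async (args) => {
    const res = await fetch("/my-backend/voice-token", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(args)
    });
    const body = await res.json();
    // Returning a plain string keeps the old WebSocket behaviour.
    // Returning an object lets the backend choose the transport per call.
    return {
      token: body.token,
      transport: "webrtc",
      webrtcGatewayBase: body.gatewayBase // optional; falls back to apiBase routes
    };
  }
});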