@craftedxp/voice-js 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/browser.mjs CHANGED
@@ -354,7 +354,8 @@ function handleServerMessage(raw, state, cb) {
354
354
  state.agentBubbleId = id;
355
355
  state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
356
356
  cb.onTranscript(state.transcript);
357
- cb.onAgentTurnStart();
357
+ const seq = typeof msg.seq === "number" ? msg.seq : void 0;
358
+ cb.onAgentTurnStart(seq);
358
359
  setState(state, "agent_speaking", cb);
359
360
  return;
360
361
  }
@@ -368,10 +369,13 @@ function handleServerMessage(raw, state, cb) {
368
369
  cb.onTranscript(state.transcript);
369
370
  return;
370
371
  }
371
- case "agent_turn_end":
372
+ case "agent_turn_end": {
372
373
  state.agentBubbleId = null;
374
+ const seq = typeof msg.seq === "number" ? msg.seq : void 0;
375
+ cb.onAgentTurnEnd(seq);
373
376
  setState(state, "listening", cb);
374
377
  return;
378
+ }
375
379
  case "interrupt":
376
380
  cb.onInterrupt();
377
381
  return;
@@ -558,6 +562,76 @@ var dispatchClientToolCall = (send, tools, frame) => {
558
562
  })();
559
563
  };
560
564
 
565
+ // src/ClientMarksBuffer.ts
566
/**
 * Creates a buffer that collects client-side timing marks per agent turn and
 * emits them as `client_marks` frames through `args.send`.
 *
 * One slot is kept per turn `seq`. A slot records the timestamp of the first
 * outbound audio chunk seen before the turn started and the timestamp of the
 * first audible output seen while the turn was open. When the turn ends, the
 * frame `{ type: "client_marks", seq, marks, clientNow }` is sent; `marks`
 * contains `client_mic_to_first_audible_ms` only when both timestamps exist.
 *
 * @param {object} args
 * @param {(frame: object) => void} args.send - Receives each emitted frame.
 * @param {() => number} [args.now] - Clock used for the marks; defaults to
 *   `performance.now()`.
 * @returns {{
 *   markFirstOutboundAudio: () => void,
 *   markFirstAudibleOutput: () => void,
 *   onAgentTurnStart: (seq: number) => void,
 *   onAgentTurnEnd: (seq: number) => void,
 *   flush: () => void
 * }}
 */
var createClientMarksBuffer = (args) => {
  const now = args.now ?? (() => performance.now());
  // Timestamp of the first outbound chunk since the last turn start, or null.
  let pendingFirstOutboundAt = null;
  // seq -> { firstOutboundAt, firstAudibleAt, ended }
  const inFlight = /* @__PURE__ */ new Map();

  // Sends the frame for `seq` if its slot exists and has ended, then drops the slot.
  const tryEmit = (seq) => {
    const slot = inFlight.get(seq);
    if (!slot || !slot.ended) return;
    const marks = {};
    if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
      marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt;
    }
    args.send({
      type: "client_marks",
      seq,
      marks,
      clientNow: Date.now()
    });
    inFlight.delete(seq);
  };

  // Records the first outbound audio timestamp; later calls are ignored until
  // the next turn start (or flush) clears it.
  const markFirstOutboundAudio = () => {
    if (pendingFirstOutboundAt !== null) return;
    pendingFirstOutboundAt = now();
  };

  // Stamps the most recently started turn that has not ended yet, once.
  const markFirstAudibleOutput = () => {
    let target;
    for (const slot of inFlight.values()) {
      if (!slot.ended) {
        target = slot;
      }
    }
    if (!target || target.firstAudibleAt !== null) return;
    target.firstAudibleAt = now();
  };

  // Opens a slot for `seq`, consuming the pending outbound timestamp.
  const onAgentTurnStart = (seq) => {
    inFlight.set(seq, {
      firstOutboundAt: pendingFirstOutboundAt,
      firstAudibleAt: null,
      ended: false
    });
    pendingFirstOutboundAt = null;
  };

  // Closes the slot for `seq` and emits it. A turn end without a matching
  // start still emits a frame, with empty marks.
  const onAgentTurnEnd = (seq) => {
    const slot = inFlight.get(seq);
    if (!slot) {
      args.send({ type: "client_marks", seq, marks: {}, clientNow: Date.now() });
      return;
    }
    slot.ended = true;
    tryEmit(seq);
  };

  // Ends and emits every open turn. A failing `send` for one turn must not
  // prevent the remaining turns from being emitted or leave the pending
  // outbound mark behind, so each emit is isolated and the first error is
  // rethrown only after all turns were attempted.
  const flush = () => {
    let failed = false;
    let firstError;
    for (const seq of [...inFlight.keys()]) {
      const slot = inFlight.get(seq);
      if (!slot) continue;
      slot.ended = true;
      try {
        tryEmit(seq);
      } catch (err) {
        if (!failed) {
          failed = true;
          firstError = err;
        }
        // flush is terminal for the slot: drop it so it is not re-emitted.
        inFlight.delete(seq);
      }
    }
    pendingFirstOutboundAt = null;
    if (failed) throw firstError;
  };

  return {
    markFirstOutboundAudio,
    markFirstAudibleOutput,
    onAgentTurnStart,
    onAgentTurnEnd,
    flush
  };
};
634
+
561
635
  // src/VoiceClient.ts
562
636
  var BrowserVoiceClient = class {
563
637
  constructor(args) {
@@ -621,7 +695,13 @@ var BrowserVoiceClient = class {
621
695
  this.playback?.flush();
622
696
  this.args.options.onInterrupt?.();
623
697
  },
624
- onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
698
+ onAgentTurnStart: (seq) => {
699
+ if (typeof seq === "number") this.marks.onAgentTurnStart(seq);
700
+ this.args.options.onAgentTurnStart?.();
701
+ },
702
+ onAgentTurnEnd: (seq) => {
703
+ if (typeof seq === "number") this.marks.onAgentTurnEnd(seq);
704
+ },
625
705
  onCallEnd: (reason) => this.teardown(reason),
626
706
  onConnected: () => this.sendClientToolsRegister(),
627
707
  onClientToolCall: (frame) => dispatchClientToolCall(
@@ -631,6 +711,7 @@ var BrowserVoiceClient = class {
631
711
  )
632
712
  });
633
713
  } else {
714
+ this.marks.markFirstAudibleOutput();
634
715
  this.playback?.enqueue(ev.data);
635
716
  }
636
717
  break;
@@ -649,6 +730,7 @@ var BrowserVoiceClient = class {
649
730
  if (this.capture?.isCapturing()) return;
650
731
  this.capture = createAudioCapture({
651
732
  onChunk: (pcm) => {
733
+ this.marks.markFirstOutboundAudio();
652
734
  this.rws?.send(pcm);
653
735
  },
654
736
  onVolume: (v) => {
@@ -669,6 +751,10 @@ var BrowserVoiceClient = class {
669
751
  }
670
752
  };
671
753
  this.teardown = (reason) => {
754
+ try {
755
+ this.marks.flush();
756
+ } catch {
757
+ }
672
758
  this.capture?.stop();
673
759
  this.capture = null;
674
760
  this.playback?.close();
@@ -694,6 +780,14 @@ var BrowserVoiceClient = class {
694
780
  this.args = args;
695
781
  this.proto = createProtocolState();
696
782
  validateClientToolMap(args.options.clientTools);
783
+ this.marks = createClientMarksBuffer({
784
+ send: (frame) => {
785
+ try {
786
+ this.rws?.send(JSON.stringify(frame));
787
+ } catch {
788
+ }
789
+ }
790
+ });
697
791
  }
698
792
  // ---------------------------------------------------------------
699
793
  // Call interface
@@ -742,6 +836,161 @@ var BrowserVoiceClient = class {
742
836
  }
743
837
  };
744
838
 
839
+ // src/webrtc/createWebRtcCall.ts
840
/**
 * Starts a call over WebRTC: captures the microphone, opens a peer connection
 * with a "control" data channel, performs HTTP offer/answer signaling and
 * returns a handle for the running call.
 *
 * Signaling endpoints are `${webrtcGatewayBase}/webrtc/{offer,ice}` when a
 * gateway base is given, otherwise
 * `${apiBase}/v1/agents/{agentId}/webrtc/{offer,ice}`; the token is passed as
 * the `token` query parameter in both cases.
 *
 * @param {object} opts
 * @param {string} opts.agentId
 * @param {string} opts.apiBase
 * @param {string} opts.token
 * @param {string} [opts.webrtcGatewayBase]
 * @param {(state: string) => void} [opts.onStateChange]
 * @param {(entries: Array) => void} [opts.onTranscript]
 * @param {(err: {code: string, message: string}) => void} [opts.onError]
 * @param {() => void} [opts.onInterrupt]
 * @param {() => void} [opts.onAgentTurnStart]
 * @param {() => void} [opts.onEnd]
 * @returns {Promise<{state: string, transcript: Array, isMuted: boolean,
 *   end: () => void, mute: () => void, unmute: () => void}>}
 * @throws Rethrows the `getUserMedia` error or the signaling error after
 *   reporting it once through `opts.onError` and releasing all resources.
 */
async function createWebRtcCall(opts) {
  const proto = createProtocolState();
  let muted = false;
  let ended = false;
  let mic = null;

  const fireState = (next) => {
    if (proto.state === next) return;
    proto.state = next;
    opts.onStateChange?.(next);
  };

  fireState("connecting");
  const pc = new RTCPeerConnection({
    iceServers: [{ urls: "stun:stun.l.google.com:19302" }]
  });
  const audioEl = document.createElement("audio");
  audioEl.autoplay = true;
  audioEl.style.display = "none";
  document.body.appendChild(audioEl);
  pc.ontrack = (event) => {
    audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track]);
  };

  // Best-effort release of the mic, the peer connection and the <audio>
  // element. Each step is isolated so one failure cannot skip the others.
  const releaseResources = () => {
    try {
      mic?.getTracks().forEach((track) => track.stop());
    } catch {
      // ignore: cleanup is best-effort
    }
    try {
      pc.close();
    } catch {
      // ignore: cleanup is best-effort
    }
    try {
      audioEl.remove();
    } catch {
      // ignore: cleanup is best-effort
    }
  };

  // Ends the call exactly once. Declared before the handlers that reference it.
  const teardown = () => {
    if (ended) return;
    ended = true;
    releaseResources();
    fireState("ended");
    opts.onEnd?.();
  };

  // Reports a setup failure, moves to the "error" state and releases resources.
  const fail = (code, message) => {
    opts.onError?.({ code, message });
    fireState("error");
    releaseResources();
  };

  const dispatch = (raw) => {
    handleServerMessage(raw, proto, {
      onState: fireState,
      onTranscript: (entries) => opts.onTranscript?.(entries),
      onError: (err) => opts.onError?.(err),
      onInterrupt: () => opts.onInterrupt?.(),
      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
      onAgentTurnEnd: () => {
      },
      onCallEnd: () => teardown(),
      onConnected: () => {
      },
      onClientToolCall: () => {
      }
    });
  };

  try {
    mic = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    const code = err instanceof DOMException && err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed";
    fail(code, err instanceof Error ? err.message : "getUserMedia failed");
    throw err;
  }
  for (const track of mic.getAudioTracks()) pc.addTrack(track, mic);

  const dc = pc.createDataChannel("control", { ordered: true });
  dc.onmessage = (e) => {
    if (typeof e.data === "string") dispatch(e.data);
  };
  dc.onerror = () => {
    opts.onError?.({ code: "socket_error", message: "control channel error" });
  };

  const gateway = opts.webrtcGatewayBase || "";
  const signalingBase = gateway ? `${gateway}/webrtc` : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc`;
  const tokenQuery = `token=${encodeURIComponent(opts.token)}`;
  const offerUrl = `${signalingBase}/offer?${tokenQuery}`;
  const iceUrl = `${signalingBase}/ice?${tokenQuery}`;

  let callId;
  let signalingReady = false;
  const pendingCandidates = [];
  const sendCandidate = (candidate) => {
    void fetch(iceUrl, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ callId, candidate })
    }).catch(() => {
      // ignore: a lost trickle candidate is not fatal on its own
    });
  };
  // Attached BEFORE setLocalDescription: candidate gathering starts as soon as
  // the local description is set, and candidates produced while the offer
  // request is in flight would otherwise be lost. They are queued until the
  // answer has been applied and the callId is known.
  pc.onicecandidate = (e) => {
    if (!e.candidate) return;
    if (!signalingReady) {
      pendingCandidates.push(e.candidate);
      return;
    }
    sendCandidate(e.candidate);
  };

  // True once an HTTP-level failure has been reported, so the catch below
  // does not report the same failure a second time under a different code.
  let reported = false;
  try {
    // Inside the try so a failing offer also releases mic/pc/audio element.
    await pc.setLocalDescription(await pc.createOffer());
    const offerRes = await fetch(offerUrl, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ sdp: pc.localDescription.sdp, type: "offer", agentId: opts.agentId })
    });
    if (!offerRes.ok) {
      const code = offerRes.status === 401 ? "unauthorized" : "server_error";
      fail(code, `signaling failed: HTTP ${offerRes.status}`);
      reported = true;
      throw new Error(`webrtc offer failed: ${offerRes.status}`);
    }
    const body = await offerRes.json();
    callId = body.callId;
    await pc.setRemoteDescription({ type: "answer", sdp: body.sdp });
  } catch (err) {
    if (!reported && !ended) {
      fail("network_unreachable", err instanceof Error ? err.message : "signaling failed");
    }
    throw err;
  }

  signalingReady = true;
  for (const candidate of pendingCandidates.splice(0)) sendCandidate(candidate);

  pc.onconnectionstatechange = () => {
    const s = pc.connectionState;
    if (s === "connected") fireState("listening");
    if (s === "failed" || s === "disconnected") {
      opts.onError?.({ code: "socket_error", message: `webrtc connection ${s}` });
      teardown();
    }
    if (s === "closed" && !ended) teardown();
  };

  const setMicEnabled = (enabled) => {
    mic.getAudioTracks().forEach((track) => {
      track.enabled = enabled;
    });
  };

  return {
    get state() {
      return proto.state;
    },
    get transcript() {
      return proto.transcript.slice();
    },
    get isMuted() {
      return muted;
    },
    end: () => teardown(),
    mute: () => {
      if (muted) return;
      muted = true;
      setMicEnabled(false);
    },
    unmute: () => {
      if (!muted) return;
      muted = false;
      setMicEnabled(true);
    }
  };
}
993
+
745
994
  // src/browser.ts
746
995
  var browserWsFactory = (url) => new globalThis.WebSocket(url);
747
996
  var BrowserVoiceFactory = class {
@@ -757,21 +1006,42 @@ var BrowserVoiceFactory = class {
757
1006
  context,
758
1007
  metadata
759
1008
  };
760
- let token;
1009
+ let resolved;
761
1010
  if (options.token) {
762
- token = options.token;
1011
+ resolved = { token: options.token, transport: "ws" };
763
1012
  } else {
764
- token = await this.config.fetchToken(fetchArgs);
765
- if (!token) {
1013
+ const r = await this.config.fetchToken(fetchArgs);
1014
+ if (!r) {
766
1015
  throw new Error("configureVoiceClient.fetchToken returned empty token");
767
1016
  }
1017
+ resolved = typeof r === "string" ? { token: r, transport: "ws" } : r;
1018
+ if (!resolved.token) {
1019
+ throw new Error("configureVoiceClient.fetchToken returned an object without `token`");
1020
+ }
1021
+ }
1022
+ if (resolved.transport === "webrtc") {
1023
+ return createWebRtcCall({
1024
+ agentId: options.agentId,
1025
+ apiBase: this.config.apiBase,
1026
+ token: resolved.token,
1027
+ webrtcGatewayBase: resolved.webrtcGatewayBase,
1028
+ onStateChange: options.onStateChange,
1029
+ onTranscript: options.onTranscript,
1030
+ onError: options.onError,
1031
+ // Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
1032
+ // from the server yet — use 'agent_ended' as placeholder. durationMs is
1033
+ // tracked at 0 until the followup lands (see spec Followups section).
1034
+ onEnd: options.onEnd ? () => options.onEnd({ reason: "agent_ended", durationMs: 0 }) : void 0,
1035
+ onInterrupt: options.onInterrupt,
1036
+ onAgentTurnStart: options.onAgentTurnStart
1037
+ });
768
1038
  }
769
1039
  const client = new BrowserVoiceClient({
770
1040
  config: this.config,
771
1041
  // Carry merged context/metadata through to startCall so server can
772
1042
  // see what the SDK saw.
773
1043
  options: { ...options, context, metadata },
774
- token,
1044
+ token: resolved.token,
775
1045
  wsFactory: browserWsFactory
776
1046
  });
777
1047
  await client.start();