@craftedxp/voice-js 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/browser.mjs CHANGED
@@ -354,7 +354,8 @@ function handleServerMessage(raw, state, cb) {
354
354
  state.agentBubbleId = id;
355
355
  state.transcript = [...state.transcript, { id, role: "agent", text: "" }];
356
356
  cb.onTranscript(state.transcript);
357
- cb.onAgentTurnStart();
357
+ const seq = typeof msg.seq === "number" ? msg.seq : void 0;
358
+ cb.onAgentTurnStart(seq);
358
359
  setState(state, "agent_speaking", cb);
359
360
  return;
360
361
  }
@@ -368,10 +369,13 @@ function handleServerMessage(raw, state, cb) {
368
369
  cb.onTranscript(state.transcript);
369
370
  return;
370
371
  }
371
- case "agent_turn_end":
372
+ case "agent_turn_end": {
372
373
  state.agentBubbleId = null;
374
+ const seq = typeof msg.seq === "number" ? msg.seq : void 0;
375
+ cb.onAgentTurnEnd(seq);
373
376
  setState(state, "listening", cb);
374
377
  return;
378
+ }
375
379
  case "interrupt":
376
380
  cb.onInterrupt();
377
381
  return;
@@ -447,7 +451,6 @@ function handleServerMessage(raw, state, cb) {
447
451
  }
448
452
  var setState = (state, next, cb) => {
449
453
  if (state.state === next) return;
450
- state.state = next;
451
454
  cb.onState(next);
452
455
  };
453
456
  var upsertUserPartial = (state, text, isFinal) => {
@@ -559,6 +562,76 @@ var dispatchClientToolCall = (send, tools, frame) => {
559
562
  })();
560
563
  };
561
564
 
565
+ // src/ClientMarksBuffer.ts
566
/**
 * Creates a buffer that tracks per-agent-turn client timing marks and reports
 * them through `args.send` as `client_marks` frames.
 *
 * Lifecycle of one turn:
 *   1. `markFirstOutboundAudio()` records the time of the first call made
 *      since the previous turn started (later calls are ignored).
 *   2. `onAgentTurnStart(seq)` opens a slot for `seq` and moves the pending
 *      outbound timestamp into it.
 *   3. `markFirstAudibleOutput()` records the first call's time on the most
 *      recently opened slot.
 *   4. `onAgentTurnEnd(seq)` sends the frame for `seq` and drops the slot.
 *
 * @param {object} args
 * @param {(frame: object) => void} args.send - Receives each `client_marks` frame.
 * @param {() => number} [args.now] - Clock used for the marks; defaults to
 *   `performance.now()`.
 * @returns {{
 *   markFirstOutboundAudio: () => void,
 *   markFirstAudibleOutput: () => void,
 *   onAgentTurnStart: (seq: number) => void,
 *   onAgentTurnEnd: (seq: number) => void,
 *   flush: () => void
 * }}
 */
var createClientMarksBuffer = (args) => {
  const now = args.now ?? (() => performance.now());
  let pendingFirstOutboundAt = null;
  // seq -> { firstOutboundAt, firstAudibleAt }. A slot lives in the map only
  // while its turn is open; it is removed the moment its frame is emitted.
  const inFlight = /* @__PURE__ */ new Map();

  // Single place that builds the wire frame so every path emits the same shape.
  const sendMarks = (seq, marks) => {
    args.send({
      type: "client_marks",
      seq,
      marks,
      clientNow: Date.now()
    });
  };

  const emitSlot = (seq) => {
    const slot = inFlight.get(seq);
    if (!slot) return;
    // Remove the slot BEFORE sending: if `send` throws, the buffer must not
    // keep a finished turn around (it would leak and be re-sent by flush()).
    inFlight.delete(seq);
    const marks = {};
    if (slot.firstOutboundAt !== null && slot.firstAudibleAt !== null) {
      marks.client_mic_to_first_audible_ms = slot.firstAudibleAt - slot.firstOutboundAt;
    }
    sendMarks(seq, marks);
  };

  const markFirstOutboundAudio = () => {
    if (pendingFirstOutboundAt !== null) return;
    pendingFirstOutboundAt = now();
  };

  const markFirstAudibleOutput = () => {
    // Map iteration follows insertion order, so the last value is the most
    // recently opened turn.
    let target;
    for (const slot of inFlight.values()) {
      target = slot;
    }
    if (!target || target.firstAudibleAt !== null) return;
    target.firstAudibleAt = now();
  };

  const onAgentTurnStart = (seq) => {
    inFlight.set(seq, {
      firstOutboundAt: pendingFirstOutboundAt,
      firstAudibleAt: null
    });
    pendingFirstOutboundAt = null;
  };

  const onAgentTurnEnd = (seq) => {
    if (inFlight.has(seq)) {
      emitSlot(seq);
      return;
    }
    // No start was seen for this turn: still report it, with no marks.
    sendMarks(seq, {});
  };

  /**
   * Emits a frame for every open turn and resets the pending outbound mark.
   * All slots are drained even if `send` throws; the first error is rethrown
   * once the buffer is back in a clean state.
   */
  const flush = () => {
    let failed = false;
    let firstError;
    for (const seq of [...inFlight.keys()]) {
      try {
        emitSlot(seq);
      } catch (err) {
        if (!failed) {
          failed = true;
          firstError = err;
        }
      }
    }
    pendingFirstOutboundAt = null;
    if (failed) throw firstError;
  };

  return {
    markFirstOutboundAudio,
    markFirstAudibleOutput,
    onAgentTurnStart,
    onAgentTurnEnd,
    flush
  };
};
634
+
562
635
  // src/VoiceClient.ts
563
636
  var BrowserVoiceClient = class {
564
637
  constructor(args) {
@@ -622,7 +695,13 @@ var BrowserVoiceClient = class {
622
695
  this.playback?.flush();
623
696
  this.args.options.onInterrupt?.();
624
697
  },
625
- onAgentTurnStart: () => this.args.options.onAgentTurnStart?.(),
698
+ onAgentTurnStart: (seq) => {
699
+ if (typeof seq === "number") this.marks.onAgentTurnStart(seq);
700
+ this.args.options.onAgentTurnStart?.();
701
+ },
702
+ onAgentTurnEnd: (seq) => {
703
+ if (typeof seq === "number") this.marks.onAgentTurnEnd(seq);
704
+ },
626
705
  onCallEnd: (reason) => this.teardown(reason),
627
706
  onConnected: () => this.sendClientToolsRegister(),
628
707
  onClientToolCall: (frame) => dispatchClientToolCall(
@@ -632,6 +711,7 @@ var BrowserVoiceClient = class {
632
711
  )
633
712
  });
634
713
  } else {
714
+ this.marks.markFirstAudibleOutput();
635
715
  this.playback?.enqueue(ev.data);
636
716
  }
637
717
  break;
@@ -650,6 +730,7 @@ var BrowserVoiceClient = class {
650
730
  if (this.capture?.isCapturing()) return;
651
731
  this.capture = createAudioCapture({
652
732
  onChunk: (pcm) => {
733
+ this.marks.markFirstOutboundAudio();
653
734
  this.rws?.send(pcm);
654
735
  },
655
736
  onVolume: (v) => {
@@ -670,6 +751,10 @@ var BrowserVoiceClient = class {
670
751
  }
671
752
  };
672
753
  this.teardown = (reason) => {
754
+ try {
755
+ this.marks.flush();
756
+ } catch {
757
+ }
673
758
  this.capture?.stop();
674
759
  this.capture = null;
675
760
  this.playback?.close();
@@ -695,6 +780,14 @@ var BrowserVoiceClient = class {
695
780
  this.args = args;
696
781
  this.proto = createProtocolState();
697
782
  validateClientToolMap(args.options.clientTools);
783
+ this.marks = createClientMarksBuffer({
784
+ send: (frame) => {
785
+ try {
786
+ this.rws?.send(JSON.stringify(frame));
787
+ } catch {
788
+ }
789
+ }
790
+ });
698
791
  }
699
792
  // ---------------------------------------------------------------
700
793
  // Call interface
@@ -743,6 +836,161 @@ var BrowserVoiceClient = class {
743
836
  }
744
837
  };
745
838
 
839
+ // src/webrtc/createWebRtcCall.ts
840
/**
 * Starts a voice call over WebRTC: captures the microphone, opens a peer
 * connection with a "control" data channel, performs HTTP offer/answer
 * signaling, and returns a call handle.
 *
 * @param {object} opts
 * @param {string} opts.agentId - Agent to call; sent in the offer body and used in the default signaling URLs.
 * @param {string} opts.apiBase - Base URL for signaling when no gateway is given.
 * @param {string} opts.token - Call token, passed as the `token` query parameter.
 * @param {string} [opts.webrtcGatewayBase] - When non-empty, signaling goes to `<gateway>/webrtc/...` instead of `apiBase`.
 * @param {(state: string) => void} [opts.onStateChange]
 * @param {(entries: Array) => void} [opts.onTranscript]
 * @param {(err: {code: string, message: string}) => void} [opts.onError]
 * @param {() => void} [opts.onEnd] - Called once when an established call is torn down.
 * @param {() => void} [opts.onInterrupt]
 * @param {() => void} [opts.onAgentTurnStart]
 * @returns {Promise<{state: string, transcript: Array, isMuted: boolean, end: () => void, mute: () => void, unmute: () => void}>}
 * @throws Rethrows the underlying error when microphone access or signaling
 *   fails. In that case `onError` has been called exactly once, the state is
 *   "error", and the mic, peer connection and audio element are released.
 */
async function createWebRtcCall(opts) {
  const proto = createProtocolState();
  let muted = false;
  let ended = false;
  let setupFailed = false;
  let mic;
  let callId;
  // ICE candidates can be produced as soon as the local description is set,
  // i.e. before the server has answered. They are held here until signaling
  // completes and are then posted in order.
  let signalingReady = false;
  const queuedCandidates = [];

  const fireState = (next) => {
    if (proto.state === next) return;
    proto.state = next;
    opts.onStateChange?.(next);
  };

  fireState("connecting");
  const pc = new RTCPeerConnection({
    iceServers: [{ urls: "stun:stun.l.google.com:19302" }]
  });
  const audioEl = document.createElement("audio");
  audioEl.autoplay = true;
  audioEl.style.display = "none";
  document.body.appendChild(audioEl);

  // Best-effort release of everything this call owns. Each step is isolated
  // so one failing release cannot prevent the others.
  const releaseResources = () => {
    try {
      mic?.getTracks().forEach((t) => t.stop());
    } catch {
      // best-effort cleanup
    }
    try {
      pc.close();
    } catch {
      // best-effort cleanup
    }
    try {
      audioEl.remove();
    } catch {
      // best-effort cleanup
    }
  };

  // Reports a setup failure once: later calls are ignored so a single failure
  // never produces two onError callbacks.
  const failSetup = (code, message) => {
    if (setupFailed) return;
    setupFailed = true;
    queuedCandidates.length = 0;
    opts.onError?.({ code, message });
    fireState("error");
    releaseResources();
  };

  const teardown = () => {
    if (ended) return;
    ended = true;
    releaseResources();
    fireState("ended");
    opts.onEnd?.();
  };

  const dispatch = (raw) => {
    handleServerMessage(raw, proto, {
      onState: fireState,
      onTranscript: (entries) => opts.onTranscript?.(entries),
      onError: (err) => opts.onError?.(err),
      onInterrupt: () => opts.onInterrupt?.(),
      onAgentTurnStart: () => opts.onAgentTurnStart?.(),
      onAgentTurnEnd: () => {
      },
      onCallEnd: () => teardown(),
      onConnected: () => {
      },
      onClientToolCall: () => {
      }
    });
  };

  pc.ontrack = (event) => {
    audioEl.srcObject = event.streams[0] ?? new MediaStream([event.track]);
  };

  try {
    mic = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    const code = err instanceof DOMException && err.name === "NotAllowedError" ? "mic_denied" : "mic_start_failed";
    failSetup(code, err instanceof Error ? err.message : "getUserMedia failed");
    throw err;
  }
  for (const track of mic.getAudioTracks()) pc.addTrack(track, mic);

  const dc = pc.createDataChannel("control", { ordered: true });
  dc.onmessage = (e) => {
    if (typeof e.data === "string") dispatch(e.data);
  };
  dc.onerror = () => {
    opts.onError?.({ code: "socket_error", message: "control channel error" });
  };

  const gateway = opts.webrtcGatewayBase || "";
  const tokenQuery = `token=${encodeURIComponent(opts.token)}`;
  const signalingUrl = (endpoint) => gateway
    ? `${gateway}/webrtc/${endpoint}?${tokenQuery}`
    : `${opts.apiBase}/v1/agents/${encodeURIComponent(opts.agentId)}/webrtc/${endpoint}?${tokenQuery}`;
  const offerUrl = signalingUrl("offer");
  const iceUrl = signalingUrl("ice");

  const postCandidate = (candidate) => {
    // Fire-and-forget: a lost candidate POST must not break the call.
    void fetch(iceUrl, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ callId, candidate })
    }).catch(() => {
    });
  };

  // Registered BEFORE setLocalDescription so no early candidate is missed.
  pc.onicecandidate = (e) => {
    if (!e.candidate) return;
    if (!signalingReady) {
      queuedCandidates.push(e.candidate);
      return;
    }
    postCandidate(e.candidate);
  };

  try {
    await pc.setLocalDescription(await pc.createOffer());
    const offerRes = await fetch(offerUrl, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ sdp: pc.localDescription.sdp, type: "offer", agentId: opts.agentId })
    });
    if (!offerRes.ok) {
      const code = offerRes.status === 401 ? "unauthorized" : "server_error";
      failSetup(code, `signaling failed: HTTP ${offerRes.status}`);
      throw new Error(`webrtc offer failed: ${offerRes.status}`);
    }
    const answer = await offerRes.json();
    callId = answer.callId;
    await pc.setRemoteDescription({ type: "answer", sdp: answer.sdp });
  } catch (err) {
    // No-op when the HTTP-status branch above already reported the failure.
    if (!ended) {
      failSetup("network_unreachable", err instanceof Error ? err.message : "signaling failed");
    }
    throw err;
  }

  // The server now knows this call: release the candidates gathered meanwhile.
  signalingReady = true;
  for (const candidate of queuedCandidates.splice(0)) postCandidate(candidate);

  pc.onconnectionstatechange = () => {
    const s = pc.connectionState;
    if (s === "connected") fireState("listening");
    if (s === "failed" || s === "disconnected") {
      opts.onError?.({ code: "socket_error", message: `webrtc connection ${s}` });
      teardown();
    }
    if (s === "closed" && !ended) teardown();
  };

  return {
    get state() {
      return proto.state;
    },
    get transcript() {
      return proto.transcript.slice();
    },
    get isMuted() {
      return muted;
    },
    end: () => teardown(),
    mute: () => {
      if (muted) return;
      muted = true;
      mic.getAudioTracks().forEach((t) => t.enabled = false);
    },
    unmute: () => {
      if (!muted) return;
      muted = false;
      mic.getAudioTracks().forEach((t) => t.enabled = true);
    }
  };
}
993
+
746
994
  // src/browser.ts
747
995
  var browserWsFactory = (url) => new globalThis.WebSocket(url);
748
996
  var BrowserVoiceFactory = class {
@@ -758,21 +1006,42 @@ var BrowserVoiceFactory = class {
758
1006
  context,
759
1007
  metadata
760
1008
  };
761
- let token;
1009
+ let resolved;
762
1010
  if (options.token) {
763
- token = options.token;
1011
+ resolved = { token: options.token, transport: "ws" };
764
1012
  } else {
765
- token = await this.config.fetchToken(fetchArgs);
766
- if (!token) {
1013
+ const r = await this.config.fetchToken(fetchArgs);
1014
+ if (!r) {
767
1015
  throw new Error("configureVoiceClient.fetchToken returned empty token");
768
1016
  }
1017
+ resolved = typeof r === "string" ? { token: r, transport: "ws" } : r;
1018
+ if (!resolved.token) {
1019
+ throw new Error("configureVoiceClient.fetchToken returned an object without `token`");
1020
+ }
1021
+ }
1022
+ if (resolved.transport === "webrtc") {
1023
+ return createWebRtcCall({
1024
+ agentId: options.agentId,
1025
+ apiBase: this.config.apiBase,
1026
+ token: resolved.token,
1027
+ webrtcGatewayBase: resolved.webrtcGatewayBase,
1028
+ onStateChange: options.onStateChange,
1029
+ onTranscript: options.onTranscript,
1030
+ onError: options.onError,
1031
+ // Synthesise a minimal CallEndEvent. WebRTC doesn't carry an end reason
1032
+ // from the server yet — use 'agent_ended' as placeholder. durationMs is
1033
+ // tracked at 0 until the followup lands (see spec Followups section).
1034
+ onEnd: options.onEnd ? () => options.onEnd({ reason: "agent_ended", durationMs: 0 }) : void 0,
1035
+ onInterrupt: options.onInterrupt,
1036
+ onAgentTurnStart: options.onAgentTurnStart
1037
+ });
769
1038
  }
770
1039
  const client = new BrowserVoiceClient({
771
1040
  config: this.config,
772
1041
  // Carry merged context/metadata through to startCall so server can
773
1042
  // see what the SDK saw.
774
1043
  options: { ...options, context, metadata },
775
- token,
1044
+ token: resolved.token,
776
1045
  wsFactory: browserWsFactory
777
1046
  });
778
1047
  await client.start();