@speechos/core 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -35,8 +35,6 @@ const defaultConfig = {
  apiKey: "",
  userId: "",
  host: DEFAULT_HOST,
- position: "bottom-center",
- zIndex: 999999,
  debug: false
  };
  /**
@@ -44,31 +42,19 @@ const defaultConfig = {
  * @param userConfig - User-provided configuration
  * @returns Validated and merged configuration
  */
- function validateConfig(userConfig = {}) {
+ function validateConfig(userConfig) {
  if (!userConfig.apiKey) throw new Error("SpeechOS requires an apiKey. Get one from your team dashboard at /a/<team-slug>/.");
- const config = {
- ...defaultConfig,
- ...userConfig
+ return {
+ apiKey: userConfig.apiKey,
+ userId: userConfig.userId ?? defaultConfig.userId,
+ host: userConfig.host ?? defaultConfig.host,
+ debug: userConfig.debug ?? defaultConfig.debug
  };
- const validPositions = [
- "bottom-center",
- "bottom-right",
- "bottom-left"
- ];
- if (!validPositions.includes(config.position)) {
- console.warn(`Invalid position "${config.position}". Using default "bottom-center".`);
- config.position = "bottom-center";
- }
- if (typeof config.zIndex !== "number" || config.zIndex < 0) {
- console.warn(`Invalid zIndex "${config.zIndex}". Using default ${defaultConfig.zIndex}.`);
- config.zIndex = defaultConfig.zIndex;
- }
- return config;
  }
  /**
  * Current active configuration (singleton)
  */
- let currentConfig = defaultConfig;
+ let currentConfig = { ...defaultConfig };
  /**
  * Get the current configuration
  */
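`validateConfig` now builds its result field by field instead of spreading user input over the defaults, so unrecognized keys (including the removed `position` and `zIndex` options) are silently dropped, and the argument is no longer optional. A minimal sketch of the resulting shape, with the interface name assumed for illustration only:

```ts
// Hypothetical type for the object validateConfig returns in 0.2.3.
interface SpeechOSConfig {
  apiKey: string;  // required; validateConfig throws without it
  userId: string;  // defaults to ""
  host: string;    // defaults to DEFAULT_HOST
  debug: boolean;  // defaults to false
}

// 0.2.0 carried arbitrary keys through via { ...defaultConfig, ...userConfig };
// 0.2.3 keeps only the four fields above, so a leftover option such as
// validateConfig({ apiKey: "...", position: "bottom-right" })
// now yields a config with no position property at all.
```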
@@ -98,6 +84,28 @@ function updateUserId(userId) {
  userId
  };
  }
+ /**
+ * LocalStorage key for anonymous ID persistence
+ */
+ const ANONYMOUS_ID_KEY = "speechos_anonymous_id";
+ /**
+ * Get or generate a persistent anonymous ID for Mixpanel tracking.
+ *
+ * This ID is stored in localStorage to persist across sessions,
+ * allowing consistent anonymous user tracking without identifying
+ * the account owner's customers.
+ *
+ * @returns A UUID string for anonymous identification
+ */
+ function getAnonymousId() {
+ if (typeof localStorage === "undefined") return crypto.randomUUID();
+ let anonymousId = localStorage.getItem(ANONYMOUS_ID_KEY);
+ if (!anonymousId) {
+ anonymousId = crypto.randomUUID();
+ localStorage.setItem(ANONYMOUS_ID_KEY, anonymousId);
+ }
+ return anonymousId;
+ }

  //#endregion
  //#region src/events.ts
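The new `getAnonymousId` is a lazily initialized localStorage singleton: the first call generates a UUID and persists it, later calls return the stored value, and environments without `localStorage` (SSR, workers) fall back to a fresh, non-persistent UUID per call. A typed restatement of the same helper for reference:

```ts
const ANONYMOUS_ID_KEY = "speechos_anonymous_id";

function getAnonymousId(): string {
  // No localStorage (SSR/worker): the ID cannot persist,
  // so each call produces a new UUID.
  if (typeof localStorage === "undefined") return crypto.randomUUID();
  let id = localStorage.getItem(ANONYMOUS_ID_KEY);
  if (!id) {
    id = crypto.randomUUID();
    localStorage.setItem(ANONYMOUS_ID_KEY, id);
  }
  return id;
}
```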
@@ -191,33 +199,38 @@ const initialState = {
  var StateManager = class {
  state;
  subscribers = /* @__PURE__ */ new Set();
+ /** Cached immutable snapshot for useSyncExternalStore compatibility */
+ snapshot;
  constructor(initialState$1) {
  this.state = { ...initialState$1 };
+ this.snapshot = Object.freeze({ ...this.state });
  }
  /**
- * Get the current state (returns a copy to prevent mutations)
+ * Get the current state snapshot (returns a stable reference for React)
+ * This returns an immutable frozen object that only changes when setState is called.
  */
  getState() {
- return { ...this.state };
+ return this.snapshot;
  }
  /**
  * Update state with partial values
  * @param partial - Partial state to merge with current state
  */
  setState(partial) {
- const prevState = { ...this.state };
+ const prevState = this.snapshot;
  this.state = {
  ...this.state,
  ...partial
  };
+ this.snapshot = Object.freeze({ ...this.state });
  this.subscribers.forEach((callback) => {
  try {
- callback(this.state, prevState);
+ callback(this.snapshot, prevState);
  } catch (error) {
  console.error("Error in state change callback:", error);
  }
  });
- events.emit("state:change", { state: this.state });
+ events.emit("state:change", { state: this.snapshot });
  }
  /**
  * Subscribe to state changes
@@ -234,7 +247,17 @@ var StateManager = class {
  * Reset state to initial values
  */
  reset() {
- this.setState(initialState);
+ const prevState = this.snapshot;
+ this.state = { ...initialState };
+ this.snapshot = Object.freeze({ ...this.state });
+ this.subscribers.forEach((callback) => {
+ try {
+ callback(this.snapshot, prevState);
+ } catch (error) {
+ console.error("Error in state change callback:", error);
+ }
+ });
+ events.emit("state:change", { state: this.snapshot });
  }
  /**
  * Show the widget
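The snapshot mechanics matter for React integration: `useSyncExternalStore` re-renders whenever `getSnapshot()` returns a referentially different value, so the old `getState()` (a fresh copy per call) would trigger an endless render loop, while the frozen `snapshot` field only changes identity inside `setState` and `reset`. A sketch of the consuming side, assuming `subscribe(callback)` returns an unsubscribe function (its body sits outside this diff):

```ts
import { useSyncExternalStore } from "react";

// Hypothetical hook over a StateManager instance; the stable snapshot
// identity is what keeps this from re-rendering on every call.
function useSpeechOSState() {
  return useSyncExternalStore(
    (onChange) => stateManager.subscribe(onChange),
    () => stateManager.getState() // same frozen object until setState/reset
  );
}
```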
@@ -369,12 +392,15 @@ function createStateManager(initial) {

  //#endregion
  //#region src/livekit.ts
- const MESSAGE_TYPE_REQUEST_TRANSCRIPT = "request_transcript";
- const MESSAGE_TYPE_TRANSCRIPT = "transcript";
- const MESSAGE_TYPE_EDIT_TEXT = "edit_text";
- const MESSAGE_TYPE_EDITED_TEXT = "edited_text";
- const MESSAGE_TYPE_ERROR = "error";
+ const MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 = "request_transcript";
+ const MESSAGE_TYPE_TRANSCRIPT$1 = "transcript";
+ const MESSAGE_TYPE_EDIT_TEXT$1 = "edit_text";
+ const MESSAGE_TYPE_EDITED_TEXT$1 = "edited_text";
+ const MESSAGE_TYPE_EXECUTE_COMMAND$1 = "execute_command";
+ const MESSAGE_TYPE_COMMAND_RESULT$1 = "command_result";
+ const MESSAGE_TYPE_ERROR$1 = "error";
  const TOPIC_SPEECHOS = "speechos";
+ const TOKEN_CACHE_TTL_MS = 4 * 60 * 1e3;
  /**
  * A deferred promise with timeout support.
  * Encapsulates resolve/reject/timeout in a single object for cleaner async handling.
@@ -438,53 +464,116 @@ var LiveKitManager = class {
  room = null;
  tokenData = null;
  micTrack = null;
+ cachedTokenData = null;
+ tokenCacheTimestamp = null;
+ tokenPrefetchPromise = null;
+ tokenRefreshTimer = null;
+ autoRefreshEnabled = false;
  pendingTranscript = null;
  pendingEditText = null;
+ pendingCommand = null;
  pendingTrackSubscribed = null;
- preWarmPromise = null;
  editOriginalText = null;
+ sessionSettings = {};
  /**
- * Pre-warm resources for faster connection
- * Call this when user shows intent (e.g., expands widget)
- * Only fetches token - mic permission is requested when user clicks Dictate
+ * Check if the cached token is still valid (within TTL)
  */
- async preWarm() {
- if (this.tokenData || this.preWarmPromise || this.room?.state === "connected") {
- const config$1 = getConfig();
- if (config$1.debug) console.log("[SpeechOS] Pre-warm skipped - token already available");
- return;
- }
+ isCachedTokenValid() {
+ if (!this.cachedTokenData || !this.tokenCacheTimestamp) return false;
+ const age = Date.now() - this.tokenCacheTimestamp;
+ return age < TOKEN_CACHE_TTL_MS;
+ }
+ /**
+ * Pre-fetch a LiveKit token for later use
+ * Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
+ * If a prefetch is already in progress, returns the existing promise.
+ * If a valid cached token exists, returns it immediately.
+ */
+ async prefetchToken() {
  const config = getConfig();
- if (config.debug) console.log("[SpeechOS] Pre-warming: fetching token...");
- this.preWarmPromise = (async () => {
- try {
- await this.fetchToken();
- if (config.debug) console.log("[SpeechOS] Pre-warm complete - token ready");
- } catch (error) {
- if (config.debug) console.warn("[SpeechOS] Pre-warm failed:", error);
- this.preWarmPromise = null;
- }
- })();
- await this.preWarmPromise;
+ if (this.isCachedTokenValid() && this.cachedTokenData) {
+ if (config.debug) console.log("[SpeechOS] Using cached token (prefetch hit)");
+ return this.cachedTokenData;
+ }
+ if (this.tokenPrefetchPromise) {
+ if (config.debug) console.log("[SpeechOS] Prefetch already in progress, awaiting...");
+ return this.tokenPrefetchPromise;
+ }
+ if (config.debug) console.log("[SpeechOS] Starting token prefetch...");
+ this.tokenPrefetchPromise = this.fetchTokenFromServer().then((data) => {
+ this.cachedTokenData = data;
+ this.tokenCacheTimestamp = Date.now();
+ this.tokenPrefetchPromise = null;
+ return data;
+ }).catch((error) => {
+ this.tokenPrefetchPromise = null;
+ throw error;
+ });
+ return this.tokenPrefetchPromise;
  }
  /**
  * Fetch a LiveKit token from the backend
+ * Uses cached token if valid, otherwise fetches a fresh one.
+ * Includes language settings and user vocabulary which are stored in the VoiceSession.
  */
  async fetchToken() {
+ const config = getConfig();
+ if (this.isCachedTokenValid() && this.cachedTokenData) {
+ if (config.debug) console.log("[SpeechOS] Using cached token");
+ this.tokenData = this.cachedTokenData;
+ return this.cachedTokenData;
+ }
+ if (this.tokenPrefetchPromise) {
+ if (config.debug) console.log("[SpeechOS] Waiting for prefetch to complete...");
+ const data$1 = await this.tokenPrefetchPromise;
+ this.tokenData = data$1;
+ return data$1;
+ }
+ const data = await this.fetchTokenFromServer();
+ this.cachedTokenData = data;
+ this.tokenCacheTimestamp = Date.now();
+ this.tokenData = data;
+ return data;
+ }
+ /**
+ * Internal method to fetch a fresh token from the server
+ */
+ async fetchTokenFromServer() {
  const config = getConfig();
  const url = `${config.host}/livekit/api/token/`;
- if (config.debug) console.log("[SpeechOS] Fetching LiveKit token from:", url);
+ const settings = this.sessionSettings;
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
+ const smartFormat = settings.smartFormat ?? true;
+ const vocabulary = settings.vocabulary ?? [];
+ const snippets = settings.snippets ?? [];
+ if (config.debug) {
+ console.log("[SpeechOS] Fetching LiveKit token from:", url);
+ console.log("[SpeechOS] Session settings:", {
+ inputLanguage,
+ outputLanguage,
+ smartFormat,
+ snippetsCount: snippets.length,
+ vocabularyCount: vocabulary.length
+ });
+ }
  const response = await fetch(url, {
  method: "POST",
  headers: {
  "Content-Type": "application/json",
  ...config.apiKey ? { Authorization: `Api-Key ${config.apiKey}` } : {}
  },
- body: JSON.stringify({ user_id: config.userId || null })
+ body: JSON.stringify({
+ user_id: config.userId || null,
+ input_language: inputLanguage,
+ output_language: outputLanguage,
+ smart_format: smartFormat,
+ custom_vocabulary: vocabulary,
+ custom_snippets: snippets
+ })
  });
  if (!response.ok) throw new Error(`Failed to fetch LiveKit token: ${response.status} ${response.statusText}`);
  const data = await response.json();
- this.tokenData = data;
  if (config.debug) console.log("[SpeechOS] LiveKit token received:", {
  room: data.room,
  identity: data.identity,
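Token handling now separates the raw fetch (`fetchTokenFromServer`) from a read-through cache (`fetchToken`/`prefetchToken`) with a four-minute TTL and a shared in-flight promise, so concurrent callers trigger a single request. The same pattern in isolation, with hypothetical names:

```ts
// Generic read-through cache sketch mirroring the TTL + in-flight dedup logic above.
const TOKEN_CACHE_TTL_MS = 4 * 60 * 1000;

class TokenCache<T> {
  private data: T | null = null;
  private fetchedAt = 0;
  private inflight: Promise<T> | null = null;

  constructor(private fetcher: () => Promise<T>) {}

  async get(): Promise<T> {
    const fresh = this.data !== null && Date.now() - this.fetchedAt < TOKEN_CACHE_TTL_MS;
    if (fresh) return this.data as T;
    if (!this.inflight) {
      // All callers that arrive while a fetch is pending share this promise.
      this.inflight = this.fetcher()
        .then((d) => { this.data = d; this.fetchedAt = Date.now(); return d; })
        .finally(() => { this.inflight = null; });
    }
    return this.inflight;
  }
}
```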
@@ -497,8 +586,7 @@ var LiveKitManager = class {
  */
  async connect() {
  const config = getConfig();
- if (!this.tokenData) await this.fetchToken();
- else if (config.debug) console.log("[SpeechOS] Using pre-fetched token");
+ await this.fetchToken();
  if (!this.tokenData) throw new Error("No token available for LiveKit connection");
  this.room = new livekit_client.Room({
  adaptiveStream: true,
@@ -562,7 +650,7 @@ var LiveKitManager = class {
  try {
  const message = JSON.parse(new TextDecoder().decode(data));
  if (config.debug) console.log("[SpeechOS] Data received:", message);
- if (message.type === MESSAGE_TYPE_TRANSCRIPT) {
+ if (message.type === MESSAGE_TYPE_TRANSCRIPT$1) {
  const transcript = message.transcript || "";
  if (config.debug) console.log("[SpeechOS] Transcript received:", transcript);
  events.emit("transcription:complete", { text: transcript });
@@ -570,7 +658,7 @@ var LiveKitManager = class {
  this.pendingTranscript.resolve(transcript);
  this.pendingTranscript = null;
  }
- } else if (message.type === MESSAGE_TYPE_EDITED_TEXT) {
+ } else if (message.type === MESSAGE_TYPE_EDITED_TEXT$1) {
  const editedText = message.text || "";
  if (config.debug) console.log("[SpeechOS] Edited text received:", editedText);
  events.emit("edit:complete", {
@@ -582,7 +670,15 @@ var LiveKitManager = class {
  this.pendingEditText = null;
  }
  this.editOriginalText = null;
- } else if (message.type === MESSAGE_TYPE_ERROR) {
+ } else if (message.type === MESSAGE_TYPE_COMMAND_RESULT$1) {
+ const commandResult = message.command || null;
+ if (config.debug) console.log("[SpeechOS] Command result received:", commandResult);
+ events.emit("command:complete", { command: commandResult });
+ if (this.pendingCommand) {
+ this.pendingCommand.resolve(commandResult);
+ this.pendingCommand = null;
+ }
+ } else if (message.type === MESSAGE_TYPE_ERROR$1) {
  const serverError = message;
  const errorCode = serverError.code || "server_error";
  const errorMessage = serverError.message || "A server error occurred";
@@ -602,6 +698,10 @@ var LiveKitManager = class {
  this.pendingEditText.reject(error);
  this.pendingEditText = null;
  }
+ if (this.pendingCommand) {
+ this.pendingCommand.reject(error);
+ this.pendingCommand = null;
+ }
  }
  } catch (error) {
  console.error("[SpeechOS] Failed to parse data message:", error);
@@ -609,16 +709,34 @@ var LiveKitManager = class {
  }
  /**
  * Publish microphone audio track
+ * Uses the device ID from session settings if set
  */
  async enableMicrophone() {
  if (!this.room || this.room.state !== "connected") throw new Error("Not connected to room");
  const config = getConfig();
  if (!this.micTrack) {
  if (config.debug) console.log("[SpeechOS] Creating microphone track...");
- this.micTrack = await (0, livekit_client.createLocalAudioTrack)({
+ const deviceId = this.sessionSettings.audioDeviceId;
+ const trackOptions = {
  echoCancellation: true,
  noiseSuppression: true
- });
+ };
+ if (deviceId) {
+ trackOptions.deviceId = { exact: deviceId };
+ if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
+ }
+ try {
+ this.micTrack = await (0, livekit_client.createLocalAudioTrack)(trackOptions);
+ } catch (error) {
+ if (deviceId && error instanceof Error) {
+ console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
+ this.micTrack = await (0, livekit_client.createLocalAudioTrack)({
+ echoCancellation: true,
+ noiseSuppression: true
+ });
+ } else throw error;
+ }
+ this.logMicrophoneInfo();
  }
  const existingPub = this.room.localParticipant.getTrackPublication(livekit_client.Track.Source.Microphone);
  if (!existingPub) {
@@ -628,6 +746,24 @@ var LiveKitManager = class {
  }
  }
  /**
+ * Log information about the current microphone track
+ */
+ logMicrophoneInfo() {
+ if (!this.micTrack) return;
+ const config = getConfig();
+ const mediaTrack = this.micTrack.mediaStreamTrack;
+ const settings = mediaTrack.getSettings();
+ console.log("[SpeechOS] Microphone active:", {
+ deviceId: settings.deviceId || "unknown",
+ label: mediaTrack.label || "Unknown device",
+ sampleRate: settings.sampleRate,
+ channelCount: settings.channelCount,
+ echoCancellation: settings.echoCancellation,
+ noiseSuppression: settings.noiseSuppression
+ });
+ if (config.debug) console.log("[SpeechOS] Full audio track settings:", settings);
+ }
+ /**
  * Disable microphone audio track
  */
  async disableMicrophone() {
@@ -659,30 +795,85 @@ var LiveKitManager = class {
  });
  }
  /**
- * Start a voice session
- * Connects to room, enables microphone, and waits for agent to subscribe to our track
+ * Start a voice session with pre-connect audio buffering
+ * Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
+ * Agent subscription happens in the background - we don't block on it.
+ *
+ * @param options - Session options including action type and parameters
  */
- async startVoiceSession() {
+ async startVoiceSession(options) {
  const config = getConfig();
  if (config.debug) console.log("[SpeechOS] Starting voice session...");
- if (this.preWarmPromise) {
- if (config.debug) console.log("[SpeechOS] Waiting for pre-warm to complete...");
- await this.preWarmPromise;
- }
- if (this.tokenData) {
- if (config.debug) console.log("[SpeechOS] Using cached token from init");
- } else {
- if (config.debug) console.log("[SpeechOS] Fetching fresh token for session...");
- await this.fetchToken();
- }
+ this.sessionSettings = options?.settings || {};
+ await this.fetchToken();
+ if (!this.tokenData) throw new Error("No token available for LiveKit connection");
  this.pendingTrackSubscribed = new Deferred();
  this.pendingTrackSubscribed.setTimeout(15e3, "Connection timed out - agent not available", "connection_timeout", "connection");
- await this.connect();
- await this.enableMicrophone();
- if (config.debug) console.log("[SpeechOS] Microphone published, waiting for LocalTrackSubscribed event...");
- await this.pendingTrackSubscribed.promise;
- this.pendingTrackSubscribed = null;
- if (config.debug) console.log("[SpeechOS] Voice session ready - agent subscribed to audio");
+ this.room = new livekit_client.Room({
+ adaptiveStream: true,
+ dynacast: true
+ });
+ this.setupRoomEvents();
+ if (config.debug) console.log("[SpeechOS] Connecting to LiveKit room:", this.tokenData.room, "at", this.tokenData.ws_url);
+ await this.room.connect(this.tokenData.ws_url, this.tokenData.token);
+ if (config.debug) console.log("[SpeechOS] Connected, enabling microphone with preConnectBuffer...");
+ await this.enableMicrophoneWithPreConnectBuffer();
+ if (options?.onMicReady) options.onMicReady();
+ state.setConnected(true);
+ if (config.debug) console.log("[SpeechOS] Voice session ready - microphone active");
+ this.waitForAgentSubscription();
+ }
+ /**
+ * Wait for the agent to subscribe to our audio track in the background
+ * Handles timeout errors without blocking the main flow
+ */
+ waitForAgentSubscription() {
+ const config = getConfig();
+ if (!this.pendingTrackSubscribed) return;
+ this.pendingTrackSubscribed.promise.then(() => {
+ if (config.debug) console.log("[SpeechOS] Agent subscribed to audio track - full duplex established");
+ this.pendingTrackSubscribed = null;
+ }).catch((error) => {
+ console.warn("[SpeechOS] Agent subscription timeout:", error.message);
+ this.pendingTrackSubscribed = null;
+ });
+ }
+ /**
+ * Enable microphone with pre-connect buffering
+ * This starts capturing audio locally before the room is connected,
+ * buffering it until the connection is established.
+ */
+ async enableMicrophoneWithPreConnectBuffer() {
+ if (!this.room) throw new Error("Room not initialized");
+ const config = getConfig();
+ const deviceId = this.sessionSettings.audioDeviceId;
+ const constraints = {
+ echoCancellation: true,
+ noiseSuppression: true
+ };
+ if (deviceId) {
+ constraints.deviceId = { exact: deviceId };
+ if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
+ }
+ try {
+ await this.room.localParticipant.setMicrophoneEnabled(true, constraints, { preConnectBuffer: true });
+ state.setMicEnabled(true);
+ const micPub = this.room.localParticipant.getTrackPublication(livekit_client.Track.Source.Microphone);
+ if (micPub?.track) {
+ this.micTrack = micPub.track;
+ this.logMicrophoneInfo();
+ }
+ if (config.debug) console.log("[SpeechOS] Microphone enabled with pre-connect buffer - audio is being captured");
+ } catch (error) {
+ if (deviceId && error instanceof Error) {
+ console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
+ await this.room.localParticipant.setMicrophoneEnabled(true, {
+ echoCancellation: true,
+ noiseSuppression: true
+ }, { preConnectBuffer: true });
+ state.setMicEnabled(true);
+ } else throw error;
+ }
  }
  /**
  * Stop the voice session and request the transcript
691
882
  */
692
883
  async stopVoiceSession() {
693
884
  const config = getConfig();
885
+ const settings = this.sessionSettings;
886
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
887
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
888
+ console.log("[SpeechOS] Dictate command:", {
889
+ inputLanguage,
890
+ outputLanguage
891
+ });
694
892
  if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
695
893
  await this.disableMicrophone();
696
894
  if (config.debug) console.log("[SpeechOS] Requesting transcript from agent...");
697
895
  this.pendingTranscript = new Deferred();
698
896
  this.pendingTranscript.setTimeout(1e4, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
699
- await this.sendDataMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT });
897
+ await this.sendDataMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 });
700
898
  const result = await this.pendingTranscript.promise;
701
899
  this.pendingTranscript = null;
702
900
  return result;
@@ -715,6 +913,14 @@ var LiveKitManager = class {
715
913
  */
716
914
  async requestEditText(originalText) {
717
915
  const config = getConfig();
916
+ const settings = this.sessionSettings;
917
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
918
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
919
+ console.log("[SpeechOS] Edit command:", {
920
+ inputLanguage,
921
+ outputLanguage,
922
+ originalTextLength: originalText.length
923
+ });
718
924
  if (config.debug) console.log("[SpeechOS] Requesting text edit...");
719
925
  this.editOriginalText = originalText;
720
926
  await this.disableMicrophone();
@@ -722,7 +928,7 @@ var LiveKitManager = class {
722
928
  this.pendingEditText = new Deferred();
723
929
  this.pendingEditText.setTimeout(15e3, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
724
930
  await this.sendDataMessage({
725
- type: MESSAGE_TYPE_EDIT_TEXT,
931
+ type: MESSAGE_TYPE_EDIT_TEXT$1,
726
932
  text: originalText
727
933
  });
728
934
  const result = await this.pendingEditText.promise;
@@ -736,6 +942,39 @@ var LiveKitManager = class {
736
942
  return this.requestEditText(originalText);
737
943
  }
738
944
  /**
945
+ * Request command matching using the transcript as input
946
+ * Sends command definitions to the backend, which matches the user's speech against them
947
+ * Returns a promise that resolves with the matched command or null if no match
948
+ * @throws Error if timeout occurs waiting for command result
949
+ */
950
+ async requestCommand(commands) {
951
+ const config = getConfig();
952
+ const settings = this.sessionSettings;
953
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
954
+ console.log("[SpeechOS] Command request:", {
955
+ inputLanguage,
956
+ commandCount: commands.length
957
+ });
958
+ if (config.debug) console.log("[SpeechOS] Requesting command match...");
959
+ await this.disableMicrophone();
960
+ if (config.debug) console.log("[SpeechOS] Sending execute_command request to agent...");
961
+ this.pendingCommand = new Deferred();
962
+ this.pendingCommand.setTimeout(15e3, "Command request timed out. Please try again.", "command_timeout", "timeout");
963
+ await this.sendDataMessage({
964
+ type: MESSAGE_TYPE_EXECUTE_COMMAND$1,
965
+ commands
966
+ });
967
+ const result = await this.pendingCommand.promise;
968
+ this.pendingCommand = null;
969
+ return result;
970
+ }
971
+ /**
972
+ * Alias for requestCommand - granular API naming
973
+ */
974
+ async stopAndCommand(commands) {
975
+ return this.requestCommand(commands);
976
+ }
977
+ /**
739
978
  * Disconnect from the current room
740
979
  * Clears the token so a fresh one is fetched for the next session
741
980
  */
@@ -758,16 +997,110 @@ var LiveKitManager = class {
  this.pendingEditText.reject(new Error("Disconnected"));
  this.pendingEditText = null;
  }
+ if (this.pendingCommand) {
+ this.pendingCommand.reject(new Error("Disconnected"));
+ this.pendingCommand = null;
+ }
  if (this.pendingTrackSubscribed) {
  this.pendingTrackSubscribed.reject(new Error("Disconnected"));
  this.pendingTrackSubscribed = null;
  }
  this.tokenData = null;
- this.preWarmPromise = null;
  this.editOriginalText = null;
+ this.sessionSettings = {};
  if (config.debug) console.log("[SpeechOS] Session state cleared");
  }
  /**
+ * Invalidate the cached token
+ * Call this when settings change that would affect the token (language, vocabulary)
+ */
+ invalidateTokenCache() {
+ const config = getConfig();
+ if (config.debug) console.log("[SpeechOS] Token cache invalidated");
+ this.cachedTokenData = null;
+ this.tokenCacheTimestamp = null;
+ }
+ /**
+ * Start auto-refreshing the token while the widget is expanded.
+ * Call this after a voice session completes to immediately fetch a fresh token
+ * (since each command requires its own token) and keep it fresh for subsequent commands.
+ */
+ startAutoRefresh() {
+ const config = getConfig();
+ this.autoRefreshEnabled = true;
+ if (config.debug) console.log("[SpeechOS] Token auto-refresh enabled");
+ this.invalidateTokenCache();
+ this.prefetchToken().then(() => {
+ this.scheduleTokenRefresh();
+ }).catch((error) => {
+ if (config.debug) console.warn("[SpeechOS] Failed to prefetch token after command:", error);
+ if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
+ this.performAutoRefresh();
+ }, 5 * 1e3);
+ });
+ }
+ /**
+ * Stop auto-refreshing the token.
+ * Call this when the widget collapses or user navigates away.
+ */
+ stopAutoRefresh() {
+ const config = getConfig();
+ this.autoRefreshEnabled = false;
+ if (this.tokenRefreshTimer) {
+ clearTimeout(this.tokenRefreshTimer);
+ this.tokenRefreshTimer = null;
+ }
+ if (config.debug) console.log("[SpeechOS] Token auto-refresh disabled");
+ }
+ /**
+ * Schedule a token refresh before the current cache expires.
+ * Handles computer sleep by checking elapsed time on each refresh attempt.
+ */
+ scheduleTokenRefresh() {
+ if (!this.autoRefreshEnabled) return;
+ if (this.tokenRefreshTimer) {
+ clearTimeout(this.tokenRefreshTimer);
+ this.tokenRefreshTimer = null;
+ }
+ const config = getConfig();
+ const refreshBuffer = 30 * 1e3;
+ let timeUntilRefresh;
+ if (this.tokenCacheTimestamp) {
+ const age = Date.now() - this.tokenCacheTimestamp;
+ const timeRemaining = TOKEN_CACHE_TTL_MS - age;
+ timeUntilRefresh = Math.max(0, timeRemaining - refreshBuffer);
+ } else timeUntilRefresh = 0;
+ if (config.debug) console.log(`[SpeechOS] Scheduling token refresh in ${Math.round(timeUntilRefresh / 1e3)}s`);
+ this.tokenRefreshTimer = setTimeout(() => {
+ this.performAutoRefresh();
+ }, timeUntilRefresh);
+ }
+ /**
+ * Perform the auto-refresh, handling computer sleep scenarios.
+ */
+ async performAutoRefresh() {
+ if (!this.autoRefreshEnabled) return;
+ const config = getConfig();
+ if (this.isCachedTokenValid()) {
+ if (config.debug) console.log("[SpeechOS] Token still valid on refresh check, rescheduling");
+ this.scheduleTokenRefresh();
+ return;
+ }
+ if (config.debug) console.log("[SpeechOS] Auto-refreshing token...");
+ try {
+ const data = await this.fetchTokenFromServer();
+ this.cachedTokenData = data;
+ this.tokenCacheTimestamp = Date.now();
+ if (config.debug) console.log("[SpeechOS] Token auto-refreshed successfully");
+ this.scheduleTokenRefresh();
+ } catch (error) {
+ console.warn("[SpeechOS] Token auto-refresh failed:", error);
+ if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
+ this.performAutoRefresh();
+ }, 30 * 1e3);
+ }
+ }
+ /**
  * Get the current room instance
  */
  getRoom() {
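The refresh schedule is derived from the cache timestamp: with a 4-minute TTL and a 30-second buffer, a token fetched at t=0 is refreshed at t=3:30, and a timer that fires late (for example after the machine sleeps) either reschedules if the token is still valid or fetches immediately. The arithmetic, spelled out with the constants from this diff:

```ts
// Worked example of scheduleTokenRefresh's timing.
const TOKEN_CACHE_TTL_MS = 4 * 60 * 1000; // 240,000 ms
const refreshBuffer = 30 * 1000;          // 30,000 ms

const age = Date.now() - tokenCacheTimestamp;        // e.g. 60,000 ms
const timeRemaining = TOKEN_CACHE_TTL_MS - age;      // 180,000 ms
const timeUntilRefresh = Math.max(0, timeRemaining - refreshBuffer); // 150,000 ms
// An age of 210,000 ms or more (say, after sleep) clamps to 0,
// so the refresh fires immediately on wake.
```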
@@ -791,88 +1124,749 @@ var LiveKitManager = class {
791
1124
  isMicrophoneEnabled() {
792
1125
  return this.micTrack !== null;
793
1126
  }
794
- /**
795
- * Clear the cached token
796
- * Used when user identity changes to ensure next session gets a fresh token
797
- */
798
- clearToken() {
799
- const config = getConfig();
800
- if (config.debug) console.log("[SpeechOS] Clearing cached token");
801
- this.tokenData = null;
802
- this.preWarmPromise = null;
803
- }
804
1127
  };
805
1128
  const livekit = new LiveKitManager();
1129
+ events.on("settings:changed", () => {
1130
+ livekit.invalidateTokenCache();
1131
+ });
806
1132
 
807
1133
  //#endregion
808
- //#region src/transcript-store.ts
809
- const STORAGE_KEY = "speechos_transcripts";
810
- const MAX_ENTRIES = 50;
1134
+ //#region src/audio-capture.ts
811
1135
  /**
812
- * Generate a unique ID for transcript entries
1136
+ * Detect if running in Safari.
813
1137
  */
814
- function generateId() {
815
- return `${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
1138
+ function isSafari() {
1139
+ const ua = navigator.userAgent.toLowerCase();
1140
+ const vendor = navigator.vendor?.toLowerCase() || "";
1141
+ const hasSafariUA = ua.includes("safari") && !ua.includes("chrome") && !ua.includes("chromium");
1142
+ const isAppleVendor = vendor.includes("apple");
1143
+ return hasSafariUA && isAppleVendor;
816
1144
  }
817
1145
  /**
818
- * Get all transcripts from localStorage
1146
+ * Detect the best supported audio format for the current browser.
1147
+ *
1148
+ * IMPORTANT: Safari must use MP4/AAC. Its WebM/Opus implementation is buggy
1149
+ * and produces truncated/incomplete audio.
819
1150
  */
820
- function getTranscripts() {
821
- try {
822
- const stored = localStorage.getItem(STORAGE_KEY);
823
- if (!stored) return [];
824
- const entries = JSON.parse(stored);
825
- return entries.sort((a, b) => b.timestamp - a.timestamp);
826
- } catch {
827
- return [];
1151
+ function getSupportedAudioFormat() {
1152
+ if (isSafari()) {
1153
+ if (MediaRecorder.isTypeSupported("audio/mp4")) return {
1154
+ mimeType: "audio/mp4",
1155
+ format: "mp4",
1156
+ needsEncodingParams: false
1157
+ };
1158
+ return {
1159
+ mimeType: "",
1160
+ format: "mp4",
1161
+ needsEncodingParams: true
1162
+ };
828
1163
  }
1164
+ if (MediaRecorder.isTypeSupported("audio/webm;codecs=opus")) return {
1165
+ mimeType: "audio/webm;codecs=opus",
1166
+ format: "webm",
1167
+ needsEncodingParams: false
1168
+ };
1169
+ if (MediaRecorder.isTypeSupported("audio/webm")) return {
1170
+ mimeType: "audio/webm",
1171
+ format: "webm",
1172
+ needsEncodingParams: false
1173
+ };
1174
+ if (MediaRecorder.isTypeSupported("audio/mp4")) return {
1175
+ mimeType: "audio/mp4",
1176
+ format: "mp4",
1177
+ needsEncodingParams: false
1178
+ };
1179
+ return {
1180
+ mimeType: "",
1181
+ format: "webm",
1182
+ needsEncodingParams: true
1183
+ };
829
1184
  }
830
1185
  /**
831
- * Save a new transcript entry
1186
+ * Audio capture manager with buffering support.
1187
+ *
1188
+ * Usage:
1189
+ * 1. Create instance with onChunk callback
1190
+ * 2. Call start() - immediately begins capturing
1191
+ * 3. Call setReady() when connection is established - flushes buffer
1192
+ * 4. Call stop() when done
832
1193
  */
833
- function saveTranscript(text, action, originalText) {
834
- const entry = {
835
- id: generateId(),
836
- text,
837
- timestamp: Date.now(),
838
- action,
839
- ...originalText && { originalText }
840
- };
841
- const entries = getTranscripts();
842
- entries.unshift(entry);
843
- const pruned = entries.slice(0, MAX_ENTRIES);
844
- try {
845
- localStorage.setItem(STORAGE_KEY, JSON.stringify(pruned));
846
- } catch {}
847
- return entry;
848
- }
1194
+ var AudioCapture = class AudioCapture {
1195
+ mediaStream = null;
1196
+ recorder = null;
1197
+ buffer = [];
1198
+ isReady = false;
1199
+ isRecording = false;
1200
+ onChunk;
1201
+ audioFormat;
1202
+ deviceId;
1203
+ /**
1204
+ * Time slice for MediaRecorder in milliseconds.
1205
+ *
1206
+ * Safari requires a larger timeslice (1000ms) to properly flush its internal
1207
+ * audio buffers. Smaller values cause Safari to drop or truncate audio data.
1208
+ * See: https://community.openai.com/t/whisper-problem-with-audio-mp4-blobs-from-safari/
1209
+ *
1210
+ * Other browsers (Chrome, Firefox, Edge) work well with smaller timeslices
1211
+ * which provide lower latency for real-time transcription.
1212
+ */
1213
+ static TIME_SLICE_MS = 100;
1214
+ static SAFARI_TIME_SLICE_MS = 1e3;
1215
+ /**
1216
+ * @param onChunk - Callback for receiving audio chunks
1217
+ * @param deviceId - Optional audio device ID (empty string or undefined for system default)
1218
+ */
1219
+ constructor(onChunk, deviceId) {
1220
+ this.onChunk = onChunk;
1221
+ this.audioFormat = getSupportedAudioFormat();
1222
+ this.deviceId = deviceId;
1223
+ }
1224
+ /**
1225
+ * Get the appropriate timeslice for the current browser.
1226
+ * Safari needs a larger timeslice to avoid dropping audio data.
1227
+ */
1228
+ getTimeSlice() {
1229
+ return isSafari() ? AudioCapture.SAFARI_TIME_SLICE_MS : AudioCapture.TIME_SLICE_MS;
1230
+ }
1231
+ /**
1232
+ * Get the timeslice being used (in milliseconds).
1233
+ * Useful for callers that need to wait for audio processing.
1234
+ */
1235
+ getTimeSliceMs() {
1236
+ return this.getTimeSlice();
1237
+ }
1238
+ /**
1239
+ * Get the audio format being used.
1240
+ */
1241
+ getFormat() {
1242
+ return this.audioFormat;
1243
+ }
1244
+ /**
1245
+ * Start capturing audio immediately.
1246
+ *
1247
+ * Audio chunks will be buffered until setReady() is called.
1248
+ */
1249
+ async start() {
1250
+ const config = getConfig();
1251
+ if (this.isRecording) {
1252
+ if (config.debug) console.log("[SpeechOS] AudioCapture already recording");
1253
+ return;
1254
+ }
1255
+ this.buffer = [];
1256
+ this.isReady = false;
1257
+ const constraints = { audio: {
1258
+ echoCancellation: true,
1259
+ noiseSuppression: true,
1260
+ ...this.deviceId ? { deviceId: { exact: this.deviceId } } : {}
1261
+ } };
1262
+ if (config.debug) {
1263
+ console.log("[SpeechOS] AudioCapture starting with format:", this.audioFormat.mimeType);
1264
+ console.log("[SpeechOS] Detected Safari:", isSafari());
1265
+ if (this.deviceId) console.log("[SpeechOS] Using audio device:", this.deviceId);
1266
+ }
1267
+ try {
1268
+ this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
1269
+ const recorderOptions = {};
1270
+ if (this.audioFormat.mimeType) recorderOptions.mimeType = this.audioFormat.mimeType;
1271
+ this.recorder = new MediaRecorder(this.mediaStream, recorderOptions);
1272
+ this.recorder.ondataavailable = (event) => {
1273
+ if (event.data && event.data.size > 0) this.handleChunk(event.data);
1274
+ };
1275
+ this.recorder.onerror = (event) => {
1276
+ console.error("[SpeechOS] MediaRecorder error:", event);
1277
+ };
1278
+ const timeSlice = this.getTimeSlice();
1279
+ this.recorder.start(timeSlice);
1280
+ this.isRecording = true;
1281
+ if (config.debug) console.log(`[SpeechOS] AudioCapture started with ${timeSlice}ms timeslice, buffering until ready`);
1282
+ } catch (error) {
1283
+ if (this.deviceId && error instanceof Error) {
1284
+ console.warn("[SpeechOS] Selected device unavailable, trying default:", error.message);
1285
+ this.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: {
1286
+ echoCancellation: true,
1287
+ noiseSuppression: true
1288
+ } });
1289
+ const recorderOptions = {};
1290
+ if (this.audioFormat.mimeType) recorderOptions.mimeType = this.audioFormat.mimeType;
1291
+ this.recorder = new MediaRecorder(this.mediaStream, recorderOptions);
1292
+ this.recorder.ondataavailable = (event) => {
1293
+ if (event.data && event.data.size > 0) this.handleChunk(event.data);
1294
+ };
1295
+ this.recorder.start(this.getTimeSlice());
1296
+ this.isRecording = true;
1297
+ } else throw error;
1298
+ }
1299
+ }
1300
+ /**
1301
+ * Handle an audio chunk with atomic buffer swap pattern.
1302
+ *
1303
+ * If not ready: buffer the chunk.
1304
+ * If ready: send directly via callback.
1305
+ */
1306
+ handleChunk(chunk) {
1307
+ if (this.isReady) this.onChunk(chunk);
1308
+ else this.buffer.push(chunk);
1309
+ }
1310
+ /**
1311
+ * Mark the capture as ready (connection established).
1312
+ *
1313
+ * This flushes any buffered chunks and switches to direct mode.
1314
+ * Uses atomic swap to prevent chunk reordering.
1315
+ */
1316
+ setReady() {
1317
+ const config = getConfig();
1318
+ if (this.isReady) return;
1319
+ const toFlush = this.buffer;
1320
+ this.buffer = [];
1321
+ for (const chunk of toFlush) this.onChunk(chunk);
1322
+ this.isReady = true;
1323
+ if (config.debug) console.log(`[SpeechOS] AudioCapture ready, flushed ${toFlush.length} buffered chunks`);
1324
+ }
1325
+ /**
1326
+ * Stop capturing audio and wait for final chunk.
1327
+ *
1328
+ * Uses requestData() before stop() to force the MediaRecorder to flush
1329
+ * any buffered audio immediately. This is critical for Safari which
1330
+ * may hold audio data in internal buffers.
1331
+ *
1332
+ * Safari requires an additional delay after stopping to ensure all audio
1333
+ * from its internal encoding pipeline has been fully processed and emitted.
1334
+ */
1335
+ async stop() {
1336
+ const config = getConfig();
1337
+ const safari = isSafari();
1338
+ if (this.recorder && this.recorder.state !== "inactive") {
1339
+ if (this.recorder.state === "recording") try {
1340
+ const dataPromise = new Promise((resolve) => {
1341
+ const handler = (event) => {
1342
+ this.recorder?.removeEventListener("dataavailable", handler);
1343
+ if (config.debug) console.log(`[SpeechOS] requestData flush received: ${event.data.size} bytes`);
1344
+ resolve();
1345
+ };
1346
+ this.recorder?.addEventListener("dataavailable", handler);
1347
+ });
1348
+ this.recorder.requestData();
1349
+ if (config.debug) console.log("[SpeechOS] Requested data flush before stop");
1350
+ await dataPromise;
1351
+ } catch (e) {
1352
+ if (config.debug) console.log("[SpeechOS] requestData() not supported or failed:", e);
1353
+ }
1354
+ const stopPromise = new Promise((resolve) => {
1355
+ if (!this.recorder) {
1356
+ resolve();
1357
+ return;
1358
+ }
1359
+ this.recorder.onstop = () => {
1360
+ if (config.debug) console.log("[SpeechOS] MediaRecorder onstop fired");
1361
+ resolve();
1362
+ };
1363
+ });
1364
+ this.recorder.stop();
1365
+ await stopPromise;
1366
+ if (safari) {
1367
+ if (config.debug) console.log("[SpeechOS] Safari: waiting 2s for encoding pipeline to flush");
1368
+ await new Promise((resolve) => setTimeout(resolve, 2e3));
1369
+ }
1370
+ }
1371
+ if (this.mediaStream) {
1372
+ for (const track of this.mediaStream.getTracks()) track.stop();
1373
+ this.mediaStream = null;
1374
+ }
1375
+ this.recorder = null;
1376
+ this.isRecording = false;
1377
+ this.isReady = false;
1378
+ this.buffer = [];
1379
+ if (config.debug) console.log("[SpeechOS] AudioCapture stopped");
1380
+ }
1381
+ /**
1382
+ * Check if currently recording.
1383
+ */
1384
+ get recording() {
1385
+ return this.isRecording;
1386
+ }
1387
+ /**
1388
+ * Check if ready (connection established, direct mode active).
1389
+ */
1390
+ get ready() {
1391
+ return this.isReady;
1392
+ }
1393
+ /**
1394
+ * Get the number of buffered chunks waiting to be sent.
1395
+ */
1396
+ get bufferedChunks() {
1397
+ return this.buffer.length;
1398
+ }
1399
+ };
849
1400
  /**
850
- * Clear all transcript history
1401
+ * Factory function to create an AudioCapture instance.
1402
+ * @param onChunk - Callback for receiving audio chunks
1403
+ * @param deviceId - Optional audio device ID (empty string or undefined for system default)
851
1404
  */
852
- function clearTranscripts() {
853
- try {
854
- localStorage.removeItem(STORAGE_KEY);
855
- } catch {}
1405
+ function createAudioCapture(onChunk, deviceId) {
1406
+ return new AudioCapture(onChunk, deviceId);
856
1407
  }
1408
+
1409
+ //#endregion
1410
+ //#region src/websocket.ts
1411
+ const MESSAGE_TYPE_AUTH = "auth";
1412
+ const MESSAGE_TYPE_READY = "ready";
1413
+ const MESSAGE_TYPE_TRANSCRIPTION = "transcription";
1414
+ const MESSAGE_TYPE_REQUEST_TRANSCRIPT = "request_transcript";
1415
+ const MESSAGE_TYPE_TRANSCRIPT = "transcript";
1416
+ const MESSAGE_TYPE_EDIT_TEXT = "edit_text";
1417
+ const MESSAGE_TYPE_EDITED_TEXT = "edited_text";
1418
+ const MESSAGE_TYPE_EXECUTE_COMMAND = "execute_command";
1419
+ const MESSAGE_TYPE_COMMAND_RESULT = "command_result";
1420
+ const MESSAGE_TYPE_ERROR = "error";
857
1421
  /**
858
- * Delete a single transcript by ID
1422
+ * Response timeout in milliseconds.
859
1423
  */
860
- function deleteTranscript(id) {
861
- const entries = getTranscripts().filter((e) => e.id !== id);
862
- try {
863
- localStorage.setItem(STORAGE_KEY, JSON.stringify(entries));
864
- } catch {}
865
- }
866
- const transcriptStore = {
867
- getTranscripts,
868
- saveTranscript,
869
- clearTranscripts,
870
- deleteTranscript
1424
+ const RESPONSE_TIMEOUT_MS = 15e3;
1425
+ /**
1426
+ * A deferred promise with timeout support.
1427
+ */
1428
+ var Deferred$1 = class {
1429
+ promise;
1430
+ _resolve;
1431
+ _reject;
1432
+ _timeoutId = null;
1433
+ _settled = false;
1434
+ constructor() {
1435
+ this.promise = new Promise((resolve, reject) => {
1436
+ this._resolve = resolve;
1437
+ this._reject = reject;
1438
+ });
1439
+ }
1440
+ setTimeout(ms, errorMessage, errorCode, errorSource) {
1441
+ this._timeoutId = setTimeout(() => {
1442
+ if (!this._settled) {
1443
+ console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
1444
+ events.emit("error", {
1445
+ code: errorCode,
1446
+ message: errorMessage,
1447
+ source: errorSource
1448
+ });
1449
+ this.reject(new Error(errorMessage));
1450
+ }
1451
+ }, ms);
1452
+ }
1453
+ resolve(value) {
1454
+ if (!this._settled) {
1455
+ this._settled = true;
1456
+ this.clearTimeout();
1457
+ this._resolve(value);
1458
+ }
1459
+ }
1460
+ reject(error) {
1461
+ if (!this._settled) {
1462
+ this._settled = true;
1463
+ this.clearTimeout();
1464
+ this._reject(error);
1465
+ }
1466
+ }
1467
+ clearTimeout() {
1468
+ if (this._timeoutId !== null) {
1469
+ clearTimeout(this._timeoutId);
1470
+ this._timeoutId = null;
1471
+ }
1472
+ }
1473
+ get isSettled() {
1474
+ return this._settled;
1475
+ }
871
1476
  };
1477
+ /**
1478
+ * Maximum time to wait for WebSocket buffer to drain.
1479
+ */
1480
+ const BUFFER_DRAIN_TIMEOUT_MS = 5e3;
1481
+ /**
1482
+ * Polling interval for checking WebSocket buffer.
1483
+ */
1484
+ const BUFFER_CHECK_INTERVAL_MS = 50;
1485
+ /**
1486
+ * WebSocket connection manager for voice sessions.
1487
+ */
1488
+ var WebSocketManager = class {
1489
+ ws = null;
1490
+ audioCapture = null;
1491
+ sessionId = null;
1492
+ pendingAuth = null;
1493
+ pendingTranscript = null;
1494
+ pendingEditText = null;
1495
+ pendingCommand = null;
1496
+ pendingAudioSends = /* @__PURE__ */ new Set();
1497
+ editOriginalText = null;
1498
+ lastInputText = void 0;
1499
+ sessionAction = "dictate";
1500
+ sessionInputText = "";
1501
+ sessionCommands = [];
1502
+ sessionSettings = {};
1503
+ /**
1504
+ * Get the WebSocket URL for voice sessions.
1505
+ */
1506
+ getWebSocketUrl() {
1507
+ const config = getConfig();
1508
+ const host = config.host || "https://app.speechos.ai";
1509
+ const wsUrl = host.replace(/^http/, "ws");
1510
+ return `${wsUrl}/ws/voice/`;
1511
+ }
1512
+ /**
1513
+ * Start a voice session with the WebSocket backend.
1514
+ *
1515
+ * This method:
1516
+ * 1. Starts audio capture immediately (buffering)
1517
+ * 2. Opens WebSocket connection
1518
+ * 3. Authenticates with API key and action parameters
1519
+ * 4. Flushes buffered audio and continues streaming
1520
+ *
1521
+ * @param options - Session options including action type and parameters
1522
+ */
1523
+ async startVoiceSession(options) {
1524
+ const config = getConfig();
1525
+ this.sessionAction = options?.action || "dictate";
1526
+ this.sessionInputText = options?.inputText || "";
1527
+ this.sessionCommands = options?.commands || [];
1528
+ this.sessionSettings = options?.settings || {};
1529
+ if (this.sessionAction === "edit") this.editOriginalText = this.sessionInputText;
1530
+ if (config.debug) console.log("[SpeechOS] Starting WebSocket voice session...");
1531
+ this.audioCapture = createAudioCapture((chunk) => {
1532
+ this.sendAudioChunk(chunk);
1533
+ }, this.sessionSettings.audioDeviceId);
1534
+ await this.audioCapture.start();
1535
+ if (options?.onMicReady) options.onMicReady();
1536
+ state.setMicEnabled(true);
1537
+ const wsUrl = this.getWebSocketUrl();
1538
+ if (config.debug) console.log("[SpeechOS] Connecting to WebSocket:", wsUrl);
1539
+ this.ws = new WebSocket(wsUrl);
1540
+ this.ws.onopen = () => {
1541
+ if (config.debug) console.log("[SpeechOS] WebSocket connected, authenticating...");
1542
+ this.authenticate();
1543
+ };
1544
+ this.ws.onmessage = (event) => {
1545
+ this.handleMessage(event.data);
1546
+ };
1547
+ this.ws.onerror = (event) => {
1548
+ console.error("[SpeechOS] WebSocket error:", event);
1549
+ events.emit("error", {
1550
+ code: "websocket_error",
1551
+ message: "WebSocket connection error",
1552
+ source: "connection"
1553
+ });
1554
+ };
1555
+ this.ws.onclose = (event) => {
1556
+ if (config.debug) console.log("[SpeechOS] WebSocket closed:", event.code, event.reason);
1557
+ state.setConnected(false);
1558
+ };
1559
+ this.pendingAuth = new Deferred$1();
1560
+ this.pendingAuth.setTimeout(RESPONSE_TIMEOUT_MS, "Connection timed out", "connection_timeout", "connection");
1561
+ await this.pendingAuth.promise;
1562
+ this.pendingAuth = null;
1563
+ if (this.audioCapture) this.audioCapture.setReady();
1564
+ state.setConnected(true);
1565
+ if (config.debug) console.log("[SpeechOS] WebSocket voice session ready");
1566
+ }
1567
+ /**
1568
+ * Send authentication message with action parameters.
1569
+ * All session parameters are now sent upfront in the auth message.
1570
+ */
1571
+ authenticate() {
1572
+ const config = getConfig();
1573
+ const audioFormat = getSupportedAudioFormat();
1574
+ const settings = this.sessionSettings;
1575
+ const anonymousId = getAnonymousId();
1576
+ const authMessage = {
1577
+ type: MESSAGE_TYPE_AUTH,
1578
+ api_key: config.apiKey,
1579
+ user_id: config.userId || null,
1580
+ anonymous_id: anonymousId,
1581
+ input_language: settings.inputLanguageCode ?? "en-US",
1582
+ output_language: settings.outputLanguageCode ?? "en-US",
1583
+ smart_format: settings.smartFormat ?? true,
1584
+ custom_vocabulary: settings.vocabulary ?? [],
1585
+ custom_snippets: settings.snippets ?? [],
1586
+ audio_format: audioFormat.format,
1587
+ action: this.sessionAction,
1588
+ input_text: this.sessionInputText,
1589
+ commands: this.sessionCommands
1590
+ };
1591
+ if (config.debug) console.log("[SpeechOS] Sending auth message with action:", this.sessionAction);
1592
+ this.ws?.send(JSON.stringify(authMessage));
1593
+ }
1594
+ /**
1595
+ * Send an audio chunk over the WebSocket.
1596
+ * Tracks the promise so we can wait for all sends to complete.
1597
+ */
1598
+ sendAudioChunk(chunk) {
1599
+ const sendPromise = this.doSendAudioChunk(chunk);
1600
+ this.pendingAudioSends.add(sendPromise);
1601
+ sendPromise.finally(() => {
1602
+ this.pendingAudioSends.delete(sendPromise);
1603
+ });
1604
+ }
1605
+ /**
1606
+ * Actually send the audio chunk (async operation).
1607
+ */
1608
+ async doSendAudioChunk(chunk) {
1609
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
1610
+ const arrayBuffer = await chunk.arrayBuffer();
1611
+ this.ws.send(arrayBuffer);
1612
+ }
1613
+ }
1614
+ /**
1615
+ * Handle incoming WebSocket messages.
1616
+ */
1617
+ handleMessage(data) {
1618
+ const config = getConfig();
1619
+ try {
1620
+ const message = JSON.parse(data);
1621
+ if (config.debug) console.log("[SpeechOS] WebSocket message:", message);
1622
+ switch (message.type) {
1623
+ case MESSAGE_TYPE_READY:
1624
+ this.handleReady(message);
1625
+ break;
1626
+ case MESSAGE_TYPE_TRANSCRIPTION:
1627
+ this.handleIntermediateTranscription(message);
1628
+ break;
1629
+ case MESSAGE_TYPE_TRANSCRIPT:
1630
+ this.handleFinalTranscript(message);
1631
+ break;
1632
+ case MESSAGE_TYPE_EDITED_TEXT:
1633
+ this.handleEditedText(message);
1634
+ break;
1635
+ case MESSAGE_TYPE_COMMAND_RESULT:
1636
+ this.handleCommandResult(message);
1637
+ break;
1638
+ case MESSAGE_TYPE_ERROR:
1639
+ this.handleError(message);
1640
+ break;
1641
+ default: if (config.debug) console.log("[SpeechOS] Unknown message type:", message.type);
1642
+ }
1643
+ } catch (error) {
1644
+ console.error("[SpeechOS] Failed to parse message:", error);
1645
+ }
1646
+ }
1647
+ handleReady(message) {
1648
+ const config = getConfig();
1649
+ this.sessionId = message.session_id;
1650
+ if (config.debug) console.log("[SpeechOS] Session ready:", this.sessionId);
1651
+ if (this.pendingAuth) this.pendingAuth.resolve();
1652
+ }
1653
+ handleIntermediateTranscription(message) {
1654
+ const config = getConfig();
1655
+ if (config.debug) console.log("[SpeechOS] Intermediate transcription:", message.transcript, "final:", message.is_final);
1656
+ }
1657
+ handleFinalTranscript(message) {
1658
+ const transcript = message.transcript || "";
1659
+ events.emit("transcription:complete", { text: transcript });
1660
+ if (this.pendingTranscript) {
1661
+ this.pendingTranscript.resolve(transcript);
1662
+ this.pendingTranscript = null;
1663
+ }
1664
+ }
1665
+ handleEditedText(message) {
1666
+ const editedText = message.text || "";
1667
+ events.emit("edit:complete", {
1668
+ text: editedText,
1669
+ originalText: this.editOriginalText || ""
1670
+ });
1671
+ if (this.pendingEditText) {
1672
+ this.pendingEditText.resolve(editedText);
1673
+ this.pendingEditText = null;
1674
+ }
1675
+ this.editOriginalText = null;
1676
+ }
1677
+ handleCommandResult(message) {
1678
+ const commandResult = message.command || null;
1679
+ this.lastInputText = message.transcript;
1680
+ events.emit("command:complete", { command: commandResult });
1681
+ if (this.pendingCommand) {
1682
+ this.pendingCommand.resolve(commandResult);
1683
+ this.pendingCommand = null;
1684
+ }
1685
+ }
1686
+ handleError(message) {
1687
+ const errorCode = message.code || "server_error";
1688
+ const errorMessage = message.message || "A server error occurred";
1689
+ console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
1690
+ events.emit("error", {
1691
+ code: errorCode,
1692
+ message: errorMessage,
1693
+ source: "server"
1694
+ });
1695
+ const error = new Error(errorMessage);
1696
+ if (this.pendingAuth) {
1697
+ this.pendingAuth.reject(error);
1698
+ this.pendingAuth = null;
1699
+ }
1700
+ if (this.pendingTranscript) {
1701
+ this.pendingTranscript.reject(error);
1702
+ this.pendingTranscript = null;
1703
+ }
1704
+ if (this.pendingEditText) {
1705
+ this.pendingEditText.reject(error);
1706
+ this.pendingEditText = null;
1707
+ }
1708
+ if (this.pendingCommand) {
1709
+ this.pendingCommand.reject(error);
1710
+ this.pendingCommand = null;
1711
+ }
1712
+ }
1713
+ /**
1714
+ * Stop the voice session and request the transcript.
1715
+ */
1716
+ async stopVoiceSession() {
1717
+ const config = getConfig();
1718
+ if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
1719
+ await this.stopAudioCapture();
1720
+ this.pendingTranscript = new Deferred$1();
1721
+ this.pendingTranscript.setTimeout(RESPONSE_TIMEOUT_MS, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
1722
+ this.sendMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT });
1723
+ const result = await this.pendingTranscript.promise;
1724
+ this.pendingTranscript = null;
1725
+ return result;
1726
+ }
1727
+ /**
1728
+ * Request text editing using the transcript as instructions.
1729
+ * Note: The input text was already sent in the auth message via startVoiceSession.
1730
+ */
1731
+ async requestEditText(_originalText) {
1732
+ const config = getConfig();
1733
+ if (config.debug) console.log("[SpeechOS] Requesting text edit...");
1734
+ await this.stopAudioCapture();
1735
+ this.pendingEditText = new Deferred$1();
1736
+ this.pendingEditText.setTimeout(RESPONSE_TIMEOUT_MS, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
1737
+ this.sendMessage({ type: MESSAGE_TYPE_EDIT_TEXT });
1738
+ const result = await this.pendingEditText.promise;
1739
+ this.pendingEditText = null;
1740
+ return result;
1741
+ }
1742
+ /**
1743
+ * Request command matching using the transcript as input.
1744
+ * Note: The command definitions were already sent in the auth message via startVoiceSession.
1745
+ */
1746
+ async requestCommand(_commands) {
1747
+ const config = getConfig();
1748
+ if (config.debug) console.log("[SpeechOS] Requesting command match...");
1749
+ await this.stopAudioCapture();
1750
+ this.pendingCommand = new Deferred$1();
1751
+ this.pendingCommand.setTimeout(RESPONSE_TIMEOUT_MS, "Command request timed out. Please try again.", "command_timeout", "timeout");
1752
+ this.sendMessage({ type: MESSAGE_TYPE_EXECUTE_COMMAND });
1753
+ const result = await this.pendingCommand.promise;
1754
+ this.pendingCommand = null;
1755
+ return result;
1756
+ }
1757
+ /**
1758
+ * Stop audio capture and wait for all data to be sent.
1759
+ *
1760
+ * Waits for:
1761
+ * 1. All pending sendAudioChunk calls to complete (arrayBuffer conversion)
1762
+ * 2. WebSocket buffer to drain (all data transmitted)
1763
+ *
1764
+ * WebSocket message ordering ensures server receives all audio before transcript request.
1765
+ */
1766
+ async stopAudioCapture() {
1767
+ const config = getConfig();
1768
+ const startTime = Date.now();
1769
+ if (config.debug) console.log("[SpeechOS] stopAudioCapture: starting...");
1770
+ if (this.audioCapture) {
1771
+ await this.audioCapture.stop();
1772
+ this.audioCapture = null;
1773
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: recorder stopped after ${Date.now() - startTime}ms`);
1774
+ }
1775
+ state.setMicEnabled(false);
1776
+ if (this.pendingAudioSends.size > 0) {
1777
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: waiting for ${this.pendingAudioSends.size} pending audio sends...`);
1778
+ await Promise.all(this.pendingAudioSends);
1779
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: all sends complete after ${Date.now() - startTime}ms`);
1780
+ } else if (config.debug) console.log("[SpeechOS] stopAudioCapture: no pending sends");
1781
+ await this.waitForBufferDrain();
1782
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: complete after ${Date.now() - startTime}ms`);
1783
+ }
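The pendingAudioSends set awaited above is filled on the send side: each chunk registers its in-flight arrayBuffer conversion and removes itself when done. A sketch of that bookkeeping, assuming chunks arrive as Blobs (the real sendAudioChunk lives elsewhere in this bundle):

    const pendingAudioSends = new Set();

    function sendAudioChunk(ws, blob) {
      // Track the async Blob -> ArrayBuffer conversion so stopAudioCapture
      // can await every in-flight chunk before requesting the transcript.
      const send = blob.arrayBuffer().then((buf) => {
        if (ws.readyState === WebSocket.OPEN) ws.send(buf);
      });
      pendingAudioSends.add(send);
      // Swallow errors on the cleanup chain; Promise.all over the set
      // still surfaces the original rejection to stopAudioCapture.
      send.finally(() => pendingAudioSends.delete(send)).catch(() => {});
      return send;
    }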
1784
+ /**
1785
+ * Wait for the WebSocket send buffer to drain.
1786
+ *
1787
+ * This ensures all audio data has been transmitted before we request
1788
+ * the transcript. Uses the same pattern as LiveKit's ReadableStream approach.
1789
+ */
1790
+ async waitForBufferDrain() {
1791
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
1792
+ const config = getConfig();
1793
+ const startTime = Date.now();
1794
+ while (this.ws.bufferedAmount > 0) {
1795
+ if (Date.now() - startTime > BUFFER_DRAIN_TIMEOUT_MS) {
1796
+ console.warn(`[SpeechOS] Buffer drain timeout, ${this.ws.bufferedAmount} bytes still pending`);
1797
+ break;
1798
+ }
1799
+ await new Promise((resolve) => setTimeout(resolve, BUFFER_CHECK_INTERVAL_MS));
1800
+ }
1801
+ if (config.debug) console.log(`[SpeechOS] Buffer drained in ${Date.now() - startTime}ms`);
1802
+ }
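Once sends have stopped, ws.bufferedAmount only counts down as the socket flushes, so the poll loop above terminates unless the connection stalls. The same drain pattern as a standalone helper (the timeout and interval defaults are placeholders for the bundle's BUFFER_DRAIN_TIMEOUT_MS and BUFFER_CHECK_INTERVAL_MS constants):

    // Poll ws.bufferedAmount until the send buffer is empty or we give up.
    async function drainSocket(ws, timeoutMs = 3000, intervalMs = 50) {
      const start = Date.now();
      while (ws.bufferedAmount > 0) {
        if (Date.now() - start > timeoutMs) return false; // gave up
        await new Promise((resolve) => setTimeout(resolve, intervalMs));
      }
      return true; // fully drained
    }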
1803
+ /**
1804
+ * Send a JSON message over the WebSocket.
1805
+ */
1806
+ sendMessage(message) {
1807
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify(message));
1808
+ }
1809
+ /**
1810
+ * Disconnect from the WebSocket.
1811
+ */
1812
+ async disconnect() {
1813
+ const config = getConfig();
1814
+ if (config.debug) console.log("[SpeechOS] Disconnecting WebSocket...");
1815
+ await this.stopAudioCapture();
1816
+ if (this.ws) {
1817
+ this.ws.close();
1818
+ this.ws = null;
1819
+ }
1820
+ const error = new Error("Disconnected");
1821
+ if (this.pendingAuth) {
1822
+ this.pendingAuth.reject(error);
1823
+ this.pendingAuth = null;
1824
+ }
1825
+ if (this.pendingTranscript) {
1826
+ this.pendingTranscript.reject(error);
1827
+ this.pendingTranscript = null;
1828
+ }
1829
+ if (this.pendingEditText) {
1830
+ this.pendingEditText.reject(error);
1831
+ this.pendingEditText = null;
1832
+ }
1833
+ if (this.pendingCommand) {
1834
+ this.pendingCommand.reject(error);
1835
+ this.pendingCommand = null;
1836
+ }
1837
+ this.sessionId = null;
1838
+ this.editOriginalText = null;
1839
+ this.lastInputText = void 0;
1840
+ this.sessionSettings = {};
1841
+ state.setConnected(false);
1842
+ state.setMicEnabled(false);
1843
+ if (config.debug) console.log("[SpeechOS] WebSocket disconnected");
1844
+ }
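disconnect repeats the reject-and-clear sequence for all four pending deferreds, just as the error path at the top of this hunk does for its subset. If more request types are added, one possible factoring (a sketch only, not in the package) is:

    // Reject and null out one pending deferred slot.
    function settlePending(owner, key, error) {
      if (owner[key]) {
        owner[key].reject(error);
        owner[key] = null;
      }
    }
    // e.g. settlePending(this, "pendingTranscript", new Error("Disconnected"));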
1845
+ /**
1846
+ * Check if connected to WebSocket.
1847
+ */
1848
+ isConnected() {
1849
+ return this.ws !== null && this.ws.readyState === WebSocket.OPEN;
1850
+ }
1851
+ /**
1852
+ * Get the last input text from a command result.
1853
+ * This is the raw transcript of what the user said.
1854
+ */
1855
+ getLastInputText() {
1856
+ return this.lastInputText;
1857
+ }
1858
+ };
1859
+ const websocket = new WebSocketManager();
872
1860
 
873
1861
  //#endregion
874
1862
  //#region src/speechos.ts
875
1863
  /**
1864
+ * Get the active voice backend (always websocket now)
1865
+ */
1866
+ function getBackend$1() {
1867
+ return websocket;
1868
+ }
1869
+ /**
876
1870
  * SpeechOS Core SDK
877
1871
  *
878
1872
  * Provides two API layers:
@@ -891,7 +1885,6 @@ var SpeechOSCore = class {
891
1885
  const currentConfig$1 = getConfig();
892
1886
  if (currentConfig$1.debug) console.log("[SpeechOS] Initialized with config:", {
893
1887
  host: currentConfig$1.host,
894
- position: currentConfig$1.position,
895
1888
  debug: currentConfig$1.debug
896
1889
  });
897
1890
  }
@@ -931,7 +1924,6 @@ var SpeechOSCore = class {
931
1924
  state.setRecordingState("processing");
932
1925
  try {
933
1926
  const transcript = await livekit.stopAndGetTranscript();
934
- transcriptStore.saveTranscript(transcript, "dictate");
935
1927
  state.completeRecording();
936
1928
  return transcript;
937
1929
  } catch (error) {
@@ -948,7 +1940,6 @@ var SpeechOSCore = class {
948
1940
  state.setRecordingState("processing");
949
1941
  try {
950
1942
  const editedText = await livekit.stopAndEdit(originalText);
951
- transcriptStore.saveTranscript(editedText, "edit", originalText);
952
1943
  state.completeRecording();
953
1944
  return editedText;
954
1945
  } catch (error) {
@@ -974,8 +1965,13 @@ var SpeechOSCore = class {
974
1965
  state.setActiveAction("dictate");
975
1966
  state.startRecording();
976
1967
  try {
977
- await livekit.startVoiceSession();
978
- state.setRecordingState("recording");
1968
+ const backend = getBackend$1();
1969
+ await backend.startVoiceSession({
1970
+ action: "dictate",
1971
+ onMicReady: () => {
1972
+ state.setRecordingState("recording");
1973
+ }
1974
+ });
979
1975
  return new Promise((resolve, reject) => {
980
1976
  this._dictateResolve = resolve;
981
1977
  this._dictateReject = reject;
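The behavioral change in this hunk is that the recording state now flips inside onMicReady, when the microphone is actually capturing, rather than immediately after the session starts. From the caller's side the deferred pair works as follows (a sketch; the public entry point is assumed to be named dictate(), matching stopDictation() in the next hunk):

    // Start dictation; the returned promise settles when stopDictation()
    // later resolves it with the transcript.
    const pending = speechOS.dictate();
    // ...user speaks; the UI reports "recording" once the mic is live...
    const transcript = await speechOS.stopDictation();
    // `pending` resolves with the same transcript string.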
@@ -995,8 +1991,8 @@ var SpeechOSCore = class {
995
1991
  async stopDictation() {
996
1992
  state.setRecordingState("processing");
997
1993
  try {
998
- const transcript = await livekit.stopVoiceSession();
999
- transcriptStore.saveTranscript(transcript, "dictate");
1994
+ const backend = getBackend$1();
1995
+ const transcript = await backend.stopVoiceSession();
1000
1996
  state.completeRecording();
1001
1997
  if (this._dictateResolve) {
1002
1998
  this._dictateResolve(transcript);
@@ -1030,8 +2026,14 @@ var SpeechOSCore = class {
1030
2026
  state.startRecording();
1031
2027
  this._editOriginalText = originalText;
1032
2028
  try {
1033
- await livekit.startVoiceSession();
1034
- state.setRecordingState("recording");
2029
+ const backend = getBackend$1();
2030
+ await backend.startVoiceSession({
2031
+ action: "edit",
2032
+ inputText: originalText,
2033
+ onMicReady: () => {
2034
+ state.setRecordingState("recording");
2035
+ }
2036
+ });
1035
2037
  return new Promise((resolve, reject) => {
1036
2038
  this._editResolve = resolve;
1037
2039
  this._editReject = reject;
@@ -1052,9 +2054,9 @@ var SpeechOSCore = class {
1052
2054
  async stopEdit() {
1053
2055
  state.setRecordingState("processing");
1054
2056
  try {
2057
+ const backend = getBackend$1();
1055
2058
  const originalText = this._editOriginalText || "";
1056
- const editedText = await livekit.requestEditText(originalText);
1057
- transcriptStore.saveTranscript(editedText, "edit", originalText);
2059
+ const editedText = await backend.requestEditText(originalText);
1058
2060
  state.completeRecording();
1059
2061
  if (this._editResolve) {
1060
2062
  this._editResolve(editedText);
@@ -1077,6 +2079,71 @@ var SpeechOSCore = class {
1077
2079
  }
1078
2080
  }
1079
2081
  /**
2082
+ * One-shot command: connect, wait for agent, record voice, match against commands
2083
+ * Automatically handles the full voice session lifecycle
2084
+ *
2085
+ * @param commands - Array of command definitions to match against
2086
+ * @returns The matched command result or null if no match
2087
+ */
2088
+ async command(commands) {
2089
+ this.ensureInitialized();
2090
+ state.setActiveAction("command");
2091
+ state.startRecording();
2092
+ this._commandCommands = commands;
2093
+ try {
2094
+ const backend = getBackend$1();
2095
+ await backend.startVoiceSession({
2096
+ action: "command",
2097
+ commands,
2098
+ onMicReady: () => {
2099
+ state.setRecordingState("recording");
2100
+ }
2101
+ });
2102
+ return new Promise((resolve, reject) => {
2103
+ this._commandResolve = resolve;
2104
+ this._commandReject = reject;
2105
+ });
2106
+ } catch (error) {
2107
+ state.setError(error instanceof Error ? error.message : "Failed to start command");
2108
+ await this.cleanup();
2109
+ throw error;
2110
+ }
2111
+ }
2112
+ _commandCommands;
2113
+ _commandResolve;
2114
+ _commandReject;
2115
+ /**
2116
+ * Stop command recording and get the matched command
2117
+ * Call this after command() when user stops speaking
2118
+ */
2119
+ async stopCommand() {
2120
+ state.setRecordingState("processing");
2121
+ try {
2122
+ const backend = getBackend$1();
2123
+ const commands = this._commandCommands || [];
2124
+ const result = await backend.requestCommand(commands);
2125
+ state.completeRecording();
2126
+ if (this._commandResolve) {
2127
+ this._commandResolve(result);
2128
+ this._commandResolve = void 0;
2129
+ this._commandReject = void 0;
2130
+ }
2131
+ return result;
2132
+ } catch (error) {
2133
+ const err = error instanceof Error ? error : new Error("Command request failed");
2134
+ state.setError(err.message);
2135
+ if (this._commandReject) {
2136
+ this._commandReject(err);
2137
+ this._commandResolve = void 0;
2138
+ this._commandReject = void 0;
2139
+ }
2140
+ throw err;
2141
+ } finally {
2142
+ this._commandCommands = void 0;
2143
+ await this.cleanup();
2144
+ }
2145
+ }
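Together, command() and stopCommand() give a push-to-talk flow that mirrors dictation. A usage sketch, with the command shape assumed as before:

    const commands = [{ name: "next", description: "Go to the next page" }]; // assumed shape

    // On press: start listening. The promise settles via stopCommand().
    const pendingMatch = speechOS.command(commands);
    // On release: finish recording and match the transcript.
    const result = await speechOS.stopCommand(); // matched command or null
    // pendingMatch resolves with the same result.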
2146
+ /**
1080
2147
  * Cancel the current operation
1081
2148
  */
1082
2149
  async cancel() {
@@ -1091,7 +2158,13 @@ var SpeechOSCore = class {
1091
2158
  this._editResolve = void 0;
1092
2159
  this._editReject = void 0;
1093
2160
  }
2161
+ if (this._commandReject) {
2162
+ this._commandReject(err);
2163
+ this._commandResolve = void 0;
2164
+ this._commandReject = void 0;
2165
+ }
1094
2166
  this._editOriginalText = void 0;
2167
+ this._commandCommands = void 0;
1095
2168
  await this.cleanup();
1096
2169
  state.cancelRecording();
1097
2170
  }
@@ -1118,7 +2191,8 @@ var SpeechOSCore = class {
1118
2191
  }
1119
2192
  async cleanup() {
1120
2193
  try {
1121
- await livekit.disconnect();
2194
+ const backend = getBackend$1();
2195
+ await backend.disconnect();
1122
2196
  } catch (error) {
1123
2197
  const config = getConfig();
1124
2198
  if (config.debug) console.warn("[SpeechOS] Cleanup disconnect error:", error);
@@ -1134,6 +2208,9 @@ var SpeechOSCore = class {
1134
2208
  this._editResolve = void 0;
1135
2209
  this._editReject = void 0;
1136
2210
  this._editOriginalText = void 0;
2211
+ this._commandResolve = void 0;
2212
+ this._commandReject = void 0;
2213
+ this._commandCommands = void 0;
1137
2214
  resetConfig();
1138
2215
  state.reset();
1139
2216
  events.clear();
@@ -1141,6 +2218,34 @@ var SpeechOSCore = class {
1141
2218
  };
1142
2219
  const speechOS = new SpeechOSCore();
1143
2220
 
2221
+ //#endregion
2222
+ //#region src/backend.ts
2223
+ /**
2224
+ * WebSocket backend adapter - wraps the websocket module to match the VoiceBackend interface
2225
+ */
2226
+ const websocketBackend = {
2227
+ startVoiceSession: (options) => websocket.startVoiceSession(options),
2228
+ stopVoiceSession: () => websocket.stopVoiceSession(),
2229
+ requestEditText: (text) => websocket.requestEditText(text),
2230
+ requestCommand: (commands) => websocket.requestCommand(commands),
2231
+ disconnect: () => websocket.disconnect(),
2232
+ isConnected: () => websocket.isConnected(),
2233
+ getLastInputText: () => websocket.getLastInputText(),
2234
+ prefetchToken: () => Promise.resolve({}),
2235
+ startAutoRefresh: () => {},
2236
+ stopAutoRefresh: () => {},
2237
+ invalidateTokenCache: () => {}
2238
+ };
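Read off the adapter's members, the VoiceBackend surface it satisfies looks roughly like this; the parameter and return types are inferences, not the package's declarations:

    /**
     * @typedef {Object} VoiceBackend
     * @property {(options: Object) => Promise<void>} startVoiceSession
     * @property {() => Promise<string>} stopVoiceSession
     * @property {(text: string) => Promise<string>} requestEditText
     * @property {(commands: Object[]) => Promise<Object|null>} requestCommand
     * @property {() => Promise<void>} disconnect
     * @property {() => boolean} isConnected
     * @property {() => (string|undefined)} getLastInputText
     * @property {() => Promise<Object>} prefetchToken
     * @property {() => void} startAutoRefresh
     * @property {() => void} stopAutoRefresh
     * @property {() => void} invalidateTokenCache
     */

The last four members are no-op stubs here; they exist so the WebSocket adapter can stand in wherever the LiveKit backend's token management was expected.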
2239
+ /**
2240
+ * Get the active voice backend.
2241
+ * Always returns the WebSocket backend (LiveKit is legacy).
2242
+ *
2243
+ * @returns The websocket backend
2244
+ */
2245
+ function getBackend() {
2246
+ return websocketBackend;
2247
+ }
2248
+
1144
2249
  //#endregion
1145
2250
  //#region src/index.ts
1146
2251
  const VERSION = "0.1.0";
@@ -1151,15 +2256,15 @@ exports.Deferred = Deferred;
1151
2256
  exports.SpeechOSEventEmitter = SpeechOSEventEmitter;
1152
2257
  exports.VERSION = VERSION;
1153
2258
  exports.createStateManager = createStateManager;
1154
- exports.defaultConfig = defaultConfig;
1155
2259
  exports.events = events;
2260
+ exports.getBackend = getBackend;
1156
2261
  exports.getConfig = getConfig;
1157
2262
  exports.livekit = livekit;
1158
2263
  exports.resetConfig = resetConfig;
1159
2264
  exports.setConfig = setConfig;
1160
2265
  exports.speechOS = speechOS;
1161
2266
  exports.state = state;
1162
- exports.transcriptStore = transcriptStore;
1163
2267
  exports.updateUserId = updateUserId;
1164
2268
  exports.validateConfig = validateConfig;
2269
+ exports.websocket = websocket;
1165
2270
  //# sourceMappingURL=index.cjs.map
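For consumers of the CJS build, the net export change is that defaultConfig and transcriptStore are gone while getBackend and websocket are new. A minimal consumption sketch (the API key is a placeholder, and setConfig is assumed to accept the same user config that validateConfig checks):

    const { speechOS, getBackend, setConfig } = require("@speechos/core");

    setConfig({ apiKey: "YOUR_API_KEY", debug: true });
    const backend = getBackend();
    console.log("connected:", backend.isConnected()); // false until a session starts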