@speechos/core 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -12,8 +12,6 @@ const defaultConfig = {
  apiKey: "",
  userId: "",
  host: DEFAULT_HOST,
- position: "bottom-center",
- zIndex: 999999,
  debug: false
  };
  /**
@@ -21,31 +19,19 @@ const defaultConfig = {
  * @param userConfig - User-provided configuration
  * @returns Validated and merged configuration
  */
- function validateConfig(userConfig = {}) {
+ function validateConfig(userConfig) {
  if (!userConfig.apiKey) throw new Error("SpeechOS requires an apiKey. Get one from your team dashboard at /a/<team-slug>/.");
- const config = {
- ...defaultConfig,
- ...userConfig
+ return {
+ apiKey: userConfig.apiKey,
+ userId: userConfig.userId ?? defaultConfig.userId,
+ host: userConfig.host ?? defaultConfig.host,
+ debug: userConfig.debug ?? defaultConfig.debug
  };
- const validPositions = [
- "bottom-center",
- "bottom-right",
- "bottom-left"
- ];
- if (!validPositions.includes(config.position)) {
- console.warn(`Invalid position "${config.position}". Using default "bottom-center".`);
- config.position = "bottom-center";
- }
- if (typeof config.zIndex !== "number" || config.zIndex < 0) {
- console.warn(`Invalid zIndex "${config.zIndex}". Using default ${defaultConfig.zIndex}.`);
- config.zIndex = defaultConfig.zIndex;
- }
- return config;
  }
  /**
  * Current active configuration (singleton)
  */
- let currentConfig = defaultConfig;
+ let currentConfig = { ...defaultConfig };
  /**
  * Get the current configuration
  */
@@ -75,6 +61,28 @@ function updateUserId(userId) {
  userId
  };
  }
+ /**
+ * LocalStorage key for anonymous ID persistence
+ */
+ const ANONYMOUS_ID_KEY = "speechos_anonymous_id";
+ /**
+ * Get or generate a persistent anonymous ID for Mixpanel tracking.
+ *
+ * This ID is stored in localStorage to persist across sessions,
+ * allowing consistent anonymous user tracking without identifying
+ * the account owner's customers.
+ *
+ * @returns A UUID string for anonymous identification
+ */
+ function getAnonymousId() {
+ if (typeof localStorage === "undefined") return crypto.randomUUID();
+ let anonymousId = localStorage.getItem(ANONYMOUS_ID_KEY);
+ if (!anonymousId) {
+ anonymousId = crypto.randomUUID();
+ localStorage.setItem(ANONYMOUS_ID_KEY, anonymousId);
+ }
+ return anonymousId;
+ }

  //#endregion
  //#region src/events.ts
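A note on the new `getAnonymousId()` helper: `crypto.randomUUID()` is only defined in secure contexts (HTTPS or localhost), so a host page served over plain HTTP would throw before the localStorage fallback even matters. A hedged sketch of a guard a consumer could keep on hand for insecure test pages — this helper is illustrative and not part of the package:

```js
// Illustrative only - NOT part of @speechos/core.
// crypto.randomUUID() exists only in secure contexts; this falls back to
// building an RFC 4122 v4 UUID from crypto.getRandomValues().
function safeUuid() {
  if (typeof crypto.randomUUID === "function") return crypto.randomUUID();
  const bytes = crypto.getRandomValues(new Uint8Array(16));
  bytes[6] = (bytes[6] & 0x0f) | 0x40; // version 4
  bytes[8] = (bytes[8] & 0x3f) | 0x80; // variant 10
  const hex = [...bytes].map((b) => b.toString(16).padStart(2, "0")).join("");
  return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`;
}
```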
@@ -168,33 +176,38 @@ const initialState = {
  var StateManager = class {
  state;
  subscribers = /* @__PURE__ */ new Set();
+ /** Cached immutable snapshot for useSyncExternalStore compatibility */
+ snapshot;
  constructor(initialState$1) {
  this.state = { ...initialState$1 };
+ this.snapshot = Object.freeze({ ...this.state });
  }
  /**
- * Get the current state (returns a copy to prevent mutations)
+ * Get the current state snapshot (returns a stable reference for React)
+ * This returns an immutable frozen object that only changes when setState is called.
  */
  getState() {
- return { ...this.state };
+ return this.snapshot;
  }
  /**
  * Update state with partial values
  * @param partial - Partial state to merge with current state
  */
  setState(partial) {
- const prevState = { ...this.state };
+ const prevState = this.snapshot;
  this.state = {
  ...this.state,
  ...partial
  };
+ this.snapshot = Object.freeze({ ...this.state });
  this.subscribers.forEach((callback) => {
  try {
- callback(this.state, prevState);
+ callback(this.snapshot, prevState);
  } catch (error) {
  console.error("Error in state change callback:", error);
  }
  });
- events.emit("state:change", { state: this.state });
+ events.emit("state:change", { state: this.snapshot });
  }
  /**
  * Subscribe to state changes
@@ -211,7 +224,17 @@ var StateManager = class {
  * Reset state to initial values
  */
  reset() {
- this.setState(initialState);
+ const prevState = this.snapshot;
+ this.state = { ...initialState };
+ this.snapshot = Object.freeze({ ...this.state });
+ this.subscribers.forEach((callback) => {
+ try {
+ callback(this.snapshot, prevState);
+ } catch (error) {
+ console.error("Error in state change callback:", error);
+ }
+ });
+ events.emit("state:change", { state: this.snapshot });
  }
  /**
  * Show the widget
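The snapshot change is the significant one in these two hunks: React 18's `useSyncExternalStore` requires `getSnapshot` to return a referentially stable value between state changes, and the 0.2.0 behavior of returning a fresh `{ ...this.state }` copy on every call triggers an endless re-render loop. A consumer-side sketch of the hook this enables, assuming `subscribe(cb)` follows the usual convention of returning an unsubscribe function (the hook itself is not shipped in this package):

```js
import { useSyncExternalStore } from "react";

// Consumer-side sketch: because the snapshot is frozen and cached,
// getState() is a valid getSnapshot - same reference until setState/reset.
function useSpeechOSState(stateManager) {
  return useSyncExternalStore(
    (onStoreChange) => stateManager.subscribe(onStoreChange),
    () => stateManager.getState()
  );
}
```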
@@ -346,12 +369,15 @@ function createStateManager(initial) {

  //#endregion
  //#region src/livekit.ts
- const MESSAGE_TYPE_REQUEST_TRANSCRIPT = "request_transcript";
- const MESSAGE_TYPE_TRANSCRIPT = "transcript";
- const MESSAGE_TYPE_EDIT_TEXT = "edit_text";
- const MESSAGE_TYPE_EDITED_TEXT = "edited_text";
- const MESSAGE_TYPE_ERROR = "error";
+ const MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 = "request_transcript";
+ const MESSAGE_TYPE_TRANSCRIPT$1 = "transcript";
+ const MESSAGE_TYPE_EDIT_TEXT$1 = "edit_text";
+ const MESSAGE_TYPE_EDITED_TEXT$1 = "edited_text";
+ const MESSAGE_TYPE_EXECUTE_COMMAND$1 = "execute_command";
+ const MESSAGE_TYPE_COMMAND_RESULT$1 = "command_result";
+ const MESSAGE_TYPE_ERROR$1 = "error";
  const TOPIC_SPEECHOS = "speechos";
+ const TOKEN_CACHE_TTL_MS = 4 * 60 * 1e3;
  /**
  * A deferred promise with timeout support.
  * Encapsulates resolve/reject/timeout in a single object for cleaner async handling.
@@ -415,53 +441,116 @@ var LiveKitManager = class {
  room = null;
  tokenData = null;
  micTrack = null;
+ cachedTokenData = null;
+ tokenCacheTimestamp = null;
+ tokenPrefetchPromise = null;
+ tokenRefreshTimer = null;
+ autoRefreshEnabled = false;
  pendingTranscript = null;
  pendingEditText = null;
+ pendingCommand = null;
  pendingTrackSubscribed = null;
- preWarmPromise = null;
  editOriginalText = null;
+ sessionSettings = {};
  /**
- * Pre-warm resources for faster connection
- * Call this when user shows intent (e.g., expands widget)
- * Only fetches token - mic permission is requested when user clicks Dictate
+ * Check if the cached token is still valid (within TTL)
  */
- async preWarm() {
- if (this.tokenData || this.preWarmPromise || this.room?.state === "connected") {
- const config$1 = getConfig();
- if (config$1.debug) console.log("[SpeechOS] Pre-warm skipped - token already available");
- return;
- }
+ isCachedTokenValid() {
+ if (!this.cachedTokenData || !this.tokenCacheTimestamp) return false;
+ const age = Date.now() - this.tokenCacheTimestamp;
+ return age < TOKEN_CACHE_TTL_MS;
+ }
+ /**
+ * Pre-fetch a LiveKit token for later use
+ * Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
+ * If a prefetch is already in progress, returns the existing promise.
+ * If a valid cached token exists, returns it immediately.
+ */
+ async prefetchToken() {
  const config = getConfig();
- if (config.debug) console.log("[SpeechOS] Pre-warming: fetching token...");
- this.preWarmPromise = (async () => {
- try {
- await this.fetchToken();
- if (config.debug) console.log("[SpeechOS] Pre-warm complete - token ready");
- } catch (error) {
- if (config.debug) console.warn("[SpeechOS] Pre-warm failed:", error);
- this.preWarmPromise = null;
- }
- })();
- await this.preWarmPromise;
+ if (this.isCachedTokenValid() && this.cachedTokenData) {
+ if (config.debug) console.log("[SpeechOS] Using cached token (prefetch hit)");
+ return this.cachedTokenData;
+ }
+ if (this.tokenPrefetchPromise) {
+ if (config.debug) console.log("[SpeechOS] Prefetch already in progress, awaiting...");
+ return this.tokenPrefetchPromise;
+ }
+ if (config.debug) console.log("[SpeechOS] Starting token prefetch...");
+ this.tokenPrefetchPromise = this.fetchTokenFromServer().then((data) => {
+ this.cachedTokenData = data;
+ this.tokenCacheTimestamp = Date.now();
+ this.tokenPrefetchPromise = null;
+ return data;
+ }).catch((error) => {
+ this.tokenPrefetchPromise = null;
+ throw error;
+ });
+ return this.tokenPrefetchPromise;
  }
  /**
  * Fetch a LiveKit token from the backend
+ * Uses cached token if valid, otherwise fetches a fresh one.
+ * Includes language settings and user vocabulary which are stored in the VoiceSession.
  */
  async fetchToken() {
+ const config = getConfig();
+ if (this.isCachedTokenValid() && this.cachedTokenData) {
+ if (config.debug) console.log("[SpeechOS] Using cached token");
+ this.tokenData = this.cachedTokenData;
+ return this.cachedTokenData;
+ }
+ if (this.tokenPrefetchPromise) {
+ if (config.debug) console.log("[SpeechOS] Waiting for prefetch to complete...");
+ const data$1 = await this.tokenPrefetchPromise;
+ this.tokenData = data$1;
+ return data$1;
+ }
+ const data = await this.fetchTokenFromServer();
+ this.cachedTokenData = data;
+ this.tokenCacheTimestamp = Date.now();
+ this.tokenData = data;
+ return data;
+ }
+ /**
+ * Internal method to fetch a fresh token from the server
+ */
+ async fetchTokenFromServer() {
  const config = getConfig();
  const url = `${config.host}/livekit/api/token/`;
- if (config.debug) console.log("[SpeechOS] Fetching LiveKit token from:", url);
+ const settings = this.sessionSettings;
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
+ const smartFormat = settings.smartFormat ?? true;
+ const vocabulary = settings.vocabulary ?? [];
+ const snippets = settings.snippets ?? [];
+ if (config.debug) {
+ console.log("[SpeechOS] Fetching LiveKit token from:", url);
+ console.log("[SpeechOS] Session settings:", {
+ inputLanguage,
+ outputLanguage,
+ smartFormat,
+ snippetsCount: snippets.length,
+ vocabularyCount: vocabulary.length
+ });
+ }
  const response = await fetch(url, {
  method: "POST",
  headers: {
  "Content-Type": "application/json",
  ...config.apiKey ? { Authorization: `Api-Key ${config.apiKey}` } : {}
  },
- body: JSON.stringify({ user_id: config.userId || null })
+ body: JSON.stringify({
+ user_id: config.userId || null,
+ input_language: inputLanguage,
+ output_language: outputLanguage,
+ smart_format: smartFormat,
+ custom_vocabulary: vocabulary,
+ custom_snippets: snippets
+ })
  });
  if (!response.ok) throw new Error(`Failed to fetch LiveKit token: ${response.status} ${response.statusText}`);
  const data = await response.json();
- this.tokenData = data;
  if (config.debug) console.log("[SpeechOS] LiveKit token received:", {
  room: data.room,
  identity: data.identity,
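Taken together, `isCachedTokenValid()`, `prefetchToken()`, and `fetchToken()` replace the 0.2.0 `preWarm()` with a single-flight cache under a four-minute TTL (`TOKEN_CACHE_TTL_MS = 4 * 60 * 1e3`): concurrent callers share one in-flight request, and results are reused until they age out. The shape, condensed into a standalone sketch (the framing is illustrative; the names mirror the diff):

```js
// Single-flight TTL cache, as used for LiveKit tokens above.
const TTL_MS = 4 * 60 * 1000;
let cached = null, cachedAt = 0, inFlight = null;

async function getToken(fetchFresh) {
  if (cached && Date.now() - cachedAt < TTL_MS) return cached; // cache hit
  if (inFlight) return inFlight; // join the request already in progress
  inFlight = fetchFresh()
    .then((data) => { cached = data; cachedAt = Date.now(); return data; })
    .finally(() => { inFlight = null; });
  return inFlight;
}
```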
@@ -474,8 +563,7 @@ var LiveKitManager = class {
  */
  async connect() {
  const config = getConfig();
- if (!this.tokenData) await this.fetchToken();
- else if (config.debug) console.log("[SpeechOS] Using pre-fetched token");
+ await this.fetchToken();
  if (!this.tokenData) throw new Error("No token available for LiveKit connection");
  this.room = new Room({
  adaptiveStream: true,
@@ -539,7 +627,7 @@ var LiveKitManager = class {
  try {
  const message = JSON.parse(new TextDecoder().decode(data));
  if (config.debug) console.log("[SpeechOS] Data received:", message);
- if (message.type === MESSAGE_TYPE_TRANSCRIPT) {
+ if (message.type === MESSAGE_TYPE_TRANSCRIPT$1) {
  const transcript = message.transcript || "";
  if (config.debug) console.log("[SpeechOS] Transcript received:", transcript);
  events.emit("transcription:complete", { text: transcript });
@@ -547,7 +635,7 @@ var LiveKitManager = class {
  this.pendingTranscript.resolve(transcript);
  this.pendingTranscript = null;
  }
- } else if (message.type === MESSAGE_TYPE_EDITED_TEXT) {
+ } else if (message.type === MESSAGE_TYPE_EDITED_TEXT$1) {
  const editedText = message.text || "";
  if (config.debug) console.log("[SpeechOS] Edited text received:", editedText);
  events.emit("edit:complete", {
@@ -559,7 +647,15 @@ var LiveKitManager = class {
  this.pendingEditText = null;
  }
  this.editOriginalText = null;
- } else if (message.type === MESSAGE_TYPE_ERROR) {
+ } else if (message.type === MESSAGE_TYPE_COMMAND_RESULT$1) {
+ const commandResult = message.command || null;
+ if (config.debug) console.log("[SpeechOS] Command result received:", commandResult);
+ events.emit("command:complete", { command: commandResult });
+ if (this.pendingCommand) {
+ this.pendingCommand.resolve(commandResult);
+ this.pendingCommand = null;
+ }
+ } else if (message.type === MESSAGE_TYPE_ERROR$1) {
  const serverError = message;
  const errorCode = serverError.code || "server_error";
  const errorMessage = serverError.message || "A server error occurred";
@@ -579,6 +675,10 @@ var LiveKitManager = class {
  this.pendingEditText.reject(error);
  this.pendingEditText = null;
  }
+ if (this.pendingCommand) {
+ this.pendingCommand.reject(error);
+ this.pendingCommand = null;
+ }
  }
  } catch (error) {
  console.error("[SpeechOS] Failed to parse data message:", error);
@@ -586,16 +686,34 @@ var LiveKitManager = class {
  }
  /**
  * Publish microphone audio track
+ * Uses the device ID from session settings if set
  */
  async enableMicrophone() {
  if (!this.room || this.room.state !== "connected") throw new Error("Not connected to room");
  const config = getConfig();
  if (!this.micTrack) {
  if (config.debug) console.log("[SpeechOS] Creating microphone track...");
- this.micTrack = await createLocalAudioTrack({
+ const deviceId = this.sessionSettings.audioDeviceId;
+ const trackOptions = {
  echoCancellation: true,
  noiseSuppression: true
- });
+ };
+ if (deviceId) {
+ trackOptions.deviceId = { exact: deviceId };
+ if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
+ }
+ try {
+ this.micTrack = await createLocalAudioTrack(trackOptions);
+ } catch (error) {
+ if (deviceId && error instanceof Error) {
+ console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
+ this.micTrack = await createLocalAudioTrack({
+ echoCancellation: true,
+ noiseSuppression: true
+ });
+ } else throw error;
+ }
+ this.logMicrophoneInfo();
  }
  const existingPub = this.room.localParticipant.getTrackPublication(Track.Source.Microphone);
  if (!existingPub) {
@@ -605,6 +723,24 @@ var LiveKitManager = class {
  }
  }
  /**
+ * Log information about the current microphone track
+ */
+ logMicrophoneInfo() {
+ if (!this.micTrack) return;
+ const config = getConfig();
+ const mediaTrack = this.micTrack.mediaStreamTrack;
+ const settings = mediaTrack.getSettings();
+ console.log("[SpeechOS] Microphone active:", {
+ deviceId: settings.deviceId || "unknown",
+ label: mediaTrack.label || "Unknown device",
+ sampleRate: settings.sampleRate,
+ channelCount: settings.channelCount,
+ echoCancellation: settings.echoCancellation,
+ noiseSuppression: settings.noiseSuppression
+ });
+ if (config.debug) console.log("[SpeechOS] Full audio track settings:", settings);
+ }
+ /**
  * Disable microphone audio track
  */
  async disableMicrophone() {
@@ -636,30 +772,85 @@ var LiveKitManager = class {
  });
  }
  /**
- * Start a voice session
- * Connects to room, enables microphone, and waits for agent to subscribe to our track
+ * Start a voice session with pre-connect audio buffering
+ * Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
+ * Agent subscription happens in the background - we don't block on it.
+ *
+ * @param options - Session options including action type and parameters
  */
- async startVoiceSession() {
+ async startVoiceSession(options) {
  const config = getConfig();
  if (config.debug) console.log("[SpeechOS] Starting voice session...");
- if (this.preWarmPromise) {
- if (config.debug) console.log("[SpeechOS] Waiting for pre-warm to complete...");
- await this.preWarmPromise;
- }
- if (this.tokenData) {
- if (config.debug) console.log("[SpeechOS] Using cached token from init");
- } else {
- if (config.debug) console.log("[SpeechOS] Fetching fresh token for session...");
- await this.fetchToken();
- }
+ this.sessionSettings = options?.settings || {};
+ await this.fetchToken();
+ if (!this.tokenData) throw new Error("No token available for LiveKit connection");
  this.pendingTrackSubscribed = new Deferred();
  this.pendingTrackSubscribed.setTimeout(15e3, "Connection timed out - agent not available", "connection_timeout", "connection");
- await this.connect();
- await this.enableMicrophone();
- if (config.debug) console.log("[SpeechOS] Microphone published, waiting for LocalTrackSubscribed event...");
- await this.pendingTrackSubscribed.promise;
- this.pendingTrackSubscribed = null;
- if (config.debug) console.log("[SpeechOS] Voice session ready - agent subscribed to audio");
+ this.room = new Room({
+ adaptiveStream: true,
+ dynacast: true
+ });
+ this.setupRoomEvents();
+ if (config.debug) console.log("[SpeechOS] Connecting to LiveKit room:", this.tokenData.room, "at", this.tokenData.ws_url);
+ await this.room.connect(this.tokenData.ws_url, this.tokenData.token);
+ if (config.debug) console.log("[SpeechOS] Connected, enabling microphone with preConnectBuffer...");
+ await this.enableMicrophoneWithPreConnectBuffer();
+ if (options?.onMicReady) options.onMicReady();
+ state.setConnected(true);
+ if (config.debug) console.log("[SpeechOS] Voice session ready - microphone active");
+ this.waitForAgentSubscription();
+ }
+ /**
+ * Wait for the agent to subscribe to our audio track in the background
+ * Handles timeout errors without blocking the main flow
+ */
+ waitForAgentSubscription() {
+ const config = getConfig();
+ if (!this.pendingTrackSubscribed) return;
+ this.pendingTrackSubscribed.promise.then(() => {
+ if (config.debug) console.log("[SpeechOS] Agent subscribed to audio track - full duplex established");
+ this.pendingTrackSubscribed = null;
+ }).catch((error) => {
+ console.warn("[SpeechOS] Agent subscription timeout:", error.message);
+ this.pendingTrackSubscribed = null;
+ });
+ }
+ /**
+ * Enable microphone with pre-connect buffering
+ * This starts capturing audio locally before the room is connected,
+ * buffering it until the connection is established.
+ */
+ async enableMicrophoneWithPreConnectBuffer() {
+ if (!this.room) throw new Error("Room not initialized");
+ const config = getConfig();
+ const deviceId = this.sessionSettings.audioDeviceId;
+ const constraints = {
+ echoCancellation: true,
+ noiseSuppression: true
+ };
+ if (deviceId) {
+ constraints.deviceId = { exact: deviceId };
+ if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
+ }
+ try {
+ await this.room.localParticipant.setMicrophoneEnabled(true, constraints, { preConnectBuffer: true });
+ state.setMicEnabled(true);
+ const micPub = this.room.localParticipant.getTrackPublication(Track.Source.Microphone);
+ if (micPub?.track) {
+ this.micTrack = micPub.track;
+ this.logMicrophoneInfo();
+ }
+ if (config.debug) console.log("[SpeechOS] Microphone enabled with pre-connect buffer - audio is being captured");
+ } catch (error) {
+ if (deviceId && error instanceof Error) {
+ console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
+ await this.room.localParticipant.setMicrophoneEnabled(true, {
+ echoCancellation: true,
+ noiseSuppression: true
+ }, { preConnectBuffer: true });
+ state.setMicEnabled(true);
+ } else throw error;
+ }
  }
  /**
  * Stop the voice session and request the transcript
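The rewritten `startVoiceSession()` drops the 0.2.0 sequence of connect → publish → block until the agent subscribes. Instead it leans on LiveKit's pre-connect buffer: `setMicrophoneEnabled(true, constraints, { preConnectBuffer: true })` starts local capture right away and replays the buffered audio once the track publishes, so the user's first words survive the connection handshake, while agent subscription is merely observed in the background. A sketch of a caller using the new options parameter (the `widget` object is hypothetical, not part of the package):

```js
// Hypothetical caller: flip the UI to "listening" as soon as the mic is live,
// without waiting for the agent to subscribe.
await livekit.startVoiceSession({
  settings: { inputLanguageCode: "en-US" },
  onMicReady: () => widget.showListeningIndicator(), // `widget` is illustrative
});
```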
@@ -668,12 +859,19 @@ var LiveKitManager = class {
  */
  async stopVoiceSession() {
  const config = getConfig();
+ const settings = this.sessionSettings;
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
+ console.log("[SpeechOS] Dictate command:", {
+ inputLanguage,
+ outputLanguage
+ });
  if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
  await this.disableMicrophone();
  if (config.debug) console.log("[SpeechOS] Requesting transcript from agent...");
  this.pendingTranscript = new Deferred();
  this.pendingTranscript.setTimeout(1e4, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
- await this.sendDataMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT });
+ await this.sendDataMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 });
  const result = await this.pendingTranscript.promise;
  this.pendingTranscript = null;
  return result;
@@ -692,6 +890,14 @@ var LiveKitManager = class {
  */
  async requestEditText(originalText) {
  const config = getConfig();
+ const settings = this.sessionSettings;
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
+ console.log("[SpeechOS] Edit command:", {
+ inputLanguage,
+ outputLanguage,
+ originalTextLength: originalText.length
+ });
  if (config.debug) console.log("[SpeechOS] Requesting text edit...");
  this.editOriginalText = originalText;
  await this.disableMicrophone();
@@ -699,7 +905,7 @@ var LiveKitManager = class {
  this.pendingEditText = new Deferred();
  this.pendingEditText.setTimeout(15e3, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
  await this.sendDataMessage({
- type: MESSAGE_TYPE_EDIT_TEXT,
+ type: MESSAGE_TYPE_EDIT_TEXT$1,
  text: originalText
  });
  const result = await this.pendingEditText.promise;
@@ -713,6 +919,39 @@ var LiveKitManager = class {
  return this.requestEditText(originalText);
  }
  /**
+ * Request command matching using the transcript as input
+ * Sends command definitions to the backend, which matches the user's speech against them
+ * Returns a promise that resolves with the matched command or null if no match
+ * @throws Error if timeout occurs waiting for command result
+ */
+ async requestCommand(commands) {
+ const config = getConfig();
+ const settings = this.sessionSettings;
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
+ console.log("[SpeechOS] Command request:", {
+ inputLanguage,
+ commandCount: commands.length
+ });
+ if (config.debug) console.log("[SpeechOS] Requesting command match...");
+ await this.disableMicrophone();
+ if (config.debug) console.log("[SpeechOS] Sending execute_command request to agent...");
+ this.pendingCommand = new Deferred();
+ this.pendingCommand.setTimeout(15e3, "Command request timed out. Please try again.", "command_timeout", "timeout");
+ await this.sendDataMessage({
+ type: MESSAGE_TYPE_EXECUTE_COMMAND$1,
+ commands
+ });
+ const result = await this.pendingCommand.promise;
+ this.pendingCommand = null;
+ return result;
+ }
+ /**
+ * Alias for requestCommand - granular API naming
+ */
+ async stopAndCommand(commands) {
+ return this.requestCommand(commands);
+ }
+ /**
  * Disconnect from the current room
  * Clears the token so a fresh one is fetched for the next session
  */
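`requestCommand(commands)` (and its `stopAndCommand` alias) ships caller-defined command definitions to the agent and resolves with the matched command, or `null` when nothing matches. The command object schema is not visible in this diff; a plausible call shape, inferred only from `commandCount: commands.length` and the `command_result` handling, might look like:

```js
// Illustrative only - the command schema is not shown in this diff.
const match = await livekit.requestCommand([
  { name: "open_settings", description: "Open the settings panel" },
  { name: "submit_form", description: "Submit the current form" },
]);
if (match) console.log("Matched command:", match);
```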
@@ -735,16 +974,110 @@ var LiveKitManager = class {
  this.pendingEditText.reject(new Error("Disconnected"));
  this.pendingEditText = null;
  }
+ if (this.pendingCommand) {
+ this.pendingCommand.reject(new Error("Disconnected"));
+ this.pendingCommand = null;
+ }
  if (this.pendingTrackSubscribed) {
  this.pendingTrackSubscribed.reject(new Error("Disconnected"));
  this.pendingTrackSubscribed = null;
  }
  this.tokenData = null;
- this.preWarmPromise = null;
  this.editOriginalText = null;
+ this.sessionSettings = {};
  if (config.debug) console.log("[SpeechOS] Session state cleared");
  }
  /**
+ * Invalidate the cached token
+ * Call this when settings change that would affect the token (language, vocabulary)
+ */
+ invalidateTokenCache() {
+ const config = getConfig();
+ if (config.debug) console.log("[SpeechOS] Token cache invalidated");
+ this.cachedTokenData = null;
+ this.tokenCacheTimestamp = null;
+ }
+ /**
+ * Start auto-refreshing the token while the widget is expanded.
+ * Call this after a voice session completes to immediately fetch a fresh token
+ * (since each command requires its own token) and keep it fresh for subsequent commands.
+ */
+ startAutoRefresh() {
+ const config = getConfig();
+ this.autoRefreshEnabled = true;
+ if (config.debug) console.log("[SpeechOS] Token auto-refresh enabled");
+ this.invalidateTokenCache();
+ this.prefetchToken().then(() => {
+ this.scheduleTokenRefresh();
+ }).catch((error) => {
+ if (config.debug) console.warn("[SpeechOS] Failed to prefetch token after command:", error);
+ if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
+ this.performAutoRefresh();
+ }, 5 * 1e3);
+ });
+ }
+ /**
+ * Stop auto-refreshing the token.
+ * Call this when the widget collapses or user navigates away.
+ */
+ stopAutoRefresh() {
+ const config = getConfig();
+ this.autoRefreshEnabled = false;
+ if (this.tokenRefreshTimer) {
+ clearTimeout(this.tokenRefreshTimer);
+ this.tokenRefreshTimer = null;
+ }
+ if (config.debug) console.log("[SpeechOS] Token auto-refresh disabled");
+ }
+ /**
+ * Schedule a token refresh before the current cache expires.
+ * Handles computer sleep by checking elapsed time on each refresh attempt.
+ */
+ scheduleTokenRefresh() {
+ if (!this.autoRefreshEnabled) return;
+ if (this.tokenRefreshTimer) {
+ clearTimeout(this.tokenRefreshTimer);
+ this.tokenRefreshTimer = null;
+ }
+ const config = getConfig();
+ const refreshBuffer = 30 * 1e3;
+ let timeUntilRefresh;
+ if (this.tokenCacheTimestamp) {
+ const age = Date.now() - this.tokenCacheTimestamp;
+ const timeRemaining = TOKEN_CACHE_TTL_MS - age;
+ timeUntilRefresh = Math.max(0, timeRemaining - refreshBuffer);
+ } else timeUntilRefresh = 0;
+ if (config.debug) console.log(`[SpeechOS] Scheduling token refresh in ${Math.round(timeUntilRefresh / 1e3)}s`);
+ this.tokenRefreshTimer = setTimeout(() => {
+ this.performAutoRefresh();
+ }, timeUntilRefresh);
+ }
+ /**
+ * Perform the auto-refresh, handling computer sleep scenarios.
+ */
+ async performAutoRefresh() {
+ if (!this.autoRefreshEnabled) return;
+ const config = getConfig();
+ if (this.isCachedTokenValid()) {
+ if (config.debug) console.log("[SpeechOS] Token still valid on refresh check, rescheduling");
+ this.scheduleTokenRefresh();
+ return;
+ }
+ if (config.debug) console.log("[SpeechOS] Auto-refreshing token...");
+ try {
+ const data = await this.fetchTokenFromServer();
+ this.cachedTokenData = data;
+ this.tokenCacheTimestamp = Date.now();
+ if (config.debug) console.log("[SpeechOS] Token auto-refreshed successfully");
+ this.scheduleTokenRefresh();
+ } catch (error) {
+ console.warn("[SpeechOS] Token auto-refresh failed:", error);
+ if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
+ this.performAutoRefresh();
+ }, 30 * 1e3);
+ }
+ }
+ /**
  * Get the current room instance
  */
  getRoom() {
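The refresh schedule works backward from the cache TTL: with `TOKEN_CACHE_TTL_MS` at four minutes and a 30-second `refreshBuffer`, a token fetched at t=0 is refreshed around t=3.5 min, and if the machine slept past expiry, `timeUntilRefresh` clamps to zero so the refresh fires immediately. Worked through with the diff's own constants:

```js
// Refresh timing with the constants defined above.
const TOKEN_CACHE_TTL_MS = 4 * 60 * 1000; // 240000 ms
const refreshBuffer = 30 * 1000;          // 30000 ms
const age = 60 * 1000;                    // token fetched one minute ago
// 240000 - 60000 - 30000 = 150000 ms -> next refresh in 2.5 minutes
const timeUntilRefresh = Math.max(0, TOKEN_CACHE_TTL_MS - age - refreshBuffer);
```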
@@ -768,88 +1101,749 @@ var LiveKitManager = class {
  isMicrophoneEnabled() {
  return this.micTrack !== null;
  }
- /**
- * Clear the cached token
- * Used when user identity changes to ensure next session gets a fresh token
- */
- clearToken() {
- const config = getConfig();
- if (config.debug) console.log("[SpeechOS] Clearing cached token");
- this.tokenData = null;
- this.preWarmPromise = null;
- }
  };
  const livekit = new LiveKitManager();
+ events.on("settings:changed", () => {
+ livekit.invalidateTokenCache();
+ });

  //#endregion
- //#region src/transcript-store.ts
- const STORAGE_KEY = "speechos_transcripts";
- const MAX_ENTRIES = 50;
+ //#region src/audio-capture.ts
  /**
- * Generate a unique ID for transcript entries
+ * Detect if running in Safari.
  */
- function generateId() {
- return `${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
+ function isSafari() {
+ const ua = navigator.userAgent.toLowerCase();
+ const vendor = navigator.vendor?.toLowerCase() || "";
+ const hasSafariUA = ua.includes("safari") && !ua.includes("chrome") && !ua.includes("chromium");
+ const isAppleVendor = vendor.includes("apple");
+ return hasSafariUA && isAppleVendor;
  }
  /**
- * Get all transcripts from localStorage
+ * Detect the best supported audio format for the current browser.
+ *
+ * IMPORTANT: Safari must use MP4/AAC. Its WebM/Opus implementation is buggy
+ * and produces truncated/incomplete audio.
  */
- function getTranscripts() {
- try {
- const stored = localStorage.getItem(STORAGE_KEY);
- if (!stored) return [];
- const entries = JSON.parse(stored);
- return entries.sort((a, b) => b.timestamp - a.timestamp);
- } catch {
- return [];
+ function getSupportedAudioFormat() {
+ if (isSafari()) {
+ if (MediaRecorder.isTypeSupported("audio/mp4")) return {
+ mimeType: "audio/mp4",
+ format: "mp4",
+ needsEncodingParams: false
+ };
+ return {
+ mimeType: "",
+ format: "mp4",
+ needsEncodingParams: true
+ };
  }
+ if (MediaRecorder.isTypeSupported("audio/webm;codecs=opus")) return {
+ mimeType: "audio/webm;codecs=opus",
+ format: "webm",
+ needsEncodingParams: false
+ };
+ if (MediaRecorder.isTypeSupported("audio/webm")) return {
+ mimeType: "audio/webm",
+ format: "webm",
+ needsEncodingParams: false
+ };
+ if (MediaRecorder.isTypeSupported("audio/mp4")) return {
+ mimeType: "audio/mp4",
+ format: "mp4",
+ needsEncodingParams: false
+ };
+ return {
+ mimeType: "",
+ format: "webm",
+ needsEncodingParams: true
+ };
  }
  /**
- * Save a new transcript entry
+ * Audio capture manager with buffering support.
+ *
+ * Usage:
+ * 1. Create instance with onChunk callback
+ * 2. Call start() - immediately begins capturing
+ * 3. Call setReady() when connection is established - flushes buffer
+ * 4. Call stop() when done
  */
- function saveTranscript(text, action, originalText) {
- const entry = {
- id: generateId(),
- text,
- timestamp: Date.now(),
- action,
- ...originalText && { originalText }
- };
- const entries = getTranscripts();
- entries.unshift(entry);
- const pruned = entries.slice(0, MAX_ENTRIES);
- try {
- localStorage.setItem(STORAGE_KEY, JSON.stringify(pruned));
- } catch {}
- return entry;
- }
+ var AudioCapture = class AudioCapture {
+ mediaStream = null;
+ recorder = null;
+ buffer = [];
+ isReady = false;
+ isRecording = false;
+ onChunk;
+ audioFormat;
+ deviceId;
+ /**
+ * Time slice for MediaRecorder in milliseconds.
+ *
+ * Safari requires a larger timeslice (1000ms) to properly flush its internal
+ * audio buffers. Smaller values cause Safari to drop or truncate audio data.
+ * See: https://community.openai.com/t/whisper-problem-with-audio-mp4-blobs-from-safari/
+ *
+ * Other browsers (Chrome, Firefox, Edge) work well with smaller timeslices
+ * which provide lower latency for real-time transcription.
+ */
+ static TIME_SLICE_MS = 100;
+ static SAFARI_TIME_SLICE_MS = 1e3;
+ /**
+ * @param onChunk - Callback for receiving audio chunks
+ * @param deviceId - Optional audio device ID (empty string or undefined for system default)
+ */
+ constructor(onChunk, deviceId) {
+ this.onChunk = onChunk;
+ this.audioFormat = getSupportedAudioFormat();
+ this.deviceId = deviceId;
+ }
+ /**
+ * Get the appropriate timeslice for the current browser.
+ * Safari needs a larger timeslice to avoid dropping audio data.
+ */
+ getTimeSlice() {
+ return isSafari() ? AudioCapture.SAFARI_TIME_SLICE_MS : AudioCapture.TIME_SLICE_MS;
+ }
+ /**
+ * Get the timeslice being used (in milliseconds).
+ * Useful for callers that need to wait for audio processing.
+ */
+ getTimeSliceMs() {
+ return this.getTimeSlice();
+ }
+ /**
+ * Get the audio format being used.
+ */
+ getFormat() {
+ return this.audioFormat;
+ }
+ /**
+ * Start capturing audio immediately.
+ *
+ * Audio chunks will be buffered until setReady() is called.
+ */
+ async start() {
+ const config = getConfig();
+ if (this.isRecording) {
+ if (config.debug) console.log("[SpeechOS] AudioCapture already recording");
+ return;
+ }
+ this.buffer = [];
+ this.isReady = false;
+ const constraints = { audio: {
+ echoCancellation: true,
+ noiseSuppression: true,
+ ...this.deviceId ? { deviceId: { exact: this.deviceId } } : {}
+ } };
+ if (config.debug) {
+ console.log("[SpeechOS] AudioCapture starting with format:", this.audioFormat.mimeType);
+ console.log("[SpeechOS] Detected Safari:", isSafari());
+ if (this.deviceId) console.log("[SpeechOS] Using audio device:", this.deviceId);
+ }
+ try {
+ this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
+ const recorderOptions = {};
+ if (this.audioFormat.mimeType) recorderOptions.mimeType = this.audioFormat.mimeType;
+ this.recorder = new MediaRecorder(this.mediaStream, recorderOptions);
+ this.recorder.ondataavailable = (event) => {
+ if (event.data && event.data.size > 0) this.handleChunk(event.data);
+ };
+ this.recorder.onerror = (event) => {
+ console.error("[SpeechOS] MediaRecorder error:", event);
+ };
+ const timeSlice = this.getTimeSlice();
+ this.recorder.start(timeSlice);
+ this.isRecording = true;
+ if (config.debug) console.log(`[SpeechOS] AudioCapture started with ${timeSlice}ms timeslice, buffering until ready`);
+ } catch (error) {
+ if (this.deviceId && error instanceof Error) {
+ console.warn("[SpeechOS] Selected device unavailable, trying default:", error.message);
+ this.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: {
+ echoCancellation: true,
+ noiseSuppression: true
+ } });
+ const recorderOptions = {};
+ if (this.audioFormat.mimeType) recorderOptions.mimeType = this.audioFormat.mimeType;
+ this.recorder = new MediaRecorder(this.mediaStream, recorderOptions);
+ this.recorder.ondataavailable = (event) => {
+ if (event.data && event.data.size > 0) this.handleChunk(event.data);
+ };
+ this.recorder.start(this.getTimeSlice());
+ this.isRecording = true;
+ } else throw error;
+ }
+ }
+ /**
+ * Handle an audio chunk with atomic buffer swap pattern.
+ *
+ * If not ready: buffer the chunk.
+ * If ready: send directly via callback.
+ */
+ handleChunk(chunk) {
+ if (this.isReady) this.onChunk(chunk);
+ else this.buffer.push(chunk);
+ }
+ /**
+ * Mark the capture as ready (connection established).
+ *
+ * This flushes any buffered chunks and switches to direct mode.
+ * Uses atomic swap to prevent chunk reordering.
+ */
+ setReady() {
+ const config = getConfig();
+ if (this.isReady) return;
+ const toFlush = this.buffer;
+ this.buffer = [];
+ for (const chunk of toFlush) this.onChunk(chunk);
+ this.isReady = true;
+ if (config.debug) console.log(`[SpeechOS] AudioCapture ready, flushed ${toFlush.length} buffered chunks`);
+ }
+ /**
+ * Stop capturing audio and wait for final chunk.
+ *
+ * Uses requestData() before stop() to force the MediaRecorder to flush
+ * any buffered audio immediately. This is critical for Safari which
+ * may hold audio data in internal buffers.
+ *
+ * Safari requires an additional delay after stopping to ensure all audio
+ * from its internal encoding pipeline has been fully processed and emitted.
+ */
+ async stop() {
+ const config = getConfig();
+ const safari = isSafari();
+ if (this.recorder && this.recorder.state !== "inactive") {
+ if (this.recorder.state === "recording") try {
+ const dataPromise = new Promise((resolve) => {
+ const handler = (event) => {
+ this.recorder?.removeEventListener("dataavailable", handler);
+ if (config.debug) console.log(`[SpeechOS] requestData flush received: ${event.data.size} bytes`);
+ resolve();
+ };
+ this.recorder?.addEventListener("dataavailable", handler);
+ });
+ this.recorder.requestData();
+ if (config.debug) console.log("[SpeechOS] Requested data flush before stop");
+ await dataPromise;
+ } catch (e) {
+ if (config.debug) console.log("[SpeechOS] requestData() not supported or failed:", e);
+ }
+ const stopPromise = new Promise((resolve) => {
+ if (!this.recorder) {
+ resolve();
+ return;
+ }
+ this.recorder.onstop = () => {
+ if (config.debug) console.log("[SpeechOS] MediaRecorder onstop fired");
+ resolve();
+ };
+ });
+ this.recorder.stop();
+ await stopPromise;
+ if (safari) {
+ if (config.debug) console.log("[SpeechOS] Safari: waiting 2s for encoding pipeline to flush");
+ await new Promise((resolve) => setTimeout(resolve, 2e3));
+ }
+ }
+ if (this.mediaStream) {
+ for (const track of this.mediaStream.getTracks()) track.stop();
+ this.mediaStream = null;
+ }
+ this.recorder = null;
+ this.isRecording = false;
+ this.isReady = false;
+ this.buffer = [];
+ if (config.debug) console.log("[SpeechOS] AudioCapture stopped");
+ }
+ /**
+ * Check if currently recording.
+ */
+ get recording() {
+ return this.isRecording;
+ }
+ /**
+ * Check if ready (connection established, direct mode active).
+ */
+ get ready() {
+ return this.isReady;
+ }
+ /**
+ * Get the number of buffered chunks waiting to be sent.
+ */
+ get bufferedChunks() {
+ return this.buffer.length;
+ }
+ };
  /**
- * Clear all transcript history
+ * Factory function to create an AudioCapture instance.
+ * @param onChunk - Callback for receiving audio chunks
+ * @param deviceId - Optional audio device ID (empty string or undefined for system default)
  */
- function clearTranscripts() {
- try {
- localStorage.removeItem(STORAGE_KEY);
- } catch {}
+ function createAudioCapture(onChunk, deviceId) {
+ return new AudioCapture(onChunk, deviceId);
  }
+
+ //#endregion
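`AudioCapture` decouples capture from transport: chunks accumulate in memory until `setReady()` flips the instance to pass-through, and the array swap inside `setReady()` keeps chunk order intact even if `ondataavailable` fires mid-flush. A minimal lifecycle sketch matching the class's own usage notes (the array sink stands in for a real network connection):

```js
// Lifecycle from the class docs: start -> setReady -> stop.
const sink = []; // stand-in for a network sink such as a WebSocket
const capture = createAudioCapture((chunk) => sink.push(chunk));

await capture.start(); // recording begins immediately; chunks buffer
// ... open the connection, authenticate ...
capture.setReady();    // flushes buffered chunks in order, then streams live
// ... user stops speaking ...
await capture.stop();  // forces a final flush (plus a 2s wait on Safari)
```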
+ //#region src/websocket.ts
+ const MESSAGE_TYPE_AUTH = "auth";
+ const MESSAGE_TYPE_READY = "ready";
+ const MESSAGE_TYPE_TRANSCRIPTION = "transcription";
+ const MESSAGE_TYPE_REQUEST_TRANSCRIPT = "request_transcript";
+ const MESSAGE_TYPE_TRANSCRIPT = "transcript";
+ const MESSAGE_TYPE_EDIT_TEXT = "edit_text";
+ const MESSAGE_TYPE_EDITED_TEXT = "edited_text";
+ const MESSAGE_TYPE_EXECUTE_COMMAND = "execute_command";
+ const MESSAGE_TYPE_COMMAND_RESULT = "command_result";
+ const MESSAGE_TYPE_ERROR = "error";
  /**
- * Delete a single transcript by ID
+ * Response timeout in milliseconds.
  */
- function deleteTranscript(id) {
- const entries = getTranscripts().filter((e) => e.id !== id);
- try {
- localStorage.setItem(STORAGE_KEY, JSON.stringify(entries));
- } catch {}
- }
- const transcriptStore = {
- getTranscripts,
- saveTranscript,
- clearTranscripts,
- deleteTranscript
+ const RESPONSE_TIMEOUT_MS = 15e3;
+ /**
+ * A deferred promise with timeout support.
+ */
+ var Deferred$1 = class {
+ promise;
+ _resolve;
+ _reject;
+ _timeoutId = null;
+ _settled = false;
+ constructor() {
+ this.promise = new Promise((resolve, reject) => {
+ this._resolve = resolve;
+ this._reject = reject;
+ });
+ }
+ setTimeout(ms, errorMessage, errorCode, errorSource) {
+ this._timeoutId = setTimeout(() => {
+ if (!this._settled) {
+ console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
+ events.emit("error", {
+ code: errorCode,
+ message: errorMessage,
+ source: errorSource
+ });
+ this.reject(new Error(errorMessage));
+ }
+ }, ms);
+ }
+ resolve(value) {
+ if (!this._settled) {
+ this._settled = true;
+ this.clearTimeout();
+ this._resolve(value);
+ }
+ }
+ reject(error) {
+ if (!this._settled) {
+ this._settled = true;
+ this.clearTimeout();
+ this._reject(error);
+ }
+ }
+ clearTimeout() {
+ if (this._timeoutId !== null) {
+ clearTimeout(this._timeoutId);
+ this._timeoutId = null;
+ }
+ }
+ get isSettled() {
+ return this._settled;
+ }
  };
+ /**
+ * Maximum time to wait for WebSocket buffer to drain.
+ */
+ const BUFFER_DRAIN_TIMEOUT_MS = 5e3;
+ /**
+ * Polling interval for checking WebSocket buffer.
+ */
+ const BUFFER_CHECK_INTERVAL_MS = 50;
+ /**
+ * WebSocket connection manager for voice sessions.
+ */
+ var WebSocketManager = class {
+ ws = null;
+ audioCapture = null;
+ sessionId = null;
+ pendingAuth = null;
+ pendingTranscript = null;
+ pendingEditText = null;
+ pendingCommand = null;
+ pendingAudioSends = /* @__PURE__ */ new Set();
+ editOriginalText = null;
+ lastInputText = void 0;
+ sessionAction = "dictate";
+ sessionInputText = "";
+ sessionCommands = [];
+ sessionSettings = {};
+ /**
+ * Get the WebSocket URL for voice sessions.
+ */
+ getWebSocketUrl() {
+ const config = getConfig();
+ const host = config.host || "https://app.speechos.ai";
+ const wsUrl = host.replace(/^http/, "ws");
+ return `${wsUrl}/ws/voice/`;
+ }
+ /**
+ * Start a voice session with the WebSocket backend.
+ *
+ * This method:
+ * 1. Starts audio capture immediately (buffering)
+ * 2. Opens WebSocket connection
+ * 3. Authenticates with API key and action parameters
+ * 4. Flushes buffered audio and continues streaming
+ *
+ * @param options - Session options including action type and parameters
+ */
+ async startVoiceSession(options) {
+ const config = getConfig();
+ this.sessionAction = options?.action || "dictate";
+ this.sessionInputText = options?.inputText || "";
+ this.sessionCommands = options?.commands || [];
+ this.sessionSettings = options?.settings || {};
+ if (this.sessionAction === "edit") this.editOriginalText = this.sessionInputText;
+ if (config.debug) console.log("[SpeechOS] Starting WebSocket voice session...");
+ this.audioCapture = createAudioCapture((chunk) => {
+ this.sendAudioChunk(chunk);
+ }, this.sessionSettings.audioDeviceId);
+ await this.audioCapture.start();
+ if (options?.onMicReady) options.onMicReady();
+ state.setMicEnabled(true);
+ const wsUrl = this.getWebSocketUrl();
+ if (config.debug) console.log("[SpeechOS] Connecting to WebSocket:", wsUrl);
+ this.ws = new WebSocket(wsUrl);
+ this.ws.onopen = () => {
+ if (config.debug) console.log("[SpeechOS] WebSocket connected, authenticating...");
+ this.authenticate();
+ };
+ this.ws.onmessage = (event) => {
+ this.handleMessage(event.data);
+ };
+ this.ws.onerror = (event) => {
+ console.error("[SpeechOS] WebSocket error:", event);
+ events.emit("error", {
+ code: "websocket_error",
+ message: "WebSocket connection error",
+ source: "connection"
+ });
+ };
+ this.ws.onclose = (event) => {
+ if (config.debug) console.log("[SpeechOS] WebSocket closed:", event.code, event.reason);
+ state.setConnected(false);
+ };
+ this.pendingAuth = new Deferred$1();
+ this.pendingAuth.setTimeout(RESPONSE_TIMEOUT_MS, "Connection timed out", "connection_timeout", "connection");
+ await this.pendingAuth.promise;
+ this.pendingAuth = null;
+ if (this.audioCapture) this.audioCapture.setReady();
+ state.setConnected(true);
+ if (config.debug) console.log("[SpeechOS] WebSocket voice session ready");
+ }
+ /**
+ * Send authentication message with action parameters.
+ * All session parameters are now sent upfront in the auth message.
+ */
+ authenticate() {
+ const config = getConfig();
+ const audioFormat = getSupportedAudioFormat();
+ const settings = this.sessionSettings;
+ const anonymousId = getAnonymousId();
+ const authMessage = {
+ type: MESSAGE_TYPE_AUTH,
+ api_key: config.apiKey,
+ user_id: config.userId || null,
+ anonymous_id: anonymousId,
+ input_language: settings.inputLanguageCode ?? "en-US",
+ output_language: settings.outputLanguageCode ?? "en-US",
+ smart_format: settings.smartFormat ?? true,
+ custom_vocabulary: settings.vocabulary ?? [],
+ custom_snippets: settings.snippets ?? [],
+ audio_format: audioFormat.format,
+ action: this.sessionAction,
+ input_text: this.sessionInputText,
+ commands: this.sessionCommands
+ };
+ if (config.debug) console.log("[SpeechOS] Sending auth message with action:", this.sessionAction);
+ this.ws?.send(JSON.stringify(authMessage));
+ }
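All session parameters now travel in the single `auth` frame, which is why the later `requestEditText`/`requestCommand` messages can be bare `{ type }` envelopes. Reconstructed from the `authMessage` object above, a default dictation session's first frame would serialize roughly as follows (placeholder values, not real credentials):

```js
// First frame after the socket opens; values shown are the documented defaults.
const authMessage = {
  type: "auth",
  api_key: "<your api key>",                    // placeholder
  user_id: null,
  anonymous_id: "<uuid from getAnonymousId()>", // placeholder
  input_language: "en-US",
  output_language: "en-US",
  smart_format: true,
  custom_vocabulary: [],
  custom_snippets: [],
  audio_format: "webm", // "mp4" on Safari
  action: "dictate",
  input_text: "",
  commands: []
};
```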
1571
+ /**
1572
+ * Send an audio chunk over the WebSocket.
1573
+ * Tracks the promise so we can wait for all sends to complete.
1574
+ */
1575
+ sendAudioChunk(chunk) {
1576
+ const sendPromise = this.doSendAudioChunk(chunk);
1577
+ this.pendingAudioSends.add(sendPromise);
1578
+ sendPromise.finally(() => {
1579
+ this.pendingAudioSends.delete(sendPromise);
1580
+ });
1581
+ }
1582
+ /**
1583
+ * Actually send the audio chunk (async operation).
1584
+ */
1585
+ async doSendAudioChunk(chunk) {
1586
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
1587
+ const arrayBuffer = await chunk.arrayBuffer();
1588
+ this.ws.send(arrayBuffer);
1589
+ }
1590
+ }
1591
+ /**
1592
+ * Handle incoming WebSocket messages.
1593
+ */
1594
+ handleMessage(data) {
1595
+ const config = getConfig();
1596
+ try {
1597
+ const message = JSON.parse(data);
1598
+ if (config.debug) console.log("[SpeechOS] WebSocket message:", message);
1599
+ switch (message.type) {
1600
+ case MESSAGE_TYPE_READY:
1601
+ this.handleReady(message);
1602
+ break;
1603
+ case MESSAGE_TYPE_TRANSCRIPTION:
1604
+ this.handleIntermediateTranscription(message);
1605
+ break;
1606
+ case MESSAGE_TYPE_TRANSCRIPT:
1607
+ this.handleFinalTranscript(message);
1608
+ break;
1609
+ case MESSAGE_TYPE_EDITED_TEXT:
1610
+ this.handleEditedText(message);
1611
+ break;
1612
+ case MESSAGE_TYPE_COMMAND_RESULT:
1613
+ this.handleCommandResult(message);
1614
+ break;
1615
+ case MESSAGE_TYPE_ERROR:
1616
+ this.handleError(message);
1617
+ break;
1618
+ default: if (config.debug) console.log("[SpeechOS] Unknown message type:", message.type);
1619
+ }
1620
+ } catch (error) {
1621
+ console.error("[SpeechOS] Failed to parse message:", error);
1622
+ }
1623
+ }
1624
+ handleReady(message) {
1625
+ const config = getConfig();
1626
+ this.sessionId = message.session_id;
1627
+ if (config.debug) console.log("[SpeechOS] Session ready:", this.sessionId);
1628
+ if (this.pendingAuth) this.pendingAuth.resolve();
1629
+ }
1630
+ handleIntermediateTranscription(message) {
1631
+ const config = getConfig();
1632
+ if (config.debug) console.log("[SpeechOS] Intermediate transcription:", message.transcript, "final:", message.is_final);
1633
+ }
1634
+ handleFinalTranscript(message) {
1635
+ const transcript = message.transcript || "";
1636
+ events.emit("transcription:complete", { text: transcript });
1637
+ if (this.pendingTranscript) {
1638
+ this.pendingTranscript.resolve(transcript);
1639
+ this.pendingTranscript = null;
1640
+ }
1641
+ }
1642
+ handleEditedText(message) {
1643
+ const editedText = message.text || "";
1644
+ events.emit("edit:complete", {
1645
+ text: editedText,
1646
+ originalText: this.editOriginalText || ""
1647
+ });
1648
+ if (this.pendingEditText) {
1649
+ this.pendingEditText.resolve(editedText);
1650
+ this.pendingEditText = null;
1651
+ }
1652
+ this.editOriginalText = null;
1653
+ }
1654
+ handleCommandResult(message) {
1655
+ const commandResult = message.command || null;
1656
+ this.lastInputText = message.transcript;
1657
+ events.emit("command:complete", { command: commandResult });
1658
+ if (this.pendingCommand) {
1659
+ this.pendingCommand.resolve(commandResult);
1660
+ this.pendingCommand = null;
1661
+ }
1662
+ }
1663
+ handleError(message) {
1664
+ const errorCode = message.code || "server_error";
1665
+ const errorMessage = message.message || "A server error occurred";
1666
+ console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
1667
+ events.emit("error", {
1668
+ code: errorCode,
1669
+ message: errorMessage,
1670
+ source: "server"
1671
+ });
1672
+ const error = new Error(errorMessage);
1673
+ if (this.pendingAuth) {
1674
+ this.pendingAuth.reject(error);
1675
+ this.pendingAuth = null;
1676
+ }
1677
+ if (this.pendingTranscript) {
1678
+ this.pendingTranscript.reject(error);
1679
+ this.pendingTranscript = null;
1680
+ }
1681
+ if (this.pendingEditText) {
1682
+ this.pendingEditText.reject(error);
1683
+ this.pendingEditText = null;
1684
+ }
1685
+ if (this.pendingCommand) {
1686
+ this.pendingCommand.reject(error);
1687
+ this.pendingCommand = null;
1688
+ }
1689
+ }
1690
+ /**
1691
+ * Stop the voice session and request the transcript.
1692
+ */
1693
+ async stopVoiceSession() {
1694
+ const config = getConfig();
1695
+ if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
1696
+ await this.stopAudioCapture();
1697
+ this.pendingTranscript = new Deferred$1();
1698
+ this.pendingTranscript.setTimeout(RESPONSE_TIMEOUT_MS, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
1699
+ this.sendMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT });
1700
+ const result = await this.pendingTranscript.promise;
1701
+ this.pendingTranscript = null;
1702
+ return result;
1703
+ }
1704
+ /**
1705
+ * Request text editing using the transcript as instructions.
1706
+ * Note: The input text was already sent in the auth message via startVoiceSession.
1707
+ */
1708
+ async requestEditText(_originalText) {
1709
+ const config = getConfig();
1710
+ if (config.debug) console.log("[SpeechOS] Requesting text edit...");
1711
+ await this.stopAudioCapture();
1712
+ this.pendingEditText = new Deferred$1();
1713
+ this.pendingEditText.setTimeout(RESPONSE_TIMEOUT_MS, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
1714
+ this.sendMessage({ type: MESSAGE_TYPE_EDIT_TEXT });
1715
+ const result = await this.pendingEditText.promise;
1716
+ this.pendingEditText = null;
1717
+ return result;
1718
+ }
1719
+ /**
1720
+ * Request command matching using the transcript as input.
1721
+ * Note: The command definitions were already sent in the auth message via startVoiceSession.
1722
+ */
1723
+ async requestCommand(_commands) {
1724
+ const config = getConfig();
1725
+ if (config.debug) console.log("[SpeechOS] Requesting command match...");
1726
+ await this.stopAudioCapture();
1727
+ this.pendingCommand = new Deferred$1();
1728
+ this.pendingCommand.setTimeout(RESPONSE_TIMEOUT_MS, "Command request timed out. Please try again.", "command_timeout", "timeout");
1729
+ this.sendMessage({ type: MESSAGE_TYPE_EXECUTE_COMMAND });
1730
+ const result = await this.pendingCommand.promise;
1731
+ this.pendingCommand = null;
1732
+ return result;
1733
+ }
1734
+ /**
1735
+ * Stop audio capture and wait for all data to be sent.
1736
+ *
1737
+ * Waits for:
1738
+ * 1. All pending sendAudioChunk calls to complete (arrayBuffer conversion)
1739
+ * 2. WebSocket buffer to drain (all data transmitted)
1740
+ *
1741
+ * WebSocket message ordering ensures server receives all audio before transcript request.
1742
+ */
1743
+ async stopAudioCapture() {
1744
+ const config = getConfig();
1745
+ const startTime = Date.now();
1746
+ if (config.debug) console.log("[SpeechOS] stopAudioCapture: starting...");
1747
+ if (this.audioCapture) {
1748
+ await this.audioCapture.stop();
1749
+ this.audioCapture = null;
1750
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: recorder stopped after ${Date.now() - startTime}ms`);
1751
+ }
1752
+ state.setMicEnabled(false);
1753
+ if (this.pendingAudioSends.size > 0) {
1754
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: waiting for ${this.pendingAudioSends.size} pending audio sends...`);
1755
+ await Promise.all(this.pendingAudioSends);
1756
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: all sends complete after ${Date.now() - startTime}ms`);
1757
+ } else if (config.debug) console.log("[SpeechOS] stopAudioCapture: no pending sends");
1758
+ await this.waitForBufferDrain();
1759
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: complete after ${Date.now() - startTime}ms`);
1760
+ }
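// Editorial sketch (assumption: sendAudioChunk is not shown in this hunk):
// pendingAudioSends is presumably a Set of in-flight promises, populated per
// recorder chunk roughly like this, so stopAudioCapture can await them all:
async function sendAudioChunkSketch(manager, blob) {
  const send = blob.arrayBuffer().then((buf) => {
    if (manager.ws && manager.ws.readyState === WebSocket.OPEN) manager.ws.send(buf);
  });
  manager.pendingAudioSends.add(send);
  try {
    await send;
  } finally {
    // Drop the entry so stopAudioCapture only waits on truly pending sends.
    manager.pendingAudioSends.delete(send);
  }
}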
1761
+ /**
1762
+ * Wait for the WebSocket send buffer to drain.
1763
+ *
1764
+ * This ensures all audio data has been transmitted before we request
1765
+ * the transcript. Uses the same pattern as LiveKit's ReadableStream approach.
1766
+ */
1767
+ async waitForBufferDrain() {
1768
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
1769
+ const config = getConfig();
1770
+ const startTime = Date.now();
1771
+ while (this.ws.bufferedAmount > 0) {
1772
+ if (Date.now() - startTime > BUFFER_DRAIN_TIMEOUT_MS) {
1773
+ console.warn(`[SpeechOS] Buffer drain timeout, ${this.ws.bufferedAmount} bytes still pending`);
1774
+ break;
1775
+ }
1776
+ await new Promise((resolve) => setTimeout(resolve, BUFFER_CHECK_INTERVAL_MS));
1777
+ }
1778
+ if (config.debug) console.log(`[SpeechOS] Buffer drained in ${Date.now() - startTime}ms`);
1779
+ }
1780
+ /**
1781
+ * Send a JSON message over the WebSocket.
1782
+ */
1783
+ sendMessage(message) {
1784
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify(message));
1785
+ }
1786
+ /**
1787
+ * Disconnect from the WebSocket.
1788
+ */
1789
+ async disconnect() {
1790
+ const config = getConfig();
1791
+ if (config.debug) console.log("[SpeechOS] Disconnecting WebSocket...");
1792
+ await this.stopAudioCapture();
1793
+ if (this.ws) {
1794
+ this.ws.close();
1795
+ this.ws = null;
1796
+ }
1797
+ const error = new Error("Disconnected");
1798
+ if (this.pendingAuth) {
1799
+ this.pendingAuth.reject(error);
1800
+ this.pendingAuth = null;
1801
+ }
1802
+ if (this.pendingTranscript) {
1803
+ this.pendingTranscript.reject(error);
1804
+ this.pendingTranscript = null;
1805
+ }
1806
+ if (this.pendingEditText) {
1807
+ this.pendingEditText.reject(error);
1808
+ this.pendingEditText = null;
1809
+ }
1810
+ if (this.pendingCommand) {
1811
+ this.pendingCommand.reject(error);
1812
+ this.pendingCommand = null;
1813
+ }
1814
+ this.sessionId = null;
1815
+ this.editOriginalText = null;
1816
+ this.lastInputText = void 0;
1817
+ this.sessionSettings = {};
1818
+ state.setConnected(false);
1819
+ state.setMicEnabled(false);
1820
+ if (config.debug) console.log("[SpeechOS] WebSocket disconnected");
1821
+ }
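// Editorial sketch: disconnect() rejects every pending deferred with
// Error("Disconnected"), so an in-flight await on the public API should be
// wrapped in try/catch. A hypothetical caller:
async function finishDictationSafely() {
  try {
    return await speechOS.stopDictation();
  } catch (err) {
    if (err instanceof Error && err.message === "Disconnected") return null;
    throw err;
  }
}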
1822
+ /**
1823
+ * Check if connected to WebSocket.
1824
+ */
1825
+ isConnected() {
1826
+ return this.ws !== null && this.ws.readyState === WebSocket.OPEN;
1827
+ }
1828
+ /**
1829
+ * Get the last input text from a command result.
1830
+ * This is the raw transcript of what the user said.
1831
+ */
1832
+ getLastInputText() {
1833
+ return this.lastInputText;
1834
+ }
1835
+ };
1836
+ const websocket = new WebSocketManager();
849
1837
 
850
1838
  //#endregion
851
1839
  //#region src/speechos.ts
852
1840
  /**
1841
+ * Get the active voice backend (always the WebSocket backend now).
1842
+ */
1843
+ function getBackend$1() {
1844
+ return websocket;
1845
+ }
1846
+ /**
853
1847
  * SpeechOS Core SDK
854
1848
  *
855
1849
  * Provides two API layers:
@@ -868,7 +1862,6 @@ var SpeechOSCore = class {
868
1862
  const currentConfig$1 = getConfig();
869
1863
  if (currentConfig$1.debug) console.log("[SpeechOS] Initialized with config:", {
870
1864
  host: currentConfig$1.host,
871
- position: currentConfig$1.position,
872
1865
  debug: currentConfig$1.debug
873
1866
  });
874
1867
  }
@@ -908,7 +1901,6 @@ var SpeechOSCore = class {
908
1901
  state.setRecordingState("processing");
909
1902
  try {
910
1903
  const transcript = await livekit.stopAndGetTranscript();
911
- transcriptStore.saveTranscript(transcript, "dictate");
912
1904
  state.completeRecording();
913
1905
  return transcript;
914
1906
  } catch (error) {
@@ -925,7 +1917,6 @@ var SpeechOSCore = class {
925
1917
  state.setRecordingState("processing");
926
1918
  try {
927
1919
  const editedText = await livekit.stopAndEdit(originalText);
928
- transcriptStore.saveTranscript(editedText, "edit", originalText);
929
1920
  state.completeRecording();
930
1921
  return editedText;
931
1922
  } catch (error) {
@@ -951,8 +1942,13 @@ var SpeechOSCore = class {
951
1942
  state.setActiveAction("dictate");
952
1943
  state.startRecording();
953
1944
  try {
954
- await livekit.startVoiceSession();
955
- state.setRecordingState("recording");
1945
+ const backend = getBackend$1();
1946
+ await backend.startVoiceSession({
1947
+ action: "dictate",
1948
+ onMicReady: () => {
1949
+ state.setRecordingState("recording");
1950
+ }
1951
+ });
956
1952
  return new Promise((resolve, reject) => {
957
1953
  this._dictateResolve = resolve;
958
1954
  this._dictateReject = reject;
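// Editorial sketch of the one-shot dictation flow. The starting method name
// (speechOS.dictate) and the element ids are assumptions for illustration;
// the returned promise settles only when stopDictation() delivers the
// transcript, and onMicReady flips the state to "recording".
const micButton = document.querySelector("#mic");
const output = document.querySelector("#output");
micButton.addEventListener("mousedown", async () => {
  output.value = await speechOS.dictate();
});
micButton.addEventListener("mouseup", () => speechOS.stopDictation());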
@@ -972,8 +1968,8 @@ var SpeechOSCore = class {
972
1968
  async stopDictation() {
973
1969
  state.setRecordingState("processing");
974
1970
  try {
975
- const transcript = await livekit.stopVoiceSession();
976
- transcriptStore.saveTranscript(transcript, "dictate");
1971
+ const backend = getBackend$1();
1972
+ const transcript = await backend.stopVoiceSession();
977
1973
  state.completeRecording();
978
1974
  if (this._dictateResolve) {
979
1975
  this._dictateResolve(transcript);
@@ -1007,8 +2003,14 @@ var SpeechOSCore = class {
1007
2003
  state.startRecording();
1008
2004
  this._editOriginalText = originalText;
1009
2005
  try {
1010
- await livekit.startVoiceSession();
1011
- state.setRecordingState("recording");
2006
+ const backend = getBackend$1();
2007
+ await backend.startVoiceSession({
2008
+ action: "edit",
2009
+ inputText: originalText,
2010
+ onMicReady: () => {
2011
+ state.setRecordingState("recording");
2012
+ }
2013
+ });
1012
2014
  return new Promise((resolve, reject) => {
1013
2015
  this._editResolve = resolve;
1014
2016
  this._editReject = reject;
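// Editorial sketch of the edit flow. The method name speechOS.edit and the
// element ids are assumptions; the text to rewrite travels with the initial
// auth message (see the startVoiceSession call above), so stopping only has
// to request the rewritten result.
const editor = document.querySelector("#editor");
const startEditBtn = document.querySelector("#start-edit");
const stopEditBtn = document.querySelector("#stop-edit");
startEditBtn.addEventListener("click", async () => {
  editor.value = await speechOS.edit(editor.value);
});
stopEditBtn.addEventListener("click", () => speechOS.stopEdit());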
@@ -1029,9 +2031,9 @@ var SpeechOSCore = class {
1029
2031
  async stopEdit() {
1030
2032
  state.setRecordingState("processing");
1031
2033
  try {
2034
+ const backend = getBackend$1();
1032
2035
  const originalText = this._editOriginalText || "";
1033
- const editedText = await livekit.requestEditText(originalText);
1034
- transcriptStore.saveTranscript(editedText, "edit", originalText);
2036
+ const editedText = await backend.requestEditText(originalText);
1035
2037
  state.completeRecording();
1036
2038
  if (this._editResolve) {
1037
2039
  this._editResolve(editedText);
@@ -1054,6 +2056,71 @@ var SpeechOSCore = class {
1054
2056
  }
1055
2057
  }
1056
2058
  /**
2059
+ * One-shot command: connect, wait for agent, record voice, match against commands
2060
+ * Automatically handles the full voice session lifecycle
2061
+ *
2062
+ * @param commands - Array of command definitions to match against
2063
+ * @returns The matched command result or null if no match
2064
+ */
2065
+ async command(commands) {
2066
+ this.ensureInitialized();
2067
+ state.setActiveAction("command");
2068
+ state.startRecording();
2069
+ this._commandCommands = commands;
2070
+ try {
2071
+ const backend = getBackend$1();
2072
+ await backend.startVoiceSession({
2073
+ action: "command",
2074
+ commands,
2075
+ onMicReady: () => {
2076
+ state.setRecordingState("recording");
2077
+ }
2078
+ });
2079
+ return new Promise((resolve, reject) => {
2080
+ this._commandResolve = resolve;
2081
+ this._commandReject = reject;
2082
+ });
2083
+ } catch (error) {
2084
+ state.setError(error instanceof Error ? error.message : "Failed to start command");
2085
+ await this.cleanup();
2086
+ throw error;
2087
+ }
2088
+ }
2089
+ _commandCommands;
2090
+ _commandResolve;
2091
+ _commandReject;
2092
+ /**
2093
+ * Stop command recording and get the matched command.
2094
+ * Call this after command() when the user stops speaking.
2095
+ */
2096
+ async stopCommand() {
2097
+ state.setRecordingState("processing");
2098
+ try {
2099
+ const backend = getBackend$1();
2100
+ const commands = this._commandCommands || [];
2101
+ const result = await backend.requestCommand(commands);
2102
+ state.completeRecording();
2103
+ if (this._commandResolve) {
2104
+ this._commandResolve(result);
2105
+ this._commandResolve = void 0;
2106
+ this._commandReject = void 0;
2107
+ }
2108
+ return result;
2109
+ } catch (error) {
2110
+ const err = error instanceof Error ? error : new Error("Command request failed");
2111
+ state.setError(err.message);
2112
+ if (this._commandReject) {
2113
+ this._commandReject(err);
2114
+ this._commandResolve = void 0;
2115
+ this._commandReject = void 0;
2116
+ }
2117
+ throw err;
2118
+ } finally {
2119
+ this._commandCommands = void 0;
2120
+ await this.cleanup();
2121
+ }
2122
+ }
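// Editorial sketch of the one-shot command flow. The { name, description }
// shape of each definition is a guess for illustration; the real command
// schema is not visible in this built output.
async function listenForCommand() {
  const result = await speechOS.command([
    { name: "save", description: "Save the current document" },
    { name: "undo", description: "Undo the last change" }
  ]);
  // result is the matched command, or null if nothing matched.
  if (result) console.log("Matched:", result);
}
// A separate UI event ends the recording and settles the promise above:
// speechOS.stopCommand();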
2123
+ /**
1057
2124
  * Cancel the current operation
1058
2125
  */
1059
2126
  async cancel() {
@@ -1068,7 +2135,13 @@ var SpeechOSCore = class {
1068
2135
  this._editResolve = void 0;
1069
2136
  this._editReject = void 0;
1070
2137
  }
2138
+ if (this._commandReject) {
2139
+ this._commandReject(err);
2140
+ this._commandResolve = void 0;
2141
+ this._commandReject = void 0;
2142
+ }
1071
2143
  this._editOriginalText = void 0;
2144
+ this._commandCommands = void 0;
1072
2145
  await this.cleanup();
1073
2146
  state.cancelRecording();
1074
2147
  }
@@ -1095,7 +2168,8 @@ var SpeechOSCore = class {
1095
2168
  }
1096
2169
  async cleanup() {
1097
2170
  try {
1098
- await livekit.disconnect();
2171
+ const backend = getBackend$1();
2172
+ await backend.disconnect();
1099
2173
  } catch (error) {
1100
2174
  const config = getConfig();
1101
2175
  if (config.debug) console.warn("[SpeechOS] Cleanup disconnect error:", error);
@@ -1111,6 +2185,9 @@ var SpeechOSCore = class {
1111
2185
  this._editResolve = void 0;
1112
2186
  this._editReject = void 0;
1113
2187
  this._editOriginalText = void 0;
2188
+ this._commandResolve = void 0;
2189
+ this._commandReject = void 0;
2190
+ this._commandCommands = void 0;
1114
2191
  resetConfig();
1115
2192
  state.reset();
1116
2193
  events.clear();
@@ -1118,10 +2195,38 @@ var SpeechOSCore = class {
1118
2195
  };
1119
2196
  const speechOS = new SpeechOSCore();
1120
2197
 
2198
+ //#endregion
2199
+ //#region src/backend.ts
2200
+ /**
2201
+ * WebSocket backend adapter - wraps the websocket module to match the VoiceBackend interface
2202
+ */
2203
+ const websocketBackend = {
2204
+ startVoiceSession: (options) => websocket.startVoiceSession(options),
2205
+ stopVoiceSession: () => websocket.stopVoiceSession(),
2206
+ requestEditText: (text) => websocket.requestEditText(text),
2207
+ requestCommand: (commands) => websocket.requestCommand(commands),
2208
+ disconnect: () => websocket.disconnect(),
2209
+ isConnected: () => websocket.isConnected(),
2210
+ getLastInputText: () => websocket.getLastInputText(),
2211
+ prefetchToken: () => Promise.resolve({}),
2212
+ startAutoRefresh: () => {},
2213
+ stopAutoRefresh: () => {},
2214
+ invalidateTokenCache: () => {}
2215
+ };
2216
+ /**
2217
+ * Get the active voice backend.
2218
+ * Always returns the WebSocket backend (LiveKit is legacy).
2219
+ *
2220
+ * @returns The websocket backend
2221
+ */
2222
+ function getBackend() {
2223
+ return websocketBackend;
2224
+ }
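// Editorial sketch: the VoiceBackend interface implied by the adapter above,
// written as a JSDoc typedef. The real src/backend.ts types are not included
// in this built output, so these signatures are inferred, not authoritative.
/**
 * @typedef {Object} VoiceBackend
 * @property {(options: Object) => Promise<void>} startVoiceSession
 * @property {() => Promise<string>} stopVoiceSession
 * @property {(text: string) => Promise<string>} requestEditText
 * @property {(commands: Array) => Promise<Object|null>} requestCommand
 * @property {() => Promise<void>} disconnect
 * @property {() => boolean} isConnected
 * @property {() => (string|undefined)} getLastInputText
 * @property {() => Promise<Object>} prefetchToken
 * @property {() => void} startAutoRefresh
 * @property {() => void} stopAutoRefresh
 * @property {() => void} invalidateTokenCache
 */
// The token-refresh members are no-ops here; presumably they exist so the
// WebSocket backend can satisfy the same interface as the legacy LiveKit one.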
2225
+
1121
2226
  //#endregion
1122
2227
  //#region src/index.ts
1123
2228
  const VERSION = "0.1.0";
1124
2229
 
1125
2230
  //#endregion
1126
- export { DEFAULT_HOST, Deferred, SpeechOSEventEmitter, VERSION, createStateManager, defaultConfig, events, getConfig, livekit, resetConfig, setConfig, speechOS, state, transcriptStore, updateUserId, validateConfig };
2231
+ export { DEFAULT_HOST, Deferred, SpeechOSEventEmitter, VERSION, createStateManager, events, getBackend, getConfig, livekit, resetConfig, setConfig, speechOS, state, updateUserId, validateConfig, websocket };
1127
2232
  //# sourceMappingURL=index.js.map