@speechos/core 0.2.0 → 0.2.2

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
package/dist/index.js CHANGED
@@ -12,8 +12,6 @@ const defaultConfig = {
12
12
  apiKey: "",
13
13
  userId: "",
14
14
  host: DEFAULT_HOST,
15
- position: "bottom-center",
16
- zIndex: 999999,
17
15
  debug: false
18
16
  };
19
17
  /**
@@ -21,31 +19,19 @@ const defaultConfig = {
21
19
  * @param userConfig - User-provided configuration
22
20
  * @returns Validated and merged configuration
23
21
  */
24
- function validateConfig(userConfig = {}) {
22
+ function validateConfig(userConfig) {
25
23
  if (!userConfig.apiKey) throw new Error("SpeechOS requires an apiKey. Get one from your team dashboard at /a/<team-slug>/.");
26
- const config = {
27
- ...defaultConfig,
28
- ...userConfig
24
+ return {
25
+ apiKey: userConfig.apiKey,
26
+ userId: userConfig.userId ?? defaultConfig.userId,
27
+ host: userConfig.host ?? defaultConfig.host,
28
+ debug: userConfig.debug ?? defaultConfig.debug
29
29
  };
30
- const validPositions = [
31
- "bottom-center",
32
- "bottom-right",
33
- "bottom-left"
34
- ];
35
- if (!validPositions.includes(config.position)) {
36
- console.warn(`Invalid position "${config.position}". Using default "bottom-center".`);
37
- config.position = "bottom-center";
38
- }
39
- if (typeof config.zIndex !== "number" || config.zIndex < 0) {
40
- console.warn(`Invalid zIndex "${config.zIndex}". Using default ${defaultConfig.zIndex}.`);
41
- config.zIndex = defaultConfig.zIndex;
42
- }
43
- return config;
44
30
  }
45
31
  /**
46
32
  * Current active configuration (singleton)
47
33
  */
48
- let currentConfig = defaultConfig;
34
+ let currentConfig = { ...defaultConfig };
49
35
  /**
50
36
  * Get the current configuration
51
37
  */
@@ -168,33 +154,38 @@ const initialState = {
168
154
  var StateManager = class {
169
155
  state;
170
156
  subscribers = /* @__PURE__ */ new Set();
157
+ /** Cached immutable snapshot for useSyncExternalStore compatibility */
158
+ snapshot;
171
159
  constructor(initialState$1) {
172
160
  this.state = { ...initialState$1 };
161
+ this.snapshot = Object.freeze({ ...this.state });
173
162
  }
174
163
  /**
175
- * Get the current state (returns a copy to prevent mutations)
164
+ * Get the current state snapshot (returns a stable reference for React)
165
+ * This returns an immutable frozen object that only changes when setState is called.
176
166
  */
177
167
  getState() {
178
- return { ...this.state };
168
+ return this.snapshot;
179
169
  }
180
170
  /**
181
171
  * Update state with partial values
182
172
  * @param partial - Partial state to merge with current state
183
173
  */
184
174
  setState(partial) {
185
- const prevState = { ...this.state };
175
+ const prevState = this.snapshot;
186
176
  this.state = {
187
177
  ...this.state,
188
178
  ...partial
189
179
  };
180
+ this.snapshot = Object.freeze({ ...this.state });
190
181
  this.subscribers.forEach((callback) => {
191
182
  try {
192
- callback(this.state, prevState);
183
+ callback(this.snapshot, prevState);
193
184
  } catch (error) {
194
185
  console.error("Error in state change callback:", error);
195
186
  }
196
187
  });
197
- events.emit("state:change", { state: this.state });
188
+ events.emit("state:change", { state: this.snapshot });
198
189
  }
199
190
  /**
200
191
  * Subscribe to state changes
@@ -211,7 +202,17 @@ var StateManager = class {
211
202
  * Reset state to initial values
212
203
  */
213
204
  reset() {
214
- this.setState(initialState);
205
+ const prevState = this.snapshot;
206
+ this.state = { ...initialState };
207
+ this.snapshot = Object.freeze({ ...this.state });
208
+ this.subscribers.forEach((callback) => {
209
+ try {
210
+ callback(this.snapshot, prevState);
211
+ } catch (error) {
212
+ console.error("Error in state change callback:", error);
213
+ }
214
+ });
215
+ events.emit("state:change", { state: this.snapshot });
215
216
  }
216
217
  /**
217
218
  * Show the widget
@@ -346,12 +347,15 @@ function createStateManager(initial) {
346
347
 
347
348
  //#endregion
348
349
  //#region src/livekit.ts
349
- const MESSAGE_TYPE_REQUEST_TRANSCRIPT = "request_transcript";
350
- const MESSAGE_TYPE_TRANSCRIPT = "transcript";
351
- const MESSAGE_TYPE_EDIT_TEXT = "edit_text";
352
- const MESSAGE_TYPE_EDITED_TEXT = "edited_text";
353
- const MESSAGE_TYPE_ERROR = "error";
350
+ const MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 = "request_transcript";
351
+ const MESSAGE_TYPE_TRANSCRIPT$1 = "transcript";
352
+ const MESSAGE_TYPE_EDIT_TEXT$1 = "edit_text";
353
+ const MESSAGE_TYPE_EDITED_TEXT$1 = "edited_text";
354
+ const MESSAGE_TYPE_EXECUTE_COMMAND$1 = "execute_command";
355
+ const MESSAGE_TYPE_COMMAND_RESULT$1 = "command_result";
356
+ const MESSAGE_TYPE_ERROR$1 = "error";
354
357
  const TOPIC_SPEECHOS = "speechos";
358
+ const TOKEN_CACHE_TTL_MS = 4 * 60 * 1e3;
355
359
  /**
356
360
  * A deferred promise with timeout support.
357
361
  * Encapsulates resolve/reject/timeout in a single object for cleaner async handling.
@@ -415,53 +419,116 @@ var LiveKitManager = class {
415
419
  room = null;
416
420
  tokenData = null;
417
421
  micTrack = null;
422
+ cachedTokenData = null;
423
+ tokenCacheTimestamp = null;
424
+ tokenPrefetchPromise = null;
425
+ tokenRefreshTimer = null;
426
+ autoRefreshEnabled = false;
418
427
  pendingTranscript = null;
419
428
  pendingEditText = null;
429
+ pendingCommand = null;
420
430
  pendingTrackSubscribed = null;
421
- preWarmPromise = null;
422
431
  editOriginalText = null;
432
+ sessionSettings = {};
423
433
  /**
424
- * Pre-warm resources for faster connection
425
- * Call this when user shows intent (e.g., expands widget)
426
- * Only fetches token - mic permission is requested when user clicks Dictate
434
+ * Check if the cached token is still valid (within TTL)
427
435
  */
428
- async preWarm() {
429
- if (this.tokenData || this.preWarmPromise || this.room?.state === "connected") {
430
- const config$1 = getConfig();
431
- if (config$1.debug) console.log("[SpeechOS] Pre-warm skipped - token already available");
432
- return;
433
- }
436
+ isCachedTokenValid() {
437
+ if (!this.cachedTokenData || !this.tokenCacheTimestamp) return false;
438
+ const age = Date.now() - this.tokenCacheTimestamp;
439
+ return age < TOKEN_CACHE_TTL_MS;
440
+ }
441
+ /**
442
+ * Pre-fetch a LiveKit token for later use
443
+ * Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
444
+ * If a prefetch is already in progress, returns the existing promise.
445
+ * If a valid cached token exists, returns it immediately.
446
+ */
447
+ async prefetchToken() {
434
448
  const config = getConfig();
435
- if (config.debug) console.log("[SpeechOS] Pre-warming: fetching token...");
436
- this.preWarmPromise = (async () => {
437
- try {
438
- await this.fetchToken();
439
- if (config.debug) console.log("[SpeechOS] Pre-warm complete - token ready");
440
- } catch (error) {
441
- if (config.debug) console.warn("[SpeechOS] Pre-warm failed:", error);
442
- this.preWarmPromise = null;
443
- }
444
- })();
445
- await this.preWarmPromise;
449
+ if (this.isCachedTokenValid() && this.cachedTokenData) {
450
+ if (config.debug) console.log("[SpeechOS] Using cached token (prefetch hit)");
451
+ return this.cachedTokenData;
452
+ }
453
+ if (this.tokenPrefetchPromise) {
454
+ if (config.debug) console.log("[SpeechOS] Prefetch already in progress, awaiting...");
455
+ return this.tokenPrefetchPromise;
456
+ }
457
+ if (config.debug) console.log("[SpeechOS] Starting token prefetch...");
458
+ this.tokenPrefetchPromise = this.fetchTokenFromServer().then((data) => {
459
+ this.cachedTokenData = data;
460
+ this.tokenCacheTimestamp = Date.now();
461
+ this.tokenPrefetchPromise = null;
462
+ return data;
463
+ }).catch((error) => {
464
+ this.tokenPrefetchPromise = null;
465
+ throw error;
466
+ });
467
+ return this.tokenPrefetchPromise;
446
468
  }
447
469
  /**
448
470
  * Fetch a LiveKit token from the backend
471
+ * Uses cached token if valid, otherwise fetches a fresh one.
472
+ * Includes language settings and user vocabulary which are stored in the VoiceSession.
449
473
  */
450
474
  async fetchToken() {
475
+ const config = getConfig();
476
+ if (this.isCachedTokenValid() && this.cachedTokenData) {
477
+ if (config.debug) console.log("[SpeechOS] Using cached token");
478
+ this.tokenData = this.cachedTokenData;
479
+ return this.cachedTokenData;
480
+ }
481
+ if (this.tokenPrefetchPromise) {
482
+ if (config.debug) console.log("[SpeechOS] Waiting for prefetch to complete...");
483
+ const data$1 = await this.tokenPrefetchPromise;
484
+ this.tokenData = data$1;
485
+ return data$1;
486
+ }
487
+ const data = await this.fetchTokenFromServer();
488
+ this.cachedTokenData = data;
489
+ this.tokenCacheTimestamp = Date.now();
490
+ this.tokenData = data;
491
+ return data;
492
+ }
493
+ /**
494
+ * Internal method to fetch a fresh token from the server
495
+ */
496
+ async fetchTokenFromServer() {
451
497
  const config = getConfig();
452
498
  const url = `${config.host}/livekit/api/token/`;
453
- if (config.debug) console.log("[SpeechOS] Fetching LiveKit token from:", url);
499
+ const settings = this.sessionSettings;
500
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
501
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
502
+ const smartFormat = settings.smartFormat ?? true;
503
+ const vocabulary = settings.vocabulary ?? [];
504
+ const snippets = settings.snippets ?? [];
505
+ if (config.debug) {
506
+ console.log("[SpeechOS] Fetching LiveKit token from:", url);
507
+ console.log("[SpeechOS] Session settings:", {
508
+ inputLanguage,
509
+ outputLanguage,
510
+ smartFormat,
511
+ snippetsCount: snippets.length,
512
+ vocabularyCount: vocabulary.length
513
+ });
514
+ }
454
515
  const response = await fetch(url, {
455
516
  method: "POST",
456
517
  headers: {
457
518
  "Content-Type": "application/json",
458
519
  ...config.apiKey ? { Authorization: `Api-Key ${config.apiKey}` } : {}
459
520
  },
460
- body: JSON.stringify({ user_id: config.userId || null })
521
+ body: JSON.stringify({
522
+ user_id: config.userId || null,
523
+ input_language: inputLanguage,
524
+ output_language: outputLanguage,
525
+ smart_format: smartFormat,
526
+ custom_vocabulary: vocabulary,
527
+ custom_snippets: snippets
528
+ })
461
529
  });
462
530
  if (!response.ok) throw new Error(`Failed to fetch LiveKit token: ${response.status} ${response.statusText}`);
463
531
  const data = await response.json();
464
- this.tokenData = data;
465
532
  if (config.debug) console.log("[SpeechOS] LiveKit token received:", {
466
533
  room: data.room,
467
534
  identity: data.identity,
@@ -474,8 +541,7 @@ var LiveKitManager = class {
474
541
  */
475
542
  async connect() {
476
543
  const config = getConfig();
477
- if (!this.tokenData) await this.fetchToken();
478
- else if (config.debug) console.log("[SpeechOS] Using pre-fetched token");
544
+ await this.fetchToken();
479
545
  if (!this.tokenData) throw new Error("No token available for LiveKit connection");
480
546
  this.room = new Room({
481
547
  adaptiveStream: true,
@@ -539,7 +605,7 @@ var LiveKitManager = class {
539
605
  try {
540
606
  const message = JSON.parse(new TextDecoder().decode(data));
541
607
  if (config.debug) console.log("[SpeechOS] Data received:", message);
542
- if (message.type === MESSAGE_TYPE_TRANSCRIPT) {
608
+ if (message.type === MESSAGE_TYPE_TRANSCRIPT$1) {
543
609
  const transcript = message.transcript || "";
544
610
  if (config.debug) console.log("[SpeechOS] Transcript received:", transcript);
545
611
  events.emit("transcription:complete", { text: transcript });
@@ -547,7 +613,7 @@ var LiveKitManager = class {
547
613
  this.pendingTranscript.resolve(transcript);
548
614
  this.pendingTranscript = null;
549
615
  }
550
- } else if (message.type === MESSAGE_TYPE_EDITED_TEXT) {
616
+ } else if (message.type === MESSAGE_TYPE_EDITED_TEXT$1) {
551
617
  const editedText = message.text || "";
552
618
  if (config.debug) console.log("[SpeechOS] Edited text received:", editedText);
553
619
  events.emit("edit:complete", {
@@ -559,7 +625,15 @@ var LiveKitManager = class {
559
625
  this.pendingEditText = null;
560
626
  }
561
627
  this.editOriginalText = null;
562
- } else if (message.type === MESSAGE_TYPE_ERROR) {
628
+ } else if (message.type === MESSAGE_TYPE_COMMAND_RESULT$1) {
629
+ const commandResult = message.command || null;
630
+ if (config.debug) console.log("[SpeechOS] Command result received:", commandResult);
631
+ events.emit("command:complete", { command: commandResult });
632
+ if (this.pendingCommand) {
633
+ this.pendingCommand.resolve(commandResult);
634
+ this.pendingCommand = null;
635
+ }
636
+ } else if (message.type === MESSAGE_TYPE_ERROR$1) {
563
637
  const serverError = message;
564
638
  const errorCode = serverError.code || "server_error";
565
639
  const errorMessage = serverError.message || "A server error occurred";
@@ -579,6 +653,10 @@ var LiveKitManager = class {
579
653
  this.pendingEditText.reject(error);
580
654
  this.pendingEditText = null;
581
655
  }
656
+ if (this.pendingCommand) {
657
+ this.pendingCommand.reject(error);
658
+ this.pendingCommand = null;
659
+ }
582
660
  }
583
661
  } catch (error) {
584
662
  console.error("[SpeechOS] Failed to parse data message:", error);
@@ -586,16 +664,34 @@ var LiveKitManager = class {
586
664
  }
587
665
  /**
588
666
  * Publish microphone audio track
667
+ * Uses the device ID from session settings if set
589
668
  */
590
669
  async enableMicrophone() {
591
670
  if (!this.room || this.room.state !== "connected") throw new Error("Not connected to room");
592
671
  const config = getConfig();
593
672
  if (!this.micTrack) {
594
673
  if (config.debug) console.log("[SpeechOS] Creating microphone track...");
595
- this.micTrack = await createLocalAudioTrack({
674
+ const deviceId = this.sessionSettings.audioDeviceId;
675
+ const trackOptions = {
596
676
  echoCancellation: true,
597
677
  noiseSuppression: true
598
- });
678
+ };
679
+ if (deviceId) {
680
+ trackOptions.deviceId = { exact: deviceId };
681
+ if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
682
+ }
683
+ try {
684
+ this.micTrack = await createLocalAudioTrack(trackOptions);
685
+ } catch (error) {
686
+ if (deviceId && error instanceof Error) {
687
+ console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
688
+ this.micTrack = await createLocalAudioTrack({
689
+ echoCancellation: true,
690
+ noiseSuppression: true
691
+ });
692
+ } else throw error;
693
+ }
694
+ this.logMicrophoneInfo();
599
695
  }
600
696
  const existingPub = this.room.localParticipant.getTrackPublication(Track.Source.Microphone);
601
697
  if (!existingPub) {
@@ -605,6 +701,24 @@ var LiveKitManager = class {
605
701
  }
606
702
  }
607
703
  /**
704
+ * Log information about the current microphone track
705
+ */
706
+ logMicrophoneInfo() {
707
+ if (!this.micTrack) return;
708
+ const config = getConfig();
709
+ const mediaTrack = this.micTrack.mediaStreamTrack;
710
+ const settings = mediaTrack.getSettings();
711
+ console.log("[SpeechOS] Microphone active:", {
712
+ deviceId: settings.deviceId || "unknown",
713
+ label: mediaTrack.label || "Unknown device",
714
+ sampleRate: settings.sampleRate,
715
+ channelCount: settings.channelCount,
716
+ echoCancellation: settings.echoCancellation,
717
+ noiseSuppression: settings.noiseSuppression
718
+ });
719
+ if (config.debug) console.log("[SpeechOS] Full audio track settings:", settings);
720
+ }
721
+ /**
608
722
  * Disable microphone audio track
609
723
  */
610
724
  async disableMicrophone() {
@@ -636,30 +750,85 @@ var LiveKitManager = class {
636
750
  });
637
751
  }
638
752
  /**
639
- * Start a voice session
640
- * Connects to room, enables microphone, and waits for agent to subscribe to our track
753
+ * Start a voice session with pre-connect audio buffering
754
+ * Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
755
+ * Agent subscription happens in the background - we don't block on it.
756
+ *
757
+ * @param options - Session options including action type and parameters
641
758
  */
642
- async startVoiceSession() {
759
+ async startVoiceSession(options) {
643
760
  const config = getConfig();
644
761
  if (config.debug) console.log("[SpeechOS] Starting voice session...");
645
- if (this.preWarmPromise) {
646
- if (config.debug) console.log("[SpeechOS] Waiting for pre-warm to complete...");
647
- await this.preWarmPromise;
648
- }
649
- if (this.tokenData) {
650
- if (config.debug) console.log("[SpeechOS] Using cached token from init");
651
- } else {
652
- if (config.debug) console.log("[SpeechOS] Fetching fresh token for session...");
653
- await this.fetchToken();
654
- }
762
+ this.sessionSettings = options?.settings || {};
763
+ await this.fetchToken();
764
+ if (!this.tokenData) throw new Error("No token available for LiveKit connection");
655
765
  this.pendingTrackSubscribed = new Deferred();
656
766
  this.pendingTrackSubscribed.setTimeout(15e3, "Connection timed out - agent not available", "connection_timeout", "connection");
657
- await this.connect();
658
- await this.enableMicrophone();
659
- if (config.debug) console.log("[SpeechOS] Microphone published, waiting for LocalTrackSubscribed event...");
660
- await this.pendingTrackSubscribed.promise;
661
- this.pendingTrackSubscribed = null;
662
- if (config.debug) console.log("[SpeechOS] Voice session ready - agent subscribed to audio");
767
+ this.room = new Room({
768
+ adaptiveStream: true,
769
+ dynacast: true
770
+ });
771
+ this.setupRoomEvents();
772
+ if (config.debug) console.log("[SpeechOS] Connecting to LiveKit room:", this.tokenData.room, "at", this.tokenData.ws_url);
773
+ await this.room.connect(this.tokenData.ws_url, this.tokenData.token);
774
+ if (config.debug) console.log("[SpeechOS] Connected, enabling microphone with preConnectBuffer...");
775
+ await this.enableMicrophoneWithPreConnectBuffer();
776
+ if (options?.onMicReady) options.onMicReady();
777
+ state.setConnected(true);
778
+ if (config.debug) console.log("[SpeechOS] Voice session ready - microphone active");
779
+ this.waitForAgentSubscription();
780
+ }
781
+ /**
782
+ * Wait for the agent to subscribe to our audio track in the background
783
+ * Handles timeout errors without blocking the main flow
784
+ */
785
+ waitForAgentSubscription() {
786
+ const config = getConfig();
787
+ if (!this.pendingTrackSubscribed) return;
788
+ this.pendingTrackSubscribed.promise.then(() => {
789
+ if (config.debug) console.log("[SpeechOS] Agent subscribed to audio track - full duplex established");
790
+ this.pendingTrackSubscribed = null;
791
+ }).catch((error) => {
792
+ console.warn("[SpeechOS] Agent subscription timeout:", error.message);
793
+ this.pendingTrackSubscribed = null;
794
+ });
795
+ }
796
+ /**
797
+ * Enable microphone with pre-connect buffering
798
+ * This starts capturing audio locally before the room is connected,
799
+ * buffering it until the connection is established.
800
+ */
801
+ async enableMicrophoneWithPreConnectBuffer() {
802
+ if (!this.room) throw new Error("Room not initialized");
803
+ const config = getConfig();
804
+ const deviceId = this.sessionSettings.audioDeviceId;
805
+ const constraints = {
806
+ echoCancellation: true,
807
+ noiseSuppression: true
808
+ };
809
+ if (deviceId) {
810
+ constraints.deviceId = { exact: deviceId };
811
+ if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
812
+ }
813
+ try {
814
+ await this.room.localParticipant.setMicrophoneEnabled(true, constraints, { preConnectBuffer: true });
815
+ state.setMicEnabled(true);
816
+ const micPub = this.room.localParticipant.getTrackPublication(Track.Source.Microphone);
817
+ if (micPub?.track) {
818
+ this.micTrack = micPub.track;
819
+ this.logMicrophoneInfo();
820
+ }
821
+ if (config.debug) console.log("[SpeechOS] Microphone enabled with pre-connect buffer - audio is being captured");
822
+ } catch (error) {
823
+ if (deviceId && error instanceof Error) {
824
+ console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
825
+ await this.room.localParticipant.setMicrophoneEnabled(true, {
826
+ echoCancellation: true,
827
+ noiseSuppression: true
828
+ }, { preConnectBuffer: true });
829
+ state.setMicEnabled(true);
830
+ } else throw error;
831
+ }
663
832
  }
664
833
  /**
665
834
  * Stop the voice session and request the transcript
@@ -668,12 +837,19 @@ var LiveKitManager = class {
668
837
  */
669
838
  async stopVoiceSession() {
670
839
  const config = getConfig();
840
+ const settings = this.sessionSettings;
841
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
842
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
843
+ console.log("[SpeechOS] Dictate command:", {
844
+ inputLanguage,
845
+ outputLanguage
846
+ });
671
847
  if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
672
848
  await this.disableMicrophone();
673
849
  if (config.debug) console.log("[SpeechOS] Requesting transcript from agent...");
674
850
  this.pendingTranscript = new Deferred();
675
851
  this.pendingTranscript.setTimeout(1e4, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
676
- await this.sendDataMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT });
852
+ await this.sendDataMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 });
677
853
  const result = await this.pendingTranscript.promise;
678
854
  this.pendingTranscript = null;
679
855
  return result;
@@ -692,6 +868,14 @@ var LiveKitManager = class {
692
868
  */
693
869
  async requestEditText(originalText) {
694
870
  const config = getConfig();
871
+ const settings = this.sessionSettings;
872
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
873
+ const outputLanguage = settings.outputLanguageCode ?? "en-US";
874
+ console.log("[SpeechOS] Edit command:", {
875
+ inputLanguage,
876
+ outputLanguage,
877
+ originalTextLength: originalText.length
878
+ });
695
879
  if (config.debug) console.log("[SpeechOS] Requesting text edit...");
696
880
  this.editOriginalText = originalText;
697
881
  await this.disableMicrophone();
@@ -699,7 +883,7 @@ var LiveKitManager = class {
699
883
  this.pendingEditText = new Deferred();
700
884
  this.pendingEditText.setTimeout(15e3, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
701
885
  await this.sendDataMessage({
702
- type: MESSAGE_TYPE_EDIT_TEXT,
886
+ type: MESSAGE_TYPE_EDIT_TEXT$1,
703
887
  text: originalText
704
888
  });
705
889
  const result = await this.pendingEditText.promise;
@@ -713,6 +897,39 @@ var LiveKitManager = class {
713
897
  return this.requestEditText(originalText);
714
898
  }
715
899
  /**
900
+ * Request command matching using the transcript as input
901
+ * Sends command definitions to the backend, which matches the user's speech against them
902
+ * Returns a promise that resolves with the matched command or null if no match
903
+ * @throws Error if timeout occurs waiting for command result
904
+ */
905
+ async requestCommand(commands) {
906
+ const config = getConfig();
907
+ const settings = this.sessionSettings;
908
+ const inputLanguage = settings.inputLanguageCode ?? "en-US";
909
+ console.log("[SpeechOS] Command request:", {
910
+ inputLanguage,
911
+ commandCount: commands.length
912
+ });
913
+ if (config.debug) console.log("[SpeechOS] Requesting command match...");
914
+ await this.disableMicrophone();
915
+ if (config.debug) console.log("[SpeechOS] Sending execute_command request to agent...");
916
+ this.pendingCommand = new Deferred();
917
+ this.pendingCommand.setTimeout(15e3, "Command request timed out. Please try again.", "command_timeout", "timeout");
918
+ await this.sendDataMessage({
919
+ type: MESSAGE_TYPE_EXECUTE_COMMAND$1,
920
+ commands
921
+ });
922
+ const result = await this.pendingCommand.promise;
923
+ this.pendingCommand = null;
924
+ return result;
925
+ }
926
+ /**
927
+ * Alias for requestCommand - granular API naming
928
+ */
929
+ async stopAndCommand(commands) {
930
+ return this.requestCommand(commands);
931
+ }
932
+ /**
716
933
  * Disconnect from the current room
717
934
  * Clears the token so a fresh one is fetched for the next session
718
935
  */
@@ -735,16 +952,110 @@ var LiveKitManager = class {
735
952
  this.pendingEditText.reject(new Error("Disconnected"));
736
953
  this.pendingEditText = null;
737
954
  }
955
+ if (this.pendingCommand) {
956
+ this.pendingCommand.reject(new Error("Disconnected"));
957
+ this.pendingCommand = null;
958
+ }
738
959
  if (this.pendingTrackSubscribed) {
739
960
  this.pendingTrackSubscribed.reject(new Error("Disconnected"));
740
961
  this.pendingTrackSubscribed = null;
741
962
  }
742
963
  this.tokenData = null;
743
- this.preWarmPromise = null;
744
964
  this.editOriginalText = null;
965
+ this.sessionSettings = {};
745
966
  if (config.debug) console.log("[SpeechOS] Session state cleared");
746
967
  }
747
968
  /**
969
+ * Invalidate the cached token
970
+ * Call this when settings change that would affect the token (language, vocabulary)
971
+ */
972
+ invalidateTokenCache() {
973
+ const config = getConfig();
974
+ if (config.debug) console.log("[SpeechOS] Token cache invalidated");
975
+ this.cachedTokenData = null;
976
+ this.tokenCacheTimestamp = null;
977
+ }
978
+ /**
979
+ * Start auto-refreshing the token while the widget is expanded.
980
+ * Call this after a voice session completes to immediately fetch a fresh token
981
+ * (since each command requires its own token) and keep it fresh for subsequent commands.
982
+ */
983
+ startAutoRefresh() {
984
+ const config = getConfig();
985
+ this.autoRefreshEnabled = true;
986
+ if (config.debug) console.log("[SpeechOS] Token auto-refresh enabled");
987
+ this.invalidateTokenCache();
988
+ this.prefetchToken().then(() => {
989
+ this.scheduleTokenRefresh();
990
+ }).catch((error) => {
991
+ if (config.debug) console.warn("[SpeechOS] Failed to prefetch token after command:", error);
992
+ if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
993
+ this.performAutoRefresh();
994
+ }, 5 * 1e3);
995
+ });
996
+ }
997
+ /**
998
+ * Stop auto-refreshing the token.
999
+ * Call this when the widget collapses or user navigates away.
1000
+ */
1001
+ stopAutoRefresh() {
1002
+ const config = getConfig();
1003
+ this.autoRefreshEnabled = false;
1004
+ if (this.tokenRefreshTimer) {
1005
+ clearTimeout(this.tokenRefreshTimer);
1006
+ this.tokenRefreshTimer = null;
1007
+ }
1008
+ if (config.debug) console.log("[SpeechOS] Token auto-refresh disabled");
1009
+ }
1010
+ /**
1011
+ * Schedule a token refresh before the current cache expires.
1012
+ * Handles computer sleep by checking elapsed time on each refresh attempt.
1013
+ */
1014
+ scheduleTokenRefresh() {
1015
+ if (!this.autoRefreshEnabled) return;
1016
+ if (this.tokenRefreshTimer) {
1017
+ clearTimeout(this.tokenRefreshTimer);
1018
+ this.tokenRefreshTimer = null;
1019
+ }
1020
+ const config = getConfig();
1021
+ const refreshBuffer = 30 * 1e3;
1022
+ let timeUntilRefresh;
1023
+ if (this.tokenCacheTimestamp) {
1024
+ const age = Date.now() - this.tokenCacheTimestamp;
1025
+ const timeRemaining = TOKEN_CACHE_TTL_MS - age;
1026
+ timeUntilRefresh = Math.max(0, timeRemaining - refreshBuffer);
1027
+ } else timeUntilRefresh = 0;
1028
+ if (config.debug) console.log(`[SpeechOS] Scheduling token refresh in ${Math.round(timeUntilRefresh / 1e3)}s`);
1029
+ this.tokenRefreshTimer = setTimeout(() => {
1030
+ this.performAutoRefresh();
1031
+ }, timeUntilRefresh);
1032
+ }
1033
+ /**
1034
+ * Perform the auto-refresh, handling computer sleep scenarios.
1035
+ */
1036
+ async performAutoRefresh() {
1037
+ if (!this.autoRefreshEnabled) return;
1038
+ const config = getConfig();
1039
+ if (this.isCachedTokenValid()) {
1040
+ if (config.debug) console.log("[SpeechOS] Token still valid on refresh check, rescheduling");
1041
+ this.scheduleTokenRefresh();
1042
+ return;
1043
+ }
1044
+ if (config.debug) console.log("[SpeechOS] Auto-refreshing token...");
1045
+ try {
1046
+ const data = await this.fetchTokenFromServer();
1047
+ this.cachedTokenData = data;
1048
+ this.tokenCacheTimestamp = Date.now();
1049
+ if (config.debug) console.log("[SpeechOS] Token auto-refreshed successfully");
1050
+ this.scheduleTokenRefresh();
1051
+ } catch (error) {
1052
+ console.warn("[SpeechOS] Token auto-refresh failed:", error);
1053
+ if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
1054
+ this.performAutoRefresh();
1055
+ }, 30 * 1e3);
1056
+ }
1057
+ }
1058
+ /**
748
1059
  * Get the current room instance
749
1060
  */
750
1061
  getRoom() {
@@ -768,88 +1079,747 @@ var LiveKitManager = class {
768
1079
  isMicrophoneEnabled() {
769
1080
  return this.micTrack !== null;
770
1081
  }
771
- /**
772
- * Clear the cached token
773
- * Used when user identity changes to ensure next session gets a fresh token
774
- */
775
- clearToken() {
776
- const config = getConfig();
777
- if (config.debug) console.log("[SpeechOS] Clearing cached token");
778
- this.tokenData = null;
779
- this.preWarmPromise = null;
780
- }
781
1082
  };
782
1083
  const livekit = new LiveKitManager();
1084
+ events.on("settings:changed", () => {
1085
+ livekit.invalidateTokenCache();
1086
+ });
783
1087
 
784
1088
  //#endregion
785
- //#region src/transcript-store.ts
786
- const STORAGE_KEY = "speechos_transcripts";
787
- const MAX_ENTRIES = 50;
1089
+ //#region src/audio-capture.ts
788
1090
  /**
789
- * Generate a unique ID for transcript entries
1091
+ * Detect if running in Safari.
790
1092
  */
791
- function generateId() {
792
- return `${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
1093
+ function isSafari() {
1094
+ const ua = navigator.userAgent.toLowerCase();
1095
+ const vendor = navigator.vendor?.toLowerCase() || "";
1096
+ const hasSafariUA = ua.includes("safari") && !ua.includes("chrome") && !ua.includes("chromium");
1097
+ const isAppleVendor = vendor.includes("apple");
1098
+ return hasSafariUA && isAppleVendor;
793
1099
  }
794
1100
  /**
795
- * Get all transcripts from localStorage
1101
+ * Detect the best supported audio format for the current browser.
1102
+ *
1103
+ * IMPORTANT: Safari must use MP4/AAC. Its WebM/Opus implementation is buggy
1104
+ * and produces truncated/incomplete audio.
796
1105
  */
797
- function getTranscripts() {
798
- try {
799
- const stored = localStorage.getItem(STORAGE_KEY);
800
- if (!stored) return [];
801
- const entries = JSON.parse(stored);
802
- return entries.sort((a, b) => b.timestamp - a.timestamp);
803
- } catch {
804
- return [];
1106
+ function getSupportedAudioFormat() {
1107
+ if (isSafari()) {
1108
+ if (MediaRecorder.isTypeSupported("audio/mp4")) return {
1109
+ mimeType: "audio/mp4",
1110
+ format: "mp4",
1111
+ needsEncodingParams: false
1112
+ };
1113
+ return {
1114
+ mimeType: "",
1115
+ format: "mp4",
1116
+ needsEncodingParams: true
1117
+ };
805
1118
  }
1119
+ if (MediaRecorder.isTypeSupported("audio/webm;codecs=opus")) return {
1120
+ mimeType: "audio/webm;codecs=opus",
1121
+ format: "webm",
1122
+ needsEncodingParams: false
1123
+ };
1124
+ if (MediaRecorder.isTypeSupported("audio/webm")) return {
1125
+ mimeType: "audio/webm",
1126
+ format: "webm",
1127
+ needsEncodingParams: false
1128
+ };
1129
+ if (MediaRecorder.isTypeSupported("audio/mp4")) return {
1130
+ mimeType: "audio/mp4",
1131
+ format: "mp4",
1132
+ needsEncodingParams: false
1133
+ };
1134
+ return {
1135
+ mimeType: "",
1136
+ format: "webm",
1137
+ needsEncodingParams: true
1138
+ };
806
1139
  }
807
1140
  /**
808
- * Save a new transcript entry
1141
+ * Audio capture manager with buffering support.
1142
+ *
1143
+ * Usage:
1144
+ * 1. Create instance with onChunk callback
1145
+ * 2. Call start() - immediately begins capturing
1146
+ * 3. Call setReady() when connection is established - flushes buffer
1147
+ * 4. Call stop() when done
809
1148
  */
810
- function saveTranscript(text, action, originalText) {
811
- const entry = {
812
- id: generateId(),
813
- text,
814
- timestamp: Date.now(),
815
- action,
816
- ...originalText && { originalText }
817
- };
818
- const entries = getTranscripts();
819
- entries.unshift(entry);
820
- const pruned = entries.slice(0, MAX_ENTRIES);
821
- try {
822
- localStorage.setItem(STORAGE_KEY, JSON.stringify(pruned));
823
- } catch {}
824
- return entry;
825
- }
1149
+ var AudioCapture = class AudioCapture {
1150
+ mediaStream = null;
1151
+ recorder = null;
1152
+ buffer = [];
1153
+ isReady = false;
1154
+ isRecording = false;
1155
+ onChunk;
1156
+ audioFormat;
1157
+ deviceId;
1158
+ /**
1159
+ * Time slice for MediaRecorder in milliseconds.
1160
+ *
1161
+ * Safari requires a larger timeslice (1000ms) to properly flush its internal
1162
+ * audio buffers. Smaller values cause Safari to drop or truncate audio data.
1163
+ * See: https://community.openai.com/t/whisper-problem-with-audio-mp4-blobs-from-safari/
1164
+ *
1165
+ * Other browsers (Chrome, Firefox, Edge) work well with smaller timeslices
1166
+ * which provide lower latency for real-time transcription.
1167
+ */
1168
+ static TIME_SLICE_MS = 100;
1169
+ static SAFARI_TIME_SLICE_MS = 1e3;
1170
+ /**
1171
+ * @param onChunk - Callback for receiving audio chunks
1172
+ * @param deviceId - Optional audio device ID (empty string or undefined for system default)
1173
+ */
1174
+ constructor(onChunk, deviceId) {
1175
+ this.onChunk = onChunk;
1176
+ this.audioFormat = getSupportedAudioFormat();
1177
+ this.deviceId = deviceId;
1178
+ }
1179
+ /**
1180
+ * Get the appropriate timeslice for the current browser.
1181
+ * Safari needs a larger timeslice to avoid dropping audio data.
1182
+ */
1183
+ getTimeSlice() {
1184
+ return isSafari() ? AudioCapture.SAFARI_TIME_SLICE_MS : AudioCapture.TIME_SLICE_MS;
1185
+ }
1186
+ /**
1187
+ * Get the timeslice being used (in milliseconds).
1188
+ * Useful for callers that need to wait for audio processing.
1189
+ */
1190
+ getTimeSliceMs() {
1191
+ return this.getTimeSlice();
1192
+ }
1193
+ /**
1194
+ * Get the audio format being used.
1195
+ */
1196
+ getFormat() {
1197
+ return this.audioFormat;
1198
+ }
1199
+ /**
1200
+ * Start capturing audio immediately.
1201
+ *
1202
+ * Audio chunks will be buffered until setReady() is called.
1203
+ */
1204
+ async start() {
1205
+ const config = getConfig();
1206
+ if (this.isRecording) {
1207
+ if (config.debug) console.log("[SpeechOS] AudioCapture already recording");
1208
+ return;
1209
+ }
1210
+ this.buffer = [];
1211
+ this.isReady = false;
1212
+ const constraints = { audio: {
1213
+ echoCancellation: true,
1214
+ noiseSuppression: true,
1215
+ ...this.deviceId ? { deviceId: { exact: this.deviceId } } : {}
1216
+ } };
1217
+ if (config.debug) {
1218
+ console.log("[SpeechOS] AudioCapture starting with format:", this.audioFormat.mimeType);
1219
+ console.log("[SpeechOS] Detected Safari:", isSafari());
1220
+ if (this.deviceId) console.log("[SpeechOS] Using audio device:", this.deviceId);
1221
+ }
1222
+ try {
1223
+ this.mediaStream = await navigator.mediaDevices.getUserMedia(constraints);
1224
+ const recorderOptions = {};
1225
+ if (this.audioFormat.mimeType) recorderOptions.mimeType = this.audioFormat.mimeType;
1226
+ this.recorder = new MediaRecorder(this.mediaStream, recorderOptions);
1227
+ this.recorder.ondataavailable = (event) => {
1228
+ if (event.data && event.data.size > 0) this.handleChunk(event.data);
1229
+ };
1230
+ this.recorder.onerror = (event) => {
1231
+ console.error("[SpeechOS] MediaRecorder error:", event);
1232
+ };
1233
+ const timeSlice = this.getTimeSlice();
1234
+ this.recorder.start(timeSlice);
1235
+ this.isRecording = true;
1236
+ if (config.debug) console.log(`[SpeechOS] AudioCapture started with ${timeSlice}ms timeslice, buffering until ready`);
1237
+ } catch (error) {
1238
+ if (this.deviceId && error instanceof Error) {
1239
+ console.warn("[SpeechOS] Selected device unavailable, trying default:", error.message);
1240
+ this.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: {
1241
+ echoCancellation: true,
1242
+ noiseSuppression: true
1243
+ } });
1244
+ const recorderOptions = {};
1245
+ if (this.audioFormat.mimeType) recorderOptions.mimeType = this.audioFormat.mimeType;
1246
+ this.recorder = new MediaRecorder(this.mediaStream, recorderOptions);
1247
+ this.recorder.ondataavailable = (event) => {
1248
+ if (event.data && event.data.size > 0) this.handleChunk(event.data);
1249
+ };
1250
+ this.recorder.start(this.getTimeSlice());
1251
+ this.isRecording = true;
1252
+ } else throw error;
1253
+ }
1254
+ }
1255
+ /**
1256
+ * Handle an audio chunk with atomic buffer swap pattern.
1257
+ *
1258
+ * If not ready: buffer the chunk.
1259
+ * If ready: send directly via callback.
1260
+ */
1261
+ handleChunk(chunk) {
1262
+ if (this.isReady) this.onChunk(chunk);
1263
+ else this.buffer.push(chunk);
1264
+ }
1265
+ /**
1266
+ * Mark the capture as ready (connection established).
1267
+ *
1268
+ * This flushes any buffered chunks and switches to direct mode.
1269
+ * Uses atomic swap to prevent chunk reordering.
1270
+ */
1271
+ setReady() {
1272
+ const config = getConfig();
1273
+ if (this.isReady) return;
1274
+ const toFlush = this.buffer;
1275
+ this.buffer = [];
1276
+ for (const chunk of toFlush) this.onChunk(chunk);
1277
+ this.isReady = true;
1278
+ if (config.debug) console.log(`[SpeechOS] AudioCapture ready, flushed ${toFlush.length} buffered chunks`);
1279
+ }
1280
+ /**
1281
+ * Stop capturing audio and wait for final chunk.
1282
+ *
1283
+ * Uses requestData() before stop() to force the MediaRecorder to flush
1284
+ * any buffered audio immediately. This is critical for Safari which
1285
+ * may hold audio data in internal buffers.
1286
+ *
1287
+ * Safari requires an additional delay after stopping to ensure all audio
1288
+ * from its internal encoding pipeline has been fully processed and emitted.
1289
+ */
1290
+ async stop() {
1291
+ const config = getConfig();
1292
+ const safari = isSafari();
1293
+ if (this.recorder && this.recorder.state !== "inactive") {
1294
+ if (this.recorder.state === "recording") try {
1295
+ const dataPromise = new Promise((resolve) => {
1296
+ const handler = (event) => {
1297
+ this.recorder?.removeEventListener("dataavailable", handler);
1298
+ if (config.debug) console.log(`[SpeechOS] requestData flush received: ${event.data.size} bytes`);
1299
+ resolve();
1300
+ };
1301
+ this.recorder?.addEventListener("dataavailable", handler);
1302
+ });
1303
+ this.recorder.requestData();
1304
+ if (config.debug) console.log("[SpeechOS] Requested data flush before stop");
1305
+ await dataPromise;
1306
+ } catch (e) {
1307
+ if (config.debug) console.log("[SpeechOS] requestData() not supported or failed:", e);
1308
+ }
1309
+ const stopPromise = new Promise((resolve) => {
1310
+ if (!this.recorder) {
1311
+ resolve();
1312
+ return;
1313
+ }
1314
+ this.recorder.onstop = () => {
1315
+ if (config.debug) console.log("[SpeechOS] MediaRecorder onstop fired");
1316
+ resolve();
1317
+ };
1318
+ });
1319
+ this.recorder.stop();
1320
+ await stopPromise;
1321
+ if (safari) {
1322
+ if (config.debug) console.log("[SpeechOS] Safari: waiting 2s for encoding pipeline to flush");
1323
+ await new Promise((resolve) => setTimeout(resolve, 2e3));
1324
+ }
1325
+ }
1326
+ if (this.mediaStream) {
1327
+ for (const track of this.mediaStream.getTracks()) track.stop();
1328
+ this.mediaStream = null;
1329
+ }
1330
+ this.recorder = null;
1331
+ this.isRecording = false;
1332
+ this.isReady = false;
1333
+ this.buffer = [];
1334
+ if (config.debug) console.log("[SpeechOS] AudioCapture stopped");
1335
+ }
1336
+ /**
1337
+ * Check if currently recording.
1338
+ */
1339
+ get recording() {
1340
+ return this.isRecording;
1341
+ }
1342
+ /**
1343
+ * Check if ready (connection established, direct mode active).
1344
+ */
1345
+ get ready() {
1346
+ return this.isReady;
1347
+ }
1348
+ /**
1349
+ * Get the number of buffered chunks waiting to be sent.
1350
+ */
1351
+ get bufferedChunks() {
1352
+ return this.buffer.length;
1353
+ }
1354
+ };
826
1355
  /**
827
- * Clear all transcript history
1356
+ * Factory function to create an AudioCapture instance.
1357
+ * @param onChunk - Callback for receiving audio chunks
1358
+ * @param deviceId - Optional audio device ID (empty string or undefined for system default)
828
1359
  */
829
- function clearTranscripts() {
830
- try {
831
- localStorage.removeItem(STORAGE_KEY);
832
- } catch {}
1360
+ function createAudioCapture(onChunk, deviceId) {
1361
+ return new AudioCapture(onChunk, deviceId);
833
1362
  }
1363
+
1364
+ //#endregion
1365
+ //#region src/websocket.ts
1366
+ const MESSAGE_TYPE_AUTH = "auth";
1367
+ const MESSAGE_TYPE_READY = "ready";
1368
+ const MESSAGE_TYPE_TRANSCRIPTION = "transcription";
1369
+ const MESSAGE_TYPE_REQUEST_TRANSCRIPT = "request_transcript";
1370
+ const MESSAGE_TYPE_TRANSCRIPT = "transcript";
1371
+ const MESSAGE_TYPE_EDIT_TEXT = "edit_text";
1372
+ const MESSAGE_TYPE_EDITED_TEXT = "edited_text";
1373
+ const MESSAGE_TYPE_EXECUTE_COMMAND = "execute_command";
1374
+ const MESSAGE_TYPE_COMMAND_RESULT = "command_result";
1375
+ const MESSAGE_TYPE_ERROR = "error";
834
1376
  /**
835
- * Delete a single transcript by ID
1377
+ * Response timeout in milliseconds.
836
1378
  */
837
- function deleteTranscript(id) {
838
- const entries = getTranscripts().filter((e) => e.id !== id);
839
- try {
840
- localStorage.setItem(STORAGE_KEY, JSON.stringify(entries));
841
- } catch {}
842
- }
843
- const transcriptStore = {
844
- getTranscripts,
845
- saveTranscript,
846
- clearTranscripts,
847
- deleteTranscript
1379
+ const RESPONSE_TIMEOUT_MS = 15e3;
1380
+ /**
1381
+ * A deferred promise with timeout support.
1382
+ */
1383
+ var Deferred$1 = class {
1384
+ promise;
1385
+ _resolve;
1386
+ _reject;
1387
+ _timeoutId = null;
1388
+ _settled = false;
1389
+ constructor() {
1390
+ this.promise = new Promise((resolve, reject) => {
1391
+ this._resolve = resolve;
1392
+ this._reject = reject;
1393
+ });
1394
+ }
1395
+ setTimeout(ms, errorMessage, errorCode, errorSource) {
1396
+ this._timeoutId = setTimeout(() => {
1397
+ if (!this._settled) {
1398
+ console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
1399
+ events.emit("error", {
1400
+ code: errorCode,
1401
+ message: errorMessage,
1402
+ source: errorSource
1403
+ });
1404
+ this.reject(new Error(errorMessage));
1405
+ }
1406
+ }, ms);
1407
+ }
1408
+ resolve(value) {
1409
+ if (!this._settled) {
1410
+ this._settled = true;
1411
+ this.clearTimeout();
1412
+ this._resolve(value);
1413
+ }
1414
+ }
1415
+ reject(error) {
1416
+ if (!this._settled) {
1417
+ this._settled = true;
1418
+ this.clearTimeout();
1419
+ this._reject(error);
1420
+ }
1421
+ }
1422
+ clearTimeout() {
1423
+ if (this._timeoutId !== null) {
1424
+ clearTimeout(this._timeoutId);
1425
+ this._timeoutId = null;
1426
+ }
1427
+ }
1428
+ get isSettled() {
1429
+ return this._settled;
1430
+ }
1431
+ };
1432
+ /**
1433
+ * Maximum time to wait for WebSocket buffer to drain.
1434
+ */
1435
+ const BUFFER_DRAIN_TIMEOUT_MS = 5e3;
1436
+ /**
1437
+ * Polling interval for checking WebSocket buffer.
1438
+ */
1439
+ const BUFFER_CHECK_INTERVAL_MS = 50;
1440
+ /**
1441
+ * WebSocket connection manager for voice sessions.
1442
+ */
1443
+ var WebSocketManager = class {
1444
+ ws = null;
1445
+ audioCapture = null;
1446
+ sessionId = null;
1447
+ pendingAuth = null;
1448
+ pendingTranscript = null;
1449
+ pendingEditText = null;
1450
+ pendingCommand = null;
1451
+ pendingAudioSends = /* @__PURE__ */ new Set();
1452
+ editOriginalText = null;
1453
+ lastInputText = void 0;
1454
+ sessionAction = "dictate";
1455
+ sessionInputText = "";
1456
+ sessionCommands = [];
1457
+ sessionSettings = {};
1458
+ /**
1459
+ * Get the WebSocket URL for voice sessions.
1460
+ */
1461
+ getWebSocketUrl() {
1462
+ const config = getConfig();
1463
+ const host = config.host || "https://app.speechos.ai";
1464
+ const wsUrl = host.replace(/^http/, "ws");
1465
+ return `${wsUrl}/ws/voice/`;
1466
+ }
1467
+ /**
1468
+ * Start a voice session with the WebSocket backend.
1469
+ *
1470
+ * This method:
1471
+ * 1. Starts audio capture immediately (buffering)
1472
+ * 2. Opens WebSocket connection
1473
+ * 3. Authenticates with API key and action parameters
1474
+ * 4. Flushes buffered audio and continues streaming
1475
+ *
1476
+ * @param options - Session options including action type and parameters
1477
+ */
1478
+ async startVoiceSession(options) {
1479
+ const config = getConfig();
1480
+ this.sessionAction = options?.action || "dictate";
1481
+ this.sessionInputText = options?.inputText || "";
1482
+ this.sessionCommands = options?.commands || [];
1483
+ this.sessionSettings = options?.settings || {};
1484
+ if (this.sessionAction === "edit") this.editOriginalText = this.sessionInputText;
1485
+ if (config.debug) console.log("[SpeechOS] Starting WebSocket voice session...");
1486
+ this.audioCapture = createAudioCapture((chunk) => {
1487
+ this.sendAudioChunk(chunk);
1488
+ }, this.sessionSettings.audioDeviceId);
1489
+ await this.audioCapture.start();
1490
+ if (options?.onMicReady) options.onMicReady();
1491
+ state.setMicEnabled(true);
1492
+ const wsUrl = this.getWebSocketUrl();
1493
+ if (config.debug) console.log("[SpeechOS] Connecting to WebSocket:", wsUrl);
1494
+ this.ws = new WebSocket(wsUrl);
1495
+ this.ws.onopen = () => {
1496
+ if (config.debug) console.log("[SpeechOS] WebSocket connected, authenticating...");
1497
+ this.authenticate();
1498
+ };
1499
+ this.ws.onmessage = (event) => {
1500
+ this.handleMessage(event.data);
1501
+ };
1502
+ this.ws.onerror = (event) => {
1503
+ console.error("[SpeechOS] WebSocket error:", event);
1504
+ events.emit("error", {
1505
+ code: "websocket_error",
1506
+ message: "WebSocket connection error",
1507
+ source: "connection"
1508
+ });
1509
+ };
1510
+ this.ws.onclose = (event) => {
1511
+ if (config.debug) console.log("[SpeechOS] WebSocket closed:", event.code, event.reason);
1512
+ state.setConnected(false);
1513
+ };
1514
+ this.pendingAuth = new Deferred$1();
1515
+ this.pendingAuth.setTimeout(RESPONSE_TIMEOUT_MS, "Connection timed out", "connection_timeout", "connection");
1516
+ await this.pendingAuth.promise;
1517
+ this.pendingAuth = null;
1518
+ if (this.audioCapture) this.audioCapture.setReady();
1519
+ state.setConnected(true);
1520
+ if (config.debug) console.log("[SpeechOS] WebSocket voice session ready");
1521
+ }
1522
+ /**
1523
+ * Send authentication message with action parameters.
1524
+ * All session parameters are now sent upfront in the auth message.
1525
+ */
1526
+ authenticate() {
1527
+ const config = getConfig();
1528
+ const audioFormat = getSupportedAudioFormat();
1529
+ const settings = this.sessionSettings;
1530
+ const authMessage = {
1531
+ type: MESSAGE_TYPE_AUTH,
1532
+ api_key: config.apiKey,
1533
+ user_id: config.userId || null,
1534
+ input_language: settings.inputLanguageCode ?? "en-US",
1535
+ output_language: settings.outputLanguageCode ?? "en-US",
1536
+ smart_format: settings.smartFormat ?? true,
1537
+ custom_vocabulary: settings.vocabulary ?? [],
1538
+ custom_snippets: settings.snippets ?? [],
1539
+ audio_format: audioFormat.format,
1540
+ action: this.sessionAction,
1541
+ input_text: this.sessionInputText,
1542
+ commands: this.sessionCommands
1543
+ };
1544
+ if (config.debug) console.log("[SpeechOS] Sending auth message with action:", this.sessionAction);
1545
+ this.ws?.send(JSON.stringify(authMessage));
1546
+ }
1547
+ /**
1548
+ * Send an audio chunk over the WebSocket.
1549
+ * Tracks the promise so we can wait for all sends to complete.
1550
+ */
1551
+ sendAudioChunk(chunk) {
1552
+ const sendPromise = this.doSendAudioChunk(chunk);
1553
+ this.pendingAudioSends.add(sendPromise);
1554
+ sendPromise.finally(() => {
1555
+ this.pendingAudioSends.delete(sendPromise);
1556
+ });
1557
+ }
1558
+ /**
1559
+ * Actually send the audio chunk (async operation).
1560
+ */
1561
+ async doSendAudioChunk(chunk) {
1562
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
1563
+ const arrayBuffer = await chunk.arrayBuffer();
1564
+ this.ws.send(arrayBuffer);
1565
+ }
1566
+ }
1567
+ /**
1568
+ * Handle incoming WebSocket messages.
1569
+ */
1570
+ handleMessage(data) {
1571
+ const config = getConfig();
1572
+ try {
1573
+ const message = JSON.parse(data);
1574
+ if (config.debug) console.log("[SpeechOS] WebSocket message:", message);
1575
+ switch (message.type) {
1576
+ case MESSAGE_TYPE_READY:
1577
+ this.handleReady(message);
1578
+ break;
1579
+ case MESSAGE_TYPE_TRANSCRIPTION:
1580
+ this.handleIntermediateTranscription(message);
1581
+ break;
1582
+ case MESSAGE_TYPE_TRANSCRIPT:
1583
+ this.handleFinalTranscript(message);
1584
+ break;
1585
+ case MESSAGE_TYPE_EDITED_TEXT:
1586
+ this.handleEditedText(message);
1587
+ break;
1588
+ case MESSAGE_TYPE_COMMAND_RESULT:
1589
+ this.handleCommandResult(message);
1590
+ break;
1591
+ case MESSAGE_TYPE_ERROR:
1592
+ this.handleError(message);
1593
+ break;
1594
+ default: if (config.debug) console.log("[SpeechOS] Unknown message type:", message.type);
1595
+ }
1596
+ } catch (error) {
1597
+ console.error("[SpeechOS] Failed to parse message:", error);
1598
+ }
1599
+ }
1600
+ handleReady(message) {
1601
+ const config = getConfig();
1602
+ this.sessionId = message.session_id;
1603
+ if (config.debug) console.log("[SpeechOS] Session ready:", this.sessionId);
1604
+ if (this.pendingAuth) this.pendingAuth.resolve();
1605
+ }
1606
+ handleIntermediateTranscription(message) {
1607
+ const config = getConfig();
1608
+ if (config.debug) console.log("[SpeechOS] Intermediate transcription:", message.transcript, "final:", message.is_final);
1609
+ }
1610
+ handleFinalTranscript(message) {
1611
+ const transcript = message.transcript || "";
1612
+ events.emit("transcription:complete", { text: transcript });
1613
+ if (this.pendingTranscript) {
1614
+ this.pendingTranscript.resolve(transcript);
1615
+ this.pendingTranscript = null;
1616
+ }
1617
+ }
1618
+ handleEditedText(message) {
1619
+ const editedText = message.text || "";
1620
+ events.emit("edit:complete", {
1621
+ text: editedText,
1622
+ originalText: this.editOriginalText || ""
1623
+ });
1624
+ if (this.pendingEditText) {
1625
+ this.pendingEditText.resolve(editedText);
1626
+ this.pendingEditText = null;
1627
+ }
1628
+ this.editOriginalText = null;
1629
+ }
1630
+ handleCommandResult(message) {
1631
+ const commandResult = message.command || null;
1632
+ this.lastInputText = message.transcript;
1633
+ events.emit("command:complete", { command: commandResult });
1634
+ if (this.pendingCommand) {
1635
+ this.pendingCommand.resolve(commandResult);
1636
+ this.pendingCommand = null;
1637
+ }
1638
+ }
1639
+ handleError(message) {
1640
+ const errorCode = message.code || "server_error";
1641
+ const errorMessage = message.message || "A server error occurred";
1642
+ console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
1643
+ events.emit("error", {
1644
+ code: errorCode,
1645
+ message: errorMessage,
1646
+ source: "server"
1647
+ });
1648
+ const error = new Error(errorMessage);
1649
+ if (this.pendingAuth) {
1650
+ this.pendingAuth.reject(error);
1651
+ this.pendingAuth = null;
1652
+ }
1653
+ if (this.pendingTranscript) {
1654
+ this.pendingTranscript.reject(error);
1655
+ this.pendingTranscript = null;
1656
+ }
1657
+ if (this.pendingEditText) {
1658
+ this.pendingEditText.reject(error);
1659
+ this.pendingEditText = null;
1660
+ }
1661
+ if (this.pendingCommand) {
1662
+ this.pendingCommand.reject(error);
1663
+ this.pendingCommand = null;
1664
+ }
1665
+ }
1666
+ /**
1667
+ * Stop the voice session and request the transcript.
1668
+ */
1669
+ async stopVoiceSession() {
1670
+ const config = getConfig();
1671
+ if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
1672
+ await this.stopAudioCapture();
1673
+ this.pendingTranscript = new Deferred$1();
1674
+ this.pendingTranscript.setTimeout(RESPONSE_TIMEOUT_MS, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
1675
+ this.sendMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT });
1676
+ const result = await this.pendingTranscript.promise;
1677
+ this.pendingTranscript = null;
1678
+ return result;
1679
+ }
1680
+ /**
1681
+ * Request text editing using the transcript as instructions.
1682
+ * Note: The input text was already sent in the auth message via startVoiceSession.
1683
+ */
1684
+ async requestEditText(_originalText) {
1685
+ const config = getConfig();
1686
+ if (config.debug) console.log("[SpeechOS] Requesting text edit...");
1687
+ await this.stopAudioCapture();
1688
+ this.pendingEditText = new Deferred$1();
1689
+ this.pendingEditText.setTimeout(RESPONSE_TIMEOUT_MS, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
1690
+ this.sendMessage({ type: MESSAGE_TYPE_EDIT_TEXT });
1691
+ const result = await this.pendingEditText.promise;
1692
+ this.pendingEditText = null;
1693
+ return result;
1694
+ }
1695
+ /**
1696
+ * Request command matching using the transcript as input.
1697
+ * Note: The command definitions were already sent in the auth message via startVoiceSession.
1698
+ */
1699
+ async requestCommand(_commands) {
1700
+ const config = getConfig();
1701
+ if (config.debug) console.log("[SpeechOS] Requesting command match...");
1702
+ await this.stopAudioCapture();
1703
+ this.pendingCommand = new Deferred$1();
1704
+ this.pendingCommand.setTimeout(RESPONSE_TIMEOUT_MS, "Command request timed out. Please try again.", "command_timeout", "timeout");
1705
+ this.sendMessage({ type: MESSAGE_TYPE_EXECUTE_COMMAND });
1706
+ const result = await this.pendingCommand.promise;
1707
+ this.pendingCommand = null;
1708
+ return result;
1709
+ }
1710
+ /**
1711
+ * Stop audio capture and wait for all data to be sent.
1712
+ *
1713
+ * Waits for:
1714
+ * 1. All pending sendAudioChunk calls to complete (arrayBuffer conversion)
1715
+ * 2. WebSocket buffer to drain (all data transmitted)
1716
+ *
1717
+ * WebSocket message ordering ensures server receives all audio before transcript request.
1718
+ */
1719
+ async stopAudioCapture() {
1720
+ const config = getConfig();
1721
+ const startTime = Date.now();
1722
+ if (config.debug) console.log("[SpeechOS] stopAudioCapture: starting...");
1723
+ if (this.audioCapture) {
1724
+ await this.audioCapture.stop();
1725
+ this.audioCapture = null;
1726
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: recorder stopped after ${Date.now() - startTime}ms`);
1727
+ }
1728
+ state.setMicEnabled(false);
1729
+ if (this.pendingAudioSends.size > 0) {
1730
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: waiting for ${this.pendingAudioSends.size} pending audio sends...`);
1731
+ await Promise.all(this.pendingAudioSends);
1732
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: all sends complete after ${Date.now() - startTime}ms`);
1733
+ } else if (config.debug) console.log("[SpeechOS] stopAudioCapture: no pending sends");
1734
+ await this.waitForBufferDrain();
1735
+ if (config.debug) console.log(`[SpeechOS] stopAudioCapture: complete after ${Date.now() - startTime}ms`);
1736
+ }
1737
+ /**
1738
+ * Wait for the WebSocket send buffer to drain.
1739
+ *
1740
+ * This ensures all audio data has been transmitted before we request
1741
+ * the transcript. Uses the same pattern as LiveKit's ReadableStream approach.
1742
+ */
1743
+ async waitForBufferDrain() {
1744
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
1745
+ const config = getConfig();
1746
+ const startTime = Date.now();
1747
+ while (this.ws.bufferedAmount > 0) {
1748
+ if (Date.now() - startTime > BUFFER_DRAIN_TIMEOUT_MS) {
1749
+ console.warn(`[SpeechOS] Buffer drain timeout, ${this.ws.bufferedAmount} bytes still pending`);
1750
+ break;
1751
+ }
1752
+ await new Promise((resolve) => setTimeout(resolve, BUFFER_CHECK_INTERVAL_MS));
1753
+ }
1754
+ if (config.debug) console.log(`[SpeechOS] Buffer drained in ${Date.now() - startTime}ms`);
1755
+ }
1756
+ /**
1757
+ * Send a JSON message over the WebSocket.
1758
+ */
1759
+ sendMessage(message) {
1760
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify(message));
1761
+ }
1762
+ /**
1763
+ * Disconnect from the WebSocket.
1764
+ */
1765
+ async disconnect() {
1766
+ const config = getConfig();
1767
+ if (config.debug) console.log("[SpeechOS] Disconnecting WebSocket...");
1768
+ await this.stopAudioCapture();
1769
+ if (this.ws) {
1770
+ this.ws.close();
1771
+ this.ws = null;
1772
+ }
1773
+ const error = new Error("Disconnected");
1774
+ if (this.pendingAuth) {
1775
+ this.pendingAuth.reject(error);
1776
+ this.pendingAuth = null;
1777
+ }
1778
+ if (this.pendingTranscript) {
1779
+ this.pendingTranscript.reject(error);
1780
+ this.pendingTranscript = null;
1781
+ }
1782
+ if (this.pendingEditText) {
1783
+ this.pendingEditText.reject(error);
1784
+ this.pendingEditText = null;
1785
+ }
1786
+ if (this.pendingCommand) {
1787
+ this.pendingCommand.reject(error);
1788
+ this.pendingCommand = null;
1789
+ }
1790
+ this.sessionId = null;
1791
+ this.editOriginalText = null;
1792
+ this.lastInputText = void 0;
1793
+ this.sessionSettings = {};
1794
+ state.setConnected(false);
1795
+ state.setMicEnabled(false);
1796
+ if (config.debug) console.log("[SpeechOS] WebSocket disconnected");
1797
+ }
1798
+ /**
1799
+ * Check if connected to WebSocket.
1800
+ */
1801
+ isConnected() {
1802
+ return this.ws !== null && this.ws.readyState === WebSocket.OPEN;
1803
+ }
1804
+ /**
1805
+ * Get the last input text from a command result.
1806
+ * This is the raw transcript of what the user said.
1807
+ */
1808
+ getLastInputText() {
1809
+ return this.lastInputText;
1810
+ }
848
1811
  };
1812
+ const websocket = new WebSocketManager();
849
1813
 
850
1814
  //#endregion
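As context for the buffered-send handling above: the browser WebSocket API exposes no "drain" event, so waitForBufferDrain polls bufferedAmount until it reaches zero before the transcript/edit/command request is sent. Below is a minimal standalone sketch of the same idea; the function name and the timeout/interval values are illustrative placeholders, not the package's BUFFER_DRAIN_TIMEOUT_MS / BUFFER_CHECK_INTERVAL_MS constants.

// Illustrative sketch only (not part of the published bundle): poll until the
// socket's send buffer is empty, giving up after a placeholder timeout.
async function waitForSocketDrain(ws, timeoutMs = 5000, intervalMs = 50) {
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
  const startTime = Date.now();
  // bufferedAmount counts bytes queued by send() but not yet handed off to the network.
  while (ws.bufferedAmount > 0) {
    if (Date.now() - startTime > timeoutMs) {
      console.warn(`Drain timeout, ${ws.bufferedAmount} bytes still pending`);
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
}

Once bufferedAmount reaches zero, every queued audio frame has left the client, so the request message sent next is guaranteed to reach the server after the audio it depends on.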
  //#region src/speechos.ts
  /**
+ * Get the active voice backend (always websocket now)
+ */
+ function getBackend$1() {
+ return websocket;
+ }
+ /**
  * SpeechOS Core SDK
  *
  * Provides two API layers:
@@ -868,7 +1838,6 @@ var SpeechOSCore = class {
  const currentConfig$1 = getConfig();
  if (currentConfig$1.debug) console.log("[SpeechOS] Initialized with config:", {
  host: currentConfig$1.host,
- position: currentConfig$1.position,
  debug: currentConfig$1.debug
  });
  }
@@ -908,7 +1877,6 @@ var SpeechOSCore = class {
  state.setRecordingState("processing");
  try {
  const transcript = await livekit.stopAndGetTranscript();
- transcriptStore.saveTranscript(transcript, "dictate");
  state.completeRecording();
  return transcript;
  } catch (error) {
@@ -925,7 +1893,6 @@ var SpeechOSCore = class {
  state.setRecordingState("processing");
  try {
  const editedText = await livekit.stopAndEdit(originalText);
- transcriptStore.saveTranscript(editedText, "edit", originalText);
  state.completeRecording();
  return editedText;
  } catch (error) {
@@ -951,8 +1918,13 @@ var SpeechOSCore = class {
  state.setActiveAction("dictate");
  state.startRecording();
  try {
- await livekit.startVoiceSession();
- state.setRecordingState("recording");
+ const backend = getBackend$1();
+ await backend.startVoiceSession({
+ action: "dictate",
+ onMicReady: () => {
+ state.setRecordingState("recording");
+ }
+ });
  return new Promise((resolve, reject) => {
  this._dictateResolve = resolve;
  this._dictateReject = reject;
@@ -972,8 +1944,8 @@ var SpeechOSCore = class {
  async stopDictation() {
  state.setRecordingState("processing");
  try {
- const transcript = await livekit.stopVoiceSession();
- transcriptStore.saveTranscript(transcript, "dictate");
+ const backend = getBackend$1();
+ const transcript = await backend.stopVoiceSession();
  state.completeRecording();
  if (this._dictateResolve) {
  this._dictateResolve(transcript);
@@ -1007,8 +1979,14 @@ var SpeechOSCore = class {
  state.startRecording();
  this._editOriginalText = originalText;
  try {
- await livekit.startVoiceSession();
- state.setRecordingState("recording");
+ const backend = getBackend$1();
+ await backend.startVoiceSession({
+ action: "edit",
+ inputText: originalText,
+ onMicReady: () => {
+ state.setRecordingState("recording");
+ }
+ });
  return new Promise((resolve, reject) => {
  this._editResolve = resolve;
  this._editReject = reject;
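The call sites above (and the command() method added further down) show that startVoiceSession now takes an options object rather than no arguments. The package's own type declaration is not part of this diff, so the typedef below is only a reading aid reconstructed from those call sites; the name StartVoiceSessionOptions is invented here.

// Reading aid only - inferred from the call sites in this diff, not the package's typings.
/**
 * @typedef {Object} StartVoiceSessionOptions
 * @property {"dictate" | "edit" | "command"} action  which flow the session drives
 * @property {string} [inputText]     original text, passed for the "edit" action
 * @property {Array} [commands]       command definitions, passed for the "command" action
 * @property {() => void} [onMicReady]  fired once the mic is live; used to flip state to "recording"
 */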
@@ -1029,9 +2007,9 @@ var SpeechOSCore = class {
  async stopEdit() {
  state.setRecordingState("processing");
  try {
+ const backend = getBackend$1();
  const originalText = this._editOriginalText || "";
- const editedText = await livekit.requestEditText(originalText);
- transcriptStore.saveTranscript(editedText, "edit", originalText);
+ const editedText = await backend.requestEditText(originalText);
  state.completeRecording();
  if (this._editResolve) {
  this._editResolve(editedText);
@@ -1054,6 +2032,71 @@ var SpeechOSCore = class {
  }
  }
  /**
+ * One-shot command: connect, wait for agent, record voice, match against commands
+ * Automatically handles the full voice session lifecycle
+ *
+ * @param commands - Array of command definitions to match against
+ * @returns The matched command result or null if no match
+ */
+ async command(commands) {
+ this.ensureInitialized();
+ state.setActiveAction("command");
+ state.startRecording();
+ this._commandCommands = commands;
+ try {
+ const backend = getBackend$1();
+ await backend.startVoiceSession({
+ action: "command",
+ commands,
+ onMicReady: () => {
+ state.setRecordingState("recording");
+ }
+ });
+ return new Promise((resolve, reject) => {
+ this._commandResolve = resolve;
+ this._commandReject = reject;
+ });
+ } catch (error) {
+ state.setError(error instanceof Error ? error.message : "Failed to start command");
+ await this.cleanup();
+ throw error;
+ }
+ }
+ _commandCommands;
+ _commandResolve;
+ _commandReject;
+ /**
+ * Stop command recording and get the matched command
+ * Call this after command() when user stops speaking
+ */
+ async stopCommand() {
+ state.setRecordingState("processing");
+ try {
+ const backend = getBackend$1();
+ const commands = this._commandCommands || [];
+ const result = await backend.requestCommand(commands);
+ state.completeRecording();
+ if (this._commandResolve) {
+ this._commandResolve(result);
+ this._commandResolve = void 0;
+ this._commandReject = void 0;
+ }
+ return result;
+ } catch (error) {
+ const err = error instanceof Error ? error : new Error("Command request failed");
+ state.setError(err.message);
+ if (this._commandReject) {
+ this._commandReject(err);
+ this._commandResolve = void 0;
+ this._commandReject = void 0;
+ }
+ throw err;
+ } finally {
+ this._commandCommands = void 0;
+ await this.cleanup();
+ }
+ }
+ /**
  * Cancel the current operation
  */
  async cancel() {
@@ -1068,7 +2111,13 @@ var SpeechOSCore = class {
  this._editResolve = void 0;
  this._editReject = void 0;
  }
+ if (this._commandReject) {
+ this._commandReject(err);
+ this._commandResolve = void 0;
+ this._commandReject = void 0;
+ }
  this._editOriginalText = void 0;
+ this._commandCommands = void 0;
  await this.cleanup();
  state.cancelRecording();
  }
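The command()/stopCommand() pair added above follows the same resolve-later pattern as dictation and editing: command() returns a promise that only settles once stopCommand() (or cancel()) resolves it. A usage sketch follows, assuming a push-to-talk style trigger; the fields of a command definition and of the matched result are not specified in this diff, so the ones shown are placeholders.

// Usage sketch only (push-to-talk style). The command-definition fields and the
// matched-result shape below are placeholders - they are not specified in this diff.
const commands = [
  { id: "open_settings", description: "Open the settings page" },
  { id: "sign_out", description: "Sign the user out" }
];

let matchPromise = null;

// On push-to-talk press: start listening. command() resolves later,
// once stopCommand() or cancel() settles it.
function onTalkKeyDown() {
  matchPromise = speechOS.command(commands);
}

// On push-to-talk release: stop recording and wait for the match
// (null when nothing matched).
async function onTalkKeyUp() {
  await speechOS.stopCommand();
  const result = await matchPromise;
  console.log(result ? "Matched command:" : "No command matched", result);
}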
@@ -1095,7 +2144,8 @@ var SpeechOSCore = class {
  }
  async cleanup() {
  try {
- await livekit.disconnect();
+ const backend = getBackend$1();
+ await backend.disconnect();
  } catch (error) {
  const config = getConfig();
  if (config.debug) console.warn("[SpeechOS] Cleanup disconnect error:", error);
@@ -1111,6 +2161,9 @@ var SpeechOSCore = class {
  this._editResolve = void 0;
  this._editReject = void 0;
  this._editOriginalText = void 0;
+ this._commandResolve = void 0;
+ this._commandReject = void 0;
+ this._commandCommands = void 0;
  resetConfig();
  state.reset();
  events.clear();
@@ -1118,10 +2171,38 @@
  };
  const speechOS = new SpeechOSCore();
 
+ //#endregion
+ //#region src/backend.ts
+ /**
+ * WebSocket backend adapter - wraps the websocket module to match the VoiceBackend interface
+ */
+ const websocketBackend = {
+ startVoiceSession: (options) => websocket.startVoiceSession(options),
+ stopVoiceSession: () => websocket.stopVoiceSession(),
+ requestEditText: (text) => websocket.requestEditText(text),
+ requestCommand: (commands) => websocket.requestCommand(commands),
+ disconnect: () => websocket.disconnect(),
+ isConnected: () => websocket.isConnected(),
+ getLastInputText: () => websocket.getLastInputText(),
+ prefetchToken: () => Promise.resolve({}),
+ startAutoRefresh: () => {},
+ stopAutoRefresh: () => {},
+ invalidateTokenCache: () => {}
+ };
+ /**
+ * Get the active voice backend.
+ * Always returns WebSocket backend (LiveKit is legacy).
+ *
+ * @returns The websocket backend
+ */
+ function getBackend() {
+ return websocketBackend;
+ }
+
  //#endregion
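The adapter above is said to match a VoiceBackend interface, but that declaration is not included in this diff. The typedef below is only a reconstruction from the adapter's members, with return types inferred from how the methods are used elsewhere in this file; the token-related members are no-ops for the WebSocket backend, presumably retained for parity with the legacy LiveKit backend.

// Reconstruction only - the actual VoiceBackend declaration is not part of this diff.
/**
 * @typedef {Object} VoiceBackend
 * @property {(options) => Promise<void>} startVoiceSession
 * @property {() => Promise<string>} stopVoiceSession      resolves with the transcript
 * @property {(text) => Promise<string>} requestEditText   resolves with the edited text
 * @property {(commands) => Promise<any>} requestCommand   resolves with the match result or null
 * @property {() => Promise<void>} disconnect
 * @property {() => boolean} isConnected
 * @property {() => (string | undefined)} getLastInputText
 * @property {() => Promise<object>} prefetchToken         no-op here
 * @property {() => void} startAutoRefresh                 no-op here
 * @property {() => void} stopAutoRefresh                  no-op here
 * @property {() => void} invalidateTokenCache             no-op here
 */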
  //#region src/index.ts
  const VERSION = "0.1.0";
 
  //#endregion
- export { DEFAULT_HOST, Deferred, SpeechOSEventEmitter, VERSION, createStateManager, defaultConfig, events, getConfig, livekit, resetConfig, setConfig, speechOS, state, transcriptStore, updateUserId, validateConfig };
+ export { DEFAULT_HOST, Deferred, SpeechOSEventEmitter, VERSION, createStateManager, events, getBackend, getConfig, livekit, resetConfig, setConfig, speechOS, state, updateUserId, validateConfig, websocket };
  //# sourceMappingURL=index.js.map
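For quick reference, the 0.2.2 export surface can be consumed as named imports (assuming dist/index.js remains the package's module entry point). Note that transcriptStore and defaultConfig are no longer exported, getBackend and websocket are new, and the bundle's internal VERSION constant still reads "0.1.0", which does not track the published package version.

// Named imports available from the 0.2.2 bundle, taken from the export line above.
// Assumes dist/index.js is the package entry; transcriptStore and defaultConfig
// were dropped from the export list in this release.
import {
  DEFAULT_HOST, Deferred, SpeechOSEventEmitter, VERSION,
  createStateManager, events, getBackend, getConfig, livekit,
  resetConfig, setConfig, speechOS, state, updateUserId,
  validateConfig, websocket
} from "@speechos/core";

speechOS;     // singleton SDK instance (dictate/edit/command lifecycle)
getBackend(); // always the WebSocket-backed voice backend in this release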