@speechos/core 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  * Handles room connections, audio streaming, and transcription requests
4
4
  */
5
5
  import { Room } from "livekit-client";
6
- import type { LiveKitTokenResponse, ErrorSource } from "./types.js";
6
+ import type { LiveKitTokenResponse, ErrorSource, CommandDefinition, CommandResult, VoiceSessionOptions } from "./types.js";
7
7
  /**
8
8
  * A deferred promise with timeout support.
9
9
  * Encapsulates resolve/reject/timeout in a single object for cleaner async handling.
@@ -31,21 +31,38 @@ declare class LiveKitManager {
31
31
  private room;
32
32
  private tokenData;
33
33
  private micTrack;
34
+ private cachedTokenData;
35
+ private tokenCacheTimestamp;
36
+ private tokenPrefetchPromise;
37
+ private tokenRefreshTimer;
38
+ private autoRefreshEnabled;
34
39
  private pendingTranscript;
35
40
  private pendingEditText;
41
+ private pendingCommand;
36
42
  private pendingTrackSubscribed;
37
- private preWarmPromise;
38
43
  private editOriginalText;
44
+ private sessionSettings;
39
45
  /**
40
- * Pre-warm resources for faster connection
41
- * Call this when user shows intent (e.g., expands widget)
42
- * Only fetches token - mic permission is requested when user clicks Dictate
46
+ * Check if the cached token is still valid (within TTL)
43
47
  */
44
- preWarm(): Promise<void>;
48
+ private isCachedTokenValid;
49
+ /**
50
+ * Pre-fetch a LiveKit token for later use
51
+ * Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
52
+ * If a prefetch is already in progress, returns the existing promise.
53
+ * If a valid cached token exists, returns it immediately.
54
+ */
55
+ prefetchToken(): Promise<LiveKitTokenResponse>;
45
56
  /**
46
57
  * Fetch a LiveKit token from the backend
58
+ * Uses cached token if valid, otherwise fetches a fresh one.
59
+ * Includes language settings and user vocabulary which are stored in the VoiceSession.
47
60
  */
48
61
  fetchToken(): Promise<LiveKitTokenResponse>;
62
+ /**
63
+ * Internal method to fetch a fresh token from the server
64
+ */
65
+ private fetchTokenFromServer;
49
66
  /**
50
67
  * Connect to a LiveKit room (fresh connection each time)
51
68
  */
@@ -65,8 +82,13 @@ declare class LiveKitManager {
65
82
  private handleDataMessage;
66
83
  /**
67
84
  * Publish microphone audio track
85
+ * Uses the device ID from session settings if set
68
86
  */
69
87
  enableMicrophone(): Promise<void>;
88
+ /**
89
+ * Log information about the current microphone track
90
+ */
91
+ private logMicrophoneInfo;
70
92
  /**
71
93
  * Disable microphone audio track
72
94
  */
@@ -76,10 +98,24 @@ declare class LiveKitManager {
76
98
  */
77
99
  sendDataMessage(message: object): Promise<void>;
78
100
  /**
79
- * Start a voice session
80
- * Connects to room, enables microphone, and waits for agent to subscribe to our track
101
+ * Start a voice session with pre-connect audio buffering
102
+ * Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
103
+ * Agent subscription happens in the background - we don't block on it.
104
+ *
105
+ * @param options - Session options including action type and parameters
81
106
  */
82
- startVoiceSession(): Promise<void>;
107
+ startVoiceSession(options?: VoiceSessionOptions): Promise<void>;
108
+ /**
109
+ * Wait for the agent to subscribe to our audio track in the background
110
+ * Handles timeout errors without blocking the main flow
111
+ */
112
+ private waitForAgentSubscription;
113
+ /**
114
+ * Enable microphone with pre-connect buffering
115
+ * This starts capturing audio locally before the room is connected,
116
+ * buffering it until the connection is established.
117
+ */
118
+ private enableMicrophoneWithPreConnectBuffer;
83
119
  /**
84
120
  * Stop the voice session and request the transcript
85
121
  * Returns a promise that resolves with the transcript text
@@ -101,11 +137,47 @@ declare class LiveKitManager {
101
137
  * Alias for requestEditText - granular API naming
102
138
  */
103
139
  stopAndEdit(originalText: string): Promise<string>;
140
+ /**
141
+ * Request command matching using the transcript as input
142
+ * Sends command definitions to the backend, which matches the user's speech against them
143
+ * Returns a promise that resolves with the matched command or null if no match
144
+ * @throws Error if timeout occurs waiting for command result
145
+ */
146
+ requestCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
147
+ /**
148
+ * Alias for requestCommand - granular API naming
149
+ */
150
+ stopAndCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
104
151
  /**
105
152
  * Disconnect from the current room
106
153
  * Clears the token so a fresh one is fetched for the next session
107
154
  */
108
155
  disconnect(): Promise<void>;
156
+ /**
157
+ * Invalidate the cached token
158
+ * Call this when settings change that would affect the token (language, vocabulary)
159
+ */
160
+ invalidateTokenCache(): void;
161
+ /**
162
+ * Start auto-refreshing the token while the widget is expanded.
163
+ * Call this after a voice session completes to immediately fetch a fresh token
164
+ * (since each command requires its own token) and keep it fresh for subsequent commands.
165
+ */
166
+ startAutoRefresh(): void;
167
+ /**
168
+ * Stop auto-refreshing the token.
169
+ * Call this when the widget collapses or user navigates away.
170
+ */
171
+ stopAutoRefresh(): void;
172
+ /**
173
+ * Schedule a token refresh before the current cache expires.
174
+ * Handles computer sleep by checking elapsed time on each refresh attempt.
175
+ */
176
+ private scheduleTokenRefresh;
177
+ /**
178
+ * Perform the auto-refresh, handling computer sleep scenarios.
179
+ */
180
+ private performAutoRefresh;
109
181
  /**
110
182
  * Get the current room instance
111
183
  */
@@ -122,11 +194,6 @@ declare class LiveKitManager {
122
194
  * Check if microphone is enabled
123
195
  */
124
196
  isMicrophoneEnabled(): boolean;
125
- /**
126
- * Clear the cached token
127
- * Used when user identity changes to ensure next session gets a fresh token
128
- */
129
- clearToken(): void;
130
197
  }
131
198
  export declare const livekit: LiveKitManager;
132
199
  export {};
package/dist/livekit.d.ts CHANGED
@@ -3,7 +3,7 @@
3
3
  * Handles room connections, audio streaming, and transcription requests
4
4
  */
5
5
  import { Room } from "livekit-client";
6
- import type { LiveKitTokenResponse, ErrorSource } from "./types.js";
6
+ import type { LiveKitTokenResponse, ErrorSource, CommandDefinition, CommandResult, VoiceSessionOptions } from "./types.js";
7
7
  /**
8
8
  * A deferred promise with timeout support.
9
9
  * Encapsulates resolve/reject/timeout in a single object for cleaner async handling.
@@ -31,21 +31,38 @@ declare class LiveKitManager {
31
31
  private room;
32
32
  private tokenData;
33
33
  private micTrack;
34
+ private cachedTokenData;
35
+ private tokenCacheTimestamp;
36
+ private tokenPrefetchPromise;
37
+ private tokenRefreshTimer;
38
+ private autoRefreshEnabled;
34
39
  private pendingTranscript;
35
40
  private pendingEditText;
41
+ private pendingCommand;
36
42
  private pendingTrackSubscribed;
37
- private preWarmPromise;
38
43
  private editOriginalText;
44
+ private sessionSettings;
39
45
  /**
40
- * Pre-warm resources for faster connection
41
- * Call this when user shows intent (e.g., expands widget)
42
- * Only fetches token - mic permission is requested when user clicks Dictate
46
+ * Check if the cached token is still valid (within TTL)
43
47
  */
44
- preWarm(): Promise<void>;
48
+ private isCachedTokenValid;
49
+ /**
50
+ * Pre-fetch a LiveKit token for later use
51
+ * Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
52
+ * If a prefetch is already in progress, returns the existing promise.
53
+ * If a valid cached token exists, returns it immediately.
54
+ */
55
+ prefetchToken(): Promise<LiveKitTokenResponse>;
45
56
  /**
46
57
  * Fetch a LiveKit token from the backend
58
+ * Uses cached token if valid, otherwise fetches a fresh one.
59
+ * Includes language settings and user vocabulary which are stored in the VoiceSession.
47
60
  */
48
61
  fetchToken(): Promise<LiveKitTokenResponse>;
62
+ /**
63
+ * Internal method to fetch a fresh token from the server
64
+ */
65
+ private fetchTokenFromServer;
49
66
  /**
50
67
  * Connect to a LiveKit room (fresh connection each time)
51
68
  */
@@ -65,8 +82,13 @@ declare class LiveKitManager {
65
82
  private handleDataMessage;
66
83
  /**
67
84
  * Publish microphone audio track
85
+ * Uses the device ID from session settings if set
68
86
  */
69
87
  enableMicrophone(): Promise<void>;
88
+ /**
89
+ * Log information about the current microphone track
90
+ */
91
+ private logMicrophoneInfo;
70
92
  /**
71
93
  * Disable microphone audio track
72
94
  */
@@ -76,10 +98,24 @@ declare class LiveKitManager {
76
98
  */
77
99
  sendDataMessage(message: object): Promise<void>;
78
100
  /**
79
- * Start a voice session
80
- * Connects to room, enables microphone, and waits for agent to subscribe to our track
101
+ * Start a voice session with pre-connect audio buffering
102
+ * Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
103
+ * Agent subscription happens in the background - we don't block on it.
104
+ *
105
+ * @param options - Session options including action type and parameters
81
106
  */
82
- startVoiceSession(): Promise<void>;
107
+ startVoiceSession(options?: VoiceSessionOptions): Promise<void>;
108
+ /**
109
+ * Wait for the agent to subscribe to our audio track in the background
110
+ * Handles timeout errors without blocking the main flow
111
+ */
112
+ private waitForAgentSubscription;
113
+ /**
114
+ * Enable microphone with pre-connect buffering
115
+ * This starts capturing audio locally before the room is connected,
116
+ * buffering it until the connection is established.
117
+ */
118
+ private enableMicrophoneWithPreConnectBuffer;
83
119
  /**
84
120
  * Stop the voice session and request the transcript
85
121
  * Returns a promise that resolves with the transcript text
@@ -101,11 +137,47 @@ declare class LiveKitManager {
101
137
  * Alias for requestEditText - granular API naming
102
138
  */
103
139
  stopAndEdit(originalText: string): Promise<string>;
140
+ /**
141
+ * Request command matching using the transcript as input
142
+ * Sends command definitions to the backend, which matches the user's speech against them
143
+ * Returns a promise that resolves with the matched command or null if no match
144
+ * @throws Error if timeout occurs waiting for command result
145
+ */
146
+ requestCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
147
+ /**
148
+ * Alias for requestCommand - granular API naming
149
+ */
150
+ stopAndCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
104
151
  /**
105
152
  * Disconnect from the current room
106
153
  * Clears the token so a fresh one is fetched for the next session
107
154
  */
108
155
  disconnect(): Promise<void>;
156
+ /**
157
+ * Invalidate the cached token
158
+ * Call this when settings change that would affect the token (language, vocabulary)
159
+ */
160
+ invalidateTokenCache(): void;
161
+ /**
162
+ * Start auto-refreshing the token while the widget is expanded.
163
+ * Call this after a voice session completes to immediately fetch a fresh token
164
+ * (since each command requires its own token) and keep it fresh for subsequent commands.
165
+ */
166
+ startAutoRefresh(): void;
167
+ /**
168
+ * Stop auto-refreshing the token.
169
+ * Call this when the widget collapses or user navigates away.
170
+ */
171
+ stopAutoRefresh(): void;
172
+ /**
173
+ * Schedule a token refresh before the current cache expires.
174
+ * Handles computer sleep by checking elapsed time on each refresh attempt.
175
+ */
176
+ private scheduleTokenRefresh;
177
+ /**
178
+ * Perform the auto-refresh, handling computer sleep scenarios.
179
+ */
180
+ private performAutoRefresh;
109
181
  /**
110
182
  * Get the current room instance
111
183
  */
@@ -122,11 +194,6 @@ declare class LiveKitManager {
122
194
  * Check if microphone is enabled
123
195
  */
124
196
  isMicrophoneEnabled(): boolean;
125
- /**
126
- * Clear the cached token
127
- * Used when user identity changes to ensure next session gets a fresh token
128
- */
129
- clearToken(): void;
130
197
  }
131
198
  export declare const livekit: LiveKitManager;
132
199
  export {};
@@ -4,7 +4,7 @@
4
4
  * Provides both low-level and high-level APIs for voice interaction.
5
5
  * This is the main entry point for headless usage of SpeechOS.
6
6
  */
7
- import type { SpeechOSConfig } from "./types.js";
7
+ import type { SpeechOSCoreConfig, CommandDefinition, CommandResult } from "./types.js";
8
8
  import { state } from "./state.js";
9
9
  import { events } from "./events.js";
10
10
  /**
@@ -20,7 +20,7 @@ declare class SpeechOSCore {
20
20
  * Initialize the SDK with configuration
21
21
  * @param config - Configuration options including apiKey
22
22
  */
23
- init(config: SpeechOSConfig): void;
23
+ init(config: SpeechOSCoreConfig): void;
24
24
  /**
25
25
  * Check if the SDK is initialized
26
26
  */
@@ -84,6 +84,22 @@ declare class SpeechOSCore {
84
84
  * Call this after edit() when user stops speaking
85
85
  */
86
86
  stopEdit(): Promise<string>;
87
+ /**
88
+ * One-shot command: connect, wait for agent, record voice, match against commands
89
+ * Automatically handles the full voice session lifecycle
90
+ *
91
+ * @param commands - Array of command definitions to match against
92
+ * @returns The matched command result or null if no match
93
+ */
94
+ command(commands: CommandDefinition[]): Promise<CommandResult | null>;
95
+ private _commandCommands?;
96
+ private _commandResolve?;
97
+ private _commandReject?;
98
+ /**
99
+ * Stop command recording and get the matched command
100
+ * Call this after command() when user stops speaking
101
+ */
102
+ stopCommand(): Promise<CommandResult | null>;
87
103
  /**
88
104
  * Cancel the current operation
89
105
  */
@@ -99,7 +115,7 @@ declare class SpeechOSCore {
99
115
  /**
100
116
  * Get the current config
101
117
  */
102
- getConfig(): SpeechOSConfig;
118
+ getConfig(): SpeechOSCoreConfig;
103
119
  private ensureInitialized;
104
120
  private cleanup;
105
121
  /**
@@ -4,7 +4,7 @@
4
4
  * Provides both low-level and high-level APIs for voice interaction.
5
5
  * This is the main entry point for headless usage of SpeechOS.
6
6
  */
7
- import type { SpeechOSConfig } from "./types.js";
7
+ import type { SpeechOSCoreConfig, CommandDefinition, CommandResult } from "./types.js";
8
8
  import { state } from "./state.js";
9
9
  import { events } from "./events.js";
10
10
  /**
@@ -20,7 +20,7 @@ declare class SpeechOSCore {
20
20
  * Initialize the SDK with configuration
21
21
  * @param config - Configuration options including apiKey
22
22
  */
23
- init(config: SpeechOSConfig): void;
23
+ init(config: SpeechOSCoreConfig): void;
24
24
  /**
25
25
  * Check if the SDK is initialized
26
26
  */
@@ -84,6 +84,22 @@ declare class SpeechOSCore {
84
84
  * Call this after edit() when user stops speaking
85
85
  */
86
86
  stopEdit(): Promise<string>;
87
+ /**
88
+ * One-shot command: connect, wait for agent, record voice, match against commands
89
+ * Automatically handles the full voice session lifecycle
90
+ *
91
+ * @param commands - Array of command definitions to match against
92
+ * @returns The matched command result or null if no match
93
+ */
94
+ command(commands: CommandDefinition[]): Promise<CommandResult | null>;
95
+ private _commandCommands?;
96
+ private _commandResolve?;
97
+ private _commandReject?;
98
+ /**
99
+ * Stop command recording and get the matched command
100
+ * Call this after command() when user stops speaking
101
+ */
102
+ stopCommand(): Promise<CommandResult | null>;
87
103
  /**
88
104
  * Cancel the current operation
89
105
  */
@@ -99,7 +115,7 @@ declare class SpeechOSCore {
99
115
  /**
100
116
  * Get the current config
101
117
  */
102
- getConfig(): SpeechOSConfig;
118
+ getConfig(): SpeechOSCoreConfig;
103
119
  private ensureInitialized;
104
120
  private cleanup;
105
121
  /**
package/dist/state.d.cts CHANGED
@@ -9,9 +9,12 @@ import type { SpeechOSState, StateChangeCallback, UnsubscribeFn } from "./types.
9
9
  declare class StateManager {
10
10
  private state;
11
11
  private subscribers;
12
+ /** Cached immutable snapshot for useSyncExternalStore compatibility */
13
+ private snapshot;
12
14
  constructor(initialState: SpeechOSState);
13
15
  /**
14
- * Get the current state (returns a copy to prevent mutations)
16
+ * Get the current state snapshot (returns a stable reference for React)
17
+ * This returns an immutable frozen object that only changes when setState is called.
15
18
  */
16
19
  getState(): SpeechOSState;
17
20
  /**
package/dist/state.d.ts CHANGED
@@ -9,9 +9,12 @@ import type { SpeechOSState, StateChangeCallback, UnsubscribeFn } from "./types.
9
9
  declare class StateManager {
10
10
  private state;
11
11
  private subscribers;
12
+ /** Cached immutable snapshot for useSyncExternalStore compatibility */
13
+ private snapshot;
12
14
  constructor(initialState: SpeechOSState);
13
15
  /**
14
- * Get the current state (returns a copy to prevent mutations)
16
+ * Get the current state snapshot (returns a stable reference for React)
17
+ * This returns an immutable frozen object that only changes when setState is called.
15
18
  */
16
19
  getState(): SpeechOSState;
17
20
  /**
package/dist/types.d.cts CHANGED
@@ -18,22 +18,60 @@ export interface ServerErrorMessage {
18
18
  */
19
19
  export type ErrorSource = "init" | "connection" | "timeout" | "server";
20
20
  /**
21
- * Configuration options for initializing SpeechOS
21
+ * Backend type for voice sessions
22
+ * - 'websocket': Direct WebSocket connection (lower latency, recommended)
23
+ * - 'livekit': LiveKit WebRTC connection (legacy)
22
24
  */
23
- export interface SpeechOSConfig {
24
- /** API key for authentication with SpeechOS backend */
25
- apiKey?: string;
25
+ export type VoiceBackend = "websocket" | "livekit";
26
+ /**
27
+ * Configuration options for initializing SpeechOS Core
28
+ */
29
+ export interface SpeechOSCoreConfig {
30
+ /** API key for authentication with SpeechOS backend (required) */
31
+ apiKey: string;
26
32
  /** Optional user identifier for tracking which end user is using the SDK */
27
33
  userId?: string;
28
34
  /** Backend host URL for API calls (default: https://app.speechos.ai) */
29
35
  host?: string;
30
- /** Position of the widget on screen (used by client package) */
31
- position?: "bottom-center" | "bottom-right" | "bottom-left";
32
- /** Custom z-index for widget overlay (used by client package) */
33
- zIndex?: number;
34
36
  /** Enable debug logging */
35
37
  debug?: boolean;
36
38
  }
39
+ /**
40
+ * Session settings passed when starting a voice session
41
+ * Contains user preferences for transcription and processing
42
+ */
43
+ export interface SessionSettings {
44
+ /** Input language code for speech recognition (e.g., "en-US", "es", "fr") */
45
+ inputLanguageCode?: string;
46
+ /** Output language code for transcription formatting */
47
+ outputLanguageCode?: string;
48
+ /** Whether to apply AI formatting (removes filler words, adds punctuation) */
49
+ smartFormat?: boolean;
50
+ /** Custom vocabulary terms to improve transcription accuracy */
51
+ vocabulary?: string[];
52
+ /** Text snippets with trigger phrases that expand to full text */
53
+ snippets?: Array<{
54
+ trigger: string;
55
+ expansion: string;
56
+ }>;
57
+ /** Audio input device ID (empty string for system default) */
58
+ audioDeviceId?: string;
59
+ }
60
+ /**
61
+ * Options for starting a voice session
62
+ */
63
+ export interface VoiceSessionOptions {
64
+ /** Callback when microphone is ready and capturing */
65
+ onMicReady?: () => void;
66
+ /** Action type for this session */
67
+ action?: SpeechOSAction;
68
+ /** Text to edit (for edit action) */
69
+ inputText?: string;
70
+ /** Command definitions (for command action) */
71
+ commands?: CommandDefinition[];
72
+ /** User settings for this session */
73
+ settings?: SessionSettings;
74
+ }
37
75
  /**
38
76
  * LiveKit token response from the backend
39
77
  */
@@ -43,10 +81,59 @@ export interface LiveKitTokenResponse {
43
81
  room: string;
44
82
  identity: string;
45
83
  }
84
+ /**
85
+ * User vocabulary data sent with transcription/edit requests
86
+ * Includes custom vocabulary terms for improved transcription accuracy
87
+ * and text snippets that can be expanded from trigger phrases
88
+ */
89
+ export interface UserVocabularyData {
90
+ /** Custom vocabulary terms to improve transcription of domain-specific words */
91
+ vocabulary: string[];
92
+ /** Text snippets with trigger phrases that expand to full text */
93
+ snippets: Array<{
94
+ /** Short trigger phrase the user speaks */
95
+ trigger: string;
96
+ /** Full text to expand the trigger into */
97
+ expansion: string;
98
+ }>;
99
+ }
46
100
  /**
47
101
  * Available actions that can be triggered from the widget
48
102
  */
49
- export type SpeechOSAction = "dictate" | "edit";
103
+ export type SpeechOSAction = "dictate" | "edit" | "command";
104
+ /**
105
+ * Definition of a command argument
106
+ */
107
+ export interface CommandArgument {
108
+ /** Name of the argument (used as key in the result) */
109
+ name: string;
110
+ /** Description of what this argument represents */
111
+ description: string;
112
+ /** Type of the argument value */
113
+ type?: "string" | "number" | "integer" | "boolean";
114
+ /** Whether this argument is required (default: true) */
115
+ required?: boolean;
116
+ }
117
+ /**
118
+ * Definition of a command that can be matched
119
+ */
120
+ export interface CommandDefinition {
121
+ /** Unique name/identifier for the command */
122
+ name: string;
123
+ /** Description of what this command does (helps LLM match intent) */
124
+ description: string;
125
+ /** Arguments that can be extracted from the user's speech */
126
+ arguments?: CommandArgument[];
127
+ }
128
+ /**
129
+ * Result of a successful command match
130
+ */
131
+ export interface CommandResult {
132
+ /** Name of the matched command */
133
+ name: string;
134
+ /** Extracted argument values */
135
+ arguments: Record<string, unknown>;
136
+ }
50
137
  /**
51
138
  * Recording/dictation states
52
139
  */
@@ -109,6 +196,10 @@ export interface SpeechOSEventMap {
109
196
  text: string;
110
197
  originalText: string;
111
198
  };
199
+ /** Emitted when command matching completes (null if no command matched) */
200
+ "command:complete": {
201
+ command: CommandResult | null;
202
+ };
112
203
  /** Emitted when transcribed text is inserted into a form field */
113
204
  "transcription:inserted": {
114
205
  text: string;
@@ -120,6 +211,11 @@ export interface SpeechOSEventMap {
120
211
  editedContent: string;
121
212
  element: HTMLElement;
122
213
  };
214
+ /** Emitted when user settings change (language, snippets, vocabulary, smartFormat) */
215
+ "settings:changed": {
216
+ /** Type of setting that changed */
217
+ setting: "language" | "snippets" | "vocabulary" | "smartFormat";
218
+ };
123
219
  /** Emitted when an error occurs */
124
220
  error: {
125
221
  code: string;