npm - @speechos/core - Versions diffs - 0.2.0 → 0.2.3 - Mend

@speechos/core 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

package/dist/audio-capture.d.cts +130 -0
package/dist/audio-capture.d.ts +130 -0
package/dist/backend.d.cts +41 -0
package/dist/backend.d.ts +41 -0
package/dist/config.d.cts +23 -7
package/dist/config.d.ts +23 -7
package/dist/index.cjs +1263 -158
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +6 -5
package/dist/index.d.ts +6 -5
package/dist/index.js +1262 -157
package/dist/index.js.map +1 -1
package/dist/livekit.d.cts +81 -14
package/dist/livekit.d.ts +81 -14
package/dist/speechos.d.cts +19 -3
package/dist/speechos.d.ts +19 -3
package/dist/state.d.cts +4 -1
package/dist/state.d.ts +4 -1
package/dist/types.d.cts +105 -9
package/dist/types.d.ts +105 -9
package/dist/websocket.d.cts +133 -0
package/dist/websocket.d.ts +133 -0
package/package.json +5 -4
package/dist/transcript-store.d.cts +0 -35
package/dist/transcript-store.d.ts +0 -35

package/dist/livekit.d.cts CHANGED Viewed

@@ -3,7 +3,7 @@
  * Handles room connections, audio streaming, and transcription requests
  */
 import { Room } from "livekit-client";
-import type { LiveKitTokenResponse, ErrorSource } from "./types.js";
+import type { LiveKitTokenResponse, ErrorSource, CommandDefinition, CommandResult, VoiceSessionOptions } from "./types.js";
 /**
  * A deferred promise with timeout support.
  * Encapsulates resolve/reject/timeout in a single object for cleaner async handling.
@@ -31,21 +31,38 @@ declare class LiveKitManager {
     private room;
     private tokenData;
     private micTrack;
+    private cachedTokenData;
+    private tokenCacheTimestamp;
+    private tokenPrefetchPromise;
+    private tokenRefreshTimer;
+    private autoRefreshEnabled;
     private pendingTranscript;
     private pendingEditText;
+    private pendingCommand;
     private pendingTrackSubscribed;
-    private preWarmPromise;
     private editOriginalText;
+    private sessionSettings;
     /**
-     * Pre-warm resources for faster connection
-     * Call this when user shows intent (e.g., expands widget)
-     * Only fetches token - mic permission is requested when user clicks Dictate
+     * Check if the cached token is still valid (within TTL)
      */
-    preWarm(): Promise<void>;
+    private isCachedTokenValid;
+    /**
+     * Pre-fetch a LiveKit token for later use
+     * Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
+     * If a prefetch is already in progress, returns the existing promise.
+     * If a valid cached token exists, returns it immediately.
+     */
+    prefetchToken(): Promise<LiveKitTokenResponse>;
     /**
      * Fetch a LiveKit token from the backend
+     * Uses cached token if valid, otherwise fetches a fresh one.
+     * Includes language settings and user vocabulary which are stored in the VoiceSession.
      */
     fetchToken(): Promise<LiveKitTokenResponse>;
+    /**
+     * Internal method to fetch a fresh token from the server
+     */
+    private fetchTokenFromServer;
     /**
      * Connect to a LiveKit room (fresh connection each time)
      */
@@ -65,8 +82,13 @@ declare class LiveKitManager {
     private handleDataMessage;
     /**
      * Publish microphone audio track
+     * Uses the device ID from session settings if set
      */
     enableMicrophone(): Promise<void>;
+    /**
+     * Log information about the current microphone track
+     */
+    private logMicrophoneInfo;
     /**
      * Disable microphone audio track
      */
@@ -76,10 +98,24 @@ declare class LiveKitManager {
      */
     sendDataMessage(message: object): Promise<void>;
     /**
-     * Start a voice session
-     * Connects to room, enables microphone, and waits for agent to subscribe to our track
+     * Start a voice session with pre-connect audio buffering
+     * Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
+     * Agent subscription happens in the background - we don't block on it.
+     *
+     * @param options - Session options including action type and parameters
      */
-    startVoiceSession(): Promise<void>;
+    startVoiceSession(options?: VoiceSessionOptions): Promise<void>;
+    /**
+     * Wait for the agent to subscribe to our audio track in the background
+     * Handles timeout errors without blocking the main flow
+     */
+    private waitForAgentSubscription;
+    /**
+     * Enable microphone with pre-connect buffering
+     * This starts capturing audio locally before the room is connected,
+     * buffering it until the connection is established.
+     */
+    private enableMicrophoneWithPreConnectBuffer;
     /**
      * Stop the voice session and request the transcript
      * Returns a promise that resolves with the transcript text
@@ -101,11 +137,47 @@ declare class LiveKitManager {
      * Alias for requestEditText - granular API naming
      */
     stopAndEdit(originalText: string): Promise<string>;
+    /**
+     * Request command matching using the transcript as input
+     * Sends command definitions to the backend, which matches the user's speech against them
+     * Returns a promise that resolves with the matched command or null if no match
+     * @throws Error if timeout occurs waiting for command result
+     */
+    requestCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
+    /**
+     * Alias for requestCommand - granular API naming
+     */
+    stopAndCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
     /**
      * Disconnect from the current room
      * Clears the token so a fresh one is fetched for the next session
      */
     disconnect(): Promise<void>;
+    /**
+     * Invalidate the cached token
+     * Call this when settings change that would affect the token (language, vocabulary)
+     */
+    invalidateTokenCache(): void;
+    /**
+     * Start auto-refreshing the token while the widget is expanded.
+     * Call this after a voice session completes to immediately fetch a fresh token
+     * (since each command requires its own token) and keep it fresh for subsequent commands.
+     */
+    startAutoRefresh(): void;
+    /**
+     * Stop auto-refreshing the token.
+     * Call this when the widget collapses or user navigates away.
+     */
+    stopAutoRefresh(): void;
+    /**
+     * Schedule a token refresh before the current cache expires.
+     * Handles computer sleep by checking elapsed time on each refresh attempt.
+     */
+    private scheduleTokenRefresh;
+    /**
+     * Perform the auto-refresh, handling computer sleep scenarios.
+     */
+    private performAutoRefresh;
     /**
      * Get the current room instance
      */
@@ -122,11 +194,6 @@ declare class LiveKitManager {
      * Check if microphone is enabled
      */
     isMicrophoneEnabled(): boolean;
-    /**
-     * Clear the cached token
-     * Used when user identity changes to ensure next session gets a fresh token
-     */
-    clearToken(): void;
 }
 export declare const livekit: LiveKitManager;
 export {};

package/dist/livekit.d.ts CHANGED Viewed

@@ -3,7 +3,7 @@
  * Handles room connections, audio streaming, and transcription requests
  */
 import { Room } from "livekit-client";
-import type { LiveKitTokenResponse, ErrorSource } from "./types.js";
+import type { LiveKitTokenResponse, ErrorSource, CommandDefinition, CommandResult, VoiceSessionOptions } from "./types.js";
 /**
  * A deferred promise with timeout support.
  * Encapsulates resolve/reject/timeout in a single object for cleaner async handling.
@@ -31,21 +31,38 @@ declare class LiveKitManager {
     private room;
     private tokenData;
     private micTrack;
+    private cachedTokenData;
+    private tokenCacheTimestamp;
+    private tokenPrefetchPromise;
+    private tokenRefreshTimer;
+    private autoRefreshEnabled;
     private pendingTranscript;
     private pendingEditText;
+    private pendingCommand;
     private pendingTrackSubscribed;
-    private preWarmPromise;
     private editOriginalText;
+    private sessionSettings;
     /**
-     * Pre-warm resources for faster connection
-     * Call this when user shows intent (e.g., expands widget)
-     * Only fetches token - mic permission is requested when user clicks Dictate
+     * Check if the cached token is still valid (within TTL)
      */
-    preWarm(): Promise<void>;
+    private isCachedTokenValid;
+    /**
+     * Pre-fetch a LiveKit token for later use
+     * Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
+     * If a prefetch is already in progress, returns the existing promise.
+     * If a valid cached token exists, returns it immediately.
+     */
+    prefetchToken(): Promise<LiveKitTokenResponse>;
     /**
      * Fetch a LiveKit token from the backend
+     * Uses cached token if valid, otherwise fetches a fresh one.
+     * Includes language settings and user vocabulary which are stored in the VoiceSession.
      */
     fetchToken(): Promise<LiveKitTokenResponse>;
+    /**
+     * Internal method to fetch a fresh token from the server
+     */
+    private fetchTokenFromServer;
     /**
      * Connect to a LiveKit room (fresh connection each time)
      */
@@ -65,8 +82,13 @@ declare class LiveKitManager {
     private handleDataMessage;
     /**
      * Publish microphone audio track
+     * Uses the device ID from session settings if set
      */
     enableMicrophone(): Promise<void>;
+    /**
+     * Log information about the current microphone track
+     */
+    private logMicrophoneInfo;
     /**
      * Disable microphone audio track
      */
@@ -76,10 +98,24 @@ declare class LiveKitManager {
      */
     sendDataMessage(message: object): Promise<void>;
     /**
-     * Start a voice session
-     * Connects to room, enables microphone, and waits for agent to subscribe to our track
+     * Start a voice session with pre-connect audio buffering
+     * Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
+     * Agent subscription happens in the background - we don't block on it.
+     *
+     * @param options - Session options including action type and parameters
      */
-    startVoiceSession(): Promise<void>;
+    startVoiceSession(options?: VoiceSessionOptions): Promise<void>;
+    /**
+     * Wait for the agent to subscribe to our audio track in the background
+     * Handles timeout errors without blocking the main flow
+     */
+    private waitForAgentSubscription;
+    /**
+     * Enable microphone with pre-connect buffering
+     * This starts capturing audio locally before the room is connected,
+     * buffering it until the connection is established.
+     */
+    private enableMicrophoneWithPreConnectBuffer;
     /**
      * Stop the voice session and request the transcript
      * Returns a promise that resolves with the transcript text
@@ -101,11 +137,47 @@ declare class LiveKitManager {
      * Alias for requestEditText - granular API naming
      */
     stopAndEdit(originalText: string): Promise<string>;
+    /**
+     * Request command matching using the transcript as input
+     * Sends command definitions to the backend, which matches the user's speech against them
+     * Returns a promise that resolves with the matched command or null if no match
+     * @throws Error if timeout occurs waiting for command result
+     */
+    requestCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
+    /**
+     * Alias for requestCommand - granular API naming
+     */
+    stopAndCommand(commands: CommandDefinition[]): Promise<CommandResult | null>;
     /**
      * Disconnect from the current room
      * Clears the token so a fresh one is fetched for the next session
      */
     disconnect(): Promise<void>;
+    /**
+     * Invalidate the cached token
+     * Call this when settings change that would affect the token (language, vocabulary)
+     */
+    invalidateTokenCache(): void;
+    /**
+     * Start auto-refreshing the token while the widget is expanded.
+     * Call this after a voice session completes to immediately fetch a fresh token
+     * (since each command requires its own token) and keep it fresh for subsequent commands.
+     */
+    startAutoRefresh(): void;
+    /**
+     * Stop auto-refreshing the token.
+     * Call this when the widget collapses or user navigates away.
+     */
+    stopAutoRefresh(): void;
+    /**
+     * Schedule a token refresh before the current cache expires.
+     * Handles computer sleep by checking elapsed time on each refresh attempt.
+     */
+    private scheduleTokenRefresh;
+    /**
+     * Perform the auto-refresh, handling computer sleep scenarios.
+     */
+    private performAutoRefresh;
     /**
      * Get the current room instance
      */
@@ -122,11 +194,6 @@ declare class LiveKitManager {
      * Check if microphone is enabled
      */
     isMicrophoneEnabled(): boolean;
-    /**
-     * Clear the cached token
-     * Used when user identity changes to ensure next session gets a fresh token
-     */
-    clearToken(): void;
 }
 export declare const livekit: LiveKitManager;
 export {};

package/dist/speechos.d.cts CHANGED Viewed

@@ -4,7 +4,7 @@
  * Provides both low-level and high-level APIs for voice interaction.
  * This is the main entry point for headless usage of SpeechOS.
  */
-import type { SpeechOSConfig } from "./types.js";
+import type { SpeechOSCoreConfig, CommandDefinition, CommandResult } from "./types.js";
 import { state } from "./state.js";
 import { events } from "./events.js";
 /**
@@ -20,7 +20,7 @@ declare class SpeechOSCore {
      * Initialize the SDK with configuration
      * @param config - Configuration options including apiKey
      */
-    init(config: SpeechOSConfig): void;
+    init(config: SpeechOSCoreConfig): void;
     /**
      * Check if the SDK is initialized
      */
@@ -84,6 +84,22 @@ declare class SpeechOSCore {
      * Call this after edit() when user stops speaking
      */
     stopEdit(): Promise<string>;
+    /**
+     * One-shot command: connect, wait for agent, record voice, match against commands
+     * Automatically handles the full voice session lifecycle
+     *
+     * @param commands - Array of command definitions to match against
+     * @returns The matched command result or null if no match
+     */
+    command(commands: CommandDefinition[]): Promise<CommandResult | null>;
+    private _commandCommands?;
+    private _commandResolve?;
+    private _commandReject?;
+    /**
+     * Stop command recording and get the matched command
+     * Call this after command() when user stops speaking
+     */
+    stopCommand(): Promise<CommandResult | null>;
     /**
      * Cancel the current operation
      */
@@ -99,7 +115,7 @@ declare class SpeechOSCore {
     /**
      * Get the current config
      */
-    getConfig(): SpeechOSConfig;
+    getConfig(): SpeechOSCoreConfig;
     private ensureInitialized;
     private cleanup;
     /**

package/dist/speechos.d.ts CHANGED Viewed

@@ -4,7 +4,7 @@
  * Provides both low-level and high-level APIs for voice interaction.
  * This is the main entry point for headless usage of SpeechOS.
  */
-import type { SpeechOSConfig } from "./types.js";
+import type { SpeechOSCoreConfig, CommandDefinition, CommandResult } from "./types.js";
 import { state } from "./state.js";
 import { events } from "./events.js";
 /**
@@ -20,7 +20,7 @@ declare class SpeechOSCore {
      * Initialize the SDK with configuration
      * @param config - Configuration options including apiKey
      */
-    init(config: SpeechOSConfig): void;
+    init(config: SpeechOSCoreConfig): void;
     /**
      * Check if the SDK is initialized
      */
@@ -84,6 +84,22 @@ declare class SpeechOSCore {
      * Call this after edit() when user stops speaking
      */
     stopEdit(): Promise<string>;
+    /**
+     * One-shot command: connect, wait for agent, record voice, match against commands
+     * Automatically handles the full voice session lifecycle
+     *
+     * @param commands - Array of command definitions to match against
+     * @returns The matched command result or null if no match
+     */
+    command(commands: CommandDefinition[]): Promise<CommandResult | null>;
+    private _commandCommands?;
+    private _commandResolve?;
+    private _commandReject?;
+    /**
+     * Stop command recording and get the matched command
+     * Call this after command() when user stops speaking
+     */
+    stopCommand(): Promise<CommandResult | null>;
     /**
      * Cancel the current operation
      */
@@ -99,7 +115,7 @@ declare class SpeechOSCore {
     /**
      * Get the current config
      */
-    getConfig(): SpeechOSConfig;
+    getConfig(): SpeechOSCoreConfig;
     private ensureInitialized;
     private cleanup;
     /**

package/dist/state.d.cts CHANGED Viewed

@@ -9,9 +9,12 @@ import type { SpeechOSState, StateChangeCallback, UnsubscribeFn } from "./types.
 declare class StateManager {
     private state;
     private subscribers;
+    /** Cached immutable snapshot for useSyncExternalStore compatibility */
+    private snapshot;
     constructor(initialState: SpeechOSState);
     /**
-     * Get the current state (returns a copy to prevent mutations)
+     * Get the current state snapshot (returns a stable reference for React)
+     * This returns an immutable frozen object that only changes when setState is called.
      */
     getState(): SpeechOSState;
     /**

package/dist/state.d.ts CHANGED Viewed

@@ -9,9 +9,12 @@ import type { SpeechOSState, StateChangeCallback, UnsubscribeFn } from "./types.
 declare class StateManager {
     private state;
     private subscribers;
+    /** Cached immutable snapshot for useSyncExternalStore compatibility */
+    private snapshot;
     constructor(initialState: SpeechOSState);
     /**
-     * Get the current state (returns a copy to prevent mutations)
+     * Get the current state snapshot (returns a stable reference for React)
+     * This returns an immutable frozen object that only changes when setState is called.
      */
     getState(): SpeechOSState;
     /**

package/dist/types.d.cts CHANGED Viewed

@@ -18,22 +18,60 @@ export interface ServerErrorMessage {
  */
 export type ErrorSource = "init" | "connection" | "timeout" | "server";
 /**
- * Configuration options for initializing SpeechOS
+ * Backend type for voice sessions
+ * - 'websocket': Direct WebSocket connection (lower latency, recommended)
+ * - 'livekit': LiveKit WebRTC connection (legacy)
  */
-export interface SpeechOSConfig {
-    /** API key for authentication with SpeechOS backend */
-    apiKey?: string;
+export type VoiceBackend = "websocket" | "livekit";
+/**
+ * Configuration options for initializing SpeechOS Core
+ */
+export interface SpeechOSCoreConfig {
+    /** API key for authentication with SpeechOS backend (required) */
+    apiKey: string;
     /** Optional user identifier for tracking which end user is using the SDK */
     userId?: string;
     /** Backend host URL for API calls (default: https://app.speechos.ai) */
     host?: string;
-    /** Position of the widget on screen (used by client package) */
-    position?: "bottom-center" | "bottom-right" | "bottom-left";
-    /** Custom z-index for widget overlay (used by client package) */
-    zIndex?: number;
     /** Enable debug logging */
     debug?: boolean;
 }
+/**
+ * Session settings passed when starting a voice session
+ * Contains user preferences for transcription and processing
+ */
+export interface SessionSettings {
+    /** Input language code for speech recognition (e.g., "en-US", "es", "fr") */
+    inputLanguageCode?: string;
+    /** Output language code for transcription formatting */
+    outputLanguageCode?: string;
+    /** Whether to apply AI formatting (removes filler words, adds punctuation) */
+    smartFormat?: boolean;
+    /** Custom vocabulary terms to improve transcription accuracy */
+    vocabulary?: string[];
+    /** Text snippets with trigger phrases that expand to full text */
+    snippets?: Array<{
+        trigger: string;
+        expansion: string;
+    }>;
+    /** Audio input device ID (empty string for system default) */
+    audioDeviceId?: string;
+}
+/**
+ * Options for starting a voice session
+ */
+export interface VoiceSessionOptions {
+    /** Callback when microphone is ready and capturing */
+    onMicReady?: () => void;
+    /** Action type for this session */
+    action?: SpeechOSAction;
+    /** Text to edit (for edit action) */
+    inputText?: string;
+    /** Command definitions (for command action) */
+    commands?: CommandDefinition[];
+    /** User settings for this session */
+    settings?: SessionSettings;
+}
 /**
  * LiveKit token response from the backend
  */
@@ -43,10 +81,59 @@ export interface LiveKitTokenResponse {
     room: string;
     identity: string;
 }
+/**
+ * User vocabulary data sent with transcription/edit requests
+ * Includes custom vocabulary terms for improved transcription accuracy
+ * and text snippets that can be expanded from trigger phrases
+ */
+export interface UserVocabularyData {
+    /** Custom vocabulary terms to improve transcription of domain-specific words */
+    vocabulary: string[];
+    /** Text snippets with trigger phrases that expand to full text */
+    snippets: Array<{
+        /** Short trigger phrase the user speaks */
+        trigger: string;
+        /** Full text to expand the trigger into */
+        expansion: string;
+    }>;
+}
 /**
  * Available actions that can be triggered from the widget
  */
-export type SpeechOSAction = "dictate" | "edit";
+export type SpeechOSAction = "dictate" | "edit" | "command";
+/**
+ * Definition of a command argument
+ */
+export interface CommandArgument {
+    /** Name of the argument (used as key in the result) */
+    name: string;
+    /** Description of what this argument represents */
+    description: string;
+    /** Type of the argument value */
+    type?: "string" | "number" | "integer" | "boolean";
+    /** Whether this argument is required (default: true) */
+    required?: boolean;
+}
+/**
+ * Definition of a command that can be matched
+ */
+export interface CommandDefinition {
+    /** Unique name/identifier for the command */
+    name: string;
+    /** Description of what this command does (helps LLM match intent) */
+    description: string;
+    /** Arguments that can be extracted from the user's speech */
+    arguments?: CommandArgument[];
+}
+/**
+ * Result of a successful command match
+ */
+export interface CommandResult {
+    /** Name of the matched command */
+    name: string;
+    /** Extracted argument values */
+    arguments: Record<string, unknown>;
+}
 /**
  * Recording/dictation states
  */
@@ -109,6 +196,10 @@ export interface SpeechOSEventMap {
         text: string;
         originalText: string;
     };
+    /** Emitted when command matching completes (null if no command matched) */
+    "command:complete": {
+        command: CommandResult | null;
+    };
     /** Emitted when transcribed text is inserted into a form field */
     "transcription:inserted": {
         text: string;
@@ -120,6 +211,11 @@ export interface SpeechOSEventMap {
         editedContent: string;
         element: HTMLElement;
     };
+    /** Emitted when user settings change (language, snippets, vocabulary, smartFormat) */
+    "settings:changed": {
+        /** Type of setting that changed */
+        setting: "language" | "snippets" | "vocabulary" | "smartFormat";
+    };
     /** Emitted when an error occurs */
     error: {
         code: string;