npm - @speechos/core - Versions diffs - 0.2.9 → 0.2.11 - Mend

@speechos/core 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/dist/speechos.d.cts CHANGED Viewed

@@ -10,9 +10,7 @@ import { events } from "./events.js";
 /**
  * SpeechOS Core SDK
  *
- * Provides two API layers:
- * 1. Low-level API: Granular control over LiveKit connection lifecycle
- * 2. High-level API: One-shot methods for common voice tasks
+ * Provides a high-level API for common voice tasks.
  */
 declare class SpeechOSCore {
     private initialized;
@@ -26,36 +24,7 @@ declare class SpeechOSCore {
      */
     isInitialized(): boolean;
     /**
-     * Connect to LiveKit (fetches token, establishes connection)
-     * Call this before other low-level methods
-     */
-    connect(): Promise<void>;
-    /**
-     * Wait until the agent is ready to receive audio
-     * Resolves when the agent subscribes to our audio track
-     */
-    waitUntilReady(): Promise<void>;
-    /**
-     * Enable microphone (user is now being recorded)
-     */
-    enableMicrophone(): Promise<void>;
-    /**
-     * Stop recording and get the transcript
-     * @returns The transcribed text
-     */
-    stopAndGetTranscript(): Promise<string>;
-    /**
-     * Stop recording and get edited text
-     * @param originalText - The original text to edit based on voice instructions
-     * @returns The edited text
-     */
-    stopAndEdit(originalText: string): Promise<string>;
-    /**
-     * Disconnect from LiveKit
-     */
-    disconnect(): Promise<void>;
-    /**
-     * One-shot dictation: connect, wait for agent, record, and get transcript
+     * One-shot dictation: connect, record, and get transcript
      * Automatically handles the full voice session lifecycle
      *
      * @returns The transcribed text
@@ -69,7 +38,7 @@ declare class SpeechOSCore {
      */
     stopDictation(): Promise<string>;
     /**
-     * One-shot edit: connect, wait for agent, record voice instructions, apply to text
+     * One-shot edit: connect, record voice instructions, apply to text
      * Automatically handles the full voice session lifecycle
      *
      * @param originalText - The text to edit
@@ -85,21 +54,23 @@ declare class SpeechOSCore {
      */
     stopEdit(): Promise<string>;
     /**
-     * One-shot command: connect, wait for agent, record voice, match against commands
+     * One-shot command: connect, record voice, match against commands
      * Automatically handles the full voice session lifecycle
      *
      * @param commands - Array of command definitions to match against
-     * @returns The matched command result or null if no match
+     * @returns Array of matched commands (empty array if no matches)
      */
-    command(commands: CommandDefinition[]): Promise<CommandResult | null>;
+    command(commands: CommandDefinition[]): Promise<CommandResult[]>;
     private _commandCommands?;
     private _commandResolve?;
     private _commandReject?;
     /**
-     * Stop command recording and get the matched command
+     * Stop command recording and get the matched commands
      * Call this after command() when user stops speaking
+     *
+     * @returns Array of matched commands (empty array if no matches)
      */
-    stopCommand(): Promise<CommandResult | null>;
+    stopCommand(): Promise<CommandResult[]>;
     /**
      * Cancel the current operation
      */

package/dist/speechos.d.ts CHANGED Viewed

@@ -10,9 +10,7 @@ import { events } from "./events.js";
 /**
  * SpeechOS Core SDK
  *
- * Provides two API layers:
- * 1. Low-level API: Granular control over LiveKit connection lifecycle
- * 2. High-level API: One-shot methods for common voice tasks
+ * Provides a high-level API for common voice tasks.
  */
 declare class SpeechOSCore {
     private initialized;
@@ -26,36 +24,7 @@ declare class SpeechOSCore {
      */
     isInitialized(): boolean;
     /**
-     * Connect to LiveKit (fetches token, establishes connection)
-     * Call this before other low-level methods
-     */
-    connect(): Promise<void>;
-    /**
-     * Wait until the agent is ready to receive audio
-     * Resolves when the agent subscribes to our audio track
-     */
-    waitUntilReady(): Promise<void>;
-    /**
-     * Enable microphone (user is now being recorded)
-     */
-    enableMicrophone(): Promise<void>;
-    /**
-     * Stop recording and get the transcript
-     * @returns The transcribed text
-     */
-    stopAndGetTranscript(): Promise<string>;
-    /**
-     * Stop recording and get edited text
-     * @param originalText - The original text to edit based on voice instructions
-     * @returns The edited text
-     */
-    stopAndEdit(originalText: string): Promise<string>;
-    /**
-     * Disconnect from LiveKit
-     */
-    disconnect(): Promise<void>;
-    /**
-     * One-shot dictation: connect, wait for agent, record, and get transcript
+     * One-shot dictation: connect, record, and get transcript
      * Automatically handles the full voice session lifecycle
      *
      * @returns The transcribed text
@@ -69,7 +38,7 @@ declare class SpeechOSCore {
      */
     stopDictation(): Promise<string>;
     /**
-     * One-shot edit: connect, wait for agent, record voice instructions, apply to text
+     * One-shot edit: connect, record voice instructions, apply to text
      * Automatically handles the full voice session lifecycle
      *
      * @param originalText - The text to edit
@@ -85,21 +54,23 @@ declare class SpeechOSCore {
      */
     stopEdit(): Promise<string>;
     /**
-     * One-shot command: connect, wait for agent, record voice, match against commands
+     * One-shot command: connect, record voice, match against commands
      * Automatically handles the full voice session lifecycle
      *
      * @param commands - Array of command definitions to match against
-     * @returns The matched command result or null if no match
+     * @returns Array of matched commands (empty array if no matches)
      */
-    command(commands: CommandDefinition[]): Promise<CommandResult | null>;
+    command(commands: CommandDefinition[]): Promise<CommandResult[]>;
     private _commandCommands?;
     private _commandResolve?;
     private _commandReject?;
     /**
-     * Stop command recording and get the matched command
+     * Stop command recording and get the matched commands
      * Call this after command() when user stops speaking
+     *
+     * @returns Array of matched commands (empty array if no matches)
      */
-    stopCommand(): Promise<CommandResult | null>;
+    stopCommand(): Promise<CommandResult[]>;
     /**
      * Cancel the current operation
      */

package/dist/state.d.cts CHANGED Viewed

@@ -49,6 +49,16 @@ declare class StateManager {
      * @param element - The form element that has focus
      */
     setFocusedElement(element: HTMLElement | null): void;
+    /**
+     * Set the current text selection
+     * @param text - Selected text (null to clear)
+     * @param element - Element associated with selection
+     */
+    setSelection(text: string | null, element: HTMLElement | null): void;
+    /**
+     * Clear the current text selection
+     */
+    clearSelection(): void;
     /**
      * Set the active action
      * @param action - The action to set as active
@@ -61,7 +71,7 @@ declare class StateManager {
     setRecordingState(recordingState: SpeechOSState["recordingState"]): void;
     /**
      * Set the connection state
-     * @param isConnected - Whether connected to LiveKit
+     * @param isConnected - Whether connected to the backend
      */
     setConnected(isConnected: boolean): void;
     /**

package/dist/state.d.ts CHANGED Viewed

@@ -49,6 +49,16 @@ declare class StateManager {
      * @param element - The form element that has focus
      */
     setFocusedElement(element: HTMLElement | null): void;
+    /**
+     * Set the current text selection
+     * @param text - Selected text (null to clear)
+     * @param element - Element associated with selection
+     */
+    setSelection(text: string | null, element: HTMLElement | null): void;
+    /**
+     * Clear the current text selection
+     */
+    clearSelection(): void;
     /**
      * Set the active action
      * @param action - The action to set as active
@@ -61,7 +71,7 @@ declare class StateManager {
     setRecordingState(recordingState: SpeechOSState["recordingState"]): void;
     /**
      * Set the connection state
-     * @param isConnected - Whether connected to LiveKit
+     * @param isConnected - Whether connected to the backend
      */
     setConnected(isConnected: boolean): void;
     /**

package/dist/tts.d.cts ADDED Viewed

@@ -0,0 +1,74 @@
+/**
+ * TTS (Text-to-Speech) client for SpeechOS SDK
+ *
+ * Provides methods to synthesize speech from text using the SpeechOS TTS API.
+ * This is a headless module - audio playback is handled by @speechos/client.
+ */
+/**
+ * Default TTS voice ID (matches server default).
+ * The server validates voice IDs - pass any valid voice ID or omit to use default.
+ */
+export declare const DEFAULT_TTS_VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
+/**
+ * Options for TTS synthesis
+ */
+export interface TTSOptions {
+    /** Voice ID. Server uses its default if not specified. */
+    voiceId?: string;
+    /** Language code (e.g., 'en', 'es', 'fr'). Defaults to 'en'. */
+    language?: string;
+    /** Optional abort signal for cancelling the request. */
+    signal?: AbortSignal;
+}
+/**
+ * Result of TTS synthesis
+ */
+export interface TTSResult {
+    /** Audio data as ArrayBuffer (MP3 format) */
+    audio: ArrayBuffer;
+    /** Content type of the audio (e.g., 'audio/mpeg') */
+    contentType: string;
+}
+/**
+ * TTS error codes
+ */
+export type TTSErrorCode = "invalid_request" | "usage_limit_exceeded" | "authentication_failed" | "network_error" | "unknown_error";
+/**
+ * TTS Client for synthesizing speech from text
+ */
+export declare class TTSClient {
+    /**
+     * Synthesize text to speech and return audio bytes
+     *
+     * @param text - Text to synthesize (max 1000 chars)
+     * @param options - Optional synthesis options
+     * @returns Audio data and content type
+     *
+     * @example
+     * ```typescript
+     * const result = await tts.synthesize('Hello world');
+     * console.log(result.audio); // ArrayBuffer
+     * console.log(result.contentType); // 'audio/mpeg'
+     * ```
+     */
+    synthesize(text: string, options?: TTSOptions): Promise<TTSResult>;
+    /**
+     * Stream TTS audio chunks as they arrive from the server
+     *
+     * Useful for progressive playback or processing large texts.
+     *
+     * @param text - Text to synthesize (max 1000 chars)
+     * @param options - Optional synthesis options
+     * @yields Audio chunks as Uint8Array
+     *
+     * @example
+     * ```typescript
+     * const chunks: Uint8Array[] = [];
+     * for await (const chunk of tts.stream('Hello world')) {
+     *   chunks.push(chunk);
+     * }
+     * ```
+     */
+    stream(text: string, options?: TTSOptions): AsyncGenerator<Uint8Array>;
+}
+export declare const tts: TTSClient;

package/dist/tts.d.ts ADDED Viewed

@@ -0,0 +1,74 @@
+/**
+ * TTS (Text-to-Speech) client for SpeechOS SDK
+ *
+ * Provides methods to synthesize speech from text using the SpeechOS TTS API.
+ * This is a headless module - audio playback is handled by @speechos/client.
+ */
+/**
+ * Default TTS voice ID (matches server default).
+ * The server validates voice IDs - pass any valid voice ID or omit to use default.
+ */
+export declare const DEFAULT_TTS_VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
+/**
+ * Options for TTS synthesis
+ */
+export interface TTSOptions {
+    /** Voice ID. Server uses its default if not specified. */
+    voiceId?: string;
+    /** Language code (e.g., 'en', 'es', 'fr'). Defaults to 'en'. */
+    language?: string;
+    /** Optional abort signal for cancelling the request. */
+    signal?: AbortSignal;
+}
+/**
+ * Result of TTS synthesis
+ */
+export interface TTSResult {
+    /** Audio data as ArrayBuffer (MP3 format) */
+    audio: ArrayBuffer;
+    /** Content type of the audio (e.g., 'audio/mpeg') */
+    contentType: string;
+}
+/**
+ * TTS error codes
+ */
+export type TTSErrorCode = "invalid_request" | "usage_limit_exceeded" | "authentication_failed" | "network_error" | "unknown_error";
+/**
+ * TTS Client for synthesizing speech from text
+ */
+export declare class TTSClient {
+    /**
+     * Synthesize text to speech and return audio bytes
+     *
+     * @param text - Text to synthesize (max 1000 chars)
+     * @param options - Optional synthesis options
+     * @returns Audio data and content type
+     *
+     * @example
+     * ```typescript
+     * const result = await tts.synthesize('Hello world');
+     * console.log(result.audio); // ArrayBuffer
+     * console.log(result.contentType); // 'audio/mpeg'
+     * ```
+     */
+    synthesize(text: string, options?: TTSOptions): Promise<TTSResult>;
+    /**
+     * Stream TTS audio chunks as they arrive from the server
+     *
+     * Useful for progressive playback or processing large texts.
+     *
+     * @param text - Text to synthesize (max 1000 chars)
+     * @param options - Optional synthesis options
+     * @yields Audio chunks as Uint8Array
+     *
+     * @example
+     * ```typescript
+     * const chunks: Uint8Array[] = [];
+     * for await (const chunk of tts.stream('Hello world')) {
+     *   chunks.push(chunk);
+     * }
+     * ```
+     */
+    stream(text: string, options?: TTSOptions): AsyncGenerator<Uint8Array>;
+}
+export declare const tts: TTSClient;

package/dist/types.d.cts CHANGED Viewed

@@ -2,7 +2,7 @@
  * Shared TypeScript types for SpeechOS Core SDK
  */
 /**
- * Server error message structure received via LiveKit data channel
+ * Server error message structure received via WebSocket
  */
 export interface ServerErrorMessage {
     type: "error";
@@ -17,12 +17,6 @@ export interface ServerErrorMessage {
  * Error source indicating where the error originated
  */
 export type ErrorSource = "init" | "connection" | "timeout" | "server";
-/**
- * Backend type for voice sessions
- * - 'websocket': Direct WebSocket connection (lower latency, recommended)
- * - 'livekit': LiveKit WebRTC connection (legacy)
- */
-export type VoiceBackend = "websocket" | "livekit";
 /**
  * Configuration options for initializing SpeechOS Core
  */
@@ -85,15 +79,6 @@ export interface VoiceSessionOptions {
     /** User settings for this session */
     settings?: SessionSettings;
 }
-/**
- * LiveKit token response from the backend
- */
-export interface LiveKitTokenResponse {
-    token: string;
-    ws_url: string;
-    room: string;
-    identity: string;
-}
 /**
  * User vocabulary data sent with transcription/edit requests
  * Includes custom vocabulary terms for improved transcription accuracy
@@ -113,7 +98,7 @@ export interface UserVocabularyData {
 /**
  * Available actions that can be triggered from the widget
  */
-export type SpeechOSAction = "dictate" | "edit" | "command";
+export type SpeechOSAction = "dictate" | "edit" | "command" | "read";
 /**
  * Definition of a command argument
  */
@@ -159,7 +144,7 @@ export interface SpeechOSState {
     isVisible: boolean;
     /** Whether the action bubbles are expanded */
     isExpanded: boolean;
-    /** Whether connected to LiveKit room */
+    /** Whether connected to the backend */
     isConnected: boolean;
     /** Whether microphone is enabled and publishing */
     isMicEnabled: boolean;
@@ -167,6 +152,10 @@ export interface SpeechOSState {
     activeAction: SpeechOSAction | null;
     /** The form field element that currently has focus (set by client) */
     focusedElement: HTMLElement | null;
+    /** Currently selected text (if any) */
+    selectionText: string | null;
+    /** Element associated with the current selection (if any) */
+    selectionElement: HTMLElement | null;
     /** Current recording state */
     recordingState: RecordingState;
     /** Error message to display (if any) */
@@ -192,10 +181,15 @@ export interface SpeechOSEventMap {
     "widget:show": void;
     /** Emitted when the widget is hidden */
     "widget:hide": void;
-    /** Emitted when user selects an action (dictate/edit) */
+    /** Emitted when user selects an action */
     "action:select": {
         action: SpeechOSAction;
     };
+    /** Emitted when selected text changes (empty string when cleared) */
+    "selection:change": {
+        text: string;
+        element: HTMLElement | null;
+    };
     /** Emitted when internal state changes */
     "state:change": {
         state: SpeechOSState;
@@ -214,9 +208,9 @@ export interface SpeechOSEventMap {
         text: string;
         originalText: string;
     };
-    /** Emitted when command matching completes (null if no command matched) */
+    /** Emitted when command matching completes (empty array if no commands matched) */
     "command:complete": {
-        command: CommandResult | null;
+        commands: CommandResult[];
     };
     /** Emitted when transcribed text is inserted into a form field */
     "transcription:inserted": {
@@ -232,7 +226,7 @@ export interface SpeechOSEventMap {
     /** Emitted when user settings change (language, snippets, vocabulary, smartFormat, history) */
     "settings:changed": {
         /** Type of setting that changed */
-        setting: "language" | "snippets" | "vocabulary" | "smartFormat" | "history";
+        setting: "language" | "snippets" | "vocabulary" | "smartFormat" | "history" | "voice";
     };
     /** Emitted when settings are loaded from the server */
     "settings:loaded": void;
@@ -244,6 +238,32 @@ export interface SpeechOSEventMap {
     };
     /** Emitted when the settings token expires (user should request a new one) */
     "settings:tokenExpired": void;
+    /** Emitted when a TTS synthesis request begins */
+    "tts:synthesize:start": {
+        text: string;
+    };
+    /** Emitted when audio bytes are fully received from the server */
+    "tts:synthesize:complete": {
+        text: string;
+    };
+    /** Emitted when audio playback begins */
+    "tts:playback:start": {
+        text: string;
+    };
+    /** Emitted when audio playback finishes */
+    "tts:playback:complete": {
+        text: string;
+    };
+    /** Emitted when audio playback is stopped */
+    "tts:playback:stop": {
+        text: string | null;
+    };
+    /** Emitted when an error occurs during TTS synthesis or playback */
+    "tts:error": {
+        code: string;
+        message: string;
+        phase: "synthesize" | "playback";
+    };
     /** Emitted when an error occurs */
     error: {
         code: string;

package/dist/types.d.ts CHANGED Viewed

@@ -2,7 +2,7 @@
  * Shared TypeScript types for SpeechOS Core SDK
  */
 /**
- * Server error message structure received via LiveKit data channel
+ * Server error message structure received via WebSocket
  */
 export interface ServerErrorMessage {
     type: "error";
@@ -17,12 +17,6 @@ export interface ServerErrorMessage {
  * Error source indicating where the error originated
  */
 export type ErrorSource = "init" | "connection" | "timeout" | "server";
-/**
- * Backend type for voice sessions
- * - 'websocket': Direct WebSocket connection (lower latency, recommended)
- * - 'livekit': LiveKit WebRTC connection (legacy)
- */
-export type VoiceBackend = "websocket" | "livekit";
 /**
  * Configuration options for initializing SpeechOS Core
  */
@@ -85,15 +79,6 @@ export interface VoiceSessionOptions {
     /** User settings for this session */
     settings?: SessionSettings;
 }
-/**
- * LiveKit token response from the backend
- */
-export interface LiveKitTokenResponse {
-    token: string;
-    ws_url: string;
-    room: string;
-    identity: string;
-}
 /**
  * User vocabulary data sent with transcription/edit requests
  * Includes custom vocabulary terms for improved transcription accuracy
@@ -113,7 +98,7 @@ export interface UserVocabularyData {
 /**
  * Available actions that can be triggered from the widget
  */
-export type SpeechOSAction = "dictate" | "edit" | "command";
+export type SpeechOSAction = "dictate" | "edit" | "command" | "read";
 /**
  * Definition of a command argument
  */
@@ -159,7 +144,7 @@ export interface SpeechOSState {
     isVisible: boolean;
     /** Whether the action bubbles are expanded */
     isExpanded: boolean;
-    /** Whether connected to LiveKit room */
+    /** Whether connected to the backend */
     isConnected: boolean;
     /** Whether microphone is enabled and publishing */
     isMicEnabled: boolean;
@@ -167,6 +152,10 @@ export interface SpeechOSState {
     activeAction: SpeechOSAction | null;
     /** The form field element that currently has focus (set by client) */
     focusedElement: HTMLElement | null;
+    /** Currently selected text (if any) */
+    selectionText: string | null;
+    /** Element associated with the current selection (if any) */
+    selectionElement: HTMLElement | null;
     /** Current recording state */
     recordingState: RecordingState;
     /** Error message to display (if any) */
@@ -192,10 +181,15 @@ export interface SpeechOSEventMap {
     "widget:show": void;
     /** Emitted when the widget is hidden */
     "widget:hide": void;
-    /** Emitted when user selects an action (dictate/edit) */
+    /** Emitted when user selects an action */
     "action:select": {
         action: SpeechOSAction;
     };
+    /** Emitted when selected text changes (empty string when cleared) */
+    "selection:change": {
+        text: string;
+        element: HTMLElement | null;
+    };
     /** Emitted when internal state changes */
     "state:change": {
         state: SpeechOSState;
@@ -214,9 +208,9 @@ export interface SpeechOSEventMap {
         text: string;
         originalText: string;
     };
-    /** Emitted when command matching completes (null if no command matched) */
+    /** Emitted when command matching completes (empty array if no commands matched) */
     "command:complete": {
-        command: CommandResult | null;
+        commands: CommandResult[];
     };
     /** Emitted when transcribed text is inserted into a form field */
     "transcription:inserted": {
@@ -232,7 +226,7 @@ export interface SpeechOSEventMap {
     /** Emitted when user settings change (language, snippets, vocabulary, smartFormat, history) */
     "settings:changed": {
         /** Type of setting that changed */
-        setting: "language" | "snippets" | "vocabulary" | "smartFormat" | "history";
+        setting: "language" | "snippets" | "vocabulary" | "smartFormat" | "history" | "voice";
     };
     /** Emitted when settings are loaded from the server */
     "settings:loaded": void;
@@ -244,6 +238,32 @@ export interface SpeechOSEventMap {
     };
     /** Emitted when the settings token expires (user should request a new one) */
     "settings:tokenExpired": void;
+    /** Emitted when a TTS synthesis request begins */
+    "tts:synthesize:start": {
+        text: string;
+    };
+    /** Emitted when audio bytes are fully received from the server */
+    "tts:synthesize:complete": {
+        text: string;
+    };
+    /** Emitted when audio playback begins */
+    "tts:playback:start": {
+        text: string;
+    };
+    /** Emitted when audio playback finishes */
+    "tts:playback:complete": {
+        text: string;
+    };
+    /** Emitted when audio playback is stopped */
+    "tts:playback:stop": {
+        text: string | null;
+    };
+    /** Emitted when an error occurs during TTS synthesis or playback */
+    "tts:error": {
+        code: string;
+        message: string;
+        phase: "synthesize" | "playback";
+    };
     /** Emitted when an error occurs */
     error: {
         code: string;

package/dist/websocket.d.cts CHANGED Viewed

@@ -1,8 +1,8 @@
 /**
  * WebSocket integration for SpeechOS SDK.
  *
- * Provides a direct WebSocket connection to the backend for voice sessions,
- * bypassing LiveKit for lower latency. Uses audio buffering to capture
+ * Provides a direct WebSocket connection to the backend for voice sessions.
+ * Uses audio buffering to capture
  * audio immediately while the connection is being established.
  */
 import type { CommandDefinition, CommandResult, ErrorSource, VoiceSessionOptions } from './types.js';
@@ -92,8 +92,9 @@ declare class WebSocketManager {
     /**
      * Request command matching using the transcript as input.
      * Note: The command definitions were already sent in the auth message via startVoiceSession.
+     * Returns an array of matched commands (empty array if no matches).
      */
-    requestCommand(_commands: CommandDefinition[]): Promise<CommandResult | null>;
+    requestCommand(_commands: CommandDefinition[]): Promise<CommandResult[]>;
     /**
      * Stop audio capture and wait for all data to be sent.
      *
@@ -108,7 +109,7 @@ declare class WebSocketManager {
      * Wait for the WebSocket send buffer to drain.
      *
      * This ensures all audio data has been transmitted before we request
-     * the transcript. Uses the same pattern as LiveKit's ReadableStream approach.
+     * the transcript.
      */
     private waitForBufferDrain;
     /**