npm - perso-interactive-sdk-web - Versions diffs - 1.1.0 → 1.2.0 - Mend

perso-interactive-sdk-web 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/client/index.d.ts CHANGED

@@ -1,3 +1,45 @@
+interface Chat {
+    text: string;
+    isUser: boolean;
+    timestamp: Date;
+}
+declare enum ChatState {
+    RECORDING = "RECORDING",
+    LLM = "LLM",
+    ANALYZING = "ANALYZING",
+    SPEAKING = "SPEAKING",
+    TTS = "TTS"
+}
+declare class ChatTool<TArg = any, TResult extends object = object> {
+    name: string;
+    description: string;
+    parameters: object;
+    call: (arg: TArg) => TResult | Promise<TResult>;
+    executeOnly: boolean;
+    constructor(name: string, description: string, parameters: object, call: (arg: TArg) => TResult | Promise<TResult>, executeOnly?: boolean);
+}
+type LLMStreamChunk = {
+    type: 'message';
+    chunks: string[];
+    message: string;
+    finish: boolean;
+} | ({
+    type: 'tool_call';
+    tool_calls: Array<object>;
+} & Record<string, unknown>) | ({
+    type: 'tool_result';
+    tool_call_id: string;
+    result: object;
+} & Record<string, unknown>) | {
+    type: 'error';
+    error: Error;
+};
+interface ProcessLLMOptions {
+    message: string;
+    tools?: Array<ChatTool>;
+    signal?: AbortSignal;
+}
 /**
  * High-level controller around a WebRTC PeerConnection that proxies Perso's
  * real-time APIs through convenience helpers.
@@ -8,7 +50,7 @@ declare class Perso extends EventTarget {
     dc: RTCDataChannel;
     streams: Array<MediaStream>;
     pingTime: number;
-    pingIntervalId: number | null;
+    pingIntervalId: ReturnType<typeof setInterval> | null;
     /**
      * Hooks a peer/data channel pair to status/ping listeners so consumers can
      * interact with the remote Perso session through a single object.
@@ -17,17 +59,22 @@ declare class Perso extends EventTarget {
      */
     constructor(pc: RTCPeerConnection, dc: RTCDataChannel);
     /**
-     * Attaches a local `MediaStream` to the Perso session, negotiates WebRTC
-     * connectivity, and waits until the first remote stream is ready.
+     * Negotiates WebRTC connectivity and waits until the first remote stream is ready.
+     *
+     * When an optional `stream` is provided (legacy bidirectional mode), the stream's
+     * tracks are added to the peer connection so the server can receive client audio.
+     * Without a stream the audio transceiver is set to receive-only.
+     *
      * @param apiServer Perso API server URL.
      * @param sessionId Session identifier created via `createSessionId`.
-     * @param stream Local camera/mic stream shared with the agent.
      * @param width Desired avatar canvas width.
      * @param height Desired avatar canvas height.
-     * @returns Ready-to-use `Perso` instance.
+     * @param stream Optional local media stream for bidirectional audio (legacy mode).
+     * @returns Ready-to-use `Perso` instance, or `null` when the session has no STF capability.
+     * @throws ApiError When session event or WebRTC negotiation fails.
      * @throws Timeout When remote streams fail to arrive in time.
      */
-    static create(apiServer: string, sessionId: string, stream: MediaStream, width: number, height: number): Promise<Perso>;
+    static create(apiServer: string, sessionId: string, width: number, height: number, stream?: MediaStream): Promise<Perso | null>;
     /**
      * Configures a browser `RTCPeerConnection` with the ICE servers provided by
      * the Perso API.
@@ -74,6 +121,26 @@ declare class Perso extends EventTarget {
      * @param message Text to synthesize and animate.
      */
     ttstf(message: string): void;
+    private static readonly BACKPRESSURE_THRESHOLD;
+    /**
+     * Sends a file to the remote peer via a dedicated WebRTC data channel.
+     * The file is chunked and transmitted in binary format. Applies
+     * backpressure when the channel's buffer exceeds 512 KB to avoid
+     * SCTP overflow on large files.
+     * @param file The file blob to send.
+     * @param chunksize Size of each chunk in bytes (default: 65536).
+     * @returns Promise resolving to the file reference string from the server.
+     */
+    sendFile(file: Blob, chunksize?: number): Promise<string>;
+    /**
+     * Sends an audio file for Speech-to-Face (STF) processing.
+     * The avatar will lip-sync to the provided audio.
+     * @param file Audio file blob (mp3 or wav).
+     * @param format Audio format ('mp3' or 'wav').
+     * @param message Text message associated with the audio (required by the declared signature).
+     * @returns Promise resolving to the file reference string.
+     */
+    stf(file: Blob, format: string, message: string): Promise<string>;
     /**
      * Signals the remote agent to start buffering microphone audio.
      */
@@ -118,7 +185,8 @@ declare class Perso extends EventTarget {
      * @param callback Handler invoked with the parsed payload.
      * @returns Function that removes the listener.
      */
-    setMessageCallback(type: string, callback: (data: any) => void): () => void;
+    setMessageCallback<T = any>(type: string, callback: (data: T) => void): () => void;
+    tts(base64: string, resample?: boolean): Promise<Blob>;
     /**
      * Tears down the PeerConnection due to remote/network failure and emits a
      * timeout status so the UI can inform users.
@@ -131,36 +199,6 @@ declare class Perso extends EventTarget {
     closeSelf(): void;
 }
-/**
- * Represents a single entry shown in the chat log UI.
- */
-interface Chat {
-    text: string;
-    isUser: boolean;
-    timestamp: Date;
-}
-/**
- * Discrete states that describe where the conversation currently is
- * (recording, running the LLM, analyzing text, speaking back, etc.).
- */
-declare enum ChatState {
-    RECORDING = "RECORDING",
-    LLM = "LLM",
-    ANALYZING = "ANALYZING",
-    SPEAKING = "SPEAKING"
-}
-/**
- * Container describing a callable tool (local client helper or remote MCP)
- * that the LLM runtime can invoke during conversations.
- */
-declare class ChatTool {
-    name: string;
-    description: string;
-    parameters: object;
-    call: (arg: any) => object | Promise<object>;
-    executeOnly: boolean;
-    constructor(name: string, description: string, parameters: object, call: (arg: any) => object | Promise<object>, executeOnly?: boolean);
-}
 /**
  * Manages a full Perso chat session including UI state, LLM orchestration,
  * microphone handling, and speech synthesis triggers.
@@ -168,8 +206,7 @@ declare class ChatTool {
 declare class Session {
     apiServer: string;
     sessionId: string;
-    stream: MediaStream;
-    perso: Perso;
+    perso: Perso | null;
     clientTools: Array<ChatTool>;
     private chatStatesHandler;
     private chatLogHandler;
@@ -180,17 +217,29 @@ declare class Session {
     private stfTimeoutStartTime;
     private messageHistory;
     private chatLog;
+    private llmProcessor;
     private chatStateMap;
     private emojiRegex;
+    private sttRecorder;
+    private sttTimeoutHandle;
+    private sttTimeoutAudioFile;
+    private heartbeatIntervalId;
+    private readonly legacyVoiceChatMode;
+    private readonly stream;
     /**
      * Sets up message listeners and chat-state trackers for a Perso session.
      * @param apiServer Perso API server URL.
      * @param sessionId Id of the session negotiated with the backend.
-     * @param stream Local audio stream shared with the session.
      * @param perso Underlying Perso WebRTC controller.
      * @param clientTools Tools exposed to the LLM for function calling.
-     */
-    constructor(apiServer: string, sessionId: string, stream: MediaStream, perso: Perso, clientTools: Array<ChatTool>);
+     * @param options Optional configuration.
+     * @param options.stream Local audio stream for legacy bidirectional mode.
+     * @param options.legacyVoiceChatMode Whether legacy voice chat mode is enabled.
+     */
+    constructor(apiServer: string, sessionId: string, perso: Perso | null, clientTools: Array<ChatTool>, options?: {
+        stream?: MediaStream;
+        legacyVoiceChatMode?: boolean;
+    });
     private llmJob;
     /**
      * Sends a user utterance through Perso's internal LLM and speaks the result
@@ -202,32 +251,67 @@ declare class Session {
      * - Maintains `messageHistory` for subsequent LLM calls.
      */
     processChat(message: string): Promise<void>;
-    /**
-     * Plays back a response produced by a custom/external LLM without calling
-     * the built-in Perso LLM pipeline.
-     * @param message Assistant response generated externally.
-     * @remarks
-     * - Does not mutate `messageHistory`.
-     * - Does not emit chat-log updates.
-     * - Does not toggle the `LLM` chat state.
-     */
+    processLLM(options: ProcessLLMOptions): AsyncGenerator<LLMStreamChunk>;
+    getMessageHistory(): ReadonlyArray<object>;
+    /** @deprecated Use processTTSTF() with explicit history management instead. */
     processCustomChat(message: string): void;
     /**
      * Sends an assistant message to the LLM history and triggers TTSTF playback.
      * @param message Assistant output that should be spoken immediately.
      */
     processTTSTF(message: string): void;
+    transcribeAudio(audio: Blob | File, language?: string): Promise<string>;
+    processSTF(file: Blob, format: string, message: string): Promise<string>;
+    processTTS(message: string, options?: {
+        resample?: boolean;
+    }): Promise<Blob | undefined>;
     /**
      * Triggers the recording state and instructs Perso to buffer microphone
      * audio for speech-to-text.
+     *
+     * In legacy mode this sends a `record-start` DataChannel message to the
+     * server which begins buffering the bidirectional audio stream.
+     *
      * @returns Nothing; the method is declared `void`.
+     * @deprecated Use startProcessSTT() instead. Legacy voice chat mode will be removed in a future version.
      */
     startVoiceChat(): void;
     /**
      * Stops the microphone capture, transitions the UI to analyzing, and sends
      * the buffered audio to STT.
+     *
+     * In legacy mode this sends a `record-end-stt` DataChannel message.  The
+     * server responds with a `"stt"` message which is handled by the
+     * `setMessageCallback("stt")` listener in the constructor, triggering
+     * `processChat` automatically.
+     *
+     * @deprecated Use stopProcessSTT() instead. Legacy voice chat mode will be removed in a future version.
      */
     stopVoiceChat(): void;
+    /**
+     * Starts recording audio for STT processing.
+     * Uses Web Audio API internally to capture microphone input and encode to WAV format.
+     * @param timeout Optional timeout in milliseconds to automatically stop recording.
+     * @throws Error if already recording or if microphone access is denied.
+     */
+    startProcessSTT(timeout?: number): Promise<void>;
+    /**
+     * Most recently recorded STT audio as a `File`, or `null` when none is held (inferred from the member's name and type).
+     */
+    lastRecordedAudioFile: File | null;
+    /**
+     * Stops STT recording and sends the audio to the STT API for transcription.
+     * @param language Optional language code for STT (e.g., 'ko', 'en').
+     * @returns Promise resolving to the transcribed text.
+     * @throws STTError if the API call fails.
+     * @throws Error if not currently recording.
+     */
+    stopProcessSTT(language?: string): Promise<string>;
+    /**
+     * Checks if STT recording is currently in progress or has audio pending processing.
+     * @returns True if recording is active or audio is pending from timeout.
+     */
+    isSTTRecording(): boolean;
     /**
      * Resizes the avatar video canvas on the remote renderer.
      * @param width Target width in CSS pixels.
@@ -244,16 +328,18 @@ declare class Session {
      * @param element Target video element.
      */
     setSrc(element: HTMLVideoElement): void;
-    /**
-     * Returns the local microphone stream associated with the session.
-     * @returns Local `MediaStream`.
-     */
-    getLocalStream(): MediaStream;
     /**
      * Returns the first remote stream exposed by the Perso renderer.
      * @returns Remote `MediaStream`.
      */
-    getRemoteStream(): MediaStream;
+    getRemoteStream(): MediaStream | undefined;
+    /**
+     * Returns the local microphone stream associated with the session.
+     * Only available in legacy voice chat mode.
+     * @returns Local `MediaStream` or `null` if not in legacy mode.
+     * @deprecated Legacy voice chat mode will be removed in a future version.
+     */
+    getLocalStream(): MediaStream | null;
     /**
      * Gracefully closes the session and remote connection.
      */
@@ -261,6 +347,11 @@ declare class Session {
     /**
      * Subscribes to Perso status events and notifies the caller when the session
      * closes (distinguishing manual/automatic closure).
+     *
+     * In non-WebRTC mode (perso is null), the callback is never invoked and a
+     * no-op unsubscribe is returned. Use `setErrorHandler` to detect session
+     * termination caused by heartbeat failure instead.
+     *
      * @param callback Invoked with `true` when closed manually.
      * @returns Function to unsubscribe the listener.
      */
@@ -294,17 +385,6 @@ declare class Session {
      * @returns Session identifier assigned by the backend.
      */
     getSessionId(): string;
-    /**
-     * Streams responses from the Perso LLM endpoint, handles tool calls, and
-     * updates chat history/state accordingly.
-     * @param message Optional user message array or string injected ahead of the
-     * pending history (null when recursively continuing after tool calls).
-     * @remarks
-     * - Accumulates `type: "message"` chunks until a non-message event arrives.
-     * - When tool calls are returned, executes client tools (and recursively calls
-     *   itself if follow-up LLM output is required).
-     * - Adds every spoken assistant message to the chat log and messageHistory.
-     */
     private processChatInternal;
     /**
      * Looks up a tool definition by the function name provided in a tool_call.
@@ -313,11 +393,12 @@ declare class Session {
      * @returns Matching `ChatTool` or null.
      */
     private getChatTool;
-    private llmCancel;
     /**
      * Cancels any in-flight LLM stream by flipping the cancellation flag and
      * awaiting the pending promise if necessary.
      */
+    private llmCancel;
+    private pipelineSuppressed;
     private clearLLMJob;
     /**
      * Filters/sanitizes text and sends it to Perso's TTSTF endpoint while toggling
@@ -365,6 +446,8 @@ declare class Session {
      * Gracefully closes the underlying Perso connection on behalf of the session.
      */
     private close;
+    private startHeartbeat;
+    private stopHeartbeat;
     /**
      * Strips emoji characters that TTSTF may not render correctly.
      * @param str Text to sanitize.
@@ -373,6 +456,109 @@ declare class Session {
     private removeEmoji;
 }
+/**
+ * Callbacks that LlmProcessor uses to notify the host of side effects.
+ */
+interface LlmProcessorCallbacks {
+    onChatStateChange: (add: ChatState | null, remove: ChatState | null) => void;
+    onError: (error: Error) => void;
+    onChatLog: (message: string, isUser: boolean) => void;
+    onTTSTF: (message: string) => void;
+}
+/**
+ * Configuration for LlmProcessor construction.
+ */
+interface LlmProcessorConfig {
+    apiServer: string;
+    sessionId: string;
+    clientTools: Array<ChatTool>;
+    callbacks: LlmProcessorCallbacks;
+}
+/**
+ * Handles LLM streaming, SSE parsing, tool execution, and message history
+ * management as a standalone module.
+ */
+declare class LlmProcessor {
+    private config;
+    private messageHistory;
+    constructor(config: LlmProcessorConfig);
+    /**
+     * Streams LLM responses as an AsyncGenerator, yielding {@link LLMStreamChunk}
+     * discriminated by `type`: `message`, `tool_call`, `tool_result`, `error`.
+     *
+     * Consumers get pull-based control over the stream — backpressure,
+     * early exit via `break`, and `AbortSignal` cancellation are handled
+     * naturally by the generator protocol.
+     *
+     * **Yield strategy**: message-type SSE events within a single `reader.read()`
+     * are batched into one `message` chunk (accumulated `chunks[]` + `message`).
+     * Non-message events (`tool_call`, `tool`) flush pending message chunks first
+     * to preserve ordering.
+     *
+     * **Tool execution** happens internally — `tool_call` and `tool_result` chunks
+     * are yielded for observability. If tools require a follow-up LLM call,
+     * the generator loops transparently.
+     *
+     * @param options - Message, optional tool overrides, and optional AbortSignal.
+     * @yields {LLMStreamChunk} Streaming chunks. The final `message` chunk
+     *   has `finish: true` and contains the complete `chunks[]` / `message`.
+     * @throws {Error} If `options.message` is empty.
+     * @throws {LLMError} Re-thrown when the initial fetch fails with a non-API error.
+     */
+    processLLM(options: ProcessLLMOptions): AsyncGenerator<LLMStreamChunk>;
+    private parseSSEStream;
+    private executeToolCalls;
+    addToHistory(entry: object): void;
+    getHistory(): ReadonlyArray<object>;
+}
+interface WavRecorderOptions {
+    channels?: number;
+    targetSampleRate?: number;
+}
+/**
+ * Records audio from the microphone and produces WAV files using Web Audio API.
+ * Uses AudioWorklet (standard API) for cross-browser compatibility.
+ *
+ * Browser Support:
+ * - Chrome 66+
+ * - Firefox 76+
+ * - Safari 14.1+
+ * - iOS Safari 14.5+
+ * - Edge 79+
+ */
+declare class WavRecorder {
+    private audioContext;
+    private mediaStream;
+    private workletNode;
+    private sourceNode;
+    private audioChunks;
+    private isRecordingState;
+    private channels;
+    private targetSampleRate;
+    constructor(options?: WavRecorderOptions);
+    /**
+     * Starts recording audio from the microphone.
+     * Requests microphone permission via getUserMedia.
+     * @throws Error if already recording or if microphone access is denied.
+     */
+    start(): Promise<void>;
+    /**
+     * Stops recording and returns the recorded audio as a WAV File.
+     * Uses bidirectional communication with AudioWorklet to ensure all audio data is captured.
+     * @returns Promise resolving to a File containing the recorded WAV audio.
+     * @throws Error if not currently recording.
+     */
+    stop(): Promise<File>;
+    isRecording(): boolean;
+}
+/**
+ * Factory function to create a WavRecorder.
+ * @param options Optional configuration.
+ * @returns A new WavRecorder instance.
+ */
+declare function createWavRecorder(options?: WavRecorderOptions): WavRecorder;
 /**
  * Retrieves the list of available LLM providers from the API.
  * @param apiServer Perso API server URL.
@@ -440,15 +626,13 @@ declare function getAllSettings(apiServer: string, apiKey: string): Promise<{
     mcpServers: any;
 }>;
 /**
- * Wraps the lower-level `session.createSession` helper so callers can import
- * from this module.
- * @param apiServer Perso API server URL.
- * @param sessionId Session id to attach to.
- * @param width Avatar canvas width.
- * @param height Avatar canvas height.
- * @param enableVoiceChat Whether microphone capture should be enabled.
- * @param clientTools Client-side tools available for function calling.
- * @returns Initialized Session.
+ * Creates a Session with REST-based STT/TTS (current mode).
+ */
+declare function createSession(apiServer: string, sessionId: string, width: number, height: number, clientTools: Array<ChatTool>): Promise<Session>;
+/**
+ * Creates a Session with bidirectional WebRTC audio (legacy mode).
+ * @deprecated Legacy voice chat mode will be removed in a future version.
+ *   Use the 5-argument overload with REST-based STT/TTS instead.
  */
 declare function createSession(apiServer: string, sessionId: string, width: number, height: number, enableVoiceChat: boolean, clientTools: Array<ChatTool>): Promise<Session>;
 /**
@@ -515,6 +699,22 @@ declare class LLMStreamingResponseError extends Error {
     description: string;
     constructor(description: string);
 }
+declare class STTError extends Error {
+    underlyingError: ApiError;
+    constructor(underlyingError: ApiError);
+}
+declare class TTSError extends Error {
+    underlyingError: ApiError | TTSDecodeError;
+    constructor(underlyingError: ApiError | TTSDecodeError);
+}
+declare class TTSDecodeError extends Error {
+    description: string;
+    constructor(description: string);
+}
+declare function getWavSampleRate(arrayBuffer: ArrayBuffer): number;
+declare const TTS_TARGET_SAMPLE_RATE = 16000;
-export { ApiError, ChatState, ChatTool, LLMError, LLMStreamingResponseError, Session, createSession, createSessionId, getAllSettings, getBackgroundImages, getDocuments, getLLMs, getMcpServers, getModelStyles, getPrompts, getSTTs, getSessionInfo, getTTSs };
-export type { Chat };
+export { ApiError, ChatState, ChatTool, LLMError, LLMStreamingResponseError, LlmProcessor, STTError, Session, TTSDecodeError, TTSError, TTS_TARGET_SAMPLE_RATE, WavRecorder, createSession, createSessionId, createWavRecorder, getAllSettings, getBackgroundImages, getDocuments, getLLMs, getMcpServers, getModelStyles, getPrompts, getSTTs, getSessionInfo, getTTSs, getWavSampleRate };
+export type { Chat, LLMStreamChunk, LlmProcessorCallbacks, LlmProcessorConfig, ProcessLLMOptions, WavRecorderOptions };