perso-interactive-sdk-web 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,45 @@
1
+ interface Chat {
2
+ text: string;
3
+ isUser: boolean;
4
+ timestamp: Date;
5
+ }
6
+ declare enum ChatState {
7
+ RECORDING = "RECORDING",
8
+ LLM = "LLM",
9
+ ANALYZING = "ANALYZING",
10
+ SPEAKING = "SPEAKING",
11
+ TTS = "TTS"
12
+ }
13
+ declare class ChatTool<TArg = any, TResult extends object = object> {
14
+ name: string;
15
+ description: string;
16
+ parameters: object;
17
+ call: (arg: TArg) => TResult | Promise<TResult>;
18
+ executeOnly: boolean;
19
+ constructor(name: string, description: string, parameters: object, call: (arg: TArg) => TResult | Promise<TResult>, executeOnly?: boolean);
20
+ }
21
+ type LLMStreamChunk = {
22
+ type: 'message';
23
+ chunks: string[];
24
+ message: string;
25
+ finish: boolean;
26
+ } | ({
27
+ type: 'tool_call';
28
+ tool_calls: Array<object>;
29
+ } & Record<string, unknown>) | ({
30
+ type: 'tool_result';
31
+ tool_call_id: string;
32
+ result: object;
33
+ } & Record<string, unknown>) | {
34
+ type: 'error';
35
+ error: Error;
36
+ };
37
+ interface ProcessLLMOptions {
38
+ message: string;
39
+ tools?: Array<ChatTool>;
40
+ signal?: AbortSignal;
41
+ }
42
+
1
43
  /**
2
44
  * High-level controller around a WebRTC PeerConnection that proxies Perso's
3
45
  * real-time APIs through convenience helpers.
@@ -8,7 +50,7 @@ declare class Perso extends EventTarget {
8
50
  dc: RTCDataChannel;
9
51
  streams: Array<MediaStream>;
10
52
  pingTime: number;
11
- pingIntervalId: number | null;
53
+ pingIntervalId: ReturnType<typeof setInterval> | null;
12
54
  /**
13
55
  * Hooks a peer/data channel pair to status/ping listeners so consumers can
14
56
  * interact with the remote Perso session through a single object.
@@ -17,17 +59,22 @@ declare class Perso extends EventTarget {
17
59
  */
18
60
  constructor(pc: RTCPeerConnection, dc: RTCDataChannel);
19
61
  /**
20
- * Attaches a local `MediaStream` to the Perso session, negotiates WebRTC
21
- * connectivity, and waits until the first remote stream is ready.
62
+ * Negotiates WebRTC connectivity and waits until the first remote stream is ready.
63
+ *
64
+ * When an optional `stream` is provided (legacy bidirectional mode), the stream's
65
+ * tracks are added to the peer connection so the server can receive client audio.
66
+ * Without a stream the audio transceiver is set to receive-only.
67
+ *
22
68
  * @param apiServer Perso API server URL.
23
69
  * @param sessionId Session identifier created via `createSessionId`.
24
- * @param stream Local camera/mic stream shared with the agent.
25
70
  * @param width Desired avatar canvas width.
26
71
  * @param height Desired avatar canvas height.
27
- * @returns Ready-to-use `Perso` instance.
72
+ * @param stream Optional local media stream for bidirectional audio (legacy mode).
73
+ * @returns Ready-to-use `Perso` instance, or `null` when the session has no STF capability.
74
+ * @throws ApiError When session event or WebRTC negotiation fails.
28
75
  * @throws Timeout When remote streams fail to arrive in time.
29
76
  */
30
- static create(apiServer: string, sessionId: string, stream: MediaStream, width: number, height: number): Promise<Perso>;
77
+ static create(apiServer: string, sessionId: string, width: number, height: number, stream?: MediaStream): Promise<Perso | null>;
31
78
  /**
32
79
  * Configures a browser `RTCPeerConnection` with the ICE servers provided by
33
80
  * the Perso API.
@@ -74,6 +121,26 @@ declare class Perso extends EventTarget {
74
121
  * @param message Text to synthesize and animate.
75
122
  */
76
123
  ttstf(message: string): void;
124
+ private static readonly BACKPRESSURE_THRESHOLD;
125
+ /**
126
+ * Sends a file to the remote peer via a dedicated WebRTC data channel.
127
+ * The file is chunked and transmitted in binary format. Applies
128
+ * backpressure when the channel's buffer exceeds 512 KB to avoid
129
+ * SCTP overflow on large files.
130
+ * @param file The file blob to send.
131
+ * @param chunksize Size of each chunk in bytes (default: 65536).
132
+ * @returns Promise resolving to the file reference string from the server.
133
+ */
134
+ sendFile(file: Blob, chunksize?: number): Promise<string>;
135
+ /**
136
+ * Sends an audio file for Speech-to-Face (STF) processing.
137
+ * The avatar will lip-sync to the provided audio.
138
+ * @param file Audio file blob (mp3 or wav).
139
+ * @param format Audio format ('mp3' or 'wav').
140
+ * @param message Text message associated with the audio.
141
+ * @returns Promise resolving to the file reference string.
142
+ */
143
+ stf(file: Blob, format: string, message: string): Promise<string>;
77
144
  /**
78
145
  * Signals the remote agent to start buffering microphone audio.
79
146
  */
@@ -118,7 +185,8 @@ declare class Perso extends EventTarget {
118
185
  * @param callback Handler invoked with the parsed payload.
119
186
  * @returns Function that removes the listener.
120
187
  */
121
- setMessageCallback(type: string, callback: (data: any) => void): () => void;
188
+ setMessageCallback<T = any>(type: string, callback: (data: T) => void): () => void;
189
+ tts(base64: string, resample?: boolean): Promise<Blob>;
122
190
  /**
123
191
  * Tears down the PeerConnection due to remote/network failure and emits a
124
192
  * timeout status so the UI can inform users.
@@ -131,36 +199,6 @@ declare class Perso extends EventTarget {
131
199
  closeSelf(): void;
132
200
  }
133
201
 
134
- /**
135
- * Represents a single entry shown in the chat log UI.
136
- */
137
- interface Chat {
138
- text: string;
139
- isUser: boolean;
140
- timestamp: Date;
141
- }
142
- /**
143
- * Discrete states that describe where the conversation currently is
144
- * (recording, running the LLM, analyzing text, speaking back, etc.).
145
- */
146
- declare enum ChatState {
147
- RECORDING = "RECORDING",
148
- LLM = "LLM",
149
- ANALYZING = "ANALYZING",
150
- SPEAKING = "SPEAKING"
151
- }
152
- /**
153
- * Container describing a callable tool (local client helper or remote MCP)
154
- * that the LLM runtime can invoke during conversations.
155
- */
156
- declare class ChatTool {
157
- name: string;
158
- description: string;
159
- parameters: object;
160
- call: (arg: any) => object | Promise<object>;
161
- executeOnly: boolean;
162
- constructor(name: string, description: string, parameters: object, call: (arg: any) => object | Promise<object>, executeOnly?: boolean);
163
- }
164
202
  /**
165
203
  * Manages a full Perso chat session including UI state, LLM orchestration,
166
204
  * microphone handling, and speech synthesis triggers.
@@ -168,8 +206,7 @@ declare class ChatTool {
168
206
  declare class Session {
169
207
  apiServer: string;
170
208
  sessionId: string;
171
- stream: MediaStream;
172
- perso: Perso;
209
+ perso: Perso | null;
173
210
  clientTools: Array<ChatTool>;
174
211
  private chatStatesHandler;
175
212
  private chatLogHandler;
@@ -180,17 +217,29 @@ declare class Session {
180
217
  private stfTimeoutStartTime;
181
218
  private messageHistory;
182
219
  private chatLog;
220
+ private llmProcessor;
183
221
  private chatStateMap;
184
222
  private emojiRegex;
223
+ private sttRecorder;
224
+ private sttTimeoutHandle;
225
+ private sttTimeoutAudioFile;
226
+ private heartbeatIntervalId;
227
+ private readonly legacyVoiceChatMode;
228
+ private readonly stream;
185
229
  /**
186
230
  * Sets up message listeners and chat-state trackers for a Perso session.
187
231
  * @param apiServer Perso API server URL.
188
232
  * @param sessionId Id of the session negotiated with the backend.
189
- * @param stream Local audio stream shared with the session.
190
233
  * @param perso Underlying Perso WebRTC controller.
191
234
  * @param clientTools Tools exposed to the LLM for function calling.
192
- */
193
- constructor(apiServer: string, sessionId: string, stream: MediaStream, perso: Perso, clientTools: Array<ChatTool>);
235
+ * @param options Optional configuration.
236
+ * @param options.stream Local audio stream for legacy bidirectional mode.
237
+ * @param options.legacyVoiceChatMode Whether legacy voice chat mode is enabled.
238
+ */
239
+ constructor(apiServer: string, sessionId: string, perso: Perso | null, clientTools: Array<ChatTool>, options?: {
240
+ stream?: MediaStream;
241
+ legacyVoiceChatMode?: boolean;
242
+ });
194
243
  private llmJob;
195
244
  /**
196
245
  * Sends a user utterance through Perso's internal LLM and speaks the result
@@ -202,32 +251,67 @@ declare class Session {
202
251
  * - Maintains `messageHistory` for subsequent LLM calls.
203
252
  */
204
253
  processChat(message: string): Promise<void>;
205
- /**
206
- * Plays back a response produced by a custom/external LLM without calling
207
- * the built-in Perso LLM pipeline.
208
- * @param message Assistant response generated externally.
209
- * @remarks
210
- * - Does not mutate `messageHistory`.
211
- * - Does not emit chat-log updates.
212
- * - Does not toggle the `LLM` chat state.
213
- */
254
+ processLLM(options: ProcessLLMOptions): AsyncGenerator<LLMStreamChunk>;
255
+ getMessageHistory(): ReadonlyArray<object>;
256
+ /** @deprecated Use processTTSTF() with explicit history management instead. */
214
257
  processCustomChat(message: string): void;
215
258
  /**
216
259
  * Sends an assistant message to the LLM history and triggers TTSTF playback.
217
260
  * @param message Assistant output that should be spoken immediately.
218
261
  */
219
262
  processTTSTF(message: string): void;
263
+ transcribeAudio(audio: Blob | File, language?: string): Promise<string>;
264
+ processSTF(file: Blob, format: string, message: string): Promise<string>;
265
+ processTTS(message: string, options?: {
266
+ resample?: boolean;
267
+ }): Promise<Blob | undefined>;
220
268
  /**
221
269
  * Triggers the recording state and instructs Perso to buffer microphone
222
270
  * audio for speech-to-text.
271
+ *
272
+ * In legacy mode this sends a `record-start` DataChannel message to the
273
+ * server which begins buffering the bidirectional audio stream.
274
+ *
223
275
  * @returns Result of `perso.recordStart()`.
276
+ * @deprecated Use startProcessSTT() instead. Legacy voice chat mode will be removed in a future version.
224
277
  */
225
278
  startVoiceChat(): void;
226
279
  /**
227
280
  * Stops the microphone capture, transitions the UI to analyzing, and sends
228
281
  * the buffered audio to STT.
282
+ *
283
+ * In legacy mode this sends a `record-end-stt` DataChannel message. The
284
+ * server responds with a `"stt"` message which is handled by the
285
+ * `setMessageCallback("stt")` listener in the constructor, triggering
286
+ * `processChat` automatically.
287
+ *
288
+ * @deprecated Use stopProcessSTT() instead. Legacy voice chat mode will be removed in a future version.
229
289
  */
230
290
  stopVoiceChat(): void;
291
+ /**
292
+ * Starts recording audio for STT processing.
293
+ * Uses Web Audio API internally to capture microphone input and encode to WAV format.
294
+ * @param timeout Optional timeout in milliseconds to automatically stop recording.
295
+ * @throws Error if already recording or if microphone access is denied.
296
+ */
297
+ startProcessSTT(timeout?: number): Promise<void>;
298
+ /**
299
+ * Recorded audio file from the last STT recording, or `null` if there is none.
300
+ */
301
+ lastRecordedAudioFile: File | null;
302
+ /**
303
+ * Stops STT recording and sends the audio to the STT API for transcription.
304
+ * @param language Optional language code for STT (e.g., 'ko', 'en').
305
+ * @returns Promise resolving to the transcribed text.
306
+ * @throws STTError if the API call fails.
307
+ * @throws Error if not currently recording.
308
+ */
309
+ stopProcessSTT(language?: string): Promise<string>;
310
+ /**
311
+ * Checks if STT recording is currently in progress or has audio pending processing.
312
+ * @returns True if recording is active or audio is pending from timeout.
313
+ */
314
+ isSTTRecording(): boolean;
231
315
  /**
232
316
  * Resizes the avatar video canvas on the remote renderer.
233
317
  * @param width Target width in CSS pixels.
@@ -244,16 +328,18 @@ declare class Session {
244
328
  * @param element Target video element.
245
329
  */
246
330
  setSrc(element: HTMLVideoElement): void;
247
- /**
248
- * Returns the local microphone stream associated with the session.
249
- * @returns Local `MediaStream`.
250
- */
251
- getLocalStream(): MediaStream;
252
331
  /**
253
332
  * Returns the first remote stream exposed by the Perso renderer.
254
333
  * @returns Remote `MediaStream`, or `undefined`.
255
334
  */
256
- getRemoteStream(): MediaStream;
335
+ getRemoteStream(): MediaStream | undefined;
336
+ /**
337
+ * Returns the local microphone stream associated with the session.
338
+ * Only available in legacy voice chat mode.
339
+ * @returns Local `MediaStream` or `null` if not in legacy mode.
340
+ * @deprecated Legacy voice chat mode will be removed in a future version.
341
+ */
342
+ getLocalStream(): MediaStream | null;
257
343
  /**
258
344
  * Gracefully closes the session and remote connection.
259
345
  */
@@ -261,6 +347,11 @@ declare class Session {
261
347
  /**
262
348
  * Subscribes to Perso status events and notifies the caller when the session
263
349
  * closes (distinguishing manual/automatic closure).
350
+ *
351
+ * In non-WebRTC mode (perso is null), the callback is never invoked and a
352
+ * no-op unsubscribe is returned. Use `setErrorHandler` to detect session
353
+ * termination caused by heartbeat failure instead.
354
+ *
264
355
  * @param callback Invoked with `true` when closed manually.
265
356
  * @returns Function to unsubscribe the listener.
266
357
  */
@@ -294,17 +385,6 @@ declare class Session {
294
385
  * @returns Session identifier assigned by the backend.
295
386
  */
296
387
  getSessionId(): string;
297
- /**
298
- * Streams responses from the Perso LLM endpoint, handles tool calls, and
299
- * updates chat history/state accordingly.
300
- * @param message Optional user message array or string injected ahead of the
301
- * pending history (null when recursively continuing after tool calls).
302
- * @remarks
303
- * - Accumulates `type: "message"` chunks until a non-message event arrives.
304
- * - When tool calls are returned, executes client tools (and recursively calls
305
- * itself if follow-up LLM output is required).
306
- * - Adds every spoken assistant message to the chat log and messageHistory.
307
- */
308
388
  private processChatInternal;
309
389
  /**
310
390
  * Looks up a tool definition by the function name provided in a tool_call.
@@ -313,11 +393,12 @@ declare class Session {
313
393
  * @returns Matching `ChatTool` or null.
314
394
  */
315
395
  private getChatTool;
316
- private llmCancel;
317
396
  /**
318
397
  * Cancels any in-flight LLM stream by flipping the cancellation flag and
319
398
  * awaiting the pending promise if necessary.
320
399
  */
400
+ private llmCancel;
401
+ private pipelineSuppressed;
321
402
  private clearLLMJob;
322
403
  /**
323
404
  * Filters/sanitizes text and sends it to Perso's TTSTF endpoint while toggling
@@ -365,6 +446,8 @@ declare class Session {
365
446
  * Gracefully closes the underlying Perso connection on behalf of the session.
366
447
  */
367
448
  private close;
449
+ private startHeartbeat;
450
+ private stopHeartbeat;
368
451
  /**
369
452
  * Strips emoji characters that TTSTF may not render correctly.
370
453
  * @param str Text to sanitize.
@@ -373,6 +456,109 @@ declare class Session {
373
456
  private removeEmoji;
374
457
  }
375
458
 
459
+ /**
460
+ * Callbacks that LlmProcessor uses to notify the host of side effects.
461
+ */
462
+ interface LlmProcessorCallbacks {
463
+ onChatStateChange: (add: ChatState | null, remove: ChatState | null) => void;
464
+ onError: (error: Error) => void;
465
+ onChatLog: (message: string, isUser: boolean) => void;
466
+ onTTSTF: (message: string) => void;
467
+ }
468
+ /**
469
+ * Configuration for LlmProcessor construction.
470
+ */
471
+ interface LlmProcessorConfig {
472
+ apiServer: string;
473
+ sessionId: string;
474
+ clientTools: Array<ChatTool>;
475
+ callbacks: LlmProcessorCallbacks;
476
+ }
477
+ /**
478
+ * Handles LLM streaming, SSE parsing, tool execution, and message history
479
+ * management as a standalone module.
480
+ */
481
+ declare class LlmProcessor {
482
+ private config;
483
+ private messageHistory;
484
+ constructor(config: LlmProcessorConfig);
485
+ /**
486
+ * Streams LLM responses as an AsyncGenerator, yielding {@link LLMStreamChunk}
487
+ * discriminated by `type`: `message`, `tool_call`, `tool_result`, `error`.
488
+ *
489
+ * Consumers get pull-based control over the stream — backpressure,
490
+ * early exit via `break`, and `AbortSignal` cancellation are handled
491
+ * naturally by the generator protocol.
492
+ *
493
+ * **Yield strategy**: message-type SSE events within a single `reader.read()`
494
+ * are batched into one `message` chunk (accumulated `chunks[]` + `message`).
495
+ * Non-message events (`tool_call`, `tool`) flush pending message chunks first
496
+ * to preserve ordering.
497
+ *
498
+ * **Tool execution** happens internally — `tool_call` and `tool_result` chunks
499
+ * are yielded for observability. If tools require a follow-up LLM call,
500
+ * the generator loops transparently.
501
+ *
502
+ * @param options - Message, optional tool overrides, and optional AbortSignal.
503
+ * @yields {LLMStreamChunk} Streaming chunks. The final `message` chunk
504
+ * has `finish: true` and contains the complete `chunks[]` / `message`.
505
+ * @throws {Error} If `options.message` is empty.
506
+ * @throws {LLMError} Re-thrown when the initial fetch fails with a non-API error.
507
+ */
508
+ processLLM(options: ProcessLLMOptions): AsyncGenerator<LLMStreamChunk>;
509
+ private parseSSEStream;
510
+ private executeToolCalls;
511
+ addToHistory(entry: object): void;
512
+ getHistory(): ReadonlyArray<object>;
513
+ }
514
+
515
+ interface WavRecorderOptions {
516
+ channels?: number;
517
+ targetSampleRate?: number;
518
+ }
519
+ /**
520
+ * Records audio from the microphone and produces WAV files using Web Audio API.
521
+ * Uses AudioWorklet (standard API) for cross-browser compatibility.
522
+ *
523
+ * Browser Support:
524
+ * - Chrome 66+
525
+ * - Firefox 76+
526
+ * - Safari 14.1+
527
+ * - iOS Safari 14.5+
528
+ * - Edge 79+
529
+ */
530
+ declare class WavRecorder {
531
+ private audioContext;
532
+ private mediaStream;
533
+ private workletNode;
534
+ private sourceNode;
535
+ private audioChunks;
536
+ private isRecordingState;
537
+ private channels;
538
+ private targetSampleRate;
539
+ constructor(options?: WavRecorderOptions);
540
+ /**
541
+ * Starts recording audio from the microphone.
542
+ * Requests microphone permission via getUserMedia.
543
+ * @throws Error if already recording or if microphone access is denied.
544
+ */
545
+ start(): Promise<void>;
546
+ /**
547
+ * Stops recording and returns the recorded audio as a WAV File.
548
+ * Uses bidirectional communication with AudioWorklet to ensure all audio data is captured.
549
+ * @returns Promise resolving to a File containing the recorded WAV audio.
550
+ * @throws Error if not currently recording.
551
+ */
552
+ stop(): Promise<File>;
553
+ isRecording(): boolean;
554
+ }
555
+ /**
556
+ * Factory function to create a WavRecorder.
557
+ * @param options Optional configuration.
558
+ * @returns A new WavRecorder instance.
559
+ */
560
+ declare function createWavRecorder(options?: WavRecorderOptions): WavRecorder;
561
+
376
562
  /**
377
563
  * Retrieves the list of available LLM providers from the API.
378
564
  * @param apiServer Perso API server URL.
@@ -440,15 +626,13 @@ declare function getAllSettings(apiServer: string, apiKey: string): Promise<{
440
626
  mcpServers: any;
441
627
  }>;
442
628
  /**
443
- * Wraps the lower-level `session.createSession` helper so callers can import
444
- * from this module.
445
- * @param apiServer Perso API server URL.
446
- * @param sessionId Session id to attach to.
447
- * @param width Avatar canvas width.
448
- * @param height Avatar canvas height.
449
- * @param enableVoiceChat Whether microphone capture should be enabled.
450
- * @param clientTools Client-side tools available for function calling.
451
- * @returns Initialized Session.
629
+ * Creates a Session with REST-based STT/TTS (current mode).
630
+ */
631
+ declare function createSession(apiServer: string, sessionId: string, width: number, height: number, clientTools: Array<ChatTool>): Promise<Session>;
632
+ /**
633
+ * Creates a Session with bidirectional WebRTC audio (legacy mode).
634
+ * @deprecated Legacy voice chat mode will be removed in a future version.
635
+ * Use the 5-argument overload with REST-based STT/TTS instead.
452
636
  */
453
637
  declare function createSession(apiServer: string, sessionId: string, width: number, height: number, enableVoiceChat: boolean, clientTools: Array<ChatTool>): Promise<Session>;
454
638
  /**
@@ -515,6 +699,22 @@ declare class LLMStreamingResponseError extends Error {
515
699
  description: string;
516
700
  constructor(description: string);
517
701
  }
702
+ declare class STTError extends Error {
703
+ underlyingError: ApiError;
704
+ constructor(underlyingError: ApiError);
705
+ }
706
+ declare class TTSError extends Error {
707
+ underlyingError: ApiError | TTSDecodeError;
708
+ constructor(underlyingError: ApiError | TTSDecodeError);
709
+ }
710
+ declare class TTSDecodeError extends Error {
711
+ description: string;
712
+ constructor(description: string);
713
+ }
714
+
715
+ declare function getWavSampleRate(arrayBuffer: ArrayBuffer): number;
716
+
717
+ declare const TTS_TARGET_SAMPLE_RATE = 16000;
518
718
 
519
- export { ApiError, ChatState, ChatTool, LLMError, LLMStreamingResponseError, Session, createSession, createSessionId, getAllSettings, getBackgroundImages, getDocuments, getLLMs, getMcpServers, getModelStyles, getPrompts, getSTTs, getSessionInfo, getTTSs };
520
- export type { Chat };
719
+ export { ApiError, ChatState, ChatTool, LLMError, LLMStreamingResponseError, LlmProcessor, STTError, Session, TTSDecodeError, TTSError, TTS_TARGET_SAMPLE_RATE, WavRecorder, createSession, createSessionId, createWavRecorder, getAllSettings, getBackgroundImages, getDocuments, getLLMs, getMcpServers, getModelStyles, getPrompts, getSTTs, getSessionInfo, getTTSs, getWavSampleRate };
720
+ export type { Chat, LLMStreamChunk, LlmProcessorCallbacks, LlmProcessorConfig, ProcessLLMOptions, WavRecorderOptions };