@estuary-ai/sdk 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -74,6 +74,50 @@ await client.startVoice();
74
74
  client.toggleMute();
75
75
  ```
76
76
 
77
+ ### Interrupts
78
+
79
+ Interrupt the bot's current response (stops audio playback and generation):
80
+
81
+ ```typescript
82
+ client.interrupt(); // interrupt current response
83
+ client.interrupt('msg_abc123'); // interrupt a specific message
84
+ ```
85
+
86
+ ### Vision / Camera
87
+
88
+ Send images for vision processing. The server may also request captures via the `cameraCaptureRequest` event.
89
+
90
+ ```typescript
91
+ // Send a camera image proactively
92
+ client.sendCameraImage(base64Image, 'image/jpeg');
93
+
94
+ // Respond to a server-initiated capture request
95
+ client.on('cameraCaptureRequest', (request) => {
96
+ const image = captureFrame(); // your capture logic
97
+ client.sendCameraImage(image, 'image/jpeg', request.requestId, request.text);
98
+ });
99
+ ```
100
+
101
+ ### Character Actions
102
+
103
+ Bot responses can include inline action tags (e.g., `<action name="wave" target="user"/>`). The SDK automatically parses these, strips them from `botResponse.text`, and emits `characterAction` events:
104
+
105
+ ```typescript
106
+ client.on('characterAction', (action) => {
107
+ console.log(action.name); // e.g., "wave"
108
+ console.log(action.params); // e.g., { target: "user" }
109
+ console.log(action.messageId); // originating message
110
+ });
111
+ ```
112
+
113
+ For non-streaming contexts, use the `parseActions` utility:
114
+
115
+ ```typescript
116
+ import { parseActions } from '@estuary-ai/sdk';
117
+
118
+ const { actions, cleanText } = parseActions(rawBotText);
119
+ ```
120
+
77
121
  ### Memory & Knowledge Graph
78
122
 
79
123
  ```typescript
@@ -81,6 +125,9 @@ const memories = await client.memory.getMemories({ status: 'active', limit: 50 }
81
125
  const facts = await client.memory.getCoreFacts();
82
126
  const graph = await client.memory.getGraph({ includeEntities: true });
83
127
  const results = await client.memory.search('favorite food');
128
+ const timeline = await client.memory.getTimeline({ groupBy: 'week' });
129
+ const stats = await client.memory.getStats();
130
+ await client.memory.deleteAll(true); // pass true to confirm
84
131
  ```
85
132
 
86
133
  ### Real-Time Memory Extraction
@@ -109,17 +156,69 @@ await client.connect();
109
156
  ## Events
110
157
 
111
158
  ```typescript
159
+ // Connection
112
160
  client.on('connected', (session) => { /* authenticated */ });
113
161
  client.on('disconnected', (reason) => { /* lost connection */ });
114
- client.on('botResponse', (response) => { /* streaming text */ });
162
+ client.on('reconnecting', (attempt) => { /* reconnect attempt number */ });
163
+ client.on('connectionStateChanged', (state) => { /* ConnectionState enum */ });
164
+ client.on('authError', (error) => { /* authentication failed */ });
165
+
166
+ // Conversation
167
+ client.on('botResponse', (response) => { /* streaming text (actions auto-stripped) */ });
115
168
  client.on('botVoice', (voice) => { /* audio chunk */ });
116
- client.on('sttResponse', (stt) => { /* speech-to-text */ });
169
+ client.on('sttResponse', (stt) => { /* speech-to-text transcript */ });
117
170
  client.on('interrupt', (data) => { /* response interrupted */ });
171
+ client.on('characterAction', (action) => { /* parsed action from bot response */ });
172
+ client.on('cameraCaptureRequest', (request) => { /* server requests a camera image */ });
173
+
174
+ // Voice
175
+ client.on('voiceStarted', () => { /* voice session began */ });
176
+ client.on('voiceStopped', () => { /* voice session ended */ });
177
+ client.on('livekitConnected', (room) => { /* joined LiveKit room */ });
178
+ client.on('livekitDisconnected', () => { /* left LiveKit room */ });
179
+
180
+ // Audio playback
181
+ client.on('audioPlaybackStarted', (messageId) => { /* bot audio started playing */ });
182
+ client.on('audioPlaybackComplete', (messageId) => { /* bot audio finished playing */ });
183
+
184
+ // Memory
118
185
  client.on('memoryUpdated', (event) => { /* real-time memory extraction */ });
186
+
187
+ // Errors & limits
119
188
  client.on('error', (error) => { /* EstuaryError */ });
120
189
  client.on('quotaExceeded', (data) => { /* rate limited */ });
121
190
  ```
122
191
 
192
+ ## Error Handling
193
+
194
+ Errors are instances of `EstuaryError` with a typed `code` field:
195
+
196
+ ```typescript
197
+ import { EstuaryError, ErrorCode } from '@estuary-ai/sdk';
198
+
199
+ client.on('error', (error) => {
200
+ if (error instanceof EstuaryError) {
201
+ switch (error.code) {
202
+ case ErrorCode.NOT_CONNECTED:
203
+ case ErrorCode.CONNECTION_FAILED:
204
+ case ErrorCode.CONNECTION_TIMEOUT:
205
+ // connection issues
206
+ break;
207
+ case ErrorCode.AUTH_FAILED:
208
+ // bad API key or character ID
209
+ break;
210
+ case ErrorCode.MICROPHONE_DENIED:
211
+ // user denied mic permission
212
+ break;
213
+ }
214
+ }
215
+ });
216
+
217
+ client.on('authError', (message) => {
218
+ console.error('Authentication failed:', message);
219
+ });
220
+ ```
221
+
123
222
  ## Configuration
124
223
 
125
224
  ```typescript
@@ -135,9 +234,46 @@ interface EstuaryConfig {
135
234
  debug?: boolean; // Default: false
136
235
  voiceTransport?: 'websocket' | 'livekit' | 'auto'; // Default: 'auto'
137
236
  realtimeMemory?: boolean; // Enable real-time memory extraction events. Default: false
237
+ suppressMicDuringPlayback?: boolean; // Mute mic while bot audio plays (software AEC fallback; disables barge-in). Default: false
138
238
  }
139
239
  ```
140
240
 
241
+ ## Exports
242
+
243
+ Key exports for TypeScript users:
244
+
245
+ ```typescript
246
+ // Client
247
+ import { EstuaryClient } from '@estuary-ai/sdk';
248
+
249
+ // Errors
250
+ import { EstuaryError, ErrorCode } from '@estuary-ai/sdk';
251
+
252
+ // Enums
253
+ import { ConnectionState } from '@estuary-ai/sdk';
254
+
255
+ // Utilities
256
+ import { parseActions } from '@estuary-ai/sdk';
257
+
258
+ // Types (import type)
259
+ import type {
260
+ EstuaryConfig,
261
+ SessionInfo,
262
+ BotResponse,
263
+ BotVoice,
264
+ SttResponse,
265
+ InterruptData,
266
+ CameraCaptureRequest,
267
+ CharacterAction,
268
+ QuotaExceededData,
269
+ MemoryData,
270
+ MemoryUpdatedEvent,
271
+ EstuaryEventMap,
272
+ ParsedAction,
273
+ MemoryClient,
274
+ } from '@estuary-ai/sdk';
275
+ ```
276
+
141
277
  ## Requirements
142
278
 
143
279
  - Node.js 18+ or modern browser
package/dist/index.d.mts CHANGED
@@ -21,6 +21,8 @@ interface EstuaryConfig {
21
21
  voiceTransport?: VoiceTransport;
22
22
  /** Enable real-time memory extraction after each response (default: false) */
23
23
  realtimeMemory?: boolean;
24
+ /** Suppress mic during TTS playback (software AEC fallback, disables barge-in). Default: false */
25
+ suppressMicDuringPlayback?: boolean;
24
26
  }
25
27
  type VoiceTransport = 'websocket' | 'livekit' | 'auto';
26
28
  declare enum ConnectionState {
@@ -96,6 +98,14 @@ interface MemoryUpdatedEvent {
96
98
  newMemories: MemoryData[];
97
99
  timestamp: string;
98
100
  }
101
+ interface CharacterAction {
102
+ /** Action name (e.g., "follow_user", "sit", "look_at") */
103
+ name: string;
104
+ /** Action parameters as key-value pairs */
105
+ params: Record<string, string>;
106
+ /** Message ID of the bot response that contained this action */
107
+ messageId: string;
108
+ }
99
109
  type EstuaryEventMap = {
100
110
  connected: (session: SessionInfo) => void;
101
111
  disconnected: (reason: string) => void;
@@ -109,6 +119,7 @@ type EstuaryEventMap = {
109
119
  authError: (error: string) => void;
110
120
  quotaExceeded: (data: QuotaExceededData) => void;
111
121
  cameraCaptureRequest: (request: CameraCaptureRequest) => void;
122
+ characterAction: (action: CharacterAction) => void;
112
123
  voiceStarted: () => void;
113
124
  voiceStopped: () => void;
114
125
  livekitConnected: (room: string) => void;
@@ -121,6 +132,8 @@ interface VoiceManager {
121
132
  start(): Promise<void>;
122
133
  stop(): Promise<void>;
123
134
  toggleMute(): void;
135
+ /** Suppress audio sending (software AEC). No-op if not supported. */
136
+ setSuppressed?(suppressed: boolean): void;
124
137
  readonly isMuted: boolean;
125
138
  readonly isActive: boolean;
126
139
  dispose(): void;
@@ -229,6 +242,7 @@ declare class EstuaryClient extends TypedEventEmitter<EstuaryEventMap> {
229
242
  private audioPlayer;
230
243
  private _memory;
231
244
  private _sessionInfo;
245
+ private actionParsers;
232
246
  constructor(config: EstuaryConfig);
233
247
  /** Memory API client for querying memories, graphs, and facts */
234
248
  get memory(): MemoryClient;
@@ -266,6 +280,7 @@ declare class EstuaryClient extends TypedEventEmitter<EstuaryEventMap> {
266
280
  get isVoiceActive(): boolean;
267
281
  private ensureConnected;
268
282
  private forwardSocketEvents;
283
+ private handleBotResponse;
269
284
  private handleBotVoice;
270
285
  }
271
286
 
@@ -289,4 +304,23 @@ declare class EstuaryError extends Error {
289
304
  constructor(code: ErrorCode, message: string, details?: unknown);
290
305
  }
291
306
 
292
- export { type BotResponse, type BotVoice, type CameraCaptureRequest, ConnectionState, type CoreFactsResponse, ErrorCode, EstuaryClient, type EstuaryConfig, EstuaryError, type EstuaryEventMap, type InterruptData, type LiveKitTokenResponse, MemoryClient, type MemoryData, type MemoryGraphOptions, type MemoryGraphResponse, type MemoryListOptions, type MemoryListResponse, type MemorySearchOptions, type MemorySearchResponse, type MemoryStatsResponse, type MemoryTimelineOptions, type MemoryTimelineResponse, type MemoryUpdatedEvent, type QuotaExceededData, type SessionInfo, type SttResponse, type VoiceManager, type VoiceTransport };
307
+ /**
308
+ * An action parsed from an `<action name="..." .../>` XML tag in bot response text.
309
+ *
310
+ * During streaming the SDK re-parses the accumulated text on each chunk and
311
+ * reports only the actions newly discovered since the previous chunk.
312
+ */
313
+ interface ParsedAction {
314
+ name: string;
315
+ params: Record<string, string>;
316
+ }
317
+ /**
318
+ * One-shot parse: extract all actions and return clean text.
319
+ * Useful for non-streaming contexts.
320
+ */
321
+ declare function parseActions(text: string): {
322
+ actions: ParsedAction[];
323
+ cleanText: string;
324
+ };
325
+
326
+ export { type BotResponse, type BotVoice, type CameraCaptureRequest, type CharacterAction, ConnectionState, type CoreFactsResponse, ErrorCode, EstuaryClient, type EstuaryConfig, EstuaryError, type EstuaryEventMap, type InterruptData, type LiveKitTokenResponse, MemoryClient, type MemoryData, type MemoryGraphOptions, type MemoryGraphResponse, type MemoryListOptions, type MemoryListResponse, type MemorySearchOptions, type MemorySearchResponse, type MemoryStatsResponse, type MemoryTimelineOptions, type MemoryTimelineResponse, type MemoryUpdatedEvent, type ParsedAction, type QuotaExceededData, type SessionInfo, type SttResponse, type VoiceManager, type VoiceTransport, parseActions };
package/dist/index.js CHANGED
@@ -4889,6 +4889,7 @@ var init_websocket_voice = __esm({
4889
4889
  scriptProcessor = null;
4890
4890
  sourceNode = null;
4891
4891
  _isMuted = false;
4892
+ _isSuppressed = false;
4892
4893
  _isActive = false;
4893
4894
  constructor(socketManager, sampleRate, logger) {
4894
4895
  this.socketManager = socketManager;
@@ -4911,7 +4912,13 @@ var init_websocket_voice = __esm({
4911
4912
  let stream;
4912
4913
  try {
4913
4914
  stream = await navigator.mediaDevices.getUserMedia({
4914
- audio: { sampleRate: this.sampleRate, channelCount: 1 }
4915
+ audio: {
4916
+ sampleRate: this.sampleRate,
4917
+ channelCount: 1,
4918
+ echoCancellation: true,
4919
+ noiseSuppression: true,
4920
+ autoGainControl: true
4921
+ }
4915
4922
  });
4916
4923
  } catch (err) {
4917
4924
  throw new exports.EstuaryError(
@@ -4928,7 +4935,7 @@ var init_websocket_voice = __esm({
4928
4935
  const nativeRate = this.audioContext.sampleRate;
4929
4936
  const targetRate = this.sampleRate;
4930
4937
  this.scriptProcessor.onaudioprocess = (event) => {
4931
- if (this._isMuted) return;
4938
+ if (this._isMuted || this._isSuppressed) return;
4932
4939
  const inputData = event.inputBuffer.getChannelData(0);
4933
4940
  let pcmFloat;
4934
4941
  if (nativeRate !== targetRate) {
@@ -4958,6 +4965,7 @@ var init_websocket_voice = __esm({
4958
4965
  this.cleanup();
4959
4966
  this._isActive = false;
4960
4967
  this._isMuted = false;
4968
+ this._isSuppressed = false;
4961
4969
  this.logger.debug("WebSocket voice stopped");
4962
4970
  }
4963
4971
  toggleMute() {
@@ -4968,10 +4976,15 @@ var init_websocket_voice = __esm({
4968
4976
  }
4969
4977
  this.logger.debug("Mute toggled:", this._isMuted);
4970
4978
  }
4979
+ setSuppressed(suppressed) {
4980
+ this._isSuppressed = suppressed;
4981
+ this.logger.debug("Audio suppression:", suppressed ? "on" : "off");
4982
+ }
4971
4983
  dispose() {
4972
4984
  this.cleanup();
4973
4985
  this._isActive = false;
4974
4986
  this._isMuted = false;
4987
+ this._isSuppressed = false;
4975
4988
  }
4976
4989
  cleanup() {
4977
4990
  if (this.scriptProcessor) {
@@ -9310,6 +9323,53 @@ var Logger = class {
9310
9323
 
9311
9324
  // src/client.ts
9312
9325
  init_errors();
9326
+
9327
+ // src/utils/action-parser.ts
9328
+ var ACTION_TAG_RE = /<action\s+([^>]*?)\/>/gi;
9329
+ var ATTR_RE = /(\w+)\s*=\s*"([^"]*)"|(\w+)\s*=\s*'([^']*)'/g;
9330
+ function parseAttributes(attrString) {
9331
+ const attrs = {};
9332
+ let match;
9333
+ while ((match = ATTR_RE.exec(attrString)) !== null) {
9334
+ const key = match[1] ?? match[3];
9335
+ const value2 = match[2] ?? match[4];
9336
+ attrs[key] = value2;
9337
+ }
9338
+ return attrs;
9339
+ }
9340
+ var StreamingActionParser = class {
9341
+ emittedCount = 0;
9342
+ /**
9343
+ * Parse the accumulated response text and return any new actions found
9344
+ * since the last call. Also returns the text with all action tags stripped, whitespace runs collapsed to a single space, and the result trimmed.
9345
+ */
9346
+ parse(accumulatedText) {
9347
+ const allActions = [];
9348
+ let match;
9349
+ ACTION_TAG_RE.lastIndex = 0;
9350
+ while ((match = ACTION_TAG_RE.exec(accumulatedText)) !== null) {
9351
+ const attrs = parseAttributes(match[1]);
9352
+ const name = attrs.name;
9353
+ if (name) {
9354
+ delete attrs.name;
9355
+ allActions.push({ name, params: attrs });
9356
+ }
9357
+ }
9358
+ const newActions = allActions.slice(this.emittedCount);
9359
+ this.emittedCount = allActions.length;
9360
+ const cleanText = accumulatedText.replace(ACTION_TAG_RE, "").replace(/\s{2,}/g, " ").trim();
9361
+ return { actions: newActions, cleanText };
9362
+ }
9363
+ reset() {
9364
+ this.emittedCount = 0;
9365
+ }
9366
+ };
9367
+ function parseActions(text) {
9368
+ const parser = new StreamingActionParser();
9369
+ return parser.parse(text);
9370
+ }
9371
+
9372
+ // src/client.ts
9313
9373
  var DEFAULT_SAMPLE_RATE = 16e3;
9314
9374
  var EstuaryClient = class extends TypedEventEmitter {
9315
9375
  config;
@@ -9319,6 +9379,7 @@ var EstuaryClient = class extends TypedEventEmitter {
9319
9379
  audioPlayer = null;
9320
9380
  _memory;
9321
9381
  _sessionInfo = null;
9382
+ actionParsers = /* @__PURE__ */ new Map();
9322
9383
  constructor(config) {
9323
9384
  super();
9324
9385
  this.config = config;
@@ -9370,6 +9431,9 @@ var EstuaryClient = class extends TypedEventEmitter {
9370
9431
  this.ensureConnected();
9371
9432
  this.socketManager.emitEvent("client_interrupt", { message_id: messageId });
9372
9433
  this.audioPlayer?.clear();
9434
+ if (this.config.suppressMicDuringPlayback) {
9435
+ this.voiceManager?.setSuppressed?.(false);
9436
+ }
9373
9437
  }
9374
9438
  /** Send a camera image for vision processing */
9375
9439
  sendCameraImage(imageBase64, mimeType, requestId, text) {
@@ -9408,9 +9472,15 @@ var EstuaryClient = class extends TypedEventEmitter {
9408
9472
  this.audioPlayer = new AudioPlayer(sampleRate, (event) => {
9409
9473
  if (event.type === "started") {
9410
9474
  this.emit("audioPlaybackStarted", event.messageId);
9475
+ if (this.config.suppressMicDuringPlayback) {
9476
+ this.voiceManager?.setSuppressed?.(true);
9477
+ }
9411
9478
  } else if (event.type === "complete") {
9412
9479
  this.emit("audioPlaybackComplete", event.messageId);
9413
9480
  this.notifyAudioPlaybackComplete(event.messageId);
9481
+ if (this.config.suppressMicDuringPlayback) {
9482
+ this.voiceManager?.setSuppressed?.(false);
9483
+ }
9414
9484
  }
9415
9485
  });
9416
9486
  }
@@ -9454,15 +9524,20 @@ var EstuaryClient = class extends TypedEventEmitter {
9454
9524
  });
9455
9525
  this.socketManager.on("disconnected", (reason) => {
9456
9526
  this._sessionInfo = null;
9527
+ this.actionParsers.clear();
9457
9528
  this.emit("disconnected", reason);
9458
9529
  });
9459
9530
  this.socketManager.on("reconnecting", (attempt) => this.emit("reconnecting", attempt));
9460
9531
  this.socketManager.on("connectionStateChanged", (state) => this.emit("connectionStateChanged", state));
9461
- this.socketManager.on("botResponse", (response) => this.emit("botResponse", response));
9532
+ this.socketManager.on("botResponse", (response) => this.handleBotResponse(response));
9462
9533
  this.socketManager.on("botVoice", (voice) => this.handleBotVoice(voice));
9463
9534
  this.socketManager.on("sttResponse", (response) => this.emit("sttResponse", response));
9464
9535
  this.socketManager.on("interrupt", (data) => {
9465
9536
  this.audioPlayer?.clear();
9537
+ this.actionParsers.clear();
9538
+ if (this.config.suppressMicDuringPlayback) {
9539
+ this.voiceManager?.setSuppressed?.(false);
9540
+ }
9466
9541
  this.emit("interrupt", data);
9467
9542
  });
9468
9543
  this.socketManager.on("error", (error) => this.emit("error", error));
@@ -9473,6 +9548,28 @@ var EstuaryClient = class extends TypedEventEmitter {
9473
9548
  this.socketManager.on("livekitDisconnected", () => this.emit("livekitDisconnected"));
9474
9549
  this.socketManager.on("memoryUpdated", (event) => this.emit("memoryUpdated", event));
9475
9550
  }
9551
+ handleBotResponse(response) {
9552
+ const { messageId } = response;
9553
+ if (!this.actionParsers.has(messageId)) {
9554
+ this.actionParsers.set(messageId, new StreamingActionParser());
9555
+ }
9556
+ const parser = this.actionParsers.get(messageId);
9557
+ const { actions, cleanText } = parser.parse(response.text);
9558
+ for (const action of actions) {
9559
+ this.emit("characterAction", {
9560
+ name: action.name,
9561
+ params: action.params,
9562
+ messageId
9563
+ });
9564
+ }
9565
+ this.emit("botResponse", {
9566
+ ...response,
9567
+ text: cleanText
9568
+ });
9569
+ if (response.isFinal) {
9570
+ this.actionParsers.delete(messageId);
9571
+ }
9572
+ }
9476
9573
  handleBotVoice(voice) {
9477
9574
  this.emit("botVoice", voice);
9478
9575
  this.audioPlayer?.enqueue(voice);
@@ -9500,5 +9597,6 @@ xmlhttprequest-ssl/lib/XMLHttpRequest.js:
9500
9597
 
9501
9598
  exports.ConnectionState = ConnectionState;
9502
9599
  exports.EstuaryClient = EstuaryClient;
9600
+ exports.parseActions = parseActions;
9503
9601
  //# sourceMappingURL=index.js.map
9504
9602
  //# sourceMappingURL=index.js.map