@mastra/voice-aws-nova-sonic 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1539 @@
1
+ 'use strict';
2
+
3
+ var stream = require('stream');
4
+ var crypto = require('crypto');
5
+ var voice = require('@mastra/core/voice');
6
+ var clientBedrockRuntime = require('@aws-sdk/client-bedrock-runtime');
7
+ var nodeHttpHandler = require('@smithy/node-http-handler');
8
+ var credentialProviderNode = require('@aws-sdk/credential-provider-node');
9
+
10
+ // src/index.ts
11
+
12
+ // src/types.ts
13
/**
 * Machine-readable error codes used by this package.
 *
 * Each member maps an upper-case name to the lower-case string that is stored
 * in `NovaSonicError#code` and passed as the first constructor argument of
 * `NovaSonicError`.
 */
var NovaSonicErrorCode = {
  // Connection / authentication
  CONNECTION_FAILED: "connection_failed",
  CONNECTION_NOT_ESTABLISHED: "connection_not_established",
  AUTHENTICATION_FAILED: "authentication_failed",
  CREDENTIALS_MISSING: "credentials_missing",
  REGION_INVALID: "region_invalid",
  WEBSOCKET_ERROR: "websocket_error",
  // Audio handling
  AUDIO_PROCESSING_ERROR: "audio_processing_error",
  AUDIO_STREAM_ERROR: "audio_stream_error",
  SPEAKER_STREAM_ERROR: "speaker_stream_error",
  // Transcription
  TRANSCRIPTION_TIMEOUT: "transcription_timeout",
  TRANSCRIPTION_FAILED: "transcription_failed",
  // Tools
  TOOL_EXECUTION_ERROR: "tool_execution_error",
  TOOL_NOT_FOUND: "tool_not_found",
  // Session / state / validation
  SESSION_CONFIG_UPDATE_FAILED: "session_config_update_failed",
  INVALID_AUDIO_FORMAT: "invalid_audio_format",
  NOT_CONNECTED: "not_connected",
  INVALID_STATE: "invalid_state",
  VALIDATION_ERROR: "validation_error",
  UNKNOWN_ERROR: "unknown_error"
};
35
+
36
+ // src/utils/errors.ts
37
/**
 * Error subclass used throughout this package.
 *
 * Besides the standard `message`, every instance carries:
 *  - `code`: the error code string supplied by the caller,
 *  - `details`: an arbitrary caller-supplied payload (may be undefined),
 *  - `timestamp`: `Date.now()` captured at construction time.
 */
var NovaSonicError = class extends Error {
  code;
  details;
  timestamp;
  /**
   * @param code - Error code string
   * @param message - Human-readable description, forwarded to `Error`
   * @param details - Optional extra payload stored as-is
   */
  constructor(code, message, details) {
    super(message);
    Object.assign(this, {
      name: "NovaSonicError",
      code,
      details,
      timestamp: Date.now()
    });
  }
  /**
   * Produce a plain-object snapshot of this error.
   *
   * @returns An object with `message`, `code`, `details` and `timestamp`
   */
  toEventData() {
    const { message, code, details, timestamp } = this;
    return { message, code, details, timestamp };
  }
};
57
/**
 * Resolve the AWS credentials to use for the Bedrock client.
 *
 * Explicit credentials, when truthy, are returned untouched. Otherwise the
 * provider returned by `credentialProviderNode.defaultProvider()` is invoked.
 *
 * @param explicitCredentials - Credentials supplied in the voice config (optional)
 * @param debug - When truthy, progress messages are written with `console.log`
 * @returns The resolved credentials object
 * @throws {NovaSonicError} With code "authentication_failed" when the default
 *   provider rejects; the original error is attached as `details`. A
 *   NovaSonicError raised inside the provider is rethrown unchanged.
 */
async function getAwsCredentials(explicitCredentials, debug) {
  const trace = (text) => {
    if (debug) {
      console.log(text);
    }
  };
  if (explicitCredentials) {
    trace("[getAwsCredentials] Using explicit credentials provided in config");
    return explicitCredentials;
  }
  try {
    trace("[getAwsCredentials] Using default credential provider chain");
    const resolveFromChain = credentialProviderNode.defaultProvider();
    const resolved = await resolveFromChain();
    trace("[getAwsCredentials] Credentials retrieved successfully");
    return resolved;
  } catch (error) {
    if (error instanceof NovaSonicError) {
      throw error;
    }
    const reason = error instanceof Error ? error.message : "Unknown error";
    throw new NovaSonicError(
      "authentication_failed" /* AUTHENTICATION_FAILED */,
      `Failed to load AWS credentials: ${reason}`,
      error
    );
  }
}
84
+
85
+ // src/index.ts
86
+ var DEFAULT_MODEL = "amazon.nova-2-sonic-v1:0"; // Fallback model id used by NovaSonicVoice when the config does not supply one
87
+ var DEFAULT_REGION = "us-east-1"; // Fallback AWS region used by NovaSonicVoice when the config does not supply one
88
+ var NovaSonicVoice = class extends voice.MastraVoice {
89
+ client;
90
+ stream;
91
+ inputStream;
92
+ // Input stream for sending events to AWS
93
+ _eventQueue;
94
+ _signalQueue;
95
+ _closeSignal;
96
+ _promptName;
97
+ state = "disconnected";
98
+ events;
99
+ instructions;
100
+ tools;
101
+ requestContext;
102
+ debug;
103
+ region;
104
+ model;
105
+ credentials;
106
+ speakerStreams;
107
+ currentResponseId;
108
+ processingStream = false;
109
+ streamRestartAttempted = false;
110
+ // Prevent multiple restart attempts
111
+ sessionConfig;
112
+ promptStarted = false;
113
+ // Track if promptStart was sent (now sent during connection)
114
+ audioContentName;
115
+ audioContentStarted = false;
116
+ hasSentContentEnd = false;
117
+ // Track if contentEnd has been sent for current turn
118
+ turnCompleted = false;
119
+ // Track if turn has been completed (to prevent sending contentEnd after turn completion)
120
+ turnCompleteTimeout;
121
+ // Timeout for fallback turn completion
122
+ isReceivingAssistantAudio = false;
123
+ // Track if we're currently receiving assistant audio output
124
+ currentTextGenerationStage;
125
+ // Track generationStage (SPECULATIVE|FINAL) for current text content block
126
+ /**
127
+ * Creates a new instance of NovaSonicVoice.
128
+ *
129
+ * @param config - Configuration options for the voice instance
130
+ * @param config.region - AWS region (defaults to us-east-1)
131
+ * @param config.model - The model ID to use (defaults to amazon.nova-2-sonic-v1:0)
132
+ * @param config.credentials - AWS credentials (optional, uses default credential chain)
133
+ * @param config.speaker - Voice name/identifier
134
+ * @param config.languageCode - Language code for the voice
135
+ * @param config.debug - Enable debug mode
136
+ *
137
+ * @example
138
+ * ```typescript
139
+ * const voice = new NovaSonicVoice({
140
+ * region: 'us-east-1',
141
+ * model: 'amazon.nova-2-sonic-v1:0',
142
+ * speaker: 'default',
143
+ * });
144
+ * ```
145
+ */
146
+ constructor(config = {}) {
147
+ let normalizedConfig;
148
+ if ("realtimeConfig" in config || "speechModel" in config || "listeningModel" in config) {
149
+ normalizedConfig = config;
150
+ } else {
151
+ const configOptions = config;
152
+ normalizedConfig = {
153
+ realtimeConfig: {
154
+ model: configOptions.model || DEFAULT_MODEL,
155
+ apiKey: void 0,
156
+ // AWS doesn't use API keys
157
+ options: configOptions
158
+ },
159
+ speaker: typeof configOptions.speaker === "string" ? configOptions.speaker : "matthew"
160
+ };
161
+ }
162
+ super(normalizedConfig);
163
+ const options = normalizedConfig.realtimeConfig?.options || config;
164
+ this.region = options.region || DEFAULT_REGION;
165
+ this.model = options.model || DEFAULT_MODEL;
166
+ this.credentials = options.credentials;
167
+ this.debug = options.debug || false;
168
+ this.sessionConfig = options.sessionConfig;
169
+ this.events = {};
170
+ this.speakerStreams = /* @__PURE__ */ new Map();
171
+ const validRegions = ["us-east-1", "us-west-2", "ap-northeast-1"];
172
+ if (!validRegions.includes(this.region)) {
173
+ throw new NovaSonicError(
174
+ "region_invalid" /* REGION_INVALID */,
175
+ `Invalid region: ${this.region}. Supported regions: ${validRegions.join(", ")}`
176
+ );
177
+ }
178
+ }
179
+ /**
180
+ * Returns a list of available voice speakers.
181
+ *
182
+ * Nova 2 Sonic provides expressive voices across multiple languages.
183
+ * Tiffany (en-US, feminine) and Matthew (en-US, masculine) are polyglot
184
+ * voices that can speak all supported languages.
185
+ *
186
+ * @returns Promise resolving to an array of voice objects
187
+ */
188
+ async getSpeakers() {
189
+ return Promise.resolve([
190
+ // English (US) - Polyglot voices
191
+ { voiceId: "tiffany", name: "Tiffany", language: "English", locale: "en-US", gender: "feminine", polyglot: true },
192
+ { voiceId: "matthew", name: "Matthew", language: "English", locale: "en-US", gender: "masculine", polyglot: true },
193
+ // English (UK)
194
+ { voiceId: "amy", name: "Amy", language: "English", locale: "en-GB", gender: "feminine", polyglot: false },
195
+ // English (Australia)
196
+ { voiceId: "olivia", name: "Olivia", language: "English", locale: "en-AU", gender: "feminine", polyglot: false },
197
+ // English (Indian)
198
+ { voiceId: "kiara", name: "Kiara", language: "English", locale: "en-IN", gender: "feminine", polyglot: false },
199
+ { voiceId: "arjun", name: "Arjun", language: "English", locale: "en-IN", gender: "masculine", polyglot: false },
200
+ // French
201
+ { voiceId: "ambre", name: "Ambre", language: "French", locale: "fr-FR", gender: "feminine", polyglot: false },
202
+ { voiceId: "florian", name: "Florian", language: "French", locale: "fr-FR", gender: "masculine", polyglot: false },
203
+ // Italian
204
+ { voiceId: "beatrice", name: "Beatrice", language: "Italian", locale: "it-IT", gender: "feminine", polyglot: false },
205
+ { voiceId: "lorenzo", name: "Lorenzo", language: "Italian", locale: "it-IT", gender: "masculine", polyglot: false },
206
+ // German
207
+ { voiceId: "tina", name: "Tina", language: "German", locale: "de-DE", gender: "feminine", polyglot: false },
208
+ { voiceId: "lennart", name: "Lennart", language: "German", locale: "de-DE", gender: "masculine", polyglot: false },
209
+ // Spanish (US)
210
+ { voiceId: "lupe", name: "Lupe", language: "Spanish", locale: "es-US", gender: "feminine", polyglot: false },
211
+ { voiceId: "carlos", name: "Carlos", language: "Spanish", locale: "es-US", gender: "masculine", polyglot: false },
212
+ // Portuguese
213
+ { voiceId: "carolina", name: "Carolina", language: "Portuguese", locale: "pt-BR", gender: "feminine", polyglot: false },
214
+ { voiceId: "leo", name: "Leo", language: "Portuguese", locale: "pt-BR", gender: "masculine", polyglot: false },
215
+ // Hindi
216
+ { voiceId: "kiara", name: "Kiara", language: "Hindi", locale: "hi-IN", gender: "feminine", polyglot: false },
217
+ { voiceId: "arjun", name: "Arjun", language: "Hindi", locale: "hi-IN", gender: "masculine", polyglot: false }
218
+ ]);
219
+ }
220
+ /**
221
+ * Establishes a connection to the AWS Bedrock bidirectional streaming service.
222
+ * Must be called before using speak, listen, or send functions.
223
+ *
224
+ * @throws {NovaSonicError} If connection fails or credentials are missing
225
+ *
226
+ * @example
227
+ * ```typescript
228
+ * await voice.connect();
229
+ * // Now ready for voice interactions
230
+ * ```
231
+ */
232
+ async connect({ requestContext } = {}) {
233
+ if (this.state === "connected" || this.state === "connecting") {
234
+ this.log("Already connected or connecting");
235
+ return;
236
+ }
237
+ this.state = "connecting";
238
+ this.requestContext = requestContext;
239
+ this.streamRestartAttempted = false;
240
+ try {
241
+ await this.createBedrockClient();
242
+ const asyncIterable = this.createEventQueue();
243
+ this.enqueueInitialSessionEvents();
244
+ await this.sendInitialConnectCommand(asyncIterable);
245
+ this.processStream().catch((error) => {
246
+ this.log("Error in stream processing:", error);
247
+ this.emit("error", {
248
+ message: error instanceof Error ? error.message : "Stream processing error",
249
+ code: "STREAM_PROCESSING_ERROR",
250
+ details: error
251
+ });
252
+ });
253
+ this.log("Connected to AWS Bedrock Nova 2 Sonic");
254
+ } catch (error) {
255
+ this.state = "disconnected";
256
+ if (this.client) {
257
+ if (typeof this.client.destroy === "function") {
258
+ this.client.destroy();
259
+ }
260
+ this.client = void 0;
261
+ }
262
+ this.log("Connection error:", error);
263
+ const errorMessage = error instanceof Error ? error.message : "Unknown error during connection";
264
+ throw new NovaSonicError(
265
+ "connection_failed" /* CONNECTION_FAILED */,
266
+ `Failed to connect to AWS Bedrock: ${errorMessage}`,
267
+ error
268
+ );
269
+ }
270
+ }
271
+ /**
272
+ * Resolve credentials and initialize the Bedrock Runtime client over HTTP/2.
273
+ */
274
+ async createBedrockClient() {
275
+ this.log("Getting AWS credentials...");
276
+ const credentials = await getAwsCredentials(this.credentials, this.debug);
277
+ if (!credentials) {
278
+ throw new NovaSonicError(
279
+ "credentials_missing" /* CREDENTIALS_MISSING */,
280
+ "AWS credentials are required. Please configure AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables or provide credentials in the config."
281
+ );
282
+ }
283
+ this.log("Credentials retrieved:", {
284
+ hasAccessKeyId: !!credentials.accessKeyId,
285
+ hasSecretAccessKey: !!credentials.secretAccessKey,
286
+ hasSessionToken: !!credentials.sessionToken,
287
+ accessKeyIdPrefix: credentials.accessKeyId ? `${credentials.accessKeyId.substring(0, 6)}...` : "missing",
288
+ expiration: credentials.expiration ? credentials.expiration.toISOString() : "no expiration"
289
+ });
290
+ this.log(`Initializing Bedrock Runtime client for region: ${this.region}, model: ${this.model}`);
291
+ const nodeHttp2Handler = new nodeHttpHandler.NodeHttp2Handler({
292
+ requestTimeout: 3e5,
293
+ // 5 minutes
294
+ sessionTimeout: 3e5,
295
+ // 5 minutes
296
+ disableConcurrentStreams: false,
297
+ maxConcurrentStreams: 20
298
+ });
299
+ this.client = new clientBedrockRuntime.BedrockRuntimeClient({
300
+ region: this.region,
301
+ credentials,
302
+ requestHandler: nodeHttp2Handler
303
+ });
304
+ }
305
+ /**
306
+ * Build the async-iterable event queue used as the request body for the
307
+ * bidirectional stream. Returns the iterable and wires up internal queue
308
+ * helpers (_eventQueue, _signalQueue, _closeSignal) used by sendClientEvent.
309
+ */
310
+ createEventQueue() {
311
+ this.log("Creating bidirectional stream command...");
312
+ const voiceInstance = this;
313
+ const eventQueue = [];
314
+ const pendingResolvers = [];
315
+ let closeSignal = false;
316
+ const signalQueue = () => {
317
+ if (pendingResolvers.length > 0) {
318
+ voiceInstance.log(`[AsyncIterable] Signaling queue - resolving ${pendingResolvers.length} pending Promise(s)`);
319
+ const resolvers = [...pendingResolvers];
320
+ pendingResolvers.length = 0;
321
+ resolvers.forEach((resolve) => resolve());
322
+ } else {
323
+ voiceInstance.log("[AsyncIterable] signalQueue called but no pending Promise");
324
+ }
325
+ };
326
+ const asyncIterable = {
327
+ [Symbol.asyncIterator]: () => {
328
+ voiceInstance.log("[AsyncIterable] Iterator requested");
329
+ return {
330
+ next: async () => {
331
+ try {
332
+ if (closeSignal || voiceInstance.state === "disconnected") {
333
+ voiceInstance.log(`[AsyncIterable] Stream closed (state: ${voiceInstance.state}), done = true`);
334
+ return { value: void 0, done: true };
335
+ }
336
+ if (eventQueue.length === 0) {
337
+ try {
338
+ voiceInstance.log("[AsyncIterable] Queue empty, waiting for signal...");
339
+ await new Promise((resolve) => {
340
+ pendingResolvers.push(resolve);
341
+ voiceInstance.log(`[AsyncIterable] Promise created, waiting for signal (${pendingResolvers.length} pending)...`);
342
+ setImmediate(() => {
343
+ if (eventQueue.length > 0) {
344
+ voiceInstance.log("[AsyncIterable] Data arrived before wait, resolving immediately");
345
+ const index = pendingResolvers.indexOf(resolve);
346
+ if (index !== -1) {
347
+ pendingResolvers.splice(index, 1);
348
+ resolve();
349
+ }
350
+ return;
351
+ }
352
+ if (closeSignal || voiceInstance.state === "disconnected") {
353
+ voiceInstance.log("[AsyncIterable] Closed before wait, resolving");
354
+ const index = pendingResolvers.indexOf(resolve);
355
+ if (index !== -1) {
356
+ pendingResolvers.splice(index, 1);
357
+ resolve();
358
+ }
359
+ return;
360
+ }
361
+ });
362
+ });
363
+ voiceInstance.log("[AsyncIterable] Promise resolved, checking queue...");
364
+ } catch (error) {
365
+ if (error instanceof Error && error.message === "Stream closed") {
366
+ voiceInstance.log("[AsyncIterable] Stream closed during wait");
367
+ return { value: void 0, done: true };
368
+ }
369
+ voiceInstance.log("[AsyncIterable] Error during wait:", error);
370
+ }
371
+ }
372
+ if (closeSignal) {
373
+ voiceInstance.log("[AsyncIterable] Stream closed (closeSignal)");
374
+ return { value: void 0, done: true };
375
+ }
376
+ if (voiceInstance.state === "disconnected") {
377
+ voiceInstance.log("[AsyncIterable] Stream closed (disconnected state)");
378
+ return { value: void 0, done: true };
379
+ }
380
+ while (eventQueue.length === 0 && !closeSignal) {
381
+ if (voiceInstance.state === "disconnected") {
382
+ voiceInstance.log("[AsyncIterable] Stream closed before wait loop");
383
+ return { value: void 0, done: true };
384
+ }
385
+ voiceInstance.log("[AsyncIterable] Queue still empty, waiting again...");
386
+ await new Promise((resolve) => {
387
+ pendingResolvers.push(resolve);
388
+ setImmediate(() => {
389
+ if (eventQueue.length > 0 || closeSignal || voiceInstance.state === "disconnected") {
390
+ const index = pendingResolvers.indexOf(resolve);
391
+ if (index !== -1) {
392
+ pendingResolvers.splice(index, 1);
393
+ resolve();
394
+ }
395
+ }
396
+ });
397
+ });
398
+ if (closeSignal || voiceInstance.state === "disconnected") {
399
+ voiceInstance.log("[AsyncIterable] Stream closed during wait loop");
400
+ return { value: void 0, done: true };
401
+ }
402
+ }
403
+ const nextEvent = eventQueue.shift();
404
+ const eventJson = JSON.stringify(nextEvent);
405
+ const eventBytes = Buffer.from(eventJson, "utf-8");
406
+ voiceInstance.log(`[AsyncIterable] Yielding event of size: ${eventBytes.length}`);
407
+ return {
408
+ value: {
409
+ chunk: {
410
+ bytes: eventBytes
411
+ }
412
+ },
413
+ done: false
414
+ };
415
+ } catch (error) {
416
+ voiceInstance.log("[AsyncIterable] Error in iterator:", error);
417
+ closeSignal = true;
418
+ return { value: void 0, done: true };
419
+ }
420
+ },
421
+ return: async () => {
422
+ voiceInstance.log("[AsyncIterable] Iterator return() called");
423
+ closeSignal = true;
424
+ signalQueue();
425
+ return { value: void 0, done: true };
426
+ },
427
+ throw: async (error) => {
428
+ voiceInstance.log("[AsyncIterable] Iterator throw() called:", error);
429
+ closeSignal = true;
430
+ signalQueue();
431
+ throw error;
432
+ }
433
+ };
434
+ }
435
+ };
436
+ this._eventQueue = eventQueue;
437
+ this._signalQueue = signalQueue;
438
+ this._closeSignal = () => {
439
+ closeSignal = true;
440
+ signalQueue();
441
+ };
442
+ return asyncIterable;
443
+ }
444
+ /**
445
+ * Pre-populate the event queue with the AWS Nova Sonic connection
446
+ * handshake events: sessionStart, promptStart, then a SYSTEM text content
447
+ * block carrying the configured instructions. AUDIO contentStart is NOT
448
+ * sent here; it is deferred to the first send() call.
449
+ */
450
+ enqueueInitialSessionEvents() {
451
+ const eventQueue = this._eventQueue;
452
+ if (!eventQueue) {
453
+ throw new NovaSonicError(
454
+ "connection_failed" /* CONNECTION_FAILED */,
455
+ "Event queue must be initialized before enqueueing session events"
456
+ );
457
+ }
458
+ this.log("Pre-populating queue with sessionStart and promptStart events...");
459
+ const promptName = crypto.randomUUID();
460
+ this._promptName = promptName;
461
+ const sessionStartEvent = {};
462
+ if (this.sessionConfig) {
463
+ if (this.sessionConfig.inferenceConfiguration) {
464
+ sessionStartEvent.inferenceConfiguration = {
465
+ maxTokens: this.sessionConfig.inferenceConfiguration.maxTokens || 4096,
466
+ topP: this.sessionConfig.inferenceConfiguration.topP || 0.9,
467
+ temperature: this.sessionConfig.inferenceConfiguration.temperature || 0.7,
468
+ ...this.sessionConfig.inferenceConfiguration.topK !== void 0 && { topK: this.sessionConfig.inferenceConfiguration.topK },
469
+ ...this.sessionConfig.inferenceConfiguration.stopSequences && { stopSequences: this.sessionConfig.inferenceConfiguration.stopSequences }
470
+ };
471
+ } else {
472
+ sessionStartEvent.inferenceConfiguration = {
473
+ maxTokens: 4096,
474
+ topP: 0.9,
475
+ temperature: 0.7
476
+ };
477
+ }
478
+ if (this.sessionConfig.turnDetectionConfiguration) {
479
+ sessionStartEvent.turnDetectionConfiguration = {
480
+ ...this.sessionConfig.turnDetectionConfiguration.endpointingSensitivity && {
481
+ endpointingSensitivity: this.sessionConfig.turnDetectionConfiguration.endpointingSensitivity
482
+ }
483
+ };
484
+ }
485
+ } else {
486
+ sessionStartEvent.inferenceConfiguration = {
487
+ maxTokens: 4096,
488
+ topP: 0.9,
489
+ temperature: 0.7
490
+ };
491
+ }
492
+ eventQueue.push({
493
+ event: {
494
+ sessionStart: sessionStartEvent
495
+ }
496
+ });
497
+ let voiceId = "matthew";
498
+ if (this.sessionConfig?.voice) {
499
+ if (typeof this.sessionConfig.voice === "string") {
500
+ voiceId = this.sessionConfig.voice;
501
+ } else if (this.sessionConfig.voice.name) {
502
+ voiceId = this.sessionConfig.voice.name;
503
+ }
504
+ } else if (this.speaker && this.speaker !== "default") {
505
+ if (typeof this.speaker === "string") {
506
+ voiceId = this.speaker;
507
+ } else {
508
+ const speakerObj = this.speaker;
509
+ if (speakerObj && typeof speakerObj === "object" && speakerObj.name) {
510
+ voiceId = speakerObj.name;
511
+ }
512
+ }
513
+ }
514
+ const promptStartEvent = {
515
+ promptName,
516
+ textOutputConfiguration: {
517
+ mediaType: "text/plain"
518
+ },
519
+ // AWS REQUIRES this - cannot be omitted
520
+ audioOutputConfiguration: {
521
+ mediaType: "audio/lpcm",
522
+ sampleRateHertz: 24e3,
523
+ sampleSizeBits: 16,
524
+ channelCount: 1,
525
+ voiceId,
526
+ encoding: "base64",
527
+ audioType: "SPEECH"
528
+ }
529
+ };
530
+ if (this.sessionConfig?.tools && this.sessionConfig.tools.length > 0) {
531
+ promptStartEvent.toolConfiguration = {
532
+ tools: this.sessionConfig.tools.map((tool) => {
533
+ let inputSchemaJson;
534
+ if (typeof tool.inputSchema === "string") {
535
+ inputSchemaJson = tool.inputSchema;
536
+ } else {
537
+ inputSchemaJson = JSON.stringify(tool.inputSchema);
538
+ }
539
+ return {
540
+ toolSpec: {
541
+ name: tool.name,
542
+ description: tool.description,
543
+ inputSchema: {
544
+ json: inputSchemaJson
545
+ }
546
+ }
547
+ };
548
+ }),
549
+ // toolChoice goes inside toolConfiguration for Nova 2 Sonic
550
+ ...this.sessionConfig?.toolChoice && { toolChoice: this.sessionConfig.toolChoice }
551
+ };
552
+ } else if (this.sessionConfig?.toolChoice) {
553
+ promptStartEvent.toolConfiguration = {
554
+ toolChoice: this.sessionConfig.toolChoice
555
+ };
556
+ }
557
+ eventQueue.push({
558
+ event: {
559
+ promptStart: promptStartEvent
560
+ }
561
+ });
562
+ this.promptStarted = true;
563
+ const systemContentName = crypto.randomUUID();
564
+ eventQueue.push({
565
+ event: {
566
+ contentStart: {
567
+ promptName,
568
+ contentName: systemContentName,
569
+ type: "TEXT",
570
+ interactive: false,
571
+ role: "SYSTEM",
572
+ textInputConfiguration: {
573
+ mediaType: "text/plain"
574
+ }
575
+ }
576
+ }
577
+ });
578
+ eventQueue.push({
579
+ event: {
580
+ textInput: {
581
+ promptName,
582
+ contentName: systemContentName,
583
+ content: this.instructions || ""
584
+ }
585
+ }
586
+ });
587
+ eventQueue.push({
588
+ event: {
589
+ contentEnd: {
590
+ promptName,
591
+ contentName: systemContentName
592
+ }
593
+ }
594
+ });
595
+ this.audioContentStarted = false;
596
+ this.log(`Queue pre-populated with ${eventQueue.length} event(s)`);
597
+ }
598
+ /**
599
+ * Issue the InvokeModelWithBidirectionalStreamCommand to AWS Bedrock with
600
+ * a 5-second abort timeout that tears down the client on hang to avoid
601
+ * leaked HTTP/2 sessions. On success the response stream is stored and the
602
+ * voice transitions to 'connected'.
603
+ */
604
+ async sendInitialConnectCommand(asyncIterable) {
605
+ if (!this.client) {
606
+ throw new NovaSonicError(
607
+ "connection_failed" /* CONNECTION_FAILED */,
608
+ "Bedrock client must be created before sending the initial command"
609
+ );
610
+ }
611
+ const command = new clientBedrockRuntime.InvokeModelWithBidirectionalStreamCommand({
612
+ modelId: this.model,
613
+ body: asyncIterable
614
+ // Type assertion needed as SDK types may be strict
615
+ });
616
+ const sendStartTime = Date.now();
617
+ const abortController = new AbortController();
618
+ const timeoutId = setTimeout(() => {
619
+ this.log("[DEBUG] client.send() timeout after 5 seconds - aborting request");
620
+ abortController.abort();
621
+ }, 5e3);
622
+ let response;
623
+ try {
624
+ response = await this.client.send(command, { abortSignal: abortController.signal });
625
+ } catch (error) {
626
+ const sendDuration2 = Date.now() - sendStartTime;
627
+ if (abortController.signal.aborted) {
628
+ this.log(`[DEBUG] client.send() aborted after ${sendDuration2}ms`);
629
+ this._closeSignal?.();
630
+ this.client.destroy();
631
+ throw new Error("client.send() timeout");
632
+ }
633
+ this.log(`[DEBUG] client.send() error after ${sendDuration2}ms:`, error);
634
+ throw error;
635
+ } finally {
636
+ clearTimeout(timeoutId);
637
+ }
638
+ const sendDuration = Date.now() - sendStartTime;
639
+ this.log(`[DEBUG] client.send() completed in ${sendDuration}ms`);
640
+ this.log("Received response from AWS Bedrock");
641
+ this.stream = response.body;
642
+ this.log(`[DEBUG] Response stream is async iterable: ${this.stream && typeof this.stream[Symbol.asyncIterator] === "function"}`);
643
+ this.state = "connected";
644
+ this.log(`[STATE] State set to 'connected'`);
645
+ }
646
+ /**
647
+ * Process the bidirectional stream from AWS Bedrock
648
+ */
649
+ async processStream() {
650
+ if (!this.stream) {
651
+ this.log("[Stream] No stream available, cannot process");
652
+ return;
653
+ }
654
+ if (this.processingStream) {
655
+ this.log("[Stream] Already processing stream, skipping");
656
+ return;
657
+ }
658
+ this.processingStream = true;
659
+ this.log("[Stream] Starting stream processing");
660
+ let eventCount = 0;
661
+ let lastEventTime = Date.now();
662
+ try {
663
+ for await (const chunk of this.stream) {
664
+ if (chunk.chunk) {
665
+ const textResponse = Buffer.from(chunk.chunk.bytes || []).toString("utf-8");
666
+ eventCount++;
667
+ const now = Date.now();
668
+ const timeSinceLastEvent = now - lastEventTime;
669
+ lastEventTime = now;
670
+ this.log(`[Stream] Received chunk #${eventCount}, length: ${textResponse.length}, time since last: ${timeSinceLastEvent}ms`);
671
+ try {
672
+ const jsonResponse = JSON.parse(textResponse);
673
+ this.log(`[Stream] ========================================`);
674
+ this.log(`[Stream] Parsed JSON response, keys: ${Object.keys(jsonResponse).join(", ")}`);
675
+ if (jsonResponse.event) {
676
+ const eventKeys = Object.keys(jsonResponse.event);
677
+ this.log(`[Stream] Event keys: ${eventKeys.join(", ")}`);
678
+ if (jsonResponse.event.contentStart) {
679
+ this.log(`[Stream] \u2192 Handling contentStart`);
680
+ this.handleServerEvent({ contentStart: jsonResponse.event.contentStart });
681
+ } else if (jsonResponse.event.textOutput) {
682
+ this.log(`[Stream] \u2192 Handling textOutput, content length: ${jsonResponse.event.textOutput?.content?.length ?? 0}`);
683
+ this.handleServerEvent({ textOutput: jsonResponse.event.textOutput });
684
+ } else if (jsonResponse.event.audioOutput) {
685
+ this.handleServerEvent({ audioOutput: jsonResponse.event.audioOutput });
686
+ } else if (jsonResponse.event.toolUse) {
687
+ this.handleServerEvent({ toolUse: jsonResponse.event.toolUse });
688
+ } else if (jsonResponse.event.contentEnd && jsonResponse.event.contentEnd.type === "TOOL") {
689
+ this.handleServerEvent({ contentEnd: jsonResponse.event.contentEnd });
690
+ } else if (jsonResponse.event.contentEnd) {
691
+ this.log(`[Stream] Found contentEnd, type: ${jsonResponse.event.contentEnd.type}, stopReason: ${jsonResponse.event.contentEnd.stopReason}`);
692
+ this.handleServerEvent({ contentEnd: jsonResponse.event.contentEnd });
693
+ } else if (jsonResponse.event.completionStart) {
694
+ this.log("[Stream] Found completionStart inside event object:", JSON.stringify(jsonResponse.event.completionStart, null, 2));
695
+ this.emit("completionStart", jsonResponse.event.completionStart);
696
+ } else if (jsonResponse.event.completionEnd) {
697
+ this.log("[Stream] Found completionEnd inside event object:", JSON.stringify(jsonResponse.event.completionEnd, null, 2));
698
+ this.handleServerEvent({ completionEnd: jsonResponse.event.completionEnd });
699
+ } else {
700
+ const eventKeys2 = Object.keys(jsonResponse.event || {});
701
+ this.log(`[Stream] Event keys for other events: ${eventKeys2.join(", ")}`);
702
+ if (eventKeys2.length > 0) {
703
+ if (eventKeys2.includes("completionEnd")) {
704
+ this.log("[Stream] Found completionEnd in other events, handling explicitly");
705
+ this.handleServerEvent({ completionEnd: jsonResponse.event.completionEnd });
706
+ } else {
707
+ const eventKey = eventKeys2[0];
708
+ this.log(`[Stream] Dispatching other event: ${eventKey}`);
709
+ const eventValue = jsonResponse.event[eventKey];
710
+ if (eventValue !== void 0) {
711
+ if (eventKey === "completionEnd") {
712
+ this.handleServerEvent({ completionEnd: eventValue });
713
+ } else {
714
+ this.handleServerEvent({ [eventKey]: eventValue });
715
+ }
716
+ }
717
+ }
718
+ } else if (Object.keys(jsonResponse).length > 0) {
719
+ this.log(`[Stream] Unknown event structure, keys:`, Object.keys(jsonResponse).join(", "));
720
+ }
721
+ }
722
+ } else {
723
+ if (this.debug) {
724
+ this.log('[Stream] Received event without "event" wrapper, keys:', Object.keys(jsonResponse).join(", "));
725
+ }
726
+ if (jsonResponse.usageEvent) {
727
+ this.emit("usage", {
728
+ inputTokens: jsonResponse.usageEvent.totalInputTokens || 0,
729
+ outputTokens: jsonResponse.usageEvent.totalOutputTokens || 0,
730
+ totalTokens: jsonResponse.usageEvent.totalTokens || 0
731
+ });
732
+ }
733
+ if (jsonResponse.completionEnd) {
734
+ this.log("[Stream] Found completionEnd at top level:", JSON.stringify(jsonResponse.completionEnd, null, 2));
735
+ this.handleServerEvent({ completionEnd: jsonResponse.completionEnd });
736
+ }
737
+ if (!jsonResponse.event && !jsonResponse.completionEnd && !jsonResponse.usageEvent) {
738
+ this.log("[Stream] Received response without event wrapper, keys:", Object.keys(jsonResponse).join(", "));
739
+ }
740
+ if (jsonResponse.completionStart || jsonResponse.event?.completionStart) {
741
+ const completionStart = jsonResponse.completionStart || jsonResponse.event.completionStart;
742
+ this.log("[Stream] Found completionStart:", JSON.stringify(completionStart, null, 2));
743
+ this.emit("completionStart", completionStart);
744
+ }
745
+ }
746
+ } catch (parseError) {
747
+ this.log("[Stream] Failed to parse JSON response:", textResponse.substring(0, 200));
748
+ this.emit("error", {
749
+ message: "Failed to parse stream response",
750
+ code: "PARSE_ERROR",
751
+ details: parseError
752
+ });
753
+ }
754
+ } else if (chunk.internalServerException) {
755
+ this.emit("error", {
756
+ message: "Internal server error",
757
+ code: "INTERNAL_SERVER_ERROR",
758
+ details: chunk.internalServerException
759
+ });
760
+ } else if (chunk.modelStreamErrorException) {
761
+ this.emit("error", {
762
+ message: "Model stream error",
763
+ code: "MODEL_STREAM_ERROR",
764
+ details: chunk.modelStreamErrorException
765
+ });
766
+ } else if (chunk.modelTimeoutException) {
767
+ this.emit("error", {
768
+ message: "Model timeout",
769
+ code: "MODEL_TIMEOUT",
770
+ details: chunk.modelTimeoutException
771
+ });
772
+ } else if (chunk.serviceUnavailableException) {
773
+ this.emit("error", {
774
+ message: "Service unavailable",
775
+ code: "SERVICE_UNAVAILABLE",
776
+ details: chunk.serviceUnavailableException
777
+ });
778
+ } else if (chunk.throttlingException) {
779
+ this.emit("error", {
780
+ message: "Request throttled",
781
+ code: "THROTTLING",
782
+ details: chunk.throttlingException
783
+ });
784
+ } else if (chunk.validationException) {
785
+ this.emit("error", {
786
+ message: "Validation error",
787
+ code: "VALIDATION_ERROR",
788
+ details: chunk.validationException
789
+ });
790
+ }
791
+ }
792
+ } catch (streamError) {
793
+ this.log("[Stream] Error in processStream:", streamError);
794
+ this.emit("error", {
795
+ message: "Stream processing error",
796
+ code: "STREAM_ERROR",
797
+ details: streamError instanceof Error ? streamError.message : String(streamError)
798
+ });
799
+ } finally {
800
+ this.processingStream = false;
801
+ this.log(`[Stream] processStream finished, processingStream set to false. Total events received: ${eventCount || 0}`);
802
+ this.log(`[Stream] Stream state: state=${this.state}, stream exists=${!!this.stream}`);
803
+ if (!this.turnCompleted && this.audioContentStarted) {
804
+ this.log("[Stream] Stream ended but turn not completed - signaling turn completion as fallback");
805
+ this.log(`[Stream] State: turnCompleted=${this.turnCompleted}, audioContentStarted=${this.audioContentStarted}, hasSentContentEnd=${this.hasSentContentEnd}`);
806
+ this.turnCompleted = true;
807
+ this.emit("turnComplete", { timestamp: Date.now() });
808
+ if (this.currentResponseId) {
809
+ const stream = this.speakerStreams.get(this.currentResponseId);
810
+ if (stream) {
811
+ stream.end();
812
+ }
813
+ this.speakerStreams.delete(this.currentResponseId);
814
+ this.currentResponseId = void 0;
815
+ }
816
+ this.hasSentContentEnd = false;
817
+ this.log("[Stream] Turn completion signaled, ready for next turn");
818
+ } else if (this.turnCompleted) {
819
+ this.log("[Stream] Stream ended and turn was already completed");
820
+ } else {
821
+ this.log(`[Stream] Stream ended but turn not completed - audioContentStarted=${this.audioContentStarted}, turnCompleted=${this.turnCompleted}`);
822
+ }
823
+ if (this.stream && this.state === "connected" && !this.processingStream && !this.streamRestartAttempted) {
824
+ this.log("[Stream] Stream still open but processing stopped - will restart stream processing");
825
+ this.streamRestartAttempted = true;
826
+ setImmediate(() => {
827
+ if (this.stream && this.state === "connected" && !this.processingStream) {
828
+ this.log("[Stream] Restarting stream processing for subsequent turns");
829
+ this.processStream().catch((error) => {
830
+ this.log("[Stream] Error restarting stream processing:", error);
831
+ this.streamRestartAttempted = false;
832
+ });
833
+ } else {
834
+ this.streamRestartAttempted = false;
835
+ }
836
+ });
837
+ } else {
838
+ if (this.streamRestartAttempted) {
839
+ this.log("[Stream] Stream restart already attempted, skipping");
840
+ }
841
+ }
842
+ }
843
+ }
844
+ /**
845
+ * Handle server events from AWS Bedrock
846
+ */
847
+ handleServerEvent(event) {
848
+ if (this.debug) {
849
+ this.log("Received event, keys:", Object.keys(event).join(", "));
850
+ }
851
+ if (event.contentStart) {
852
+ this.handleContentStart(event.contentStart);
853
+ }
854
+ if (event.textOutput) {
855
+ this.handleTextOutput(event.textOutput);
856
+ }
857
+ if (event.audioOutput?.content) {
858
+ this.handleAudioOutput(event.audioOutput);
859
+ }
860
+ if (event.toolUse) {
861
+ this.handleToolUse(event.toolUse);
862
+ }
863
+ if (event.contentEnd) {
864
+ this.handleContentEnd(event.contentEnd);
865
+ }
866
+ if (event.completionEnd) {
867
+ this.handleCompletionEnd(event.completionEnd);
868
+ }
869
+ if (event.error) {
870
+ this.emit("error", {
871
+ message: event.error.message || "Unknown error",
872
+ code: event.error.code || "UNKNOWN_ERROR",
873
+ details: event.error
874
+ });
875
+ }
876
+ }
877
+ /**
878
+ * Handle a contentStart event. Tracks generationStage for text content
879
+ * blocks so the corresponding 'writing' events can be tagged
880
+ * SPECULATIVE/FINAL for the client.
881
+ */
882
+ handleContentStart(contentStart) {
883
+ const role = contentStart.role?.toLowerCase();
884
+ const contentType = contentStart.type;
885
+ this.log(`[Event] contentStart: type=${contentType || "unknown"}, role=${role}`);
886
+ this.emit("contentStart", contentStart);
887
+ if (contentType === "TEXT" && contentStart.additionalModelFields) {
888
+ try {
889
+ const additionalFields = JSON.parse(contentStart.additionalModelFields);
890
+ this.currentTextGenerationStage = additionalFields.generationStage;
891
+ this.log(`[Event] Text content generationStage: ${this.currentTextGenerationStage}`);
892
+ } catch {
893
+ this.currentTextGenerationStage = void 0;
894
+ }
895
+ } else if (contentType === "TEXT") {
896
+ this.currentTextGenerationStage = void 0;
897
+ }
898
+ }
899
+ /**
900
+ * Handle a textOutput event. Detects interruption (barge-in) markers in
901
+ * the payload, otherwise emits a 'writing' event with the text and
902
+ * current generationStage.
903
+ */
904
+ handleTextOutput(textOutput) {
905
+ const text = textOutput.content || "";
906
+ const role = textOutput.role?.toLowerCase() || "assistant";
907
+ this.log(`[Event] textOutput received: role=${role}, text length=${text.length}`);
908
+ let isInterrupted = false;
909
+ try {
910
+ const parsed = JSON.parse(text);
911
+ if (parsed && parsed.interrupted === true) {
912
+ isInterrupted = true;
913
+ }
914
+ } catch {
915
+ if (/interrupted/i.test(text)) {
916
+ isInterrupted = true;
917
+ }
918
+ }
919
+ if (isInterrupted) {
920
+ this.log(`[Event] Interrupt detected, emitting interrupt event`);
921
+ this.emit("interrupt", { type: "user", timestamp: Date.now() });
922
+ return;
923
+ }
924
+ const generationStage = this.currentTextGenerationStage;
925
+ this.log(`[Event] Emitting 'writing': role=${role}, generationStage=${generationStage}, length=${text.length}`);
926
+ this.emit("writing", { text, role, generationStage });
927
+ }
928
+ /**
929
+ * Handle an audioOutput event. Decodes the base64 LPCM payload, emits
930
+ * 'speaking' with both the base64 string and an Int16Array view, and
931
+ * forwards bytes to any active speaker stream.
932
+ */
933
+ handleAudioOutput(audioOutput) {
934
+ try {
935
+ const content = audioOutput.content;
936
+ const audioBytes = Buffer.from(content, "base64");
937
+ this.log(`[Event] Audio output: ${audioBytes.length} bytes`);
938
+ this.isReceivingAssistantAudio = true;
939
+ const audioData = new Int16Array(audioBytes.buffer, audioBytes.byteOffset, audioBytes.byteLength / 2);
940
+ this.emit("speaking", {
941
+ audio: content,
942
+ audioData,
943
+ response_id: this.currentResponseId
944
+ });
945
+ if (this.currentResponseId) {
946
+ const stream = this.speakerStreams.get(this.currentResponseId);
947
+ if (stream) {
948
+ stream.write(audioBytes);
949
+ }
950
+ }
951
+ } catch (error) {
952
+ this.log("[Event] Error decoding audio:", error);
953
+ this.emit("error", {
954
+ message: "Failed to decode audio",
955
+ code: "AUDIO_DECODE_ERROR",
956
+ details: error
957
+ });
958
+ }
959
+ }
960
+ /**
961
+ * Handle a toolUse event. Emits 'toolCall' and dispatches to the
962
+ * configured tool's execute() function via handleToolCall().
963
+ */
964
+ handleToolUse(toolUse) {
965
+ const toolUseId = toolUse.toolUseId || "";
966
+ const toolName = toolUse.toolName || "";
967
+ const toolInput = toolUse.input || {};
968
+ this.emit("toolCall", {
969
+ name: toolName,
970
+ args: toolInput,
971
+ id: toolUseId
972
+ });
973
+ if (this.tools && toolName in this.tools) {
974
+ this.handleToolCall(toolName, toolInput, toolUseId);
975
+ }
976
+ }
977
+ /**
978
+ * Handle a contentEnd event. Forwards it to clients, then routes by
979
+ * stopReason / type:
980
+ * - INTERRUPTED: emit 'interrupt' and tear down the active speaker stream
981
+ * - TOOL: end the active speaker stream
982
+ * - AUDIO with END_TURN: signal turnComplete (assistant audio finished)
983
+ * - AUDIO with PARTIAL_TURN while receiving assistant audio: schedule
984
+ * fallback turnComplete in case completionEnd never arrives
985
+ * - AUDIO otherwise: user input ended, reset turn flags
986
+ */
987
+ handleContentEnd(contentEnd) {
988
+ this.log(`[Event] contentEnd received: type=${contentEnd.type}, stopReason=${contentEnd.stopReason}`);
989
+ this.emit("contentEnd", contentEnd);
990
+ if (contentEnd.stopReason === "INTERRUPTED") {
991
+ this.log("[Event] Content interrupted by user (barge-in)");
992
+ this.emit("interrupt", { type: "user", timestamp: Date.now() });
993
+ if (this.currentResponseId) {
994
+ const stream = this.speakerStreams.get(this.currentResponseId);
995
+ if (stream) {
996
+ stream.destroy();
997
+ }
998
+ this.speakerStreams.delete(this.currentResponseId);
999
+ }
1000
+ this.currentResponseId = void 0;
1001
+ this.log("[Event] After interruption, keeping audioContentStarted=true for continued streaming");
1002
+ } else if (contentEnd.type === "TOOL" && this.currentResponseId) {
1003
+ const stream = this.speakerStreams.get(this.currentResponseId);
1004
+ if (stream) {
1005
+ stream.end();
1006
+ }
1007
+ } else if (contentEnd.type === "AUDIO") {
1008
+ if (contentEnd.stopReason === "END_TURN") {
1009
+ this.log(`[Event] contentEnd (AUDIO) with stopReason END_TURN - signaling turn complete`);
1010
+ if (this.currentResponseId) {
1011
+ const stream = this.speakerStreams.get(this.currentResponseId);
1012
+ if (stream) {
1013
+ stream.end();
1014
+ }
1015
+ this.speakerStreams.delete(this.currentResponseId);
1016
+ this.currentResponseId = void 0;
1017
+ }
1018
+ if (!this.turnCompleted) {
1019
+ this.turnCompleted = true;
1020
+ this.emit("turnComplete", { timestamp: Date.now() });
1021
+ this.hasSentContentEnd = false;
1022
+ this.log(`[Event] Turn complete (from contentEnd AUDIO with END_TURN), ready for next turn. audioContentStarted: ${this.audioContentStarted}, audioContentName: ${this.audioContentName}`);
1023
+ } else {
1024
+ this.log(`[Event] contentEnd (AUDIO) with END_TURN received but turn already completed - skipping duplicate turnComplete emission`);
1025
+ }
1026
+ if (!this.turnCompleteTimeout) {
1027
+ this.turnCompleteTimeout = setTimeout(() => {
1028
+ this.log(`[Event] Timeout: completionEnd not received, but turn already completed from contentEnd`);
1029
+ this.turnCompleteTimeout = void 0;
1030
+ }, 1e3);
1031
+ }
1032
+ } else {
1033
+ if (this.isReceivingAssistantAudio && contentEnd.stopReason === "PARTIAL_TURN") {
1034
+ this.isReceivingAssistantAudio = false;
1035
+ if (!this.turnCompleteTimeout && !this.turnCompleted) {
1036
+ this.log(`[Event] contentEnd (AUDIO) with PARTIAL_TURN for assistant output - waiting for completionEnd, setting fallback timeout`);
1037
+ this.turnCompleteTimeout = setTimeout(() => {
1038
+ if (!this.turnCompleted) {
1039
+ this.log(`[Event] Fallback: completionEnd not received after contentEnd (AUDIO) with PARTIAL_TURN, signaling turn complete`);
1040
+ this.turnCompleted = true;
1041
+ this.emit("turnComplete", { timestamp: Date.now() });
1042
+ if (this.currentResponseId) {
1043
+ const stream = this.speakerStreams.get(this.currentResponseId);
1044
+ if (stream) {
1045
+ stream.end();
1046
+ }
1047
+ this.speakerStreams.delete(this.currentResponseId);
1048
+ this.currentResponseId = void 0;
1049
+ }
1050
+ this.hasSentContentEnd = false;
1051
+ this.turnCompleteTimeout = void 0;
1052
+ }
1053
+ }, 2e3);
1054
+ }
1055
+ } else {
1056
+ this.hasSentContentEnd = false;
1057
+ this.turnCompleted = false;
1058
+ this.log(`[Event] contentEnd (AUDIO) - user input ended, stopReason: ${contentEnd.stopReason}. Keeping audioContentStarted=true for next turn. Reset hasSentContentEnd=false, turnCompleted=false.`);
1059
+ }
1060
+ }
1061
+ } else if (contentEnd.type === "TEXT") {
1062
+ this.currentTextGenerationStage = void 0;
1063
+ this.log(`[Event] contentEnd (TEXT) received, stopReason: ${contentEnd.stopReason}. Turn completion handled by completionEnd/contentEnd(AUDIO).`);
1064
+ if (contentEnd.stopReason === "END_TURN") {
1065
+ this.hasSentContentEnd = false;
1066
+ }
1067
+ }
1068
+ }
1069
+ /**
1070
+ * Handle a completionEnd event. AWS uses this as the definitive signal
1071
+ * that a turn (and all audio output) has finished. Tears down the active
1072
+ * speaker stream, clears any fallback timer, emits 'turnComplete' once,
1073
+ * and forwards token usage if reported.
1074
+ */
1075
+ handleCompletionEnd(completionEnd) {
1076
+ this.log(`[Event] completionEnd received, stopReason: ${completionEnd.stopReason}`);
1077
+ if (this.turnCompleteTimeout) {
1078
+ clearTimeout(this.turnCompleteTimeout);
1079
+ this.turnCompleteTimeout = void 0;
1080
+ }
1081
+ if (this.currentResponseId) {
1082
+ const stream = this.speakerStreams.get(this.currentResponseId);
1083
+ if (stream) {
1084
+ stream.end();
1085
+ }
1086
+ this.speakerStreams.delete(this.currentResponseId);
1087
+ this.currentResponseId = void 0;
1088
+ }
1089
+ this.isReceivingAssistantAudio = false;
1090
+ if (!this.turnCompleted) {
1091
+ this.log(`[Event] completionEnd - signaling turn complete (stopReason: ${completionEnd.stopReason || "undefined"})`);
1092
+ this.turnCompleted = true;
1093
+ this.emit("turnComplete", { timestamp: Date.now() });
1094
+ this.hasSentContentEnd = false;
1095
+ } else {
1096
+ this.log(`[Event] completionEnd received but turn already completed - skipping duplicate turnComplete emission`);
1097
+ }
1098
+ if (completionEnd.usage) {
1099
+ this.emit("usage", {
1100
+ inputTokens: completionEnd.usage.inputTokens || 0,
1101
+ outputTokens: completionEnd.usage.outputTokens || 0,
1102
+ totalTokens: (completionEnd.usage.inputTokens || 0) + (completionEnd.usage.outputTokens || 0)
1103
+ });
1104
+ }
1105
+ }
1106
+ /**
1107
+ * Handle tool execution
1108
+ */
1109
+ async handleToolCall(toolName, args, toolUseId) {
1110
+ const tool = this.tools?.[toolName];
1111
+ if (!tool || !tool.execute) {
1112
+ this.emit("error", {
1113
+ message: `Tool ${toolName} not found or has no execute function`,
1114
+ code: "TOOL_NOT_FOUND"
1115
+ });
1116
+ return;
1117
+ }
1118
+ try {
1119
+ const result = await tool.execute(
1120
+ { context: args, requestContext: this.requestContext },
1121
+ {
1122
+ toolCallId: toolUseId,
1123
+ messages: []
1124
+ }
1125
+ );
1126
+ await this.sendClientEvent({
1127
+ toolResult: {
1128
+ toolUseId,
1129
+ content: [
1130
+ {
1131
+ json: typeof result === "object" ? result : { result }
1132
+ }
1133
+ ]
1134
+ }
1135
+ });
1136
+ } catch (error) {
1137
+ this.emit("error", {
1138
+ message: `Error executing tool ${toolName}: ${error instanceof Error ? error.message : "Unknown error"}`,
1139
+ code: "TOOL_EXECUTION_ERROR",
1140
+ details: error
1141
+ });
1142
+ await this.sendClientEvent({
1143
+ toolResult: {
1144
+ toolUseId,
1145
+ content: [
1146
+ {
1147
+ text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
1148
+ }
1149
+ ]
1150
+ }
1151
+ });
1152
+ }
1153
+ }
1154
+ /**
1155
+ * Send a client event to AWS Bedrock
1156
+ * Events are sent through the input stream that was passed to the bidirectional stream command
1157
+ */
1158
+ async sendClientEvent(event) {
1159
+ if (this.state !== "connected") {
1160
+ throw new NovaSonicError(
1161
+ "not_connected" /* NOT_CONNECTED */,
1162
+ "Not connected to AWS Bedrock. Call connect() first."
1163
+ );
1164
+ }
1165
+ try {
1166
+ const eventQueue = this._eventQueue;
1167
+ const signalQueue = this._signalQueue;
1168
+ if (!eventQueue || !signalQueue) {
1169
+ throw new NovaSonicError(
1170
+ "not_connected" /* NOT_CONNECTED */,
1171
+ "Event queue not initialized. Connection may not be fully established."
1172
+ );
1173
+ }
1174
+ this.log(`[sendClientEvent] Adding event to queue (queue size: ${eventQueue.length})`);
1175
+ eventQueue.push({ event });
1176
+ this.log(`[sendClientEvent] Event added, queue size now: ${eventQueue.length}, signaling...`);
1177
+ signalQueue();
1178
+ this.log(`[sendClientEvent] Signal sent`);
1179
+ if (this.debug) {
1180
+ this.log("Sent client event, keys:", Object.keys(event).join(", "));
1181
+ }
1182
+ } catch (error) {
1183
+ throw new NovaSonicError(
1184
+ "websocket_error" /* WEBSOCKET_ERROR */,
1185
+ `Failed to send client event: ${error instanceof Error ? error.message : "Unknown error"}`,
1186
+ error
1187
+ );
1188
+ }
1189
+ }
1190
+ /**
1191
+ * Disconnects from the AWS Bedrock session and cleans up resources.
1192
+ *
1193
+ * Pushes a `sessionEnd` event to the queue before signalling close,
1194
+ * then schedules client destruction on the next tick so the async
1195
+ * iterator has a chance to yield the event to the SDK.
1196
+ */
1197
+ close() {
1198
+ if (this.state === "disconnected") {
1199
+ return;
1200
+ }
1201
+ this.state = "disconnected";
1202
+ this.processingStream = false;
1203
+ if (this.turnCompleteTimeout) {
1204
+ clearTimeout(this.turnCompleteTimeout);
1205
+ this.turnCompleteTimeout = void 0;
1206
+ }
1207
+ const eventQueue = this._eventQueue;
1208
+ const signalQueue = this._signalQueue;
1209
+ if (eventQueue && signalQueue) {
1210
+ eventQueue.push({ event: { sessionEnd: {} } });
1211
+ signalQueue();
1212
+ }
1213
+ const closeSignal = this._closeSignal;
1214
+ if (closeSignal) {
1215
+ closeSignal();
1216
+ }
1217
+ if (this.inputStream) {
1218
+ this.inputStream.end();
1219
+ this.inputStream = void 0;
1220
+ }
1221
+ for (const stream of this.speakerStreams.values()) {
1222
+ stream.end();
1223
+ }
1224
+ this.speakerStreams.clear();
1225
+ const client = this.client;
1226
+ this.client = void 0;
1227
+ this.stream = void 0;
1228
+ if (client) {
1229
+ setImmediate(() => {
1230
+ if (typeof client.destroy === "function") {
1231
+ client.destroy();
1232
+ }
1233
+ });
1234
+ }
1235
+ this.log("Disconnected from AWS Bedrock Nova 2 Sonic");
1236
+ }
1237
+ /**
1238
+ * Equips the voice instance with a set of instructions.
1239
+ */
1240
+ addInstructions(instructions) {
1241
+ this.instructions = instructions;
1242
+ }
1243
+ /**
1244
+ * Equips the voice instance with a set of tools.
1245
+ */
1246
+ addTools(tools) {
1247
+ this.tools = tools || {};
1248
+ }
1249
+ /**
1250
+ * Convert text to speech
1251
+ */
1252
+ async speak(input, options) {
1253
+ if (this.state !== "connected") {
1254
+ throw new NovaSonicError(
1255
+ "not_connected" /* NOT_CONNECTED */,
1256
+ "Not connected. Call connect() first."
1257
+ );
1258
+ }
1259
+ let text = "";
1260
+ if (typeof input !== "string") {
1261
+ const chunks = [];
1262
+ for await (const chunk of input) {
1263
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
1264
+ }
1265
+ text = Buffer.concat(chunks).toString("utf-8");
1266
+ } else {
1267
+ text = input;
1268
+ }
1269
+ if (text.trim().length === 0) {
1270
+ throw new NovaSonicError("validation_error" /* VALIDATION_ERROR */, "Input text is empty");
1271
+ }
1272
+ this.currentResponseId = `response-${Date.now()}`;
1273
+ const speakerStream = new stream.PassThrough();
1274
+ speakerStream.id = this.currentResponseId;
1275
+ this.speakerStreams.set(this.currentResponseId, speakerStream);
1276
+ this.emit("speaker", speakerStream);
1277
+ const promptName = this._promptName;
1278
+ if (!promptName) {
1279
+ throw new NovaSonicError(
1280
+ "not_connected" /* NOT_CONNECTED */,
1281
+ "Prompt name not initialized. Connection may not be fully established."
1282
+ );
1283
+ }
1284
+ if (!this.promptStarted) {
1285
+ throw new NovaSonicError(
1286
+ "invalid_state" /* INVALID_STATE */,
1287
+ "Prompt not started. This should not happen - prompt should be started during connection."
1288
+ );
1289
+ }
1290
+ const contentName = crypto.randomUUID();
1291
+ await this.sendClientEvent({
1292
+ contentStart: {
1293
+ promptName,
1294
+ contentName,
1295
+ type: "TEXT",
1296
+ interactive: true,
1297
+ role: "USER",
1298
+ textInputConfiguration: {
1299
+ mediaType: "text/plain"
1300
+ }
1301
+ }
1302
+ });
1303
+ await this.sendClientEvent({
1304
+ textInput: {
1305
+ promptName,
1306
+ contentName,
1307
+ content: text
1308
+ }
1309
+ });
1310
+ await this.sendClientEvent({
1311
+ contentEnd: {
1312
+ promptName,
1313
+ contentName
1314
+ }
1315
+ });
1316
+ }
1317
+ /**
1318
+ * Convert speech to text (transcription)
1319
+ * For Nova Sonic, this is the same as send() - both stream audio input
1320
+ */
1321
+ async listen(audioStream, options) {
1322
+ if (audioStream && typeof audioStream === "object" && "read" in audioStream) {
1323
+ await this.send(audioStream);
1324
+ } else {
1325
+ throw new NovaSonicError(
1326
+ "invalid_audio_format" /* INVALID_AUDIO_FORMAT */,
1327
+ "Unsupported audio stream format for listen()"
1328
+ );
1329
+ }
1330
+ }
1331
+ /**
1332
+ * Streams audio data in real-time to the AWS Bedrock service.
1333
+ * Following AWS Nova 2 Sonic event sequence:
1334
+ * 1. contentStart (AUDIO, USER) - if not already sent
1335
+ * 2. audioInput events (one per chunk)
1336
+ * 3. contentEnd - when audio stream ends (handled separately via endAudioInput)
1337
+ */
1338
+ async send(audioData) {
1339
+ this.log(`[send] Current state: ${this.state}`);
1340
+ if (this.state !== "connected") {
1341
+ this.log(`[send] ERROR: State is '${this.state}', expected 'connected'`);
1342
+ throw new NovaSonicError(
1343
+ "not_connected" /* NOT_CONNECTED */,
1344
+ `Not connected. Current state: ${this.state}. Call connect() first.`
1345
+ );
1346
+ }
1347
+ this.log(`[send] State check passed, proceeding with send`);
1348
+ if (!(audioData instanceof Int16Array) && !(audioData && typeof audioData === "object" && "read" in audioData)) {
1349
+ throw new NovaSonicError(
1350
+ "invalid_audio_format" /* INVALID_AUDIO_FORMAT */,
1351
+ "Unsupported audio data format"
1352
+ );
1353
+ }
1354
+ if (this.turnCompleted || this.hasSentContentEnd) {
1355
+ this.log(`[send] Starting new turn - resetting flags. turnCompleted=${this.turnCompleted}, hasSentContentEnd=${this.hasSentContentEnd}.`);
1356
+ const needNewContent = this.hasSentContentEnd;
1357
+ this.turnCompleted = false;
1358
+ this.hasSentContentEnd = false;
1359
+ this.streamRestartAttempted = false;
1360
+ if (needNewContent) {
1361
+ this.audioContentStarted = false;
1362
+ this.log(`[send] contentEnd was previously sent - will create new audio content container`);
1363
+ }
1364
+ this.log(`[send] State reset: turnCompleted=false, hasSentContentEnd=false, audioContentStarted=${this.audioContentStarted}`);
1365
+ }
1366
+ if (!this.promptStarted) {
1367
+ this.promptStarted = true;
1368
+ }
1369
+ const promptName = this._promptName;
1370
+ if (!promptName) {
1371
+ throw new NovaSonicError(
1372
+ "not_connected" /* NOT_CONNECTED */,
1373
+ "Prompt name not initialized. Connection may not be fully established."
1374
+ );
1375
+ }
1376
+ if (!this.audioContentStarted) {
1377
+ const audioContentId = crypto.randomUUID();
1378
+ this.audioContentName = audioContentId;
1379
+ this.log(`[send] First audio send - sending AUDIO contentStart with contentName: ${audioContentId}`);
1380
+ await this.sendClientEvent({
1381
+ contentStart: {
1382
+ promptName,
1383
+ contentName: audioContentId,
1384
+ type: "AUDIO",
1385
+ interactive: true,
1386
+ role: "USER",
1387
+ audioInputConfiguration: {
1388
+ mediaType: "audio/lpcm",
1389
+ sampleRateHertz: 16e3,
1390
+ sampleSizeBits: 16,
1391
+ channelCount: 1,
1392
+ encoding: "base64",
1393
+ audioType: "SPEECH"
1394
+ }
1395
+ }
1396
+ });
1397
+ this.audioContentStarted = true;
1398
+ this.log(`[send] AUDIO contentStart sent, ready to stream audio`);
1399
+ } else {
1400
+ this.log(`[send] AUDIO contentStart already sent, sending audioInput chunks directly`);
1401
+ }
1402
+ if (!this.audioContentName) {
1403
+ throw new NovaSonicError(
1404
+ "invalid_state" /* INVALID_STATE */,
1405
+ "Audio content name not initialized. This should not happen."
1406
+ );
1407
+ }
1408
+ const contentName = this.audioContentName;
1409
+ if (audioData instanceof Int16Array) {
1410
+ const buffer = Buffer.from(audioData.buffer, audioData.byteOffset, audioData.byteLength);
1411
+ const base64Audio = buffer.toString("base64");
1412
+ this.log(`[send] Sending audioInput chunk, size: ${buffer.length} bytes, contentName: ${contentName}, turnCompleted: ${this.turnCompleted}, hasSentContentEnd: ${this.hasSentContentEnd}, audioContentStarted: ${this.audioContentStarted}, state: ${this.state}`);
1413
+ if (this.state !== "connected") {
1414
+ this.log(`[send] ERROR: State changed to '${this.state}' during send!`);
1415
+ throw new NovaSonicError(
1416
+ "not_connected" /* NOT_CONNECTED */,
1417
+ `Connection lost during send. State: ${this.state}`
1418
+ );
1419
+ }
1420
+ await this.sendClientEvent({
1421
+ audioInput: {
1422
+ promptName,
1423
+ contentName,
1424
+ content: base64Audio
1425
+ }
1426
+ });
1427
+ this.log(`[send] audioInput chunk sent successfully`);
1428
+ } else if (audioData && typeof audioData === "object" && "read" in audioData) {
1429
+ const stream = audioData;
1430
+ for await (const chunk of stream) {
1431
+ const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
1432
+ const base64Audio = buffer.toString("base64");
1433
+ this.log(`[send] Sending audioInput chunk from stream, size: ${buffer.length} bytes, contentName: ${contentName}, turnCompleted: ${this.turnCompleted}, hasSentContentEnd: ${this.hasSentContentEnd}`);
1434
+ await this.sendClientEvent({
1435
+ audioInput: {
1436
+ promptName,
1437
+ contentName,
1438
+ content: base64Audio
1439
+ }
1440
+ });
1441
+ }
1442
+ } else {
1443
+ throw new NovaSonicError(
1444
+ "invalid_audio_format" /* INVALID_AUDIO_FORMAT */,
1445
+ "Unsupported audio data format"
1446
+ );
1447
+ }
1448
+ }
1449
+ /**
1450
+ * End audio input stream (sends contentEnd for audio)
1451
+ * Call this when done sending audio chunks
1452
+ */
1453
+ async endAudioInput() {
1454
+ if (this.hasSentContentEnd) {
1455
+ this.log("[endAudioInput] contentEnd already sent for this turn, skipping");
1456
+ return;
1457
+ }
1458
+ if (this.turnCompleted) {
1459
+ this.log("[endAudioInput] Turn already completed by AWS, skipping contentEnd. Resetting turnCompleted flag for next turn.");
1460
+ this.turnCompleted = false;
1461
+ this.hasSentContentEnd = false;
1462
+ return;
1463
+ }
1464
+ if (this.audioContentStarted && this.audioContentName && this._promptName) {
1465
+ const promptName = this._promptName;
1466
+ this.log("[endAudioInput] Sending contentEnd for audio input");
1467
+ await this.sendClientEvent({
1468
+ contentEnd: {
1469
+ promptName,
1470
+ contentName: this.audioContentName
1471
+ }
1472
+ });
1473
+ this.hasSentContentEnd = true;
1474
+ } else {
1475
+ this.log("[endAudioInput] Cannot send contentEnd: audioContentStarted=" + this.audioContentStarted + ", audioContentName=" + this.audioContentName);
1476
+ }
1477
+ }
1478
+ /**
1479
+ * Register an event listener
1480
+ */
1481
+ on(event, callback) {
1482
+ if (!this.events[event]) {
1483
+ this.events[event] = [];
1484
+ }
1485
+ this.events[event].push(callback);
1486
+ }
1487
+ /**
1488
+ * Remove an event listener
1489
+ */
1490
+ off(event, callback) {
1491
+ if (!this.events[event]) {
1492
+ return;
1493
+ }
1494
+ const index = this.events[event].indexOf(callback);
1495
+ if (index !== -1) {
1496
+ this.events[event].splice(index, 1);
1497
+ }
1498
+ }
1499
+ /**
1500
+ * Emit an event with arguments
1501
+ */
1502
+ emit(event, data) {
1503
+ if (!this.events[event]) {
1504
+ this.log(`[NovaSonic] emit('${event}'): No listeners registered for this event`);
1505
+ return;
1506
+ }
1507
+ const listenerCount = this.events[event].length;
1508
+ this.log(`[NovaSonic] emit('${event}'): Calling ${listenerCount} listener(s)`);
1509
+ for (const callback of this.events[event]) {
1510
+ try {
1511
+ callback(data);
1512
+ this.log(`[NovaSonic] emit('${event}'): Successfully called one listener`);
1513
+ } catch (error) {
1514
+ this.log(`Error in event handler for ${event}:`, error);
1515
+ }
1516
+ }
1517
+ this.log(`[NovaSonic] emit('${event}'): Finished calling all ${listenerCount} listener(s)`);
1518
+ }
1519
+ /**
1520
+ * Get listener status
1521
+ */
1522
+ async getListener() {
1523
+ return { enabled: this.state === "connected" };
1524
+ }
1525
+ /**
1526
+ * Log helper
1527
+ */
1528
+ log(...args) {
1529
+ if (this.debug) {
1530
+ console.log("[NovaSonicVoice]", ...args);
1531
+ }
1532
+ }
1533
+ };
1534
+
1535
+ exports.NovaSonicError = NovaSonicError;
1536
+ exports.NovaSonicErrorCode = NovaSonicErrorCode;
1537
+ exports.NovaSonicVoice = NovaSonicVoice;
1538
+ //# sourceMappingURL=index.cjs.map
1539
+ //# sourceMappingURL=index.cjs.map