@mastra/voice-aws-nova-sonic 0.0.0-studio-cli-20260504022012

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1619 @@
1
+ 'use strict';
2
+
3
+ var crypto = require('crypto');
4
+ var stream = require('stream');
5
+ var clientBedrockRuntime = require('@aws-sdk/client-bedrock-runtime');
6
+ var voice = require('@mastra/core/voice');
7
+ var nodeHttpHandler = require('@smithy/node-http-handler');
8
+ var credentialProviderNode = require('@aws-sdk/credential-provider-node');
9
+
10
+ // src/index.ts
11
+
12
+ // src/types.ts
13
/**
 * String codes attached to NovaSonicError instances.
 *
 * Each key maps to the lower-snake-case string that is actually stored on the
 * error (for example `CONNECTION_FAILED` -> "connection_failed").
 */
var NovaSonicErrorCode = /* @__PURE__ */ Object.assign(NovaSonicErrorCode || {}, {
  // Connection / authentication
  CONNECTION_FAILED: "connection_failed",
  CONNECTION_NOT_ESTABLISHED: "connection_not_established",
  AUTHENTICATION_FAILED: "authentication_failed",
  CREDENTIALS_MISSING: "credentials_missing",
  REGION_INVALID: "region_invalid",
  WEBSOCKET_ERROR: "websocket_error",
  // Audio
  AUDIO_PROCESSING_ERROR: "audio_processing_error",
  AUDIO_STREAM_ERROR: "audio_stream_error",
  SPEAKER_STREAM_ERROR: "speaker_stream_error",
  // Transcription
  TRANSCRIPTION_TIMEOUT: "transcription_timeout",
  TRANSCRIPTION_FAILED: "transcription_failed",
  // Tools
  TOOL_EXECUTION_ERROR: "tool_execution_error",
  TOOL_NOT_FOUND: "tool_not_found",
  // Session / state / validation
  SESSION_CONFIG_UPDATE_FAILED: "session_config_update_failed",
  INVALID_AUDIO_FORMAT: "invalid_audio_format",
  NOT_CONNECTED: "not_connected",
  INVALID_STATE: "invalid_state",
  VALIDATION_ERROR: "validation_error",
  UNKNOWN_ERROR: "unknown_error"
});
35
+
36
+ // src/utils/errors.ts
37
/**
 * Error type used by the Nova Sonic voice provider.
 *
 * Carries a string `code`, optional caller-supplied `details`, and the
 * creation time in epoch milliseconds. When `details` is itself an Error it is
 * also forwarded as the standard `cause`, so the wrapped error's stack is
 * reachable through normal error tooling; `details` is left untouched.
 */
var NovaSonicError = class extends Error {
  /** Error code string (see NovaSonicErrorCode). */
  code;
  /** Arbitrary extra context supplied by the thrower; may be undefined. */
  details;
  /** Creation time from Date.now(). */
  timestamp;
  /**
   * @param code - Error code string
   * @param message - Human-readable message
   * @param details - Optional extra context (often the underlying error)
   */
  constructor(code, message, details) {
    // Only pass options when there is a real Error to chain; otherwise no
    // `cause` property is created at all.
    super(message, details instanceof Error ? { cause: details } : void 0);
    this.name = "NovaSonicError";
    this.code = code;
    this.details = details;
    this.timestamp = Date.now();
  }
  /**
   * Plain-object form of this error.
   *
   * @returns An object with `message`, `code`, `details` and `timestamp`
   */
  toEventData() {
    return {
      message: this.message,
      code: this.code,
      details: this.details,
      timestamp: this.timestamp
    };
  }
};
57
+
58
+ // src/utils/auth.ts
59
/**
 * Resolve the AWS credentials to use.
 *
 * Explicit credentials, when truthy, are returned unchanged. Otherwise the
 * default provider chain from `@aws-sdk/credential-provider-node` is invoked.
 *
 * @param explicitCredentials - Credentials supplied in config (optional)
 * @param debug - When truthy, progress is written with console.info
 * @returns The resolved credentials object
 * @throws {NovaSonicError} "authentication_failed" when the provider chain
 *   fails; the original error is attached as `details`
 */
async function getAwsCredentials(explicitCredentials, debug) {
  const trace = (message) => {
    if (debug) {
      console.info(message);
    }
  };

  if (explicitCredentials) {
    trace("[getAwsCredentials] Using explicit credentials provided in config");
    return explicitCredentials;
  }

  try {
    trace("[getAwsCredentials] Using default credential provider chain");
    const resolveFromChain = credentialProviderNode.defaultProvider();
    const resolved = await resolveFromChain();
    trace("[getAwsCredentials] Credentials retrieved successfully");
    return resolved;
  } catch (error) {
    // Errors that are already in our own format pass through untouched.
    if (error instanceof NovaSonicError) {
      throw error;
    }
    const reason = error instanceof Error ? error.message : "Unknown error";
    throw new NovaSonicError(
      "authentication_failed" /* AUTHENTICATION_FAILED */,
      `Failed to load AWS credentials: ${reason}`,
      error
    );
  }
}
86
+
87
+ // src/index.ts
88
// Model ID used when the config does not supply one.
var DEFAULT_MODEL = "amazon.nova-2-sonic-v1:0";
// AWS region used when the config does not supply one.
var DEFAULT_REGION = "us-east-1";
90
+ var NovaSonicVoice = class extends voice.MastraVoice {
91
+ client;
92
+ stream;
93
+ inputStream;
94
+ // Input stream for sending events to AWS
95
+ _eventQueue;
96
+ _signalQueue;
97
+ _closeSignal;
98
+ _promptName;
99
+ state = "disconnected";
100
+ events;
101
+ instructions;
102
+ tools;
103
+ requestContext;
104
+ debug;
105
+ region;
106
+ model;
107
+ credentials;
108
+ speakerStreams;
109
+ currentResponseId;
110
+ processingStream = false;
111
+ streamRestartAttempted = false;
112
+ // Prevent multiple restart attempts
113
+ sessionConfig;
114
+ promptStarted = false;
115
+ // Track if promptStart was sent (now sent during connection)
116
+ audioContentName;
117
+ audioContentStarted = false;
118
+ hasSentContentEnd = false;
119
+ // Track if contentEnd has been sent for current turn
120
+ turnCompleted = false;
121
+ // Track if turn has been completed (to prevent sending contentEnd after turn completion)
122
+ turnCompleteTimeout;
123
+ // Timeout for fallback turn completion
124
+ isReceivingAssistantAudio = false;
125
+ // Track if we're currently receiving assistant audio output
126
+ currentTextGenerationStage;
127
+ // Track generationStage (SPECULATIVE|FINAL) for current text content block
128
+ /**
129
+ * Creates a new instance of NovaSonicVoice.
130
+ *
131
+ * @param config - Configuration options for the voice instance
132
+ * @param config.region - AWS region (defaults to us-east-1)
133
+ * @param config.model - The model ID to use (defaults to amazon.nova-2-sonic-v1:0)
134
+ * @param config.credentials - AWS credentials (optional, uses default credential chain)
135
+ * @param config.speaker - Voice name/identifier
136
+ * @param config.languageCode - Language code for the voice
137
+ * @param config.debug - Enable debug mode
138
+ *
139
+ * @example
140
+ * ```typescript
141
+ * const voice = new NovaSonicVoice({
142
+ * region: 'us-east-1',
143
+ * model: 'amazon.nova-2-sonic-v1:0',
144
+ * speaker: 'default',
145
+ * });
146
+ * ```
147
+ */
148
+ constructor(config = {}) {
149
+ let normalizedConfig;
150
+ if ("realtimeConfig" in config || "speechModel" in config || "listeningModel" in config) {
151
+ normalizedConfig = config;
152
+ } else {
153
+ const configOptions = config;
154
+ normalizedConfig = {
155
+ realtimeConfig: {
156
+ model: configOptions.model || DEFAULT_MODEL,
157
+ apiKey: void 0,
158
+ // AWS doesn't use API keys
159
+ options: configOptions
160
+ },
161
+ speaker: typeof configOptions.speaker === "string" ? configOptions.speaker : "matthew"
162
+ };
163
+ }
164
+ super(normalizedConfig);
165
+ const options = normalizedConfig.realtimeConfig?.options || config;
166
+ this.region = options.region || DEFAULT_REGION;
167
+ this.model = options.model || DEFAULT_MODEL;
168
+ this.credentials = options.credentials;
169
+ this.debug = options.debug || false;
170
+ this.sessionConfig = options.sessionConfig;
171
+ this.events = {};
172
+ this.speakerStreams = /* @__PURE__ */ new Map();
173
+ const validRegions = ["us-east-1", "us-west-2", "ap-northeast-1"];
174
+ if (!validRegions.includes(this.region)) {
175
+ throw new NovaSonicError(
176
+ "region_invalid" /* REGION_INVALID */,
177
+ `Invalid region: ${this.region}. Supported regions: ${validRegions.join(", ")}`
178
+ );
179
+ }
180
+ }
181
+ /**
182
+ * Returns a list of available voice speakers.
183
+ *
184
+ * Nova 2 Sonic provides expressive voices across multiple languages.
185
+ * Tiffany (en-US, feminine) and Matthew (en-US, masculine) are polyglot
186
+ * voices that can speak all supported languages.
187
+ *
188
+ * @returns Promise resolving to an array of voice objects
189
+ */
190
+ async getSpeakers() {
191
+ return Promise.resolve([
192
+ // English (US) - Polyglot voices
193
+ { voiceId: "tiffany", name: "Tiffany", language: "English", locale: "en-US", gender: "feminine", polyglot: true },
194
+ {
195
+ voiceId: "matthew",
196
+ name: "Matthew",
197
+ language: "English",
198
+ locale: "en-US",
199
+ gender: "masculine",
200
+ polyglot: true
201
+ },
202
+ // English (UK)
203
+ { voiceId: "amy", name: "Amy", language: "English", locale: "en-GB", gender: "feminine", polyglot: false },
204
+ // English (Australia)
205
+ { voiceId: "olivia", name: "Olivia", language: "English", locale: "en-AU", gender: "feminine", polyglot: false },
206
+ // English (Indian)
207
+ { voiceId: "kiara", name: "Kiara", language: "English", locale: "en-IN", gender: "feminine", polyglot: false },
208
+ { voiceId: "arjun", name: "Arjun", language: "English", locale: "en-IN", gender: "masculine", polyglot: false },
209
+ // French
210
+ { voiceId: "ambre", name: "Ambre", language: "French", locale: "fr-FR", gender: "feminine", polyglot: false },
211
+ {
212
+ voiceId: "florian",
213
+ name: "Florian",
214
+ language: "French",
215
+ locale: "fr-FR",
216
+ gender: "masculine",
217
+ polyglot: false
218
+ },
219
+ // Italian
220
+ {
221
+ voiceId: "beatrice",
222
+ name: "Beatrice",
223
+ language: "Italian",
224
+ locale: "it-IT",
225
+ gender: "feminine",
226
+ polyglot: false
227
+ },
228
+ {
229
+ voiceId: "lorenzo",
230
+ name: "Lorenzo",
231
+ language: "Italian",
232
+ locale: "it-IT",
233
+ gender: "masculine",
234
+ polyglot: false
235
+ },
236
+ // German
237
+ { voiceId: "tina", name: "Tina", language: "German", locale: "de-DE", gender: "feminine", polyglot: false },
238
+ {
239
+ voiceId: "lennart",
240
+ name: "Lennart",
241
+ language: "German",
242
+ locale: "de-DE",
243
+ gender: "masculine",
244
+ polyglot: false
245
+ },
246
+ // Spanish (US)
247
+ { voiceId: "lupe", name: "Lupe", language: "Spanish", locale: "es-US", gender: "feminine", polyglot: false },
248
+ { voiceId: "carlos", name: "Carlos", language: "Spanish", locale: "es-US", gender: "masculine", polyglot: false },
249
+ // Portuguese
250
+ {
251
+ voiceId: "carolina",
252
+ name: "Carolina",
253
+ language: "Portuguese",
254
+ locale: "pt-BR",
255
+ gender: "feminine",
256
+ polyglot: false
257
+ },
258
+ { voiceId: "leo", name: "Leo", language: "Portuguese", locale: "pt-BR", gender: "masculine", polyglot: false },
259
+ // Hindi
260
+ { voiceId: "kiara", name: "Kiara", language: "Hindi", locale: "hi-IN", gender: "feminine", polyglot: false },
261
+ { voiceId: "arjun", name: "Arjun", language: "Hindi", locale: "hi-IN", gender: "masculine", polyglot: false }
262
+ ]);
263
+ }
264
+ /**
265
+ * Establishes a connection to the AWS Bedrock bidirectional streaming service.
266
+ * Must be called before using speak, listen, or send functions.
267
+ *
268
+ * @throws {NovaSonicError} If connection fails or credentials are missing
269
+ *
270
+ * @example
271
+ * ```typescript
272
+ * await voice.connect();
273
+ * // Now ready for voice interactions
274
+ * ```
275
+ */
276
+ async connect({ requestContext } = {}) {
277
+ if (this.state === "connected" || this.state === "connecting") {
278
+ this.log("Already connected or connecting");
279
+ return;
280
+ }
281
+ this.state = "connecting";
282
+ this.requestContext = requestContext;
283
+ this.streamRestartAttempted = false;
284
+ try {
285
+ await this.createBedrockClient();
286
+ const asyncIterable = this.createEventQueue();
287
+ this.enqueueInitialSessionEvents();
288
+ await this.sendInitialConnectCommand(asyncIterable);
289
+ this.processStream().catch((error) => {
290
+ this.log("Error in stream processing:", error);
291
+ this.emit("error", {
292
+ message: error instanceof Error ? error.message : "Stream processing error",
293
+ code: "STREAM_PROCESSING_ERROR",
294
+ details: error
295
+ });
296
+ });
297
+ this.log("Connected to AWS Bedrock Nova 2 Sonic");
298
+ } catch (error) {
299
+ this.state = "disconnected";
300
+ if (this.client) {
301
+ if (typeof this.client.destroy === "function") {
302
+ this.client.destroy();
303
+ }
304
+ this.client = void 0;
305
+ }
306
+ this.log("Connection error:", error);
307
+ const errorMessage = error instanceof Error ? error.message : "Unknown error during connection";
308
+ throw new NovaSonicError("connection_failed" /* CONNECTION_FAILED */, `Failed to connect to AWS Bedrock: ${errorMessage}`, error);
309
+ }
310
+ }
311
+ /**
312
+ * Resolve credentials and initialize the Bedrock Runtime client over HTTP/2.
313
+ */
314
+ async createBedrockClient() {
315
+ this.log("Getting AWS credentials...");
316
+ const credentials = await getAwsCredentials(this.credentials, this.debug);
317
+ if (!credentials) {
318
+ throw new NovaSonicError(
319
+ "credentials_missing" /* CREDENTIALS_MISSING */,
320
+ "AWS credentials are required. Please configure AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables or provide credentials in the config."
321
+ );
322
+ }
323
+ this.log("Credentials retrieved:", {
324
+ hasAccessKeyId: !!credentials.accessKeyId,
325
+ hasSecretAccessKey: !!credentials.secretAccessKey,
326
+ hasSessionToken: !!credentials.sessionToken,
327
+ accessKeyIdPrefix: credentials.accessKeyId ? `${credentials.accessKeyId.substring(0, 6)}...` : "missing",
328
+ expiration: credentials.expiration ? credentials.expiration.toISOString() : "no expiration"
329
+ });
330
+ this.log(`Initializing Bedrock Runtime client for region: ${this.region}, model: ${this.model}`);
331
+ const nodeHttp2Handler = new nodeHttpHandler.NodeHttp2Handler({
332
+ requestTimeout: 3e5,
333
+ // 5 minutes
334
+ sessionTimeout: 3e5,
335
+ // 5 minutes
336
+ disableConcurrentStreams: false,
337
+ maxConcurrentStreams: 20
338
+ });
339
+ this.client = new clientBedrockRuntime.BedrockRuntimeClient({
340
+ region: this.region,
341
+ credentials,
342
+ requestHandler: nodeHttp2Handler
343
+ });
344
+ }
345
+ /**
346
+ * Build the async-iterable event queue used as the request body for the
347
+ * bidirectional stream. Returns the iterable and wires up internal queue
348
+ * helpers (_eventQueue, _signalQueue, _closeSignal) used by sendClientEvent.
349
+ */
350
+ createEventQueue() {
351
+ this.log("Creating bidirectional stream command...");
352
+ const voiceInstance = this;
353
+ const eventQueue = [];
354
+ const pendingResolvers = [];
355
+ let closeSignal = false;
356
+ const signalQueue = () => {
357
+ if (pendingResolvers.length > 0) {
358
+ voiceInstance.log(`[AsyncIterable] Signaling queue - resolving ${pendingResolvers.length} pending Promise(s)`);
359
+ const resolvers = [...pendingResolvers];
360
+ pendingResolvers.length = 0;
361
+ resolvers.forEach((resolve) => resolve());
362
+ } else {
363
+ voiceInstance.log("[AsyncIterable] signalQueue called but no pending Promise");
364
+ }
365
+ };
366
+ const asyncIterable = {
367
+ [Symbol.asyncIterator]: () => {
368
+ voiceInstance.log("[AsyncIterable] Iterator requested");
369
+ return {
370
+ next: async () => {
371
+ try {
372
+ if (closeSignal || voiceInstance.state === "disconnected") {
373
+ voiceInstance.log(`[AsyncIterable] Stream closed (state: ${voiceInstance.state}), done = true`);
374
+ return { value: void 0, done: true };
375
+ }
376
+ if (eventQueue.length === 0) {
377
+ try {
378
+ voiceInstance.log("[AsyncIterable] Queue empty, waiting for signal...");
379
+ await new Promise((resolve) => {
380
+ pendingResolvers.push(resolve);
381
+ voiceInstance.log(
382
+ `[AsyncIterable] Promise created, waiting for signal (${pendingResolvers.length} pending)...`
383
+ );
384
+ setImmediate(() => {
385
+ if (eventQueue.length > 0) {
386
+ voiceInstance.log("[AsyncIterable] Data arrived before wait, resolving immediately");
387
+ const index = pendingResolvers.indexOf(resolve);
388
+ if (index !== -1) {
389
+ pendingResolvers.splice(index, 1);
390
+ resolve();
391
+ }
392
+ return;
393
+ }
394
+ if (closeSignal || voiceInstance.state === "disconnected") {
395
+ voiceInstance.log("[AsyncIterable] Closed before wait, resolving");
396
+ const index = pendingResolvers.indexOf(resolve);
397
+ if (index !== -1) {
398
+ pendingResolvers.splice(index, 1);
399
+ resolve();
400
+ }
401
+ return;
402
+ }
403
+ });
404
+ });
405
+ voiceInstance.log("[AsyncIterable] Promise resolved, checking queue...");
406
+ } catch (error) {
407
+ if (error instanceof Error && error.message === "Stream closed") {
408
+ voiceInstance.log("[AsyncIterable] Stream closed during wait");
409
+ return { value: void 0, done: true };
410
+ }
411
+ voiceInstance.log("[AsyncIterable] Error during wait:", error);
412
+ }
413
+ }
414
+ if (closeSignal) {
415
+ voiceInstance.log("[AsyncIterable] Stream closed (closeSignal)");
416
+ return { value: void 0, done: true };
417
+ }
418
+ if (voiceInstance.state === "disconnected") {
419
+ voiceInstance.log("[AsyncIterable] Stream closed (disconnected state)");
420
+ return { value: void 0, done: true };
421
+ }
422
+ while (eventQueue.length === 0 && !closeSignal) {
423
+ if (voiceInstance.state === "disconnected") {
424
+ voiceInstance.log("[AsyncIterable] Stream closed before wait loop");
425
+ return { value: void 0, done: true };
426
+ }
427
+ voiceInstance.log("[AsyncIterable] Queue still empty, waiting again...");
428
+ await new Promise((resolve) => {
429
+ pendingResolvers.push(resolve);
430
+ setImmediate(() => {
431
+ if (eventQueue.length > 0 || closeSignal || voiceInstance.state === "disconnected") {
432
+ const index = pendingResolvers.indexOf(resolve);
433
+ if (index !== -1) {
434
+ pendingResolvers.splice(index, 1);
435
+ resolve();
436
+ }
437
+ }
438
+ });
439
+ });
440
+ if (closeSignal || voiceInstance.state === "disconnected") {
441
+ voiceInstance.log("[AsyncIterable] Stream closed during wait loop");
442
+ return { value: void 0, done: true };
443
+ }
444
+ }
445
+ const nextEvent = eventQueue.shift();
446
+ const eventJson = JSON.stringify(nextEvent);
447
+ const eventBytes = Buffer.from(eventJson, "utf-8");
448
+ voiceInstance.log(`[AsyncIterable] Yielding event of size: ${eventBytes.length}`);
449
+ return {
450
+ value: {
451
+ chunk: {
452
+ bytes: eventBytes
453
+ }
454
+ },
455
+ done: false
456
+ };
457
+ } catch (error) {
458
+ voiceInstance.log("[AsyncIterable] Error in iterator:", error);
459
+ closeSignal = true;
460
+ return { value: void 0, done: true };
461
+ }
462
+ },
463
+ return: async () => {
464
+ voiceInstance.log("[AsyncIterable] Iterator return() called");
465
+ closeSignal = true;
466
+ signalQueue();
467
+ return { value: void 0, done: true };
468
+ },
469
+ throw: async (error) => {
470
+ voiceInstance.log("[AsyncIterable] Iterator throw() called:", error);
471
+ closeSignal = true;
472
+ signalQueue();
473
+ throw error;
474
+ }
475
+ };
476
+ }
477
+ };
478
+ this._eventQueue = eventQueue;
479
+ this._signalQueue = signalQueue;
480
+ this._closeSignal = () => {
481
+ closeSignal = true;
482
+ signalQueue();
483
+ };
484
+ return asyncIterable;
485
+ }
486
+ /**
487
+ * Pre-populate the event queue with the AWS Nova Sonic connection
488
+ * handshake events: sessionStart, promptStart, then a SYSTEM text content
489
+ * block carrying the configured instructions. AUDIO contentStart is NOT
490
+ * sent here; it is deferred to the first send() call.
491
+ */
492
+ enqueueInitialSessionEvents() {
493
+ const eventQueue = this._eventQueue;
494
+ if (!eventQueue) {
495
+ throw new NovaSonicError(
496
+ "connection_failed" /* CONNECTION_FAILED */,
497
+ "Event queue must be initialized before enqueueing session events"
498
+ );
499
+ }
500
+ this.log("Pre-populating queue with sessionStart and promptStart events...");
501
+ const promptName = crypto.randomUUID();
502
+ this._promptName = promptName;
503
+ const sessionStartEvent = {};
504
+ if (this.sessionConfig) {
505
+ if (this.sessionConfig.inferenceConfiguration) {
506
+ sessionStartEvent.inferenceConfiguration = {
507
+ maxTokens: this.sessionConfig.inferenceConfiguration.maxTokens || 4096,
508
+ topP: this.sessionConfig.inferenceConfiguration.topP || 0.9,
509
+ temperature: this.sessionConfig.inferenceConfiguration.temperature || 0.7,
510
+ ...this.sessionConfig.inferenceConfiguration.topK !== void 0 && {
511
+ topK: this.sessionConfig.inferenceConfiguration.topK
512
+ },
513
+ ...this.sessionConfig.inferenceConfiguration.stopSequences && {
514
+ stopSequences: this.sessionConfig.inferenceConfiguration.stopSequences
515
+ }
516
+ };
517
+ } else {
518
+ sessionStartEvent.inferenceConfiguration = {
519
+ maxTokens: 4096,
520
+ topP: 0.9,
521
+ temperature: 0.7
522
+ };
523
+ }
524
+ if (this.sessionConfig.turnDetectionConfiguration) {
525
+ sessionStartEvent.turnDetectionConfiguration = {
526
+ ...this.sessionConfig.turnDetectionConfiguration.endpointingSensitivity && {
527
+ endpointingSensitivity: this.sessionConfig.turnDetectionConfiguration.endpointingSensitivity
528
+ }
529
+ };
530
+ }
531
+ } else {
532
+ sessionStartEvent.inferenceConfiguration = {
533
+ maxTokens: 4096,
534
+ topP: 0.9,
535
+ temperature: 0.7
536
+ };
537
+ }
538
+ eventQueue.push({
539
+ event: {
540
+ sessionStart: sessionStartEvent
541
+ }
542
+ });
543
+ let voiceId = "matthew";
544
+ if (this.sessionConfig?.voice) {
545
+ if (typeof this.sessionConfig.voice === "string") {
546
+ voiceId = this.sessionConfig.voice;
547
+ } else if (this.sessionConfig.voice.name) {
548
+ voiceId = this.sessionConfig.voice.name;
549
+ }
550
+ } else if (this.speaker && this.speaker !== "default") {
551
+ if (typeof this.speaker === "string") {
552
+ voiceId = this.speaker;
553
+ } else {
554
+ const speakerObj = this.speaker;
555
+ if (speakerObj && typeof speakerObj === "object" && speakerObj.name) {
556
+ voiceId = speakerObj.name;
557
+ }
558
+ }
559
+ }
560
+ const promptStartEvent = {
561
+ promptName,
562
+ textOutputConfiguration: {
563
+ mediaType: "text/plain"
564
+ },
565
+ // AWS REQUIRES this - cannot be omitted
566
+ audioOutputConfiguration: {
567
+ mediaType: "audio/lpcm",
568
+ sampleRateHertz: 24e3,
569
+ sampleSizeBits: 16,
570
+ channelCount: 1,
571
+ voiceId,
572
+ encoding: "base64",
573
+ audioType: "SPEECH"
574
+ }
575
+ };
576
+ if (this.sessionConfig?.tools && this.sessionConfig.tools.length > 0) {
577
+ promptStartEvent.toolConfiguration = {
578
+ tools: this.sessionConfig.tools.map((tool) => {
579
+ let inputSchemaJson;
580
+ if (typeof tool.inputSchema === "string") {
581
+ inputSchemaJson = tool.inputSchema;
582
+ } else {
583
+ inputSchemaJson = JSON.stringify(tool.inputSchema);
584
+ }
585
+ return {
586
+ toolSpec: {
587
+ name: tool.name,
588
+ description: tool.description,
589
+ inputSchema: {
590
+ json: inputSchemaJson
591
+ }
592
+ }
593
+ };
594
+ }),
595
+ // toolChoice goes inside toolConfiguration for Nova 2 Sonic
596
+ ...this.sessionConfig?.toolChoice && { toolChoice: this.sessionConfig.toolChoice }
597
+ };
598
+ } else if (this.sessionConfig?.toolChoice) {
599
+ promptStartEvent.toolConfiguration = {
600
+ toolChoice: this.sessionConfig.toolChoice
601
+ };
602
+ }
603
+ eventQueue.push({
604
+ event: {
605
+ promptStart: promptStartEvent
606
+ }
607
+ });
608
+ this.promptStarted = true;
609
+ const systemContentName = crypto.randomUUID();
610
+ eventQueue.push({
611
+ event: {
612
+ contentStart: {
613
+ promptName,
614
+ contentName: systemContentName,
615
+ type: "TEXT",
616
+ interactive: false,
617
+ role: "SYSTEM",
618
+ textInputConfiguration: {
619
+ mediaType: "text/plain"
620
+ }
621
+ }
622
+ }
623
+ });
624
+ eventQueue.push({
625
+ event: {
626
+ textInput: {
627
+ promptName,
628
+ contentName: systemContentName,
629
+ content: this.instructions || ""
630
+ }
631
+ }
632
+ });
633
+ eventQueue.push({
634
+ event: {
635
+ contentEnd: {
636
+ promptName,
637
+ contentName: systemContentName
638
+ }
639
+ }
640
+ });
641
+ this.audioContentStarted = false;
642
+ this.log(`Queue pre-populated with ${eventQueue.length} event(s)`);
643
+ }
644
+ /**
645
+ * Issue the InvokeModelWithBidirectionalStreamCommand to AWS Bedrock with
646
+ * a 5-second abort timeout that tears down the client on hang to avoid
647
+ * leaked HTTP/2 sessions. On success the response stream is stored and the
648
+ * voice transitions to 'connected'.
649
+ */
650
+ async sendInitialConnectCommand(asyncIterable) {
651
+ if (!this.client) {
652
+ throw new NovaSonicError(
653
+ "connection_failed" /* CONNECTION_FAILED */,
654
+ "Bedrock client must be created before sending the initial command"
655
+ );
656
+ }
657
+ const command = new clientBedrockRuntime.InvokeModelWithBidirectionalStreamCommand({
658
+ modelId: this.model,
659
+ body: asyncIterable
660
+ // Type assertion needed as SDK types may be strict
661
+ });
662
+ const sendStartTime = Date.now();
663
+ const abortController = new AbortController();
664
+ const timeoutId = setTimeout(() => {
665
+ this.log("[DEBUG] client.send() timeout after 5 seconds - aborting request");
666
+ abortController.abort();
667
+ }, 5e3);
668
+ let response;
669
+ try {
670
+ response = await this.client.send(command, { abortSignal: abortController.signal });
671
+ } catch (error) {
672
+ const sendDuration2 = Date.now() - sendStartTime;
673
+ if (abortController.signal.aborted) {
674
+ this.log(`[DEBUG] client.send() aborted after ${sendDuration2}ms`);
675
+ this._closeSignal?.();
676
+ this.client.destroy();
677
+ throw new Error("client.send() timeout");
678
+ }
679
+ this.log(`[DEBUG] client.send() error after ${sendDuration2}ms:`, error);
680
+ throw error;
681
+ } finally {
682
+ clearTimeout(timeoutId);
683
+ }
684
+ const sendDuration = Date.now() - sendStartTime;
685
+ this.log(`[DEBUG] client.send() completed in ${sendDuration}ms`);
686
+ this.log("Received response from AWS Bedrock");
687
+ this.stream = response.body;
688
+ this.log(
689
+ `[DEBUG] Response stream is async iterable: ${this.stream && typeof this.stream[Symbol.asyncIterator] === "function"}`
690
+ );
691
+ this.state = "connected";
692
+ this.log(`[STATE] State set to 'connected'`);
693
+ }
694
+ /**
695
+ * Process the bidirectional stream from AWS Bedrock
696
+ */
697
+ async processStream() {
698
+ if (!this.stream) {
699
+ this.log("[Stream] No stream available, cannot process");
700
+ return;
701
+ }
702
+ if (this.processingStream) {
703
+ this.log("[Stream] Already processing stream, skipping");
704
+ return;
705
+ }
706
+ this.processingStream = true;
707
+ this.log("[Stream] Starting stream processing");
708
+ let eventCount = 0;
709
+ let lastEventTime = Date.now();
710
+ try {
711
+ for await (const chunk of this.stream) {
712
+ if (chunk.chunk) {
713
+ const textResponse = Buffer.from(chunk.chunk.bytes || []).toString("utf-8");
714
+ eventCount++;
715
+ const now = Date.now();
716
+ const timeSinceLastEvent = now - lastEventTime;
717
+ lastEventTime = now;
718
+ this.log(
719
+ `[Stream] Received chunk #${eventCount}, length: ${textResponse.length}, time since last: ${timeSinceLastEvent}ms`
720
+ );
721
+ try {
722
+ const jsonResponse = JSON.parse(textResponse);
723
+ this.log(`[Stream] ========================================`);
724
+ this.log(`[Stream] Parsed JSON response, keys: ${Object.keys(jsonResponse).join(", ")}`);
725
+ if (jsonResponse.event) {
726
+ const eventKeys = Object.keys(jsonResponse.event);
727
+ this.log(`[Stream] Event keys: ${eventKeys.join(", ")}`);
728
+ if (jsonResponse.event.contentStart) {
729
+ this.log(`[Stream] \u2192 Handling contentStart`);
730
+ this.handleServerEvent({ contentStart: jsonResponse.event.contentStart });
731
+ } else if (jsonResponse.event.textOutput) {
732
+ this.log(
733
+ `[Stream] \u2192 Handling textOutput, content length: ${jsonResponse.event.textOutput?.content?.length ?? 0}`
734
+ );
735
+ this.handleServerEvent({ textOutput: jsonResponse.event.textOutput });
736
+ } else if (jsonResponse.event.audioOutput) {
737
+ this.handleServerEvent({ audioOutput: jsonResponse.event.audioOutput });
738
+ } else if (jsonResponse.event.toolUse) {
739
+ this.handleServerEvent({ toolUse: jsonResponse.event.toolUse });
740
+ } else if (jsonResponse.event.contentEnd && jsonResponse.event.contentEnd.type === "TOOL") {
741
+ this.handleServerEvent({ contentEnd: jsonResponse.event.contentEnd });
742
+ } else if (jsonResponse.event.contentEnd) {
743
+ this.log(
744
+ `[Stream] Found contentEnd, type: ${jsonResponse.event.contentEnd.type}, stopReason: ${jsonResponse.event.contentEnd.stopReason}`
745
+ );
746
+ this.handleServerEvent({ contentEnd: jsonResponse.event.contentEnd });
747
+ } else if (jsonResponse.event.completionStart) {
748
+ this.log(
749
+ "[Stream] Found completionStart inside event object:",
750
+ JSON.stringify(jsonResponse.event.completionStart, null, 2)
751
+ );
752
+ this.emit("completionStart", jsonResponse.event.completionStart);
753
+ } else if (jsonResponse.event.completionEnd) {
754
+ this.log(
755
+ "[Stream] Found completionEnd inside event object:",
756
+ JSON.stringify(jsonResponse.event.completionEnd, null, 2)
757
+ );
758
+ this.handleServerEvent({ completionEnd: jsonResponse.event.completionEnd });
759
+ } else {
760
+ const eventKeys2 = Object.keys(jsonResponse.event || {});
761
+ this.log(`[Stream] Event keys for other events: ${eventKeys2.join(", ")}`);
762
+ if (eventKeys2.length > 0) {
763
+ if (eventKeys2.includes("completionEnd")) {
764
+ this.log("[Stream] Found completionEnd in other events, handling explicitly");
765
+ this.handleServerEvent({ completionEnd: jsonResponse.event.completionEnd });
766
+ } else {
767
+ const eventKey = eventKeys2[0];
768
+ this.log(`[Stream] Dispatching other event: ${eventKey}`);
769
+ const eventValue = jsonResponse.event[eventKey];
770
+ if (eventValue !== void 0) {
771
+ if (eventKey === "completionEnd") {
772
+ this.handleServerEvent({ completionEnd: eventValue });
773
+ } else {
774
+ this.handleServerEvent({ [eventKey]: eventValue });
775
+ }
776
+ }
777
+ }
778
+ } else if (Object.keys(jsonResponse).length > 0) {
779
+ this.log(`[Stream] Unknown event structure, keys:`, Object.keys(jsonResponse).join(", "));
780
+ }
781
+ }
782
+ } else {
783
+ if (this.debug) {
784
+ this.log(
785
+ '[Stream] Received event without "event" wrapper, keys:',
786
+ Object.keys(jsonResponse).join(", ")
787
+ );
788
+ }
789
+ if (jsonResponse.usageEvent) {
790
+ this.emit("usage", {
791
+ inputTokens: jsonResponse.usageEvent.totalInputTokens || 0,
792
+ outputTokens: jsonResponse.usageEvent.totalOutputTokens || 0,
793
+ totalTokens: jsonResponse.usageEvent.totalTokens || 0
794
+ });
795
+ }
796
+ if (jsonResponse.completionEnd) {
797
+ this.log(
798
+ "[Stream] Found completionEnd at top level:",
799
+ JSON.stringify(jsonResponse.completionEnd, null, 2)
800
+ );
801
+ this.handleServerEvent({ completionEnd: jsonResponse.completionEnd });
802
+ }
803
+ if (!jsonResponse.event && !jsonResponse.completionEnd && !jsonResponse.usageEvent) {
804
+ this.log(
805
+ "[Stream] Received response without event wrapper, keys:",
806
+ Object.keys(jsonResponse).join(", ")
807
+ );
808
+ }
809
+ if (jsonResponse.completionStart || jsonResponse.event?.completionStart) {
810
+ const completionStart = jsonResponse.completionStart || jsonResponse.event.completionStart;
811
+ this.log("[Stream] Found completionStart:", JSON.stringify(completionStart, null, 2));
812
+ this.emit("completionStart", completionStart);
813
+ }
814
+ }
815
+ } catch (parseError) {
816
+ this.log("[Stream] Failed to parse JSON response:", textResponse.substring(0, 200));
817
+ this.emit("error", {
818
+ message: "Failed to parse stream response",
819
+ code: "PARSE_ERROR",
820
+ details: parseError
821
+ });
822
+ }
823
+ } else if (chunk.internalServerException) {
824
+ this.emit("error", {
825
+ message: "Internal server error",
826
+ code: "INTERNAL_SERVER_ERROR",
827
+ details: chunk.internalServerException
828
+ });
829
+ } else if (chunk.modelStreamErrorException) {
830
+ this.emit("error", {
831
+ message: "Model stream error",
832
+ code: "MODEL_STREAM_ERROR",
833
+ details: chunk.modelStreamErrorException
834
+ });
835
+ } else if (chunk.modelTimeoutException) {
836
+ this.emit("error", {
837
+ message: "Model timeout",
838
+ code: "MODEL_TIMEOUT",
839
+ details: chunk.modelTimeoutException
840
+ });
841
+ } else if (chunk.serviceUnavailableException) {
842
+ this.emit("error", {
843
+ message: "Service unavailable",
844
+ code: "SERVICE_UNAVAILABLE",
845
+ details: chunk.serviceUnavailableException
846
+ });
847
+ } else if (chunk.throttlingException) {
848
+ this.emit("error", {
849
+ message: "Request throttled",
850
+ code: "THROTTLING",
851
+ details: chunk.throttlingException
852
+ });
853
+ } else if (chunk.validationException) {
854
+ this.emit("error", {
855
+ message: "Validation error",
856
+ code: "VALIDATION_ERROR",
857
+ details: chunk.validationException
858
+ });
859
+ }
860
+ }
861
+ } catch (streamError) {
862
+ this.log("[Stream] Error in processStream:", streamError);
863
+ this.emit("error", {
864
+ message: "Stream processing error",
865
+ code: "STREAM_ERROR",
866
+ details: streamError instanceof Error ? streamError.message : String(streamError)
867
+ });
868
+ } finally {
869
+ this.processingStream = false;
870
+ this.log(
871
+ `[Stream] processStream finished, processingStream set to false. Total events received: ${eventCount || 0}`
872
+ );
873
+ this.log(`[Stream] Stream state: state=${this.state}, stream exists=${!!this.stream}`);
874
+ if (!this.turnCompleted && this.audioContentStarted) {
875
+ this.log("[Stream] Stream ended but turn not completed - signaling turn completion as fallback");
876
+ this.log(
877
+ `[Stream] State: turnCompleted=${this.turnCompleted}, audioContentStarted=${this.audioContentStarted}, hasSentContentEnd=${this.hasSentContentEnd}`
878
+ );
879
+ this.turnCompleted = true;
880
+ this.emit("turnComplete", { timestamp: Date.now() });
881
+ if (this.currentResponseId) {
882
+ const stream = this.speakerStreams.get(this.currentResponseId);
883
+ if (stream) {
884
+ stream.end();
885
+ }
886
+ this.speakerStreams.delete(this.currentResponseId);
887
+ this.currentResponseId = void 0;
888
+ }
889
+ this.hasSentContentEnd = false;
890
+ this.log("[Stream] Turn completion signaled, ready for next turn");
891
+ } else if (this.turnCompleted) {
892
+ this.log("[Stream] Stream ended and turn was already completed");
893
+ } else {
894
+ this.log(
895
+ `[Stream] Stream ended but turn not completed - audioContentStarted=${this.audioContentStarted}, turnCompleted=${this.turnCompleted}`
896
+ );
897
+ }
898
+ if (this.stream && this.state === "connected" && !this.processingStream && !this.streamRestartAttempted) {
899
+ this.log("[Stream] Stream still open but processing stopped - will restart stream processing");
900
+ this.streamRestartAttempted = true;
901
+ setImmediate(() => {
902
+ if (this.stream && this.state === "connected" && !this.processingStream) {
903
+ this.log("[Stream] Restarting stream processing for subsequent turns");
904
+ this.processStream().catch((error) => {
905
+ this.log("[Stream] Error restarting stream processing:", error);
906
+ this.streamRestartAttempted = false;
907
+ });
908
+ } else {
909
+ this.streamRestartAttempted = false;
910
+ }
911
+ });
912
+ } else {
913
+ if (this.streamRestartAttempted) {
914
+ this.log("[Stream] Stream restart already attempted, skipping");
915
+ }
916
+ }
917
+ }
918
+ }
919
+ /**
920
+ * Handle server events from AWS Bedrock
921
+ */
922
+ handleServerEvent(event) {
923
+ if (this.debug) {
924
+ this.log("Received event, keys:", Object.keys(event).join(", "));
925
+ }
926
+ if (event.contentStart) {
927
+ this.handleContentStart(event.contentStart);
928
+ }
929
+ if (event.textOutput) {
930
+ this.handleTextOutput(event.textOutput);
931
+ }
932
+ if (event.audioOutput?.content) {
933
+ this.handleAudioOutput(event.audioOutput);
934
+ }
935
+ if (event.toolUse) {
936
+ this.handleToolUse(event.toolUse);
937
+ }
938
+ if (event.contentEnd) {
939
+ this.handleContentEnd(event.contentEnd);
940
+ }
941
+ if (event.completionEnd) {
942
+ this.handleCompletionEnd(event.completionEnd);
943
+ }
944
+ if (event.error) {
945
+ this.emit("error", {
946
+ message: event.error.message || "Unknown error",
947
+ code: event.error.code || "UNKNOWN_ERROR",
948
+ details: event.error
949
+ });
950
+ }
951
+ }
952
+ /**
953
+ * Handle a contentStart event. Tracks generationStage for text content
954
+ * blocks so the corresponding 'writing' events can be tagged
955
+ * SPECULATIVE/FINAL for the client.
956
+ */
957
+ handleContentStart(contentStart) {
958
+ const role = contentStart.role?.toLowerCase();
959
+ const contentType = contentStart.type;
960
+ this.log(`[Event] contentStart: type=${contentType || "unknown"}, role=${role}`);
961
+ this.emit("contentStart", contentStart);
962
+ if (contentType === "TEXT" && contentStart.additionalModelFields) {
963
+ try {
964
+ const additionalFields = JSON.parse(contentStart.additionalModelFields);
965
+ this.currentTextGenerationStage = additionalFields.generationStage;
966
+ this.log(`[Event] Text content generationStage: ${this.currentTextGenerationStage}`);
967
+ } catch {
968
+ this.currentTextGenerationStage = void 0;
969
+ }
970
+ } else if (contentType === "TEXT") {
971
+ this.currentTextGenerationStage = void 0;
972
+ }
973
+ }
974
+ /**
975
+ * Handle a textOutput event. Detects interruption (barge-in) markers in
976
+ * the payload, otherwise emits a 'writing' event with the text and
977
+ * current generationStage.
978
+ */
979
+ handleTextOutput(textOutput) {
980
+ const text = textOutput.content || "";
981
+ const role = textOutput.role?.toLowerCase() || "assistant";
982
+ this.log(`[Event] textOutput received: role=${role}, text length=${text.length}`);
983
+ let isInterrupted = false;
984
+ try {
985
+ const parsed = JSON.parse(text);
986
+ if (parsed && parsed.interrupted === true) {
987
+ isInterrupted = true;
988
+ }
989
+ } catch {
990
+ if (/interrupted/i.test(text)) {
991
+ isInterrupted = true;
992
+ }
993
+ }
994
+ if (isInterrupted) {
995
+ this.log(`[Event] Interrupt detected, emitting interrupt event`);
996
+ this.emit("interrupt", { type: "user", timestamp: Date.now() });
997
+ return;
998
+ }
999
+ const generationStage = this.currentTextGenerationStage;
1000
+ this.log(`[Event] Emitting 'writing': role=${role}, generationStage=${generationStage}, length=${text.length}`);
1001
+ this.emit("writing", { text, role, generationStage });
1002
+ }
1003
+ /**
1004
+ * Handle an audioOutput event. Decodes the base64 LPCM payload, emits
1005
+ * 'speaking' with both the base64 string and an Int16Array view, and
1006
+ * forwards bytes to any active speaker stream.
1007
+ */
1008
+ handleAudioOutput(audioOutput) {
1009
+ try {
1010
+ const content = audioOutput.content;
1011
+ const audioBytes = Buffer.from(content, "base64");
1012
+ this.log(`[Event] Audio output: ${audioBytes.length} bytes`);
1013
+ this.isReceivingAssistantAudio = true;
1014
+ const audioData = new Int16Array(audioBytes.buffer, audioBytes.byteOffset, audioBytes.byteLength / 2);
1015
+ this.emit("speaking", {
1016
+ audio: content,
1017
+ audioData,
1018
+ response_id: this.currentResponseId
1019
+ });
1020
+ if (this.currentResponseId) {
1021
+ const stream = this.speakerStreams.get(this.currentResponseId);
1022
+ if (stream) {
1023
+ stream.write(audioBytes);
1024
+ }
1025
+ }
1026
+ } catch (error) {
1027
+ this.log("[Event] Error decoding audio:", error);
1028
+ this.emit("error", {
1029
+ message: "Failed to decode audio",
1030
+ code: "AUDIO_DECODE_ERROR",
1031
+ details: error
1032
+ });
1033
+ }
1034
+ }
1035
+ /**
1036
+ * Handle a toolUse event. Emits 'toolCall' and dispatches to the
1037
+ * configured tool's execute() function via handleToolCall().
1038
+ */
1039
+ handleToolUse(toolUse) {
1040
+ const toolUseId = toolUse.toolUseId || "";
1041
+ const toolName = toolUse.toolName || "";
1042
+ const toolInput = toolUse.input || {};
1043
+ this.emit("toolCall", {
1044
+ name: toolName,
1045
+ args: toolInput,
1046
+ id: toolUseId
1047
+ });
1048
+ if (this.tools && toolName in this.tools) {
1049
+ void this.handleToolCall(toolName, toolInput, toolUseId);
1050
+ }
1051
+ }
1052
+ /**
1053
+ * Handle a contentEnd event. Forwards it to clients, then routes by
1054
+ * stopReason / type:
1055
+ * - INTERRUPTED: emit 'interrupt' and tear down the active speaker stream
1056
+ * - TOOL: end the active speaker stream
1057
+ * - AUDIO with END_TURN: signal turnComplete (assistant audio finished)
1058
+ * - AUDIO with PARTIAL_TURN while receiving assistant audio: schedule
1059
+ * fallback turnComplete in case completionEnd never arrives
1060
+ * - AUDIO otherwise: user input ended, reset turn flags
1061
+ */
1062
+ handleContentEnd(contentEnd) {
1063
+ this.log(`[Event] contentEnd received: type=${contentEnd.type}, stopReason=${contentEnd.stopReason}`);
1064
+ this.emit("contentEnd", contentEnd);
1065
+ if (contentEnd.stopReason === "INTERRUPTED") {
1066
+ this.log("[Event] Content interrupted by user (barge-in)");
1067
+ this.emit("interrupt", { type: "user", timestamp: Date.now() });
1068
+ if (this.currentResponseId) {
1069
+ const stream = this.speakerStreams.get(this.currentResponseId);
1070
+ if (stream) {
1071
+ stream.destroy();
1072
+ }
1073
+ this.speakerStreams.delete(this.currentResponseId);
1074
+ }
1075
+ this.currentResponseId = void 0;
1076
+ this.log("[Event] After interruption, keeping audioContentStarted=true for continued streaming");
1077
+ } else if (contentEnd.type === "TOOL" && this.currentResponseId) {
1078
+ const stream = this.speakerStreams.get(this.currentResponseId);
1079
+ if (stream) {
1080
+ stream.end();
1081
+ }
1082
+ } else if (contentEnd.type === "AUDIO") {
1083
+ if (contentEnd.stopReason === "END_TURN") {
1084
+ this.log(`[Event] contentEnd (AUDIO) with stopReason END_TURN - signaling turn complete`);
1085
+ if (this.currentResponseId) {
1086
+ const stream = this.speakerStreams.get(this.currentResponseId);
1087
+ if (stream) {
1088
+ stream.end();
1089
+ }
1090
+ this.speakerStreams.delete(this.currentResponseId);
1091
+ this.currentResponseId = void 0;
1092
+ }
1093
+ if (!this.turnCompleted) {
1094
+ this.turnCompleted = true;
1095
+ this.emit("turnComplete", { timestamp: Date.now() });
1096
+ this.hasSentContentEnd = false;
1097
+ this.log(
1098
+ `[Event] Turn complete (from contentEnd AUDIO with END_TURN), ready for next turn. audioContentStarted: ${this.audioContentStarted}, audioContentName: ${this.audioContentName}`
1099
+ );
1100
+ } else {
1101
+ this.log(
1102
+ `[Event] contentEnd (AUDIO) with END_TURN received but turn already completed - skipping duplicate turnComplete emission`
1103
+ );
1104
+ }
1105
+ if (!this.turnCompleteTimeout) {
1106
+ this.turnCompleteTimeout = setTimeout(() => {
1107
+ this.log(`[Event] Timeout: completionEnd not received, but turn already completed from contentEnd`);
1108
+ this.turnCompleteTimeout = void 0;
1109
+ }, 1e3);
1110
+ }
1111
+ } else {
1112
+ if (this.isReceivingAssistantAudio && contentEnd.stopReason === "PARTIAL_TURN") {
1113
+ this.isReceivingAssistantAudio = false;
1114
+ if (!this.turnCompleteTimeout && !this.turnCompleted) {
1115
+ this.log(
1116
+ `[Event] contentEnd (AUDIO) with PARTIAL_TURN for assistant output - waiting for completionEnd, setting fallback timeout`
1117
+ );
1118
+ this.turnCompleteTimeout = setTimeout(() => {
1119
+ if (!this.turnCompleted) {
1120
+ this.log(
1121
+ `[Event] Fallback: completionEnd not received after contentEnd (AUDIO) with PARTIAL_TURN, signaling turn complete`
1122
+ );
1123
+ this.turnCompleted = true;
1124
+ this.emit("turnComplete", { timestamp: Date.now() });
1125
+ if (this.currentResponseId) {
1126
+ const stream = this.speakerStreams.get(this.currentResponseId);
1127
+ if (stream) {
1128
+ stream.end();
1129
+ }
1130
+ this.speakerStreams.delete(this.currentResponseId);
1131
+ this.currentResponseId = void 0;
1132
+ }
1133
+ this.hasSentContentEnd = false;
1134
+ this.turnCompleteTimeout = void 0;
1135
+ }
1136
+ }, 2e3);
1137
+ }
1138
+ } else {
1139
+ this.hasSentContentEnd = false;
1140
+ this.turnCompleted = false;
1141
+ this.log(
1142
+ `[Event] contentEnd (AUDIO) - user input ended, stopReason: ${contentEnd.stopReason}. Keeping audioContentStarted=true for next turn. Reset hasSentContentEnd=false, turnCompleted=false.`
1143
+ );
1144
+ }
1145
+ }
1146
+ } else if (contentEnd.type === "TEXT") {
1147
+ this.currentTextGenerationStage = void 0;
1148
+ this.log(
1149
+ `[Event] contentEnd (TEXT) received, stopReason: ${contentEnd.stopReason}. Turn completion handled by completionEnd/contentEnd(AUDIO).`
1150
+ );
1151
+ if (contentEnd.stopReason === "END_TURN") {
1152
+ this.hasSentContentEnd = false;
1153
+ }
1154
+ }
1155
+ }
1156
+ /**
1157
+ * Handle a completionEnd event. AWS uses this as the definitive signal
1158
+ * that a turn (and all audio output) has finished. Tears down the active
1159
+ * speaker stream, clears any fallback timer, emits 'turnComplete' once,
1160
+ * and forwards token usage if reported.
1161
+ */
1162
+ handleCompletionEnd(completionEnd) {
1163
+ this.log(`[Event] completionEnd received, stopReason: ${completionEnd.stopReason}`);
1164
+ if (this.turnCompleteTimeout) {
1165
+ clearTimeout(this.turnCompleteTimeout);
1166
+ this.turnCompleteTimeout = void 0;
1167
+ }
1168
+ if (this.currentResponseId) {
1169
+ const stream = this.speakerStreams.get(this.currentResponseId);
1170
+ if (stream) {
1171
+ stream.end();
1172
+ }
1173
+ this.speakerStreams.delete(this.currentResponseId);
1174
+ this.currentResponseId = void 0;
1175
+ }
1176
+ this.isReceivingAssistantAudio = false;
1177
+ if (!this.turnCompleted) {
1178
+ this.log(
1179
+ `[Event] completionEnd - signaling turn complete (stopReason: ${completionEnd.stopReason || "undefined"})`
1180
+ );
1181
+ this.turnCompleted = true;
1182
+ this.emit("turnComplete", { timestamp: Date.now() });
1183
+ this.hasSentContentEnd = false;
1184
+ } else {
1185
+ this.log(`[Event] completionEnd received but turn already completed - skipping duplicate turnComplete emission`);
1186
+ }
1187
+ if (completionEnd.usage) {
1188
+ this.emit("usage", {
1189
+ inputTokens: completionEnd.usage.inputTokens || 0,
1190
+ outputTokens: completionEnd.usage.outputTokens || 0,
1191
+ totalTokens: (completionEnd.usage.inputTokens || 0) + (completionEnd.usage.outputTokens || 0)
1192
+ });
1193
+ }
1194
+ }
1195
+ /**
1196
+ * Handle tool execution
1197
+ */
1198
+ async handleToolCall(toolName, args, toolUseId) {
1199
+ const tool = this.tools?.[toolName];
1200
+ if (!tool || !tool.execute) {
1201
+ this.emit("error", {
1202
+ message: `Tool ${toolName} not found or has no execute function`,
1203
+ code: "TOOL_NOT_FOUND"
1204
+ });
1205
+ return;
1206
+ }
1207
+ try {
1208
+ const result = await tool.execute(
1209
+ { context: args, requestContext: this.requestContext },
1210
+ {
1211
+ toolCallId: toolUseId,
1212
+ messages: []
1213
+ }
1214
+ );
1215
+ await this.sendClientEvent({
1216
+ toolResult: {
1217
+ toolUseId,
1218
+ content: [
1219
+ {
1220
+ json: typeof result === "object" ? result : { result }
1221
+ }
1222
+ ]
1223
+ }
1224
+ });
1225
+ } catch (error) {
1226
+ this.emit("error", {
1227
+ message: `Error executing tool ${toolName}: ${error instanceof Error ? error.message : "Unknown error"}`,
1228
+ code: "TOOL_EXECUTION_ERROR",
1229
+ details: error
1230
+ });
1231
+ await this.sendClientEvent({
1232
+ toolResult: {
1233
+ toolUseId,
1234
+ content: [
1235
+ {
1236
+ text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
1237
+ }
1238
+ ]
1239
+ }
1240
+ });
1241
+ }
1242
+ }
1243
+ /**
1244
+ * Send a client event to AWS Bedrock
1245
+ * Events are sent through the input stream that was passed to the bidirectional stream command
1246
+ */
1247
+ async sendClientEvent(event) {
1248
+ if (this.state !== "connected") {
1249
+ throw new NovaSonicError("not_connected" /* NOT_CONNECTED */, "Not connected to AWS Bedrock. Call connect() first.");
1250
+ }
1251
+ try {
1252
+ const eventQueue = this._eventQueue;
1253
+ const signalQueue = this._signalQueue;
1254
+ if (!eventQueue || !signalQueue) {
1255
+ throw new NovaSonicError(
1256
+ "not_connected" /* NOT_CONNECTED */,
1257
+ "Event queue not initialized. Connection may not be fully established."
1258
+ );
1259
+ }
1260
+ this.log(`[sendClientEvent] Adding event to queue (queue size: ${eventQueue.length})`);
1261
+ eventQueue.push({ event });
1262
+ this.log(`[sendClientEvent] Event added, queue size now: ${eventQueue.length}, signaling...`);
1263
+ signalQueue();
1264
+ this.log(`[sendClientEvent] Signal sent`);
1265
+ if (this.debug) {
1266
+ this.log("Sent client event, keys:", Object.keys(event).join(", "));
1267
+ }
1268
+ } catch (error) {
1269
+ throw new NovaSonicError(
1270
+ "websocket_error" /* WEBSOCKET_ERROR */,
1271
+ `Failed to send client event: ${error instanceof Error ? error.message : "Unknown error"}`,
1272
+ error
1273
+ );
1274
+ }
1275
+ }
1276
+ /**
1277
+ * Disconnects from the AWS Bedrock session and cleans up resources.
1278
+ *
1279
+ * Pushes a `sessionEnd` event to the queue before signalling close,
1280
+ * then schedules client destruction on the next tick so the async
1281
+ * iterator has a chance to yield the event to the SDK.
1282
+ */
1283
+ close() {
1284
+ if (this.state === "disconnected") {
1285
+ return;
1286
+ }
1287
+ this.state = "disconnected";
1288
+ this.processingStream = false;
1289
+ if (this.turnCompleteTimeout) {
1290
+ clearTimeout(this.turnCompleteTimeout);
1291
+ this.turnCompleteTimeout = void 0;
1292
+ }
1293
+ const eventQueue = this._eventQueue;
1294
+ const signalQueue = this._signalQueue;
1295
+ if (eventQueue && signalQueue) {
1296
+ eventQueue.push({ event: { sessionEnd: {} } });
1297
+ signalQueue();
1298
+ }
1299
+ const closeSignal = this._closeSignal;
1300
+ if (closeSignal) {
1301
+ closeSignal();
1302
+ }
1303
+ if (this.inputStream) {
1304
+ this.inputStream.end();
1305
+ this.inputStream = void 0;
1306
+ }
1307
+ for (const stream of this.speakerStreams.values()) {
1308
+ stream.end();
1309
+ }
1310
+ this.speakerStreams.clear();
1311
+ const client = this.client;
1312
+ this.client = void 0;
1313
+ this.stream = void 0;
1314
+ if (client) {
1315
+ setImmediate(() => {
1316
+ if (typeof client.destroy === "function") {
1317
+ client.destroy();
1318
+ }
1319
+ });
1320
+ }
1321
+ this.log("Disconnected from AWS Bedrock Nova 2 Sonic");
1322
+ }
1323
+ /**
1324
+ * Equips the voice instance with a set of instructions.
1325
+ */
1326
+ addInstructions(instructions) {
1327
+ this.instructions = instructions;
1328
+ }
1329
+ /**
1330
+ * Equips the voice instance with a set of tools.
1331
+ */
1332
+ addTools(tools) {
1333
+ this.tools = tools || {};
1334
+ }
1335
+ /**
1336
+ * Convert text to speech
1337
+ */
1338
+ async speak(input, _options) {
1339
+ if (this.state !== "connected") {
1340
+ throw new NovaSonicError("not_connected" /* NOT_CONNECTED */, "Not connected. Call connect() first.");
1341
+ }
1342
+ let text = "";
1343
+ if (typeof input !== "string") {
1344
+ const chunks = [];
1345
+ for await (const chunk of input) {
1346
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
1347
+ }
1348
+ text = Buffer.concat(chunks).toString("utf-8");
1349
+ } else {
1350
+ text = input;
1351
+ }
1352
+ if (text.trim().length === 0) {
1353
+ throw new NovaSonicError("validation_error" /* VALIDATION_ERROR */, "Input text is empty");
1354
+ }
1355
+ this.currentResponseId = `response-${Date.now()}`;
1356
+ const speakerStream = new stream.PassThrough();
1357
+ speakerStream.id = this.currentResponseId;
1358
+ this.speakerStreams.set(this.currentResponseId, speakerStream);
1359
+ this.emit("speaker", speakerStream);
1360
+ const promptName = this._promptName;
1361
+ if (!promptName) {
1362
+ throw new NovaSonicError(
1363
+ "not_connected" /* NOT_CONNECTED */,
1364
+ "Prompt name not initialized. Connection may not be fully established."
1365
+ );
1366
+ }
1367
+ if (!this.promptStarted) {
1368
+ throw new NovaSonicError(
1369
+ "invalid_state" /* INVALID_STATE */,
1370
+ "Prompt not started. This should not happen - prompt should be started during connection."
1371
+ );
1372
+ }
1373
+ const contentName = crypto.randomUUID();
1374
+ await this.sendClientEvent({
1375
+ contentStart: {
1376
+ promptName,
1377
+ contentName,
1378
+ type: "TEXT",
1379
+ interactive: true,
1380
+ role: "USER",
1381
+ textInputConfiguration: {
1382
+ mediaType: "text/plain"
1383
+ }
1384
+ }
1385
+ });
1386
+ await this.sendClientEvent({
1387
+ textInput: {
1388
+ promptName,
1389
+ contentName,
1390
+ content: text
1391
+ }
1392
+ });
1393
+ await this.sendClientEvent({
1394
+ contentEnd: {
1395
+ promptName,
1396
+ contentName
1397
+ }
1398
+ });
1399
+ }
1400
+ /**
1401
+ * Convert speech to text (transcription)
1402
+ * For Nova Sonic, this is the same as send() - both stream audio input
1403
+ */
1404
+ async listen(audioStream, _options) {
1405
+ if (audioStream && typeof audioStream === "object" && "read" in audioStream) {
1406
+ await this.send(audioStream);
1407
+ } else {
1408
+ throw new NovaSonicError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Unsupported audio stream format for listen()");
1409
+ }
1410
+ }
1411
+ /**
1412
+ * Streams audio data in real-time to the AWS Bedrock service.
1413
+ * Following AWS Nova 2 Sonic event sequence:
1414
+ * 1. contentStart (AUDIO, USER) - if not already sent
1415
+ * 2. audioInput events (one per chunk)
1416
+ * 3. contentEnd - when audio stream ends (handled separately via endAudioInput)
1417
+ */
1418
+ async send(audioData) {
1419
+ this.log(`[send] Current state: ${this.state}`);
1420
+ if (this.state !== "connected") {
1421
+ this.log(`[send] ERROR: State is '${this.state}', expected 'connected'`);
1422
+ throw new NovaSonicError(
1423
+ "not_connected" /* NOT_CONNECTED */,
1424
+ `Not connected. Current state: ${this.state}. Call connect() first.`
1425
+ );
1426
+ }
1427
+ this.log(`[send] State check passed, proceeding with send`);
1428
+ if (!(audioData instanceof Int16Array) && !(audioData && typeof audioData === "object" && "read" in audioData)) {
1429
+ throw new NovaSonicError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Unsupported audio data format");
1430
+ }
1431
+ if (this.turnCompleted || this.hasSentContentEnd) {
1432
+ this.log(
1433
+ `[send] Starting new turn - resetting flags. turnCompleted=${this.turnCompleted}, hasSentContentEnd=${this.hasSentContentEnd}.`
1434
+ );
1435
+ const needNewContent = this.hasSentContentEnd;
1436
+ this.turnCompleted = false;
1437
+ this.hasSentContentEnd = false;
1438
+ this.streamRestartAttempted = false;
1439
+ if (needNewContent) {
1440
+ this.audioContentStarted = false;
1441
+ this.log(`[send] contentEnd was previously sent - will create new audio content container`);
1442
+ }
1443
+ this.log(
1444
+ `[send] State reset: turnCompleted=false, hasSentContentEnd=false, audioContentStarted=${this.audioContentStarted}`
1445
+ );
1446
+ }
1447
+ if (!this.promptStarted) {
1448
+ this.promptStarted = true;
1449
+ }
1450
+ const promptName = this._promptName;
1451
+ if (!promptName) {
1452
+ throw new NovaSonicError(
1453
+ "not_connected" /* NOT_CONNECTED */,
1454
+ "Prompt name not initialized. Connection may not be fully established."
1455
+ );
1456
+ }
1457
+ if (!this.audioContentStarted) {
1458
+ const audioContentId = crypto.randomUUID();
1459
+ this.audioContentName = audioContentId;
1460
+ this.log(`[send] First audio send - sending AUDIO contentStart with contentName: ${audioContentId}`);
1461
+ await this.sendClientEvent({
1462
+ contentStart: {
1463
+ promptName,
1464
+ contentName: audioContentId,
1465
+ type: "AUDIO",
1466
+ interactive: true,
1467
+ role: "USER",
1468
+ audioInputConfiguration: {
1469
+ mediaType: "audio/lpcm",
1470
+ sampleRateHertz: 16e3,
1471
+ sampleSizeBits: 16,
1472
+ channelCount: 1,
1473
+ encoding: "base64",
1474
+ audioType: "SPEECH"
1475
+ }
1476
+ }
1477
+ });
1478
+ this.audioContentStarted = true;
1479
+ this.log(`[send] AUDIO contentStart sent, ready to stream audio`);
1480
+ } else {
1481
+ this.log(`[send] AUDIO contentStart already sent, sending audioInput chunks directly`);
1482
+ }
1483
+ if (!this.audioContentName) {
1484
+ throw new NovaSonicError("invalid_state" /* INVALID_STATE */, "Audio content name not initialized. This should not happen.");
1485
+ }
1486
+ const contentName = this.audioContentName;
1487
+ if (audioData instanceof Int16Array) {
1488
+ const buffer = Buffer.from(audioData.buffer, audioData.byteOffset, audioData.byteLength);
1489
+ const base64Audio = buffer.toString("base64");
1490
+ this.log(
1491
+ `[send] Sending audioInput chunk, size: ${buffer.length} bytes, contentName: ${contentName}, turnCompleted: ${this.turnCompleted}, hasSentContentEnd: ${this.hasSentContentEnd}, audioContentStarted: ${this.audioContentStarted}, state: ${this.state}`
1492
+ );
1493
+ if (this.state !== "connected") {
1494
+ this.log(`[send] ERROR: State changed to '${this.state}' during send!`);
1495
+ throw new NovaSonicError("not_connected" /* NOT_CONNECTED */, `Connection lost during send. State: ${this.state}`);
1496
+ }
1497
+ await this.sendClientEvent({
1498
+ audioInput: {
1499
+ promptName,
1500
+ contentName,
1501
+ content: base64Audio
1502
+ }
1503
+ });
1504
+ this.log(`[send] audioInput chunk sent successfully`);
1505
+ } else if (audioData && typeof audioData === "object" && "read" in audioData) {
1506
+ const stream = audioData;
1507
+ for await (const chunk of stream) {
1508
+ const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
1509
+ const base64Audio = buffer.toString("base64");
1510
+ this.log(
1511
+ `[send] Sending audioInput chunk from stream, size: ${buffer.length} bytes, contentName: ${contentName}, turnCompleted: ${this.turnCompleted}, hasSentContentEnd: ${this.hasSentContentEnd}`
1512
+ );
1513
+ await this.sendClientEvent({
1514
+ audioInput: {
1515
+ promptName,
1516
+ contentName,
1517
+ content: base64Audio
1518
+ }
1519
+ });
1520
+ }
1521
+ } else {
1522
+ throw new NovaSonicError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Unsupported audio data format");
1523
+ }
1524
+ }
1525
+ /**
1526
+ * End audio input stream (sends contentEnd for audio)
1527
+ * Call this when done sending audio chunks
1528
+ */
1529
+ async endAudioInput() {
1530
+ if (this.hasSentContentEnd) {
1531
+ this.log("[endAudioInput] contentEnd already sent for this turn, skipping");
1532
+ return;
1533
+ }
1534
+ if (this.turnCompleted) {
1535
+ this.log(
1536
+ "[endAudioInput] Turn already completed by AWS, skipping contentEnd. Resetting turnCompleted flag for next turn."
1537
+ );
1538
+ this.turnCompleted = false;
1539
+ this.hasSentContentEnd = false;
1540
+ return;
1541
+ }
1542
+ if (this.audioContentStarted && this.audioContentName && this._promptName) {
1543
+ const promptName = this._promptName;
1544
+ this.log("[endAudioInput] Sending contentEnd for audio input");
1545
+ await this.sendClientEvent({
1546
+ contentEnd: {
1547
+ promptName,
1548
+ contentName: this.audioContentName
1549
+ }
1550
+ });
1551
+ this.hasSentContentEnd = true;
1552
+ } else {
1553
+ this.log(
1554
+ "[endAudioInput] Cannot send contentEnd: audioContentStarted=" + this.audioContentStarted + ", audioContentName=" + this.audioContentName
1555
+ );
1556
+ }
1557
+ }
1558
+ /**
1559
+ * Register an event listener
1560
+ */
1561
+ on(event, callback) {
1562
+ if (!this.events[event]) {
1563
+ this.events[event] = [];
1564
+ }
1565
+ this.events[event].push(callback);
1566
+ }
1567
+ /**
1568
+ * Remove an event listener
1569
+ */
1570
+ off(event, callback) {
1571
+ if (!this.events[event]) {
1572
+ return;
1573
+ }
1574
+ const index = this.events[event].indexOf(callback);
1575
+ if (index !== -1) {
1576
+ this.events[event].splice(index, 1);
1577
+ }
1578
+ }
1579
+ /**
1580
+ * Emit an event with arguments
1581
+ */
1582
+ emit(event, data) {
1583
+ if (!this.events[event]) {
1584
+ this.log(`[NovaSonic] emit('${event}'): No listeners registered for this event`);
1585
+ return;
1586
+ }
1587
+ const listenerCount = this.events[event].length;
1588
+ this.log(`[NovaSonic] emit('${event}'): Calling ${listenerCount} listener(s)`);
1589
+ for (const callback of this.events[event]) {
1590
+ try {
1591
+ callback(data);
1592
+ this.log(`[NovaSonic] emit('${event}'): Successfully called one listener`);
1593
+ } catch (error) {
1594
+ this.log(`Error in event handler for ${event}:`, error);
1595
+ }
1596
+ }
1597
+ this.log(`[NovaSonic] emit('${event}'): Finished calling all ${listenerCount} listener(s)`);
1598
+ }
1599
+ /**
1600
+ * Get listener status
1601
+ */
1602
+ async getListener() {
1603
+ return { enabled: this.state === "connected" };
1604
+ }
1605
+ /**
1606
+ * Log helper
1607
+ */
1608
+ log(...args) {
1609
+ if (this.debug) {
1610
+ console.info("[NovaSonicVoice]", ...args);
1611
+ }
1612
+ }
1613
+ };
1614
+
1615
+ exports.NovaSonicError = NovaSonicError; // error class constructed with an error code and message by the voice client
1616
+ exports.NovaSonicErrorCode = NovaSonicErrorCode; // string-valued error-code enum declared near the top of the bundle
1617
+ exports.NovaSonicVoice = NovaSonicVoice; // voice client class (presumably the class closed just above; its header is outside this chunk)
1618
+ //# sourceMappingURL=index.cjs.map
1619
+ //# sourceMappingURL=index.cjs.map