@mastra/voice-aws-nova-sonic 0.0.0-studio-cli-20260504022012

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1615 @@
1
+ import { randomUUID } from 'crypto';
2
+ import { PassThrough } from 'stream';
3
+ import { BedrockRuntimeClient, InvokeModelWithBidirectionalStreamCommand } from '@aws-sdk/client-bedrock-runtime';
4
+ import { MastraVoice } from '@mastra/core/voice';
5
+ import { NodeHttp2Handler } from '@smithy/node-http-handler';
6
+ import { defaultProvider } from '@aws-sdk/credential-provider-node';
7
+
8
+ // src/index.ts
9
+
10
+ // src/types.ts
11
/**
 * Machine-readable error code strings used by this package.
 *
 * The values are the lowercase snake_case form of each key. Code elsewhere in
 * this bundle passes the inlined string values (annotated with the key name in
 * a trailing comment) as the `code` argument of NovaSonicError.
 */
var NovaSonicErrorCode = {
  // Connection and authentication
  CONNECTION_FAILED: "connection_failed",
  CONNECTION_NOT_ESTABLISHED: "connection_not_established",
  AUTHENTICATION_FAILED: "authentication_failed",
  CREDENTIALS_MISSING: "credentials_missing",
  REGION_INVALID: "region_invalid",
  WEBSOCKET_ERROR: "websocket_error",
  // Audio handling
  AUDIO_PROCESSING_ERROR: "audio_processing_error",
  AUDIO_STREAM_ERROR: "audio_stream_error",
  SPEAKER_STREAM_ERROR: "speaker_stream_error",
  // Transcription
  TRANSCRIPTION_TIMEOUT: "transcription_timeout",
  TRANSCRIPTION_FAILED: "transcription_failed",
  // Tools
  TOOL_EXECUTION_ERROR: "tool_execution_error",
  TOOL_NOT_FOUND: "tool_not_found",
  // Session / state / validation
  SESSION_CONFIG_UPDATE_FAILED: "session_config_update_failed",
  INVALID_AUDIO_FORMAT: "invalid_audio_format",
  NOT_CONNECTED: "not_connected",
  INVALID_STATE: "invalid_state",
  VALIDATION_ERROR: "validation_error",
  UNKNOWN_ERROR: "unknown_error"
};
33
+
34
+ // src/utils/errors.ts
35
/**
 * Error type thrown by this package.
 *
 * Carries a machine-readable `code`, an optional `details` payload (callers in
 * this file pass the underlying error) and the creation time in epoch
 * milliseconds.
 */
var NovaSonicError = class extends Error {
  code;
  details;
  timestamp;
  /**
   * @param code - Error code string identifying the failure category
   * @param message - Human-readable description, forwarded to Error
   * @param details - Optional extra context, stored as-is
   */
  constructor(code, message, details) {
    super(message);
    this.name = "NovaSonicError";
    this.code = code;
    this.details = details;
    // Captured once at construction so every consumer sees the same instant.
    this.timestamp = Date.now();
  }
  /**
   * Flattens the error into a plain object with the keys
   * `message`, `code`, `details` and `timestamp` (in that order).
   */
  toEventData() {
    const { message, code, details, timestamp } = this;
    return { message, code, details, timestamp };
  }
};
55
+
56
+ // src/utils/auth.ts
57
/**
 * Resolves the AWS credentials to use for a connection.
 *
 * Explicit credentials win and are returned untouched; otherwise the AWS SDK
 * default provider chain is invoked.
 *
 * @param explicitCredentials - Credentials supplied in the voice config, if any
 * @param debug - When truthy, progress messages are written via console.info
 * @returns The explicit credentials, or whatever the default provider resolves to
 * @throws {NovaSonicError} With code "authentication_failed" when the default
 *   provider chain rejects (an existing NovaSonicError is rethrown unchanged)
 */
async function getAwsCredentials(explicitCredentials, debug) {
  const trace = (message) => {
    if (debug) {
      console.info(message);
    }
  };
  if (explicitCredentials) {
    trace("[getAwsCredentials] Using explicit credentials provided in config");
    return explicitCredentials;
  }
  try {
    trace("[getAwsCredentials] Using default credential provider chain");
    const resolved = await defaultProvider()();
    trace("[getAwsCredentials] Credentials retrieved successfully");
    return resolved;
  } catch (error) {
    if (error instanceof NovaSonicError) {
      throw error;
    }
    const reason = error instanceof Error ? error.message : "Unknown error";
    throw new NovaSonicError(
      "authentication_failed" /* AUTHENTICATION_FAILED */,
      `Failed to load AWS credentials: ${reason}`,
      error
    );
  }
}
84
+
85
+ // src/index.ts
86
// Fallbacks applied by the NovaSonicVoice constructor when the config omits
// `model` / `region`.
var DEFAULT_MODEL = "amazon.nova-2-sonic-v1:0",
  DEFAULT_REGION = "us-east-1";
88
+ var NovaSonicVoice = class extends MastraVoice {
89
+ client;
90
+ stream;
91
+ inputStream;
92
+ // Input stream for sending events to AWS
93
+ _eventQueue;
94
+ _signalQueue;
95
+ _closeSignal;
96
+ _promptName;
97
+ state = "disconnected";
98
+ events;
99
+ instructions;
100
+ tools;
101
+ requestContext;
102
+ debug;
103
+ region;
104
+ model;
105
+ credentials;
106
+ speakerStreams;
107
+ currentResponseId;
108
+ processingStream = false;
109
+ streamRestartAttempted = false;
110
+ // Prevent multiple restart attempts
111
+ sessionConfig;
112
+ promptStarted = false;
113
+ // Track if promptStart was sent (now sent during connection)
114
+ audioContentName;
115
+ audioContentStarted = false;
116
+ hasSentContentEnd = false;
117
+ // Track if contentEnd has been sent for current turn
118
+ turnCompleted = false;
119
+ // Track if turn has been completed (to prevent sending contentEnd after turn completion)
120
+ turnCompleteTimeout;
121
+ // Timeout for fallback turn completion
122
+ isReceivingAssistantAudio = false;
123
+ // Track if we're currently receiving assistant audio output
124
+ currentTextGenerationStage;
125
+ // Track generationStage (SPECULATIVE|FINAL) for current text content block
126
+ /**
127
+ * Creates a new instance of NovaSonicVoice.
128
+ *
129
+ * @param config - Configuration options for the voice instance
130
+ * @param config.region - AWS region (defaults to us-east-1)
131
+ * @param config.model - The model ID to use (defaults to amazon.nova-2-sonic-v1:0)
132
+ * @param config.credentials - AWS credentials (optional, uses default credential chain)
133
+ * @param config.speaker - Voice name/identifier
134
+ * @param config.languageCode - Language code for the voice
135
+ * @param config.debug - Enable debug mode
136
+ *
137
+ * @example
138
+ * ```typescript
139
+ * const voice = new NovaSonicVoice({
140
+ * region: 'us-east-1',
141
+ * model: 'amazon.nova-2-sonic-v1:0',
142
+ * speaker: 'default',
143
+ * });
144
+ * ```
145
+ */
146
+ constructor(config = {}) {
147
+ let normalizedConfig;
148
+ if ("realtimeConfig" in config || "speechModel" in config || "listeningModel" in config) {
149
+ normalizedConfig = config;
150
+ } else {
151
+ const configOptions = config;
152
+ normalizedConfig = {
153
+ realtimeConfig: {
154
+ model: configOptions.model || DEFAULT_MODEL,
155
+ apiKey: void 0,
156
+ // AWS doesn't use API keys
157
+ options: configOptions
158
+ },
159
+ speaker: typeof configOptions.speaker === "string" ? configOptions.speaker : "matthew"
160
+ };
161
+ }
162
+ super(normalizedConfig);
163
+ const options = normalizedConfig.realtimeConfig?.options || config;
164
+ this.region = options.region || DEFAULT_REGION;
165
+ this.model = options.model || DEFAULT_MODEL;
166
+ this.credentials = options.credentials;
167
+ this.debug = options.debug || false;
168
+ this.sessionConfig = options.sessionConfig;
169
+ this.events = {};
170
+ this.speakerStreams = /* @__PURE__ */ new Map();
171
+ const validRegions = ["us-east-1", "us-west-2", "ap-northeast-1"];
172
+ if (!validRegions.includes(this.region)) {
173
+ throw new NovaSonicError(
174
+ "region_invalid" /* REGION_INVALID */,
175
+ `Invalid region: ${this.region}. Supported regions: ${validRegions.join(", ")}`
176
+ );
177
+ }
178
+ }
179
+ /**
180
+ * Returns a list of available voice speakers.
181
+ *
182
+ * Nova 2 Sonic provides expressive voices across multiple languages.
183
+ * Tiffany (en-US, feminine) and Matthew (en-US, masculine) are polyglot
184
+ * voices that can speak all supported languages.
185
+ *
186
+ * @returns Promise resolving to an array of voice objects
187
+ */
188
+ async getSpeakers() {
189
+ return Promise.resolve([
190
+ // English (US) - Polyglot voices
191
+ { voiceId: "tiffany", name: "Tiffany", language: "English", locale: "en-US", gender: "feminine", polyglot: true },
192
+ {
193
+ voiceId: "matthew",
194
+ name: "Matthew",
195
+ language: "English",
196
+ locale: "en-US",
197
+ gender: "masculine",
198
+ polyglot: true
199
+ },
200
+ // English (UK)
201
+ { voiceId: "amy", name: "Amy", language: "English", locale: "en-GB", gender: "feminine", polyglot: false },
202
+ // English (Australia)
203
+ { voiceId: "olivia", name: "Olivia", language: "English", locale: "en-AU", gender: "feminine", polyglot: false },
204
+ // English (Indian)
205
+ { voiceId: "kiara", name: "Kiara", language: "English", locale: "en-IN", gender: "feminine", polyglot: false },
206
+ { voiceId: "arjun", name: "Arjun", language: "English", locale: "en-IN", gender: "masculine", polyglot: false },
207
+ // French
208
+ { voiceId: "ambre", name: "Ambre", language: "French", locale: "fr-FR", gender: "feminine", polyglot: false },
209
+ {
210
+ voiceId: "florian",
211
+ name: "Florian",
212
+ language: "French",
213
+ locale: "fr-FR",
214
+ gender: "masculine",
215
+ polyglot: false
216
+ },
217
+ // Italian
218
+ {
219
+ voiceId: "beatrice",
220
+ name: "Beatrice",
221
+ language: "Italian",
222
+ locale: "it-IT",
223
+ gender: "feminine",
224
+ polyglot: false
225
+ },
226
+ {
227
+ voiceId: "lorenzo",
228
+ name: "Lorenzo",
229
+ language: "Italian",
230
+ locale: "it-IT",
231
+ gender: "masculine",
232
+ polyglot: false
233
+ },
234
+ // German
235
+ { voiceId: "tina", name: "Tina", language: "German", locale: "de-DE", gender: "feminine", polyglot: false },
236
+ {
237
+ voiceId: "lennart",
238
+ name: "Lennart",
239
+ language: "German",
240
+ locale: "de-DE",
241
+ gender: "masculine",
242
+ polyglot: false
243
+ },
244
+ // Spanish (US)
245
+ { voiceId: "lupe", name: "Lupe", language: "Spanish", locale: "es-US", gender: "feminine", polyglot: false },
246
+ { voiceId: "carlos", name: "Carlos", language: "Spanish", locale: "es-US", gender: "masculine", polyglot: false },
247
+ // Portuguese
248
+ {
249
+ voiceId: "carolina",
250
+ name: "Carolina",
251
+ language: "Portuguese",
252
+ locale: "pt-BR",
253
+ gender: "feminine",
254
+ polyglot: false
255
+ },
256
+ { voiceId: "leo", name: "Leo", language: "Portuguese", locale: "pt-BR", gender: "masculine", polyglot: false },
257
+ // Hindi
258
+ { voiceId: "kiara", name: "Kiara", language: "Hindi", locale: "hi-IN", gender: "feminine", polyglot: false },
259
+ { voiceId: "arjun", name: "Arjun", language: "Hindi", locale: "hi-IN", gender: "masculine", polyglot: false }
260
+ ]);
261
+ }
262
+ /**
263
+ * Establishes a connection to the AWS Bedrock bidirectional streaming service.
264
+ * Must be called before using speak, listen, or send functions.
265
+ *
266
+ * @throws {NovaSonicError} If connection fails or credentials are missing
267
+ *
268
+ * @example
269
+ * ```typescript
270
+ * await voice.connect();
271
+ * // Now ready for voice interactions
272
+ * ```
273
+ */
274
+ async connect({ requestContext } = {}) {
275
+ if (this.state === "connected" || this.state === "connecting") {
276
+ this.log("Already connected or connecting");
277
+ return;
278
+ }
279
+ this.state = "connecting";
280
+ this.requestContext = requestContext;
281
+ this.streamRestartAttempted = false;
282
+ try {
283
+ await this.createBedrockClient();
284
+ const asyncIterable = this.createEventQueue();
285
+ this.enqueueInitialSessionEvents();
286
+ await this.sendInitialConnectCommand(asyncIterable);
287
+ this.processStream().catch((error) => {
288
+ this.log("Error in stream processing:", error);
289
+ this.emit("error", {
290
+ message: error instanceof Error ? error.message : "Stream processing error",
291
+ code: "STREAM_PROCESSING_ERROR",
292
+ details: error
293
+ });
294
+ });
295
+ this.log("Connected to AWS Bedrock Nova 2 Sonic");
296
+ } catch (error) {
297
+ this.state = "disconnected";
298
+ if (this.client) {
299
+ if (typeof this.client.destroy === "function") {
300
+ this.client.destroy();
301
+ }
302
+ this.client = void 0;
303
+ }
304
+ this.log("Connection error:", error);
305
+ const errorMessage = error instanceof Error ? error.message : "Unknown error during connection";
306
+ throw new NovaSonicError("connection_failed" /* CONNECTION_FAILED */, `Failed to connect to AWS Bedrock: ${errorMessage}`, error);
307
+ }
308
+ }
309
+ /**
310
+ * Resolve credentials and initialize the Bedrock Runtime client over HTTP/2.
311
+ */
312
+ async createBedrockClient() {
313
+ this.log("Getting AWS credentials...");
314
+ const credentials = await getAwsCredentials(this.credentials, this.debug);
315
+ if (!credentials) {
316
+ throw new NovaSonicError(
317
+ "credentials_missing" /* CREDENTIALS_MISSING */,
318
+ "AWS credentials are required. Please configure AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables or provide credentials in the config."
319
+ );
320
+ }
321
+ this.log("Credentials retrieved:", {
322
+ hasAccessKeyId: !!credentials.accessKeyId,
323
+ hasSecretAccessKey: !!credentials.secretAccessKey,
324
+ hasSessionToken: !!credentials.sessionToken,
325
+ accessKeyIdPrefix: credentials.accessKeyId ? `${credentials.accessKeyId.substring(0, 6)}...` : "missing",
326
+ expiration: credentials.expiration ? credentials.expiration.toISOString() : "no expiration"
327
+ });
328
+ this.log(`Initializing Bedrock Runtime client for region: ${this.region}, model: ${this.model}`);
329
+ const nodeHttp2Handler = new NodeHttp2Handler({
330
+ requestTimeout: 3e5,
331
+ // 5 minutes
332
+ sessionTimeout: 3e5,
333
+ // 5 minutes
334
+ disableConcurrentStreams: false,
335
+ maxConcurrentStreams: 20
336
+ });
337
+ this.client = new BedrockRuntimeClient({
338
+ region: this.region,
339
+ credentials,
340
+ requestHandler: nodeHttp2Handler
341
+ });
342
+ }
343
+ /**
344
+ * Build the async-iterable event queue used as the request body for the
345
+ * bidirectional stream. Returns the iterable and wires up internal queue
346
+ * helpers (_eventQueue, _signalQueue, _closeSignal) used by sendClientEvent.
347
+ */
348
+ createEventQueue() {
349
+ this.log("Creating bidirectional stream command...");
350
+ const voiceInstance = this;
351
+ const eventQueue = [];
352
+ const pendingResolvers = [];
353
+ let closeSignal = false;
354
+ const signalQueue = () => {
355
+ if (pendingResolvers.length > 0) {
356
+ voiceInstance.log(`[AsyncIterable] Signaling queue - resolving ${pendingResolvers.length} pending Promise(s)`);
357
+ const resolvers = [...pendingResolvers];
358
+ pendingResolvers.length = 0;
359
+ resolvers.forEach((resolve) => resolve());
360
+ } else {
361
+ voiceInstance.log("[AsyncIterable] signalQueue called but no pending Promise");
362
+ }
363
+ };
364
+ const asyncIterable = {
365
+ [Symbol.asyncIterator]: () => {
366
+ voiceInstance.log("[AsyncIterable] Iterator requested");
367
+ return {
368
+ next: async () => {
369
+ try {
370
+ if (closeSignal || voiceInstance.state === "disconnected") {
371
+ voiceInstance.log(`[AsyncIterable] Stream closed (state: ${voiceInstance.state}), done = true`);
372
+ return { value: void 0, done: true };
373
+ }
374
+ if (eventQueue.length === 0) {
375
+ try {
376
+ voiceInstance.log("[AsyncIterable] Queue empty, waiting for signal...");
377
+ await new Promise((resolve) => {
378
+ pendingResolvers.push(resolve);
379
+ voiceInstance.log(
380
+ `[AsyncIterable] Promise created, waiting for signal (${pendingResolvers.length} pending)...`
381
+ );
382
+ setImmediate(() => {
383
+ if (eventQueue.length > 0) {
384
+ voiceInstance.log("[AsyncIterable] Data arrived before wait, resolving immediately");
385
+ const index = pendingResolvers.indexOf(resolve);
386
+ if (index !== -1) {
387
+ pendingResolvers.splice(index, 1);
388
+ resolve();
389
+ }
390
+ return;
391
+ }
392
+ if (closeSignal || voiceInstance.state === "disconnected") {
393
+ voiceInstance.log("[AsyncIterable] Closed before wait, resolving");
394
+ const index = pendingResolvers.indexOf(resolve);
395
+ if (index !== -1) {
396
+ pendingResolvers.splice(index, 1);
397
+ resolve();
398
+ }
399
+ return;
400
+ }
401
+ });
402
+ });
403
+ voiceInstance.log("[AsyncIterable] Promise resolved, checking queue...");
404
+ } catch (error) {
405
+ if (error instanceof Error && error.message === "Stream closed") {
406
+ voiceInstance.log("[AsyncIterable] Stream closed during wait");
407
+ return { value: void 0, done: true };
408
+ }
409
+ voiceInstance.log("[AsyncIterable] Error during wait:", error);
410
+ }
411
+ }
412
+ if (closeSignal) {
413
+ voiceInstance.log("[AsyncIterable] Stream closed (closeSignal)");
414
+ return { value: void 0, done: true };
415
+ }
416
+ if (voiceInstance.state === "disconnected") {
417
+ voiceInstance.log("[AsyncIterable] Stream closed (disconnected state)");
418
+ return { value: void 0, done: true };
419
+ }
420
+ while (eventQueue.length === 0 && !closeSignal) {
421
+ if (voiceInstance.state === "disconnected") {
422
+ voiceInstance.log("[AsyncIterable] Stream closed before wait loop");
423
+ return { value: void 0, done: true };
424
+ }
425
+ voiceInstance.log("[AsyncIterable] Queue still empty, waiting again...");
426
+ await new Promise((resolve) => {
427
+ pendingResolvers.push(resolve);
428
+ setImmediate(() => {
429
+ if (eventQueue.length > 0 || closeSignal || voiceInstance.state === "disconnected") {
430
+ const index = pendingResolvers.indexOf(resolve);
431
+ if (index !== -1) {
432
+ pendingResolvers.splice(index, 1);
433
+ resolve();
434
+ }
435
+ }
436
+ });
437
+ });
438
+ if (closeSignal || voiceInstance.state === "disconnected") {
439
+ voiceInstance.log("[AsyncIterable] Stream closed during wait loop");
440
+ return { value: void 0, done: true };
441
+ }
442
+ }
443
+ const nextEvent = eventQueue.shift();
444
+ const eventJson = JSON.stringify(nextEvent);
445
+ const eventBytes = Buffer.from(eventJson, "utf-8");
446
+ voiceInstance.log(`[AsyncIterable] Yielding event of size: ${eventBytes.length}`);
447
+ return {
448
+ value: {
449
+ chunk: {
450
+ bytes: eventBytes
451
+ }
452
+ },
453
+ done: false
454
+ };
455
+ } catch (error) {
456
+ voiceInstance.log("[AsyncIterable] Error in iterator:", error);
457
+ closeSignal = true;
458
+ return { value: void 0, done: true };
459
+ }
460
+ },
461
+ return: async () => {
462
+ voiceInstance.log("[AsyncIterable] Iterator return() called");
463
+ closeSignal = true;
464
+ signalQueue();
465
+ return { value: void 0, done: true };
466
+ },
467
+ throw: async (error) => {
468
+ voiceInstance.log("[AsyncIterable] Iterator throw() called:", error);
469
+ closeSignal = true;
470
+ signalQueue();
471
+ throw error;
472
+ }
473
+ };
474
+ }
475
+ };
476
+ this._eventQueue = eventQueue;
477
+ this._signalQueue = signalQueue;
478
+ this._closeSignal = () => {
479
+ closeSignal = true;
480
+ signalQueue();
481
+ };
482
+ return asyncIterable;
483
+ }
484
+ /**
485
+ * Pre-populate the event queue with the AWS Nova Sonic connection
486
+ * handshake events: sessionStart, promptStart, then a SYSTEM text content
487
+ * block carrying the configured instructions. AUDIO contentStart is NOT
488
+ * sent here; it is deferred to the first send() call.
489
+ */
490
+ enqueueInitialSessionEvents() {
491
+ const eventQueue = this._eventQueue;
492
+ if (!eventQueue) {
493
+ throw new NovaSonicError(
494
+ "connection_failed" /* CONNECTION_FAILED */,
495
+ "Event queue must be initialized before enqueueing session events"
496
+ );
497
+ }
498
+ this.log("Pre-populating queue with sessionStart and promptStart events...");
499
+ const promptName = randomUUID();
500
+ this._promptName = promptName;
501
+ const sessionStartEvent = {};
502
+ if (this.sessionConfig) {
503
+ if (this.sessionConfig.inferenceConfiguration) {
504
+ sessionStartEvent.inferenceConfiguration = {
505
+ maxTokens: this.sessionConfig.inferenceConfiguration.maxTokens || 4096,
506
+ topP: this.sessionConfig.inferenceConfiguration.topP || 0.9,
507
+ temperature: this.sessionConfig.inferenceConfiguration.temperature || 0.7,
508
+ ...this.sessionConfig.inferenceConfiguration.topK !== void 0 && {
509
+ topK: this.sessionConfig.inferenceConfiguration.topK
510
+ },
511
+ ...this.sessionConfig.inferenceConfiguration.stopSequences && {
512
+ stopSequences: this.sessionConfig.inferenceConfiguration.stopSequences
513
+ }
514
+ };
515
+ } else {
516
+ sessionStartEvent.inferenceConfiguration = {
517
+ maxTokens: 4096,
518
+ topP: 0.9,
519
+ temperature: 0.7
520
+ };
521
+ }
522
+ if (this.sessionConfig.turnDetectionConfiguration) {
523
+ sessionStartEvent.turnDetectionConfiguration = {
524
+ ...this.sessionConfig.turnDetectionConfiguration.endpointingSensitivity && {
525
+ endpointingSensitivity: this.sessionConfig.turnDetectionConfiguration.endpointingSensitivity
526
+ }
527
+ };
528
+ }
529
+ } else {
530
+ sessionStartEvent.inferenceConfiguration = {
531
+ maxTokens: 4096,
532
+ topP: 0.9,
533
+ temperature: 0.7
534
+ };
535
+ }
536
+ eventQueue.push({
537
+ event: {
538
+ sessionStart: sessionStartEvent
539
+ }
540
+ });
541
+ let voiceId = "matthew";
542
+ if (this.sessionConfig?.voice) {
543
+ if (typeof this.sessionConfig.voice === "string") {
544
+ voiceId = this.sessionConfig.voice;
545
+ } else if (this.sessionConfig.voice.name) {
546
+ voiceId = this.sessionConfig.voice.name;
547
+ }
548
+ } else if (this.speaker && this.speaker !== "default") {
549
+ if (typeof this.speaker === "string") {
550
+ voiceId = this.speaker;
551
+ } else {
552
+ const speakerObj = this.speaker;
553
+ if (speakerObj && typeof speakerObj === "object" && speakerObj.name) {
554
+ voiceId = speakerObj.name;
555
+ }
556
+ }
557
+ }
558
+ const promptStartEvent = {
559
+ promptName,
560
+ textOutputConfiguration: {
561
+ mediaType: "text/plain"
562
+ },
563
+ // AWS REQUIRES this - cannot be omitted
564
+ audioOutputConfiguration: {
565
+ mediaType: "audio/lpcm",
566
+ sampleRateHertz: 24e3,
567
+ sampleSizeBits: 16,
568
+ channelCount: 1,
569
+ voiceId,
570
+ encoding: "base64",
571
+ audioType: "SPEECH"
572
+ }
573
+ };
574
+ if (this.sessionConfig?.tools && this.sessionConfig.tools.length > 0) {
575
+ promptStartEvent.toolConfiguration = {
576
+ tools: this.sessionConfig.tools.map((tool) => {
577
+ let inputSchemaJson;
578
+ if (typeof tool.inputSchema === "string") {
579
+ inputSchemaJson = tool.inputSchema;
580
+ } else {
581
+ inputSchemaJson = JSON.stringify(tool.inputSchema);
582
+ }
583
+ return {
584
+ toolSpec: {
585
+ name: tool.name,
586
+ description: tool.description,
587
+ inputSchema: {
588
+ json: inputSchemaJson
589
+ }
590
+ }
591
+ };
592
+ }),
593
+ // toolChoice goes inside toolConfiguration for Nova 2 Sonic
594
+ ...this.sessionConfig?.toolChoice && { toolChoice: this.sessionConfig.toolChoice }
595
+ };
596
+ } else if (this.sessionConfig?.toolChoice) {
597
+ promptStartEvent.toolConfiguration = {
598
+ toolChoice: this.sessionConfig.toolChoice
599
+ };
600
+ }
601
+ eventQueue.push({
602
+ event: {
603
+ promptStart: promptStartEvent
604
+ }
605
+ });
606
+ this.promptStarted = true;
607
+ const systemContentName = randomUUID();
608
+ eventQueue.push({
609
+ event: {
610
+ contentStart: {
611
+ promptName,
612
+ contentName: systemContentName,
613
+ type: "TEXT",
614
+ interactive: false,
615
+ role: "SYSTEM",
616
+ textInputConfiguration: {
617
+ mediaType: "text/plain"
618
+ }
619
+ }
620
+ }
621
+ });
622
+ eventQueue.push({
623
+ event: {
624
+ textInput: {
625
+ promptName,
626
+ contentName: systemContentName,
627
+ content: this.instructions || ""
628
+ }
629
+ }
630
+ });
631
+ eventQueue.push({
632
+ event: {
633
+ contentEnd: {
634
+ promptName,
635
+ contentName: systemContentName
636
+ }
637
+ }
638
+ });
639
+ this.audioContentStarted = false;
640
+ this.log(`Queue pre-populated with ${eventQueue.length} event(s)`);
641
+ }
642
+ /**
643
+ * Issue the InvokeModelWithBidirectionalStreamCommand to AWS Bedrock with
644
+ * a 5-second abort timeout that tears down the client on hang to avoid
645
+ * leaked HTTP/2 sessions. On success the response stream is stored and the
646
+ * voice transitions to 'connected'.
647
+ */
648
+ async sendInitialConnectCommand(asyncIterable) {
649
+ if (!this.client) {
650
+ throw new NovaSonicError(
651
+ "connection_failed" /* CONNECTION_FAILED */,
652
+ "Bedrock client must be created before sending the initial command"
653
+ );
654
+ }
655
+ const command = new InvokeModelWithBidirectionalStreamCommand({
656
+ modelId: this.model,
657
+ body: asyncIterable
658
+ // Type assertion needed as SDK types may be strict
659
+ });
660
+ const sendStartTime = Date.now();
661
+ const abortController = new AbortController();
662
+ const timeoutId = setTimeout(() => {
663
+ this.log("[DEBUG] client.send() timeout after 5 seconds - aborting request");
664
+ abortController.abort();
665
+ }, 5e3);
666
+ let response;
667
+ try {
668
+ response = await this.client.send(command, { abortSignal: abortController.signal });
669
+ } catch (error) {
670
+ const sendDuration2 = Date.now() - sendStartTime;
671
+ if (abortController.signal.aborted) {
672
+ this.log(`[DEBUG] client.send() aborted after ${sendDuration2}ms`);
673
+ this._closeSignal?.();
674
+ this.client.destroy();
675
+ throw new Error("client.send() timeout");
676
+ }
677
+ this.log(`[DEBUG] client.send() error after ${sendDuration2}ms:`, error);
678
+ throw error;
679
+ } finally {
680
+ clearTimeout(timeoutId);
681
+ }
682
+ const sendDuration = Date.now() - sendStartTime;
683
+ this.log(`[DEBUG] client.send() completed in ${sendDuration}ms`);
684
+ this.log("Received response from AWS Bedrock");
685
+ this.stream = response.body;
686
+ this.log(
687
+ `[DEBUG] Response stream is async iterable: ${this.stream && typeof this.stream[Symbol.asyncIterator] === "function"}`
688
+ );
689
+ this.state = "connected";
690
+ this.log(`[STATE] State set to 'connected'`);
691
+ }
692
+ /**
693
+ * Process the bidirectional stream from AWS Bedrock
694
+ */
695
+ async processStream() {
696
+ if (!this.stream) {
697
+ this.log("[Stream] No stream available, cannot process");
698
+ return;
699
+ }
700
+ if (this.processingStream) {
701
+ this.log("[Stream] Already processing stream, skipping");
702
+ return;
703
+ }
704
+ this.processingStream = true;
705
+ this.log("[Stream] Starting stream processing");
706
+ let eventCount = 0;
707
+ let lastEventTime = Date.now();
708
+ try {
709
+ for await (const chunk of this.stream) {
710
+ if (chunk.chunk) {
711
+ const textResponse = Buffer.from(chunk.chunk.bytes || []).toString("utf-8");
712
+ eventCount++;
713
+ const now = Date.now();
714
+ const timeSinceLastEvent = now - lastEventTime;
715
+ lastEventTime = now;
716
+ this.log(
717
+ `[Stream] Received chunk #${eventCount}, length: ${textResponse.length}, time since last: ${timeSinceLastEvent}ms`
718
+ );
719
+ try {
720
+ const jsonResponse = JSON.parse(textResponse);
721
+ this.log(`[Stream] ========================================`);
722
+ this.log(`[Stream] Parsed JSON response, keys: ${Object.keys(jsonResponse).join(", ")}`);
723
+ if (jsonResponse.event) {
724
+ const eventKeys = Object.keys(jsonResponse.event);
725
+ this.log(`[Stream] Event keys: ${eventKeys.join(", ")}`);
726
+ if (jsonResponse.event.contentStart) {
727
+ this.log(`[Stream] \u2192 Handling contentStart`);
728
+ this.handleServerEvent({ contentStart: jsonResponse.event.contentStart });
729
+ } else if (jsonResponse.event.textOutput) {
730
+ this.log(
731
+ `[Stream] \u2192 Handling textOutput, content length: ${jsonResponse.event.textOutput?.content?.length ?? 0}`
732
+ );
733
+ this.handleServerEvent({ textOutput: jsonResponse.event.textOutput });
734
+ } else if (jsonResponse.event.audioOutput) {
735
+ this.handleServerEvent({ audioOutput: jsonResponse.event.audioOutput });
736
+ } else if (jsonResponse.event.toolUse) {
737
+ this.handleServerEvent({ toolUse: jsonResponse.event.toolUse });
738
+ } else if (jsonResponse.event.contentEnd && jsonResponse.event.contentEnd.type === "TOOL") {
739
+ this.handleServerEvent({ contentEnd: jsonResponse.event.contentEnd });
740
+ } else if (jsonResponse.event.contentEnd) {
741
+ this.log(
742
+ `[Stream] Found contentEnd, type: ${jsonResponse.event.contentEnd.type}, stopReason: ${jsonResponse.event.contentEnd.stopReason}`
743
+ );
744
+ this.handleServerEvent({ contentEnd: jsonResponse.event.contentEnd });
745
+ } else if (jsonResponse.event.completionStart) {
746
+ this.log(
747
+ "[Stream] Found completionStart inside event object:",
748
+ JSON.stringify(jsonResponse.event.completionStart, null, 2)
749
+ );
750
+ this.emit("completionStart", jsonResponse.event.completionStart);
751
+ } else if (jsonResponse.event.completionEnd) {
752
+ this.log(
753
+ "[Stream] Found completionEnd inside event object:",
754
+ JSON.stringify(jsonResponse.event.completionEnd, null, 2)
755
+ );
756
+ this.handleServerEvent({ completionEnd: jsonResponse.event.completionEnd });
757
+ } else {
758
+ const eventKeys2 = Object.keys(jsonResponse.event || {});
759
+ this.log(`[Stream] Event keys for other events: ${eventKeys2.join(", ")}`);
760
+ if (eventKeys2.length > 0) {
761
+ if (eventKeys2.includes("completionEnd")) {
762
+ this.log("[Stream] Found completionEnd in other events, handling explicitly");
763
+ this.handleServerEvent({ completionEnd: jsonResponse.event.completionEnd });
764
+ } else {
765
+ const eventKey = eventKeys2[0];
766
+ this.log(`[Stream] Dispatching other event: ${eventKey}`);
767
+ const eventValue = jsonResponse.event[eventKey];
768
+ if (eventValue !== void 0) {
769
+ if (eventKey === "completionEnd") {
770
+ this.handleServerEvent({ completionEnd: eventValue });
771
+ } else {
772
+ this.handleServerEvent({ [eventKey]: eventValue });
773
+ }
774
+ }
775
+ }
776
+ } else if (Object.keys(jsonResponse).length > 0) {
777
+ this.log(`[Stream] Unknown event structure, keys:`, Object.keys(jsonResponse).join(", "));
778
+ }
779
+ }
780
+ } else {
781
+ if (this.debug) {
782
+ this.log(
783
+ '[Stream] Received event without "event" wrapper, keys:',
784
+ Object.keys(jsonResponse).join(", ")
785
+ );
786
+ }
787
+ if (jsonResponse.usageEvent) {
788
+ this.emit("usage", {
789
+ inputTokens: jsonResponse.usageEvent.totalInputTokens || 0,
790
+ outputTokens: jsonResponse.usageEvent.totalOutputTokens || 0,
791
+ totalTokens: jsonResponse.usageEvent.totalTokens || 0
792
+ });
793
+ }
794
+ if (jsonResponse.completionEnd) {
795
+ this.log(
796
+ "[Stream] Found completionEnd at top level:",
797
+ JSON.stringify(jsonResponse.completionEnd, null, 2)
798
+ );
799
+ this.handleServerEvent({ completionEnd: jsonResponse.completionEnd });
800
+ }
801
+ if (!jsonResponse.event && !jsonResponse.completionEnd && !jsonResponse.usageEvent) {
802
+ this.log(
803
+ "[Stream] Received response without event wrapper, keys:",
804
+ Object.keys(jsonResponse).join(", ")
805
+ );
806
+ }
807
+ if (jsonResponse.completionStart || jsonResponse.event?.completionStart) {
808
+ const completionStart = jsonResponse.completionStart || jsonResponse.event.completionStart;
809
+ this.log("[Stream] Found completionStart:", JSON.stringify(completionStart, null, 2));
810
+ this.emit("completionStart", completionStart);
811
+ }
812
+ }
813
+ } catch (parseError) {
814
+ this.log("[Stream] Failed to parse JSON response:", textResponse.substring(0, 200));
815
+ this.emit("error", {
816
+ message: "Failed to parse stream response",
817
+ code: "PARSE_ERROR",
818
+ details: parseError
819
+ });
820
+ }
821
+ } else if (chunk.internalServerException) {
822
+ this.emit("error", {
823
+ message: "Internal server error",
824
+ code: "INTERNAL_SERVER_ERROR",
825
+ details: chunk.internalServerException
826
+ });
827
+ } else if (chunk.modelStreamErrorException) {
828
+ this.emit("error", {
829
+ message: "Model stream error",
830
+ code: "MODEL_STREAM_ERROR",
831
+ details: chunk.modelStreamErrorException
832
+ });
833
+ } else if (chunk.modelTimeoutException) {
834
+ this.emit("error", {
835
+ message: "Model timeout",
836
+ code: "MODEL_TIMEOUT",
837
+ details: chunk.modelTimeoutException
838
+ });
839
+ } else if (chunk.serviceUnavailableException) {
840
+ this.emit("error", {
841
+ message: "Service unavailable",
842
+ code: "SERVICE_UNAVAILABLE",
843
+ details: chunk.serviceUnavailableException
844
+ });
845
+ } else if (chunk.throttlingException) {
846
+ this.emit("error", {
847
+ message: "Request throttled",
848
+ code: "THROTTLING",
849
+ details: chunk.throttlingException
850
+ });
851
+ } else if (chunk.validationException) {
852
+ this.emit("error", {
853
+ message: "Validation error",
854
+ code: "VALIDATION_ERROR",
855
+ details: chunk.validationException
856
+ });
857
+ }
858
+ }
859
+ } catch (streamError) {
860
+ this.log("[Stream] Error in processStream:", streamError);
861
+ this.emit("error", {
862
+ message: "Stream processing error",
863
+ code: "STREAM_ERROR",
864
+ details: streamError instanceof Error ? streamError.message : String(streamError)
865
+ });
866
+ } finally {
867
+ this.processingStream = false;
868
+ this.log(
869
+ `[Stream] processStream finished, processingStream set to false. Total events received: ${eventCount || 0}`
870
+ );
871
+ this.log(`[Stream] Stream state: state=${this.state}, stream exists=${!!this.stream}`);
872
+ if (!this.turnCompleted && this.audioContentStarted) {
873
+ this.log("[Stream] Stream ended but turn not completed - signaling turn completion as fallback");
874
+ this.log(
875
+ `[Stream] State: turnCompleted=${this.turnCompleted}, audioContentStarted=${this.audioContentStarted}, hasSentContentEnd=${this.hasSentContentEnd}`
876
+ );
877
+ this.turnCompleted = true;
878
+ this.emit("turnComplete", { timestamp: Date.now() });
879
+ if (this.currentResponseId) {
880
+ const stream = this.speakerStreams.get(this.currentResponseId);
881
+ if (stream) {
882
+ stream.end();
883
+ }
884
+ this.speakerStreams.delete(this.currentResponseId);
885
+ this.currentResponseId = void 0;
886
+ }
887
+ this.hasSentContentEnd = false;
888
+ this.log("[Stream] Turn completion signaled, ready for next turn");
889
+ } else if (this.turnCompleted) {
890
+ this.log("[Stream] Stream ended and turn was already completed");
891
+ } else {
892
+ this.log(
893
+ `[Stream] Stream ended but turn not completed - audioContentStarted=${this.audioContentStarted}, turnCompleted=${this.turnCompleted}`
894
+ );
895
+ }
896
+ if (this.stream && this.state === "connected" && !this.processingStream && !this.streamRestartAttempted) {
897
+ this.log("[Stream] Stream still open but processing stopped - will restart stream processing");
898
+ this.streamRestartAttempted = true;
899
+ setImmediate(() => {
900
+ if (this.stream && this.state === "connected" && !this.processingStream) {
901
+ this.log("[Stream] Restarting stream processing for subsequent turns");
902
+ this.processStream().catch((error) => {
903
+ this.log("[Stream] Error restarting stream processing:", error);
904
+ this.streamRestartAttempted = false;
905
+ });
906
+ } else {
907
+ this.streamRestartAttempted = false;
908
+ }
909
+ });
910
+ } else {
911
+ if (this.streamRestartAttempted) {
912
+ this.log("[Stream] Stream restart already attempted, skipping");
913
+ }
914
+ }
915
+ }
916
+ }
917
+ /**
918
+ * Handle server events from AWS Bedrock
919
+ */
920
+ handleServerEvent(event) {
921
+ if (this.debug) {
922
+ this.log("Received event, keys:", Object.keys(event).join(", "));
923
+ }
924
+ if (event.contentStart) {
925
+ this.handleContentStart(event.contentStart);
926
+ }
927
+ if (event.textOutput) {
928
+ this.handleTextOutput(event.textOutput);
929
+ }
930
+ if (event.audioOutput?.content) {
931
+ this.handleAudioOutput(event.audioOutput);
932
+ }
933
+ if (event.toolUse) {
934
+ this.handleToolUse(event.toolUse);
935
+ }
936
+ if (event.contentEnd) {
937
+ this.handleContentEnd(event.contentEnd);
938
+ }
939
+ if (event.completionEnd) {
940
+ this.handleCompletionEnd(event.completionEnd);
941
+ }
942
+ if (event.error) {
943
+ this.emit("error", {
944
+ message: event.error.message || "Unknown error",
945
+ code: event.error.code || "UNKNOWN_ERROR",
946
+ details: event.error
947
+ });
948
+ }
949
+ }
950
+ /**
951
+ * Handle a contentStart event. Tracks generationStage for text content
952
+ * blocks so the corresponding 'writing' events can be tagged
953
+ * SPECULATIVE/FINAL for the client.
954
+ */
955
+ handleContentStart(contentStart) {
956
+ const role = contentStart.role?.toLowerCase();
957
+ const contentType = contentStart.type;
958
+ this.log(`[Event] contentStart: type=${contentType || "unknown"}, role=${role}`);
959
+ this.emit("contentStart", contentStart);
960
+ if (contentType === "TEXT" && contentStart.additionalModelFields) {
961
+ try {
962
+ const additionalFields = JSON.parse(contentStart.additionalModelFields);
963
+ this.currentTextGenerationStage = additionalFields.generationStage;
964
+ this.log(`[Event] Text content generationStage: ${this.currentTextGenerationStage}`);
965
+ } catch {
966
+ this.currentTextGenerationStage = void 0;
967
+ }
968
+ } else if (contentType === "TEXT") {
969
+ this.currentTextGenerationStage = void 0;
970
+ }
971
+ }
972
+ /**
973
+ * Handle a textOutput event. Detects interruption (barge-in) markers in
974
+ * the payload, otherwise emits a 'writing' event with the text and
975
+ * current generationStage.
976
+ */
977
+ handleTextOutput(textOutput) {
978
+ const text = textOutput.content || "";
979
+ const role = textOutput.role?.toLowerCase() || "assistant";
980
+ this.log(`[Event] textOutput received: role=${role}, text length=${text.length}`);
981
+ let isInterrupted = false;
982
+ try {
983
+ const parsed = JSON.parse(text);
984
+ if (parsed && parsed.interrupted === true) {
985
+ isInterrupted = true;
986
+ }
987
+ } catch {
988
+ if (/interrupted/i.test(text)) {
989
+ isInterrupted = true;
990
+ }
991
+ }
992
+ if (isInterrupted) {
993
+ this.log(`[Event] Interrupt detected, emitting interrupt event`);
994
+ this.emit("interrupt", { type: "user", timestamp: Date.now() });
995
+ return;
996
+ }
997
+ const generationStage = this.currentTextGenerationStage;
998
+ this.log(`[Event] Emitting 'writing': role=${role}, generationStage=${generationStage}, length=${text.length}`);
999
+ this.emit("writing", { text, role, generationStage });
1000
+ }
1001
+ /**
1002
+ * Handle an audioOutput event. Decodes the base64 LPCM payload, emits
1003
+ * 'speaking' with both the base64 string and an Int16Array view, and
1004
+ * forwards bytes to any active speaker stream.
1005
+ */
1006
+ handleAudioOutput(audioOutput) {
1007
+ try {
1008
+ const content = audioOutput.content;
1009
+ const audioBytes = Buffer.from(content, "base64");
1010
+ this.log(`[Event] Audio output: ${audioBytes.length} bytes`);
1011
+ this.isReceivingAssistantAudio = true;
1012
+ const audioData = new Int16Array(audioBytes.buffer, audioBytes.byteOffset, audioBytes.byteLength / 2);
1013
+ this.emit("speaking", {
1014
+ audio: content,
1015
+ audioData,
1016
+ response_id: this.currentResponseId
1017
+ });
1018
+ if (this.currentResponseId) {
1019
+ const stream = this.speakerStreams.get(this.currentResponseId);
1020
+ if (stream) {
1021
+ stream.write(audioBytes);
1022
+ }
1023
+ }
1024
+ } catch (error) {
1025
+ this.log("[Event] Error decoding audio:", error);
1026
+ this.emit("error", {
1027
+ message: "Failed to decode audio",
1028
+ code: "AUDIO_DECODE_ERROR",
1029
+ details: error
1030
+ });
1031
+ }
1032
+ }
1033
+ /**
1034
+ * Handle a toolUse event. Emits 'toolCall' and dispatches to the
1035
+ * configured tool's execute() function via handleToolCall().
1036
+ */
1037
+ handleToolUse(toolUse) {
1038
+ const toolUseId = toolUse.toolUseId || "";
1039
+ const toolName = toolUse.toolName || "";
1040
+ const toolInput = toolUse.input || {};
1041
+ this.emit("toolCall", {
1042
+ name: toolName,
1043
+ args: toolInput,
1044
+ id: toolUseId
1045
+ });
1046
+ if (this.tools && toolName in this.tools) {
1047
+ void this.handleToolCall(toolName, toolInput, toolUseId);
1048
+ }
1049
+ }
1050
+ /**
1051
+ * Handle a contentEnd event. Forwards it to clients, then routes by
1052
+ * stopReason / type:
1053
+ * - INTERRUPTED: emit 'interrupt' and tear down the active speaker stream
1054
+ * - TOOL: end the active speaker stream
1055
+ * - AUDIO with END_TURN: signal turnComplete (assistant audio finished)
1056
+ * - AUDIO with PARTIAL_TURN while receiving assistant audio: schedule
1057
+ * fallback turnComplete in case completionEnd never arrives
1058
+ * - AUDIO otherwise: user input ended, reset turn flags
1059
+ */
1060
+ handleContentEnd(contentEnd) {
1061
+ this.log(`[Event] contentEnd received: type=${contentEnd.type}, stopReason=${contentEnd.stopReason}`);
1062
+ this.emit("contentEnd", contentEnd);
1063
+ if (contentEnd.stopReason === "INTERRUPTED") {
1064
+ this.log("[Event] Content interrupted by user (barge-in)");
1065
+ this.emit("interrupt", { type: "user", timestamp: Date.now() });
1066
+ if (this.currentResponseId) {
1067
+ const stream = this.speakerStreams.get(this.currentResponseId);
1068
+ if (stream) {
1069
+ stream.destroy();
1070
+ }
1071
+ this.speakerStreams.delete(this.currentResponseId);
1072
+ }
1073
+ this.currentResponseId = void 0;
1074
+ this.log("[Event] After interruption, keeping audioContentStarted=true for continued streaming");
1075
+ } else if (contentEnd.type === "TOOL" && this.currentResponseId) {
1076
+ const stream = this.speakerStreams.get(this.currentResponseId);
1077
+ if (stream) {
1078
+ stream.end();
1079
+ }
1080
+ } else if (contentEnd.type === "AUDIO") {
1081
+ if (contentEnd.stopReason === "END_TURN") {
1082
+ this.log(`[Event] contentEnd (AUDIO) with stopReason END_TURN - signaling turn complete`);
1083
+ if (this.currentResponseId) {
1084
+ const stream = this.speakerStreams.get(this.currentResponseId);
1085
+ if (stream) {
1086
+ stream.end();
1087
+ }
1088
+ this.speakerStreams.delete(this.currentResponseId);
1089
+ this.currentResponseId = void 0;
1090
+ }
1091
+ if (!this.turnCompleted) {
1092
+ this.turnCompleted = true;
1093
+ this.emit("turnComplete", { timestamp: Date.now() });
1094
+ this.hasSentContentEnd = false;
1095
+ this.log(
1096
+ `[Event] Turn complete (from contentEnd AUDIO with END_TURN), ready for next turn. audioContentStarted: ${this.audioContentStarted}, audioContentName: ${this.audioContentName}`
1097
+ );
1098
+ } else {
1099
+ this.log(
1100
+ `[Event] contentEnd (AUDIO) with END_TURN received but turn already completed - skipping duplicate turnComplete emission`
1101
+ );
1102
+ }
1103
+ if (!this.turnCompleteTimeout) {
1104
+ this.turnCompleteTimeout = setTimeout(() => {
1105
+ this.log(`[Event] Timeout: completionEnd not received, but turn already completed from contentEnd`);
1106
+ this.turnCompleteTimeout = void 0;
1107
+ }, 1e3);
1108
+ }
1109
+ } else {
1110
+ if (this.isReceivingAssistantAudio && contentEnd.stopReason === "PARTIAL_TURN") {
1111
+ this.isReceivingAssistantAudio = false;
1112
+ if (!this.turnCompleteTimeout && !this.turnCompleted) {
1113
+ this.log(
1114
+ `[Event] contentEnd (AUDIO) with PARTIAL_TURN for assistant output - waiting for completionEnd, setting fallback timeout`
1115
+ );
1116
+ this.turnCompleteTimeout = setTimeout(() => {
1117
+ if (!this.turnCompleted) {
1118
+ this.log(
1119
+ `[Event] Fallback: completionEnd not received after contentEnd (AUDIO) with PARTIAL_TURN, signaling turn complete`
1120
+ );
1121
+ this.turnCompleted = true;
1122
+ this.emit("turnComplete", { timestamp: Date.now() });
1123
+ if (this.currentResponseId) {
1124
+ const stream = this.speakerStreams.get(this.currentResponseId);
1125
+ if (stream) {
1126
+ stream.end();
1127
+ }
1128
+ this.speakerStreams.delete(this.currentResponseId);
1129
+ this.currentResponseId = void 0;
1130
+ }
1131
+ this.hasSentContentEnd = false;
1132
+ this.turnCompleteTimeout = void 0;
1133
+ }
1134
+ }, 2e3);
1135
+ }
1136
+ } else {
1137
+ this.hasSentContentEnd = false;
1138
+ this.turnCompleted = false;
1139
+ this.log(
1140
+ `[Event] contentEnd (AUDIO) - user input ended, stopReason: ${contentEnd.stopReason}. Keeping audioContentStarted=true for next turn. Reset hasSentContentEnd=false, turnCompleted=false.`
1141
+ );
1142
+ }
1143
+ }
1144
+ } else if (contentEnd.type === "TEXT") {
1145
+ this.currentTextGenerationStage = void 0;
1146
+ this.log(
1147
+ `[Event] contentEnd (TEXT) received, stopReason: ${contentEnd.stopReason}. Turn completion handled by completionEnd/contentEnd(AUDIO).`
1148
+ );
1149
+ if (contentEnd.stopReason === "END_TURN") {
1150
+ this.hasSentContentEnd = false;
1151
+ }
1152
+ }
1153
+ }
1154
+ /**
1155
+ * Handle a completionEnd event. AWS uses this as the definitive signal
1156
+ * that a turn (and all audio output) has finished. Tears down the active
1157
+ * speaker stream, clears any fallback timer, emits 'turnComplete' once,
1158
+ * and forwards token usage if reported.
1159
+ */
1160
+ handleCompletionEnd(completionEnd) {
1161
+ this.log(`[Event] completionEnd received, stopReason: ${completionEnd.stopReason}`);
1162
+ if (this.turnCompleteTimeout) {
1163
+ clearTimeout(this.turnCompleteTimeout);
1164
+ this.turnCompleteTimeout = void 0;
1165
+ }
1166
+ if (this.currentResponseId) {
1167
+ const stream = this.speakerStreams.get(this.currentResponseId);
1168
+ if (stream) {
1169
+ stream.end();
1170
+ }
1171
+ this.speakerStreams.delete(this.currentResponseId);
1172
+ this.currentResponseId = void 0;
1173
+ }
1174
+ this.isReceivingAssistantAudio = false;
1175
+ if (!this.turnCompleted) {
1176
+ this.log(
1177
+ `[Event] completionEnd - signaling turn complete (stopReason: ${completionEnd.stopReason || "undefined"})`
1178
+ );
1179
+ this.turnCompleted = true;
1180
+ this.emit("turnComplete", { timestamp: Date.now() });
1181
+ this.hasSentContentEnd = false;
1182
+ } else {
1183
+ this.log(`[Event] completionEnd received but turn already completed - skipping duplicate turnComplete emission`);
1184
+ }
1185
+ if (completionEnd.usage) {
1186
+ this.emit("usage", {
1187
+ inputTokens: completionEnd.usage.inputTokens || 0,
1188
+ outputTokens: completionEnd.usage.outputTokens || 0,
1189
+ totalTokens: (completionEnd.usage.inputTokens || 0) + (completionEnd.usage.outputTokens || 0)
1190
+ });
1191
+ }
1192
+ }
1193
+ /**
1194
+ * Handle tool execution
1195
+ */
1196
+ async handleToolCall(toolName, args, toolUseId) {
1197
+ const tool = this.tools?.[toolName];
1198
+ if (!tool || !tool.execute) {
1199
+ this.emit("error", {
1200
+ message: `Tool ${toolName} not found or has no execute function`,
1201
+ code: "TOOL_NOT_FOUND"
1202
+ });
1203
+ return;
1204
+ }
1205
+ try {
1206
+ const result = await tool.execute(
1207
+ { context: args, requestContext: this.requestContext },
1208
+ {
1209
+ toolCallId: toolUseId,
1210
+ messages: []
1211
+ }
1212
+ );
1213
+ await this.sendClientEvent({
1214
+ toolResult: {
1215
+ toolUseId,
1216
+ content: [
1217
+ {
1218
+ json: typeof result === "object" ? result : { result }
1219
+ }
1220
+ ]
1221
+ }
1222
+ });
1223
+ } catch (error) {
1224
+ this.emit("error", {
1225
+ message: `Error executing tool ${toolName}: ${error instanceof Error ? error.message : "Unknown error"}`,
1226
+ code: "TOOL_EXECUTION_ERROR",
1227
+ details: error
1228
+ });
1229
+ await this.sendClientEvent({
1230
+ toolResult: {
1231
+ toolUseId,
1232
+ content: [
1233
+ {
1234
+ text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
1235
+ }
1236
+ ]
1237
+ }
1238
+ });
1239
+ }
1240
+ }
1241
+ /**
1242
+ * Send a client event to AWS Bedrock
1243
+ * Events are sent through the input stream that was passed to the bidirectional stream command
1244
+ */
1245
+ async sendClientEvent(event) {
1246
+ if (this.state !== "connected") {
1247
+ throw new NovaSonicError("not_connected" /* NOT_CONNECTED */, "Not connected to AWS Bedrock. Call connect() first.");
1248
+ }
1249
+ try {
1250
+ const eventQueue = this._eventQueue;
1251
+ const signalQueue = this._signalQueue;
1252
+ if (!eventQueue || !signalQueue) {
1253
+ throw new NovaSonicError(
1254
+ "not_connected" /* NOT_CONNECTED */,
1255
+ "Event queue not initialized. Connection may not be fully established."
1256
+ );
1257
+ }
1258
+ this.log(`[sendClientEvent] Adding event to queue (queue size: ${eventQueue.length})`);
1259
+ eventQueue.push({ event });
1260
+ this.log(`[sendClientEvent] Event added, queue size now: ${eventQueue.length}, signaling...`);
1261
+ signalQueue();
1262
+ this.log(`[sendClientEvent] Signal sent`);
1263
+ if (this.debug) {
1264
+ this.log("Sent client event, keys:", Object.keys(event).join(", "));
1265
+ }
1266
+ } catch (error) {
1267
+ throw new NovaSonicError(
1268
+ "websocket_error" /* WEBSOCKET_ERROR */,
1269
+ `Failed to send client event: ${error instanceof Error ? error.message : "Unknown error"}`,
1270
+ error
1271
+ );
1272
+ }
1273
+ }
1274
+ /**
1275
+ * Disconnects from the AWS Bedrock session and cleans up resources.
1276
+ *
1277
+ * Pushes a `sessionEnd` event to the queue before signalling close,
1278
+ * then schedules client destruction on the next tick so the async
1279
+ * iterator has a chance to yield the event to the SDK.
1280
+ */
1281
+ close() {
1282
+ if (this.state === "disconnected") {
1283
+ return;
1284
+ }
1285
+ this.state = "disconnected";
1286
+ this.processingStream = false;
1287
+ if (this.turnCompleteTimeout) {
1288
+ clearTimeout(this.turnCompleteTimeout);
1289
+ this.turnCompleteTimeout = void 0;
1290
+ }
1291
+ const eventQueue = this._eventQueue;
1292
+ const signalQueue = this._signalQueue;
1293
+ if (eventQueue && signalQueue) {
1294
+ eventQueue.push({ event: { sessionEnd: {} } });
1295
+ signalQueue();
1296
+ }
1297
+ const closeSignal = this._closeSignal;
1298
+ if (closeSignal) {
1299
+ closeSignal();
1300
+ }
1301
+ if (this.inputStream) {
1302
+ this.inputStream.end();
1303
+ this.inputStream = void 0;
1304
+ }
1305
+ for (const stream of this.speakerStreams.values()) {
1306
+ stream.end();
1307
+ }
1308
+ this.speakerStreams.clear();
1309
+ const client = this.client;
1310
+ this.client = void 0;
1311
+ this.stream = void 0;
1312
+ if (client) {
1313
+ setImmediate(() => {
1314
+ if (typeof client.destroy === "function") {
1315
+ client.destroy();
1316
+ }
1317
+ });
1318
+ }
1319
+ this.log("Disconnected from AWS Bedrock Nova 2 Sonic");
1320
+ }
1321
+ /**
1322
+ * Equips the voice instance with a set of instructions.
1323
+ */
1324
+ addInstructions(instructions) {
1325
+ this.instructions = instructions;
1326
+ }
1327
+ /**
1328
+ * Equips the voice instance with a set of tools.
1329
+ */
1330
+ addTools(tools) {
1331
+ this.tools = tools || {};
1332
+ }
1333
+ /**
1334
+ * Convert text to speech
1335
+ */
1336
+ async speak(input, _options) {
1337
+ if (this.state !== "connected") {
1338
+ throw new NovaSonicError("not_connected" /* NOT_CONNECTED */, "Not connected. Call connect() first.");
1339
+ }
1340
+ let text = "";
1341
+ if (typeof input !== "string") {
1342
+ const chunks = [];
1343
+ for await (const chunk of input) {
1344
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
1345
+ }
1346
+ text = Buffer.concat(chunks).toString("utf-8");
1347
+ } else {
1348
+ text = input;
1349
+ }
1350
+ if (text.trim().length === 0) {
1351
+ throw new NovaSonicError("validation_error" /* VALIDATION_ERROR */, "Input text is empty");
1352
+ }
1353
+ this.currentResponseId = `response-${Date.now()}`;
1354
+ const speakerStream = new PassThrough();
1355
+ speakerStream.id = this.currentResponseId;
1356
+ this.speakerStreams.set(this.currentResponseId, speakerStream);
1357
+ this.emit("speaker", speakerStream);
1358
+ const promptName = this._promptName;
1359
+ if (!promptName) {
1360
+ throw new NovaSonicError(
1361
+ "not_connected" /* NOT_CONNECTED */,
1362
+ "Prompt name not initialized. Connection may not be fully established."
1363
+ );
1364
+ }
1365
+ if (!this.promptStarted) {
1366
+ throw new NovaSonicError(
1367
+ "invalid_state" /* INVALID_STATE */,
1368
+ "Prompt not started. This should not happen - prompt should be started during connection."
1369
+ );
1370
+ }
1371
+ const contentName = randomUUID();
1372
+ await this.sendClientEvent({
1373
+ contentStart: {
1374
+ promptName,
1375
+ contentName,
1376
+ type: "TEXT",
1377
+ interactive: true,
1378
+ role: "USER",
1379
+ textInputConfiguration: {
1380
+ mediaType: "text/plain"
1381
+ }
1382
+ }
1383
+ });
1384
+ await this.sendClientEvent({
1385
+ textInput: {
1386
+ promptName,
1387
+ contentName,
1388
+ content: text
1389
+ }
1390
+ });
1391
+ await this.sendClientEvent({
1392
+ contentEnd: {
1393
+ promptName,
1394
+ contentName
1395
+ }
1396
+ });
1397
+ }
1398
+ /**
1399
+ * Convert speech to text (transcription)
1400
+ * For Nova Sonic, this is the same as send() - both stream audio input
1401
+ */
1402
+ async listen(audioStream, _options) {
1403
+ if (audioStream && typeof audioStream === "object" && "read" in audioStream) {
1404
+ await this.send(audioStream);
1405
+ } else {
1406
+ throw new NovaSonicError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Unsupported audio stream format for listen()");
1407
+ }
1408
+ }
1409
+ /**
1410
+ * Streams audio data in real-time to the AWS Bedrock service.
1411
+ * Following AWS Nova 2 Sonic event sequence:
1412
+ * 1. contentStart (AUDIO, USER) - if not already sent
1413
+ * 2. audioInput events (one per chunk)
1414
+ * 3. contentEnd - when audio stream ends (handled separately via endAudioInput)
1415
+ */
1416
+ async send(audioData) {
1417
+ this.log(`[send] Current state: ${this.state}`);
1418
+ if (this.state !== "connected") {
1419
+ this.log(`[send] ERROR: State is '${this.state}', expected 'connected'`);
1420
+ throw new NovaSonicError(
1421
+ "not_connected" /* NOT_CONNECTED */,
1422
+ `Not connected. Current state: ${this.state}. Call connect() first.`
1423
+ );
1424
+ }
1425
+ this.log(`[send] State check passed, proceeding with send`);
1426
+ if (!(audioData instanceof Int16Array) && !(audioData && typeof audioData === "object" && "read" in audioData)) {
1427
+ throw new NovaSonicError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Unsupported audio data format");
1428
+ }
1429
+ if (this.turnCompleted || this.hasSentContentEnd) {
1430
+ this.log(
1431
+ `[send] Starting new turn - resetting flags. turnCompleted=${this.turnCompleted}, hasSentContentEnd=${this.hasSentContentEnd}.`
1432
+ );
1433
+ const needNewContent = this.hasSentContentEnd;
1434
+ this.turnCompleted = false;
1435
+ this.hasSentContentEnd = false;
1436
+ this.streamRestartAttempted = false;
1437
+ if (needNewContent) {
1438
+ this.audioContentStarted = false;
1439
+ this.log(`[send] contentEnd was previously sent - will create new audio content container`);
1440
+ }
1441
+ this.log(
1442
+ `[send] State reset: turnCompleted=false, hasSentContentEnd=false, audioContentStarted=${this.audioContentStarted}`
1443
+ );
1444
+ }
1445
+ if (!this.promptStarted) {
1446
+ this.promptStarted = true;
1447
+ }
1448
+ const promptName = this._promptName;
1449
+ if (!promptName) {
1450
+ throw new NovaSonicError(
1451
+ "not_connected" /* NOT_CONNECTED */,
1452
+ "Prompt name not initialized. Connection may not be fully established."
1453
+ );
1454
+ }
1455
+ if (!this.audioContentStarted) {
1456
+ const audioContentId = randomUUID();
1457
+ this.audioContentName = audioContentId;
1458
+ this.log(`[send] First audio send - sending AUDIO contentStart with contentName: ${audioContentId}`);
1459
+ await this.sendClientEvent({
1460
+ contentStart: {
1461
+ promptName,
1462
+ contentName: audioContentId,
1463
+ type: "AUDIO",
1464
+ interactive: true,
1465
+ role: "USER",
1466
+ audioInputConfiguration: {
1467
+ mediaType: "audio/lpcm",
1468
+ sampleRateHertz: 16e3,
1469
+ sampleSizeBits: 16,
1470
+ channelCount: 1,
1471
+ encoding: "base64",
1472
+ audioType: "SPEECH"
1473
+ }
1474
+ }
1475
+ });
1476
+ this.audioContentStarted = true;
1477
+ this.log(`[send] AUDIO contentStart sent, ready to stream audio`);
1478
+ } else {
1479
+ this.log(`[send] AUDIO contentStart already sent, sending audioInput chunks directly`);
1480
+ }
1481
+ if (!this.audioContentName) {
1482
+ throw new NovaSonicError("invalid_state" /* INVALID_STATE */, "Audio content name not initialized. This should not happen.");
1483
+ }
1484
+ const contentName = this.audioContentName;
1485
+ if (audioData instanceof Int16Array) {
1486
+ const buffer = Buffer.from(audioData.buffer, audioData.byteOffset, audioData.byteLength);
1487
+ const base64Audio = buffer.toString("base64");
1488
+ this.log(
1489
+ `[send] Sending audioInput chunk, size: ${buffer.length} bytes, contentName: ${contentName}, turnCompleted: ${this.turnCompleted}, hasSentContentEnd: ${this.hasSentContentEnd}, audioContentStarted: ${this.audioContentStarted}, state: ${this.state}`
1490
+ );
1491
+ if (this.state !== "connected") {
1492
+ this.log(`[send] ERROR: State changed to '${this.state}' during send!`);
1493
+ throw new NovaSonicError("not_connected" /* NOT_CONNECTED */, `Connection lost during send. State: ${this.state}`);
1494
+ }
1495
+ await this.sendClientEvent({
1496
+ audioInput: {
1497
+ promptName,
1498
+ contentName,
1499
+ content: base64Audio
1500
+ }
1501
+ });
1502
+ this.log(`[send] audioInput chunk sent successfully`);
1503
+ } else if (audioData && typeof audioData === "object" && "read" in audioData) {
1504
+ const stream = audioData;
1505
+ for await (const chunk of stream) {
1506
+ const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
1507
+ const base64Audio = buffer.toString("base64");
1508
+ this.log(
1509
+ `[send] Sending audioInput chunk from stream, size: ${buffer.length} bytes, contentName: ${contentName}, turnCompleted: ${this.turnCompleted}, hasSentContentEnd: ${this.hasSentContentEnd}`
1510
+ );
1511
+ await this.sendClientEvent({
1512
+ audioInput: {
1513
+ promptName,
1514
+ contentName,
1515
+ content: base64Audio
1516
+ }
1517
+ });
1518
+ }
1519
+ } else {
1520
+ throw new NovaSonicError("invalid_audio_format" /* INVALID_AUDIO_FORMAT */, "Unsupported audio data format");
1521
+ }
1522
+ }
1523
+ /**
1524
+ * End audio input stream (sends contentEnd for audio)
1525
+ * Call this when done sending audio chunks
1526
+ */
1527
+ async endAudioInput() {
1528
+ if (this.hasSentContentEnd) {
1529
+ this.log("[endAudioInput] contentEnd already sent for this turn, skipping");
1530
+ return;
1531
+ }
1532
+ if (this.turnCompleted) {
1533
+ this.log(
1534
+ "[endAudioInput] Turn already completed by AWS, skipping contentEnd. Resetting turnCompleted flag for next turn."
1535
+ );
1536
+ this.turnCompleted = false;
1537
+ this.hasSentContentEnd = false;
1538
+ return;
1539
+ }
1540
+ if (this.audioContentStarted && this.audioContentName && this._promptName) {
1541
+ const promptName = this._promptName;
1542
+ this.log("[endAudioInput] Sending contentEnd for audio input");
1543
+ await this.sendClientEvent({
1544
+ contentEnd: {
1545
+ promptName,
1546
+ contentName: this.audioContentName
1547
+ }
1548
+ });
1549
+ this.hasSentContentEnd = true;
1550
+ } else {
1551
+ this.log(
1552
+ "[endAudioInput] Cannot send contentEnd: audioContentStarted=" + this.audioContentStarted + ", audioContentName=" + this.audioContentName
1553
+ );
1554
+ }
1555
+ }
1556
+ /**
1557
+ * Register an event listener
1558
+ */
1559
+ on(event, callback) {
1560
+ if (!this.events[event]) {
1561
+ this.events[event] = [];
1562
+ }
1563
+ this.events[event].push(callback);
1564
+ }
1565
+ /**
1566
+ * Remove an event listener
1567
+ */
1568
+ off(event, callback) {
1569
+ if (!this.events[event]) {
1570
+ return;
1571
+ }
1572
+ const index = this.events[event].indexOf(callback);
1573
+ if (index !== -1) {
1574
+ this.events[event].splice(index, 1);
1575
+ }
1576
+ }
1577
+ /**
1578
+ * Emit an event with arguments
1579
+ */
1580
+ emit(event, data) {
1581
+ if (!this.events[event]) {
1582
+ this.log(`[NovaSonic] emit('${event}'): No listeners registered for this event`);
1583
+ return;
1584
+ }
1585
+ const listenerCount = this.events[event].length;
1586
+ this.log(`[NovaSonic] emit('${event}'): Calling ${listenerCount} listener(s)`);
1587
+ for (const callback of this.events[event]) {
1588
+ try {
1589
+ callback(data);
1590
+ this.log(`[NovaSonic] emit('${event}'): Successfully called one listener`);
1591
+ } catch (error) {
1592
+ this.log(`Error in event handler for ${event}:`, error);
1593
+ }
1594
+ }
1595
+ this.log(`[NovaSonic] emit('${event}'): Finished calling all ${listenerCount} listener(s)`);
1596
+ }
1597
+ /**
1598
+ * Get listener status
1599
+ */
1600
+ async getListener() {
1601
+ return { enabled: this.state === "connected" };
1602
+ }
1603
+ /**
1604
+ * Log helper
1605
+ */
1606
+ log(...args) {
1607
+ if (this.debug) {
1608
+ console.info("[NovaSonicVoice]", ...args);
1609
+ }
1610
+ }
1611
+ };
1612
+
1613
+ export { NovaSonicError, NovaSonicErrorCode, NovaSonicVoice };
1614
+ //# sourceMappingURL=index.js.map
1615
+ //# sourceMappingURL=index.js.map