@astropods/messaging 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,6 +45,9 @@ export interface AgentResponse {
45
45
  threadMetadata?: ThreadMetadata;
46
46
  error?: ErrorResponse;
47
47
  contextRequest?: ThreadHistoryRequest;
48
+ transcript?: Transcript;
49
+ audioConfig?: AudioStreamConfig;
50
+ audioChunk?: AudioChunk;
48
51
  }
49
52
  export interface StatusUpdate {
50
53
  status: 'THINKING' | 'SEARCHING' | 'GENERATING' | 'PROCESSING' | 'ANALYZING' | 'CUSTOM';
@@ -77,6 +80,11 @@ export interface ErrorResponse {
77
80
  details?: string;
78
81
  retryable?: boolean;
79
82
  }
83
/**
 * Transcript of the user's audio input, produced by the agent after STT.
 * Sent agent → platform so the UI can replace an "[audio]" placeholder
 * message with the actual spoken text (see sendTranscript).
 */
export interface Transcript {
    /** The transcribed text. */
    text: string;
    /** Optional: ID of the original "[audio]" user message to update. */
    messageId?: string;
    /** Optional: BCP-47 language tag detected by STT (e.g. "en-US"). */
    language?: string;
}
80
88
  export interface ThreadHistoryRequest {
81
89
  conversationId: string;
82
90
  maxMessages?: number;
@@ -129,11 +137,54 @@ export interface AgentConfig {
129
137
  systemPrompt: string;
130
138
  tools: AgentToolConfig[];
131
139
  }
140
/**
 * Supported audio encoding formats. Matches the AudioEncoding protobuf enum.
 *
 * Common sources:
 * - LINEAR16: Universal PCM baseline (any platform)
 * - MULAW: Twilio / telephony (G.711 mu-law, 8kHz)
 * - WEBM_OPUS: Browser MediaRecorder default
 * - AAC: iOS native recording
 */
export type AudioEncoding = 'LINEAR16' | 'MULAW' | 'OPUS' | 'MP3' | 'WEBM_OPUS' | 'OGG_OPUS' | 'FLAC' | 'AAC';
150
/**
 * Configuration sent at the start of an audio segment to describe the format.
 * Maps to the AudioStreamConfig protobuf message.
 */
export interface AudioStreamConfig {
    /** Codec of the subsequent AudioChunk bytes. */
    encoding: AudioEncoding;
    /** Sample rate in Hz: 8000 (telephony), 16000 (speech), 48000 (browser). */
    sampleRate: number;
    /** Channel count: 1 = mono (speech default), 2 = stereo. */
    channels: number;
    /** Optional BCP-47 language hint for STT, e.g. "en-US". */
    language?: string;
    /** Links this audio to an existing conversation. */
    conversationId: string;
    /** Optional source metadata, e.g. "browser", "twilio", "mobile", "upload". */
    source?: string;
}
162
/**
 * A chunk of raw audio bytes. Maps to the AudioChunk protobuf message.
 *
 * Chunks arrive sequentially during a segment. When done=true, the segment
 * is complete and the agent should run STT on the accumulated audio.
 */
export interface AudioChunk {
    /** Raw bytes in the encoding given by the preceding AudioStreamConfig. */
    data: Buffer | Uint8Array;
    /** Monotonic sequence number for ordering. */
    sequence?: number;
    /** true = end of segment; data may be empty on this final chunk. */
    done?: boolean;
}
173
/**
 * Maps an AudioEncoding to the filetype string expected by Mastra's voice.listen().
 *
 * Usage:
 *   const filetype = audioEncodingToFiletype(config.encoding);
 *   const transcript = await agent.voice.listen(audioStream, { filetype });
 */
export declare function audioEncodingToFiletype(encoding: AudioEncoding): string;
132
181
  export interface ConversationRequest {
133
182
  message?: Message;
134
183
  feedback?: any;
135
184
  agentConfig?: AgentConfig;
136
185
  agentResponse?: AgentResponse;
186
+ audioConfig?: AudioStreamConfig;
187
+ audio?: AudioChunk;
137
188
  }
138
189
  export interface ReconnectOptions {
139
190
  /** Maximum number of reconnect attempts. Default: Infinity */
@@ -244,6 +295,61 @@ export declare class ConversationStream extends EventEmitter {
244
295
  * Send a status update for a conversation
245
296
  */
246
297
  sendStatusUpdate(conversationId: string, status: StatusUpdate): void;
298
    /**
     * Send a transcript of the user's audio input back to the platform.
     *
     * After the agent runs STT on the audio, it calls this to send the transcribed
     * text back to the platform (web adapter). The platform uses it to replace the
     * "[audio]" placeholder message with the actual spoken text in the chat UI.
     *
     * @param conversationId - The conversation this transcript belongs to
     * @param text - The transcribed text from STT
     * @param messageId - Optional: the original "[audio]" message ID to update
     * @param language - Optional: BCP-47 language detected by STT (e.g. "en-US")
     */
    sendTranscript(conversationId: string, text: string, messageId?: string, language?: string): void;
    /**
     * Send an audio stream config through the bidi stream.
     * Must be called before sendAudioChunk() so the receiver knows the encoding.
     */
    sendAudioConfig(config: AudioStreamConfig): void;
    /**
     * Send a raw audio chunk through the bidi stream.
     * The chunk's sequence number should increase monotonically.
     */
    sendAudioChunk(chunk: AudioChunk): void;
    /**
     * Signal end of the current audio segment by sending an empty chunk with done=true.
     * The receiver should process all accumulated audio (e.g. run STT).
     * After this, more audio can follow — either new config or more chunks.
     */
    endAudio(): void;
    /**
     * Converts incoming audioChunk events into a Web Streams API ReadableStream.
     *
     * This is the primary integration point with Mastra's voice system. The agent
     * listens for the 'audioConfig' event to know the format, then calls this
     * method to get a stream it can pass directly to voice.listen():
     *
     * ```typescript
     * conversation.on('audioConfig', async (config) => {
     *   const audioStream = conversation.audioAsReadable();
     *   const filetype = audioEncodingToFiletype(config.encoding);
     *   const transcript = await agent.voice.listen(audioStream, { filetype });
     *   // ... process transcript
     * });
     * ```
     *
     * The ReadableStream:
     * - Yields Uint8Array chunks as audioChunk events arrive
     * - Closes when an AudioChunk with done=true arrives (end of segment)
     * - Closes when the ConversationStream emits 'end' (intentional close)
     * - Errors when the ConversationStream emits 'error'
     * - Properly cleans up all event listeners on close, error, or cancel
     *
     * @returns A ReadableStream<Uint8Array> suitable for Mastra voice.listen()
     */
    audioAsReadable(): ReadableStream<Uint8Array>;
247
353
  /**
248
354
  * End the stream intentionally. Emits 'end' and prevents any further reconnects.
249
355
  */
@@ -34,10 +34,31 @@ var __importStar = (this && this.__importStar) || (function () {
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
36
  exports.Helpers = exports.MessageStream = exports.ConversationStream = exports.MessagingClient = void 0;
37
+ exports.audioEncodingToFiletype = audioEncodingToFiletype;
37
38
  const grpc = __importStar(require("@grpc/grpc-js"));
38
39
  const protoLoader = __importStar(require("@grpc/proto-loader"));
39
40
  const path_1 = require("path");
40
41
  const events_1 = require("events");
42
/**
 * Maps an AudioEncoding to the filetype string expected by Mastra's voice.listen().
 *
 * Usage:
 *   const filetype = audioEncodingToFiletype(config.encoding);
 *   const transcript = await agent.voice.listen(audioStream, { filetype });
 */
function audioEncodingToFiletype(encoding) {
    switch (encoding) {
        case 'LINEAR16':
        case 'MULAW':
            // Both raw PCM and G.711 mu-law are handed off as WAV.
            return 'wav';
        case 'OPUS':
            return 'opus';
        case 'MP3':
            return 'mp3';
        case 'WEBM_OPUS':
            return 'webm';
        case 'OGG_OPUS':
            return 'ogg';
        case 'FLAC':
            return 'flac';
        case 'AAC':
            return 'm4a';
        default:
            // Unknown encodings fall back to the universal WAV baseline.
            return 'wav';
    }
}
41
62
  // gRPC status codes: DEADLINE_EXCEEDED=4, INTERNAL=13, UNAVAILABLE=14, RESOURCE_EXHAUSTED=8
42
63
  const DEFAULT_RETRYABLE_STATUS_CODES = [4, 8, 13, 14];
43
64
  function resolveReconnectOptions(options) {
@@ -234,6 +255,17 @@ class ConversationStream extends events_1.EventEmitter {
234
255
  attachHandlers(stream) {
235
256
  stream.on('data', (response) => {
236
257
  this.retryCount = 0;
258
+ // Emit audio-specific events if present.
259
+ // The server sends audio data through the bidi stream as AgentResponse
260
+ // messages with audioConfig or audioChunk payloads. We emit dedicated
261
+ // events for these so the agent can handle audio separately from text,
262
+ // while still emitting the generic 'response' event for observability.
263
+ if (response.audioConfig) {
264
+ this.emit('audioConfig', response.audioConfig);
265
+ }
266
+ else if (response.audioChunk) {
267
+ this.emit('audioChunk', response.audioChunk);
268
+ }
237
269
  this.emit('response', response);
238
270
  });
239
271
  stream.on('error', (error) => {
@@ -345,6 +377,131 @@ class ConversationStream extends events_1.EventEmitter {
345
377
  status,
346
378
  });
347
379
  }
380
+ /**
381
+ * Send a transcript of the user's audio input back to the platform.
382
+ *
383
+ * After the agent runs STT on the audio, it calls this to send the transcribed
384
+ * text back to the platform (web adapter). The platform uses it to replace the
385
+ * "[audio]" placeholder message with the actual spoken text in the chat UI.
386
+ *
387
+ * @param conversationId - The conversation this transcript belongs to
388
+ * @param text - The transcribed text from STT
389
+ * @param messageId - Optional: the original "[audio]" message ID to update
390
+ * @param language - Optional: BCP-47 language detected by STT (e.g. "en-US")
391
+ */
392
+ sendTranscript(conversationId, text, messageId, language) {
393
+ this.sendAgentResponse({
394
+ conversationId,
395
+ transcript: { text, messageId, language },
396
+ });
397
+ }
398
+ // --- Audio support ---
399
+ //
400
+ // These methods handle sending audio data through the gRPC bidi stream.
401
+ // Two directions:
402
+ // - Agent → Server (sendAudioConfig/sendAudioChunk/endAudio): used when the
403
+ // agent needs to forward audio upstream (less common)
404
+ // - Server → Agent (audioConfig/audioChunk events + audioAsReadable): the main
405
+ // path where the server forwards client mic audio to the agent for STT
406
+ /**
407
+ * Send an audio stream config through the bidi stream.
408
+ * Must be called before sendAudioChunk() so the receiver knows the encoding.
409
+ */
410
+ sendAudioConfig(config) {
411
+ this.write({ audioConfig: config });
412
+ }
413
+ /**
414
+ * Send a raw audio chunk through the bidi stream.
415
+ * The chunk's sequence number should increase monotonically.
416
+ */
417
+ sendAudioChunk(chunk) {
418
+ this.write({ audio: chunk });
419
+ }
420
+ /**
421
+ * Signal end of the current audio segment by sending an empty chunk with done=true.
422
+ * The receiver should process all accumulated audio (e.g. run STT).
423
+ * After this, more audio can follow — either new config or more chunks.
424
+ */
425
+ endAudio() {
426
+ this.write({ audio: { data: Buffer.alloc(0), done: true } });
427
+ }
428
+ /**
429
+ * Converts incoming audioChunk events into a Web Streams API ReadableStream.
430
+ *
431
+ * This is the primary integration point with Mastra's voice system. The agent
432
+ * listens for the 'audioConfig' event to know the format, then calls this
433
+ * method to get a stream it can pass directly to voice.listen():
434
+ *
435
+ * ```typescript
436
+ * conversation.on('audioConfig', async (config) => {
437
+ * const audioStream = conversation.audioAsReadable();
438
+ * const filetype = audioEncodingToFiletype(config.encoding);
439
+ * const transcript = await agent.voice.listen(audioStream, { filetype });
440
+ * // ... process transcript
441
+ * });
442
+ * ```
443
+ *
444
+ * The ReadableStream:
445
+ * - Yields Uint8Array chunks as audioChunk events arrive
446
+ * - Closes when an AudioChunk with done=true arrives (end of segment)
447
+ * - Closes when the ConversationStream emits 'end' (intentional close)
448
+ * - Errors when the ConversationStream emits 'error'
449
+ * - Properly cleans up all event listeners on close, error, or cancel
450
+ *
451
+ * @returns A ReadableStream<Uint8Array> suitable for Mastra voice.listen()
452
+ */
453
+ audioAsReadable() {
454
+ // Centralized cleanup to prevent listener leaks. Called on:
455
+ // - done=true chunk (normal completion)
456
+ // - stream 'end' event (intentional close)
457
+ // - stream 'error' event
458
+ // - ReadableStream cancel() (consumer gave up, e.g. reader.cancel())
459
+ const cleanup = () => {
460
+ this.removeListener('audioChunk', onChunk);
461
+ this.removeListener('end', onEnd);
462
+ this.removeListener('error', onError);
463
+ };
464
+ const onChunk = (chunk) => {
465
+ if (chunk.done) {
466
+ cleanup();
467
+ try {
468
+ controller.close();
469
+ }
470
+ catch { }
471
+ }
472
+ else {
473
+ controller.enqueue(new Uint8Array(chunk.data));
474
+ }
475
+ };
476
+ const onEnd = () => {
477
+ cleanup();
478
+ try {
479
+ controller.close();
480
+ }
481
+ catch { }
482
+ };
483
+ const onError = (err) => {
484
+ cleanup();
485
+ try {
486
+ controller.error(err);
487
+ }
488
+ catch { }
489
+ };
490
+ let controller;
491
+ return new ReadableStream({
492
+ start: (ctrl) => {
493
+ controller = ctrl;
494
+ this.on('audioChunk', onChunk);
495
+ this.once('end', onEnd);
496
+ this.once('error', onError);
497
+ },
498
+ cancel: () => {
499
+ // Consumer cancelled (e.g. reader.cancel()) — remove all listeners
500
+ // to prevent memory leaks
501
+ cleanup();
502
+ },
503
+ });
504
+ }
348
505
  /**
349
506
  * End the stream intentionally. Emits 'end' and prevents any further reconnects.
350
507
  */
@@ -0,0 +1,71 @@
1
+ // Audio streaming types for the messaging system.
2
+ //
3
+ // These types enable raw audio input from any frontend (browser, phone, mobile app)
4
+ // to be streamed through the messaging server to an agent. The messaging system is
5
+ // a pass-through — it does NOT perform speech-to-text, transcoding, or voice activity
6
+ // detection. The agent handles STT via Mastra's voice provider abstraction.
7
+ //
8
+ // Data flow:
9
+ // Client (mic) → WebSocket → Server → gRPC (these types) → Agent → Mastra voice.listen()
10
+ //
11
+ // Two ways audio enters the system:
12
+ // 1. ProcessAudioStream RPC: dedicated audio-only streaming (AudioStreamRequest)
13
+ // 2. ProcessConversation RPC: audio mixed into the bidi stream (ConversationRequest.audio_config/audio)
14
+ //
15
+ // Both converge on the same types: AudioStreamConfig describes the format,
16
+ // AudioChunk carries the bytes, and done=true signals end of an utterance.
17
+
18
+ syntax = "proto3";
19
+
20
+ package astro.messaging.v1;
21
+
22
+ option go_package = "github.com/postman/astro/messaging/v1;messagingv1";
23
+
24
// Audio encoding format — covers browser, telephony, and mobile sources.
// The agent uses this to configure the STT provider (e.g. Whisper, Deepgram).
enum AudioEncoding {
  AUDIO_ENCODING_UNSPECIFIED = 0;
  LINEAR16 = 1;   // PCM signed 16-bit little-endian — universal baseline, any platform
  MULAW = 2;      // G.711 mu-law — Twilio and traditional telephony (8kHz)
  OPUS = 3;       // Raw Opus frames — low-latency codec
  MP3 = 4;        // MP3 — batch uploads, pre-recorded audio
  WEBM_OPUS = 5;  // WebM container with Opus — browser MediaRecorder default
  OGG_OPUS = 6;   // OGG container with Opus — Firefox MediaRecorder
  FLAC = 7;       // FLAC lossless — high-quality uploads
  AAC = 8;        // AAC — iOS native recording
}
37
+
38
// Sent once at the start of an audio segment to tell the agent what format
// the subsequent AudioChunk bytes are in. Without this, the agent can't
// decode the raw bytes.
message AudioStreamConfig {
  AudioEncoding encoding = 1;  // What codec the audio bytes use
  int32 sample_rate = 2;       // Hz: 8000 (telephony), 16000 (speech), 48000 (browser)
  int32 channels = 3;          // 1 = mono (speech default), 2 = stereo
  string language = 4;         // BCP-47 hint for STT, e.g. "en-US" (optional)
  string conversation_id = 5;  // Links this audio to an existing conversation

  // Source metadata — helps the agent pick the right STT config.
  // Examples: "browser", "twilio", "vonage", "mobile", "upload"
  string source = 6;
}
52
+
53
// A chunk of raw audio bytes in the encoding specified by AudioStreamConfig.
//
// Chunks arrive in order during a segment. When done=true, the segment is
// complete and the agent should run STT on all accumulated chunks.
// The data field may be empty on the final done=true chunk.
message AudioChunk {
  bytes data = 1;     // Raw audio bytes (pass-through, no transcoding)
  int64 sequence = 2; // Monotonic sequence number for ordering
  bool done = 3;      // true = end of segment, process accumulated audio
}
63
+
64
// Wrapper for the ProcessAudioStream RPC (dedicated audio streaming).
// The first message MUST be config, all subsequent messages are audio chunks.
message AudioStreamRequest {
  oneof request {
    AudioStreamConfig config = 1; // First message: tells agent the audio format
    AudioChunk audio = 2;         // Subsequent: raw audio bytes
  }
}
@@ -4,6 +4,7 @@ package astro.messaging.v1;
4
4
 
5
5
  import "google/protobuf/timestamp.proto";
6
6
  import "astro/messaging/v1/message.proto";
7
+ import "astro/messaging/v1/audio.proto";
7
8
 
8
9
  option go_package = "github.com/postman/astro/messaging/v1;messagingv1";
9
10
 
@@ -22,6 +23,9 @@ message AgentResponse {
22
23
  ThreadMetadata thread_metadata = 7; // Thread title, creation
23
24
  ErrorResponse error = 8; // Error during processing
24
25
  ThreadHistoryRequest context_request = 9; // Request cached context (optional)
26
+ Transcript transcript = 10; // Audio transcript (agent → platform)
27
+ AudioStreamConfig audio_config = 11; // Audio session config (server → agent)
28
+ AudioChunk audio_chunk = 12; // Audio data (server → agent)
25
29
  }
26
30
  }
27
31
 
@@ -144,6 +148,14 @@ message ThreadMetadata {
144
148
  bool create_new = 3; // Create new thread
145
149
  }
146
150
 
151
+ // Transcript of user audio input (agent → platform after STT)
152
+ // Used to update a placeholder message with the actual transcribed text
153
+ message Transcript {
154
+ string text = 1; // Transcribed text
155
+ string message_id = 2; // User message ID to update (optional)
156
+ string language = 3; // Detected language BCP-47 (optional)
157
+ }
158
+
147
159
  // Error response from agent
148
160
  message ErrorResponse {
149
161
  enum ErrorCode {
@@ -5,6 +5,7 @@ package astro.messaging.v1;
5
5
  import "astro/messaging/v1/message.proto";
6
6
  import "astro/messaging/v1/response.proto";
7
7
  import "astro/messaging/v1/feedback.proto";
8
+ import "astro/messaging/v1/audio.proto";
8
9
  import "astro/messaging/v1/config.proto";
9
10
  import "google/protobuf/timestamp.proto";
10
11
 
@@ -29,6 +30,11 @@ service AgentMessaging {
29
30
  rpc GetConversationMetadata(ConversationMetadataRequest)
30
31
  returns (ConversationMetadataResponse);
31
32
 
33
+ // Audio: client streams raw audio, server responds with text
34
+ // First message MUST be AudioStreamConfig, rest are AudioChunks
35
+ rpc ProcessAudioStream(stream AudioStreamRequest)
36
+ returns (stream AgentResponse);
37
+
32
38
  // Health check
33
39
  rpc HealthCheck(HealthCheckRequest)
34
40
  returns (HealthCheckResponse);
@@ -41,6 +47,8 @@ message ConversationRequest {
41
47
  PlatformFeedback feedback = 2;
42
48
  AgentConfig agent_config = 3;
43
49
  AgentResponse agent_response = 4;
50
+ AudioStreamConfig audio_config = 5; // Start audio within conversation
51
+ AudioChunk audio = 6; // Audio data within conversation
44
52
  }
45
53
  }
46
54
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@astropods/messaging",
3
3
  "license": "Apache-2.0",
4
- "version": "0.0.2",
4
+ "version": "0.0.3",
5
5
  "description": "TypeScript SDK for Astro Messaging",
6
6
  "main": "dist/index.js",
7
7
  "types": "dist/index.d.ts",
@@ -10,7 +10,7 @@
10
10
  ],
11
11
  "scripts": {
12
12
  "postinstall": "rm -rf proto && ln -sf ../../proto proto",
13
- "build": "tsc && cp -r ../../proto dist/proto",
13
+ "build": "tsc && rm -rf dist/proto && cp -r ../../proto dist/proto",
14
14
  "watch": "tsc --watch",
15
15
  "test": "bun test",
16
16
  "test:watch": "bun test --watch"