@astropods/messaging 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -45,6 +45,9 @@ export interface AgentResponse {
|
|
|
45
45
|
threadMetadata?: ThreadMetadata;
|
|
46
46
|
error?: ErrorResponse;
|
|
47
47
|
contextRequest?: ThreadHistoryRequest;
|
|
48
|
+
transcript?: Transcript;
|
|
49
|
+
audioConfig?: AudioStreamConfig;
|
|
50
|
+
audioChunk?: AudioChunk;
|
|
48
51
|
}
|
|
49
52
|
export interface StatusUpdate {
|
|
50
53
|
status: 'THINKING' | 'SEARCHING' | 'GENERATING' | 'PROCESSING' | 'ANALYZING' | 'CUSTOM';
|
|
@@ -77,6 +80,11 @@ export interface ErrorResponse {
|
|
|
77
80
|
details?: string;
|
|
78
81
|
retryable?: boolean;
|
|
79
82
|
}
|
|
83
|
+
export interface Transcript {
|
|
84
|
+
text: string;
|
|
85
|
+
messageId?: string;
|
|
86
|
+
language?: string;
|
|
87
|
+
}
|
|
80
88
|
export interface ThreadHistoryRequest {
|
|
81
89
|
conversationId: string;
|
|
82
90
|
maxMessages?: number;
|
|
@@ -129,11 +137,68 @@ export interface AgentConfig {
|
|
|
129
137
|
systemPrompt: string;
|
|
130
138
|
tools: AgentToolConfig[];
|
|
131
139
|
}
|
|
140
|
+
/**
|
|
141
|
+
* Supported audio encoding formats. Matches the AudioEncoding protobuf enum.
|
|
142
|
+
*
|
|
143
|
+
* Common sources:
|
|
144
|
+
* - LINEAR16: Universal PCM baseline (any platform)
|
|
145
|
+
* - MULAW: Twilio / telephony (G.711 mu-law, 8kHz)
|
|
146
|
+
* - WEBM_OPUS: Browser MediaRecorder default
|
|
147
|
+
* - AAC: iOS native recording
|
|
148
|
+
*/
|
|
149
|
+
export type AudioEncoding = 'LINEAR16' | 'MULAW' | 'OPUS' | 'MP3' | 'WEBM_OPUS' | 'OGG_OPUS' | 'FLAC' | 'AAC';
|
|
150
|
+
/**
|
|
151
|
+
* Configuration sent at the start of an audio segment to describe the format.
|
|
152
|
+
* Maps to the AudioStreamConfig protobuf message.
|
|
153
|
+
*/
|
|
154
|
+
export interface AudioStreamConfig {
|
|
155
|
+
encoding: AudioEncoding;
|
|
156
|
+
sampleRate: number;
|
|
157
|
+
channels: number;
|
|
158
|
+
language?: string;
|
|
159
|
+
conversationId: string;
|
|
160
|
+
source?: string;
|
|
161
|
+
}
|
|
162
|
+
/**
|
|
163
|
+
* A chunk of raw audio bytes. Maps to the AudioChunk protobuf message.
|
|
164
|
+
*
|
|
165
|
+
* Chunks arrive sequentially during a segment. When done=true, the segment
|
|
166
|
+
* is complete and the agent should run STT on the accumulated audio.
|
|
167
|
+
*/
|
|
168
|
+
export interface AudioChunk {
|
|
169
|
+
data: Buffer | Uint8Array;
|
|
170
|
+
sequence?: number;
|
|
171
|
+
done?: boolean;
|
|
172
|
+
}
|
|
173
|
+
/**
|
|
174
|
+
* Maps an AudioEncoding to the filetype string expected by Mastra's voice.listen().
|
|
175
|
+
*
|
|
176
|
+
* Usage:
|
|
177
|
+
* const filetype = audioEncodingToFiletype(config.encoding);
|
|
178
|
+
* const transcript = await agent.voice.listen(audioStream, { filetype });
|
|
179
|
+
*/
|
|
180
|
+
export declare function audioEncodingToFiletype(encoding: AudioEncoding): string;
|
|
132
181
|
export interface ConversationRequest {
|
|
133
182
|
message?: Message;
|
|
134
183
|
feedback?: any;
|
|
135
184
|
agentConfig?: AgentConfig;
|
|
136
185
|
agentResponse?: AgentResponse;
|
|
186
|
+
audioConfig?: AudioStreamConfig;
|
|
187
|
+
audio?: AudioChunk;
|
|
188
|
+
}
|
|
189
|
+
/**
 * Retry/backoff settings accepted by connectWithRetry() and
 * createConversationStream(). Every field is optional; the stated default
 * applies when a field is omitted.
 */
export interface ReconnectOptions {
|
|
190
|
+
/** Maximum number of reconnect attempts. Default: Infinity */
|
|
191
|
+
maxRetries?: number;
|
|
192
|
+
/** Initial delay before first retry in ms; doubles with each further retry. Default: 500 */
|
|
193
|
+
initialDelayMs?: number;
|
|
194
|
+
/** Maximum delay between retries in ms (cap on the doubled delay). Default: 30_000 */
|
|
195
|
+
maxDelayMs?: number;
|
|
196
|
+
/** Randomize each backoff delay to between 50% and 100% of its computed value. Default: true */
|
|
197
|
+
jitter?: boolean;
|
|
198
|
+
/** Maximum number of writes to queue during reconnect; the oldest queued write is dropped when full. Default: 1000 */
|
|
199
|
+
maxBufferSize?: number;
|
|
200
|
+
/** gRPC status codes that trigger a reconnect attempt. Default: UNAVAILABLE, DEADLINE_EXCEEDED, INTERNAL, RESOURCE_EXHAUSTED */
|
|
201
|
+
retryableStatusCodes?: number[];
|
|
|
137
202
|
}
|
|
138
203
|
/**
|
|
139
204
|
* MessagingClient provides a TypeScript interface to the Astro Messaging gRPC service
|
|
@@ -149,9 +214,14 @@ export declare class MessagingClient extends EventEmitter {
|
|
|
149
214
|
*/
|
|
150
215
|
connect(): Promise<void>;
|
|
151
216
|
/**
|
|
152
|
-
*
|
|
217
|
+
* Connect with automatic retry on failure (exponential backoff).
|
|
218
|
+
* Emits 'reconnecting' before each retry and 'reconnected' on success after failures.
|
|
153
219
|
*/
|
|
154
|
-
|
|
220
|
+
connectWithRetry(options?: ReconnectOptions): Promise<void>;
|
|
221
|
+
/**
|
|
222
|
+
* Create a bidirectional conversation stream with optional reconnect support
|
|
223
|
+
*/
|
|
224
|
+
createConversationStream(options?: ReconnectOptions): ConversationStream;
|
|
155
225
|
/**
|
|
156
226
|
* Process a single message (server-side streaming)
|
|
157
227
|
*/
|
|
@@ -176,11 +246,31 @@ export declare class MessagingClient extends EventEmitter {
|
|
|
176
246
|
close(): void;
|
|
177
247
|
}
|
|
178
248
|
/**
|
|
179
|
-
* ConversationStream wraps a bidirectional gRPC stream
|
|
249
|
+
* ConversationStream wraps a bidirectional gRPC stream with automatic reconnection.
|
|
250
|
+
*
|
|
251
|
+
* Events:
|
|
252
|
+
* - 'response' — AgentResponse received from server
|
|
253
|
+
* - 'reconnecting' — { attempt, reason, delayMs } — before each retry delay
|
|
254
|
+
* - 'reconnected' — { attempt } — after a successful stream recreation
|
|
255
|
+
* - 'error' — non-retryable error OR max retries exceeded
|
|
256
|
+
* - 'end' — only on intentional close(), not on unexpected stream drop
|
|
180
257
|
*/
|
|
181
258
|
export declare class ConversationStream extends EventEmitter {
|
|
259
|
+
private streamFactory;
|
|
182
260
|
private stream;
|
|
183
|
-
|
|
261
|
+
private writeBuffer;
|
|
262
|
+
private reconnecting;
|
|
263
|
+
private closed;
|
|
264
|
+
private retryCount;
|
|
265
|
+
private readonly opts;
|
|
266
|
+
constructor(streamFactory: () => any, options?: ReconnectOptions);
|
|
267
|
+
private attachHandlers;
|
|
268
|
+
private isRetryable;
|
|
269
|
+
private calculateDelay;
|
|
270
|
+
private scheduleReconnect;
|
|
271
|
+
private doReconnect;
|
|
272
|
+
private flushBuffer;
|
|
273
|
+
private write;
|
|
184
274
|
/**
|
|
185
275
|
* Send a message through the stream
|
|
186
276
|
*/
|
|
@@ -206,7 +296,62 @@ export declare class ConversationStream extends EventEmitter {
|
|
|
206
296
|
*/
|
|
207
297
|
sendStatusUpdate(conversationId: string, status: StatusUpdate): void;
|
|
208
298
|
/**
|
|
209
|
-
*
|
|
299
|
+
* Send a transcript of the user's audio input back to the platform.
|
|
300
|
+
*
|
|
301
|
+
* After the agent runs STT on the audio, it calls this to send the transcribed
|
|
302
|
+
* text back to the platform (web adapter). The platform uses it to replace the
|
|
303
|
+
* "[audio]" placeholder message with the actual spoken text in the chat UI.
|
|
304
|
+
*
|
|
305
|
+
* @param conversationId - The conversation this transcript belongs to
|
|
306
|
+
* @param text - The transcribed text from STT
|
|
307
|
+
* @param messageId - Optional: the original "[audio]" message ID to update
|
|
308
|
+
* @param language - Optional: BCP-47 language detected by STT (e.g. "en-US")
|
|
309
|
+
*/
|
|
310
|
+
sendTranscript(conversationId: string, text: string, messageId?: string, language?: string): void;
|
|
311
|
+
/**
|
|
312
|
+
* Send an audio stream config through the bidi stream.
|
|
313
|
+
* Must be called before sendAudioChunk() so the receiver knows the encoding.
|
|
314
|
+
*/
|
|
315
|
+
sendAudioConfig(config: AudioStreamConfig): void;
|
|
316
|
+
/**
|
|
317
|
+
* Send a raw audio chunk through the bidi stream.
|
|
318
|
+
* The chunk's sequence number should increase monotonically.
|
|
319
|
+
*/
|
|
320
|
+
sendAudioChunk(chunk: AudioChunk): void;
|
|
321
|
+
/**
|
|
322
|
+
* Signal end of the current audio segment by sending an empty chunk with done=true.
|
|
323
|
+
* The receiver should process all accumulated audio (e.g. run STT).
|
|
324
|
+
* After this, more audio can follow — either new config or more chunks.
|
|
325
|
+
*/
|
|
326
|
+
endAudio(): void;
|
|
327
|
+
/**
|
|
328
|
+
* Converts incoming audioChunk events into a Web Streams API ReadableStream.
|
|
329
|
+
*
|
|
330
|
+
* This is the primary integration point with Mastra's voice system. The agent
|
|
331
|
+
* listens for the 'audioConfig' event to know the format, then calls this
|
|
332
|
+
* method to get a stream it can pass directly to voice.listen():
|
|
333
|
+
*
|
|
334
|
+
* ```typescript
|
|
335
|
+
* conversation.on('audioConfig', async (config) => {
|
|
336
|
+
* const audioStream = conversation.audioAsReadable();
|
|
337
|
+
* const filetype = audioEncodingToFiletype(config.encoding);
|
|
338
|
+
* const transcript = await agent.voice.listen(audioStream, { filetype });
|
|
339
|
+
* // ... process transcript
|
|
340
|
+
* });
|
|
341
|
+
* ```
|
|
342
|
+
*
|
|
343
|
+
* The ReadableStream:
|
|
344
|
+
* - Yields Uint8Array chunks as audioChunk events arrive
|
|
345
|
+
* - Closes when an AudioChunk with done=true arrives (end of segment)
|
|
346
|
+
* - Closes when the ConversationStream emits 'end' (intentional close)
|
|
347
|
+
* - Errors when the ConversationStream emits 'error'
|
|
348
|
+
* - Properly cleans up all event listeners on close, error, or cancel
|
|
349
|
+
*
|
|
350
|
+
* @returns A ReadableStream<Uint8Array> suitable for Mastra voice.listen()
|
|
351
|
+
*/
|
|
352
|
+
audioAsReadable(): ReadableStream<Uint8Array>;
|
|
353
|
+
/**
|
|
354
|
+
* End the stream intentionally. Emits 'end' and prevents any further reconnects.
|
|
210
355
|
*/
|
|
211
356
|
end(): void;
|
|
212
357
|
}
|
package/dist/messaging-client.js
CHANGED
|
@@ -34,10 +34,43 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
34
34
|
})();
|
|
35
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
36
|
exports.Helpers = exports.MessageStream = exports.ConversationStream = exports.MessagingClient = void 0;
|
|
37
|
+
exports.audioEncodingToFiletype = audioEncodingToFiletype;
|
|
37
38
|
const grpc = __importStar(require("@grpc/grpc-js"));
|
|
38
39
|
const protoLoader = __importStar(require("@grpc/proto-loader"));
|
|
39
40
|
const path_1 = require("path");
|
|
40
41
|
const events_1 = require("events");
|
|
42
|
+
/**
 * Maps an AudioEncoding to the filetype string expected by Mastra's voice.listen().
 *
 * Any value that is not one of the known encoding names (including `undefined`
 * and names that only exist on Object.prototype, such as 'constructor') maps
 * to the 'wav' fallback.
 *
 * Usage:
 *   const filetype = audioEncodingToFiletype(config.encoding);
 *   const transcript = await agent.voice.listen(audioStream, { filetype });
 *
 * @param encoding - AudioEncoding name, e.g. 'LINEAR16' or 'WEBM_OPUS'.
 * @returns The filetype string for the encoding, or 'wav' when it is unknown.
 */
function audioEncodingToFiletype(encoding) {
    const map = {
        LINEAR16: 'wav',
        MULAW: 'wav',
        OPUS: 'opus',
        MP3: 'mp3',
        WEBM_OPUS: 'webm',
        OGG_OPUS: 'ogg',
        FLAC: 'flac',
        AAC: 'm4a',
    };
    // Look at own keys only: a bare `map[encoding]` would also resolve inherited
    // members (map['constructor'] is a function), which `??` does not replace.
    return Object.prototype.hasOwnProperty.call(map, encoding) ? map[encoding] : 'wav';
}
|
|
62
|
+
// gRPC status codes: DEADLINE_EXCEEDED=4, RESOURCE_EXHAUSTED=8, INTERNAL=13, UNAVAILABLE=14
const DEFAULT_RETRYABLE_STATUS_CODES = [4, 8, 13, 14];
/**
 * Fills in the default for every reconnect option the caller left unset.
 *
 * Defaults: maxRetries Infinity, initialDelayMs 500, maxDelayMs 30000,
 * jitter true, maxBufferSize 1000, retryableStatusCodes [4, 8, 13, 14].
 *
 * @param options - Partial reconnect options; `null`/`undefined` means "all defaults".
 * @returns A new object with every option populated. `retryableStatusCodes` is
 *          always a fresh array, so mutating the result never changes the shared
 *          default list or the array the caller passed in.
 */
function resolveReconnectOptions(options) {
    // Tolerate a missing options bag instead of throwing on property access.
    const given = options ?? {};
    return {
        maxRetries: given.maxRetries ?? Infinity,
        initialDelayMs: given.initialDelayMs ?? 500,
        maxDelayMs: given.maxDelayMs ?? 30000,
        jitter: given.jitter ?? true,
        maxBufferSize: given.maxBufferSize ?? 1000,
        // Copy: handing out DEFAULT_RETRYABLE_STATUS_CODES itself would let one
        // consumer's push()/splice() silently change the defaults for everyone.
        retryableStatusCodes: [...(given.retryableStatusCodes ?? DEFAULT_RETRYABLE_STATUS_CODES)],
    };
}
|
|
41
74
|
/**
|
|
42
75
|
* MessagingClient provides a TypeScript interface to the Astro Messaging gRPC service
|
|
43
76
|
*/
|
|
@@ -45,6 +78,7 @@ class MessagingClient extends events_1.EventEmitter {
|
|
|
45
78
|
constructor(serverAddress) {
|
|
46
79
|
super();
|
|
47
80
|
this.serverAddress = serverAddress;
|
|
81
|
+
this.conversationStream = null;
|
|
48
82
|
this.isConnected = false;
|
|
49
83
|
}
|
|
50
84
|
/**
|
|
@@ -67,14 +101,42 @@ class MessagingClient extends events_1.EventEmitter {
|
|
|
67
101
|
this.emit('connected');
|
|
68
102
|
}
|
|
69
103
|
/**
|
|
70
|
-
*
|
|
104
|
+
* Connect with automatic retry on failure (exponential backoff).
|
|
105
|
+
* Emits 'reconnecting' before each retry and 'reconnected' on success after failures.
|
|
106
|
+
*/
|
|
107
|
+
async connectWithRetry(options = {}) {
|
|
108
|
+
const opts = resolveReconnectOptions(options);
|
|
109
|
+
let retryCount = 0;
|
|
110
|
+
while (true) {
|
|
111
|
+
try {
|
|
112
|
+
await this.connect();
|
|
113
|
+
if (retryCount > 0) {
|
|
114
|
+
this.emit('reconnected', { attempt: retryCount });
|
|
115
|
+
}
|
|
116
|
+
return;
|
|
117
|
+
}
|
|
118
|
+
catch (err) {
|
|
119
|
+
if (retryCount >= opts.maxRetries) {
|
|
120
|
+
throw err;
|
|
121
|
+
}
|
|
122
|
+
const base = Math.min(opts.initialDelayMs * Math.pow(2, retryCount), opts.maxDelayMs);
|
|
123
|
+
const delayMs = opts.jitter ? base * (0.5 + Math.random() * 0.5) : base;
|
|
124
|
+
this.emit('reconnecting', { attempt: retryCount + 1, reason: err, delayMs });
|
|
125
|
+
await new Promise(resolve => setTimeout(resolve, delayMs));
|
|
126
|
+
retryCount++;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Create a bidirectional conversation stream with optional reconnect support
|
|
71
132
|
*/
|
|
72
|
-
createConversationStream() {
|
|
133
|
+
createConversationStream(options) {
|
|
73
134
|
if (!this.isConnected) {
|
|
74
135
|
throw new Error('Client not connected. Call connect() first.');
|
|
75
136
|
}
|
|
76
|
-
|
|
77
|
-
|
|
137
|
+
const factory = () => this.client.ProcessConversation();
|
|
138
|
+
this.conversationStream = new ConversationStream(factory, options);
|
|
139
|
+
return this.conversationStream;
|
|
78
140
|
}
|
|
79
141
|
/**
|
|
80
142
|
* Process a single message (server-side streaming)
|
|
@@ -169,57 +231,133 @@ class MessagingClient extends events_1.EventEmitter {
|
|
|
169
231
|
}
|
|
170
232
|
exports.MessagingClient = MessagingClient;
|
|
171
233
|
/**
|
|
172
|
-
* ConversationStream wraps a bidirectional gRPC stream
|
|
234
|
+
* ConversationStream wraps a bidirectional gRPC stream with automatic reconnection.
|
|
235
|
+
*
|
|
236
|
+
* Events:
|
|
237
|
+
* - 'response' — AgentResponse received from server
|
|
238
|
+
* - 'reconnecting' — { attempt, reason, delayMs } — before each retry delay
|
|
239
|
+
* - 'reconnected' — { attempt } — after a successful stream recreation
|
|
240
|
+
* - 'error' — non-retryable error OR max retries exceeded
|
|
241
|
+
* - 'end' — only on intentional close(), not on unexpected stream drop
|
|
173
242
|
*/
|
|
174
243
|
class ConversationStream extends events_1.EventEmitter {
|
|
175
|
-
constructor(
|
|
244
|
+
constructor(streamFactory, options = {}) {
|
|
176
245
|
super();
|
|
177
|
-
this.
|
|
178
|
-
this.
|
|
246
|
+
this.streamFactory = streamFactory;
|
|
247
|
+
this.writeBuffer = [];
|
|
248
|
+
this.reconnecting = false;
|
|
249
|
+
this.closed = false;
|
|
250
|
+
this.retryCount = 0;
|
|
251
|
+
this.opts = resolveReconnectOptions(options);
|
|
252
|
+
this.stream = this.streamFactory();
|
|
253
|
+
this.attachHandlers(this.stream);
|
|
254
|
+
}
|
|
255
|
+
attachHandlers(stream) {
|
|
256
|
+
stream.on('data', (response) => {
|
|
257
|
+
this.retryCount = 0;
|
|
258
|
+
// Emit audio-specific events if present.
|
|
259
|
+
// The server sends audio data through the bidi stream as AgentResponse
|
|
260
|
+
// messages with audioConfig or audioChunk payloads. We emit dedicated
|
|
261
|
+
// events for these so the agent can handle audio separately from text,
|
|
262
|
+
// while still emitting the generic 'response' event for observability.
|
|
263
|
+
if (response.audioConfig) {
|
|
264
|
+
this.emit('audioConfig', response.audioConfig);
|
|
265
|
+
}
|
|
266
|
+
else if (response.audioChunk) {
|
|
267
|
+
this.emit('audioChunk', response.audioChunk);
|
|
268
|
+
}
|
|
179
269
|
this.emit('response', response);
|
|
180
270
|
});
|
|
181
|
-
|
|
182
|
-
this.
|
|
271
|
+
stream.on('error', (error) => {
|
|
272
|
+
if (!this.closed && this.isRetryable(error)) {
|
|
273
|
+
this.scheduleReconnect(error);
|
|
274
|
+
}
|
|
275
|
+
else {
|
|
276
|
+
this.emit('error', error);
|
|
277
|
+
}
|
|
183
278
|
});
|
|
184
|
-
|
|
185
|
-
this.
|
|
279
|
+
stream.on('end', () => {
|
|
280
|
+
if (!this.closed) {
|
|
281
|
+
this.scheduleReconnect(new Error('Stream ended unexpectedly'));
|
|
282
|
+
}
|
|
283
|
+
// If closed, 'end' was already emitted by end() — do nothing
|
|
186
284
|
});
|
|
187
285
|
}
|
|
286
|
+
isRetryable(error) {
    // An error is retryable when its `code` appears in the configured list.
    const { retryableStatusCodes } = this.opts;
    return retryableStatusCodes.includes(error.code);
}
|
|
289
|
+
calculateDelay() {
|
|
290
|
+
const base = Math.min(this.opts.initialDelayMs * Math.pow(2, this.retryCount), this.opts.maxDelayMs);
|
|
291
|
+
return this.opts.jitter ? base * (0.5 + Math.random() * 0.5) : base;
|
|
292
|
+
}
|
|
293
|
+
scheduleReconnect(reason) {
|
|
294
|
+
if (this.reconnecting || this.closed)
|
|
295
|
+
return;
|
|
296
|
+
if (this.retryCount >= this.opts.maxRetries) {
|
|
297
|
+
this.emit('error', new Error(`Max reconnection attempts (${this.opts.maxRetries}) exceeded`));
|
|
298
|
+
return;
|
|
299
|
+
}
|
|
300
|
+
const delayMs = this.calculateDelay();
|
|
301
|
+
this.reconnecting = true;
|
|
302
|
+
this.emit('reconnecting', { attempt: this.retryCount + 1, reason, delayMs });
|
|
303
|
+
setTimeout(() => this.doReconnect(), delayMs);
|
|
304
|
+
}
|
|
305
|
+
doReconnect() {
|
|
306
|
+
if (this.closed)
|
|
307
|
+
return;
|
|
308
|
+
this.retryCount++;
|
|
309
|
+
try {
|
|
310
|
+
this.stream = this.streamFactory();
|
|
311
|
+
this.attachHandlers(this.stream);
|
|
312
|
+
this.reconnecting = false;
|
|
313
|
+
this.emit('reconnected', { attempt: this.retryCount });
|
|
314
|
+
this.flushBuffer();
|
|
315
|
+
}
|
|
316
|
+
catch (err) {
|
|
317
|
+
this.reconnecting = false;
|
|
318
|
+
this.scheduleReconnect(err);
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
flushBuffer() {
|
|
322
|
+
const toFlush = this.writeBuffer.splice(0);
|
|
323
|
+
for (const request of toFlush) {
|
|
324
|
+
this.stream.write(request);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
write(request) {
|
|
328
|
+
if (this.reconnecting || this.closed) {
|
|
329
|
+
if (this.writeBuffer.length >= this.opts.maxBufferSize) {
|
|
330
|
+
this.writeBuffer.shift(); // drop oldest
|
|
331
|
+
}
|
|
332
|
+
this.writeBuffer.push(request);
|
|
333
|
+
}
|
|
334
|
+
else {
|
|
335
|
+
this.stream.write(request);
|
|
336
|
+
}
|
|
337
|
+
}
|
|
188
338
|
/**
|
|
189
339
|
* Send a message through the stream
|
|
190
340
|
*/
|
|
191
341
|
sendMessage(message) {
|
|
192
|
-
|
|
193
|
-
message,
|
|
194
|
-
};
|
|
195
|
-
this.stream.write(request);
|
|
342
|
+
this.write({ message });
|
|
196
343
|
}
|
|
197
344
|
/**
|
|
198
345
|
* Send platform feedback through the stream
|
|
199
346
|
*/
|
|
200
347
|
sendFeedback(feedback) {
|
|
201
|
-
|
|
202
|
-
feedback,
|
|
203
|
-
};
|
|
204
|
-
this.stream.write(request);
|
|
348
|
+
this.write({ feedback });
|
|
205
349
|
}
|
|
206
350
|
/**
|
|
207
351
|
* Send agent configuration through the stream
|
|
208
352
|
*/
|
|
209
353
|
sendAgentConfig(config) {
|
|
210
|
-
|
|
211
|
-
agentConfig: config,
|
|
212
|
-
};
|
|
213
|
-
this.stream.write(request);
|
|
354
|
+
this.write({ agentConfig: config });
|
|
214
355
|
}
|
|
215
356
|
/**
|
|
216
357
|
* Send a typed AgentResponse through the stream
|
|
217
358
|
*/
|
|
218
359
|
sendAgentResponse(response) {
|
|
219
|
-
|
|
220
|
-
agentResponse: response,
|
|
221
|
-
};
|
|
222
|
-
this.stream.write(request);
|
|
360
|
+
this.write({ agentResponse: response });
|
|
223
361
|
}
|
|
224
362
|
/**
|
|
225
363
|
* Send a content chunk (START/DELTA/END) for a conversation
|
|
@@ -240,10 +378,138 @@ class ConversationStream extends events_1.EventEmitter {
|
|
|
240
378
|
});
|
|
241
379
|
}
|
|
242
380
|
/**
|
|
243
|
-
*
|
|
381
|
+
* Send a transcript of the user's audio input back to the platform.
|
|
382
|
+
*
|
|
383
|
+
* After the agent runs STT on the audio, it calls this to send the transcribed
|
|
384
|
+
* text back to the platform (web adapter). The platform uses it to replace the
|
|
385
|
+
* "[audio]" placeholder message with the actual spoken text in the chat UI.
|
|
386
|
+
*
|
|
387
|
+
* @param conversationId - The conversation this transcript belongs to
|
|
388
|
+
* @param text - The transcribed text from STT
|
|
389
|
+
* @param messageId - Optional: the original "[audio]" message ID to update
|
|
390
|
+
* @param language - Optional: BCP-47 language detected by STT (e.g. "en-US")
|
|
391
|
+
*/
|
|
392
|
+
sendTranscript(conversationId, text, messageId, language) {
|
|
393
|
+
this.sendAgentResponse({
|
|
394
|
+
conversationId,
|
|
395
|
+
transcript: { text, messageId, language },
|
|
396
|
+
});
|
|
397
|
+
}
|
|
398
|
+
// --- Audio support ---
|
|
399
|
+
//
|
|
400
|
+
// These methods handle sending audio data through the gRPC bidi stream.
|
|
401
|
+
// Two directions:
|
|
402
|
+
// - Agent → Server (sendAudioConfig/sendAudioChunk/endAudio): used when the
|
|
403
|
+
// agent needs to forward audio upstream (less common)
|
|
404
|
+
// - Server → Agent (audioConfig/audioChunk events + audioAsReadable): the main
|
|
405
|
+
// path where the server forwards client mic audio to the agent for STT
|
|
406
|
+
/**
|
|
407
|
+
* Send an audio stream config through the bidi stream.
|
|
408
|
+
* Must be called before sendAudioChunk() so the receiver knows the encoding.
|
|
409
|
+
*/
|
|
410
|
+
sendAudioConfig(config) {
|
|
411
|
+
this.write({ audioConfig: config });
|
|
412
|
+
}
|
|
413
|
+
/**
|
|
414
|
+
* Send a raw audio chunk through the bidi stream.
|
|
415
|
+
* The chunk's sequence number should increase monotonically.
|
|
416
|
+
*/
|
|
417
|
+
sendAudioChunk(chunk) {
|
|
418
|
+
this.write({ audio: chunk });
|
|
419
|
+
}
|
|
420
|
+
/**
|
|
421
|
+
* Signal end of the current audio segment by sending an empty chunk with done=true.
|
|
422
|
+
* The receiver should process all accumulated audio (e.g. run STT).
|
|
423
|
+
* After this, more audio can follow — either new config or more chunks.
|
|
424
|
+
*/
|
|
425
|
+
endAudio() {
|
|
426
|
+
this.write({ audio: { data: Buffer.alloc(0), done: true } });
|
|
427
|
+
}
|
|
428
|
+
/**
|
|
429
|
+
* Converts incoming audioChunk events into a Web Streams API ReadableStream.
|
|
430
|
+
*
|
|
431
|
+
* This is the primary integration point with Mastra's voice system. The agent
|
|
432
|
+
* listens for the 'audioConfig' event to know the format, then calls this
|
|
433
|
+
* method to get a stream it can pass directly to voice.listen():
|
|
434
|
+
*
|
|
435
|
+
* ```typescript
|
|
436
|
+
* conversation.on('audioConfig', async (config) => {
|
|
437
|
+
* const audioStream = conversation.audioAsReadable();
|
|
438
|
+
* const filetype = audioEncodingToFiletype(config.encoding);
|
|
439
|
+
* const transcript = await agent.voice.listen(audioStream, { filetype });
|
|
440
|
+
* // ... process transcript
|
|
441
|
+
* });
|
|
442
|
+
* ```
|
|
443
|
+
*
|
|
444
|
+
* The ReadableStream:
|
|
445
|
+
* - Yields Uint8Array chunks as audioChunk events arrive
|
|
446
|
+
* - Closes when an AudioChunk with done=true arrives (end of segment)
|
|
447
|
+
* - Closes when the ConversationStream emits 'end' (intentional close)
|
|
448
|
+
* - Errors when the ConversationStream emits 'error'
|
|
449
|
+
* - Properly cleans up all event listeners on close, error, or cancel
|
|
450
|
+
*
|
|
451
|
+
* @returns A ReadableStream<Uint8Array> suitable for Mastra voice.listen()
|
|
452
|
+
*/
|
|
453
|
+
audioAsReadable() {
|
|
454
|
+
// Centralized cleanup to prevent listener leaks. Called on:
|
|
455
|
+
// - done=true chunk (normal completion)
|
|
456
|
+
// - stream 'end' event (intentional close)
|
|
457
|
+
// - stream 'error' event
|
|
458
|
+
// - ReadableStream cancel() (consumer gave up, e.g. reader.cancel())
|
|
459
|
+
const cleanup = () => {
|
|
460
|
+
this.removeListener('audioChunk', onChunk);
|
|
461
|
+
this.removeListener('end', onEnd);
|
|
462
|
+
this.removeListener('error', onError);
|
|
463
|
+
};
|
|
464
|
+
const onChunk = (chunk) => {
|
|
465
|
+
if (chunk.done) {
|
|
466
|
+
cleanup();
|
|
467
|
+
try {
|
|
468
|
+
controller.close();
|
|
469
|
+
}
|
|
470
|
+
catch { }
|
|
471
|
+
}
|
|
472
|
+
else {
|
|
473
|
+
controller.enqueue(new Uint8Array(chunk.data));
|
|
474
|
+
}
|
|
475
|
+
};
|
|
476
|
+
const onEnd = () => {
|
|
477
|
+
cleanup();
|
|
478
|
+
try {
|
|
479
|
+
controller.close();
|
|
480
|
+
}
|
|
481
|
+
catch { }
|
|
482
|
+
};
|
|
483
|
+
const onError = (err) => {
|
|
484
|
+
cleanup();
|
|
485
|
+
try {
|
|
486
|
+
controller.error(err);
|
|
487
|
+
}
|
|
488
|
+
catch { }
|
|
489
|
+
};
|
|
490
|
+
let controller;
|
|
491
|
+
return new ReadableStream({
|
|
492
|
+
start: (ctrl) => {
|
|
493
|
+
controller = ctrl;
|
|
494
|
+
this.on('audioChunk', onChunk);
|
|
495
|
+
this.once('end', onEnd);
|
|
496
|
+
this.once('error', onError);
|
|
497
|
+
},
|
|
498
|
+
cancel: () => {
|
|
499
|
+
// Consumer cancelled (e.g. reader.cancel()) — remove all listeners
|
|
500
|
+
// to prevent memory leaks
|
|
501
|
+
cleanup();
|
|
502
|
+
},
|
|
503
|
+
});
|
|
504
|
+
}
|
|
505
|
+
/**
|
|
506
|
+
* End the stream intentionally. Emits 'end' and prevents any further reconnects.
|
|
244
507
|
*/
|
|
245
508
|
end() {
|
|
509
|
+
this.closed = true;
|
|
510
|
+
this.writeBuffer = [];
|
|
246
511
|
this.stream.end();
|
|
512
|
+
this.emit('end');
|
|
247
513
|
}
|
|
248
514
|
}
|
|
249
515
|
exports.ConversationStream = ConversationStream;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
// Audio streaming types for the messaging system.
|
|
2
|
+
//
|
|
3
|
+
// These types enable raw audio input from any frontend (browser, phone, mobile app)
|
|
4
|
+
// to be streamed through the messaging server to an agent. The messaging system is
|
|
5
|
+
// a pass-through — it does NOT perform speech-to-text, transcoding, or voice activity
|
|
6
|
+
// detection. The agent handles STT via Mastra's voice provider abstraction.
|
|
7
|
+
//
|
|
8
|
+
// Data flow:
|
|
9
|
+
// Client (mic) → WebSocket → Server → gRPC (these types) → Agent → Mastra voice.listen()
|
|
10
|
+
//
|
|
11
|
+
// Two ways audio enters the system:
|
|
12
|
+
// 1. ProcessAudioStream RPC: dedicated audio-only streaming (AudioStreamRequest)
|
|
13
|
+
// 2. ProcessConversation RPC: audio mixed into the bidi stream (ConversationRequest.audio_config/audio)
|
|
14
|
+
//
|
|
15
|
+
// Both converge on the same types: AudioStreamConfig describes the format,
|
|
16
|
+
// AudioChunk carries the bytes, and done=true signals end of an utterance.
|
|
17
|
+
|
|
18
|
+
syntax = "proto3";
|
|
19
|
+
|
|
20
|
+
package astro.messaging.v1;
|
|
21
|
+
|
|
22
|
+
option go_package = "github.com/postman/astro/messaging/v1;messagingv1";
|
|
23
|
+
|
|
24
|
+
// Audio encoding format — covers browser, telephony, and mobile sources.
|
|
25
|
+
// The agent uses this to configure the STT provider (e.g. Whisper, Deepgram).
|
|
26
|
+
enum AudioEncoding {
|
|
27
|
+
AUDIO_ENCODING_UNSPECIFIED = 0;
|
|
28
|
+
LINEAR16 = 1; // PCM signed 16-bit little-endian — universal baseline, any platform
|
|
29
|
+
MULAW = 2; // G.711 mu-law — Twilio and traditional telephony (8kHz)
|
|
30
|
+
OPUS = 3; // Raw Opus frames — low-latency codec
|
|
31
|
+
MP3 = 4; // MP3 — batch uploads, pre-recorded audio
|
|
32
|
+
WEBM_OPUS = 5; // WebM container with Opus — browser MediaRecorder default
|
|
33
|
+
OGG_OPUS = 6; // OGG container with Opus — Firefox MediaRecorder
|
|
34
|
+
FLAC = 7; // FLAC lossless — high-quality uploads
|
|
35
|
+
AAC = 8; // AAC — iOS native recording
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// Sent once at the start of an audio segment to tell the agent what format
|
|
39
|
+
// the subsequent AudioChunk bytes are in. Without this, the agent can't
|
|
40
|
+
// decode the raw bytes.
|
|
41
|
+
message AudioStreamConfig {
|
|
42
|
+
AudioEncoding encoding = 1; // What codec the audio bytes use
|
|
43
|
+
int32 sample_rate = 2; // Hz: 8000 (telephony), 16000 (speech), 48000 (browser)
|
|
44
|
+
int32 channels = 3; // 1 = mono (speech default), 2 = stereo
|
|
45
|
+
string language = 4; // BCP-47 hint for STT, e.g. "en-US" (optional)
|
|
46
|
+
string conversation_id = 5; // Links this audio to an existing conversation
|
|
47
|
+
|
|
48
|
+
// Source metadata — helps the agent pick the right STT config.
|
|
49
|
+
// Examples: "browser", "twilio", "vonage", "mobile", "upload"
|
|
50
|
+
string source = 6;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// A chunk of raw audio bytes in the encoding specified by AudioStreamConfig.
|
|
54
|
+
//
|
|
55
|
+
// Chunks arrive in order during a segment. When done=true, the segment is
|
|
56
|
+
// complete and the agent should run STT on all accumulated chunks.
|
|
57
|
+
// The data field may be empty on the final done=true chunk.
|
|
58
|
+
message AudioChunk {
|
|
59
|
+
bytes data = 1; // Raw audio bytes (pass-through, no transcoding)
|
|
60
|
+
int64 sequence = 2; // Monotonic sequence number for ordering
|
|
61
|
+
bool done = 3; // true = end of segment, process accumulated audio
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// Wrapper for the ProcessAudioStream RPC (dedicated audio streaming).
|
|
65
|
+
// The first message MUST be config, all subsequent messages are audio chunks.
|
|
66
|
+
message AudioStreamRequest {
|
|
67
|
+
oneof request {
|
|
68
|
+
AudioStreamConfig config = 1; // First message: tells agent the audio format
|
|
69
|
+
AudioChunk audio = 2; // Subsequent: raw audio bytes
|
|
70
|
+
}
|
|
71
|
+
}
|
|
@@ -4,6 +4,7 @@ package astro.messaging.v1;
|
|
|
4
4
|
|
|
5
5
|
import "google/protobuf/timestamp.proto";
|
|
6
6
|
import "astro/messaging/v1/message.proto";
|
|
7
|
+
import "astro/messaging/v1/audio.proto";
|
|
7
8
|
|
|
8
9
|
option go_package = "github.com/postman/astro/messaging/v1;messagingv1";
|
|
9
10
|
|
|
@@ -22,6 +23,9 @@ message AgentResponse {
|
|
|
22
23
|
ThreadMetadata thread_metadata = 7; // Thread title, creation
|
|
23
24
|
ErrorResponse error = 8; // Error during processing
|
|
24
25
|
ThreadHistoryRequest context_request = 9; // Request cached context (optional)
|
|
26
|
+
Transcript transcript = 10; // Audio transcript (agent → platform)
|
|
27
|
+
AudioStreamConfig audio_config = 11; // Audio session config (server → agent)
|
|
28
|
+
AudioChunk audio_chunk = 12; // Audio data (server → agent)
|
|
25
29
|
}
|
|
26
30
|
}
|
|
27
31
|
|
|
@@ -144,6 +148,14 @@ message ThreadMetadata {
|
|
|
144
148
|
bool create_new = 3; // Create new thread
|
|
145
149
|
}
|
|
146
150
|
|
|
151
|
+
// Transcript of user audio input (agent → platform after STT)
|
|
152
|
+
// Used to update a placeholder message with the actual transcribed text
|
|
153
|
+
message Transcript {
|
|
154
|
+
string text = 1; // Transcribed text
|
|
155
|
+
string message_id = 2; // User message ID to update (optional)
|
|
156
|
+
string language = 3; // Detected language BCP-47 (optional)
|
|
157
|
+
}
|
|
158
|
+
|
|
147
159
|
// Error response from agent
|
|
148
160
|
message ErrorResponse {
|
|
149
161
|
enum ErrorCode {
|
|
@@ -5,6 +5,7 @@ package astro.messaging.v1;
|
|
|
5
5
|
import "astro/messaging/v1/message.proto";
|
|
6
6
|
import "astro/messaging/v1/response.proto";
|
|
7
7
|
import "astro/messaging/v1/feedback.proto";
|
|
8
|
+
import "astro/messaging/v1/audio.proto";
|
|
8
9
|
import "astro/messaging/v1/config.proto";
|
|
9
10
|
import "google/protobuf/timestamp.proto";
|
|
10
11
|
|
|
@@ -29,6 +30,11 @@ service AgentMessaging {
|
|
|
29
30
|
rpc GetConversationMetadata(ConversationMetadataRequest)
|
|
30
31
|
returns (ConversationMetadataResponse);
|
|
31
32
|
|
|
33
|
+
// Audio: client streams raw audio, server responds with text
|
|
34
|
+
// First message MUST be AudioStreamConfig, rest are AudioChunks
|
|
35
|
+
rpc ProcessAudioStream(stream AudioStreamRequest)
|
|
36
|
+
returns (stream AgentResponse);
|
|
37
|
+
|
|
32
38
|
// Health check
|
|
33
39
|
rpc HealthCheck(HealthCheckRequest)
|
|
34
40
|
returns (HealthCheckResponse);
|
|
@@ -41,6 +47,8 @@ message ConversationRequest {
|
|
|
41
47
|
PlatformFeedback feedback = 2;
|
|
42
48
|
AgentConfig agent_config = 3;
|
|
43
49
|
AgentResponse agent_response = 4;
|
|
50
|
+
AudioStreamConfig audio_config = 5; // Start audio within conversation
|
|
51
|
+
AudioChunk audio = 6; // Audio data within conversation
|
|
44
52
|
}
|
|
45
53
|
}
|
|
46
54
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@astropods/messaging",
|
|
3
3
|
"license": "Apache-2.0",
|
|
4
|
-
"version": "0.0.
|
|
4
|
+
"version": "0.0.3",
|
|
5
5
|
"description": "TypeScript SDK for Astro Messaging",
|
|
6
6
|
"main": "dist/index.js",
|
|
7
7
|
"types": "dist/index.d.ts",
|
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
"dist"
|
|
10
10
|
],
|
|
11
11
|
"scripts": {
|
|
12
|
-
"postinstall": "ln -sf ../../proto proto",
|
|
13
|
-
"build": "tsc && cp -r ../../proto dist/proto",
|
|
12
|
+
"postinstall": "rm -rf proto && ln -sf ../../proto proto",
|
|
13
|
+
"build": "tsc && rm -rf dist/proto && cp -r ../../proto dist/proto",
|
|
14
14
|
"watch": "tsc --watch",
|
|
15
15
|
"test": "bun test",
|
|
16
16
|
"test:watch": "bun test --watch"
|