@astropods/messaging 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,6 +45,9 @@ export interface AgentResponse {
45
45
  threadMetadata?: ThreadMetadata;
46
46
  error?: ErrorResponse;
47
47
  contextRequest?: ThreadHistoryRequest;
48
+ transcript?: Transcript;
49
+ audioConfig?: AudioStreamConfig;
50
+ audioChunk?: AudioChunk;
48
51
  }
49
52
  export interface StatusUpdate {
50
53
  status: 'THINKING' | 'SEARCHING' | 'GENERATING' | 'PROCESSING' | 'ANALYZING' | 'CUSTOM';
@@ -77,6 +80,11 @@ export interface ErrorResponse {
77
80
  details?: string;
78
81
  retryable?: boolean;
79
82
  }
83
/**
 * Transcript of the user's audio input, produced by the agent after STT.
 * Sent agent → platform so the UI can replace an "[audio]" placeholder
 * message with the actual spoken text (see sendTranscript).
 */
export interface Transcript {
    /** The transcribed text. */
    text: string;
    /** Optional: ID of the original "[audio]" user message to update. */
    messageId?: string;
    /** Optional: BCP-47 language tag detected by STT (e.g. "en-US"). */
    language?: string;
}
80
88
  export interface ThreadHistoryRequest {
81
89
  conversationId: string;
82
90
  maxMessages?: number;
@@ -129,11 +137,54 @@ export interface AgentConfig {
129
137
  systemPrompt: string;
130
138
  tools: AgentToolConfig[];
131
139
  }
140
/**
 * Supported audio encoding formats. Matches the AudioEncoding protobuf enum.
 *
 * Common sources:
 * - LINEAR16: Universal PCM baseline (any platform)
 * - MULAW: Twilio / telephony (G.711 mu-law, 8kHz)
 * - WEBM_OPUS: Browser MediaRecorder default
 * - AAC: iOS native recording
 */
export type AudioEncoding = 'LINEAR16' | 'MULAW' | 'OPUS' | 'MP3' | 'WEBM_OPUS' | 'OGG_OPUS' | 'FLAC' | 'AAC';
150
/**
 * Configuration sent at the start of an audio segment to describe the format.
 * Maps to the AudioStreamConfig protobuf message.
 */
export interface AudioStreamConfig {
    /** Codec of the subsequent AudioChunk bytes. */
    encoding: AudioEncoding;
    /** Sample rate in Hz: 8000 (telephony), 16000 (speech), 48000 (browser). */
    sampleRate: number;
    /** Channel count: 1 = mono (speech default), 2 = stereo. */
    channels: number;
    /** Optional BCP-47 language hint for STT, e.g. "en-US". */
    language?: string;
    /** Links this audio to an existing conversation. */
    conversationId: string;
    /** Optional source metadata, e.g. "browser", "twilio", "mobile", "upload". */
    source?: string;
}
162
/**
 * A chunk of raw audio bytes. Maps to the AudioChunk protobuf message.
 *
 * Chunks arrive sequentially during a segment. When done=true, the segment
 * is complete and the agent should run STT on the accumulated audio.
 */
export interface AudioChunk {
    /** Raw bytes in the encoding given by the preceding AudioStreamConfig. */
    data: Buffer | Uint8Array;
    /** Monotonic sequence number for ordering. */
    sequence?: number;
    /** true = end of segment; data may be empty on this final chunk. */
    done?: boolean;
}
173
/**
 * Maps an AudioEncoding to the filetype string expected by Mastra's voice.listen().
 *
 * Usage:
 *   const filetype = audioEncodingToFiletype(config.encoding);
 *   const transcript = await agent.voice.listen(audioStream, { filetype });
 */
export declare function audioEncodingToFiletype(encoding: AudioEncoding): string;
132
181
  export interface ConversationRequest {
133
182
  message?: Message;
134
183
  feedback?: any;
135
184
  agentConfig?: AgentConfig;
136
185
  agentResponse?: AgentResponse;
186
+ audioConfig?: AudioStreamConfig;
187
+ audio?: AudioChunk;
137
188
  }
138
189
  export interface ReconnectOptions {
139
190
  /** Maximum number of reconnect attempts. Default: Infinity */
@@ -244,6 +295,61 @@ export declare class ConversationStream extends EventEmitter {
244
295
  * Send a status update for a conversation
245
296
  */
246
297
  sendStatusUpdate(conversationId: string, status: StatusUpdate): void;
298
    /**
     * Send a transcript of the user's audio input back to the platform.
     *
     * After the agent runs STT on the audio, it calls this to send the transcribed
     * text back to the platform (web adapter). The platform uses it to replace the
     * "[audio]" placeholder message with the actual spoken text in the chat UI.
     *
     * @param conversationId - The conversation this transcript belongs to
     * @param text - The transcribed text from STT
     * @param messageId - Optional: the original "[audio]" message ID to update
     * @param language - Optional: BCP-47 language detected by STT (e.g. "en-US")
     */
    sendTranscript(conversationId: string, text: string, messageId?: string, language?: string): void;
    /**
     * Send an audio stream config through the bidi stream.
     * Must be called before sendAudioChunk() so the receiver knows the encoding.
     */
    sendAudioConfig(config: AudioStreamConfig): void;
    /**
     * Send a raw audio chunk through the bidi stream.
     * The chunk's sequence number should increase monotonically.
     */
    sendAudioChunk(chunk: AudioChunk): void;
    /**
     * Signal end of the current audio segment by sending an empty chunk with done=true.
     * The receiver should process all accumulated audio (e.g. run STT).
     * After this, more audio can follow — either new config or more chunks.
     */
    endAudio(): void;
    /**
     * Converts incoming audioChunk events into a Web Streams API ReadableStream.
     *
     * This is the primary integration point with Mastra's voice system. The agent
     * listens for the 'audioConfig' event to know the format, then calls this
     * method to get a stream it can pass directly to voice.listen():
     *
     * ```typescript
     * conversation.on('audioConfig', async (config) => {
     *   const audioStream = conversation.audioAsReadable();
     *   const filetype = audioEncodingToFiletype(config.encoding);
     *   const transcript = await agent.voice.listen(audioStream, { filetype });
     *   // ... process transcript
     * });
     * ```
     *
     * The ReadableStream:
     * - Yields Uint8Array chunks as audioChunk events arrive
     * - Closes when an AudioChunk with done=true arrives (end of segment)
     * - Closes when the ConversationStream emits 'end' (intentional close)
     * - Errors when the ConversationStream emits 'error'
     * - Properly cleans up all event listeners on close, error, or cancel
     *
     * @returns A ReadableStream<Uint8Array> suitable for Mastra voice.listen()
     */
    audioAsReadable(): ReadableStream<Uint8Array>;
247
353
  /**
248
354
  * End the stream intentionally. Emits 'end' and prevents any further reconnects.
249
355
  */
@@ -34,10 +34,31 @@ var __importStar = (this && this.__importStar) || (function () {
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
36
  exports.Helpers = exports.MessageStream = exports.ConversationStream = exports.MessagingClient = void 0;
37
+ exports.audioEncodingToFiletype = audioEncodingToFiletype;
37
38
  const grpc = __importStar(require("@grpc/grpc-js"));
38
39
  const protoLoader = __importStar(require("@grpc/proto-loader"));
39
40
  const path_1 = require("path");
40
41
  const events_1 = require("events");
42
/**
 * Maps an AudioEncoding to the filetype string expected by Mastra's voice.listen().
 *
 * Usage:
 *   const filetype = audioEncodingToFiletype(config.encoding);
 *   const transcript = await agent.voice.listen(audioStream, { filetype });
 */
function audioEncodingToFiletype(encoding) {
    switch (encoding) {
        case 'LINEAR16':
        case 'MULAW':
            // Both raw PCM and G.711 mu-law are handed off as WAV.
            return 'wav';
        case 'OPUS':
            return 'opus';
        case 'MP3':
            return 'mp3';
        case 'WEBM_OPUS':
            return 'webm';
        case 'OGG_OPUS':
            return 'ogg';
        case 'FLAC':
            return 'flac';
        case 'AAC':
            return 'm4a';
        default:
            // Unknown encodings fall back to the universal WAV baseline.
            return 'wav';
    }
}
41
62
  // gRPC status codes: DEADLINE_EXCEEDED=4, INTERNAL=13, UNAVAILABLE=14, RESOURCE_EXHAUSTED=8
42
63
  const DEFAULT_RETRYABLE_STATUS_CODES = [4, 8, 13, 14];
43
64
  function resolveReconnectOptions(options) {
@@ -234,6 +255,17 @@ class ConversationStream extends events_1.EventEmitter {
234
255
  attachHandlers(stream) {
235
256
  stream.on('data', (response) => {
236
257
  this.retryCount = 0;
258
+ // Emit audio-specific events if present.
259
+ // The server sends audio data through the bidi stream as AgentResponse
260
+ // messages with audioConfig or audioChunk payloads. We emit dedicated
261
+ // events for these so the agent can handle audio separately from text,
262
+ // while still emitting the generic 'response' event for observability.
263
+ if (response.audioConfig) {
264
+ this.emit('audioConfig', response.audioConfig);
265
+ }
266
+ else if (response.audioChunk) {
267
+ this.emit('audioChunk', response.audioChunk);
268
+ }
237
269
  this.emit('response', response);
238
270
  });
239
271
  stream.on('error', (error) => {
@@ -345,6 +377,131 @@ class ConversationStream extends events_1.EventEmitter {
345
377
  status,
346
378
  });
347
379
  }
380
+ /**
381
+ * Send a transcript of the user's audio input back to the platform.
382
+ *
383
+ * After the agent runs STT on the audio, it calls this to send the transcribed
384
+ * text back to the platform (web adapter). The platform uses it to replace the
385
+ * "[audio]" placeholder message with the actual spoken text in the chat UI.
386
+ *
387
+ * @param conversationId - The conversation this transcript belongs to
388
+ * @param text - The transcribed text from STT
389
+ * @param messageId - Optional: the original "[audio]" message ID to update
390
+ * @param language - Optional: BCP-47 language detected by STT (e.g. "en-US")
391
+ */
392
+ sendTranscript(conversationId, text, messageId, language) {
393
+ this.sendAgentResponse({
394
+ conversationId,
395
+ transcript: { text, messageId, language },
396
+ });
397
+ }
398
+ // --- Audio support ---
399
+ //
400
+ // These methods handle sending audio data through the gRPC bidi stream.
401
+ // Two directions:
402
+ // - Agent → Server (sendAudioConfig/sendAudioChunk/endAudio): used when the
403
+ // agent needs to forward audio upstream (less common)
404
+ // - Server → Agent (audioConfig/audioChunk events + audioAsReadable): the main
405
+ // path where the server forwards client mic audio to the agent for STT
406
+ /**
407
+ * Send an audio stream config through the bidi stream.
408
+ * Must be called before sendAudioChunk() so the receiver knows the encoding.
409
+ */
410
+ sendAudioConfig(config) {
411
+ this.write({ audioConfig: config });
412
+ }
413
+ /**
414
+ * Send a raw audio chunk through the bidi stream.
415
+ * The chunk's sequence number should increase monotonically.
416
+ */
417
+ sendAudioChunk(chunk) {
418
+ this.write({ audio: chunk });
419
+ }
420
+ /**
421
+ * Signal end of the current audio segment by sending an empty chunk with done=true.
422
+ * The receiver should process all accumulated audio (e.g. run STT).
423
+ * After this, more audio can follow — either new config or more chunks.
424
+ */
425
+ endAudio() {
426
+ this.write({ audio: { data: Buffer.alloc(0), done: true } });
427
+ }
428
+ /**
429
+ * Converts incoming audioChunk events into a Web Streams API ReadableStream.
430
+ *
431
+ * This is the primary integration point with Mastra's voice system. The agent
432
+ * listens for the 'audioConfig' event to know the format, then calls this
433
+ * method to get a stream it can pass directly to voice.listen():
434
+ *
435
+ * ```typescript
436
+ * conversation.on('audioConfig', async (config) => {
437
+ * const audioStream = conversation.audioAsReadable();
438
+ * const filetype = audioEncodingToFiletype(config.encoding);
439
+ * const transcript = await agent.voice.listen(audioStream, { filetype });
440
+ * // ... process transcript
441
+ * });
442
+ * ```
443
+ *
444
+ * The ReadableStream:
445
+ * - Yields Uint8Array chunks as audioChunk events arrive
446
+ * - Closes when an AudioChunk with done=true arrives (end of segment)
447
+ * - Closes when the ConversationStream emits 'end' (intentional close)
448
+ * - Errors when the ConversationStream emits 'error'
449
+ * - Properly cleans up all event listeners on close, error, or cancel
450
+ *
451
+ * @returns A ReadableStream<Uint8Array> suitable for Mastra voice.listen()
452
+ */
453
+ audioAsReadable() {
454
+ // Centralized cleanup to prevent listener leaks. Called on:
455
+ // - done=true chunk (normal completion)
456
+ // - stream 'end' event (intentional close)
457
+ // - stream 'error' event
458
+ // - ReadableStream cancel() (consumer gave up, e.g. reader.cancel())
459
+ const cleanup = () => {
460
+ this.removeListener('audioChunk', onChunk);
461
+ this.removeListener('end', onEnd);
462
+ this.removeListener('error', onError);
463
+ };
464
+ const onChunk = (chunk) => {
465
+ if (chunk.done) {
466
+ cleanup();
467
+ try {
468
+ controller.close();
469
+ }
470
+ catch { }
471
+ }
472
+ else {
473
+ controller.enqueue(new Uint8Array(chunk.data));
474
+ }
475
+ };
476
+ const onEnd = () => {
477
+ cleanup();
478
+ try {
479
+ controller.close();
480
+ }
481
+ catch { }
482
+ };
483
+ const onError = (err) => {
484
+ cleanup();
485
+ try {
486
+ controller.error(err);
487
+ }
488
+ catch { }
489
+ };
490
+ let controller;
491
+ return new ReadableStream({
492
+ start: (ctrl) => {
493
+ controller = ctrl;
494
+ this.on('audioChunk', onChunk);
495
+ this.once('end', onEnd);
496
+ this.once('error', onError);
497
+ },
498
+ cancel: () => {
499
+ // Consumer cancelled (e.g. reader.cancel()) — remove all listeners
500
+ // to prevent memory leaks
501
+ cleanup();
502
+ },
503
+ });
504
+ }
348
505
  /**
349
506
  * End the stream intentionally. Emits 'end' and prevents any further reconnects.
350
507
  */
@@ -0,0 +1,71 @@
1
+ // Audio streaming types for the messaging system.
2
+ //
3
+ // These types enable raw audio input from any frontend (browser, phone, mobile app)
4
+ // to be streamed through the messaging server to an agent. The messaging system is
5
+ // a pass-through — it does NOT perform speech-to-text, transcoding, or voice activity
6
+ // detection. The agent handles STT via Mastra's voice provider abstraction.
7
+ //
8
+ // Data flow:
9
+ // Client (mic) → WebSocket → Server → gRPC (these types) → Agent → Mastra voice.listen()
10
+ //
11
+ // Two ways audio enters the system:
12
+ // 1. ProcessAudioStream RPC: dedicated audio-only streaming (AudioStreamRequest)
13
+ // 2. ProcessConversation RPC: audio mixed into the bidi stream (ConversationRequest.audio_config/audio)
14
+ //
15
+ // Both converge on the same types: AudioStreamConfig describes the format,
16
+ // AudioChunk carries the bytes, and done=true signals end of an utterance.
17
+
18
+ syntax = "proto3";
19
+
20
+ package astro.messaging.v1;
21
+
22
+ option go_package = "github.com/postman/astro/messaging/v1;messagingv1";
23
+
24
// Audio encoding format — covers browser, telephony, and mobile sources.
// The agent uses this to configure the STT provider (e.g. Whisper, Deepgram).
enum AudioEncoding {
  AUDIO_ENCODING_UNSPECIFIED = 0;
  LINEAR16 = 1;   // PCM signed 16-bit little-endian — universal baseline, any platform
  MULAW = 2;      // G.711 mu-law — Twilio and traditional telephony (8kHz)
  OPUS = 3;       // Raw Opus frames — low-latency codec
  MP3 = 4;        // MP3 — batch uploads, pre-recorded audio
  WEBM_OPUS = 5;  // WebM container with Opus — browser MediaRecorder default
  OGG_OPUS = 6;   // OGG container with Opus — Firefox MediaRecorder
  FLAC = 7;       // FLAC lossless — high-quality uploads
  AAC = 8;        // AAC — iOS native recording
}
37
+
38
// Sent once at the start of an audio segment to tell the agent what format
// the subsequent AudioChunk bytes are in. Without this, the agent can't
// decode the raw bytes.
message AudioStreamConfig {
  AudioEncoding encoding = 1;  // What codec the audio bytes use
  int32 sample_rate = 2;       // Hz: 8000 (telephony), 16000 (speech), 48000 (browser)
  int32 channels = 3;          // 1 = mono (speech default), 2 = stereo
  string language = 4;         // BCP-47 hint for STT, e.g. "en-US" (optional)
  string conversation_id = 5;  // Links this audio to an existing conversation

  // Source metadata — helps the agent pick the right STT config.
  // Examples: "browser", "twilio", "vonage", "mobile", "upload"
  string source = 6;
}
52
+
53
// A chunk of raw audio bytes in the encoding specified by AudioStreamConfig.
//
// Chunks arrive in order during a segment. When done=true, the segment is
// complete and the agent should run STT on all accumulated chunks.
// The data field may be empty on the final done=true chunk.
message AudioChunk {
  bytes data = 1;     // Raw audio bytes (pass-through, no transcoding)
  int64 sequence = 2; // Monotonic sequence number for ordering
  bool done = 3;      // true = end of segment, process accumulated audio
}
63
+
64
// Wrapper for the ProcessAudioStream RPC (dedicated audio streaming).
// The first message MUST be config, all subsequent messages are audio chunks.
message AudioStreamRequest {
  oneof request {
    AudioStreamConfig config = 1; // First message: tells agent the audio format
    AudioChunk audio = 2;         // Subsequent: raw audio bytes
  }
}
@@ -4,6 +4,7 @@ package astro.messaging.v1;
4
4
 
5
5
  import "google/protobuf/timestamp.proto";
6
6
  import "astro/messaging/v1/message.proto";
7
+ import "astro/messaging/v1/audio.proto";
7
8
 
8
9
  option go_package = "github.com/postman/astro/messaging/v1;messagingv1";
9
10
 
@@ -22,6 +23,9 @@ message AgentResponse {
22
23
  ThreadMetadata thread_metadata = 7; // Thread title, creation
23
24
  ErrorResponse error = 8; // Error during processing
24
25
  ThreadHistoryRequest context_request = 9; // Request cached context (optional)
26
+ Transcript transcript = 10; // Audio transcript (agent → platform)
27
+ AudioStreamConfig audio_config = 11; // Audio session config (server → agent)
28
+ AudioChunk audio_chunk = 12; // Audio data (server → agent)
25
29
  }
26
30
  }
27
31
 
@@ -144,6 +148,14 @@ message ThreadMetadata {
144
148
  bool create_new = 3; // Create new thread
145
149
  }
146
150
 
151
+ // Transcript of user audio input (agent → platform after STT)
152
+ // Used to update a placeholder message with the actual transcribed text
153
+ message Transcript {
154
+ string text = 1; // Transcribed text
155
+ string message_id = 2; // User message ID to update (optional)
156
+ string language = 3; // Detected language BCP-47 (optional)
157
+ }
158
+
147
159
  // Error response from agent
148
160
  message ErrorResponse {
149
161
  enum ErrorCode {
@@ -5,6 +5,7 @@ package astro.messaging.v1;
5
5
  import "astro/messaging/v1/message.proto";
6
6
  import "astro/messaging/v1/response.proto";
7
7
  import "astro/messaging/v1/feedback.proto";
8
+ import "astro/messaging/v1/audio.proto";
8
9
  import "astro/messaging/v1/config.proto";
9
10
  import "google/protobuf/timestamp.proto";
10
11
 
@@ -29,6 +30,11 @@ service AgentMessaging {
29
30
  rpc GetConversationMetadata(ConversationMetadataRequest)
30
31
  returns (ConversationMetadataResponse);
31
32
 
33
+ // Audio: client streams raw audio, server responds with text
34
+ // First message MUST be AudioStreamConfig, rest are AudioChunks
35
+ rpc ProcessAudioStream(stream AudioStreamRequest)
36
+ returns (stream AgentResponse);
37
+
32
38
  // Health check
33
39
  rpc HealthCheck(HealthCheckRequest)
34
40
  returns (HealthCheckResponse);
@@ -41,6 +47,8 @@ message ConversationRequest {
41
47
  PlatformFeedback feedback = 2;
42
48
  AgentConfig agent_config = 3;
43
49
  AgentResponse agent_response = 4;
50
+ AudioStreamConfig audio_config = 5; // Start audio within conversation
51
+ AudioChunk audio = 6; // Audio data within conversation
44
52
  }
45
53
  }
46
54
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@astropods/messaging",
3
3
  "license": "Apache-2.0",
4
- "version": "0.0.2",
4
+ "version": "0.0.3",
5
5
  "description": "TypeScript SDK for Astro Messaging",
6
6
  "main": "dist/index.js",
7
7
  "types": "dist/index.d.ts",
@@ -10,7 +10,7 @@
10
10
  ],
11
11
  "scripts": {
12
12
  "postinstall": "rm -rf proto && ln -sf ../../proto proto",
13
- "build": "tsc && cp -r ../../proto dist/proto",
13
+ "build": "tsc && rm -rf dist/proto && cp -r ../../proto dist/proto",
14
14
  "watch": "tsc --watch",
15
15
  "test": "bun test",
16
16
  "test:watch": "bun test --watch"