@drawdream/livespeech 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,6 +5,15 @@
 
  A TypeScript/JavaScript SDK for real-time speech-to-speech AI conversations.
 
+ ## Features
+
+ - 🎙️ **Real-time Voice Conversations** - Natural, low-latency voice interactions
+ - 🌐 **Multi-language Support** - Korean, English, Japanese, Chinese, and more
+ - 🔊 **Streaming Audio** - Send and receive audio in real-time
+ - 📝 **Live Transcription** - Get transcriptions of both user and AI speech
+ - 🔄 **Auto-reconnection** - Automatic recovery from network issues
+ - 🌐 **Browser & Node.js** - Works in both environments
+
  ## Installation
 
  ```bash
@@ -18,171 +27,306 @@ pnpm add @drawdream/livespeech
  ## Quick Start
 
  ```typescript
- import { LiveSpeechClient, Region } from '@drawdream/livespeech';
+ import { LiveSpeechClient } from '@drawdream/livespeech';
 
  const client = new LiveSpeechClient({
-   region: 'ap-northeast-2', // or Region.AP_NORTHEAST_2
+   region: 'ap-northeast-2',
    apiKey: 'your-api-key',
  });
 
- // Handle events
+ // Set up event handlers
  client.setUserTranscriptHandler((text) => {
-   console.log(`You said: ${text}`);
+   console.log('You:', text);
  });
 
- client.setTranscriptHandler((text, isFinal) => {
-   console.log(`AI Transcript: ${text}`);
+ client.setResponseHandler((text, isFinal) => {
+   console.log('AI:', text);
  });
 
  client.setAudioHandler((audioData) => {
-   // Play audio through speakers
+   playAudio(audioData); // PCM16 @ 24kHz
+ });
+
+ client.setErrorHandler((error) => {
+   console.error('Error:', error.message);
  });
 
- // Connect and start session
+ // Connect and start conversation
  await client.connect();
  await client.startSession({
    prePrompt: 'You are a helpful assistant.',
-   // pipelineMode: 'live' is the default
+   language: 'ko-KR',
  });
 
- // Start streaming and send audio
+ // Stream audio
  client.audioStart();
- client.sendAudioChunk(audioBuffer);
+ client.sendAudioChunk(pcmData); // PCM16 @ 16kHz
+ client.audioEnd();
+
+ // Cleanup
+ await client.endSession();
+ client.disconnect();
  ```
 
- ## Pipeline Modes
+ ## Audio Flow
 
- The SDK supports two pipeline modes for audio processing:
+ ```
+ connect() → startSession() → audioStart() → sendAudioChunk()* → audioEnd() → endSession()
+ ```
 
- ### Live Mode (Default)
+ | Step | Description |
+ |------|-------------|
+ | `connect()` | Establish WebSocket connection |
+ | `startSession(config)` | Start conversation with optional system prompt |
+ | `audioStart()` | Begin audio streaming |
+ | `sendAudioChunk(data)` | Send PCM16 audio (call multiple times) |
+ | `audioEnd()` | End streaming, triggers AI response |
+ | `endSession()` | End conversation |
+ | `disconnect()` | Close connection |
 
- Uses Gemini 2.5 Flash Live API for end-to-end audio conversation. This provides:
- - **Lower latency** - Direct audio-to-audio processing
- - **Natural conversation** - Built-in voice activity detection and turn-taking
- - **Real-time transcription** - Both user and AI speech are transcribed
+ ## Configuration
 
  ```typescript
+ const client = new LiveSpeechClient({
+   region: 'ap-northeast-2',  // Required: Seoul region
+   apiKey: 'your-api-key',    // Required: Your API key
+   autoReconnect: true,       // Auto-reconnect on disconnect
+   maxReconnectAttempts: 5,   // Maximum reconnection attempts
+   debug: false,              // Enable debug logging
+ });
+
  await client.startSession({
    prePrompt: 'You are a helpful assistant.',
-   pipelineMode: 'live', // Default, can be omitted
+   language: 'ko-KR', // Language: ko-KR, en-US, ja-JP, etc.
  });
  ```
 
- ### Composed Mode
+ ## Events
 
- Uses separate STT + LLM + TTS services for more customization:
- - **More control** - Separate services for each step
- - **Custom voices** - Use different TTS voices
- - **Text responses** - Access to intermediate text responses
+ | Event | Description | Key Properties |
+ |-------|-------------|----------------|
+ | `connected` | Connection established | `connectionId` |
+ | `disconnected` | Connection closed | `reason`, `code` |
+ | `sessionStarted` | Session created | `sessionId` |
+ | `ready` | Ready for audio input | `timestamp` |
+ | `userTranscript` | Your speech transcribed | `text` |
+ | `response` | AI's response text | `text`, `isFinal` |
+ | `audio` | AI's audio output | `data`, `sampleRate` |
+ | `turnComplete` | AI finished speaking | `timestamp` |
+ | `error` | Error occurred | `code`, `message` |
+
+ ### Simple Handlers
 
  ```typescript
- await client.startSession({
-   prePrompt: 'You are a helpful assistant.',
-   pipelineMode: 'composed',
+ // Your speech transcription
+ client.setUserTranscriptHandler((text) => {
+   console.log('You said:', text);
+ });
+
+ // AI's text response
+ client.setResponseHandler((text, isFinal) => {
+   console.log('AI:', text, isFinal ? '(done)' : '...');
+ });
+
+ // AI's audio output
+ client.setAudioHandler((data: Uint8Array) => {
+   // data: PCM16 audio
+   // Sample rate: 24000 Hz
+   playAudio(data);
+ });
+
+ // Error handling
+ client.setErrorHandler((error) => {
+   console.error(`Error [${error.code}]: ${error.message}`);
  });
  ```
 
- ## API Reference
+ ### Full Event API
+
+ ```typescript
+ client.on('connected', (event) => {
+   console.log('Connected:', event.connectionId);
+ });
+
+ client.on('ready', () => {
+   console.log('Ready for audio');
+ });
+
+ client.on('userTranscript', (event) => {
+   console.log('You:', event.text);
+ });
+
+ client.on('response', (event) => {
+   console.log('AI:', event.text, event.isFinal);
+ });
 
- ### Regions
+ client.on('audio', (event) => {
+   // event.data: Uint8Array (PCM16)
+   // event.sampleRate: 24000
+   playAudio(event.data);
+ });
 
- The SDK provides built-in region support, so you don't need to remember endpoint URLs:
+ client.on('turnComplete', () => {
+   console.log('AI finished speaking');
+ });
 
- | Region | Identifier | Location |
- |--------|------------|----------|
- | `ap-northeast-2` | `Region.AP_NORTHEAST_2` | Asia Pacific (Seoul) |
- | `us-west-2` | `Region.US_WEST_2` | US West (Oregon) - Coming soon |
+ client.on('error', (event) => {
+   console.error('Error:', event.code, event.message);
+ });
+ ```
 
- ### LiveSpeechClient
+ ## Audio Format
 
- #### Constructor Options
+ ### Input (Your Microphone)
 
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `region` | `string` | **required** | Region identifier |
- | `apiKey` | `string` | **required** | API key for authentication |
- | `connectionTimeout` | `number` | `30000` | Connection timeout in ms |
- | `autoReconnect` | `boolean` | `true` | Auto-reconnect on disconnect |
- | `maxReconnectAttempts` | `number` | `5` | Max reconnection attempts |
- | `reconnectDelay` | `number` | `1000` | Base reconnection delay in ms |
- | `debug` | `boolean` | `false` | Enable debug logging |
+ | Property | Value |
+ |----------|-------|
+ | Format | PCM16 (16-bit signed, little-endian) |
+ | Sample Rate | 16,000 Hz |
+ | Channels | 1 (Mono) |
+ | Chunk Size | ~3200 bytes (100ms) |
 
- #### Methods
+ ### Output (AI Response)
 
- | Method | Description |
- |--------|-------------|
- | `connect()` | Connect to the server |
- | `disconnect()` | Disconnect from the server |
- | `startSession(config)` | Start a conversation session |
- | `endSession()` | End the current session |
- | `sendAudio(data, options?)` | Send audio data to be transcribed |
+ | Property | Value |
+ |----------|-------|
+ | Format | PCM16 (16-bit signed, little-endian) |
+ | Sample Rate | 24,000 Hz |
+ | Channels | 1 (Mono) |
 
- #### Event Handlers
+ ## Browser Example
 
  ```typescript
- // Simple handlers
- client.setUserTranscriptHandler((text) => {}); // User's speech transcription
- client.setTranscriptHandler((text, isFinal) => {}); // AI's speech transcription (live mode)
- client.setResponseHandler((text, isFinal) => {}); // AI text response (composed mode)
- client.setAudioHandler((audioData) => {});
- client.setErrorHandler((error) => {});
-
- // Full event API
- client.on('connected', (event) => {});
- client.on('disconnected', (event) => {});
- client.on('sessionStarted', (event) => {});
- client.on('sessionEnded', (event) => {});
- client.on('userTranscript', (event) => {}); // User's speech transcription
- client.on('transcript', (event) => {}); // AI's speech transcription
- client.on('response', (event) => {}); // AI text response
- client.on('audio', (event) => {});
- client.on('error', (event) => {});
- client.on('reconnecting', (event) => {});
- client.on('ready', (event) => {}); // Gemini Live ready (live mode)
- client.on('turnComplete', (event) => {}); // AI finished speaking (live mode)
- ```
+ import { LiveSpeechClient, float32ToInt16, int16ToUint8 } from '@drawdream/livespeech';
 
- ### SessionConfig
+ const client = new LiveSpeechClient({
+   region: 'ap-northeast-2',
+   apiKey: 'your-api-key',
+ });
 
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `prePrompt` | `string` | - | System prompt for the AI |
- | `language` | `string` | `'en-US'` | Language code for speech (e.g., "ko-KR") |
- | `pipelineMode` | `'live' \| 'composed'` | `'live'` | Audio processing mode |
+ // Handlers
+ client.setUserTranscriptHandler((text) => console.log('You:', text));
+ client.setResponseHandler((text) => console.log('AI:', text));
+ client.setAudioHandler((data) => playAudioChunk(data));
 
- ## Audio Utilities
+ // Connect
+ await client.connect();
+ await client.startSession({ prePrompt: 'You are a helpful assistant.' });
 
- The SDK includes audio encoding/decoding utilities:
+ // Capture microphone
+ const stream = await navigator.mediaDevices.getUserMedia({
+   audio: { sampleRate: 16000, channelCount: 1 }
+ });
+
+ const audioContext = new AudioContext({ sampleRate: 16000 });
+ const source = audioContext.createMediaStreamSource(stream);
+ const processor = audioContext.createScriptProcessor(4096, 1, 1);
+
+ processor.onaudioprocess = (e) => {
+   const float32 = e.inputBuffer.getChannelData(0);
+   const int16 = float32ToInt16(float32);
+   const pcm = int16ToUint8(int16);
+   client.sendAudioChunk(pcm);
+ };
+
+ source.connect(processor);
+ processor.connect(audioContext.destination);
+
+ // Start streaming
+ client.audioStart();
+
+ // Stop later
+ client.audioEnd();
+ stream.getTracks().forEach(track => track.stop());
+ ```
+
+ ## Audio Utilities
 
  ```typescript
  import {
-   encodeAudioToBase64,
-   decodeBase64ToAudio,
-   float32ToInt16,
-   int16ToFloat32,
-   wrapPcmInWav,
+   float32ToInt16, // Web Audio Float32 → PCM16
+   int16ToFloat32, // PCM16 → Float32
+   int16ToUint8,   // Int16Array → Uint8Array
+   uint8ToInt16,   // Uint8Array → Int16Array
+   wrapPcmInWav,   // Create WAV file
+   AudioEncoder,   // Base64 encoding/decoding
  } from '@drawdream/livespeech';
 
- // Convert Float32 audio samples to PCM16
- const pcmData = float32ToInt16(float32Samples);
+ // Convert Web Audio to PCM16 for sending
+ const float32 = audioBuffer.getChannelData(0);
+ const int16 = float32ToInt16(float32);
+ const pcmBytes = int16ToUint8(int16);
+ client.sendAudioChunk(pcmBytes);
 
- // Create WAV file from PCM data
- const wavFile = wrapPcmInWav(pcmData, { sampleRate: 16000 });
+ // Convert received PCM16 to Web Audio
+ const receivedInt16 = uint8ToInt16(audioEvent.data);
+ const float32Data = int16ToFloat32(receivedInt16);
  ```
 
- ## Browser Usage
+ ## Error Handling
 
- The SDK works in both Node.js and browser environments:
+ ```typescript
+ client.on('error', (event) => {
+   switch (event.code) {
+     case 'authentication_failed':
+       console.error('Invalid API key');
+       break;
+     case 'connection_timeout':
+       console.error('Connection timed out');
+       break;
+     case 'rate_limit':
+       console.error('Rate limit exceeded');
+       break;
+     default:
+       console.error(`Error: ${event.message}`);
+   }
+ });
 
- ```html
- <script type="module">
-   import { LiveSpeechClient } from '@drawdream/livespeech';
+ client.on('disconnected', (event) => {
+   if (event.reason === 'error') {
+     console.log('Will auto-reconnect...');
+   }
+ });
 
-   // Use the Web Audio API to capture microphone
-   const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-   const audioContext = new AudioContext({ sampleRate: 16000 });
-   // ... process audio and send to client
- </script>
+ client.on('reconnecting', (event) => {
+   console.log(`Reconnecting ${event.attempt}/${event.maxAttempts}`);
+ });
+ ```
+
+ ## Client Properties
+
+ | Property | Type | Description |
+ |----------|------|-------------|
+ | `isConnected` | `boolean` | Connection status |
+ | `hasActiveSession` | `boolean` | Session status |
+ | `isAudioStreaming` | `boolean` | Streaming status |
+ | `connectionId` | `string \| null` | Current connection ID |
+ | `currentSessionId` | `string \| null` | Current session ID |
+
+ ## Regions
+
+ | Region | Code | Location |
+ |--------|------|----------|
+ | Asia Pacific (Seoul) | `ap-northeast-2` | Korea |
+
+ ## TypeScript Types
+
+ ```typescript
+ import type {
+   LiveSpeechConfig,
+   SessionConfig,
+   LiveSpeechEvent,
+   ConnectedEvent,
+   DisconnectedEvent,
+   SessionStartedEvent,
+   ReadyEvent,
+   UserTranscriptEvent,
+   ResponseEvent,
+   AudioEvent,
+   TurnCompleteEvent,
+   ErrorEvent,
+   ErrorCode,
+ } from '@drawdream/livespeech';
  ```
 
  ## License
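
> Note: the README's examples call `playAudio` / `playAudioChunk` without defining them. Below is a minimal sketch of such a helper (not part of the SDK), assuming the documented output format (PCM16, 24,000 Hz, mono) and the `uint8ToInt16` / `int16ToFloat32` utilities from the Audio Utilities section. The same arithmetic explains the input table's chunk size: 16,000 samples/s × 2 bytes/sample × 0.1 s = 3,200 bytes per 100 ms chunk.

```typescript
// Hypothetical playback helper, not part of the SDK.
// Assumes int16ToFloat32 returns a Float32Array of samples in [-1, 1].
import { uint8ToInt16, int16ToFloat32 } from '@drawdream/livespeech';

const playbackCtx = new AudioContext({ sampleRate: 24000 });
let playbackTime = 0; // where the next chunk should start

function playAudio(data: Uint8Array): void {
  // PCM16 bytes → Float32 samples
  const float32 = int16ToFloat32(uint8ToInt16(data));
  const buffer = playbackCtx.createBuffer(1, float32.length, 24000);
  buffer.copyToChannel(float32, 0);

  const source = playbackCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(playbackCtx.destination);

  // Schedule chunks back-to-back so streamed audio plays without gaps.
  playbackTime = Math.max(playbackTime, playbackCtx.currentTime);
  source.start(playbackTime);
  playbackTime += buffer.duration;
}
```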
package/dist/index.d.mts CHANGED
@@ -23,8 +23,8 @@ declare function isValidRegion(value: string): value is Region;
 
  /**
   * Pipeline mode for audio processing
-  * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
-  * - 'composed': Uses separate STT + LLM + TTS services
+  * - 'live': Direct audio-to-audio conversation (default, lower latency)
+  * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
   */
  type PipelineMode = 'live' | 'composed';
  /**
@@ -88,11 +88,18 @@ interface SessionConfig {
      language?: string;
      /**
       * Pipeline mode for audio processing
-      * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+      * - 'live': Direct audio-to-audio conversation (default, lower latency)
       * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
       * @default "live"
       */
      pipelineMode?: PipelineMode;
+     /**
+      * Enable AI to speak first before user input (live mode only)
+      * When enabled, the AI will initiate the conversation based on the prePrompt.
+      * Make sure your prePrompt includes instructions for how the AI should greet the user.
+      * @default false
+      */
+     aiSpeaksFirst?: boolean;
  }
  /**
   * Internal resolved configuration with defaults applied
@@ -353,7 +360,7 @@ interface ServerTurnCompleteMessage extends BaseServerMessage {
  }
  /**
   * Ready message from server
-  * Indicates the Gemini Live session is ready for audio input
+  * Indicates the session is ready for audio input
   */
  interface ServerReadyMessage extends BaseServerMessage {
      type: 'ready';
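
> Note: the user-visible change in 0.1.3 is the `aiSpeaksFirst` flag added to `SessionConfig` above. A minimal usage sketch, combining that declaration with the client setup from the README's Quick Start (`playAudio` is the hypothetical helper sketched earlier):

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';

// Hypothetical playback helper from the earlier sketch.
declare function playAudio(data: Uint8Array): void;

const client = new LiveSpeechClient({
  region: 'ap-northeast-2',
  apiKey: 'your-api-key',
});

client.setAudioHandler((data) => playAudio(data)); // PCM16 @ 24kHz

await client.connect();
await client.startSession({
  // Per the doc comment above, greeting instructions belong in prePrompt.
  prePrompt: 'You are a helpful assistant. Greet the user warmly.',
  pipelineMode: 'live', // aiSpeaksFirst is live-mode only
  aiSpeaksFirst: true,  // the AI opens the conversation before any user audio
});
```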
package/dist/index.d.ts CHANGED
@@ -23,8 +23,8 @@ declare function isValidRegion(value: string): value is Region;
 
  /**
   * Pipeline mode for audio processing
-  * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
-  * - 'composed': Uses separate STT + LLM + TTS services
+  * - 'live': Direct audio-to-audio conversation (default, lower latency)
+  * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
   */
  type PipelineMode = 'live' | 'composed';
  /**
@@ -88,11 +88,18 @@ interface SessionConfig {
      language?: string;
      /**
       * Pipeline mode for audio processing
-      * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+      * - 'live': Direct audio-to-audio conversation (default, lower latency)
       * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
       * @default "live"
       */
      pipelineMode?: PipelineMode;
+     /**
+      * Enable AI to speak first before user input (live mode only)
+      * When enabled, the AI will initiate the conversation based on the prePrompt.
+      * Make sure your prePrompt includes instructions for how the AI should greet the user.
+      * @default false
+      */
+     aiSpeaksFirst?: boolean;
  }
  /**
   * Internal resolved configuration with defaults applied
@@ -353,7 +360,7 @@ interface ServerTurnCompleteMessage extends BaseServerMessage {
  }
  /**
   * Ready message from server
-  * Indicates the Gemini Live session is ready for audio input
+  * Indicates the session is ready for audio input
   */
  interface ServerReadyMessage extends BaseServerMessage {
      type: 'ready';
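
> Note: both declaration files export the same types, so the per-event payload types listed in the README's TypeScript Types section can annotate handlers explicitly. A sketch, assuming the `client.on` callbacks line up with these exported event types:

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';
import type { ResponseEvent, AudioEvent, ErrorEvent } from '@drawdream/livespeech';

declare const client: LiveSpeechClient;             // as constructed in earlier sketches
declare function playAudio(data: Uint8Array): void; // hypothetical helper from above

client.on('response', (event: ResponseEvent) => {
  // text/isFinal per the `response` row of the Events table
  if (event.isFinal) console.log('AI (final):', event.text);
});

client.on('audio', (event: AudioEvent) => {
  // event.data: Uint8Array (PCM16), event.sampleRate: 24000
  playAudio(event.data);
});

client.on('error', (event: ErrorEvent) => {
  console.error(`[${event.code}] ${event.message}`);
});
```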
package/dist/index.js CHANGED
@@ -734,6 +734,9 @@ var LiveSpeechClient = class {
        startMessage.language = config.language;
      }
      startMessage.pipelineMode = config?.pipelineMode ?? "live";
+     if (config?.aiSpeaksFirst) {
+       startMessage.aiSpeaksFirst = config.aiSpeaksFirst;
+     }
      this.connection.send(startMessage);
    });
  }
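
> Note: the flag is attached to the start message only when truthy, so `aiSpeaksFirst: false` never appears on the wire, which presumably keeps the message compatible with servers that predate the field. A hypothetical reconstruction of the message shape from the fields visible in this hunk (the ESM build below carries the identical change):

```typescript
// Hypothetical: reconstructed from the fields visible in this hunk;
// the SDK's actual internal type may differ.
interface StartSessionMessage {
  type: 'startSession';              // assumed discriminant, not shown in the hunk
  prePrompt?: string;                // set elsewhere from SessionConfig
  language?: string;                 // only set when config.language is given
  pipelineMode: 'live' | 'composed'; // defaults to 'live'
  aiSpeaksFirst?: boolean;           // omitted unless true
}
```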
package/dist/index.mjs CHANGED
@@ -695,6 +695,9 @@ var LiveSpeechClient = class {
        startMessage.language = config.language;
      }
      startMessage.pipelineMode = config?.pipelineMode ?? "live";
+     if (config?.aiSpeaksFirst) {
+       startMessage.aiSpeaksFirst = config.aiSpeaksFirst;
+     }
      this.connection.send(startMessage);
    });
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@drawdream/livespeech",
-   "version": "0.1.2",
+   "version": "0.1.3",
    "description": "Real-time speech-to-speech AI conversation SDK",
    "main": "dist/index.js",
    "module": "dist/index.mjs",