@volley/recognition-client-sdk 0.1.200 → 0.1.210

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,20 +11,19 @@ npm install @volley/recognition-client-sdk
11
11
  ## Quick Start
12
12
 
13
13
  ```typescript
14
- import { createClientWithBuilder } from '@volley/recognition-client-sdk';
14
+ import { createClientWithBuilder, RecognitionProvider, DeepgramModel } from '@volley/recognition-client-sdk';
15
15
 
16
- // Create client with builder pattern
16
+ // Create client with builder pattern (recommended)
17
17
  const client = createClientWithBuilder(builder =>
18
18
  builder
19
19
  .url('ws://localhost:3101/ws/v1/recognize')
20
- .provider('deepgram')
21
- .model('nova-2')
20
+ .provider(RecognitionProvider.DEEPGRAM)
21
+ .model(DeepgramModel.NOVA_2)
22
22
  .onTranscript(result => {
23
23
  console.log('Final:', result.finalTranscript);
24
24
  console.log('Interim:', result.pendingTranscript);
25
25
  })
26
26
  .onError(error => console.error(error))
27
- .build()
28
27
  );
29
28
 
30
29
  // Stream audio
@@ -33,17 +32,41 @@ client.sendAudio(pcm16AudioChunk); // Call repeatedly with audio chunks
33
32
  await client.stopRecording(); // Wait for final transcript
34
33
  ```
35
34
 
35
+ ### Alternative: Direct Client Creation
36
+
37
+ ```typescript
38
+ import {
39
+ RealTimeTwoWayWebSocketRecognitionClient,
40
+ RecognitionProvider,
41
+ DeepgramModel,
42
+ Language
43
+ } from '@volley/recognition-client-sdk';
44
+
45
+ const client = new RealTimeTwoWayWebSocketRecognitionClient({
46
+ url: 'ws://localhost:3101/ws/v1/recognize',
47
+ asrRequestConfig: {
48
+ provider: RecognitionProvider.DEEPGRAM,
49
+ model: DeepgramModel.NOVA_2,
50
+ language: Language.ENGLISH_US
51
+ },
52
+ onTranscript: (result) => console.log(result),
53
+ onError: (error) => console.error(error)
54
+ });
55
+ ```
56
+
36
57
  ## Configuration
37
58
 
38
59
  ### Basic Setup
39
60
 
40
61
  ```typescript
62
+ import { RecognitionProvider, DeepgramModel, Language } from '@volley/recognition-client-sdk';
63
+
41
64
  builder
42
65
  .url('ws://localhost:3101/ws/v1/recognize')
43
- .provider('deepgram') // deepgram, google, assemblyai
44
- .model('nova-2') // Provider-specific model
45
- .language('en') // Language code
46
- .interimResults(true) // Enable partial transcripts
66
+ .provider(RecognitionProvider.DEEPGRAM) // DEEPGRAM, GOOGLE
67
+ .model(DeepgramModel.NOVA_2) // Provider-specific model enum
68
+ .language(Language.ENGLISH_US) // Language enum
69
+ .interimResults(true) // Enable partial transcripts
47
70
  ```
48
71
 
49
72
  ### Event Handlers
@@ -87,21 +110,53 @@ client.isConnected(); // Check connection status
87
110
 
88
111
  ```typescript
89
112
  {
90
- finalTranscript?: string; // Confirmed text
91
- pendingTranscript?: string; // Yet to confirm text (can change by ASR vendors)
92
- is_finished?: boolean; // Transcription complete (last one)
93
- voiceStart?: number; // Voice activity start (ms)
94
- voiceDuration?: number; // Voice duration (ms)
113
+ type: 'Transcription'; // Message type discriminator
114
+ audioUtteranceId: string; // Session UUID
115
+ finalTranscript: string; // Confirmed text (won't change)
116
+ finalTranscriptConfidence?: number; // Confidence 0-1 for final transcript
117
+ pendingTranscript?: string; // In-progress text (may change)
118
+ pendingTranscriptConfidence?: number; // Confidence 0-1 for pending transcript
119
+ is_finished: boolean; // Transcription complete (last message)
120
+ voiceStart?: number; // Voice activity start time (ms from stream start)
121
+ voiceDuration?: number; // Voice duration (ms)
122
+ voiceEnd?: number; // Voice activity end time (ms from stream start)
123
+ startTimestamp?: number; // Transcription start timestamp (ms)
124
+ endTimestamp?: number; // Transcription end timestamp (ms)
125
+ receivedAtMs?: number; // Server receive timestamp (ms since epoch)
126
+ accumulatedAudioTimeMs?: number; // Total audio duration sent (ms)
95
127
  }
96
128
  ```
97
129
 
98
130
  ## Providers
99
131
 
100
132
  ### Deepgram
133
+
101
134
  ```typescript
102
- builder.provider('deepgram').model('nova-2') // or 'nova-2-general'
135
+ import { RecognitionProvider, DeepgramModel } from '@volley/recognition-client-sdk';
136
+
137
+ builder
138
+ .provider(RecognitionProvider.DEEPGRAM)
139
+ .model(DeepgramModel.NOVA_2); // NOVA_2, NOVA_3, FLUX_GENERAL_EN
140
+ ```
141
+
142
+ ### Google Cloud Speech-to-Text
143
+
144
+ ```typescript
145
+ import { RecognitionProvider, GoogleModel } from '@volley/recognition-client-sdk';
146
+
147
+ builder
148
+ .provider(RecognitionProvider.GOOGLE)
149
+ .model(GoogleModel.LATEST_SHORT); // LATEST_SHORT, LATEST_LONG, TELEPHONY, etc.
103
150
  ```
104
151
 
152
+ Available Google models:
153
+ - `LATEST_SHORT` - Optimized for short audio (< 1 minute)
154
+ - `LATEST_LONG` - Optimized for long audio (> 1 minute)
155
+ - `TELEPHONY` - Optimized for phone audio
156
+ - `TELEPHONY_SHORT` - Short telephony audio
157
+ - `MEDICAL_DICTATION` - Medical dictation (premium)
158
+ - `MEDICAL_CONVERSATION` - Medical conversations (premium)
159
+
105
160
 
106
161
  ## Audio Format
107
162
 
@@ -128,6 +183,42 @@ builder.onDisconnected((code, reason) => {
128
183
  });
129
184
  ```
130
185
 
186
+ ## Troubleshooting
187
+
188
+ ### Connection Issues
189
+
190
+ **WebSocket fails to connect**
191
+ - Verify the recognition service is running
192
+ - Check the WebSocket URL format: `ws://` or `wss://`
193
+ - Ensure network allows WebSocket connections
194
+
195
+ **Authentication errors**
196
+ - Verify `audioUtteranceId` is provided
197
+ - Check if service requires additional auth headers
198
+
199
+ ### Audio Issues
200
+
201
+ **No transcription results**
202
+ - Confirm audio format is PCM16, 16kHz, mono
203
+ - Check if audio chunks are being sent (use `onAudioSent` callback)
204
+ - Verify audio data is not empty or corrupted
205
+
206
+ **Poor transcription quality**
207
+ - Try different models (e.g., `NOVA_2` vs `NOVA_3`)
208
+ - Adjust language setting to match audio
209
+ - Ensure audio sample rate matches configuration
210
+
211
+ ### Performance Issues
212
+
213
+ **High latency**
214
+ - Use smaller audio chunks (e.g., 100ms instead of 500ms)
215
+ - Choose a model optimized for real-time (e.g., Deepgram Nova 2)
216
+ - Check network latency to service
217
+
218
+ **Memory issues**
219
+ - Call `disconnect()` when done to clean up resources
220
+ - Avoid keeping multiple client instances active
221
+
131
222
  ## Publishing
132
223
 
133
224
  This package uses automated publishing via semantic-release with npm Trusted Publishers (OIDC).
@@ -163,6 +254,15 @@ pnpm build
163
254
  npm publish --provenance --access public
164
255
  ```
165
256
 
257
+ ## Contributing
258
+
259
+ This SDK is part of the Recognition Service monorepo. To contribute:
260
+
261
+ 1. Make changes to SDK or libs
262
+ 2. Test locally with `pnpm test`
263
+ 3. Create PR to `dev` branch with conventional commit messages (`feat:`, `fix:`, etc.)
264
+ 4. After merge, automated workflow will publish new version to npm
265
+
166
266
  ## License
167
267
 
168
268
  Proprietary
@@ -851,9 +851,9 @@ interface IRecognitionClient {
851
851
  /**
852
852
  * Send audio data to the recognition service
853
853
  * Audio is buffered locally and sent when connection is ready.
854
- * @param audioData - PCM audio data as ArrayBuffer or typed array view
854
+ * @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
855
855
  */
856
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
856
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
857
857
  /**
858
858
  * Stop recording and wait for final transcript
859
859
  * The server will close the connection after sending the final transcript.
@@ -1006,7 +1006,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1006
1006
  */
1007
1007
  private cleanup;
1008
1008
  connect(): Promise<void>;
1009
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
1009
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
1010
+ private sendAudioInternal;
1010
1011
  stopRecording(): Promise<void>;
1011
1012
  getAudioUtteranceId(): string;
1012
1013
  getState(): ClientState;
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { A as ASRRequestConfig, G as GameContextV1, R as RecognitionCallbackUrl, T as TranscriptionResultV1, M as MetadataResultV1, E as ErrorResultV1, a as RealTimeTwoWayWebSocketRecognitionClientConfig, I as IRecognitionClient, C as ClientState, b as IRecognitionClientConfig } from './browser-CDQ_TzeH.js';
2
- export { k as ASRRequestV1, f as AudioEncoding, h as ControlSignal, h as ControlSignalTypeV1, D as DeepgramModel, F as FunctionCallResultV1, m as GoogleModel, e as IRecognitionClientStats, L as Language, c as RealTimeTwoWayWebSocketRecognitionClient, g as RecognitionContextTypeV1, l as RecognitionProvider, j as RecognitionResultTypeV1, S as SampleRate, d as TranscriptionResult, i as isNormalDisconnection } from './browser-CDQ_TzeH.js';
1
+ import { A as ASRRequestConfig, G as GameContextV1, R as RecognitionCallbackUrl, T as TranscriptionResultV1, M as MetadataResultV1, E as ErrorResultV1, a as RealTimeTwoWayWebSocketRecognitionClientConfig, I as IRecognitionClient, C as ClientState, b as IRecognitionClientConfig } from './browser-C4ZssGoU.js';
2
+ export { k as ASRRequestV1, f as AudioEncoding, h as ControlSignal, h as ControlSignalTypeV1, D as DeepgramModel, F as FunctionCallResultV1, m as GoogleModel, e as IRecognitionClientStats, L as Language, c as RealTimeTwoWayWebSocketRecognitionClient, g as RecognitionContextTypeV1, l as RecognitionProvider, j as RecognitionResultTypeV1, S as SampleRate, d as TranscriptionResult, i as isNormalDisconnection } from './browser-C4ZssGoU.js';
3
3
  import { z } from 'zod';
4
4
 
5
5
  /**
@@ -169,9 +169,13 @@ declare class ConfigBuilder {
169
169
  * ```typescript
170
170
  * const client = createClient({
171
171
  * url: 'ws://localhost:3101/ws/v1/recognize',
172
+ * audioUtteranceId: 'unique-id',
172
173
  * onTranscript: (result) => console.log(result)
173
174
  * });
174
175
  * ```
176
+ *
177
+ * @param config - Client configuration
178
+ * @returns Configured recognition client instance
175
179
  */
176
180
  declare function createClient(config: RealTimeTwoWayWebSocketRecognitionClientConfig): IRecognitionClient;
177
181
  /**
@@ -308,9 +312,9 @@ interface ISimplifiedVGFRecognitionClient {
308
312
  connect(): Promise<void>;
309
313
  /**
310
314
  * Send audio data for transcription
311
- * @param audioData - PCM audio data as ArrayBuffer or typed array
315
+ * @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
312
316
  */
313
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
317
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
314
318
  /**
315
319
  * Stop recording and wait for final transcription
316
320
  * @returns Promise that resolves when transcription is complete
@@ -361,7 +365,7 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
361
365
  private stateChangeCallback;
362
366
  constructor(config: SimplifiedVGFClientConfig);
363
367
  connect(): Promise<void>;
364
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
368
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
365
369
  stopRecording(): Promise<void>;
366
370
  getAudioUtteranceId(): string;
367
371
  getState(): ClientState;
package/dist/index.js CHANGED
@@ -302,13 +302,7 @@ var TimerSchema = z.object({
302
302
  * Total duration of all audio chunks sent to this provider session
303
303
  * @example 2500 (2.5 seconds of audio has been sent)
304
304
  */
305
- accumulatedAudioTimeMs: z.number().optional(),
306
- /**
307
- * Estimated cost in USD for this session
308
- * Calculated by the job based on audio duration and provider pricing
309
- * @example 0.0025 (quarter of a cent)
310
- */
311
- costInUSD: z.number().optional().default(0)
305
+ accumulatedAudioTimeMs: z.number().optional()
312
306
  });
313
307
  var RawMessageSchema = z.object({
314
308
  type: z.literal(ProviderMessageType.RAW),
@@ -1477,6 +1471,18 @@ function isNormalDisconnection(code) {
1477
1471
  return code === 1e3;
1478
1472
  }
1479
1473
  __name(isNormalDisconnection, "isNormalDisconnection");
1474
+ async function blobToArrayBuffer(blob) {
1475
+ if (typeof blob.arrayBuffer === "function") {
1476
+ return await blob.arrayBuffer();
1477
+ }
1478
+ return new Promise((resolve, reject) => {
1479
+ const reader = new FileReader();
1480
+ reader.onload = () => resolve(reader.result);
1481
+ reader.onerror = () => reject(reader.error);
1482
+ reader.readAsArrayBuffer(blob);
1483
+ });
1484
+ }
1485
+ __name(blobToArrayBuffer, "blobToArrayBuffer");
1480
1486
  var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient {
1481
1487
  static {
1482
1488
  __name(this, "RealTimeTwoWayWebSocketRecognitionClient");
@@ -1666,6 +1672,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
1666
1672
  return this.connectionPromise;
1667
1673
  }
1668
1674
  sendAudio(audioData) {
1675
+ if (audioData instanceof Blob) {
1676
+ blobToArrayBuffer(audioData).then((arrayBuffer) => {
1677
+ this.sendAudioInternal(arrayBuffer);
1678
+ }).catch((error) => {
1679
+ this.log("error", "Failed to convert Blob to ArrayBuffer", error);
1680
+ });
1681
+ return;
1682
+ }
1683
+ this.sendAudioInternal(audioData);
1684
+ }
1685
+ sendAudioInternal(audioData) {
1669
1686
  const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
1670
1687
  if (bytes === 0) return;
1671
1688
  this.audioBuffer.write(audioData);