@volley/recognition-client-sdk 0.1.200 → 0.1.210

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,20 +11,19 @@ npm install @volley/recognition-client-sdk
11
11
  ## Quick Start
12
12
 
13
13
  ```typescript
14
- import { createClientWithBuilder } from '@volley/recognition-client-sdk';
14
+ import { createClientWithBuilder, RecognitionProvider, DeepgramModel } from '@volley/recognition-client-sdk';
15
15
 
16
- // Create client with builder pattern
16
+ // Create client with builder pattern (recommended)
17
17
  const client = createClientWithBuilder(builder =>
18
18
  builder
19
19
  .url('ws://localhost:3101/ws/v1/recognize')
20
- .provider('deepgram')
21
- .model('nova-2')
20
+ .provider(RecognitionProvider.DEEPGRAM)
21
+ .model(DeepgramModel.NOVA_2)
22
22
  .onTranscript(result => {
23
23
  console.log('Final:', result.finalTranscript);
24
24
  console.log('Interim:', result.pendingTranscript);
25
25
  })
26
26
  .onError(error => console.error(error))
27
- .build()
28
27
  );
29
28
 
30
29
  // Stream audio
@@ -33,17 +32,41 @@ client.sendAudio(pcm16AudioChunk); // Call repeatedly with audio chunks
33
32
  await client.stopRecording(); // Wait for final transcript
34
33
  ```
35
34
 
35
+ ### Alternative: Direct Client Creation
36
+
37
+ ```typescript
38
+ import {
39
+ RealTimeTwoWayWebSocketRecognitionClient,
40
+ RecognitionProvider,
41
+ DeepgramModel,
42
+ Language
43
+ } from '@volley/recognition-client-sdk';
44
+
45
+ const client = new RealTimeTwoWayWebSocketRecognitionClient({
46
+ url: 'ws://localhost:3101/ws/v1/recognize',
47
+ asrRequestConfig: {
48
+ provider: RecognitionProvider.DEEPGRAM,
49
+ model: DeepgramModel.NOVA_2,
50
+ language: Language.ENGLISH_US
51
+ },
52
+ onTranscript: (result) => console.log(result),
53
+ onError: (error) => console.error(error)
54
+ });
55
+ ```
56
+
36
57
  ## Configuration
37
58
 
38
59
  ### Basic Setup
39
60
 
40
61
  ```typescript
62
+ import { RecognitionProvider, DeepgramModel, Language } from '@volley/recognition-client-sdk';
63
+
41
64
  builder
42
65
  .url('ws://localhost:3101/ws/v1/recognize')
43
- .provider('deepgram') // deepgram, google, assemblyai
44
- .model('nova-2') // Provider-specific model
45
- .language('en') // Language code
46
- .interimResults(true) // Enable partial transcripts
66
+ .provider(RecognitionProvider.DEEPGRAM) // DEEPGRAM, GOOGLE
67
+ .model(DeepgramModel.NOVA_2) // Provider-specific model enum
68
+ .language(Language.ENGLISH_US) // Language enum
69
+ .interimResults(true) // Enable partial transcripts
47
70
  ```
48
71
 
49
72
  ### Event Handlers
@@ -87,21 +110,53 @@ client.isConnected(); // Check connection status
87
110
 
88
111
  ```typescript
89
112
  {
90
- finalTranscript?: string; // Confirmed text
91
- pendingTranscript?: string; // Yet to confirm text (can change by ASR vendors)
92
- is_finished?: boolean; // Transcription complete (last one)
93
- voiceStart?: number; // Voice activity start (ms)
94
- voiceDuration?: number; // Voice duration (ms)
113
+ type: 'Transcription'; // Message type discriminator
114
+ audioUtteranceId: string; // Session UUID
115
+ finalTranscript: string; // Confirmed text (won't change)
116
+ finalTranscriptConfidence?: number; // Confidence 0-1 for final transcript
117
+ pendingTranscript?: string; // In-progress text (may change)
118
+ pendingTranscriptConfidence?: number; // Confidence 0-1 for pending transcript
119
+ is_finished: boolean; // Transcription complete (last message)
120
+ voiceStart?: number; // Voice activity start time (ms from stream start)
121
+ voiceDuration?: number; // Voice duration (ms)
122
+ voiceEnd?: number; // Voice activity end time (ms from stream start)
123
+ startTimestamp?: number; // Transcription start timestamp (ms)
124
+ endTimestamp?: number; // Transcription end timestamp (ms)
125
+ receivedAtMs?: number; // Server receive timestamp (ms since epoch)
126
+ accumulatedAudioTimeMs?: number; // Total audio duration sent (ms)
95
127
  }
96
128
  ```
97
129
 
98
130
  ## Providers
99
131
 
100
132
  ### Deepgram
133
+
101
134
  ```typescript
102
- builder.provider('deepgram').model('nova-2') // or 'nova-2-general'
135
+ import { RecognitionProvider, DeepgramModel } from '@volley/recognition-client-sdk';
136
+
137
+ builder
138
+ .provider(RecognitionProvider.DEEPGRAM)
139
+ .model(DeepgramModel.NOVA_2); // NOVA_2, NOVA_3, FLUX_GENERAL_EN
140
+ ```
141
+
142
+ ### Google Cloud Speech-to-Text
143
+
144
+ ```typescript
145
+ import { RecognitionProvider, GoogleModel } from '@volley/recognition-client-sdk';
146
+
147
+ builder
148
+ .provider(RecognitionProvider.GOOGLE)
149
+ .model(GoogleModel.LATEST_SHORT); // LATEST_SHORT, LATEST_LONG, TELEPHONY, etc.
103
150
  ```
104
151
 
152
+ Available Google models:
153
+ - `LATEST_SHORT` - Optimized for short audio (< 1 minute)
154
+ - `LATEST_LONG` - Optimized for long audio (> 1 minute)
155
+ - `TELEPHONY` - Optimized for phone audio
156
+ - `TELEPHONY_SHORT` - Short telephony audio
157
+ - `MEDICAL_DICTATION` - Medical dictation (premium)
158
+ - `MEDICAL_CONVERSATION` - Medical conversations (premium)
159
+
105
160
 
106
161
  ## Audio Format
107
162
 
@@ -128,6 +183,42 @@ builder.onDisconnected((code, reason) => {
128
183
  });
129
184
  ```
130
185
 
186
+ ## Troubleshooting
187
+
188
+ ### Connection Issues
189
+
190
+ **WebSocket fails to connect**
191
+ - Verify the recognition service is running
192
+ - Check the WebSocket URL format: `ws://` or `wss://`
193
+ - Ensure network allows WebSocket connections
194
+
195
+ **Authentication errors**
196
+ - Verify `audioUtteranceId` is provided
197
+ - Check if service requires additional auth headers
198
+
199
+ ### Audio Issues
200
+
201
+ **No transcription results**
202
+ - Confirm audio format is PCM16, 16kHz, mono
203
+ - Check if audio chunks are being sent (use `onAudioSent` callback)
204
+ - Verify audio data is not empty or corrupted
205
+
206
+ **Poor transcription quality**
207
+ - Try different models (e.g., `NOVA_2` vs `NOVA_3`)
208
+ - Adjust language setting to match audio
209
+ - Ensure audio sample rate matches configuration
210
+
211
+ ### Performance Issues
212
+
213
+ **High latency**
214
+ - Use smaller audio chunks (e.g., 100ms instead of 500ms)
215
+ - Choose a model optimized for real-time (e.g., Deepgram Nova 2)
216
+ - Check network latency to service
217
+
218
+ **Memory issues**
219
+ - Call `disconnect()` when done to clean up resources
220
+ - Avoid keeping multiple client instances active
221
+
131
222
  ## Publishing
132
223
 
133
224
  This package uses automated publishing via semantic-release with npm Trusted Publishers (OIDC).
@@ -163,6 +254,15 @@ pnpm build
163
254
  npm publish --provenance --access public
164
255
  ```
165
256
 
257
+ ## Contributing
258
+
259
+ This SDK is part of the Recognition Service monorepo. To contribute:
260
+
261
+ 1. Make changes to SDK or libs
262
+ 2. Test locally with `pnpm test`
263
+ 3. Create PR to `dev` branch with conventional commit messages (`feat:`, `fix:`, etc.)
264
+ 4. After merge, automated workflow will publish new version to npm
265
+
166
266
  ## License
167
267
 
168
268
  Proprietary
@@ -851,9 +851,9 @@ interface IRecognitionClient {
851
851
  /**
852
852
  * Send audio data to the recognition service
853
853
  * Audio is buffered locally and sent when connection is ready.
854
- * @param audioData - PCM audio data as ArrayBuffer or typed array view
854
+ * @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
855
855
  */
856
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
856
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
857
857
  /**
858
858
  * Stop recording and wait for final transcript
859
859
  * The server will close the connection after sending the final transcript.
@@ -1006,7 +1006,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1006
1006
  */
1007
1007
  private cleanup;
1008
1008
  connect(): Promise<void>;
1009
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
1009
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
1010
+ private sendAudioInternal;
1010
1011
  stopRecording(): Promise<void>;
1011
1012
  getAudioUtteranceId(): string;
1012
1013
  getState(): ClientState;
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { A as ASRRequestConfig, G as GameContextV1, R as RecognitionCallbackUrl, T as TranscriptionResultV1, M as MetadataResultV1, E as ErrorResultV1, a as RealTimeTwoWayWebSocketRecognitionClientConfig, I as IRecognitionClient, C as ClientState, b as IRecognitionClientConfig } from './browser-CDQ_TzeH.js';
2
- export { k as ASRRequestV1, f as AudioEncoding, h as ControlSignal, h as ControlSignalTypeV1, D as DeepgramModel, F as FunctionCallResultV1, m as GoogleModel, e as IRecognitionClientStats, L as Language, c as RealTimeTwoWayWebSocketRecognitionClient, g as RecognitionContextTypeV1, l as RecognitionProvider, j as RecognitionResultTypeV1, S as SampleRate, d as TranscriptionResult, i as isNormalDisconnection } from './browser-CDQ_TzeH.js';
1
+ import { A as ASRRequestConfig, G as GameContextV1, R as RecognitionCallbackUrl, T as TranscriptionResultV1, M as MetadataResultV1, E as ErrorResultV1, a as RealTimeTwoWayWebSocketRecognitionClientConfig, I as IRecognitionClient, C as ClientState, b as IRecognitionClientConfig } from './browser-C4ZssGoU.js';
2
+ export { k as ASRRequestV1, f as AudioEncoding, h as ControlSignal, h as ControlSignalTypeV1, D as DeepgramModel, F as FunctionCallResultV1, m as GoogleModel, e as IRecognitionClientStats, L as Language, c as RealTimeTwoWayWebSocketRecognitionClient, g as RecognitionContextTypeV1, l as RecognitionProvider, j as RecognitionResultTypeV1, S as SampleRate, d as TranscriptionResult, i as isNormalDisconnection } from './browser-C4ZssGoU.js';
3
3
  import { z } from 'zod';
4
4
 
5
5
  /**
@@ -169,9 +169,13 @@ declare class ConfigBuilder {
169
169
  * ```typescript
170
170
  * const client = createClient({
171
171
  * url: 'ws://localhost:3101/ws/v1/recognize',
172
+ * audioUtteranceId: 'unique-id',
172
173
  * onTranscript: (result) => console.log(result)
173
174
  * });
174
175
  * ```
176
+ *
177
+ * @param config - Client configuration
178
+ * @returns Configured recognition client instance
175
179
  */
176
180
  declare function createClient(config: RealTimeTwoWayWebSocketRecognitionClientConfig): IRecognitionClient;
177
181
  /**
@@ -308,9 +312,9 @@ interface ISimplifiedVGFRecognitionClient {
308
312
  connect(): Promise<void>;
309
313
  /**
310
314
  * Send audio data for transcription
311
- * @param audioData - PCM audio data as ArrayBuffer or typed array
315
+ * @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
312
316
  */
313
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
317
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
314
318
  /**
315
319
  * Stop recording and wait for final transcription
316
320
  * @returns Promise that resolves when transcription is complete
@@ -361,7 +365,7 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
361
365
  private stateChangeCallback;
362
366
  constructor(config: SimplifiedVGFClientConfig);
363
367
  connect(): Promise<void>;
364
- sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
368
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
365
369
  stopRecording(): Promise<void>;
366
370
  getAudioUtteranceId(): string;
367
371
  getState(): ClientState;
package/dist/index.js CHANGED
@@ -302,13 +302,7 @@ var TimerSchema = z.object({
302
302
  * Total duration of all audio chunks sent to this provider session
303
303
  * @example 2500 (2.5 seconds of audio has been sent)
304
304
  */
305
- accumulatedAudioTimeMs: z.number().optional(),
306
- /**
307
- * Estimated cost in USD for this session
308
- * Calculated by the job based on audio duration and provider pricing
309
- * @example 0.0025 (quarter of a cent)
310
- */
311
- costInUSD: z.number().optional().default(0)
305
+ accumulatedAudioTimeMs: z.number().optional()
312
306
  });
313
307
  var RawMessageSchema = z.object({
314
308
  type: z.literal(ProviderMessageType.RAW),
@@ -1477,6 +1471,18 @@ function isNormalDisconnection(code) {
1477
1471
  return code === 1e3;
1478
1472
  }
1479
1473
  __name(isNormalDisconnection, "isNormalDisconnection");
1474
+ async function blobToArrayBuffer(blob) {
1475
+ if (typeof blob.arrayBuffer === "function") {
1476
+ return await blob.arrayBuffer();
1477
+ }
1478
+ return new Promise((resolve, reject) => {
1479
+ const reader = new FileReader();
1480
+ reader.onload = () => resolve(reader.result);
1481
+ reader.onerror = () => reject(reader.error);
1482
+ reader.readAsArrayBuffer(blob);
1483
+ });
1484
+ }
1485
+ __name(blobToArrayBuffer, "blobToArrayBuffer");
1480
1486
  var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient {
1481
1487
  static {
1482
1488
  __name(this, "RealTimeTwoWayWebSocketRecognitionClient");
@@ -1666,6 +1672,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
1666
1672
  return this.connectionPromise;
1667
1673
  }
1668
1674
  sendAudio(audioData) {
1675
+ if (audioData instanceof Blob) {
1676
+ blobToArrayBuffer(audioData).then((arrayBuffer) => {
1677
+ this.sendAudioInternal(arrayBuffer);
1678
+ }).catch((error) => {
1679
+ this.log("error", "Failed to convert Blob to ArrayBuffer", error);
1680
+ });
1681
+ return;
1682
+ }
1683
+ this.sendAudioInternal(audioData);
1684
+ }
1685
+ sendAudioInternal(audioData) {
1669
1686
  const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
1670
1687
  if (bytes === 0) return;
1671
1688
  this.audioBuffer.write(audioData);