@volley/recognition-client-sdk 0.1.200 → 0.1.210
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +115 -15
- package/dist/{browser-CDQ_TzeH.d.ts → browser-C4ZssGoU.d.ts} +4 -3
- package/dist/index.d.ts +9 -5
- package/dist/index.js +24 -7
- package/dist/index.js.map +1 -1
- package/dist/recog-client-sdk.browser.d.ts +1 -1
- package/dist/recog-client-sdk.browser.js +24 -7
- package/dist/recog-client-sdk.browser.js.map +1 -1
- package/package.json +16 -16
- package/src/config-builder.spec.ts +265 -0
- package/src/factory.spec.ts +215 -0
- package/src/factory.ts +4 -0
- package/src/recognition-client.spec.ts +179 -0
- package/src/recognition-client.ts +44 -1
- package/src/recognition-client.types.ts +2 -2
- package/src/simplified-vgf-recognition-client.spec.ts +6 -0
- package/src/simplified-vgf-recognition-client.ts +3 -3
- package/src/utils/message-handler.spec.ts +311 -0
- package/src/utils/url-builder.spec.ts +203 -0
package/README.md
CHANGED
|
@@ -11,20 +11,19 @@ npm install @volley/recognition-client-sdk
|
|
|
11
11
|
## Quick Start
|
|
12
12
|
|
|
13
13
|
```typescript
|
|
14
|
-
import { createClientWithBuilder } from '@volley/recognition-client-sdk';
|
|
14
|
+
import { createClientWithBuilder, RecognitionProvider, DeepgramModel } from '@volley/recognition-client-sdk';
|
|
15
15
|
|
|
16
|
-
// Create client with builder pattern
|
|
16
|
+
// Create client with builder pattern (recommended)
|
|
17
17
|
const client = createClientWithBuilder(builder =>
|
|
18
18
|
builder
|
|
19
19
|
.url('ws://localhost:3101/ws/v1/recognize')
|
|
20
|
-
.provider(
|
|
21
|
-
.model(
|
|
20
|
+
.provider(RecognitionProvider.DEEPGRAM)
|
|
21
|
+
.model(DeepgramModel.NOVA_2)
|
|
22
22
|
.onTranscript(result => {
|
|
23
23
|
console.log('Final:', result.finalTranscript);
|
|
24
24
|
console.log('Interim:', result.pendingTranscript);
|
|
25
25
|
})
|
|
26
26
|
.onError(error => console.error(error))
|
|
27
|
-
.build()
|
|
28
27
|
);
|
|
29
28
|
|
|
30
29
|
// Stream audio
|
|
@@ -33,17 +32,41 @@ client.sendAudio(pcm16AudioChunk); // Call repeatedly with audio chunks
|
|
|
33
32
|
await client.stopRecording(); // Wait for final transcript
|
|
34
33
|
```
|
|
35
34
|
|
|
35
|
+
### Alternative: Direct Client Creation
|
|
36
|
+
|
|
37
|
+
```typescript
|
|
38
|
+
import {
|
|
39
|
+
RealTimeTwoWayWebSocketRecognitionClient,
|
|
40
|
+
RecognitionProvider,
|
|
41
|
+
DeepgramModel,
|
|
42
|
+
Language
|
|
43
|
+
} from '@volley/recognition-client-sdk';
|
|
44
|
+
|
|
45
|
+
const client = new RealTimeTwoWayWebSocketRecognitionClient({
|
|
46
|
+
url: 'ws://localhost:3101/ws/v1/recognize',
|
|
47
|
+
asrRequestConfig: {
|
|
48
|
+
provider: RecognitionProvider.DEEPGRAM,
|
|
49
|
+
model: DeepgramModel.NOVA_2,
|
|
50
|
+
language: Language.ENGLISH_US
|
|
51
|
+
},
|
|
52
|
+
onTranscript: (result) => console.log(result),
|
|
53
|
+
onError: (error) => console.error(error)
|
|
54
|
+
});
|
|
55
|
+
```
|
|
56
|
+
|
|
36
57
|
## Configuration
|
|
37
58
|
|
|
38
59
|
### Basic Setup
|
|
39
60
|
|
|
40
61
|
```typescript
|
|
62
|
+
import { RecognitionProvider, DeepgramModel, Language } from '@volley/recognition-client-sdk';
|
|
63
|
+
|
|
41
64
|
builder
|
|
42
65
|
.url('ws://localhost:3101/ws/v1/recognize')
|
|
43
|
-
.provider(
|
|
44
|
-
.model(
|
|
45
|
-
.language(
|
|
46
|
-
.interimResults(true)
|
|
66
|
+
.provider(RecognitionProvider.DEEPGRAM) // DEEPGRAM, GOOGLE
|
|
67
|
+
.model(DeepgramModel.NOVA_2) // Provider-specific model enum
|
|
68
|
+
.language(Language.ENGLISH_US) // Language enum
|
|
69
|
+
.interimResults(true) // Enable partial transcripts
|
|
47
70
|
```
|
|
48
71
|
|
|
49
72
|
### Event Handlers
|
|
@@ -87,21 +110,53 @@ client.isConnected(); // Check connection status
|
|
|
87
110
|
|
|
88
111
|
```typescript
|
|
89
112
|
{
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
113
|
+
type: 'Transcription'; // Message type discriminator
|
|
114
|
+
audioUtteranceId: string; // Session UUID
|
|
115
|
+
finalTranscript: string; // Confirmed text (won't change)
|
|
116
|
+
finalTranscriptConfidence?: number; // Confidence 0-1 for final transcript
|
|
117
|
+
pendingTranscript?: string; // In-progress text (may change)
|
|
118
|
+
pendingTranscriptConfidence?: number; // Confidence 0-1 for pending transcript
|
|
119
|
+
is_finished: boolean; // Transcription complete (last message)
|
|
120
|
+
voiceStart?: number; // Voice activity start time (ms from stream start)
|
|
121
|
+
voiceDuration?: number; // Voice duration (ms)
|
|
122
|
+
voiceEnd?: number; // Voice activity end time (ms from stream start)
|
|
123
|
+
startTimestamp?: number; // Transcription start timestamp (ms)
|
|
124
|
+
endTimestamp?: number; // Transcription end timestamp (ms)
|
|
125
|
+
receivedAtMs?: number; // Server receive timestamp (ms since epoch)
|
|
126
|
+
accumulatedAudioTimeMs?: number; // Total audio duration sent (ms)
|
|
95
127
|
}
|
|
96
128
|
```
|
|
97
129
|
|
|
98
130
|
## Providers
|
|
99
131
|
|
|
100
132
|
### Deepgram
|
|
133
|
+
|
|
101
134
|
```typescript
|
|
102
|
-
|
|
135
|
+
import { RecognitionProvider, DeepgramModel } from '@volley/recognition-client-sdk';
|
|
136
|
+
|
|
137
|
+
builder
|
|
138
|
+
.provider(RecognitionProvider.DEEPGRAM)
|
|
139
|
+
.model(DeepgramModel.NOVA_2); // NOVA_2, NOVA_3, FLUX_GENERAL_EN
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Google Cloud Speech-to-Text
|
|
143
|
+
|
|
144
|
+
```typescript
|
|
145
|
+
import { RecognitionProvider, GoogleModel } from '@volley/recognition-client-sdk';
|
|
146
|
+
|
|
147
|
+
builder
|
|
148
|
+
.provider(RecognitionProvider.GOOGLE)
|
|
149
|
+
.model(GoogleModel.LATEST_SHORT); // LATEST_SHORT, LATEST_LONG, TELEPHONY, etc.
|
|
103
150
|
```
|
|
104
151
|
|
|
152
|
+
Available Google models:
|
|
153
|
+
- `LATEST_SHORT` - Optimized for short audio (< 1 minute)
|
|
154
|
+
- `LATEST_LONG` - Optimized for long audio (> 1 minute)
|
|
155
|
+
- `TELEPHONY` - Optimized for phone audio
|
|
156
|
+
- `TELEPHONY_SHORT` - Short telephony audio
|
|
157
|
+
- `MEDICAL_DICTATION` - Medical dictation (premium)
|
|
158
|
+
- `MEDICAL_CONVERSATION` - Medical conversations (premium)
|
|
159
|
+
|
|
105
160
|
|
|
106
161
|
## Audio Format
|
|
107
162
|
|
|
@@ -128,6 +183,42 @@ builder.onDisconnected((code, reason) => {
|
|
|
128
183
|
});
|
|
129
184
|
```
|
|
130
185
|
|
|
186
|
+
## Troubleshooting
|
|
187
|
+
|
|
188
|
+
### Connection Issues
|
|
189
|
+
|
|
190
|
+
**WebSocket fails to connect**
|
|
191
|
+
- Verify the recognition service is running
|
|
192
|
+
- Check the WebSocket URL format: `ws://` or `wss://`
|
|
193
|
+
- Ensure network allows WebSocket connections
|
|
194
|
+
|
|
195
|
+
**Authentication errors**
|
|
196
|
+
- Verify `audioUtteranceId` is provided
|
|
197
|
+
- Check if service requires additional auth headers
|
|
198
|
+
|
|
199
|
+
### Audio Issues
|
|
200
|
+
|
|
201
|
+
**No transcription results**
|
|
202
|
+
- Confirm audio format is PCM16, 16kHz, mono
|
|
203
|
+
- Check if audio chunks are being sent (use `onAudioSent` callback)
|
|
204
|
+
- Verify audio data is not empty or corrupted
|
|
205
|
+
|
|
206
|
+
**Poor transcription quality**
|
|
207
|
+
- Try different models (e.g., `NOVA_2` vs `NOVA_2_GENERAL`)
|
|
208
|
+
- Adjust language setting to match audio
|
|
209
|
+
- Ensure audio sample rate matches configuration
|
|
210
|
+
|
|
211
|
+
### Performance Issues
|
|
212
|
+
|
|
213
|
+
**High latency**
|
|
214
|
+
- Use smaller audio chunks (e.g., 100ms instead of 500ms)
|
|
215
|
+
- Choose a model optimized for real-time (e.g., Deepgram Nova 2)
|
|
216
|
+
- Check network latency to service
|
|
217
|
+
|
|
218
|
+
**Memory issues**
|
|
219
|
+
- Call `disconnect()` when done to clean up resources
|
|
220
|
+
- Avoid keeping multiple client instances active
|
|
221
|
+
|
|
131
222
|
## Publishing
|
|
132
223
|
|
|
133
224
|
This package uses automated publishing via semantic-release with npm Trusted Publishers (OIDC).
|
|
@@ -163,6 +254,15 @@ pnpm build
|
|
|
163
254
|
npm publish --provenance --access public
|
|
164
255
|
```
|
|
165
256
|
|
|
257
|
+
## Contributing
|
|
258
|
+
|
|
259
|
+
This SDK is part of the Recognition Service monorepo. To contribute:
|
|
260
|
+
|
|
261
|
+
1. Make changes to SDK or libs
|
|
262
|
+
2. Test locally with `pnpm test`
|
|
263
|
+
3. Create PR to `dev` branch with conventional commit messages (`feat:`, `fix:`, etc.)
|
|
264
|
+
4. After merge, automated workflow will publish new version to npm
|
|
265
|
+
|
|
166
266
|
## License
|
|
167
267
|
|
|
168
268
|
Proprietary
|
|
package/dist/{browser-CDQ_TzeH.d.ts → browser-C4ZssGoU.d.ts}
CHANGED
|
@@ -851,9 +851,9 @@ interface IRecognitionClient {
|
|
|
851
851
|
/**
|
|
852
852
|
* Send audio data to the recognition service
|
|
853
853
|
* Audio is buffered locally and sent when connection is ready.
|
|
854
|
-
* @param audioData - PCM audio data as ArrayBuffer
|
|
854
|
+
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
855
855
|
*/
|
|
856
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
|
|
856
|
+
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
857
857
|
/**
|
|
858
858
|
* Stop recording and wait for final transcript
|
|
859
859
|
* The server will close the connection after sending the final transcript.
|
|
@@ -1006,7 +1006,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1006
1006
|
*/
|
|
1007
1007
|
private cleanup;
|
|
1008
1008
|
connect(): Promise<void>;
|
|
1009
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
|
|
1009
|
+
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1010
|
+
private sendAudioInternal;
|
|
1010
1011
|
stopRecording(): Promise<void>;
|
|
1011
1012
|
getAudioUtteranceId(): string;
|
|
1012
1013
|
getState(): ClientState;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { A as ASRRequestConfig, G as GameContextV1, R as RecognitionCallbackUrl, T as TranscriptionResultV1, M as MetadataResultV1, E as ErrorResultV1, a as RealTimeTwoWayWebSocketRecognitionClientConfig, I as IRecognitionClient, C as ClientState, b as IRecognitionClientConfig } from './browser-CDQ_TzeH.js';
|
|
2
|
-
export { k as ASRRequestV1, f as AudioEncoding, h as ControlSignal, h as ControlSignalTypeV1, D as DeepgramModel, F as FunctionCallResultV1, m as GoogleModel, e as IRecognitionClientStats, L as Language, c as RealTimeTwoWayWebSocketRecognitionClient, g as RecognitionContextTypeV1, l as RecognitionProvider, j as RecognitionResultTypeV1, S as SampleRate, d as TranscriptionResult, i as isNormalDisconnection } from './browser-CDQ_TzeH.js';
|
|
1
|
+
import { A as ASRRequestConfig, G as GameContextV1, R as RecognitionCallbackUrl, T as TranscriptionResultV1, M as MetadataResultV1, E as ErrorResultV1, a as RealTimeTwoWayWebSocketRecognitionClientConfig, I as IRecognitionClient, C as ClientState, b as IRecognitionClientConfig } from './browser-C4ZssGoU.js';
|
|
2
|
+
export { k as ASRRequestV1, f as AudioEncoding, h as ControlSignal, h as ControlSignalTypeV1, D as DeepgramModel, F as FunctionCallResultV1, m as GoogleModel, e as IRecognitionClientStats, L as Language, c as RealTimeTwoWayWebSocketRecognitionClient, g as RecognitionContextTypeV1, l as RecognitionProvider, j as RecognitionResultTypeV1, S as SampleRate, d as TranscriptionResult, i as isNormalDisconnection } from './browser-C4ZssGoU.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
|
|
5
5
|
/**
|
|
@@ -169,9 +169,13 @@ declare class ConfigBuilder {
|
|
|
169
169
|
* ```typescript
|
|
170
170
|
* const client = createClient({
|
|
171
171
|
* url: 'ws://localhost:3101/ws/v1/recognize',
|
|
172
|
+
* audioUtteranceId: 'unique-id',
|
|
172
173
|
* onTranscript: (result) => console.log(result)
|
|
173
174
|
* });
|
|
174
175
|
* ```
|
|
176
|
+
*
|
|
177
|
+
* @param config - Client configuration
|
|
178
|
+
* @returns Configured recognition client instance
|
|
175
179
|
*/
|
|
176
180
|
declare function createClient(config: RealTimeTwoWayWebSocketRecognitionClientConfig): IRecognitionClient;
|
|
177
181
|
/**
|
|
@@ -308,9 +312,9 @@ interface ISimplifiedVGFRecognitionClient {
|
|
|
308
312
|
connect(): Promise<void>;
|
|
309
313
|
/**
|
|
310
314
|
* Send audio data for transcription
|
|
311
|
-
* @param audioData - PCM audio data as ArrayBuffer
|
|
315
|
+
* @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
|
|
312
316
|
*/
|
|
313
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
|
|
317
|
+
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
314
318
|
/**
|
|
315
319
|
* Stop recording and wait for final transcription
|
|
316
320
|
* @returns Promise that resolves when transcription is complete
|
|
@@ -361,7 +365,7 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
|
|
|
361
365
|
private stateChangeCallback;
|
|
362
366
|
constructor(config: SimplifiedVGFClientConfig);
|
|
363
367
|
connect(): Promise<void>;
|
|
364
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
|
|
368
|
+
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
365
369
|
stopRecording(): Promise<void>;
|
|
366
370
|
getAudioUtteranceId(): string;
|
|
367
371
|
getState(): ClientState;
|
package/dist/index.js
CHANGED
|
@@ -302,13 +302,7 @@ var TimerSchema = z.object({
|
|
|
302
302
|
* Total duration of all audio chunks sent to this provider session
|
|
303
303
|
* @example 2500 (2.5 seconds of audio has been sent)
|
|
304
304
|
*/
|
|
305
|
-
accumulatedAudioTimeMs: z.number().optional(),
|
|
306
|
-
/**
|
|
307
|
-
* Estimated cost in USD for this session
|
|
308
|
-
* Calculated by the job based on audio duration and provider pricing
|
|
309
|
-
* @example 0.0025 (quarter of a cent)
|
|
310
|
-
*/
|
|
311
|
-
costInUSD: z.number().optional().default(0)
|
|
305
|
+
accumulatedAudioTimeMs: z.number().optional()
|
|
312
306
|
});
|
|
313
307
|
var RawMessageSchema = z.object({
|
|
314
308
|
type: z.literal(ProviderMessageType.RAW),
|
|
@@ -1477,6 +1471,18 @@ function isNormalDisconnection(code) {
|
|
|
1477
1471
|
return code === 1e3;
|
|
1478
1472
|
}
|
|
1479
1473
|
__name(isNormalDisconnection, "isNormalDisconnection");
|
|
1474
|
+
async function blobToArrayBuffer(blob) {
|
|
1475
|
+
if (typeof blob.arrayBuffer === "function") {
|
|
1476
|
+
return await blob.arrayBuffer();
|
|
1477
|
+
}
|
|
1478
|
+
return new Promise((resolve, reject) => {
|
|
1479
|
+
const reader = new FileReader();
|
|
1480
|
+
reader.onload = () => resolve(reader.result);
|
|
1481
|
+
reader.onerror = () => reject(reader.error);
|
|
1482
|
+
reader.readAsArrayBuffer(blob);
|
|
1483
|
+
});
|
|
1484
|
+
}
|
|
1485
|
+
__name(blobToArrayBuffer, "blobToArrayBuffer");
|
|
1480
1486
|
var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient {
|
|
1481
1487
|
static {
|
|
1482
1488
|
__name(this, "RealTimeTwoWayWebSocketRecognitionClient");
|
|
@@ -1666,6 +1672,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
1666
1672
|
return this.connectionPromise;
|
|
1667
1673
|
}
|
|
1668
1674
|
sendAudio(audioData) {
|
|
1675
|
+
if (audioData instanceof Blob) {
|
|
1676
|
+
blobToArrayBuffer(audioData).then((arrayBuffer) => {
|
|
1677
|
+
this.sendAudioInternal(arrayBuffer);
|
|
1678
|
+
}).catch((error) => {
|
|
1679
|
+
this.log("error", "Failed to convert Blob to ArrayBuffer", error);
|
|
1680
|
+
});
|
|
1681
|
+
return;
|
|
1682
|
+
}
|
|
1683
|
+
this.sendAudioInternal(audioData);
|
|
1684
|
+
}
|
|
1685
|
+
sendAudioInternal(audioData) {
|
|
1669
1686
|
const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
1670
1687
|
if (bytes === 0) return;
|
|
1671
1688
|
this.audioBuffer.write(audioData);
|