@juspay/neurolink 9.61.2 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +373 -355
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +248 -12
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +248 -12
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +3 -1
package/dist/lib/types/tts.d.ts
CHANGED
|
@@ -6,9 +6,13 @@
|
|
|
6
6
|
* @module types/ttsTypes
|
|
7
7
|
*/
|
|
8
8
|
/**
|
|
9
|
-
* Supported audio formats for TTS output
|
|
9
|
+
* Supported audio formats for TTS output, STT input, and Realtime PCM streams.
|
|
10
|
+
*
|
|
11
|
+
* `pcm16` is included for the OpenAI Realtime PCM16 output stream — the chunk
|
|
12
|
+
* is raw PCM, not a RIFF/WAV-headered file. Consumers must not pass `pcm16`
|
|
13
|
+
* bytes to a WAV duration parser.
|
|
10
14
|
*/
|
|
11
|
-
export type
|
|
15
|
+
export type TTSAudioFormat = "mp3" | "wav" | "ogg" | "opus" | "m4a" | "flac" | "webm" | "mp4" | "mpeg" | "mpga" | "pcm16";
|
|
12
16
|
/**
|
|
13
17
|
* TTS quality settings
|
|
14
18
|
*/
|
|
@@ -51,7 +55,7 @@ export type TTSOptions = {
|
|
|
51
55
|
/** Voice identifier (e.g., "en-US-Neural2-C") */
|
|
52
56
|
voice?: string;
|
|
53
57
|
/** Audio format (default: mp3) */
|
|
54
|
-
format?:
|
|
58
|
+
format?: TTSAudioFormat;
|
|
55
59
|
/** Speaking rate 0.25-4.0 (default: 1.0) */
|
|
56
60
|
speed?: number;
|
|
57
61
|
/** Voice pitch adjustment -20.0 to 20.0 semitones (default: 0.0) */
|
|
@@ -64,6 +68,8 @@ export type TTSOptions = {
|
|
|
64
68
|
output?: string;
|
|
65
69
|
/** Auto-play audio after generation (default: false) */
|
|
66
70
|
play?: boolean;
|
|
71
|
+
/** Override TTS provider (e.g., "elevenlabs", "openai-tts", "azure-tts") */
|
|
72
|
+
provider?: string;
|
|
67
73
|
};
|
|
68
74
|
/**
|
|
69
75
|
* TTS audio result returned from generation
|
|
@@ -72,7 +78,7 @@ export type TTSResult = {
|
|
|
72
78
|
/** Audio data as Buffer */
|
|
73
79
|
buffer: Buffer;
|
|
74
80
|
/** Audio format */
|
|
75
|
-
format:
|
|
81
|
+
format: TTSAudioFormat;
|
|
76
82
|
/** Audio file size in bytes */
|
|
77
83
|
size: number;
|
|
78
84
|
/** Duration in seconds (if available) */
|
|
@@ -105,9 +111,15 @@ export type AudioSaveResult = {
|
|
|
105
111
|
error?: string;
|
|
106
112
|
};
|
|
107
113
|
/** Allowed TTS voice types */
|
|
108
|
-
export type
|
|
114
|
+
export type TTSVoiceType = "standard" | "wavenet" | "neural" | "chirp" | "unknown";
|
|
109
115
|
/** Allowed genders for TTS voices */
|
|
110
|
-
export type
|
|
116
|
+
export type TTSGender = "male" | "female" | "neutral";
|
|
117
|
+
/** @deprecated Use `TTSAudioFormat` instead. */
|
|
118
|
+
export type AudioFormat = TTSAudioFormat;
|
|
119
|
+
/** @deprecated Use `TTSVoiceType` instead. */
|
|
120
|
+
export type VoiceType = TTSVoiceType;
|
|
121
|
+
/** @deprecated Use `TTSGender` instead. */
|
|
122
|
+
export type Gender = TTSGender;
|
|
111
123
|
/**
|
|
112
124
|
* TTS voice information
|
|
113
125
|
*/
|
|
@@ -120,17 +132,17 @@ export type TTSVoice = {
|
|
|
120
132
|
languageCode: string;
|
|
121
133
|
/** All supported language codes */
|
|
122
134
|
languageCodes: string[];
|
|
123
|
-
/**
|
|
124
|
-
gender:
|
|
135
|
+
/** Voice gender */
|
|
136
|
+
gender: TTSGender;
|
|
125
137
|
/** Voice type */
|
|
126
|
-
type?:
|
|
138
|
+
type?: TTSVoiceType;
|
|
127
139
|
/** Voice description (optional) */
|
|
128
140
|
description?: string;
|
|
129
141
|
/** Natural sample rate in Hz (optional) */
|
|
130
142
|
naturalSampleRateHertz?: number;
|
|
131
143
|
};
|
|
132
144
|
/** Valid audio formats as an array for runtime validation */
|
|
133
|
-
export declare const VALID_AUDIO_FORMATS: readonly
|
|
145
|
+
export declare const VALID_AUDIO_FORMATS: readonly TTSAudioFormat[];
|
|
134
146
|
/** Valid TTS quality levels as an array for runtime validation */
|
|
135
147
|
export declare const VALID_TTS_QUALITIES: readonly TTSQuality[];
|
|
136
148
|
/** Valid Google TTS audio formats */
|
|
@@ -153,7 +165,7 @@ export type TTSChunk = {
|
|
|
153
165
|
/** Audio data chunk as Buffer */
|
|
154
166
|
data: Buffer;
|
|
155
167
|
/** Audio format of this chunk */
|
|
156
|
-
format:
|
|
168
|
+
format: TTSAudioFormat;
|
|
157
169
|
/** Chunk sequence number (0-indexed) */
|
|
158
170
|
index: number;
|
|
159
171
|
/** Whether this is the final audio chunk */
|
package/dist/lib/types/tts.js
CHANGED
|
@@ -11,6 +11,13 @@ export const VALID_AUDIO_FORMATS = [
|
|
|
11
11
|
"wav",
|
|
12
12
|
"ogg",
|
|
13
13
|
"opus",
|
|
14
|
+
"m4a",
|
|
15
|
+
"flac",
|
|
16
|
+
"webm",
|
|
17
|
+
"mp4",
|
|
18
|
+
"mpeg",
|
|
19
|
+
"mpga",
|
|
20
|
+
"pcm16",
|
|
14
21
|
];
|
|
15
22
|
/** Valid TTS quality levels as an array for runtime validation */
|
|
16
23
|
export const VALID_TTS_QUALITIES = ["standard", "hd"];
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Voice and Speech Type Definitions for NeuroLink
|
|
3
|
+
*
|
|
4
|
+
* Core voice types: capabilities, provider config, audio utilities,
|
|
5
|
+
* events, and provider abstractions.
|
|
6
|
+
*
|
|
7
|
+
* STT types are in ./stt.ts
|
|
8
|
+
* Realtime types are in ./realtime.ts
|
|
9
|
+
* TTS types are in ./tts.ts
|
|
10
|
+
*
|
|
11
|
+
* @module types/voice
|
|
12
|
+
*/
|
|
13
|
+
export * from "./tts.js";
|
|
14
|
+
export * from "./stt.js";
|
|
15
|
+
export * from "./realtime.js";
|
|
16
|
+
import type { TTSAudioFormat, TTSOptions, TTSResult, TTSVoice } from "./tts.js";
|
|
17
|
+
import type { TTSHandler } from "./common.js";
|
|
18
|
+
import type { STTResult, STTHandler } from "./stt.js";
|
|
19
|
+
import type { RealtimeHandler } from "./realtime.js";
|
|
20
|
+
/**
|
|
21
|
+
* Voice capability types supported by providers
|
|
22
|
+
*/
|
|
23
|
+
export type VoiceCapability = "tts" | "stt" | "realtime" | "streaming";
|
|
24
|
+
/**
|
|
25
|
+
* Voice provider types
|
|
26
|
+
*/
|
|
27
|
+
export type VoiceProviderType = "tts" | "stt" | "realtime";
|
|
28
|
+
/**
|
|
29
|
+
* Voice provider name union type
|
|
30
|
+
*/
|
|
31
|
+
export type VoiceProviderName = "google-tts" | "elevenlabs" | "openai-tts" | "azure-tts" | "sarvam" | "murf" | "playai" | "speechify" | "cartesia" | "deepgram" | "gladia" | "whisper" | "assemblyai" | "google-stt" | "azure-stt" | "openai-realtime" | "gemini-live";
|
|
32
|
+
/**
|
|
33
|
+
* Base voice provider configuration
|
|
34
|
+
*/
|
|
35
|
+
export type VoiceProviderConfig = {
|
|
36
|
+
/** Provider identifier */
|
|
37
|
+
name: string;
|
|
38
|
+
/** API key or credentials */
|
|
39
|
+
apiKey?: string;
|
|
40
|
+
/** Custom endpoint URL */
|
|
41
|
+
baseUrl?: string;
|
|
42
|
+
/** Request timeout in milliseconds */
|
|
43
|
+
timeout?: number;
|
|
44
|
+
/** Maximum retries for failed requests */
|
|
45
|
+
maxRetries?: number;
|
|
46
|
+
/** Provider-specific options */
|
|
47
|
+
options?: Record<string, unknown>;
|
|
48
|
+
};
|
|
49
|
+
/**
|
|
50
|
+
* Audio format details
|
|
51
|
+
*/
|
|
52
|
+
export type AudioFormatDetails = {
|
|
53
|
+
/** Format name */
|
|
54
|
+
format: TTSAudioFormat;
|
|
55
|
+
/** MIME type */
|
|
56
|
+
mimeType: string;
|
|
57
|
+
/** File extension */
|
|
58
|
+
extension: string;
|
|
59
|
+
/** Whether format supports streaming */
|
|
60
|
+
supportsStreaming: boolean;
|
|
61
|
+
/** Typical sample rates */
|
|
62
|
+
sampleRates: number[];
|
|
63
|
+
/** Bit depths */
|
|
64
|
+
bitDepths: number[];
|
|
65
|
+
};
|
|
66
|
+
/**
|
|
67
|
+
* Audio conversion options
|
|
68
|
+
*/
|
|
69
|
+
export type AudioConversionOptions = {
|
|
70
|
+
/** Target format */
|
|
71
|
+
targetFormat: TTSAudioFormat;
|
|
72
|
+
/** Target sample rate */
|
|
73
|
+
sampleRate?: number;
|
|
74
|
+
/** Target bit depth */
|
|
75
|
+
bitDepth?: number;
|
|
76
|
+
/** Number of channels */
|
|
77
|
+
channels?: number;
|
|
78
|
+
/** Normalize audio level */
|
|
79
|
+
normalize?: boolean;
|
|
80
|
+
};
|
|
81
|
+
/**
|
|
82
|
+
* Audio stream chunk for streaming operations
|
|
83
|
+
*/
|
|
84
|
+
export type AudioStreamChunk = {
|
|
85
|
+
/** Audio data */
|
|
86
|
+
data: Buffer;
|
|
87
|
+
/** Chunk index */
|
|
88
|
+
index: number;
|
|
89
|
+
/** Whether this is the final chunk */
|
|
90
|
+
isFinal: boolean;
|
|
91
|
+
/** Audio format */
|
|
92
|
+
format: TTSAudioFormat;
|
|
93
|
+
/** Sample rate */
|
|
94
|
+
sampleRate: number;
|
|
95
|
+
/** Timestamp offset in milliseconds */
|
|
96
|
+
timestampMs: number;
|
|
97
|
+
/** Duration of this chunk in milliseconds */
|
|
98
|
+
durationMs: number;
|
|
99
|
+
};
|
|
100
|
+
/**
|
|
101
|
+
* Voice event types for event-driven architectures
|
|
102
|
+
*/
|
|
103
|
+
export type VoiceEventType = "synthesis.started" | "synthesis.progress" | "synthesis.completed" | "synthesis.error" | "transcription.started" | "transcription.partial" | "transcription.completed" | "transcription.error" | "realtime.connected" | "realtime.audio.received" | "realtime.text.received" | "realtime.disconnected" | "realtime.error";
|
|
104
|
+
/**
|
|
105
|
+
* Voice event for event-driven operations
|
|
106
|
+
*/
|
|
107
|
+
export type VoiceEvent<T = unknown> = {
|
|
108
|
+
type: VoiceEventType;
|
|
109
|
+
timestamp: Date;
|
|
110
|
+
provider: VoiceProviderName;
|
|
111
|
+
data: T;
|
|
112
|
+
metadata?: Record<string, unknown>;
|
|
113
|
+
};
|
|
114
|
+
/**
|
|
115
|
+
* Voice operation result union
|
|
116
|
+
*/
|
|
117
|
+
export type VoiceResult = TTSResult | STTResult;
|
|
118
|
+
/**
|
|
119
|
+
* Voice conversation turn
|
|
120
|
+
*/
|
|
121
|
+
export type VoiceTurn = {
|
|
122
|
+
role: "user" | "assistant";
|
|
123
|
+
text: string;
|
|
124
|
+
audio?: Buffer;
|
|
125
|
+
timestamp: Date;
|
|
126
|
+
metadata?: {
|
|
127
|
+
duration?: number;
|
|
128
|
+
confidence?: number;
|
|
129
|
+
language?: string;
|
|
130
|
+
provider?: string;
|
|
131
|
+
voice?: string;
|
|
132
|
+
[key: string]: unknown;
|
|
133
|
+
};
|
|
134
|
+
};
|
|
135
|
+
/**
|
|
136
|
+
* TTS-capable voice provider type
|
|
137
|
+
*/
|
|
138
|
+
export type TTSProvider = {
|
|
139
|
+
/**
|
|
140
|
+
* Synthesize text to speech
|
|
141
|
+
*/
|
|
142
|
+
synthesize(text: string, options: TTSOptions): Promise<TTSResult>;
|
|
143
|
+
/**
|
|
144
|
+
* Stream synthesized audio chunks
|
|
145
|
+
*/
|
|
146
|
+
synthesizeStream?(text: string, options: TTSOptions): AsyncIterable<TTSStreamChunk>;
|
|
147
|
+
/**
|
|
148
|
+
* Get available voices
|
|
149
|
+
*/
|
|
150
|
+
getVoices(languageCode?: string): Promise<TTSVoice[]>;
|
|
151
|
+
/**
|
|
152
|
+
* Maximum text length supported
|
|
153
|
+
*/
|
|
154
|
+
readonly maxTextLength: number;
|
|
155
|
+
};
|
|
156
|
+
/**
|
|
157
|
+
* TTS stream chunk for streaming synthesis
|
|
158
|
+
*/
|
|
159
|
+
export type TTSStreamChunk = {
|
|
160
|
+
/** Audio data chunk */
|
|
161
|
+
data: Buffer;
|
|
162
|
+
/** Chunk sequence number */
|
|
163
|
+
index: number;
|
|
164
|
+
/** Whether this is the final chunk */
|
|
165
|
+
isFinal: boolean;
|
|
166
|
+
/** Audio format */
|
|
167
|
+
format: string;
|
|
168
|
+
/** Sample rate */
|
|
169
|
+
sampleRate?: number;
|
|
170
|
+
/** Timestamp offset in audio (milliseconds) */
|
|
171
|
+
timestampMs?: number;
|
|
172
|
+
};
|
|
173
|
+
/**
|
|
174
|
+
* Voice error codes (general)
|
|
175
|
+
*/
|
|
176
|
+
export declare const VOICE_ERROR_CODES: {
|
|
177
|
+
readonly PROVIDER_NOT_FOUND: "VOICE_PROVIDER_NOT_FOUND";
|
|
178
|
+
readonly INVALID_CONFIGURATION: "VOICE_INVALID_CONFIGURATION";
|
|
179
|
+
readonly INITIALIZATION_FAILED: "VOICE_INITIALIZATION_FAILED";
|
|
180
|
+
readonly OPERATION_CANCELLED: "VOICE_OPERATION_CANCELLED";
|
|
181
|
+
readonly PROVIDER_NOT_CONFIGURED: "VOICE_PROVIDER_NOT_CONFIGURED";
|
|
182
|
+
readonly PROVIDER_NOT_SUPPORTED: "VOICE_PROVIDER_NOT_SUPPORTED";
|
|
183
|
+
readonly FEATURE_NOT_SUPPORTED: "VOICE_FEATURE_NOT_SUPPORTED";
|
|
184
|
+
readonly TTS_EMPTY_TEXT: "VOICE_TTS_EMPTY_TEXT";
|
|
185
|
+
readonly TTS_TEXT_TOO_LONG: "VOICE_TTS_TEXT_TOO_LONG";
|
|
186
|
+
readonly TTS_SYNTHESIS_FAILED: "VOICE_TTS_SYNTHESIS_FAILED";
|
|
187
|
+
readonly STT_EMPTY_AUDIO: "VOICE_STT_EMPTY_AUDIO";
|
|
188
|
+
readonly STT_INVALID_FORMAT: "VOICE_STT_INVALID_FORMAT";
|
|
189
|
+
readonly STT_TRANSCRIPTION_FAILED: "VOICE_STT_TRANSCRIPTION_FAILED";
|
|
190
|
+
readonly REALTIME_CONNECTION_FAILED: "VOICE_REALTIME_CONNECTION_FAILED";
|
|
191
|
+
readonly REALTIME_SESSION_ERROR: "VOICE_REALTIME_SESSION_ERROR";
|
|
192
|
+
readonly NETWORK_ERROR: "VOICE_NETWORK_ERROR";
|
|
193
|
+
readonly TIMEOUT: "VOICE_TIMEOUT";
|
|
194
|
+
};
|
|
195
|
+
/**
|
|
196
|
+
* Supported audio formats with details
|
|
197
|
+
*/
|
|
198
|
+
export declare const AUDIO_FORMAT_DETAILS: Partial<Record<TTSAudioFormat, AudioFormatDetails>>;
|
|
199
|
+
import type { ErrorCategory, ErrorSeverity } from "../constants/enums.js";
|
|
200
|
+
export type VoiceErrorOptions = {
|
|
201
|
+
code: string;
|
|
202
|
+
message: string;
|
|
203
|
+
category?: ErrorCategory;
|
|
204
|
+
severity?: ErrorSeverity;
|
|
205
|
+
retriable?: boolean;
|
|
206
|
+
context?: Record<string, unknown>;
|
|
207
|
+
originalError?: Error;
|
|
208
|
+
provider?: string;
|
|
209
|
+
};
|
|
210
|
+
export type AudioMetadata = {
|
|
211
|
+
format: TTSAudioFormat;
|
|
212
|
+
duration: number;
|
|
213
|
+
sampleRate: number;
|
|
214
|
+
channels: number;
|
|
215
|
+
bitDepth: number;
|
|
216
|
+
samples: number;
|
|
217
|
+
size: number;
|
|
218
|
+
};
|
|
219
|
+
export type StreamHandlerConfig = {
|
|
220
|
+
chunkDurationMs?: number;
|
|
221
|
+
sampleRate?: number;
|
|
222
|
+
bytesPerSample?: number;
|
|
223
|
+
format?: TTSAudioFormat;
|
|
224
|
+
highWaterMark?: number;
|
|
225
|
+
bufferTimeoutMs?: number;
|
|
226
|
+
};
|
|
227
|
+
export type StreamEvents = {
|
|
228
|
+
chunk: (chunk: AudioStreamChunk) => void;
|
|
229
|
+
end: () => void;
|
|
230
|
+
error: (error: Error) => void;
|
|
231
|
+
drain: () => void;
|
|
232
|
+
pause: () => void;
|
|
233
|
+
resume: () => void;
|
|
234
|
+
};
|
|
235
|
+
export type VoiceHandler = TTSHandler | STTHandler | RealtimeHandler;
|
|
236
|
+
export type AzureTTSOptions = TTSOptions & {
|
|
237
|
+
useSSML?: boolean;
|
|
238
|
+
ssmlTemplate?: string;
|
|
239
|
+
outputFormat?: string;
|
|
240
|
+
wordBoundary?: boolean;
|
|
241
|
+
/**
|
|
242
|
+
* Pass `text` through as raw SSML when it begins with `<speak`.
|
|
243
|
+
*
|
|
244
|
+
* **Security:** raw SSML can change voice, embed external content, or
|
|
245
|
+
* inject markup. Only enable when `text` originates from a TRUSTED source
|
|
246
|
+
* (your own server-built template, not end-user input). When this flag
|
|
247
|
+
* is false (default), all input — including text starting with `<speak`
|
|
248
|
+
* — is XML-escaped, preventing SSML injection.
|
|
249
|
+
*
|
|
250
|
+
* @default false
|
|
251
|
+
*/
|
|
252
|
+
allowRawSSML?: boolean;
|
|
253
|
+
};
|
|
254
|
+
export type ElevenLabsModel = "eleven_multilingual_v2" | "eleven_turbo_v2_5" | "eleven_turbo_v2" | "eleven_monolingual_v1";
|
|
255
|
+
export type ElevenLabsTTSOptions = TTSOptions & {
|
|
256
|
+
model?: ElevenLabsModel;
|
|
257
|
+
stability?: number;
|
|
258
|
+
similarityBoost?: number;
|
|
259
|
+
style?: number;
|
|
260
|
+
useSpeakerBoost?: boolean;
|
|
261
|
+
};
|
|
262
|
+
export type GoogleVoiceType = "Standard" | "WaveNet" | "Neural2" | "Studio" | "Polyglot";
|
|
263
|
+
export type GoogleTTSOptions = TTSOptions & {
|
|
264
|
+
voiceType?: GoogleVoiceType;
|
|
265
|
+
sampleRateHertz?: number;
|
|
266
|
+
effectsProfileId?: string[];
|
|
267
|
+
};
|
|
268
|
+
export type OpenAIVoice = "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer";
|
|
269
|
+
export type OpenAITTSModel = "tts-1" | "tts-1-hd";
|
|
270
|
+
export type OpenAITTSOptions = TTSOptions & {
|
|
271
|
+
model?: OpenAITTSModel;
|
|
272
|
+
};
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Voice and Speech Type Definitions for NeuroLink
|
|
3
|
+
*
|
|
4
|
+
* Core voice types: capabilities, provider config, audio utilities,
|
|
5
|
+
* events, and provider abstractions.
|
|
6
|
+
*
|
|
7
|
+
* STT types are in ./stt.ts
|
|
8
|
+
* Realtime types are in ./realtime.ts
|
|
9
|
+
* TTS types are in ./tts.ts
|
|
10
|
+
*
|
|
11
|
+
* @module types/voice
|
|
12
|
+
*/
|
|
13
|
+
// Re-export all TTS types
|
|
14
|
+
export * from "./tts.js";
|
|
15
|
+
// Re-export all STT types
|
|
16
|
+
export * from "./stt.js";
|
|
17
|
+
// Re-export all Realtime types
|
|
18
|
+
export * from "./realtime.js";
|
|
19
|
+
// ============================================================================
|
|
20
|
+
// ERROR CODES
|
|
21
|
+
// ============================================================================
|
|
22
|
+
/**
|
|
23
|
+
* Voice error codes (general)
|
|
24
|
+
*/
|
|
25
|
+
export const VOICE_ERROR_CODES = {
|
|
26
|
+
PROVIDER_NOT_FOUND: "VOICE_PROVIDER_NOT_FOUND",
|
|
27
|
+
INVALID_CONFIGURATION: "VOICE_INVALID_CONFIGURATION",
|
|
28
|
+
INITIALIZATION_FAILED: "VOICE_INITIALIZATION_FAILED",
|
|
29
|
+
OPERATION_CANCELLED: "VOICE_OPERATION_CANCELLED",
|
|
30
|
+
// General
|
|
31
|
+
PROVIDER_NOT_CONFIGURED: "VOICE_PROVIDER_NOT_CONFIGURED",
|
|
32
|
+
PROVIDER_NOT_SUPPORTED: "VOICE_PROVIDER_NOT_SUPPORTED",
|
|
33
|
+
FEATURE_NOT_SUPPORTED: "VOICE_FEATURE_NOT_SUPPORTED",
|
|
34
|
+
// TTS specific
|
|
35
|
+
TTS_EMPTY_TEXT: "VOICE_TTS_EMPTY_TEXT",
|
|
36
|
+
TTS_TEXT_TOO_LONG: "VOICE_TTS_TEXT_TOO_LONG",
|
|
37
|
+
TTS_SYNTHESIS_FAILED: "VOICE_TTS_SYNTHESIS_FAILED",
|
|
38
|
+
// STT specific
|
|
39
|
+
STT_EMPTY_AUDIO: "VOICE_STT_EMPTY_AUDIO",
|
|
40
|
+
STT_INVALID_FORMAT: "VOICE_STT_INVALID_FORMAT",
|
|
41
|
+
STT_TRANSCRIPTION_FAILED: "VOICE_STT_TRANSCRIPTION_FAILED",
|
|
42
|
+
// Realtime specific
|
|
43
|
+
REALTIME_CONNECTION_FAILED: "VOICE_REALTIME_CONNECTION_FAILED",
|
|
44
|
+
REALTIME_SESSION_ERROR: "VOICE_REALTIME_SESSION_ERROR",
|
|
45
|
+
// Network
|
|
46
|
+
NETWORK_ERROR: "VOICE_NETWORK_ERROR",
|
|
47
|
+
TIMEOUT: "VOICE_TIMEOUT",
|
|
48
|
+
};
|
|
49
|
+
// ============================================================================
|
|
50
|
+
// CONSTANTS
|
|
51
|
+
// ============================================================================
|
|
52
|
+
/**
|
|
53
|
+
* Supported audio formats with details
|
|
54
|
+
*/
|
|
55
|
+
export const AUDIO_FORMAT_DETAILS = {
|
|
56
|
+
mp3: {
|
|
57
|
+
format: "mp3",
|
|
58
|
+
mimeType: "audio/mpeg",
|
|
59
|
+
extension: ".mp3",
|
|
60
|
+
supportsStreaming: true,
|
|
61
|
+
sampleRates: [8000, 16000, 22050, 24000, 44100, 48000],
|
|
62
|
+
bitDepths: [16],
|
|
63
|
+
},
|
|
64
|
+
wav: {
|
|
65
|
+
format: "wav",
|
|
66
|
+
mimeType: "audio/wav",
|
|
67
|
+
extension: ".wav",
|
|
68
|
+
supportsStreaming: false,
|
|
69
|
+
sampleRates: [8000, 16000, 22050, 24000, 44100, 48000],
|
|
70
|
+
bitDepths: [8, 16, 24, 32],
|
|
71
|
+
},
|
|
72
|
+
ogg: {
|
|
73
|
+
format: "ogg",
|
|
74
|
+
mimeType: "audio/ogg",
|
|
75
|
+
extension: ".ogg",
|
|
76
|
+
supportsStreaming: true,
|
|
77
|
+
sampleRates: [8000, 16000, 22050, 24000, 44100, 48000],
|
|
78
|
+
bitDepths: [16],
|
|
79
|
+
},
|
|
80
|
+
opus: {
|
|
81
|
+
format: "opus",
|
|
82
|
+
mimeType: "audio/opus",
|
|
83
|
+
extension: ".opus",
|
|
84
|
+
supportsStreaming: true,
|
|
85
|
+
sampleRates: [8000, 12000, 16000, 24000, 48000],
|
|
86
|
+
bitDepths: [16],
|
|
87
|
+
},
|
|
88
|
+
m4a: {
|
|
89
|
+
format: "m4a",
|
|
90
|
+
mimeType: "audio/mp4",
|
|
91
|
+
extension: ".m4a",
|
|
92
|
+
supportsStreaming: false,
|
|
93
|
+
sampleRates: [44100, 48000],
|
|
94
|
+
bitDepths: [16],
|
|
95
|
+
},
|
|
96
|
+
flac: {
|
|
97
|
+
format: "flac",
|
|
98
|
+
mimeType: "audio/flac",
|
|
99
|
+
extension: ".flac",
|
|
100
|
+
supportsStreaming: false,
|
|
101
|
+
sampleRates: [44100, 48000, 96000],
|
|
102
|
+
bitDepths: [16, 24],
|
|
103
|
+
},
|
|
104
|
+
webm: {
|
|
105
|
+
format: "webm",
|
|
106
|
+
mimeType: "audio/webm",
|
|
107
|
+
extension: ".webm",
|
|
108
|
+
supportsStreaming: true,
|
|
109
|
+
sampleRates: [44100, 48000],
|
|
110
|
+
bitDepths: [16],
|
|
111
|
+
},
|
|
112
|
+
mp4: {
|
|
113
|
+
format: "mp4",
|
|
114
|
+
mimeType: "audio/mp4",
|
|
115
|
+
extension: ".mp4",
|
|
116
|
+
supportsStreaming: false,
|
|
117
|
+
sampleRates: [44100, 48000],
|
|
118
|
+
bitDepths: [16],
|
|
119
|
+
},
|
|
120
|
+
mpeg: {
|
|
121
|
+
format: "mpeg",
|
|
122
|
+
mimeType: "audio/mpeg",
|
|
123
|
+
extension: ".mpeg",
|
|
124
|
+
supportsStreaming: true,
|
|
125
|
+
sampleRates: [8000, 16000, 22050, 24000, 44100, 48000],
|
|
126
|
+
bitDepths: [16],
|
|
127
|
+
},
|
|
128
|
+
mpga: {
|
|
129
|
+
format: "mpga",
|
|
130
|
+
mimeType: "audio/mpeg",
|
|
131
|
+
extension: ".mpga",
|
|
132
|
+
supportsStreaming: true,
|
|
133
|
+
sampleRates: [8000, 16000, 22050, 24000, 44100, 48000],
|
|
134
|
+
bitDepths: [16],
|
|
135
|
+
},
|
|
136
|
+
};
|
|
137
|
+
//# sourceMappingURL=voice.js.map
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared utility: infer a `TTSAudioFormat` from a file path.
|
|
3
|
+
*
|
|
4
|
+
* Used by the CLI generate/stream handlers (m2) to set `stt.format` so
|
|
5
|
+
* `STTProcessor.transcribe()` can fail fast on incompatible provider/format
|
|
6
|
+
* combinations (e.g. MP3 to azure-stt). Pulled into a single helper to
|
|
7
|
+
* avoid duplicating the 11-element format list across two CLI handlers.
|
|
8
|
+
*/
|
|
9
|
+
import type { TTSAudioFormat } from "../types/index.js";
|
|
10
|
+
/**
|
|
11
|
+
* Returns the `TTSAudioFormat` that matches the file extension of `path`,
|
|
12
|
+
* or `undefined` when the path is missing or its extension isn't a known
|
|
13
|
+
* audio format. The check is case-insensitive.
|
|
14
|
+
*/
|
|
15
|
+
export declare function inferAudioFormatFromPath(path: string | undefined): TTSAudioFormat | undefined;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared utility: infer a `TTSAudioFormat` from a file path.
|
|
3
|
+
*
|
|
4
|
+
* Used by the CLI generate/stream handlers (m2) to set `stt.format` so
|
|
5
|
+
* `STTProcessor.transcribe()` can fail fast on incompatible provider/format
|
|
6
|
+
* combinations (e.g. MP3 to azure-stt). Pulled into a single helper to
|
|
7
|
+
* avoid duplicating the 11-element format list across two CLI handlers.
|
|
8
|
+
*/
|
|
9
|
+
/**
 * File extensions (lower-case, without the leading dot) that are accepted
 * as audio format names by `inferAudioFormatFromPath`.
 */
const VALID_FORMATS = [
    "mp3",
    "wav",
    "ogg",
    "opus",
    "m4a",
    "flac",
    "webm",
    "mp4",
    "mpeg",
    "mpga",
    "pcm16",
];
/**
 * Returns the audio format that matches the file extension of `path`,
 * or `undefined` when the path is missing/empty, has no extension, or its
 * extension isn't one of the known audio formats. The check is
 * case-insensitive.
 *
 * The extension is the text after the LAST `.` in `path`. A path that
 * contains no `.` at all has no extension, so an extensionless file that
 * happens to be named `wav` or `mp3` is NOT reported as that format.
 *
 * @param path - File path (or file name) to inspect; may be `undefined`.
 * @returns The lower-cased format name, or `undefined` if none can be inferred.
 */
export function inferAudioFormatFromPath(path) {
    if (!path) {
        return undefined;
    }
    const lastDot = path.lastIndexOf(".");
    if (lastDot === -1) {
        // No dot anywhere: the whole string is a bare name, not an extension.
        return undefined;
    }
    const ext = path.slice(lastDot + 1).toLowerCase();
    // An empty suffix (path ends with ".") is rejected by the truthiness check.
    return ext && VALID_FORMATS.includes(ext) ? ext : undefined;
}
|
|
34
|
+
//# sourceMappingURL=audioFormatDetector.js.map
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speech-to-Text (STT) Processing Utility
|
|
3
|
+
*
|
|
4
|
+
* Central orchestrator for all STT operations across providers.
|
|
5
|
+
* Manages provider-specific STT handlers and audio transcription.
|
|
6
|
+
*
|
|
7
|
+
* @module utils/sttProcessor
|
|
8
|
+
*/
|
|
9
|
+
import type { STTOptions, STTResult, STTHandler } from "../types/index.js";
|
|
10
|
+
/**
|
|
11
|
+
* STT processor class for orchestrating speech-to-text operations
|
|
12
|
+
*
|
|
13
|
+
* Follows the same pattern as TTSProcessor, CSVProcessor, ImageProcessor, and PDFProcessor.
|
|
14
|
+
* Provides a unified interface for STT transcription across multiple providers.
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```typescript
|
|
18
|
+
* // Register a handler
|
|
19
|
+
* STTProcessor.registerHandler('whisper', whisperHandler);
|
|
20
|
+
*
|
|
21
|
+
* // Check if provider is supported
|
|
22
|
+
* if (STTProcessor.supports('whisper')) {
|
|
23
|
+
* // Provider is registered
|
|
24
|
+
* }
|
|
25
|
+
* ```
|
|
26
|
+
*/
|
|
27
|
+
export declare class STTProcessor {
|
|
28
|
+
/**
|
|
29
|
+
* Handler registry mapping provider names to STT handlers
|
|
30
|
+
* Uses Map for O(1) lookups and better type safety
|
|
31
|
+
*
|
|
32
|
+
* @private
|
|
33
|
+
*/
|
|
34
|
+
private static readonly handlers;
|
|
35
|
+
/**
|
|
36
|
+
* Default maximum audio duration for STT transcription (in seconds)
|
|
37
|
+
*
|
|
38
|
+
* Providers can override this value by specifying the `maxAudioDuration` property
|
|
39
|
+
* in their respective `STTHandler` implementation. If not specified, this default
|
|
40
|
+
* value will be used (5 minutes).
|
|
41
|
+
*
|
|
42
|
+
* @private
|
|
43
|
+
*/
|
|
44
|
+
private static readonly DEFAULT_MAX_AUDIO_DURATION;
|
|
45
|
+
/**
|
|
46
|
+
* Register an STT handler for a specific provider
|
|
47
|
+
*
|
|
48
|
+
* Allows providers to register their STT implementation at runtime.
|
|
49
|
+
*
|
|
50
|
+
* @param providerName - Provider identifier (e.g., 'whisper', 'deepgram')
|
|
51
|
+
* @param handler - STT handler implementation
|
|
52
|
+
*
|
|
53
|
+
* @example
|
|
54
|
+
* ```typescript
|
|
55
|
+
* const whisperHandler: STTHandler = {
|
|
56
|
+
* transcribe: async (audio, options) => { ... },
|
|
57
|
+
* getSupportedFormats: () => ["mp3", "wav"],
|
|
58
|
+
* isConfigured: () => true
|
|
59
|
+
* };
|
|
60
|
+
*
|
|
61
|
+
* STTProcessor.registerHandler('whisper', whisperHandler);
|
|
62
|
+
* ```
|
|
63
|
+
*/
|
|
64
|
+
static registerHandler(providerName: string, handler: STTHandler): void;
|
|
65
|
+
/**
|
|
66
|
+
* Get a registered STT handler by provider name
|
|
67
|
+
*
|
|
68
|
+
* @private
|
|
69
|
+
* @param providerName - Provider identifier
|
|
70
|
+
* @returns Handler instance or undefined if not registered
|
|
71
|
+
*/
|
|
72
|
+
private static getHandler;
|
|
73
|
+
/**
|
|
74
|
+
* Check if a provider is supported (has a registered STT handler)
|
|
75
|
+
*
|
|
76
|
+
* @param providerName - Provider identifier
|
|
77
|
+
* @returns True if handler is registered
|
|
78
|
+
*
|
|
79
|
+
* @example
|
|
80
|
+
* ```typescript
|
|
81
|
+
* if (STTProcessor.supports('whisper')) {
|
|
82
|
+
* console.log('Whisper STT is supported');
|
|
83
|
+
* }
|
|
84
|
+
* ```
|
|
85
|
+
*/
|
|
86
|
+
static supports(providerName: string): boolean;
|
|
87
|
+
/**
|
|
88
|
+
* Transcribe audio to text using a registered STT provider
|
|
89
|
+
*
|
|
90
|
+
* Orchestrates the speech-to-text transcription process:
|
|
91
|
+
* 1. Validates audio input (non-empty)
|
|
92
|
+
* 2. Looks up the provider handler
|
|
93
|
+
* 3. Verifies provider configuration
|
|
94
|
+
* 4. Delegates transcription to the provider
|
|
95
|
+
* 5. Enriches result with provider metadata
|
|
96
|
+
*
|
|
97
|
+
* @param audio - Audio data as Buffer or ArrayBuffer
|
|
98
|
+
* @param provider - Provider identifier
|
|
99
|
+
* @param options - STT configuration options
|
|
100
|
+
* @returns Transcription result with text and metadata
|
|
101
|
+
* @throws STTError if validation fails or provider not supported/configured
|
|
102
|
+
*
|
|
103
|
+
* @example
|
|
104
|
+
* ```typescript
|
|
105
|
+
* const result = await STTProcessor.transcribe(audioBuffer, "whisper", {
|
|
106
|
+
* language: "en-US",
|
|
107
|
+
* punctuation: true,
|
|
108
|
+
* });
|
|
109
|
+
*
|
|
110
|
+
* console.log(`Transcription: ${result.text}`);
|
|
111
|
+
* console.log(`Confidence: ${result.confidence}`);
|
|
112
|
+
* ```
|
|
113
|
+
*/
|
|
114
|
+
static transcribe(audio: Buffer | ArrayBuffer, provider: string, options: STTOptions): Promise<STTResult>;
|
|
115
|
+
}
|