@framers/agentos 0.1.110 → 0.1.112
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/agency.d.ts.map +1 -1
- package/dist/api/agency.js +38 -2
- package/dist/api/agency.js.map +1 -1
- package/dist/api/agent.js +1 -1
- package/dist/api/agent.js.map +1 -1
- package/dist/api/strategies/debate.d.ts.map +1 -1
- package/dist/api/strategies/debate.js.map +1 -1
- package/dist/api/strategies/graph.d.ts.map +1 -1
- package/dist/api/strategies/graph.js +1 -2
- package/dist/api/strategies/graph.js.map +1 -1
- package/dist/api/strategies/hierarchical.d.ts.map +1 -1
- package/dist/api/strategies/hierarchical.js +1 -2
- package/dist/api/strategies/hierarchical.js.map +1 -1
- package/dist/api/strategies/index.d.ts +1 -9
- package/dist/api/strategies/index.d.ts.map +1 -1
- package/dist/api/strategies/index.js +1 -11
- package/dist/api/strategies/index.js.map +1 -1
- package/dist/api/strategies/parallel.d.ts.map +1 -1
- package/dist/api/strategies/parallel.js +23 -4
- package/dist/api/strategies/parallel.js.map +1 -1
- package/dist/api/strategies/review-loop.d.ts.map +1 -1
- package/dist/api/strategies/review-loop.js.map +1 -1
- package/dist/api/strategies/sequential.d.ts.map +1 -1
- package/dist/api/strategies/sequential.js +1 -2
- package/dist/api/strategies/sequential.js.map +1 -1
- package/dist/api/strategies/shared.d.ts +8 -0
- package/dist/api/strategies/shared.d.ts.map +1 -1
- package/dist/api/strategies/shared.js +10 -1
- package/dist/api/strategies/shared.js.map +1 -1
- package/dist/api/types.d.ts +6 -0
- package/dist/api/types.d.ts.map +1 -1
- package/dist/api/types.js.map +1 -1
- package/dist/memory/AgentMemory.d.ts +2 -1
- package/dist/memory/AgentMemory.d.ts.map +1 -1
- package/dist/memory/AgentMemory.js +1 -1
- package/dist/memory/AgentMemory.js.map +1 -1
- package/dist/memory/CognitiveMemoryManager.d.ts.map +1 -1
- package/dist/memory/CognitiveMemoryManager.js +7 -2
- package/dist/memory/CognitiveMemoryManager.js.map +1 -1
- package/dist/memory/facade/Memory.d.ts.map +1 -1
- package/dist/memory/facade/Memory.js +6 -9
- package/dist/memory/facade/Memory.js.map +1 -1
- package/dist/memory/store/MemoryStore.d.ts +9 -0
- package/dist/memory/store/MemoryStore.d.ts.map +1 -1
- package/dist/memory/store/MemoryStore.js +66 -6
- package/dist/memory/store/MemoryStore.js.map +1 -1
- package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -1
- package/dist/memory/store/SqliteMemoryGraph.js +27 -13
- package/dist/memory/store/SqliteMemoryGraph.js.map +1 -1
- package/dist/speech/FallbackProxy.d.ts +194 -41
- package/dist/speech/FallbackProxy.d.ts.map +1 -1
- package/dist/speech/FallbackProxy.js +155 -32
- package/dist/speech/FallbackProxy.js.map +1 -1
- package/dist/speech/SpeechProviderResolver.d.ts +278 -36
- package/dist/speech/SpeechProviderResolver.d.ts.map +1 -1
- package/dist/speech/SpeechProviderResolver.js +306 -40
- package/dist/speech/SpeechProviderResolver.js.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts +119 -19
- package/dist/speech/providers/AssemblyAISTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/AssemblyAISTTProvider.js +153 -25
- package/dist/speech/providers/AssemblyAISTTProvider.js.map +1 -1
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts +121 -17
- package/dist/speech/providers/AzureSpeechSTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/AzureSpeechSTTProvider.js +122 -14
- package/dist/speech/providers/AzureSpeechSTTProvider.js.map +1 -1
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +130 -15
- package/dist/speech/providers/AzureSpeechTTSProvider.d.ts.map +1 -1
- package/dist/speech/providers/AzureSpeechTTSProvider.js +163 -18
- package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts +159 -0
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts.map +1 -1
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js +119 -0
- package/dist/speech/providers/BuiltInAdaptiveVadProvider.js.map +1 -1
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +102 -16
- package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts.map +1 -1
- package/dist/speech/providers/DeepgramBatchSTTProvider.js +108 -13
- package/dist/speech/providers/DeepgramBatchSTTProvider.js.map +1 -1
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts +149 -0
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts.map +1 -1
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js +137 -2
- package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js.map +1 -1
- package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts +125 -0
- package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts.map +1 -1
- package/dist/speech/providers/OpenAITextToSpeechProvider.js +128 -4
- package/dist/speech/providers/OpenAITextToSpeechProvider.js.map +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts +110 -0
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts.map +1 -1
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js +115 -0
- package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js.map +1 -1
- package/dist/voice/CallManager.d.ts.map +1 -1
- package/dist/voice/CallManager.js +9 -1
- package/dist/voice/CallManager.js.map +1 -1
- package/dist/voice/MediaStreamParser.d.ts +115 -6
- package/dist/voice/MediaStreamParser.d.ts.map +1 -1
- package/dist/voice/MediaStreamParser.js +44 -0
- package/dist/voice/MediaStreamParser.js.map +1 -1
- package/dist/voice/TelephonyStreamTransport.d.ts +112 -20
- package/dist/voice/TelephonyStreamTransport.d.ts.map +1 -1
- package/dist/voice/TelephonyStreamTransport.js +136 -30
- package/dist/voice/TelephonyStreamTransport.js.map +1 -1
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts +64 -6
- package/dist/voice/parsers/PlivoMediaStreamParser.d.ts.map +1 -1
- package/dist/voice/parsers/PlivoMediaStreamParser.js +67 -6
- package/dist/voice/parsers/PlivoMediaStreamParser.js.map +1 -1
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts +55 -8
- package/dist/voice/parsers/TelnyxMediaStreamParser.d.ts.map +1 -1
- package/dist/voice/parsers/TelnyxMediaStreamParser.js +60 -9
- package/dist/voice/parsers/TelnyxMediaStreamParser.js.map +1 -1
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts +73 -11
- package/dist/voice/parsers/TwilioMediaStreamParser.d.ts.map +1 -1
- package/dist/voice/parsers/TwilioMediaStreamParser.js +81 -12
- package/dist/voice/parsers/TwilioMediaStreamParser.js.map +1 -1
- package/dist/voice/providers/plivo.d.ts +108 -12
- package/dist/voice/providers/plivo.d.ts.map +1 -1
- package/dist/voice/providers/plivo.js +106 -9
- package/dist/voice/providers/plivo.js.map +1 -1
- package/dist/voice/providers/telnyx.d.ts +110 -20
- package/dist/voice/providers/telnyx.d.ts.map +1 -1
- package/dist/voice/providers/telnyx.js +111 -20
- package/dist/voice/providers/telnyx.js.map +1 -1
- package/dist/voice/providers/twilio.d.ts +91 -13
- package/dist/voice/providers/twilio.d.ts.map +1 -1
- package/dist/voice/providers/twilio.js +94 -14
- package/dist/voice/providers/twilio.js.map +1 -1
- package/dist/voice/twiml.d.ts +70 -12
- package/dist/voice/twiml.d.ts.map +1 -1
- package/dist/voice/twiml.js +70 -12
- package/dist/voice/twiml.js.map +1 -1
- package/dist/voice/types.d.ts +142 -15
- package/dist/voice/types.d.ts.map +1 -1
- package/dist/voice/types.js +34 -3
- package/dist/voice/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,52 +1,156 @@
|
|
|
1
|
-
/**
|
|
1
|
+
/**
|
|
2
|
+
* Converts Azure's 100-nanosecond tick unit to seconds.
|
|
3
|
+
*
|
|
4
|
+
* Azure Cognitive Services uses "ticks" (100-nanosecond units) for all
|
|
5
|
+
* timing fields. One second = 10,000,000 ticks.
|
|
6
|
+
*
|
|
7
|
+
* @param ticks - Duration in 100-nanosecond Azure ticks.
|
|
8
|
+
* @returns Duration in seconds.
|
|
9
|
+
*
|
|
10
|
+
* @example
|
|
11
|
+
* ```ts
|
|
12
|
+
* ticksToSeconds(30_000_000); // 3.0 seconds
|
|
13
|
+
* ticksToSeconds(15_000_000); // 1.5 seconds
|
|
14
|
+
* ```
|
|
15
|
+
*/
|
|
2
16
|
function ticksToSeconds(ticks) {
|
|
3
17
|
return ticks / 10000000;
|
|
4
18
|
}
|
|
5
19
|
/**
|
|
6
20
|
* Speech-to-text provider that uses the Azure Cognitive Services Speech REST API.
|
|
7
21
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
22
|
+
* ## Azure REST Endpoint Format
|
|
23
|
+
*
|
|
24
|
+
* The endpoint URL follows this pattern:
|
|
25
|
+
* ```
|
|
26
|
+
* https://{region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language={lang}
|
|
27
|
+
* ```
|
|
28
|
+
*
|
|
29
|
+
* - `{region}` — The Azure region from config (e.g. `eastus`, `westeurope`).
|
|
30
|
+
* - `{lang}` — BCP-47 language code from options or `'en-US'` default.
|
|
31
|
+
* - The `/conversation/` path segment selects the conversation recognition mode
|
|
32
|
+
* (as opposed to `/interactive/` or `/dictation/`).
|
|
33
|
+
*
|
|
34
|
+
* ## Authentication: `Ocp-Apim-Subscription-Key`
|
|
35
|
+
*
|
|
36
|
+
* Azure Cognitive Services uses the `Ocp-Apim-Subscription-Key` HTTP header
|
|
37
|
+
* for authentication, which differs from the typical `Authorization: Bearer`
|
|
38
|
+
* pattern. The subscription key is sent as a plain-text header value — no
|
|
39
|
+
* "Bearer" or "Token" prefix.
|
|
40
|
+
*
|
|
41
|
+
* An alternative is to use a short-lived token from the token endpoint, but
|
|
42
|
+
* this provider uses the simpler key-based approach for reliability.
|
|
43
|
+
*
|
|
44
|
+
* ## NoMatch Handling
|
|
45
|
+
*
|
|
46
|
+
* When Azure's recognizer detects audio but cannot identify any speech, it
|
|
47
|
+
* returns `RecognitionStatus: 'NoMatch'` instead of raising an HTTP error.
|
|
48
|
+
* This provider maps `NoMatch` to an empty-text result (`text: ''`) with
|
|
49
|
+
* `isFinal: true`, matching the Azure Speech SDK's behaviour. This prevents
|
|
50
|
+
* the fallback proxy from unnecessarily trying another provider when the
|
|
51
|
+
* audio genuinely contains no speech.
|
|
52
|
+
*
|
|
53
|
+
* ## Limitations
|
|
54
|
+
*
|
|
55
|
+
* - Audio must be PCM WAV format. The `Content-Type` is hardcoded to
|
|
56
|
+
* `audio/wav` regardless of the `audio.mimeType` value.
|
|
57
|
+
* - Streaming is not supported — use the Azure Speech SDK for real-time STT.
|
|
58
|
+
* - Speaker diarization is not available via the REST API.
|
|
59
|
+
*
|
|
60
|
+
* @see {@link AzureSpeechSTTProviderConfig} for configuration options
|
|
61
|
+
* @see {@link AzureSpeechTTSProvider} for the corresponding TTS provider
|
|
12
62
|
*
|
|
13
63
|
* @example
|
|
14
64
|
* ```ts
|
|
15
|
-
* const provider = new AzureSpeechSTTProvider({
|
|
16
|
-
*
|
|
17
|
-
*
|
|
65
|
+
* const provider = new AzureSpeechSTTProvider({
|
|
66
|
+
* key: process.env.AZURE_SPEECH_KEY!,
|
|
67
|
+
* region: 'eastus',
|
|
68
|
+
* });
|
|
69
|
+
* const result = await provider.transcribe(
|
|
70
|
+
* { data: wavBuffer, mimeType: 'audio/wav' },
|
|
71
|
+
* { language: 'de-DE' },
|
|
72
|
+
* );
|
|
73
|
+
* console.log(result.text); // '' if no speech detected
|
|
18
74
|
* ```
|
|
19
75
|
*/
|
|
20
76
|
export class AzureSpeechSTTProvider {
|
|
77
|
+
/**
|
|
78
|
+
* Creates a new AzureSpeechSTTProvider.
|
|
79
|
+
*
|
|
80
|
+
* @param config - Provider configuration including the subscription key and region.
|
|
81
|
+
*
|
|
82
|
+
* @example
|
|
83
|
+
* ```ts
|
|
84
|
+
* const provider = new AzureSpeechSTTProvider({
|
|
85
|
+
* key: 'your-azure-subscription-key',
|
|
86
|
+
* region: 'eastus',
|
|
87
|
+
* });
|
|
88
|
+
* ```
|
|
89
|
+
*/
|
|
21
90
|
constructor(config) {
|
|
22
91
|
this.config = config;
|
|
92
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
23
93
|
this.id = 'azure-speech-stt';
|
|
94
|
+
/** Human-readable display name for UI and logging. */
|
|
24
95
|
this.displayName = 'Azure Speech (STT)';
|
|
96
|
+
/** This provider uses synchronous HTTP requests, not WebSocket streaming. */
|
|
25
97
|
this.supportsStreaming = false;
|
|
26
98
|
this.fetchImpl = config.fetchImpl ?? fetch;
|
|
27
99
|
}
|
|
28
|
-
/**
|
|
100
|
+
/**
|
|
101
|
+
* Returns the human-readable provider name.
|
|
102
|
+
*
|
|
103
|
+
* @returns The display name string `'Azure Speech (STT)'`.
|
|
104
|
+
*
|
|
105
|
+
* @example
|
|
106
|
+
* ```ts
|
|
107
|
+
* provider.getProviderName(); // 'Azure Speech (STT)'
|
|
108
|
+
* ```
|
|
109
|
+
*/
|
|
29
110
|
getProviderName() {
|
|
30
111
|
return this.displayName;
|
|
31
112
|
}
|
|
32
113
|
/**
|
|
33
114
|
* Transcribes an audio buffer using the Azure Speech recognition REST endpoint.
|
|
34
115
|
*
|
|
35
|
-
*
|
|
36
|
-
*
|
|
37
|
-
*
|
|
38
|
-
* @
|
|
116
|
+
* Sends the raw audio as PCM WAV and returns a normalized result. Azure's
|
|
117
|
+
* `NoMatch` status is treated as an empty transcript (not an error).
|
|
118
|
+
*
|
|
119
|
+
* @param audio - Raw audio data. Azure expects PCM WAV format; the
|
|
120
|
+
* Content-Type header is always set to `'audio/wav'` regardless of
|
|
121
|
+
* `audio.mimeType`.
|
|
122
|
+
* @param options - Optional transcription settings. Only `language` is
|
|
123
|
+
* supported by the Azure REST endpoint.
|
|
124
|
+
* @returns A promise resolving to the normalized transcription result.
|
|
125
|
+
* @throws {Error} When the Azure API returns a non-2xx HTTP status code.
|
|
126
|
+
* The error message includes the status and response body text.
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```ts
|
|
130
|
+
* const result = await provider.transcribe(
|
|
131
|
+
* { data: wavBuffer, durationSeconds: 5 },
|
|
132
|
+
* { language: 'fr-FR' },
|
|
133
|
+
* );
|
|
134
|
+
* if (result.text === '') {
|
|
135
|
+
* console.log('No speech detected in the audio');
|
|
136
|
+
* }
|
|
137
|
+
* ```
|
|
39
138
|
*/
|
|
40
139
|
async transcribe(audio, options = {}) {
|
|
41
140
|
const lang = options.language ?? 'en-US';
|
|
42
141
|
const { key, region } = this.config;
|
|
142
|
+
// Build the Azure STT REST endpoint URL.
|
|
143
|
+
// The /conversation/ path selects conversation recognition mode which is
|
|
144
|
+
// the most general-purpose mode for varied audio content.
|
|
43
145
|
const url = `https://${region}.stt.speech.microsoft.com` +
|
|
44
146
|
`/speech/recognition/conversation/cognitiveservices/v1` +
|
|
45
147
|
`?language=${encodeURIComponent(lang)}`;
|
|
46
148
|
const response = await this.fetchImpl(url, {
|
|
47
149
|
method: 'POST',
|
|
48
150
|
headers: {
|
|
151
|
+
// Azure uses this non-standard header for subscription key auth
|
|
49
152
|
'Ocp-Apim-Subscription-Key': key,
|
|
153
|
+
// Hardcoded to audio/wav because Azure's REST endpoint requires WAV format
|
|
50
154
|
'Content-Type': 'audio/wav',
|
|
51
155
|
},
|
|
52
156
|
body: audio.data,
|
|
@@ -56,7 +160,9 @@ export class AzureSpeechSTTProvider {
|
|
|
56
160
|
throw new Error(`Azure Speech STT failed (${response.status}): ${message}`);
|
|
57
161
|
}
|
|
58
162
|
const payload = (await response.json());
|
|
59
|
-
// NoMatch means the recognizer found no speech
|
|
163
|
+
// NoMatch means the recognizer detected audio but found no speech content.
|
|
164
|
+
// Return an empty result instead of throwing — this is the expected behaviour
|
|
165
|
+
// for silence or noise-only audio, matching the Azure Speech SDK pattern.
|
|
60
166
|
if (payload.RecognitionStatus === 'NoMatch') {
|
|
61
167
|
return {
|
|
62
168
|
text: '',
|
|
@@ -70,6 +176,8 @@ export class AzureSpeechSTTProvider {
|
|
|
70
176
|
},
|
|
71
177
|
};
|
|
72
178
|
}
|
|
179
|
+
// Convert Azure's 100-nanosecond ticks to seconds, falling back to the
|
|
180
|
+
// client-provided duration estimate if the API doesn't return Duration.
|
|
73
181
|
const durationSeconds = typeof payload.Duration === 'number'
|
|
74
182
|
? ticksToSeconds(payload.Duration)
|
|
75
183
|
: audio.durationSeconds;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AzureSpeechSTTProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechSTTProvider.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"AzureSpeechSTTProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechSTTProvider.ts"],"names":[],"mappings":"AA+EA;;;;;;;;;;;;;;GAcG;AACH,SAAS,cAAc,CAAC,KAAa;IACnC,OAAO,KAAK,GAAG,QAAU,CAAC;AAC5B,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwDG;AACH,MAAM,OAAO,sBAAsB;IAajC;;;;;;;;;;;;OAYG;IACH,YAA6B,MAAoC;QAApC,WAAM,GAAN,MAAM,CAA8B;QAzBjE,uEAAuE;QACvD,OAAE,GAAG,kBAAkB,CAAC;QAExC,sDAAsD;QACtC,gBAAW,GAAG,oBAAoB,CAAC;QAEnD,6EAA6E;QAC7D,sBAAiB,GAAG,KAAK,CAAC;QAmBxC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,KAAK,CAAC;IAC7C,CAAC;IAED;;;;;;;;;OASG;IACH,eAAe;QACb,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;IACH,KAAK,CAAC,UAAU,CACd,KAAuB,EACvB,UAAsC,EAAE;QAExC,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC;QACzC,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC;QAEpC,yCAAyC;QACzC,yEAAyE;QACzE,0DAA0D;QAC1D,MAAM,GAAG,GACP,WAAW,MAAM,2BAA2B;YAC5C,uDAAuD;YACvD,aAAa,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;QAE1C,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE;YACzC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,gEAAgE;gBAChE,2BAA2B,EAAE,GAAG;gBAChC,2EAA2E;gBAC3E,cAAc,EAAE,WAAW;aAC5B;YACD,IAAI,EAAE,KAAK,CAAC,IAA2B;SACxC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,4BAA4B,QAAQ,CAAC,MAAM,MAAM,OAAO,EAAE,CAAC,CAAC;QAC9E,CAAC;QAED,MAAM,OAAO,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAwB,CAAC;QAE/D,2EAA2E;QAC3E,8EAA8E;QAC9E,0EAA0E;QAC1E,IAAI,OAAO,CAAC,iBAAiB,KAAK,SAAS,EAAE,CAAC;YAC5C,OAAO;gBACL,IAAI,EAAE,EAAE;gBACR,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE,CAAC;gBACP,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,OAAO;gBACzB,KAAK,EAAE;oBACL,eAAe,EAAE,CAAC,KAAK,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,EAAE;oBAClD,SAAS,EAAE,kBAAkB;iBAC9B;aACF,CAAC;QACJ,CAAC;QAED,uEAAuE;QACvE,wEAAwE;QACxE,MAAM,eAAe,GACnB,OAAO,OAAO,CAAC,QAAQ,KAAK,QAAQ;YAClC,CAAC,CAAC,cAAc,CAAC,OAAO,CAAC,QAAQ,CAAC;YAClC,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC;QAE5B,OAAO;YACL,IAAI,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;YAC/B,QAAQ
,EAAE,IAAI;YACd,eAAe;YACf,IAAI,EAAE,CAAC;YACP,gBAAgB,EAAE,OAAO;YACzB,OAAO,EAAE,IAAI;YACb,KAAK,EAAE;gBACL,eAAe,EAAE,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,EAAE;gBAC5C,SAAS,EAAE,kBAAkB;aAC9B;SACF,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -1,59 +1,174 @@
|
|
|
1
1
|
import type { SpeechSynthesisOptions, SpeechSynthesisResult, SpeechVoice, TextToSpeechProvider } from '../types.js';
|
|
2
|
-
/**
|
|
2
|
+
/**
|
|
3
|
+
* Configuration for the {@link AzureSpeechTTSProvider}.
|
|
4
|
+
*
|
|
5
|
+
* @see {@link AzureSpeechTTSProvider} for usage examples
|
|
6
|
+
* @see https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech
|
|
7
|
+
*/
|
|
3
8
|
export interface AzureSpeechTTSProviderConfig {
|
|
4
|
-
/**
|
|
9
|
+
/**
|
|
10
|
+
* Azure Cognitive Services subscription key.
|
|
11
|
+
* Sent as the `Ocp-Apim-Subscription-Key` header value.
|
|
12
|
+
*
|
|
13
|
+
* @see {@link AzureSpeechSTTProviderConfig.key} for the same pattern on STT
|
|
14
|
+
*/
|
|
5
15
|
key: string;
|
|
6
|
-
/**
|
|
16
|
+
/**
|
|
17
|
+
* Azure region where the Speech resource is deployed, e.g. `'eastus'`,
|
|
18
|
+
* `'westeurope'`, `'southeastasia'`.
|
|
19
|
+
*
|
|
20
|
+
* The region determines the REST endpoint hostname:
|
|
21
|
+
* `https://{region}.tts.speech.microsoft.com`
|
|
22
|
+
*/
|
|
7
23
|
region: string;
|
|
8
24
|
/**
|
|
9
25
|
* Default voice name to use when none is specified per-request.
|
|
26
|
+
* Must be a valid Azure voice short-name (e.g. `'en-US-JennyNeural'`).
|
|
27
|
+
*
|
|
10
28
|
* @default 'en-US-JennyNeural'
|
|
29
|
+
* @see https://learn.microsoft.com/azure/ai-services/speech-service/language-support#prebuilt-neural-voices
|
|
11
30
|
*/
|
|
12
31
|
defaultVoice?: string;
|
|
13
32
|
/**
|
|
14
|
-
* Custom fetch implementation
|
|
15
|
-
*
|
|
33
|
+
* Custom fetch implementation for dependency injection in tests.
|
|
34
|
+
* @default globalThis.fetch
|
|
16
35
|
*/
|
|
17
36
|
fetchImpl?: typeof fetch;
|
|
18
37
|
}
|
|
19
38
|
/**
|
|
20
39
|
* Text-to-speech provider that uses the Azure Cognitive Services Speech REST API.
|
|
21
40
|
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
41
|
+
* ## SSML Generation
|
|
42
|
+
*
|
|
43
|
+
* Azure's TTS REST endpoint requires SSML (Speech Synthesis Markup Language) as
|
|
44
|
+
* the request body — it does not accept plain text. This provider generates
|
|
45
|
+
* minimal SSML via {@link buildSsml} that wraps the input text in `<speak>`
|
|
46
|
+
* and `<voice>` elements. Special XML characters in the text are escaped via
|
|
47
|
+
* {@link escapeXml} to prevent malformed XML.
|
|
48
|
+
*
|
|
49
|
+
* ## `X-Microsoft-OutputFormat` Options
|
|
50
|
+
*
|
|
51
|
+
* The `X-Microsoft-OutputFormat` header controls the audio encoding. This
|
|
52
|
+
* provider uses `'audio-24khz-96kbitrate-mono-mp3'` which provides:
|
|
53
|
+
* - 24 kHz sample rate (high quality for speech)
|
|
54
|
+
* - 96 kbps bitrate (good balance of quality and file size)
|
|
55
|
+
* - Mono channel (sufficient for speech synthesis)
|
|
56
|
+
* - MP3 format (universally supported)
|
|
57
|
+
*
|
|
58
|
+
* Other available formats include:
|
|
59
|
+
* - `'audio-16khz-128kbitrate-mono-mp3'` — Lower sample rate, higher bitrate
|
|
60
|
+
* - `'audio-24khz-160kbitrate-mono-mp3'` — Higher bitrate for better quality
|
|
61
|
+
* - `'riff-24khz-16bit-mono-pcm'` — Uncompressed WAV
|
|
62
|
+
* - `'ogg-24khz-16bit-mono-opus'` — Opus codec in OGG container
|
|
63
|
+
*
|
|
64
|
+
* @see https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs
|
|
65
|
+
*
|
|
66
|
+
* ## Voice Listing
|
|
67
|
+
*
|
|
68
|
+
* The {@link listAvailableVoices} method fetches the full list of neural voices
|
|
69
|
+
* available in the configured Azure region via
|
|
70
|
+
* `GET /cognitiveservices/voices/list`. Results are mapped to the normalized
|
|
71
|
+
* {@link SpeechVoice} shape.
|
|
72
|
+
*
|
|
73
|
+
* @see {@link AzureSpeechTTSProviderConfig} for configuration options
|
|
74
|
+
* @see {@link AzureSpeechSTTProvider} for the corresponding STT provider
|
|
25
75
|
*
|
|
26
76
|
* @example
|
|
27
77
|
* ```ts
|
|
28
|
-
* const provider = new AzureSpeechTTSProvider({
|
|
78
|
+
* const provider = new AzureSpeechTTSProvider({
|
|
79
|
+
* key: process.env.AZURE_SPEECH_KEY!,
|
|
80
|
+
* region: 'eastus',
|
|
81
|
+
* defaultVoice: 'en-US-GuyNeural',
|
|
82
|
+
* });
|
|
29
83
|
* const result = await provider.synthesize('Hello world');
|
|
30
84
|
* // result.audioBuffer contains MP3 bytes
|
|
85
|
+
* // result.mimeType === 'audio/mpeg'
|
|
31
86
|
* ```
|
|
32
87
|
*/
|
|
33
88
|
export declare class AzureSpeechTTSProvider implements TextToSpeechProvider {
|
|
34
89
|
private readonly config;
|
|
90
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
35
91
|
readonly id = "azure-speech-tts";
|
|
92
|
+
/** Human-readable display name for UI and logging. */
|
|
36
93
|
readonly displayName = "Azure Speech (TTS)";
|
|
94
|
+
/**
|
|
95
|
+
* Marked as streaming-capable because the provider can be used within a
|
|
96
|
+
* streaming pipeline — though the actual HTTP request is a single
|
|
97
|
+
* synchronous call that returns the complete audio buffer.
|
|
98
|
+
*/
|
|
37
99
|
readonly supportsStreaming = true;
|
|
100
|
+
/** Fetch implementation — injected for testability, defaults to global fetch. */
|
|
38
101
|
private readonly fetchImpl;
|
|
102
|
+
/** Resolved default voice name used when no voice is specified per-request. */
|
|
39
103
|
private readonly defaultVoice;
|
|
104
|
+
/**
|
|
105
|
+
* Creates a new AzureSpeechTTSProvider.
|
|
106
|
+
*
|
|
107
|
+
* @param config - Provider configuration including the subscription key,
|
|
108
|
+
* region, and optional default voice.
|
|
109
|
+
*
|
|
110
|
+
* @example
|
|
111
|
+
* ```ts
|
|
112
|
+
* const provider = new AzureSpeechTTSProvider({
|
|
113
|
+
* key: 'your-azure-subscription-key',
|
|
114
|
+
* region: 'westeurope',
|
|
115
|
+
* defaultVoice: 'de-DE-ConradNeural',
|
|
116
|
+
* });
|
|
117
|
+
* ```
|
|
118
|
+
*/
|
|
40
119
|
constructor(config: AzureSpeechTTSProviderConfig);
|
|
41
|
-
/**
|
|
120
|
+
/**
|
|
121
|
+
* Returns the human-readable provider name.
|
|
122
|
+
*
|
|
123
|
+
* @returns The display name string `'Azure Speech (TTS)'`.
|
|
124
|
+
*
|
|
125
|
+
* @example
|
|
126
|
+
* ```ts
|
|
127
|
+
* provider.getProviderName(); // 'Azure Speech (TTS)'
|
|
128
|
+
* ```
|
|
129
|
+
*/
|
|
42
130
|
getProviderName(): string;
|
|
43
131
|
/**
|
|
44
132
|
* Synthesizes speech from plain text using the Azure TTS REST endpoint.
|
|
45
133
|
*
|
|
46
|
-
*
|
|
47
|
-
*
|
|
134
|
+
* The text is wrapped in SSML, sent to Azure, and the response audio buffer
|
|
135
|
+
* (MP3 format) is returned along with metadata.
|
|
136
|
+
*
|
|
137
|
+
* @param text - The plain-text utterance to convert to audio. XML special
|
|
138
|
+
* characters are automatically escaped.
|
|
139
|
+
* @param options - Optional synthesis settings. Use `options.voice` to
|
|
140
|
+
* override the default voice with any valid Azure voice short-name.
|
|
48
141
|
* @returns A promise resolving to the MP3 audio buffer and metadata.
|
|
49
|
-
* @throws When the Azure API returns a non-2xx status.
|
|
142
|
+
* @throws {Error} When the Azure API returns a non-2xx status code.
|
|
143
|
+
* Common causes: invalid subscription key (401), region mismatch (404),
|
|
144
|
+
* invalid SSML (400), or quota exceeded (429).
|
|
145
|
+
*
|
|
146
|
+
* @example
|
|
147
|
+
* ```ts
|
|
148
|
+
* const result = await provider.synthesize('Guten Tag!', {
|
|
149
|
+
* voice: 'de-DE-ConradNeural',
|
|
150
|
+
* });
|
|
151
|
+
* fs.writeFileSync('output.mp3', result.audioBuffer);
|
|
152
|
+
* ```
|
|
50
153
|
*/
|
|
51
154
|
synthesize(text: string, options?: SpeechSynthesisOptions): Promise<SpeechSynthesisResult>;
|
|
52
155
|
/**
|
|
53
156
|
* Retrieves the list of available neural voices from the Azure region.
|
|
54
157
|
*
|
|
55
|
-
*
|
|
56
|
-
*
|
|
158
|
+
* Fetches from `GET /cognitiveservices/voices/list` and maps each entry
|
|
159
|
+
* to the normalized {@link SpeechVoice} shape. The list includes all
|
|
160
|
+
* neural and standard voices available in the configured region.
|
|
161
|
+
*
|
|
162
|
+
* @returns A promise resolving to an array of normalized voice entries.
|
|
163
|
+
* @throws {Error} When the Azure API returns a non-2xx status code
|
|
164
|
+
* (e.g. invalid key, network error).
|
|
165
|
+
*
|
|
166
|
+
* @example
|
|
167
|
+
* ```ts
|
|
168
|
+
* const voices = await provider.listAvailableVoices();
|
|
169
|
+
* const englishVoices = voices.filter(v => v.lang.startsWith('en-'));
|
|
170
|
+
* console.log(`Found ${englishVoices.length} English voices`);
|
|
171
|
+
* ```
|
|
57
172
|
*/
|
|
58
173
|
listAvailableVoices(): Promise<SpeechVoice[]>;
|
|
59
174
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"AzureSpeechTTSProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechTTSProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB
|
|
1
|
+
{"version":3,"file":"AzureSpeechTTSProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechTTSProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB;;;;;GAKG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;OAKG;IACH,GAAG,EAAE,MAAM,CAAC;IAEZ;;;;;;OAMG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;;;;OAMG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAmHD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiDG;AACH,qBAAa,sBAAuB,YAAW,oBAAoB;IAmCrD,OAAO,CAAC,QAAQ,CAAC,MAAM;IAlCnC,uEAAuE;IACvE,SAAgB,EAAE,sBAAsB;IAExC,sDAAsD;IACtD,SAAgB,WAAW,wBAAwB;IAEnD;;;;OAIG;IACH,SAAgB,iBAAiB,QAAQ;IAEzC,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC,+EAA+E;IAC/E,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAS;IAEtC;;;;;;;;;;;;;;OAcG;gBAC0B,MAAM,EAAE,4BAA4B;IAKjE;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;;;;;OAsBG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,sBAA2B,GACnC,OAAO,CAAC,qBAAqB,CAAC;IA4CjC;;;;;;;;;;;;;;;;;OAiBG;IACG,mBAAmB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;CAgBpD"}
|
|
@@ -1,21 +1,56 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Escapes special XML characters in text before embedding it in SSML.
|
|
3
|
-
*
|
|
4
|
-
*
|
|
3
|
+
*
|
|
4
|
+
* Azure's TTS endpoint expects well-formed XML in the request body. Unescaped
|
|
5
|
+
* `<`, `>`, `&`, `"`, or `'` characters in the input text would cause a 400
|
|
6
|
+
* Bad Request error because they break the XML structure.
|
|
7
|
+
*
|
|
8
|
+
* The five standard XML entity replacements are applied:
|
|
9
|
+
* - `&` -> `&amp;` (must be first to avoid double-escaping)
|
|
10
|
+
* - `<` -> `&lt;`
|
|
11
|
+
* - `>` -> `&gt;`
|
|
12
|
+
* - `"` -> `&quot;`
|
|
13
|
+
* - `'` -> `&apos;`
|
|
14
|
+
*
|
|
15
|
+
* @param text - Raw plain text to escape for safe XML embedding.
|
|
16
|
+
* @returns The XML-safe escaped string.
|
|
17
|
+
*
|
|
18
|
+
* @example
|
|
19
|
+
* ```ts
|
|
20
|
+
* escapeXml('Hello & <world>'); // 'Hello &amp; &lt;world&gt;'
|
|
21
|
+
* ```
|
|
5
22
|
*/
|
|
6
23
|
function escapeXml(text) {
|
|
7
24
|
return text
|
|
8
|
-
.replace(/&/g, '&amp;')
|
|
25
|
+
.replace(/&/g, '&amp;') // Must be first to avoid double-escaping
|
|
9
26
|
.replace(/</g, '&lt;')
|
|
10
27
|
.replace(/>/g, '&gt;')
|
|
11
28
|
.replace(/"/g, '&quot;')
|
|
12
29
|
.replace(/'/g, '&apos;');
|
|
13
30
|
}
|
|
14
31
|
/**
|
|
15
|
-
* Builds the SSML
|
|
32
|
+
* Builds the SSML (Speech Synthesis Markup Language) payload for the Azure
|
|
33
|
+
* TTS REST endpoint.
|
|
34
|
+
*
|
|
35
|
+
* The generated SSML wraps the escaped text in a `<voice>` element with the
|
|
36
|
+
* specified voice name. The outer `<speak>` element declares SSML version 1.0
|
|
37
|
+
* and the W3C synthesis namespace.
|
|
16
38
|
*
|
|
17
|
-
*
|
|
39
|
+
* More advanced SSML features (prosody, emphasis, break) could be added here
|
|
40
|
+
* but are not currently needed for basic synthesis.
|
|
41
|
+
*
|
|
42
|
+
* @param text - Plain-text utterance to synthesize (will be XML-escaped).
|
|
18
43
|
* @param voice - Azure voice short-name, e.g. `'en-US-JennyNeural'`.
|
|
44
|
+
* @returns Well-formed SSML string ready to send as the request body.
|
|
45
|
+
*
|
|
46
|
+
* @see {@link escapeXml} for the XML escaping logic
|
|
47
|
+
* @see https://learn.microsoft.com/azure/ai-services/speech-service/speech-synthesis-markup
|
|
48
|
+
*
|
|
49
|
+
* @example
|
|
50
|
+
* ```ts
|
|
51
|
+
* buildSsml('Hello world', 'en-US-JennyNeural');
|
|
52
|
+
* // '<speak version="1.0" xmlns="..."><voice name="en-US-JennyNeural">Hello world</voice></speak>'
|
|
53
|
+
* ```
|
|
19
54
|
*/
|
|
20
55
|
function buildSsml(text, voice) {
|
|
21
56
|
return (`<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">` +
|
|
@@ -23,7 +58,17 @@ function buildSsml(text, voice) {
|
|
|
23
58
|
`</speak>`);
|
|
24
59
|
}
|
|
25
60
|
/**
|
|
26
|
-
* Maps an Azure voice list entry to the
|
|
61
|
+
* Maps an Azure voice list entry to the normalized {@link SpeechVoice} shape.
|
|
62
|
+
*
|
|
63
|
+
* The gender field is lowercased and validated against known values. Unknown
|
|
64
|
+
* gender strings (if Azure adds new values) are passed through as-is since
|
|
65
|
+
* the {@link SpeechVoice.gender} type accepts `string`.
|
|
66
|
+
*
|
|
67
|
+
* @param entry - A single voice entry from the Azure voices/list endpoint.
|
|
68
|
+
* @returns The normalized voice object.
|
|
69
|
+
*
|
|
70
|
+
* @see {@link AzureVoiceEntry} for the input shape
|
|
71
|
+
* @see {@link SpeechVoice} for the output shape
|
|
27
72
|
*/
|
|
28
73
|
function mapVoice(entry) {
|
|
29
74
|
const gender = entry.Gender?.toLowerCase();
|
|
@@ -40,48 +85,135 @@ function mapVoice(entry) {
|
|
|
40
85
|
/**
|
|
41
86
|
* Text-to-speech provider that uses the Azure Cognitive Services Speech REST API.
|
|
42
87
|
*
|
|
43
|
-
*
|
|
44
|
-
*
|
|
45
|
-
*
|
|
88
|
+
* ## SSML Generation
|
|
89
|
+
*
|
|
90
|
+
* Azure's TTS REST endpoint requires SSML (Speech Synthesis Markup Language) as
|
|
91
|
+
* the request body — it does not accept plain text. This provider generates
|
|
92
|
+
* minimal SSML via {@link buildSsml} that wraps the input text in `<speak>`
|
|
93
|
+
* and `<voice>` elements. Special XML characters in the text are escaped via
|
|
94
|
+
* {@link escapeXml} to prevent malformed XML.
|
|
95
|
+
*
|
|
96
|
+
* ## `X-Microsoft-OutputFormat` Options
|
|
97
|
+
*
|
|
98
|
+
* The `X-Microsoft-OutputFormat` header controls the audio encoding. This
|
|
99
|
+
* provider uses `'audio-24khz-96kbitrate-mono-mp3'` which provides:
|
|
100
|
+
* - 24 kHz sample rate (high quality for speech)
|
|
101
|
+
* - 96 kbps bitrate (good balance of quality and file size)
|
|
102
|
+
* - Mono channel (sufficient for speech synthesis)
|
|
103
|
+
* - MP3 format (universally supported)
|
|
104
|
+
*
|
|
105
|
+
* Other available formats include:
|
|
106
|
+
* - `'audio-16khz-128kbitrate-mono-mp3'` — Lower sample rate, higher bitrate
|
|
107
|
+
* - `'audio-24khz-160kbitrate-mono-mp3'` — Higher bitrate for better quality
|
|
108
|
+
* - `'riff-24khz-16bit-mono-pcm'` — Uncompressed WAV
|
|
109
|
+
* - `'ogg-24khz-16bit-mono-opus'` — Opus codec in OGG container
|
|
110
|
+
*
|
|
111
|
+
* @see https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs
|
|
112
|
+
*
|
|
113
|
+
* ## Voice Listing
|
|
114
|
+
*
|
|
115
|
+
* The {@link listAvailableVoices} method fetches the full list of voices
|
|
116
|
+
* available in the configured Azure region via
|
|
117
|
+
* `GET /cognitiveservices/voices/list`. Results are mapped to the normalized
|
|
118
|
+
* {@link SpeechVoice} shape.
|
|
119
|
+
*
|
|
120
|
+
* @see {@link AzureSpeechTTSProviderConfig} for configuration options
|
|
121
|
+
* @see {@link AzureSpeechSTTProvider} for the corresponding STT provider
|
|
46
122
|
*
|
|
47
123
|
* @example
|
|
48
124
|
* ```ts
|
|
49
|
-
* const provider = new AzureSpeechTTSProvider({
|
|
125
|
+
* const provider = new AzureSpeechTTSProvider({
|
|
126
|
+
* key: process.env.AZURE_SPEECH_KEY!,
|
|
127
|
+
* region: 'eastus',
|
|
128
|
+
* defaultVoice: 'en-US-GuyNeural',
|
|
129
|
+
* });
|
|
50
130
|
* const result = await provider.synthesize('Hello world');
|
|
51
131
|
* // result.audioBuffer contains MP3 bytes
|
|
132
|
+
* // result.mimeType === 'audio/mpeg'
|
|
52
133
|
* ```
|
|
53
134
|
*/
|
|
54
135
|
export class AzureSpeechTTSProvider {
|
|
136
|
+
/**
|
|
137
|
+
* Creates a new AzureSpeechTTSProvider.
|
|
138
|
+
*
|
|
139
|
+
* @param config - Provider configuration including the subscription key,
|
|
140
|
+
* region, and optional default voice.
|
|
141
|
+
*
|
|
142
|
+
* @example
|
|
143
|
+
* ```ts
|
|
144
|
+
* const provider = new AzureSpeechTTSProvider({
|
|
145
|
+
* key: 'your-azure-subscription-key',
|
|
146
|
+
* region: 'westeurope',
|
|
147
|
+
* defaultVoice: 'de-DE-ConradNeural',
|
|
148
|
+
* });
|
|
149
|
+
* ```
|
|
150
|
+
*/
|
|
55
151
|
constructor(config) {
|
|
56
152
|
this.config = config;
|
|
153
|
+
/** Unique provider identifier used for registration and resolution. */
|
|
57
154
|
this.id = 'azure-speech-tts';
|
|
155
|
+
/** Human-readable display name for UI and logging. */
|
|
58
156
|
this.displayName = 'Azure Speech (TTS)';
|
|
157
|
+
/**
|
|
158
|
+
* Marked as streaming-capable because the provider can be used within a
|
|
159
|
+
* streaming pipeline — though the actual HTTP request is a single
|
|
160
|
+
* non-streaming call that returns the complete audio buffer.
|
|
161
|
+
*/
|
|
59
162
|
this.supportsStreaming = true;
|
|
60
163
|
this.fetchImpl = config.fetchImpl ?? fetch;
|
|
61
164
|
this.defaultVoice = config.defaultVoice ?? 'en-US-JennyNeural';
|
|
62
165
|
}
|
|
63
|
-
/**
|
|
166
|
+
/**
|
|
167
|
+
* Returns the human-readable provider name.
|
|
168
|
+
*
|
|
169
|
+
* @returns The display name string `'Azure Speech (TTS)'`.
|
|
170
|
+
*
|
|
171
|
+
* @example
|
|
172
|
+
* ```ts
|
|
173
|
+
* provider.getProviderName(); // 'Azure Speech (TTS)'
|
|
174
|
+
* ```
|
|
175
|
+
*/
|
|
64
176
|
getProviderName() {
|
|
65
177
|
return this.displayName;
|
|
66
178
|
}
|
|
67
179
|
/**
|
|
68
180
|
* Synthesizes speech from plain text using the Azure TTS REST endpoint.
|
|
69
181
|
*
|
|
70
|
-
*
|
|
71
|
-
*
|
|
182
|
+
* The text is wrapped in SSML, sent to Azure, and the response audio buffer
|
|
183
|
+
* (MP3 format) is returned along with metadata.
|
|
184
|
+
*
|
|
185
|
+
* @param text - The plain-text utterance to convert to audio. XML special
|
|
186
|
+
* characters are automatically escaped.
|
|
187
|
+
* @param options - Optional synthesis settings. Use `options.voice` to
|
|
188
|
+
* override the default voice with any valid Azure voice short-name.
|
|
72
189
|
* @returns A promise resolving to the MP3 audio buffer and metadata.
|
|
73
|
-
* @throws When the Azure API returns a non-2xx status.
|
|
190
|
+
* @throws {Error} When the Azure API returns a non-2xx status code.
|
|
191
|
+
* Common causes: invalid subscription key (401), region mismatch (404),
|
|
192
|
+
* invalid SSML (400), or quota exceeded (429).
|
|
193
|
+
*
|
|
194
|
+
* @example
|
|
195
|
+
* ```ts
|
|
196
|
+
* const result = await provider.synthesize('Guten Tag!', {
|
|
197
|
+
* voice: 'de-DE-ConradNeural',
|
|
198
|
+
* });
|
|
199
|
+
* fs.writeFileSync('output.mp3', result.audioBuffer);
|
|
200
|
+
* ```
|
|
74
201
|
*/
|
|
75
202
|
async synthesize(text, options = {}) {
|
|
76
203
|
const voice = options.voice ?? this.defaultVoice;
|
|
77
204
|
const { key, region } = this.config;
|
|
205
|
+
// Azure TTS endpoint — note it uses tts.speech.microsoft.com (not stt.)
|
|
78
206
|
const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
|
|
79
207
|
const ssml = buildSsml(text, voice);
|
|
80
208
|
const response = await this.fetchImpl(url, {
|
|
81
209
|
method: 'POST',
|
|
82
210
|
headers: {
|
|
211
|
+
// Azure's standard subscription key authentication header
|
|
83
212
|
'Ocp-Apim-Subscription-Key': key,
|
|
213
|
+
// SSML content type — Azure rejects plain text
|
|
84
214
|
'Content-Type': 'application/ssml+xml',
|
|
215
|
+
// Output format header — determines the audio encoding, sample rate,
|
|
216
|
+
// and container format of the response body
|
|
85
217
|
'X-Microsoft-OutputFormat': 'audio-24khz-96kbitrate-mono-mp3',
|
|
86
218
|
},
|
|
87
219
|
body: ssml,
|
|
@@ -90,12 +222,13 @@ export class AzureSpeechTTSProvider {
|
|
|
90
222
|
const message = await response.text();
|
|
91
223
|
throw new Error(`Azure Speech TTS failed (${response.status}): ${message}`);
|
|
92
224
|
}
|
|
225
|
+
// Read the complete audio response into a Buffer
|
|
93
226
|
const arrayBuffer = await response.arrayBuffer();
|
|
94
227
|
const audioBuffer = Buffer.from(arrayBuffer);
|
|
95
228
|
return {
|
|
96
229
|
audioBuffer,
|
|
97
|
-
mimeType: 'audio/mpeg',
|
|
98
|
-
cost: 0,
|
|
230
|
+
mimeType: 'audio/mpeg', // Matches the X-Microsoft-OutputFormat MP3 selection
|
|
231
|
+
cost: 0, // Cost tracking is handled at a higher layer
|
|
99
232
|
voiceUsed: voice,
|
|
100
233
|
providerName: this.displayName,
|
|
101
234
|
usage: {
|
|
@@ -107,8 +240,20 @@ export class AzureSpeechTTSProvider {
|
|
|
107
240
|
/**
|
|
108
241
|
* Retrieves the list of available voices from the Azure region.
|
|
109
242
|
*
|
|
110
|
-
*
|
|
111
|
-
*
|
|
243
|
+
* Fetches from `GET /cognitiveservices/voices/list` and maps each entry
|
|
244
|
+
* to the normalized {@link SpeechVoice} shape. The list includes all
|
|
245
|
+
* neural and standard voices available in the configured region.
|
|
246
|
+
*
|
|
247
|
+
* @returns A promise resolving to an array of normalized voice entries.
|
|
248
|
+
* @throws {Error} When the Azure API returns a non-2xx status code
|
|
249
|
+
* (e.g. 401 for an invalid subscription key).
|
|
250
|
+
*
|
|
251
|
+
* @example
|
|
252
|
+
* ```ts
|
|
253
|
+
* const voices = await provider.listAvailableVoices();
|
|
254
|
+
* const englishVoices = voices.filter(v => v.lang.startsWith('en-'));
|
|
255
|
+
* console.log(`Found ${englishVoices.length} English voices`);
|
|
256
|
+
* ```
|
|
112
257
|
*/
|
|
113
258
|
async listAvailableVoices() {
|
|
114
259
|
const { key, region } = this.config;
|