@absolutejs/voice-azure 0.0.1-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -0
- package/dist/azure.d.ts +3 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +232 -0
- package/dist/types.d.ts +26 -0
- package/package.json +38 -0
package/README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# `@absolutejs/voice-azure`
|
|
2
|
+
|
|
3
|
+
Azure Speech (Cognitive Services) adapter for `@absolutejs/voice`.
|
|
4
|
+
|
|
5
|
+
Currently ships **Neural Text-to-Speech** via Azure's REST `/cognitiveservices/v1` endpoint. Streaming Speech-to-Text via Azure's WebSocket USP protocol is the next package update (see "Roadmap" below).
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```sh
|
|
10
|
+
bun add @absolutejs/voice-azure
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
`@absolutejs/voice` is a runtime dependency.
|
|
14
|
+
|
|
15
|
+
## TTS
|
|
16
|
+
|
|
17
|
+
```ts
|
|
18
|
+
import { voice } from "@absolutejs/voice";
|
|
19
|
+
import { azureTTS } from "@absolutejs/voice-azure";
|
|
20
|
+
|
|
21
|
+
const app = voice({
|
|
22
|
+
// ... stt + other voice options ...
|
|
23
|
+
tts: azureTTS({
|
|
24
|
+
region: "eastus",
|
|
25
|
+
subscriptionKey: process.env.AZURE_SPEECH_KEY!,
|
|
26
|
+
voice: "en-US-JennyNeural",
|
|
27
|
+
// optional:
|
|
28
|
+
outputFormat: "raw-24khz-16bit-mono-pcm", // default
|
|
29
|
+
language: "en-US", // default
|
|
30
|
+
voiceStyle: "cheerful",
|
|
31
|
+
styleDegree: 1.5,
|
|
32
|
+
prosody: { rate: "fast", pitch: "+5%" },
|
|
33
|
+
}),
|
|
34
|
+
});
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
For telephony bridges, use a μ-law raw format at 8 kHz:
|
|
38
|
+
|
|
39
|
+
```ts
|
|
40
|
+
azureTTS({
|
|
41
|
+
region,
|
|
42
|
+
subscriptionKey,
|
|
43
|
+
voice: "en-US-AriaNeural",
|
|
44
|
+
outputFormat: "raw-8khz-8bit-mono-mulaw",
|
|
45
|
+
});
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Bearer-token auth (10-minute Azure auth tokens) is also supported:
|
|
49
|
+
|
|
50
|
+
```ts
|
|
51
|
+
azureTTS({ region, token, voice });
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Options
|
|
55
|
+
|
|
56
|
+
| Option | Required | Default | Notes |
|
|
57
|
+
| --- | --- | --- | --- |
|
|
58
|
+
| `subscriptionKey` / `token` | one of | — | Subscription key sent as `Ocp-Apim-Subscription-Key`, or short-lived bearer token sent as `Authorization`. |
|
|
59
|
+
| `voice` | yes | — | Azure voice name, e.g. `en-US-JennyNeural`, `fr-FR-DeniseNeural`. |
|
|
60
|
+
| `region` | yes\* | — | Azure region (`eastus`, `westus`, `francecentral`, …). \* Or pass `baseUrl` directly. |
|
|
61
|
+
| `baseUrl` | no | `https://{region}.tts.speech.microsoft.com` | Override for sovereign clouds or Azure private endpoints. |
|
|
62
|
+
| `endpointPath` | no | `/cognitiveservices/v1` | Override if you front the service with a gateway. |
|
|
63
|
+
| `outputFormat` | no | `raw-24khz-16bit-mono-pcm` | Must be a `raw-*` format (mp3/wav variants are rejected because they aren't streamable frame-by-frame). |
|
|
64
|
+
| `language` | no | `en-US` | Used in the SSML `xml:lang` attribute. |
|
|
65
|
+
| `voiceStyle` | no | — | Azure neural style (`cheerful`, `empathetic`, `customerservice`, …). |
|
|
66
|
+
| `styleDegree` | no | — | Only applied when `voiceStyle` is set (0..2 typically). |
|
|
67
|
+
| `prosody` | no | — | `{ rate, pitch, volume }` — strings forwarded to the SSML `<prosody>` element. |
|
|
68
|
+
| `userAgent` | no | `@absolutejs/voice-azure` | Sent as `User-Agent`. |
|
|
69
|
+
| `fetch` | no | `globalThis.fetch` | Inject for tests; opportunistic HTTP/2 multiplexing is enabled for HTTPS targets. |
|
|
70
|
+
|
|
71
|
+
## Notes
|
|
72
|
+
|
|
73
|
+
- Only `raw-*` output formats are supported because the voice runtime needs framed PCM/μ-law/A-law to feed transports without buffering the whole response. If you need MP3/WAV for offline assets, call the Azure REST API directly.
|
|
74
|
+
- The adapter aborts the in-flight HTTP request on `session.close(reason)` and refuses further `send()` calls.
|
|
75
|
+
- Whitespace-only `send()` is a no-op (matches the ElevenLabs and Cartesia adapters).
|
|
76
|
+
- Bearer tokens expire after 10 minutes by default — refresh externally and pass the new value into a fresh adapter, or stick with `subscriptionKey` for long-running deployments.
|
|
77
|
+
|
|
78
|
+
## Roadmap
|
|
79
|
+
|
|
80
|
+
- **STT (streaming via WebSocket USP)** — next package update. Will land as `azureSTT({ region, subscriptionKey, language, ... })` in this same package without breaking existing `azureTTS` callers.
|
|
81
|
+
- **Custom voices / endpoint id** — once the TTS path has a paying customer who needs it.
|
|
82
|
+
- **Speaker recognition / pronunciation assessment** — out of scope for the voice-agent path; covered better by direct Azure SDK use.
|
package/dist/azure.d.ts
ADDED
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/azure.ts
|
|
3
|
+
// Output format requested from Azure when the caller does not set `outputFormat`.
var DEFAULT_OUTPUT_FORMAT = "raw-24khz-16bit-mono-pcm";
|
|
4
|
+
// Value used for the SSML `xml:lang` attribute when `language` is not set.
var DEFAULT_LANGUAGE = "en-US";
|
|
5
|
+
// `User-Agent` header value used when `userAgent` is not set.
var DEFAULT_USER_AGENT = "@absolutejs/voice-azure";
|
|
6
|
+
// Request path appended to the base URL when `endpointPath` is not set.
var DEFAULT_ENDPOINT_PATH = "/cognitiveservices/v1";
|
|
7
|
+
/**
 * Reports whether a target uses the HTTPS scheme.
 *
 * @param {string | URL} url - A URL string, or an object exposing a `protocol` property.
 * @returns {boolean} `true` when the string starts with `https://` or the protocol is `https:`.
 */
var isHttpsUrl = (url) => {
  if (typeof url === "string") {
    return url.startsWith("https://");
  }
  return url.protocol === "https:";
};
|
|
8
|
+
/**
 * Builds extra fetch init options for a target URL.
 *
 * @param {string | URL} url - The request target.
 * @returns {{ protocol?: string }} `{ protocol: "http2" }` for HTTPS targets, otherwise `{}`.
 *   NOTE(review): `protocol` is not a standard fetch option; presumably a runtime-specific
 *   HTTP/2 hint — confirm against the target runtime.
 */
var h2IfHttps = (url) => {
  if (!isHttpsUrl(url)) {
    return {};
  }
  return { protocol: "http2" };
};
|
|
9
|
+
/**
 * Creates a fresh listener registry for one TTS session.
 *
 * @returns {{ audio: Set<Function>, close: Set<Function>, error: Set<Function> }}
 *   An object with one empty `Set` per supported event name.
 */
var createListenerMap = () => {
  const registry = {};
  for (const eventName of ["audio", "close", "error"]) {
    registry[eventName] = new Set();
  }
  return registry;
};
|
|
14
|
+
/**
 * Invokes every listener registered for `event`, one after another.
 *
 * Each handler is awaited before the next one runs, so a rejecting handler
 * stops the loop and rejects the returned promise.
 *
 * @param {Record<string, Iterable<Function>>} listeners - Registry keyed by event name.
 * @param {string} event - Event name whose handlers should run.
 * @param {unknown} payload - Value passed to each handler.
 * @returns {Promise<void>}
 */
var emit = async (listeners, event, payload) => {
  const handlers = listeners[event];
  for (const handler of handlers) {
    await handler(payload);
  }
};
|
|
19
|
+
/**
 * Derives a human-readable message from an arbitrary thrown value.
 *
 * @param {unknown} error - A string, an `Error`, or anything else.
 * @returns {string} The string itself or the `Error` message when non-blank,
 *   otherwise the generic "Azure TTS request failed".
 */
var resolveErrorMessage = (error) => {
  let candidate = "";
  if (typeof error === "string") {
    candidate = error;
  } else if (error instanceof Error) {
    candidate = error.message;
  }
  return candidate.trim() ? candidate : "Azure TTS request failed";
};
|
|
26
|
+
/**
 * Escapes the five XML special characters so a value can be embedded in SSML
 * text or attribute content.
 *
 * `&` is replaced first so the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text with `&`, `<`, `>`, `"` and `'` replaced by XML entities.
 */
var escapeXml = (value) => value.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;").replaceAll('"', "&quot;").replaceAll("'", "&apos;");
|
|
27
|
+
/**
 * Escapes a value for use inside an SSML attribute.
 *
 * @param {string} value - Raw attribute value.
 * @returns {string} Whatever `escapeXml` returns for the same input.
 */
var escapeAttr = (value) => {
  return escapeXml(value);
};
|
|
28
|
+
/**
 * Extracts the sample rate in Hz from an Azure output-format name such as
 * `audio-16khz-32kbitrate-mono-mp3` or `raw-22050hz-16bit-mono-pcm`.
 *
 * @param {string} format - Azure output-format name.
 * @returns {number} The sample rate in Hz, or 24000 when no rate can be parsed.
 */
var parseSampleRate = (format) => {
  const FALLBACK_HZ = 24000;
  const found = format.match(/(?:audio|raw)-(\d+)(?:khz|hz)/i);
  if (found === null) {
    return FALLBACK_HZ;
  }
  const magnitude = Number(found[1]);
  if (!Number.isFinite(magnitude)) {
    return FALLBACK_HZ;
  }
  // The unit is judged on the whole name: plain "hz" only when "khz" appears nowhere.
  const lowered = format.toLowerCase();
  const isPlainHertz = lowered.includes("hz") && !lowered.includes("khz");
  return isPlainHertz ? magnitude : magnitude * 1000;
};
|
|
37
|
+
/**
 * Resolves the sample rate in Hz for an Azure output-format name, reading the
 * second dash-separated segment of `raw-*` names (e.g. `24khz`, `22050hz`).
 *
 * @param {string} format - Azure output-format name.
 * @returns {number} Sample rate in Hz. For `raw-*` names whose rate segment is
 *   missing, non-numeric or not positive, 24000 is returned. Names that are not
 *   `raw-*`, or whose second segment has no `khz`/`hz` suffix, are delegated to
 *   `parseSampleRate`.
 */
var resolveRawSampleRate = (format) => {
  const FALLBACK_HZ = 24000;
  const lower = format.toLowerCase();
  if (lower.startsWith("raw-")) {
    const segment = lower.split("-")[1] ?? "";
    // Number("") is 0, which is finite; require a positive value so an empty
    // or zero segment falls back instead of yielding a 0 Hz sample rate.
    const toRate = (digits, multiplier) => {
      const value = Number(digits);
      return Number.isFinite(value) && value > 0 ? value * multiplier : FALLBACK_HZ;
    };
    if (segment.endsWith("khz")) {
      return toRate(segment.slice(0, -3), 1000);
    }
    if (segment.endsWith("hz")) {
      return toRate(segment.slice(0, -2), 1);
    }
  }
  return parseSampleRate(format);
};
|
|
52
|
+
/**
 * Maps an Azure `raw-*` output-format name to the audio-format descriptor
 * attached to emitted audio events.
 *
 * @param {string} format - Azure output-format name.
 * @returns {{ channels: number, container: string, encoding: string, sampleRateHz: number }}
 *   Descriptor with `channels: 1`, `container: "raw"`, an encoding of `mulaw`,
 *   `alaw`, `pcm_s16le` or `pcm_s8`, and the resolved sample rate.
 * @throws {Error} When the name does not start with `raw-`, or its suffix and
 *   bit depth match none of the recognized encodings.
 */
var resolveAudioFormat = (format) => {
  const lower = format.toLowerCase();
  if (!lower.startsWith("raw-")) {
    const reason = `Unsupported Azure output format "${format}" for @absolutejs/voice TTS streaming. `;
    const hint = `Use a "raw-*" format such as raw-24khz-16bit-mono-pcm or raw-8khz-8bit-mono-mulaw.`;
    throw new Error(reason + hint);
  }
  // Resolved before the encoding check, matching the original evaluation order.
  const sampleRateHz = resolveRawSampleRate(format);
  const isPcm = lower.endsWith("-pcm");
  let encoding;
  if (lower.endsWith("-mulaw")) {
    encoding = "mulaw";
  } else if (lower.endsWith("-alaw")) {
    encoding = "alaw";
  } else if (isPcm && lower.includes("-16bit-")) {
    encoding = "pcm_s16le";
  } else if (isPcm && lower.includes("-8bit-")) {
    encoding = "pcm_s8";
  } else {
    throw new Error(`Unrecognized Azure raw format "${format}". Expected -pcm/-mulaw/-alaw suffix.`);
  }
  return {
    channels: 1,
    container: "raw",
    encoding,
    sampleRateHz
  };
};
|
|
92
|
+
/**
 * Determines the service origin for TTS requests.
 *
 * @param {{ baseUrl?: string, region?: string }} config - Adapter options.
 * @returns {string} `config.baseUrl` with a single trailing slash removed, or
 *   `https://{region}.tts.speech.microsoft.com` when only `region` is given.
 * @throws {Error} When neither `baseUrl` nor `region` is set.
 */
var resolveBaseUrl = (config) => {
  const explicit = config.baseUrl;
  if (explicit) {
    return explicit.replace(/\/$/, "");
  }
  const region = config.region;
  if (region) {
    return `https://${region}.tts.speech.microsoft.com`;
  }
  throw new Error('@absolutejs/voice-azure requires either baseUrl or region (e.g. "eastus").');
};
|
|
100
|
+
/**
 * Determines the request path appended to the base URL.
 *
 * @param {{ endpointPath?: string }} config - Adapter options.
 * @returns {string} `config.endpointPath` (or the default path when it is
 *   null/undefined), guaranteed to begin with `/`.
 */
var resolveEndpointPath = (config) => {
  const chosen = config.endpointPath ?? DEFAULT_ENDPOINT_PATH;
  if (chosen.startsWith("/")) {
    return chosen;
  }
  return `/${chosen}`;
};
|
|
104
|
+
/**
 * Assembles the HTTP headers for a synthesis request.
 *
 * A truthy `token` takes precedence over `subscriptionKey`; when neither is
 * truthy no authentication header is added.
 *
 * @param {{ token?: string, subscriptionKey?: string, userAgent?: string }} config - Adapter options.
 * @param {string} outputFormat - Value sent as `X-Microsoft-OutputFormat`.
 * @returns {Record<string, string>} Header name/value pairs.
 */
var buildHeaders = (config, outputFormat) => {
  const authHeader = {};
  if ("token" in config && config.token) {
    authHeader["Authorization"] = `Bearer ${config.token}`;
  } else if ("subscriptionKey" in config && config.subscriptionKey) {
    authHeader["Ocp-Apim-Subscription-Key"] = config.subscriptionKey;
  }
  return {
    "Content-Type": "application/ssml+xml",
    "User-Agent": config.userAgent ?? DEFAULT_USER_AGENT,
    "X-Microsoft-OutputFormat": outputFormat,
    ...authHeader
  };
};
|
|
117
|
+
/**
 * Renders the attribute list for an SSML `<prosody>` element.
 *
 * @param {{ rate?: string, pitch?: string, volume?: string } | undefined} prosody - Prosody options.
 * @returns {string} A leading space followed by `rate`, `pitch` and `volume`
 *   attributes (in that order, only those that are truthy), or `""` when
 *   nothing is set.
 */
var buildProsodyAttributes = (prosody) => {
  if (!prosody) {
    return "";
  }
  const rendered = [];
  for (const attribute of ["rate", "pitch", "volume"]) {
    const setting = prosody[attribute];
    if (setting) {
      rendered.push(`${attribute}="${escapeAttr(setting)}"`);
    }
  }
  if (rendered.length === 0) {
    return "";
  }
  return ` ${rendered.join(" ")}`;
};
|
|
129
|
+
/**
 * Builds the SSML document posted to Azure for one utterance.
 *
 * Nesting is `<voice>` → `<mstts:express-as>` (when `voiceStyle` is set) →
 * `<prosody>` (when any prosody attribute is set) → escaped text. Azure's SSML
 * reference lists `prosody` as a permitted child of `mstts:express-as`, but not
 * the reverse, so the style element is always the outer one.
 *
 * @param {object} config - Adapter options; reads `language`, `voice`,
 *   `voiceStyle`, `styleDegree` and `prosody`.
 * @param {string} text - Text to synthesize; XML-escaped before embedding.
 * @returns {string} The SSML payload.
 */
var buildSsmlPayload = (config, text) => {
  const language = config.language ?? DEFAULT_LANGUAGE;
  const escapedText = escapeXml(text);
  const prosodyAttributes = buildProsodyAttributes(config.prosody);
  const spokenInner = prosodyAttributes ? `<prosody${prosodyAttributes}>${escapedText}</prosody>` : escapedText;
  // styledegree only applies together with a style; skip NaN/Infinity, which
  // would otherwise be serialized verbatim into the attribute.
  const styleDegreeAttribute = config.voiceStyle && typeof config.styleDegree === "number" && Number.isFinite(config.styleDegree) ? ` styledegree="${String(config.styleDegree)}"` : "";
  const styledInner = config.voiceStyle ? `<mstts:express-as style="${escapeAttr(config.voiceStyle)}"${styleDegreeAttribute}>${spokenInner}</mstts:express-as>` : spokenInner;
  // The mstts namespace is declared only when an mstts element is emitted.
  const mstssNs = config.voiceStyle ? ` xmlns:mstts="http://www.w3.org/2001/mstts"` : "";
  return `<speak version="1.0" xml:lang="${escapeAttr(language)}"${mstssNs}>` + `<voice name="${escapeAttr(config.voice)}">${styledInner}</voice></speak>`;
};
|
|
139
|
+
/**
 * Builds the full request URL from the resolved base URL and endpoint path.
 *
 * @param {object} config - Adapter options.
 * @returns {URL} The synthesis endpoint.
 */
var buildTtsUrl = (config) => {
  const origin = resolveBaseUrl(config);
  const path = resolveEndpointPath(config);
  return new URL(`${origin}${path}`);
};
|
|
140
|
+
/**
 * Creates an Azure TTS adapter.
 *
 * Configuration is validated eagerly: authentication, voice name, base
 * URL/region and output format are all checked before the adapter is returned.
 *
 * @param {object} config - Adapter options (`subscriptionKey` or `token`,
 *   `voice`, `region` or `baseUrl`, plus optional `outputFormat`, `fetch`, etc.).
 * @returns {{ kind: "tts", open: () => object }} Adapter whose `open()` returns
 *   a session exposing `send(text)`, `on(event, handler)` and `close(reason)`.
 * @throws {Error} When authentication, voice, base URL/region or output format
 *   is missing or unsupported.
 */
var azureTTS = (config) => {
  if (!(("subscriptionKey" in config) && config.subscriptionKey) && !(("token" in config) && config.token)) {
    throw new Error("@absolutejs/voice-azure requires either subscriptionKey or token for authentication.");
  }
  if (!config.voice) {
    throw new Error('@absolutejs/voice-azure requires a voice name (e.g. "en-US-JennyNeural").');
  }
  // Called only for its validation: throws now if neither baseUrl nor region is set.
  resolveBaseUrl(config);
  const fetchImpl = config.fetch ?? globalThis.fetch;
  const outputFormat = config.outputFormat ?? DEFAULT_OUTPUT_FORMAT;
  const audioFormat = resolveAudioFormat(outputFormat);
  return {
    kind: "tts",
    open: () => {
      const listeners = createListenerMap();
      // One AbortController per in-flight send(), so close() can cancel them all.
      const activeControllers = new Set;
      let closed = false;
      return {
        // Aborts every in-flight request, then notifies "close" listeners once.
        close: async (reason) => {
          if (closed)
            return;
          closed = true;
          for (const controller of activeControllers) {
            controller.abort(reason);
          }
          await emit(listeners, "close", {
            reason,
            recoverable: false,
            type: "close"
          });
        },
        // Registers a handler and returns a function that unregisters it.
        on: (event, handler) => {
          listeners[event].add(handler);
          return () => {
            listeners[event].delete(handler);
          };
        },
        // Synthesizes `text` and emits each response chunk as an "audio" event.
        // A no-op after close() and for whitespace-only text. Failures are
        // reported through "error" listeners rather than thrown.
        send: async (text) => {
          if (closed)
            return;
          const trimmed = text.trim();
          if (!trimmed)
            return;
          const controller = new AbortController;
          activeControllers.add(controller);
          try {
            const target = buildTtsUrl(config);
            const response = await fetchImpl(target, {
              ...h2IfHttps(target),
              body: buildSsmlPayload(config, trimmed),
              headers: buildHeaders(config, outputFormat),
              method: "POST",
              signal: controller.signal
            });
            if (!response.ok || !response.body) {
              const bodyText = await response.text().catch(() => "");
              throw new Error(`Azure TTS returned ${String(response.status)} ${response.statusText}${bodyText ? `: ${bodyText.slice(0, 200)}` : ""}`);
            }
            const reader = response.body.getReader();
            try {
              while (true) {
                const { done, value } = await reader.read();
                if (done || !value)
                  break;
                await emit(listeners, "audio", {
                  chunk: value,
                  format: audioFormat,
                  receivedAt: Date.now(),
                  type: "audio"
                });
              }
            } finally {
              reader.releaseLock();
            }
          } catch (error) {
            // close(reason) aborts with a caller-supplied reason, and fetch then
            // rejects with that reason itself (a string, null, a custom error)
            // rather than an AbortError. Check the signal so an intentional
            // close never surfaces as an "error" event; `?.` guards against
            // null/undefined rejections.
            if (controller.signal.aborted || error?.name === "AbortError")
              return;
            await emit(listeners, "error", {
              error: error instanceof Error ? error : new Error(resolveErrorMessage(error)),
              recoverable: false,
              type: "error"
            });
          } finally {
            activeControllers.delete(controller);
          }
        }
      };
    }
  };
};
|
|
230
|
+
export {
|
|
231
|
+
azureTTS
|
|
232
|
+
};
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * Output-format names accepted by the `outputFormat` option. The listed literals
 * are offered for autocompletion; `(string & {})` keeps any other string assignable.
 */
export type AzureTTSOutputFormat = 'audio-16khz-32kbitrate-mono-mp3' | 'audio-16khz-64kbitrate-mono-mp3' | 'audio-16khz-128kbitrate-mono-mp3' | 'audio-24khz-48kbitrate-mono-mp3' | 'audio-24khz-96kbitrate-mono-mp3' | 'audio-24khz-160kbitrate-mono-mp3' | 'audio-48khz-96kbitrate-mono-mp3' | 'audio-48khz-192kbitrate-mono-mp3' | 'raw-8khz-8bit-mono-alaw' | 'raw-8khz-8bit-mono-mulaw' | 'raw-8khz-16bit-mono-pcm' | 'raw-16khz-16bit-mono-pcm' | 'raw-22050hz-16bit-mono-pcm' | 'raw-24khz-16bit-mono-pcm' | 'raw-44100hz-16bit-mono-pcm' | 'raw-48khz-16bit-mono-pcm' | (string & {});
|
|
2
|
+
/**
 * Authentication options: exactly one of `subscriptionKey` or `token`.
 * The `never` members make supplying both a type error.
 */
export type AzureTTSAuth =
    | { subscriptionKey: string; token?: never }
    | { subscriptionKey?: never; token: string };
|
|
9
|
+
/**
 * Optional string values for the `pitch`, `rate` and `volume` attributes of
 * the SSML `<prosody>` element.
 */
export type AzureTTSProsody = Partial<Record<'pitch' | 'rate' | 'volume', string>>;
|
|
14
|
+
/** Options accepted by `azureTTS`: authentication plus the fields below. */
export type AzureTTSOptions = AzureTTSAuth & {
    /** Azure voice name, e.g. `en-US-JennyNeural`. Required. */
    voice: string;
    /** Azure region used to derive the service URL when `baseUrl` is not set. */
    region?: string;
    /** Full service origin; takes precedence over `region`. */
    baseUrl?: string;
    /** Request path appended to the base URL. */
    endpointPath?: string;
    /** Output format name; only `raw-*` formats are accepted at runtime. */
    outputFormat?: AzureTTSOutputFormat;
    /** Value for the SSML `xml:lang` attribute. */
    language?: string;
    /** Speaking style rendered as `<mstts:express-as style="...">`. */
    voiceStyle?: string;
    /** Style intensity; only applied when `voiceStyle` is set. */
    styleDegree?: number;
    /** Attributes for the SSML `<prosody>` element. */
    prosody?: AzureTTSProsody;
    /** Value sent as the `User-Agent` header. */
    userAgent?: string;
    /** Fetch implementation to use instead of `globalThis.fetch`. */
    fetch?: typeof fetch;
};
|
package/package.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@absolutejs/voice-azure",
|
|
3
|
+
"version": "0.0.1-beta.1",
|
|
4
|
+
"description": "Azure Speech (Cognitive Services) adapter for @absolutejs/voice — Neural TTS shipped, streaming STT next",
|
|
5
|
+
"repository": {
|
|
6
|
+
"type": "git",
|
|
7
|
+
"url": "https://github.com/absolutejs/voice-adapters.git",
|
|
8
|
+
"directory": "azure"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist",
|
|
12
|
+
"README.md"
|
|
13
|
+
],
|
|
14
|
+
"main": "./dist/index.js",
|
|
15
|
+
"types": "./dist/index.d.ts",
|
|
16
|
+
"exports": {
|
|
17
|
+
".": {
|
|
18
|
+
"import": "./dist/index.js",
|
|
19
|
+
"types": "./dist/index.d.ts"
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
"license": "CC-BY-NC-4.0",
|
|
23
|
+
"author": "Alex Kahn",
|
|
24
|
+
"scripts": {
|
|
25
|
+
"build": "rm -rf dist && bun build ./src/index.ts --outdir dist --target bun --external @absolutejs/voice && tsc --emitDeclarationOnly --project tsconfig.json",
|
|
26
|
+
"format": "prettier --write \"./**/*.{js,ts,json,md}\"",
|
|
27
|
+
"release": "bun run format && bun run build && bun publish --access public",
|
|
28
|
+
"test": "bun test",
|
|
29
|
+
"typecheck": "bun run tsc --noEmit"
|
|
30
|
+
},
|
|
31
|
+
"dependencies": {
|
|
32
|
+
"@absolutejs/voice": "0.0.22-beta.471"
|
|
33
|
+
},
|
|
34
|
+
"devDependencies": {
|
|
35
|
+
"@types/bun": "1.3.9",
|
|
36
|
+
"typescript": "^5.9.3"
|
|
37
|
+
}
|
|
38
|
+
}
|