@framers/agentos 0.1.110 → 0.1.111

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/dist/api/agency.d.ts.map +1 -1
  2. package/dist/api/agency.js +38 -2
  3. package/dist/api/agency.js.map +1 -1
  4. package/dist/api/agent.js +1 -1
  5. package/dist/api/agent.js.map +1 -1
  6. package/dist/api/strategies/debate.d.ts.map +1 -1
  7. package/dist/api/strategies/debate.js.map +1 -1
  8. package/dist/api/strategies/graph.d.ts.map +1 -1
  9. package/dist/api/strategies/graph.js +1 -2
  10. package/dist/api/strategies/graph.js.map +1 -1
  11. package/dist/api/strategies/hierarchical.d.ts.map +1 -1
  12. package/dist/api/strategies/hierarchical.js +1 -2
  13. package/dist/api/strategies/hierarchical.js.map +1 -1
  14. package/dist/api/strategies/index.d.ts +1 -9
  15. package/dist/api/strategies/index.d.ts.map +1 -1
  16. package/dist/api/strategies/index.js +1 -11
  17. package/dist/api/strategies/index.js.map +1 -1
  18. package/dist/api/strategies/parallel.d.ts.map +1 -1
  19. package/dist/api/strategies/parallel.js +23 -4
  20. package/dist/api/strategies/parallel.js.map +1 -1
  21. package/dist/api/strategies/review-loop.d.ts.map +1 -1
  22. package/dist/api/strategies/review-loop.js.map +1 -1
  23. package/dist/api/strategies/sequential.d.ts.map +1 -1
  24. package/dist/api/strategies/sequential.js +1 -2
  25. package/dist/api/strategies/sequential.js.map +1 -1
  26. package/dist/api/strategies/shared.d.ts +8 -0
  27. package/dist/api/strategies/shared.d.ts.map +1 -1
  28. package/dist/api/strategies/shared.js +10 -1
  29. package/dist/api/strategies/shared.js.map +1 -1
  30. package/dist/api/types.d.ts +6 -0
  31. package/dist/api/types.d.ts.map +1 -1
  32. package/dist/api/types.js.map +1 -1
  33. package/dist/memory/AgentMemory.d.ts +2 -1
  34. package/dist/memory/AgentMemory.d.ts.map +1 -1
  35. package/dist/memory/AgentMemory.js +1 -1
  36. package/dist/memory/AgentMemory.js.map +1 -1
  37. package/dist/memory/CognitiveMemoryManager.d.ts.map +1 -1
  38. package/dist/memory/CognitiveMemoryManager.js +7 -2
  39. package/dist/memory/CognitiveMemoryManager.js.map +1 -1
  40. package/dist/memory/facade/Memory.d.ts.map +1 -1
  41. package/dist/memory/facade/Memory.js +6 -9
  42. package/dist/memory/facade/Memory.js.map +1 -1
  43. package/dist/memory/store/MemoryStore.d.ts +9 -0
  44. package/dist/memory/store/MemoryStore.d.ts.map +1 -1
  45. package/dist/memory/store/MemoryStore.js +66 -6
  46. package/dist/memory/store/MemoryStore.js.map +1 -1
  47. package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -1
  48. package/dist/memory/store/SqliteMemoryGraph.js +27 -13
  49. package/dist/memory/store/SqliteMemoryGraph.js.map +1 -1
  50. package/dist/speech/FallbackProxy.d.ts +194 -41
  51. package/dist/speech/FallbackProxy.d.ts.map +1 -1
  52. package/dist/speech/FallbackProxy.js +155 -32
  53. package/dist/speech/FallbackProxy.js.map +1 -1
  54. package/dist/speech/SpeechProviderResolver.d.ts +278 -36
  55. package/dist/speech/SpeechProviderResolver.d.ts.map +1 -1
  56. package/dist/speech/SpeechProviderResolver.js +306 -40
  57. package/dist/speech/SpeechProviderResolver.js.map +1 -1
  58. package/dist/speech/providers/AssemblyAISTTProvider.d.ts +119 -19
  59. package/dist/speech/providers/AssemblyAISTTProvider.d.ts.map +1 -1
  60. package/dist/speech/providers/AssemblyAISTTProvider.js +153 -25
  61. package/dist/speech/providers/AssemblyAISTTProvider.js.map +1 -1
  62. package/dist/speech/providers/AzureSpeechSTTProvider.d.ts +121 -17
  63. package/dist/speech/providers/AzureSpeechSTTProvider.d.ts.map +1 -1
  64. package/dist/speech/providers/AzureSpeechSTTProvider.js +122 -14
  65. package/dist/speech/providers/AzureSpeechSTTProvider.js.map +1 -1
  66. package/dist/speech/providers/AzureSpeechTTSProvider.d.ts +130 -15
  67. package/dist/speech/providers/AzureSpeechTTSProvider.d.ts.map +1 -1
  68. package/dist/speech/providers/AzureSpeechTTSProvider.js +163 -18
  69. package/dist/speech/providers/AzureSpeechTTSProvider.js.map +1 -1
  70. package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts +159 -0
  71. package/dist/speech/providers/BuiltInAdaptiveVadProvider.d.ts.map +1 -1
  72. package/dist/speech/providers/BuiltInAdaptiveVadProvider.js +119 -0
  73. package/dist/speech/providers/BuiltInAdaptiveVadProvider.js.map +1 -1
  74. package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts +102 -16
  75. package/dist/speech/providers/DeepgramBatchSTTProvider.d.ts.map +1 -1
  76. package/dist/speech/providers/DeepgramBatchSTTProvider.js +108 -13
  77. package/dist/speech/providers/DeepgramBatchSTTProvider.js.map +1 -1
  78. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts +149 -0
  79. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.d.ts.map +1 -1
  80. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js +137 -2
  81. package/dist/speech/providers/ElevenLabsTextToSpeechProvider.js.map +1 -1
  82. package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts +125 -0
  83. package/dist/speech/providers/OpenAITextToSpeechProvider.d.ts.map +1 -1
  84. package/dist/speech/providers/OpenAITextToSpeechProvider.js +128 -4
  85. package/dist/speech/providers/OpenAITextToSpeechProvider.js.map +1 -1
  86. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts +110 -0
  87. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.d.ts.map +1 -1
  88. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js +115 -0
  89. package/dist/speech/providers/OpenAIWhisperSpeechToTextProvider.js.map +1 -1
  90. package/package.json +1 -1
@@ -1,52 +1,156 @@
1
- /** Converts Azure 100-nanosecond ticks to seconds. */
1
+ /**
2
+ * Converts Azure's 100-nanosecond tick unit to seconds.
3
+ *
4
+ * Azure Cognitive Services uses "ticks" (100-nanosecond units) for all
5
+ * timing fields. One second = 10,000,000 ticks.
6
+ *
7
+ * @param ticks - Duration in 100-nanosecond Azure ticks.
8
+ * @returns Duration in seconds.
9
+ *
10
+ * @example
11
+ * ```ts
12
+ * ticksToSeconds(30_000_000); // 3.0 seconds
13
+ * ticksToSeconds(15_000_000); // 1.5 seconds
14
+ * ```
15
+ */
2
16
  function ticksToSeconds(ticks) {
3
17
  return ticks / 10000000;
4
18
  }
5
19
  /**
6
20
  * Speech-to-text provider that uses the Azure Cognitive Services Speech REST API.
7
21
  *
8
- * Sends WAV audio as a raw binary body and returns a normalised
9
- * {@link SpeechTranscriptionResult}. A `RecognitionStatus` of `'NoMatch'`
10
- * is mapped to an empty text result rather than an error, matching the
11
- * Azure SDK behaviour.
22
+ * ## Azure REST Endpoint Format
23
+ *
24
+ * The endpoint URL follows this pattern:
25
+ * ```
26
+ * https://{region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language={lang}
27
+ * ```
28
+ *
29
+ * - `{region}` — The Azure region from config (e.g. `eastus`, `westeurope`).
30
+ * - `{lang}` — BCP-47 language code from options or `'en-US'` default.
31
+ * - The `/conversation/` path segment selects the conversation recognition mode
32
+ * (as opposed to `/interactive/` or `/dictation/`).
33
+ *
34
+ * ## Authentication: `Ocp-Apim-Subscription-Key`
35
+ *
36
+ * Azure Cognitive Services uses the `Ocp-Apim-Subscription-Key` HTTP header
37
+ * for authentication, which differs from the typical `Authorization: Bearer`
38
+ * pattern. The subscription key is sent as a plain-text header value — no
39
+ * "Bearer" or "Token" prefix.
40
+ *
41
+ * An alternative is to use a short-lived token from the token endpoint, but
42
+ * this provider uses the simpler key-based approach for reliability.
43
+ *
44
+ * ## NoMatch Handling
45
+ *
46
+ * When Azure's recognizer detects audio but cannot identify any speech, it
47
+ * returns `RecognitionStatus: 'NoMatch'` instead of raising an HTTP error.
48
+ * This provider maps `NoMatch` to an empty-text result (`text: ''`) with
49
+ * `isFinal: true`, matching the Azure Speech SDK's behaviour. This prevents
50
+ * the fallback proxy from unnecessarily trying another provider when the
51
+ * audio genuinely contains no speech.
52
+ *
53
+ * ## Limitations
54
+ *
55
+ * - Audio must be PCM WAV format. The `Content-Type` is hardcoded to
56
+ * `audio/wav` regardless of the `audio.mimeType` value.
57
+ * - Streaming is not supported — use the Azure Speech SDK for real-time STT.
58
+ * - Speaker diarization is not available via the REST API.
59
+ *
60
+ * @see {@link AzureSpeechSTTProviderConfig} for configuration options
61
+ * @see {@link AzureSpeechTTSProvider} for the corresponding TTS provider
12
62
  *
13
63
  * @example
14
64
  * ```ts
15
- * const provider = new AzureSpeechSTTProvider({ key: process.env.AZURE_SPEECH_KEY!, region: 'eastus' });
16
- * const result = await provider.transcribe({ data: wavBuffer });
17
- * console.log(result.text);
65
+ * const provider = new AzureSpeechSTTProvider({
66
+ * key: process.env.AZURE_SPEECH_KEY!,
67
+ * region: 'eastus',
68
+ * });
69
+ * const result = await provider.transcribe(
70
+ * { data: wavBuffer, mimeType: 'audio/wav' },
71
+ * { language: 'de-DE' },
72
+ * );
73
+ * console.log(result.text); // '' if no speech detected
18
74
  * ```
19
75
  */
20
76
  export class AzureSpeechSTTProvider {
77
+ /**
78
+ * Creates a new AzureSpeechSTTProvider.
79
+ *
80
+ * @param config - Provider configuration including the subscription key and region.
81
+ *
82
+ * @example
83
+ * ```ts
84
+ * const provider = new AzureSpeechSTTProvider({
85
+ * key: 'your-azure-subscription-key',
86
+ * region: 'eastus',
87
+ * });
88
+ * ```
89
+ */
21
90
  constructor(config) {
22
91
  this.config = config;
92
+ /** Unique provider identifier used for registration and resolution. */
23
93
  this.id = 'azure-speech-stt';
94
+ /** Human-readable display name for UI and logging. */
24
95
  this.displayName = 'Azure Speech (STT)';
96
+ /** This provider uses synchronous HTTP requests, not WebSocket streaming. */
25
97
  this.supportsStreaming = false;
26
98
  this.fetchImpl = config.fetchImpl ?? fetch;
27
99
  }
28
- /** Returns the human-readable provider name. */
100
+ /**
101
+ * Returns the human-readable provider name.
102
+ *
103
+ * @returns The display name string `'Azure Speech (STT)'`.
104
+ *
105
+ * @example
106
+ * ```ts
107
+ * provider.getProviderName(); // 'Azure Speech (STT)'
108
+ * ```
109
+ */
29
110
  getProviderName() {
30
111
  return this.displayName;
31
112
  }
32
113
  /**
33
114
  * Transcribes an audio buffer using the Azure Speech recognition REST endpoint.
34
115
  *
35
- * @param audio - Raw audio data. Azure expects PCM WAV; pass `mimeType: 'audio/wav'`.
36
- * @param options - Optional transcription settings (language…).
37
- * @returns A promise resolving to the normalised transcription result.
38
- * @throws When the Azure API returns a non-2xx status.
116
+ * Sends the raw audio as PCM WAV and returns a normalized result. Azure's
117
+ * `NoMatch` status is treated as an empty transcript (not an error).
118
+ *
119
+ * @param audio - Raw audio data. Azure expects PCM WAV format; the
120
+ * Content-Type header is always set to `'audio/wav'` regardless of
121
+ * `audio.mimeType`.
122
+ * @param options - Optional transcription settings. Only `language` is
123
+ * supported by the Azure REST endpoint.
124
+ * @returns A promise resolving to the normalized transcription result.
125
+ * @throws {Error} When the Azure API returns a non-2xx HTTP status code.
126
+ * The error message includes the status and response body text.
127
+ *
128
+ * @example
129
+ * ```ts
130
+ * const result = await provider.transcribe(
131
+ * { data: wavBuffer, durationSeconds: 5 },
132
+ * { language: 'fr-FR' },
133
+ * );
134
+ * if (result.text === '') {
135
+ * console.log('No speech detected in the audio');
136
+ * }
137
+ * ```
39
138
  */
40
139
  async transcribe(audio, options = {}) {
41
140
  const lang = options.language ?? 'en-US';
42
141
  const { key, region } = this.config;
142
+ // Build the Azure STT REST endpoint URL.
143
+ // The /conversation/ path selects conversation recognition mode which is
144
+ // the most general-purpose mode for varied audio content.
43
145
  const url = `https://${region}.stt.speech.microsoft.com` +
44
146
  `/speech/recognition/conversation/cognitiveservices/v1` +
45
147
  `?language=${encodeURIComponent(lang)}`;
46
148
  const response = await this.fetchImpl(url, {
47
149
  method: 'POST',
48
150
  headers: {
151
+ // Azure uses this non-standard header for subscription key auth
49
152
  'Ocp-Apim-Subscription-Key': key,
153
+ // Hardcoded to audio/wav because Azure's REST endpoint requires WAV format
50
154
  'Content-Type': 'audio/wav',
51
155
  },
52
156
  body: audio.data,
@@ -56,7 +160,9 @@ export class AzureSpeechSTTProvider {
56
160
  throw new Error(`Azure Speech STT failed (${response.status}): ${message}`);
57
161
  }
58
162
  const payload = (await response.json());
59
- // NoMatch means the recognizer found no speech — return empty text gracefully.
163
+ // NoMatch means the recognizer detected audio but found no speech content.
164
+ // Return an empty result instead of throwing — this is the expected behaviour
165
+ // for silence or noise-only audio, matching the Azure Speech SDK pattern.
60
166
  if (payload.RecognitionStatus === 'NoMatch') {
61
167
  return {
62
168
  text: '',
@@ -70,6 +176,8 @@ export class AzureSpeechSTTProvider {
70
176
  },
71
177
  };
72
178
  }
179
+ // Convert Azure's 100-nanosecond ticks to seconds, falling back to the
180
+ // client-provided duration estimate if the API doesn't return Duration.
73
181
  const durationSeconds = typeof payload.Duration === 'number'
74
182
  ? ticksToSeconds(payload.Duration)
75
183
  : audio.durationSeconds;
@@ -1 +1 @@
1
- {"version":3,"file":"AzureSpeechSTTProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechSTTProvider.ts"],"names":[],"mappings":"AAiCA,sDAAsD;AACtD,SAAS,cAAc,CAAC,KAAa;IACnC,OAAO,KAAK,GAAG,QAAU,CAAC;AAC5B,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,OAAO,sBAAsB;IAOjC,YAA6B,MAAoC;QAApC,WAAM,GAAN,MAAM,CAA8B;QANjD,OAAE,GAAG,kBAAkB,CAAC;QACxB,gBAAW,GAAG,oBAAoB,CAAC;QACnC,sBAAiB,GAAG,KAAK,CAAC;QAKxC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,KAAK,CAAC;IAC7C,CAAC;IAED,gDAAgD;IAChD,eAAe;QACb,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,UAAU,CACd,KAAuB,EACvB,UAAsC,EAAE;QAExC,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC;QACzC,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC;QAEpC,MAAM,GAAG,GACP,WAAW,MAAM,2BAA2B;YAC5C,uDAAuD;YACvD,aAAa,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;QAE1C,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE;YACzC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,2BAA2B,EAAE,GAAG;gBAChC,cAAc,EAAE,WAAW;aAC5B;YACD,IAAI,EAAE,KAAK,CAAC,IAA2B;SACxC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,4BAA4B,QAAQ,CAAC,MAAM,MAAM,OAAO,EAAE,CAAC,CAAC;QAC9E,CAAC;QAED,MAAM,OAAO,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAwB,CAAC;QAE/D,+EAA+E;QAC/E,IAAI,OAAO,CAAC,iBAAiB,KAAK,SAAS,EAAE,CAAC;YAC5C,OAAO;gBACL,IAAI,EAAE,EAAE;gBACR,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE,CAAC;gBACP,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,OAAO;gBACzB,KAAK,EAAE;oBACL,eAAe,EAAE,CAAC,KAAK,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,EAAE;oBAClD,SAAS,EAAE,kBAAkB;iBAC9B;aACF,CAAC;QACJ,CAAC;QAED,MAAM,eAAe,GACnB,OAAO,OAAO,CAAC,QAAQ,KAAK,QAAQ;YAClC,CAAC,CAAC,cAAc,CAAC,OAAO,CAAC,QAAQ,CAAC;YAClC,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC;QAE5B,OAAO;YACL,IAAI,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;YAC/B,QAAQ,EAAE,IAAI;YACd,eAAe;YACf,IAAI,EAAE,CAAC;YACP,gBAAgB,EAAE,OAAO;YACzB,OAAO,EAAE,IAAI;YACb,KAAK,EAAE;gBACL,eAAe,EAAE,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,EAAE;gBAC5C,SAAS,EAAE,kBAAkB;aAC9B;SACF,CAAC;IACJ,CAAC;CACF"}
1
+ {"version":3,"file":"AzureSpeechSTTProvider.js","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechSTTProvider.ts"],"names":[],"mappings":"AA+EA;;;;;;;;;;;;;;GAcG;AACH,SAAS,cAAc,CAAC,KAAa;IACnC,OAAO,KAAK,GAAG,QAAU,CAAC;AAC5B,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwDG;AACH,MAAM,OAAO,sBAAsB;IAajC;;;;;;;;;;;;OAYG;IACH,YAA6B,MAAoC;QAApC,WAAM,GAAN,MAAM,CAA8B;QAzBjE,uEAAuE;QACvD,OAAE,GAAG,kBAAkB,CAAC;QAExC,sDAAsD;QACtC,gBAAW,GAAG,oBAAoB,CAAC;QAEnD,6EAA6E;QAC7D,sBAAiB,GAAG,KAAK,CAAC;QAmBxC,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,KAAK,CAAC;IAC7C,CAAC;IAED;;;;;;;;;OASG;IACH,eAAe;QACb,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;;;OAyBG;IACH,KAAK,CAAC,UAAU,CACd,KAAuB,EACvB,UAAsC,EAAE;QAExC,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC;QACzC,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC;QAEpC,yCAAyC;QACzC,yEAAyE;QACzE,0DAA0D;QAC1D,MAAM,GAAG,GACP,WAAW,MAAM,2BAA2B;YAC5C,uDAAuD;YACvD,aAAa,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;QAE1C,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE;YACzC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,gEAAgE;gBAChE,2BAA2B,EAAE,GAAG;gBAChC,2EAA2E;gBAC3E,cAAc,EAAE,WAAW;aAC5B;YACD,IAAI,EAAE,KAAK,CAAC,IAA2B;SACxC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,4BAA4B,QAAQ,CAAC,MAAM,MAAM,OAAO,EAAE,CAAC,CAAC;QAC9E,CAAC;QAED,MAAM,OAAO,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAwB,CAAC;QAE/D,2EAA2E;QAC3E,8EAA8E;QAC9E,0EAA0E;QAC1E,IAAI,OAAO,CAAC,iBAAiB,KAAK,SAAS,EAAE,CAAC;YAC5C,OAAO;gBACL,IAAI,EAAE,EAAE;gBACR,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE,CAAC;gBACP,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,OAAO;gBACzB,KAAK,EAAE;oBACL,eAAe,EAAE,CAAC,KAAK,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,EAAE;oBAClD,SAAS,EAAE,kBAAkB;iBAC9B;aACF,CAAC;QACJ,CAAC;QAED,uEAAuE;QACvE,wEAAwE;QACxE,MAAM,eAAe,GACnB,OAAO,OAAO,CAAC,QAAQ,KAAK,QAAQ;YAClC,CAAC,CAAC,cAAc,CAAC,OAAO,CAAC,QAAQ,CAAC;YAClC,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC;QAE5B,OAAO;YACL,IAAI,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;YAC/B,QA
AQ,EAAE,IAAI;YACd,eAAe;YACf,IAAI,EAAE,CAAC;YACP,gBAAgB,EAAE,OAAO;YACzB,OAAO,EAAE,IAAI;YACb,KAAK,EAAE;gBACL,eAAe,EAAE,CAAC,eAAe,IAAI,CAAC,CAAC,GAAG,EAAE;gBAC5C,SAAS,EAAE,kBAAkB;aAC9B;SACF,CAAC;IACJ,CAAC;CACF"}
@@ -1,59 +1,174 @@
1
1
  import type { SpeechSynthesisOptions, SpeechSynthesisResult, SpeechVoice, TextToSpeechProvider } from '../types.js';
2
- /** Configuration for the AzureSpeechTTSProvider. */
2
+ /**
3
+ * Configuration for the {@link AzureSpeechTTSProvider}.
4
+ *
5
+ * @see {@link AzureSpeechTTSProvider} for usage examples
6
+ * @see https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech
7
+ */
3
8
  export interface AzureSpeechTTSProviderConfig {
4
- /** Azure Cognitive Services subscription key. */
9
+ /**
10
+ * Azure Cognitive Services subscription key.
11
+ * Sent as the `Ocp-Apim-Subscription-Key` header value.
12
+ *
13
+ * @see {@link AzureSpeechSTTProviderConfig.key} for the same pattern on STT
14
+ */
5
15
  key: string;
6
- /** Azure region, e.g. `'eastus'` or `'westeurope'`. */
16
+ /**
17
+ * Azure region where the Speech resource is deployed, e.g. `'eastus'`,
18
+ * `'westeurope'`, `'southeastasia'`.
19
+ *
20
+ * The region determines the REST endpoint hostname:
21
+ * `https://{region}.tts.speech.microsoft.com`
22
+ */
7
23
  region: string;
8
24
  /**
9
25
  * Default voice name to use when none is specified per-request.
26
+ * Must be a valid Azure voice short-name (e.g. `'en-US-JennyNeural'`).
27
+ *
10
28
  * @default 'en-US-JennyNeural'
29
+ * @see https://learn.microsoft.com/azure/ai-services/speech-service/language-support#prebuilt-neural-voices
11
30
  */
12
31
  defaultVoice?: string;
13
32
  /**
14
- * Custom fetch implementation, useful for testing.
15
- * Defaults to the global `fetch`.
33
+ * Custom fetch implementation for dependency injection in tests.
34
+ * @default globalThis.fetch
16
35
  */
17
36
  fetchImpl?: typeof fetch;
18
37
  }
19
38
  /**
20
39
  * Text-to-speech provider that uses the Azure Cognitive Services Speech REST API.
21
40
  *
22
- * Generates audio via SSML synthesis and returns the raw MP3 buffer. Streaming
23
- * is supported in the sense that the provider can be used inside a streaming
24
- * pipeline the actual HTTP request is a single synchronous call.
41
+ * ## SSML Generation
42
+ *
43
+ * Azure's TTS REST endpoint requires SSML (Speech Synthesis Markup Language) as
44
+ * the request body — it does not accept plain text. This provider generates
45
+ * minimal SSML via {@link buildSsml} that wraps the input text in `<speak>`
46
+ * and `<voice>` elements. Special XML characters in the text are escaped via
47
+ * {@link escapeXml} to prevent malformed XML.
48
+ *
49
+ * ## `X-Microsoft-OutputFormat` Options
50
+ *
51
+ * The `X-Microsoft-OutputFormat` header controls the audio encoding. This
52
+ * provider uses `'audio-24khz-96kbitrate-mono-mp3'` which provides:
53
+ * - 24 kHz sample rate (high quality for speech)
54
+ * - 96 kbps bitrate (good balance of quality and file size)
55
+ * - Mono channel (sufficient for speech synthesis)
56
+ * - MP3 format (universally supported)
57
+ *
58
+ * Other available formats include:
59
+ * - `'audio-16khz-128kbitrate-mono-mp3'` — Lower sample rate, higher bitrate
60
+ * - `'audio-24khz-160kbitrate-mono-mp3'` — Higher bitrate for better quality
61
+ * - `'riff-24khz-16bit-mono-pcm'` — Uncompressed WAV
62
+ * - `'ogg-24khz-16bit-mono-opus'` — Opus codec in OGG container
63
+ *
64
+ * @see https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs
65
+ *
66
+ * ## Voice Listing
67
+ *
68
+ * The {@link listAvailableVoices} method fetches the full list of neural voices
69
+ * available in the configured Azure region via
70
+ * `GET /cognitiveservices/voices/list`. Results are mapped to the normalized
71
+ * {@link SpeechVoice} shape.
72
+ *
73
+ * @see {@link AzureSpeechTTSProviderConfig} for configuration options
74
+ * @see {@link AzureSpeechSTTProvider} for the corresponding STT provider
25
75
  *
26
76
  * @example
27
77
  * ```ts
28
- * const provider = new AzureSpeechTTSProvider({ key: process.env.AZURE_SPEECH_KEY!, region: 'eastus' });
78
+ * const provider = new AzureSpeechTTSProvider({
79
+ * key: process.env.AZURE_SPEECH_KEY!,
80
+ * region: 'eastus',
81
+ * defaultVoice: 'en-US-GuyNeural',
82
+ * });
29
83
  * const result = await provider.synthesize('Hello world');
30
84
  * // result.audioBuffer contains MP3 bytes
85
+ * // result.mimeType === 'audio/mpeg'
31
86
  * ```
32
87
  */
33
88
  export declare class AzureSpeechTTSProvider implements TextToSpeechProvider {
34
89
  private readonly config;
90
+ /** Unique provider identifier used for registration and resolution. */
35
91
  readonly id = "azure-speech-tts";
92
+ /** Human-readable display name for UI and logging. */
36
93
  readonly displayName = "Azure Speech (TTS)";
94
+ /**
95
+ * Marked as streaming-capable because the provider can be used within a
96
+ * streaming pipeline — though the actual HTTP request is a single
97
+ * synchronous call that returns the complete audio buffer.
98
+ */
37
99
  readonly supportsStreaming = true;
100
+ /** Fetch implementation — injected for testability, defaults to global fetch. */
38
101
  private readonly fetchImpl;
102
+ /** Resolved default voice name used when no voice is specified per-request. */
39
103
  private readonly defaultVoice;
104
+ /**
105
+ * Creates a new AzureSpeechTTSProvider.
106
+ *
107
+ * @param config - Provider configuration including the subscription key,
108
+ * region, and optional default voice.
109
+ *
110
+ * @example
111
+ * ```ts
112
+ * const provider = new AzureSpeechTTSProvider({
113
+ * key: 'your-azure-subscription-key',
114
+ * region: 'westeurope',
115
+ * defaultVoice: 'de-DE-ConradNeural',
116
+ * });
117
+ * ```
118
+ */
40
119
  constructor(config: AzureSpeechTTSProviderConfig);
41
- /** Returns the human-readable provider name. */
120
+ /**
121
+ * Returns the human-readable provider name.
122
+ *
123
+ * @returns The display name string `'Azure Speech (TTS)'`.
124
+ *
125
+ * @example
126
+ * ```ts
127
+ * provider.getProviderName(); // 'Azure Speech (TTS)'
128
+ * ```
129
+ */
42
130
  getProviderName(): string;
43
131
  /**
44
132
  * Synthesizes speech from plain text using the Azure TTS REST endpoint.
45
133
  *
46
- * @param text - The utterance to convert to audio.
47
- * @param options - Optional synthesis settings (voice override…).
134
+ * The text is wrapped in SSML, sent to Azure, and the response audio buffer
135
+ * (MP3 format) is returned along with metadata.
136
+ *
137
+ * @param text - The plain-text utterance to convert to audio. XML special
138
+ * characters are automatically escaped.
139
+ * @param options - Optional synthesis settings. Use `options.voice` to
140
+ * override the default voice with any valid Azure voice short-name.
48
141
  * @returns A promise resolving to the MP3 audio buffer and metadata.
49
- * @throws When the Azure API returns a non-2xx status.
142
+ * @throws {Error} When the Azure API returns a non-2xx status code.
143
+ * Common causes: invalid subscription key (401), region mismatch (404),
144
+ * invalid SSML (400), or quota exceeded (429).
145
+ *
146
+ * @example
147
+ * ```ts
148
+ * const result = await provider.synthesize('Guten Tag!', {
149
+ * voice: 'de-DE-ConradNeural',
150
+ * });
151
+ * fs.writeFileSync('output.mp3', result.audioBuffer);
152
+ * ```
50
153
  */
51
154
  synthesize(text: string, options?: SpeechSynthesisOptions): Promise<SpeechSynthesisResult>;
52
155
  /**
53
156
  * Retrieves the list of available neural voices from the Azure region.
54
157
  *
55
- * @returns A promise resolving to an array of normalised {@link SpeechVoice} entries.
56
- * @throws When the Azure API returns a non-2xx status.
158
+ * Fetches from `GET /cognitiveservices/voices/list` and maps each entry
159
+ * to the normalized {@link SpeechVoice} shape. The list includes all
160
+ * neural and standard voices available in the configured region.
161
+ *
162
+ * @returns A promise resolving to an array of normalized voice entries.
163
+ * @throws {Error} When the Azure API returns a non-2xx status code
164
+ * (e.g. invalid key, network error).
165
+ *
166
+ * @example
167
+ * ```ts
168
+ * const voices = await provider.listAvailableVoices();
169
+ * const englishVoices = voices.filter(v => v.lang.startsWith('en-'));
170
+ * console.log(`Found ${englishVoices.length} English voices`);
171
+ * ```
57
172
  */
58
173
  listAvailableVoices(): Promise<SpeechVoice[]>;
59
174
  }
@@ -1 +1 @@
1
- {"version":3,"file":"AzureSpeechTTSProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechTTSProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB,oDAAoD;AACpD,MAAM,WAAW,4BAA4B;IAC3C,iDAAiD;IACjD,GAAG,EAAE,MAAM,CAAC;IACZ,uDAAuD;IACvD,MAAM,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAyDD;;;;;;;;;;;;;GAaG;AACH,qBAAa,sBAAuB,YAAW,oBAAoB;IAQrD,OAAO,CAAC,QAAQ,CAAC,MAAM;IAPnC,SAAgB,EAAE,sBAAsB;IACxC,SAAgB,WAAW,wBAAwB;IACnD,SAAgB,iBAAiB,QAAQ;IAEzC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IACzC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAS;gBAET,MAAM,EAAE,4BAA4B;IAKjE,gDAAgD;IAChD,eAAe,IAAI,MAAM;IAIzB;;;;;;;OAOG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,sBAA2B,GACnC,OAAO,CAAC,qBAAqB,CAAC;IAsCjC;;;;;OAKG;IACG,mBAAmB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;CAgBpD"}
1
+ {"version":3,"file":"AzureSpeechTTSProvider.d.ts","sourceRoot":"","sources":["../../../src/speech/providers/AzureSpeechTTSProvider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,sBAAsB,EACtB,qBAAqB,EACrB,WAAW,EACX,oBAAoB,EACrB,MAAM,aAAa,CAAC;AAErB;;;;;GAKG;AACH,MAAM,WAAW,4BAA4B;IAC3C;;;;;OAKG;IACH,GAAG,EAAE,MAAM,CAAC;IAEZ;;;;;;OAMG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;;;;;OAMG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,KAAK,CAAC;CAC1B;AAmHD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiDG;AACH,qBAAa,sBAAuB,YAAW,oBAAoB;IAmCrD,OAAO,CAAC,QAAQ,CAAC,MAAM;IAlCnC,uEAAuE;IACvE,SAAgB,EAAE,sBAAsB;IAExC,sDAAsD;IACtD,SAAgB,WAAW,wBAAwB;IAEnD;;;;OAIG;IACH,SAAgB,iBAAiB,QAAQ;IAEzC,iFAAiF;IACjF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAe;IAEzC,+EAA+E;IAC/E,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAS;IAEtC;;;;;;;;;;;;;;OAcG;gBAC0B,MAAM,EAAE,4BAA4B;IAKjE;;;;;;;;;OASG;IACH,eAAe,IAAI,MAAM;IAIzB;;;;;;;;;;;;;;;;;;;;;;OAsBG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,sBAA2B,GACnC,OAAO,CAAC,qBAAqB,CAAC;IA4CjC;;;;;;;;;;;;;;;;;OAiBG;IACG,mBAAmB,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;CAgBpD"}
@@ -1,21 +1,56 @@
1
1
  /**
2
2
  * Escapes special XML characters in text before embedding it in SSML.
3
- * Azure's TTS endpoint expects well-formed XML; unescaped `<`, `>`, or `&`
4
- * characters in the input text would cause a 400 error.
3
+ *
4
+ * Azure's TTS endpoint expects well-formed XML in the request body. Unescaped
5
+ * `<`, `>`, `&`, `"`, or `'` characters in the input text would cause a 400
6
+ * Bad Request error because they break the XML structure.
7
+ *
8
+ * The five standard XML entity replacements are applied:
9
+ * - `&` -> `&amp;` (must be first to avoid double-escaping)
10
+ * - `<` -> `&lt;`
11
+ * - `>` -> `&gt;`
12
+ * - `"` -> `&quot;`
13
+ * - `'` -> `&apos;`
14
+ *
15
+ * @param text - Raw plain text to escape for safe XML embedding.
16
+ * @returns The XML-safe escaped string.
17
+ *
18
+ * @example
19
+ * ```ts
20
+ * escapeXml('Hello & <world>'); // 'Hello &amp; &lt;world&gt;'
21
+ * ```
5
22
  */
6
23
  function escapeXml(text) {
7
24
  return text
8
- .replace(/&/g, '&amp;')
25
+ .replace(/&/g, '&amp;') // Must be first to avoid double-escaping
9
26
  .replace(/</g, '&lt;')
10
27
  .replace(/>/g, '&gt;')
11
28
  .replace(/"/g, '&quot;')
12
29
  .replace(/'/g, '&apos;');
13
30
  }
14
31
  /**
15
- * Builds the SSML payload sent to the Azure TTS REST endpoint.
32
+ * Builds the SSML (Speech Synthesis Markup Language) payload for the Azure
33
+ * TTS REST endpoint.
34
+ *
35
+ * The generated SSML wraps the escaped text in a `<voice>` element with the
36
+ * specified voice name. The outer `<speak>` element declares SSML version 1.0
37
+ * and the W3C synthesis namespace.
16
38
  *
17
- * @param text - Plain-text utterance to synthesize.
39
+ * More advanced SSML features (prosody, emphasis, break) could be added here
40
+ * but are not currently needed for basic synthesis.
41
+ *
42
+ * @param text - Plain-text utterance to synthesize (will be XML-escaped).
18
43
  * @param voice - Azure voice short-name, e.g. `'en-US-JennyNeural'`.
44
+ * @returns Well-formed SSML string ready to send as the request body.
45
+ *
46
+ * @see {@link escapeXml} for the XML escaping logic
47
+ * @see https://learn.microsoft.com/azure/ai-services/speech-service/speech-synthesis-markup
48
+ *
49
+ * @example
50
+ * ```ts
51
+ * buildSsml('Hello world', 'en-US-JennyNeural');
52
+ * // '<speak version="1.0" xmlns="..."><voice name="en-US-JennyNeural">Hello world</voice></speak>'
53
+ * ```
19
54
  */
20
55
  function buildSsml(text, voice) {
21
56
  return (`<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">` +
@@ -23,7 +58,17 @@ function buildSsml(text, voice) {
23
58
  `</speak>`);
24
59
  }
25
60
  /**
26
- * Maps an Azure voice list entry to the normalised {@link SpeechVoice} shape.
61
+ * Maps an Azure voice list entry to the normalized {@link SpeechVoice} shape.
62
+ *
63
+ * The gender field is lowercased and validated against known values. Unknown
64
+ * gender strings (if Azure adds new values) are passed through as-is since
65
+ * the {@link SpeechVoice.gender} type accepts `string`.
66
+ *
67
+ * @param entry - A single voice entry from the Azure voices/list endpoint.
68
+ * @returns The normalized voice object.
69
+ *
70
+ * @see {@link AzureVoiceEntry} for the input shape
71
+ * @see {@link SpeechVoice} for the output shape
27
72
  */
28
73
  function mapVoice(entry) {
29
74
  const gender = entry.Gender?.toLowerCase();
@@ -40,48 +85,135 @@ function mapVoice(entry) {
40
85
  /**
41
86
  * Text-to-speech provider that uses the Azure Cognitive Services Speech REST API.
42
87
  *
43
- * Generates audio via SSML synthesis and returns the raw MP3 buffer. Streaming
44
- * is supported in the sense that the provider can be used inside a streaming
45
- * pipeline the actual HTTP request is a single synchronous call.
88
+ * ## SSML Generation
89
+ *
90
+ * Azure's TTS REST endpoint requires SSML (Speech Synthesis Markup Language) as
91
+ * the request body — it does not accept plain text. This provider generates
92
+ * minimal SSML via {@link buildSsml} that wraps the input text in `<speak>`
93
+ * and `<voice>` elements. Special XML characters in the text are escaped via
94
+ * {@link escapeXml} to prevent malformed XML.
95
+ *
96
+ * ## `X-Microsoft-OutputFormat` Options
97
+ *
98
+ * The `X-Microsoft-OutputFormat` header controls the audio encoding. This
99
+ * provider uses `'audio-24khz-96kbitrate-mono-mp3'` which provides:
100
+ * - 24 kHz sample rate (high quality for speech)
101
+ * - 96 kbps bitrate (good balance of quality and file size)
102
+ * - Mono channel (sufficient for speech synthesis)
103
+ * - MP3 format (universally supported)
104
+ *
105
+ * Other available formats include:
106
+ * - `'audio-16khz-128kbitrate-mono-mp3'` — Lower sample rate, higher bitrate
107
+ * - `'audio-24khz-160kbitrate-mono-mp3'` — Higher bitrate for better quality
108
+ * - `'riff-24khz-16bit-mono-pcm'` — Uncompressed WAV
109
+ * - `'ogg-24khz-16bit-mono-opus'` — Opus codec in OGG container
110
+ *
111
+ * @see https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs
112
+ *
113
+ * ## Voice Listing
114
+ *
115
+ * The {@link listAvailableVoices} method fetches the full list of neural voices
116
+ * available in the configured Azure region via
117
+ * `GET /cognitiveservices/voices/list`. Results are mapped to the normalized
118
+ * {@link SpeechVoice} shape.
119
+ *
120
+ * @see {@link AzureSpeechTTSProviderConfig} for configuration options
121
+ * @see {@link AzureSpeechSTTProvider} for the corresponding STT provider
46
122
  *
47
123
  * @example
48
124
  * ```ts
49
- * const provider = new AzureSpeechTTSProvider({ key: process.env.AZURE_SPEECH_KEY!, region: 'eastus' });
125
+ * const provider = new AzureSpeechTTSProvider({
126
+ * key: process.env.AZURE_SPEECH_KEY!,
127
+ * region: 'eastus',
128
+ * defaultVoice: 'en-US-GuyNeural',
129
+ * });
50
130
  * const result = await provider.synthesize('Hello world');
51
131
  * // result.audioBuffer contains MP3 bytes
132
+ * // result.mimeType === 'audio/mpeg'
52
133
  * ```
53
134
  */
54
135
  export class AzureSpeechTTSProvider {
136
+ /**
137
+ * Creates a new AzureSpeechTTSProvider.
138
+ *
139
+ * @param config - Provider configuration including the subscription key,
140
+ * region, and optional default voice.
141
+ *
142
+ * @example
143
+ * ```ts
144
+ * const provider = new AzureSpeechTTSProvider({
145
+ * key: 'your-azure-subscription-key',
146
+ * region: 'westeurope',
147
+ * defaultVoice: 'de-DE-ConradNeural',
148
+ * });
149
+ * ```
150
+ */
55
151
  constructor(config) {
56
152
  this.config = config;
153
+ /** Unique provider identifier used for registration and resolution. */
57
154
  this.id = 'azure-speech-tts';
155
+ /** Human-readable display name for UI and logging. */
58
156
  this.displayName = 'Azure Speech (TTS)';
157
+ /**
158
+ * Marked as streaming-capable because the provider can be used within a
159
+ * streaming pipeline — though the actual HTTP request is a single
160
+ * request/response call that returns the complete audio buffer.
161
+ */
59
162
  this.supportsStreaming = true;
60
163
  this.fetchImpl = config.fetchImpl ?? fetch;
61
164
  this.defaultVoice = config.defaultVoice ?? 'en-US-JennyNeural';
62
165
  }
63
- /** Returns the human-readable provider name. */
166
+ /**
167
+ * Returns the human-readable provider name.
168
+ *
169
+ * @returns The display name string `'Azure Speech (TTS)'`.
170
+ *
171
+ * @example
172
+ * ```ts
173
+ * provider.getProviderName(); // 'Azure Speech (TTS)'
174
+ * ```
175
+ */
64
176
  getProviderName() {
65
177
  return this.displayName;
66
178
  }
67
179
  /**
68
180
  * Synthesizes speech from plain text using the Azure TTS REST endpoint.
69
181
  *
70
- * @param text - The utterance to convert to audio.
71
- * @param options - Optional synthesis settings (voice override…).
182
+ * The text is wrapped in SSML, sent to Azure, and the response audio buffer
183
+ * (MP3 format) is returned along with metadata.
184
+ *
185
+ * @param text - The plain-text utterance to convert to audio. XML special
186
+ * characters are automatically escaped.
187
+ * @param options - Optional synthesis settings. Use `options.voice` to
188
+ * override the default voice with any valid Azure voice short-name.
72
189
  * @returns A promise resolving to the MP3 audio buffer and metadata.
73
- * @throws When the Azure API returns a non-2xx status.
190
+ * @throws {Error} When the Azure API returns a non-2xx status code.
191
+ * Common causes: invalid subscription key (401), region mismatch (404),
192
+ * invalid SSML (400), or quota exceeded (429).
193
+ *
194
+ * @example
195
+ * ```ts
196
+ * const result = await provider.synthesize('Guten Tag!', {
197
+ * voice: 'de-DE-ConradNeural',
198
+ * });
199
+ * fs.writeFileSync('output.mp3', result.audioBuffer);
200
+ * ```
74
201
  */
75
202
  async synthesize(text, options = {}) {
76
203
  const voice = options.voice ?? this.defaultVoice;
77
204
  const { key, region } = this.config;
205
+ // Azure TTS endpoint — note it uses tts.speech.microsoft.com (not stt.)
78
206
  const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
79
207
  const ssml = buildSsml(text, voice);
80
208
  const response = await this.fetchImpl(url, {
81
209
  method: 'POST',
82
210
  headers: {
211
+ // Azure's standard subscription key authentication header
83
212
  'Ocp-Apim-Subscription-Key': key,
213
+ // SSML content type — Azure rejects plain text
84
214
  'Content-Type': 'application/ssml+xml',
215
+ // Output format header — determines the audio encoding, sample rate,
216
+ // and container format of the response body
85
217
  'X-Microsoft-OutputFormat': 'audio-24khz-96kbitrate-mono-mp3',
86
218
  },
87
219
  body: ssml,
@@ -90,12 +222,13 @@ export class AzureSpeechTTSProvider {
90
222
  const message = await response.text();
91
223
  throw new Error(`Azure Speech TTS failed (${response.status}): ${message}`);
92
224
  }
225
+ // Read the complete audio response into a Buffer
93
226
  const arrayBuffer = await response.arrayBuffer();
94
227
  const audioBuffer = Buffer.from(arrayBuffer);
95
228
  return {
96
229
  audioBuffer,
97
- mimeType: 'audio/mpeg',
98
- cost: 0,
230
+ mimeType: 'audio/mpeg', // Matches the X-Microsoft-OutputFormat MP3 selection
231
+ cost: 0, // Cost tracking is handled at a higher layer
99
232
  voiceUsed: voice,
100
233
  providerName: this.displayName,
101
234
  usage: {
@@ -107,8 +240,20 @@ export class AzureSpeechTTSProvider {
107
240
  /**
108
241
  * Retrieves the list of available neural voices from the Azure region.
109
242
  *
110
- * @returns A promise resolving to an array of normalised {@link SpeechVoice} entries.
111
- * @throws When the Azure API returns a non-2xx status.
243
+ * Fetches from `GET /cognitiveservices/voices/list` and maps each entry
244
+ * to the normalized {@link SpeechVoice} shape. The list includes every
245
+ * voice the endpoint reports for the configured region.
246
+ *
247
+ * @returns A promise resolving to an array of normalized voice entries.
248
+ * @throws {Error} When the Azure API returns a non-2xx status code
249
+ * (e.g. 401 for an invalid subscription key).
250
+ *
251
+ * @example
252
+ * ```ts
253
+ * const voices = await provider.listAvailableVoices();
254
+ * const englishVoices = voices.filter(v => v.lang.startsWith('en-'));
255
+ * console.log(`Found ${englishVoices.length} English voices`);
256
+ * ```
112
257
  */
113
258
  async listAvailableVoices() {
114
259
  const { key, region } = this.config;