@speech-sdk/core 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. package/README.md +227 -108
  2. package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
  3. package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
  4. package/dist/__tests__/e2e/_save-audio.js +0 -59
  5. package/dist/__tests__/e2e/_save-audio.js.map +1 -1
  6. package/dist/audio-decode.d.ts +7 -0
  7. package/dist/audio-decode.d.ts.map +1 -0
  8. package/dist/audio-decode.js +109 -0
  9. package/dist/audio-decode.js.map +1 -0
  10. package/dist/audio-duration.d.ts +0 -5
  11. package/dist/audio-duration.d.ts.map +1 -1
  12. package/dist/audio-duration.js +5 -21
  13. package/dist/audio-duration.js.map +1 -1
  14. package/dist/audio-output.d.ts +39 -0
  15. package/dist/audio-output.d.ts.map +1 -0
  16. package/dist/audio-output.js +111 -0
  17. package/dist/audio-output.js.map +1 -0
  18. package/dist/audio-utils.d.ts +2 -10
  19. package/dist/audio-utils.d.ts.map +1 -1
  20. package/dist/audio-utils.js +57 -15
  21. package/dist/audio-utils.js.map +1 -1
  22. package/dist/captions.d.ts +0 -108
  23. package/dist/captions.d.ts.map +1 -1
  24. package/dist/captions.js +8 -98
  25. package/dist/captions.js.map +1 -1
  26. package/dist/conversation/attribute-timestamps.d.ts +26 -0
  27. package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
  28. package/dist/conversation/attribute-timestamps.js +276 -0
  29. package/dist/conversation/attribute-timestamps.js.map +1 -0
  30. package/dist/conversation/dispatch.d.ts +5 -5
  31. package/dist/conversation/dispatch.d.ts.map +1 -1
  32. package/dist/conversation/dispatch.js +18 -8
  33. package/dist/conversation/dispatch.js.map +1 -1
  34. package/dist/conversation/errors.d.ts +3 -0
  35. package/dist/conversation/errors.d.ts.map +1 -1
  36. package/dist/conversation/errors.js +6 -0
  37. package/dist/conversation/errors.js.map +1 -1
  38. package/dist/conversation/pcm-concat.d.ts +0 -24
  39. package/dist/conversation/pcm-concat.d.ts.map +1 -1
  40. package/dist/conversation/pcm-concat.js +8 -183
  41. package/dist/conversation/pcm-concat.js.map +1 -1
  42. package/dist/conversation/proportional-fill.d.ts +10 -0
  43. package/dist/conversation/proportional-fill.d.ts.map +1 -0
  44. package/dist/conversation/proportional-fill.js +64 -0
  45. package/dist/conversation/proportional-fill.js.map +1 -0
  46. package/dist/conversation/silence-detection.d.ts +14 -0
  47. package/dist/conversation/silence-detection.d.ts.map +1 -0
  48. package/dist/conversation/silence-detection.js +52 -0
  49. package/dist/conversation/silence-detection.js.map +1 -0
  50. package/dist/conversation/stitch.d.ts +9 -6
  51. package/dist/conversation/stitch.d.ts.map +1 -1
  52. package/dist/conversation/stitch.js +72 -51
  53. package/dist/conversation/stitch.js.map +1 -1
  54. package/dist/conversation/types.d.ts +7 -37
  55. package/dist/conversation/types.d.ts.map +1 -1
  56. package/dist/conversation/validate.d.ts +1 -16
  57. package/dist/conversation/validate.d.ts.map +1 -1
  58. package/dist/conversation/validate.js +29 -29
  59. package/dist/conversation/validate.js.map +1 -1
  60. package/dist/default-stt-fallback.d.ts +3 -0
  61. package/dist/default-stt-fallback.d.ts.map +1 -0
  62. package/dist/default-stt-fallback.js +11 -0
  63. package/dist/default-stt-fallback.js.map +1 -0
  64. package/dist/derive-timestamps.d.ts +1 -5
  65. package/dist/derive-timestamps.d.ts.map +1 -1
  66. package/dist/derive-timestamps.js +1 -15
  67. package/dist/derive-timestamps.js.map +1 -1
  68. package/dist/encoders/mp3.d.ts +6 -0
  69. package/dist/encoders/mp3.d.ts.map +1 -0
  70. package/dist/encoders/mp3.js +54 -0
  71. package/dist/encoders/mp3.js.map +1 -0
  72. package/dist/errors.d.ts +20 -13
  73. package/dist/errors.d.ts.map +1 -1
  74. package/dist/errors.js +49 -15
  75. package/dist/errors.js.map +1 -1
  76. package/dist/generate-conversation.d.ts +5 -4
  77. package/dist/generate-conversation.d.ts.map +1 -1
  78. package/dist/generate-conversation.js +250 -93
  79. package/dist/generate-conversation.js.map +1 -1
  80. package/dist/generate-speech.d.ts +7 -28
  81. package/dist/generate-speech.d.ts.map +1 -1
  82. package/dist/generate-speech.js +185 -94
  83. package/dist/generate-speech.js.map +1 -1
  84. package/dist/index.d.ts +7 -11
  85. package/dist/index.d.ts.map +1 -1
  86. package/dist/index.js +6 -4
  87. package/dist/index.js.map +1 -1
  88. package/dist/logger.d.ts.map +1 -1
  89. package/dist/logger.js +2 -13
  90. package/dist/logger.js.map +1 -1
  91. package/dist/metadata.d.ts +0 -22
  92. package/dist/metadata.d.ts.map +1 -1
  93. package/dist/pronunciations/errors.d.ts +5 -0
  94. package/dist/pronunciations/errors.d.ts.map +1 -0
  95. package/dist/pronunciations/errors.js +8 -0
  96. package/dist/pronunciations/errors.js.map +1 -0
  97. package/dist/pronunciations/inverse-align.d.ts +4 -0
  98. package/dist/pronunciations/inverse-align.d.ts.map +1 -0
  99. package/dist/pronunciations/inverse-align.js +54 -0
  100. package/dist/pronunciations/inverse-align.js.map +1 -0
  101. package/dist/pronunciations/merge.d.ts +4 -0
  102. package/dist/pronunciations/merge.d.ts.map +1 -0
  103. package/dist/pronunciations/merge.js +13 -0
  104. package/dist/pronunciations/merge.js.map +1 -0
  105. package/dist/pronunciations/substitute.d.ts +6 -0
  106. package/dist/pronunciations/substitute.d.ts.map +1 -0
  107. package/dist/pronunciations/substitute.js +67 -0
  108. package/dist/pronunciations/substitute.js.map +1 -0
  109. package/dist/pronunciations/types.d.ts +18 -0
  110. package/dist/pronunciations/types.d.ts.map +1 -0
  111. package/dist/pronunciations/types.js +2 -0
  112. package/dist/pronunciations/types.js.map +1 -0
  113. package/dist/pronunciations/validate.d.ts +3 -0
  114. package/dist/pronunciations/validate.d.ts.map +1 -0
  115. package/dist/pronunciations/validate.js +26 -0
  116. package/dist/pronunciations/validate.js.map +1 -0
  117. package/dist/provider-utils.d.ts +4 -9
  118. package/dist/provider-utils.d.ts.map +1 -1
  119. package/dist/provider-utils.js +60 -51
  120. package/dist/provider-utils.js.map +1 -1
  121. package/dist/providers/cartesia/alignment.d.ts +0 -16
  122. package/dist/providers/cartesia/alignment.d.ts.map +1 -1
  123. package/dist/providers/cartesia/alignment.js +1 -6
  124. package/dist/providers/cartesia/alignment.js.map +1 -1
  125. package/dist/providers/cartesia/index.d.ts +29 -19
  126. package/dist/providers/cartesia/index.d.ts.map +1 -1
  127. package/dist/providers/cartesia/index.js +116 -80
  128. package/dist/providers/cartesia/index.js.map +1 -1
  129. package/dist/providers/deepgram/index.d.ts +23 -8
  130. package/dist/providers/deepgram/index.d.ts.map +1 -1
  131. package/dist/providers/deepgram/index.js +51 -18
  132. package/dist/providers/deepgram/index.js.map +1 -1
  133. package/dist/providers/elevenlabs/alignment.d.ts +7 -21
  134. package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
  135. package/dist/providers/elevenlabs/alignment.js +8 -9
  136. package/dist/providers/elevenlabs/alignment.js.map +1 -1
  137. package/dist/providers/elevenlabs/index.d.ts +14 -38
  138. package/dist/providers/elevenlabs/index.d.ts.map +1 -1
  139. package/dist/providers/elevenlabs/index.js +186 -169
  140. package/dist/providers/elevenlabs/index.js.map +1 -1
  141. package/dist/providers/fal/index.d.ts +11 -20
  142. package/dist/providers/fal/index.d.ts.map +1 -1
  143. package/dist/providers/fal/index.js +49 -37
  144. package/dist/providers/fal/index.js.map +1 -1
  145. package/dist/providers/fish-audio/index.d.ts +14 -8
  146. package/dist/providers/fish-audio/index.d.ts.map +1 -1
  147. package/dist/providers/fish-audio/index.js +47 -19
  148. package/dist/providers/fish-audio/index.js.map +1 -1
  149. package/dist/providers/gateway/index.d.ts +76 -0
  150. package/dist/providers/gateway/index.d.ts.map +1 -0
  151. package/dist/providers/gateway/index.js +251 -0
  152. package/dist/providers/gateway/index.js.map +1 -0
  153. package/dist/providers/google/index.d.ts +12 -20
  154. package/dist/providers/google/index.d.ts.map +1 -1
  155. package/dist/providers/google/index.js +180 -162
  156. package/dist/providers/google/index.js.map +1 -1
  157. package/dist/providers/hume/alignment.d.ts +30 -35
  158. package/dist/providers/hume/alignment.d.ts.map +1 -1
  159. package/dist/providers/hume/alignment.js +14 -8
  160. package/dist/providers/hume/alignment.js.map +1 -1
  161. package/dist/providers/hume/index.d.ts +16 -16
  162. package/dist/providers/hume/index.d.ts.map +1 -1
  163. package/dist/providers/hume/index.js +79 -65
  164. package/dist/providers/hume/index.js.map +1 -1
  165. package/dist/providers/inworld/alignment.d.ts +8 -22
  166. package/dist/providers/inworld/alignment.d.ts.map +1 -1
  167. package/dist/providers/inworld/alignment.js +9 -8
  168. package/dist/providers/inworld/alignment.js.map +1 -1
  169. package/dist/providers/inworld/index.d.ts +17 -20
  170. package/dist/providers/inworld/index.d.ts.map +1 -1
  171. package/dist/providers/inworld/index.js +79 -47
  172. package/dist/providers/inworld/index.js.map +1 -1
  173. package/dist/providers/mistral/index.d.ts +14 -8
  174. package/dist/providers/mistral/index.d.ts.map +1 -1
  175. package/dist/providers/mistral/index.js +63 -48
  176. package/dist/providers/mistral/index.js.map +1 -1
  177. package/dist/providers/murf/alignment.d.ts +10 -19
  178. package/dist/providers/murf/alignment.d.ts.map +1 -1
  179. package/dist/providers/murf/alignment.js +10 -5
  180. package/dist/providers/murf/alignment.js.map +1 -1
  181. package/dist/providers/murf/index.d.ts +15 -16
  182. package/dist/providers/murf/index.d.ts.map +1 -1
  183. package/dist/providers/murf/index.js +105 -58
  184. package/dist/providers/murf/index.js.map +1 -1
  185. package/dist/providers/openai/index.d.ts +43 -29
  186. package/dist/providers/openai/index.d.ts.map +1 -1
  187. package/dist/providers/openai/index.js +294 -106
  188. package/dist/providers/openai/index.js.map +1 -1
  189. package/dist/providers/resemble/alignment.d.ts +8 -29
  190. package/dist/providers/resemble/alignment.d.ts.map +1 -1
  191. package/dist/providers/resemble/alignment.js +9 -12
  192. package/dist/providers/resemble/alignment.js.map +1 -1
  193. package/dist/providers/resemble/index.d.ts +21 -11
  194. package/dist/providers/resemble/index.d.ts.map +1 -1
  195. package/dist/providers/resemble/index.js +89 -49
  196. package/dist/providers/resemble/index.js.map +1 -1
  197. package/dist/providers/smallest-ai/index.d.ts +47 -0
  198. package/dist/providers/smallest-ai/index.d.ts.map +1 -0
  199. package/dist/providers/smallest-ai/index.js +107 -0
  200. package/dist/providers/smallest-ai/index.js.map +1 -0
  201. package/dist/providers/xai/index.d.ts +25 -9
  202. package/dist/providers/xai/index.d.ts.map +1 -1
  203. package/dist/providers/xai/index.js +63 -40
  204. package/dist/providers/xai/index.js.map +1 -1
  205. package/dist/providers.d.ts +31 -0
  206. package/dist/providers.d.ts.map +1 -0
  207. package/dist/providers.js +16 -0
  208. package/dist/providers.js.map +1 -0
  209. package/dist/resolve-provider.d.ts.map +1 -1
  210. package/dist/resolve-provider.js +8 -51
  211. package/dist/resolve-provider.js.map +1 -1
  212. package/dist/retry-options.d.ts +6 -0
  213. package/dist/retry-options.d.ts.map +1 -0
  214. package/dist/retry-options.js +48 -0
  215. package/dist/retry-options.js.map +1 -0
  216. package/dist/speech-provider.d.ts +28 -53
  217. package/dist/speech-provider.d.ts.map +1 -1
  218. package/dist/speech-provider.js +5 -26
  219. package/dist/speech-provider.js.map +1 -1
  220. package/dist/speech-result.d.ts +8 -9
  221. package/dist/speech-result.d.ts.map +1 -1
  222. package/dist/speech-result.js.map +1 -1
  223. package/dist/speech-to-text-provider.d.ts +0 -12
  224. package/dist/speech-to-text-provider.d.ts.map +1 -1
  225. package/dist/stream-speech.d.ts +4 -2
  226. package/dist/stream-speech.d.ts.map +1 -1
  227. package/dist/stream-speech.js +36 -22
  228. package/dist/stream-speech.js.map +1 -1
  229. package/dist/timestamps.d.ts +3 -17
  230. package/dist/timestamps.d.ts.map +1 -1
  231. package/dist/turns.d.ts +9 -0
  232. package/dist/turns.d.ts.map +1 -0
  233. package/dist/turns.js +21 -0
  234. package/dist/turns.js.map +1 -0
  235. package/dist/types.d.ts +31 -0
  236. package/dist/types.d.ts.map +1 -1
  237. package/dist/volume-adjust.d.ts +0 -6
  238. package/dist/volume-adjust.d.ts.map +1 -1
  239. package/dist/volume-adjust.js +4 -16
  240. package/dist/volume-adjust.js.map +1 -1
  241. package/package.json +13 -66
  242. package/dist/stt-providers/openai/index.d.ts +0 -42
  243. package/dist/stt-providers/openai/index.d.ts.map +0 -1
  244. package/dist/stt-providers/openai/index.js +0 -184
  245. package/dist/stt-providers/openai/index.js.map +0 -1
package/README.md CHANGED
@@ -1,10 +1,24 @@
1
+ <div align="center">
2
+
3
+ <img src="https://github.com/user-attachments/assets/42d9b528-e507-4162-8120-338bb0c92650" alt="Speech SDK" width="140" />
4
+
1
5
  # Speech SDK
2
6
 
3
- [![npm version](https://img.shields.io/npm/v/@speech-sdk/core)](https://www.npmjs.com/package/@speech-sdk/core)
4
- [![npm downloads](https://img.shields.io/npm/dm/@speech-sdk/core)](https://www.npmjs.com/package/@speech-sdk/core)
5
- [![license](https://img.shields.io/npm/l/@speech-sdk/core)](https://github.com/Jellypod-Inc/speech-sdk/blob/main/LICENSE)
7
+ **Text-to-speech across 13 providers, one API.**
8
+
9
+ A lightweight, provider-agnostic TypeScript SDK. Zero lock-in. Runs in Node.js, Edge runtimes, and the browser.
10
+
11
+ [![npm version](https://img.shields.io/npm/v/@speech-sdk/core?style=flat-square)](https://www.npmjs.com/package/@speech-sdk/core)
12
+ [![npm downloads](https://img.shields.io/npm/dm/@speech-sdk/core?style=flat-square)](https://www.npmjs.com/package/@speech-sdk/core)
13
+ [![license](https://img.shields.io/npm/l/@speech-sdk/core?style=flat-square)](https://github.com/Jellypod-Inc/speech-sdk/blob/main/LICENSE)
14
+ [![Discord](https://img.shields.io/badge/Discord-Join-5865F2?style=flat-square&logo=discord&logoColor=white)](https://discord.gg/xcTQMU3nCV)
15
+ [![Stars](https://img.shields.io/github/stars/Jellypod-Inc/speech-sdk?style=flat-square&logo=github&label=stars)](https://github.com/Jellypod-Inc/speech-sdk/stargazers)
16
+
17
+ **[Quick start](#quick-start)** · **[Providers](#supported-providers)** · **[Streaming](#streaming)** · **[Multi-Speaker Conversations](#conversations)** · **[Timestamps](#timestamps)**
18
+
19
+ </div>
6
20
 
7
- A lightweight, provider-agnostic TypeScript SDK for text-to-speech. One API, 13 providers, zero lock-in. Runs in Node.js, Edge runtimes, and the browser.
21
+ <br />
8
22
 
9
23
  <img width="1200" height="630" alt="Speech SDK" src="https://github.com/user-attachments/assets/b90c0235-9405-4939-bffa-75fc82be5afb" />
10
24
 
@@ -12,19 +26,12 @@ Learn more at [speechsdk.dev](https://speechsdk.dev/).
12
26
 
13
27
  ## Features
14
28
 
15
- - **Universal** — `generateSpeech()` works across OpenAI, ElevenLabs, Deepgram, Cartesia, Hume, Google Gemini TTS, Fish Audio, Inworld, Murf, Resemble, fal, Mistral, and xAI.
29
+ - **Universal** — one `generateSpeech()` call across every supported provider.
16
30
  - **Streaming** — `streamSpeech()` returns a standard `ReadableStream<Uint8Array>`.
17
- - **Conversations** — `generateConversation()` produces multi-speaker audio, using native dialogue endpoints when available and stitching locally when not.
18
- - **Word-level timestamps** — `timestamps: "on"` returns alignment, using the provider's native data or falling back to STT.
31
+ - **Conversations** — `generateConversation()` produces multi-speaker audio, picking a gateway, native-dialogue, or local-stitch path automatically.
32
+ - **Word-level timestamps** — `timestamps: true` returns alignment, using the provider's native data or falling back to STT.
19
33
  - **Volume normalization** — RMS-level outputs to an absolute loudness target.
20
- - **Audio tags & voice cloning** — `[laugh]`, `[sigh]`, emotion cues; reference-audio cloning where supported.
21
-
22
- ## Contents
23
-
24
- - [Install](#install) · [Quick start](#quick-start) · [Supported providers](#supported-providers)
25
- - [Streaming](#streaming) · [Conversations](#conversations) · [Timestamps](#timestamps)
26
- - [Volume normalization](#volume-normalization) · [Audio tags](#audio-tags) · [Voice cloning](#voice-cloning)
27
- - [Custom configuration](#custom-configuration) · [API reference](#api-reference) · [Error handling](#error-handling) · [Development](#development)
34
+ - **Audio tags & voice cloning** — bracket cues like `[laugh]` and reference-audio cloning where supported.
28
35
 
29
36
  ## Install
30
37
 
@@ -51,25 +58,51 @@ result.audio.base64; // string (lazy)
51
58
  result.audio.mediaType; // "audio/mpeg"
52
59
  ```
53
60
 
54
- Pass a `provider/model` string, or just the provider name to use its default model. API keys are read from env vars automatically.
61
+ Pass a `provider/model` string, or just the provider name to use its default model. The string above is enough to get going — set one env var (`SPEECH_GATEWAY_API_KEY`) and you're done.
62
+
63
+ ## Gateway vs direct provider
64
+
65
+ The SDK has two ways to reach a provider, and the choice is made by **how you pass `model`**:
66
+
67
+ ```ts
68
+ // 1. String → routes through Speech Gateway (https://api.speechgateway.com)
69
+ // Needs SPEECH_GATEWAY_API_KEY (sign up at https://speechgateway.com).
70
+ await generateSpeech({ model: 'openai/gpt-4o-mini-tts', text: '...', voice: 'alloy' });
71
+
72
+ // 2. Factory → calls the provider directly (no proxy hop)
73
+ // Reads the provider's env var (e.g. OPENAI_API_KEY), or pass apiKey to the factory.
74
+ import { createOpenAI } from '@speech-sdk/core/providers';
75
+ await generateSpeech({ model: createOpenAI()('gpt-4o-mini-tts'), text: '...', voice: 'alloy' });
76
+ ```
77
+
78
+ | | Speech Gateway (string) | Direct provider (factory) |
79
+ |---|---|---|
80
+ | When to use | You want a single endpoint and easy provider swaps | You already have provider keys, want zero-hop latency, or need provider features the gateway hasn't surfaced |
81
+ | Setup | `SPEECH_GATEWAY_API_KEY` only | One env var per provider you use |
82
+ | Key resolution | `apiKey` option → `SPEECH_GATEWAY_API_KEY` | `createX({ apiKey })` → `<PROVIDER>_API_KEY` |
83
+ | Endpoint | `api.speechgateway.com` | Provider's own API |
84
+
85
+ The gateway also accepts `createSpeechGateway({ apiKey, baseURL })` if you want to construct it explicitly (e.g. for a custom proxy URL).
55
86
 
56
87
  ## Supported providers
57
88
 
58
- | Provider | Prefix | Default model | Env var |
59
- |---|---|---|---|
60
- | [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `gpt-4o-mini-tts` | `OPENAI_API_KEY` |
61
- | [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `eleven_multilingual_v2` | `ELEVENLABS_API_KEY` |
62
- | [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `aura-2` | `DEEPGRAM_API_KEY` |
63
- | [Cartesia](https://docs.cartesia.ai) | `cartesia` | `sonic-3` | `CARTESIA_API_KEY` |
64
- | [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `octave-2` | `HUME_API_KEY` |
65
- | [Inworld](https://docs.inworld.ai/tts) | `inworld` | `inworld-tts-1.5-max` | `INWORLD_API_KEY` |
66
- | [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `gemini-2.5-flash-preview-tts` | `GOOGLE_API_KEY` |
67
- | [Fish Audio](https://docs.fish.audio) | `fish-audio` | `s2-pro` | `FISH_AUDIO_API_KEY` |
68
- | [Murf](https://murf.ai/api/docs) | `murf` | `GEN2` | `MURF_API_KEY` |
69
- | [Resemble](https://docs.resemble.ai) | `resemble` | `default` | `RESEMBLE_API_KEY` |
70
- | [fal](https://fal.ai/models) | `fal-ai` | *(user-specified)* | `FAL_API_KEY` |
71
- | [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `voxtral-mini-tts-2603` | `MISTRAL_API_KEY` |
72
- | [xAI](https://docs.x.ai/docs/models) | `xai` | `grok-tts` | `XAI_API_KEY` |
89
+ | Provider | Prefix | Env var |
90
+ |---|---|---|
91
+ | [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `OPENAI_API_KEY` |
92
+ | [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `ELEVENLABS_API_KEY` |
93
+ | [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `DEEPGRAM_API_KEY` |
94
+ | [Cartesia](https://docs.cartesia.ai) | `cartesia` | `CARTESIA_API_KEY` |
95
+ | [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `HUME_API_KEY` |
96
+ | [Inworld](https://docs.inworld.ai/tts) | `inworld` | `INWORLD_API_KEY` |
97
+ | [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `GOOGLE_API_KEY` |
98
+ | [Fish Audio](https://docs.fish.audio) | `fish-audio` | `FISH_AUDIO_API_KEY` |
99
+ | [Murf](https://murf.ai/api/docs) | `murf` | `MURF_API_KEY` |
100
+ | [Resemble](https://docs.resemble.ai) | `resemble` | `RESEMBLE_API_KEY` |
101
+ | [fal](https://fal.ai/models) | `fal-ai` | `FAL_API_KEY` |
102
+ | [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `MISTRAL_API_KEY` |
103
+ | [xAI](https://docs.x.ai/docs/models) | `xai` | `XAI_API_KEY` |
104
+
105
+ The env var applies when you call the provider directly via its factory. Pass a string `model` like `"openai/tts-1"` to route through Speech Gateway instead, which reads `SPEECH_GATEWAY_API_KEY` — see [Gateway vs direct provider](#gateway-vs-direct-provider). Most providers ship a default model (`createOpenAI()()`); a few (e.g. fal) require an explicit model id. See the linked docs for each provider's full model list.
73
106
 
74
107
  Provider-specific parameters pass through via `providerOptions` using each API's native field names.
75
108
 
@@ -95,13 +128,16 @@ return new Response(audio, { headers: { 'Content-Type': mediaType } });
95
128
 
96
129
  ## Conversations
97
130
 
98
- `generateConversation()` produces a single multi-voice clip from an ordered array of turns, picking the best path automatically:
131
+ `generateConversation()` produces a single multi-voice clip from an ordered array of turns. The path is chosen by what the turns are:
132
+
133
+ - **Gateway** — every turn uses a gateway-routed string model (e.g. `"openai/tts-1"`). One request to Speech Gateway; the server handles rendering, stitching, and normalization. The SDK never stitches locally on this path — clone voices on gateway models throw `StitchUnsupportedError`.
134
+ - **Native dialogue** — every turn uses the same direct-provider model and that model exposes a multi-speaker endpoint. One API call, naturally mixed.
135
+ - **Stitch** — direct-provider conversations that don't qualify for native dialogue (multi-provider, or no dialogue endpoint). Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
99
136
 
100
- - **Native dialogue** — one provider with a multi-speaker endpoint (ElevenLabs v3, Gemini TTS, Hume Octave, Fish Audio S2-Pro, fal Dia). One API call, natural mix.
101
- - **Stitch fallback** — multi-provider or no dialogue endpoint. Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
137
+ Mixing gateway-routed turns with direct-provider turns in one call throws `MixedDispatchError`.
102
138
 
103
139
  ```ts
104
- import { generateConversation } from '@speech-sdk/core/conversation';
140
+ import { generateConversation } from '@speech-sdk/core';
105
141
 
106
142
  const result = await generateConversation({
107
143
  turns: [
@@ -112,16 +148,7 @@ const result = await generateConversation({
112
148
  });
113
149
  ```
114
150
 
115
- Options: `gapMs` (default 300), `normalizeVolume` (default `true`), `volumeDbfs` (default `-20`), `maxConcurrency` (default 6), `maxRetries` (default 2), `timestamps`, `timestampProvider`, `apiKey`, `providerOptions`, `abortSignal`, `headers`. Per-turn overrides: `model`, `providerOptions` (stitch path only — throws `ConversationInputError` on native).
116
-
117
- **Native dialogue caps:**
118
-
119
- | Provider | Models | Voice constraints |
120
- |---|---|---|
121
- | ElevenLabs | `eleven_v3` | 1–10 voices, ≤ 2,000 chars |
122
- | Google | `gemini-2.5-{flash,pro}-preview-tts`, `gemini-3.1-flash-tts-preview` | **Exactly 2 voices** |
123
- | Hume | `octave-1`, `octave-2` | 1–4 voices |
124
- | Fish Audio | `s2-pro` | 1–4 voices |
151
+ Options: `gapMs` (default 300), `volumeDbfs` (default `-20`), `maxConcurrency` (default 6), `maxRetries` (default 2), `timestamps`, `apiKey`, `providerOptions`, `abortSignal`, `headers`. Per-turn overrides: `model`, `providerOptions` (stitch path only — throws `ConversationInputError` on native). Native-dialogue models enforce their own voice-count and character limits; violations throw `DialogueConstraintError`.
125
152
 
126
153
  ## Timestamps
127
154
 
@@ -132,7 +159,7 @@ const result = await generateSpeech({
132
159
  model: 'elevenlabs/eleven_multilingual_v2',
133
160
  text: 'Hello from speech-sdk!',
134
161
  voice: 'JBFqnCBsd6RMkjVDRZzb',
135
- timestamps: 'on',
162
+ timestamps: true,
136
163
  });
137
164
 
138
165
  result.timestamps;
@@ -143,43 +170,57 @@ result.timestamps;
143
170
  // ]
144
171
  ```
145
172
 
146
- | Mode | Behavior |
173
+ | Value | Behavior |
147
174
  |---|---|
148
- | `"auto"` *(default)* | Return timestamps only if the provider supplies them natively. Free. |
149
- | `"on"` | Always return timestamps. Uses native alignment when available; otherwise transcribes the audio via STT (extra cost + latency). |
150
- | `"off"` | Never return timestamps. |
175
+ | `true` | Always return timestamps. Uses native alignment when available; otherwise transcribes the audio via STT (extra cost + latency). |
176
+ | `false` *(default)* | Never return timestamps. |
151
177
 
152
- On `"on"`, the fallback defaults to OpenAI Whisper (`openai/whisper-1`, needs `OPENAI_API_KEY`). Override by constructing a `ResolvedSTTModel` via a factory and passing it as `timestampProvider`:
178
+ With `timestamps: true`, models without native alignment require an STT fallback. The SDK automatically uses OpenAI Whisper when `OPENAI_API_KEY` is set in the environment — no extra configuration needed. Gateway-routed models (string model IDs like `"openai/tts-1"`) do not need a fallback — the gateway server provides it.
179
+
180
+ **Resolution order:** factory `fallbackSTT` → `OPENAI_API_KEY` env var (automatic Whisper fallback) → throws `TimestampKeyMissingError`.
181
+
182
+ Configure `fallbackSTT` on the factory to use a different key or STT model (set it once, applies to all calls):
153
183
 
154
184
  ```ts
155
- import { createOpenAISTT } from '@speech-sdk/core/stt/openai';
185
+ import { generateSpeech } from '@speech-sdk/core';
186
+ import { createOpenAI, createElevenLabs } from '@speech-sdk/core/providers';
156
187
 
157
- await generateSpeech({
158
- model: 'cartesia/sonic-3',
159
- text: 'Hello!',
160
- voice: 'voice-id',
161
- timestamps: 'on',
162
- timestampProvider: createOpenAISTT({ apiKey: process.env.MY_WHISPER_KEY })('whisper-1'),
188
+ const elevenlabs = createElevenLabs({
189
+ apiKey: process.env.ELEVENLABS_API_KEY,
190
+ fallbackSTT: createOpenAI({ apiKey: process.env.MY_OPENAI_KEY }).stt('whisper-1'),
191
+ });
192
+
193
+ const result = await generateSpeech({
194
+ model: elevenlabs('eleven_flash_v2'),
195
+ voice: 'JBFqnCBsd6RMkjVDRZzb',
196
+ text: 'Hello, world.',
197
+ timestamps: true,
163
198
  });
164
199
  ```
165
200
 
166
- **Per-provider support:**
201
+ Whether a given model returns native alignment or transcribes via the STT fallback is a provider detail — both paths produce the same `WordTimestamp[]` shape.
167
202
 
168
- | Provider | Timestamps |
169
- |---|---|
170
- | ElevenLabs (`eleven_v3`, `eleven_multilingual_v2`, `eleven_flash_v2`, `eleven_flash_v2_5`) | **Native** — returned in the TTS response, free on `"auto"` |
171
- | Murf (`GEN2`) | **Native** — `wordDurations` returned in the TTS response, free on `"auto"` (FALCON streaming model has no native alignment) |
172
- | Hume (`octave-2`) | **Native** — word alignment from the JSON `/v0/tts` endpoint, free on `"auto"` (`octave-1` has no native alignment) |
173
- | Inworld (`inworld-tts-1.5-max`, `inworld-tts-1.5-mini`) | **Native** — `timestampInfo.wordAlignment` returned in the TTS response, free on `"auto"` (best on English/Spanish) |
174
- | Cartesia (`sonic-3`, `sonic-2`) | **Native** — routed through `/tts/sse` with `add_timestamps: true`; merges interleaved chunk + timestamps events into audio + `WordTimestamp[]` |
175
- | Resemble (`default`) | **Native** — `audio_timestamps` always returned by `/synthesize`; SDK aggregates grapheme-level timing into words (mirrors ElevenLabs aggregator) |
176
- | All others (OpenAI, Deepgram, Google, Fish Audio, fal, Mistral, xAI) | No native alignment; `"on"` transcribes via the STT fallback, `"auto"` returns `undefined` |
203
+ `generateConversation` accepts the same options and returns `ConversationWordTimestamp[]` — every word carries a `turnIndex: number` pointing back into the input `turns[]`. This is what lets you build chat-bubble UIs, speaker-attributed transcripts, and "who's speaking now?" lookups during playback without re-deriving turn boundaries.
177
204
 
178
- `generateConversation` accepts the same options and returns a flat `WordTimestamp[]` across all turns — stitch-path timings are offset by cumulative turn duration + gap.
205
+ ```ts
206
+ import { generateConversation, timestampsToTurns } from '@speech-sdk/core';
207
+
208
+ const result = await generateConversation({
209
+ model: 'elevenlabs/eleven_v3',
210
+ turns: [
211
+ { voice: 'rachel', text: 'Hi there.' },
212
+ { voice: 'adam', text: 'Hello!' },
213
+ ],
214
+ timestamps: true,
215
+ });
216
+
217
+ // Collapse consecutive words from the same turn into per-turn timings:
218
+ const turnTimestamps = timestampsToTurns(result.timestamps ?? []);
219
+ ```
179
220
 
180
221
  ### Captions (SRT / WebVTT)
181
222
 
182
- Convert word-level timestamps into a caption file. SRT is the default; pass `format: 'vtt'` for WebVTT (required for HTML `<track>`).
223
+ `timestampsToCaptions()` converts word-level timestamps into a caption file. SRT is the default; pass `format: 'vtt'` for WebVTT.
183
224
 
184
225
  ```ts
185
226
  import { generateSpeech, timestampsToCaptions } from '@speech-sdk/core';
@@ -188,33 +229,14 @@ const { timestamps } = await generateSpeech({
188
229
  model: 'elevenlabs/eleven_v3',
189
230
  text: 'Hello world. This is a test.',
190
231
  voice: 'JBFqnCBsd6RMkjVDRZzb',
191
- timestamps: 'on',
232
+ timestamps: true,
192
233
  });
193
234
 
194
235
  const srt = timestampsToCaptions(timestamps ?? []);
195
- // 1
196
- // 00:00:00,000 --> 00:00:01,200
197
- // Hello world.
198
- //
199
- // 2
200
- // 00:00:01,300 --> 00:00:02,800
201
- // This is a test.
202
-
203
236
  const vtt = timestampsToCaptions(timestamps ?? [], { format: 'vtt' });
204
- // WEBVTT
205
- //
206
- // 1
207
- // 00:00:00.000 --> 00:00:01.200
208
- // Hello world.
209
- //
210
- // 2
211
- // 00:00:01.300 --> 00:00:02.800
212
- // This is a test.
213
237
  ```
214
238
 
215
- Output follows the SubRip and [W3C WebVTT](https://www.w3.org/TR/webvtt1/) conventions: comma-decimal (SRT) vs period-decimal (VTT) timestamps, sequential numeric cue IDs, blank-line cue separators with a trailing blank line, and HTML-escaped body text (`&`, `<`, `>`) on the VTT path.
216
-
217
- Cues break on sentence boundaries (`.`, `!`, `?`), then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
239
+ Cues break on sentence boundaries, then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
218
240
 
219
241
  ## Volume normalization
220
242
 
@@ -231,11 +253,37 @@ const result = await generateSpeech({
231
253
  result.audio.mediaType; // "audio/wav" — re-encoded after normalization
232
254
  ```
233
255
 
234
- `generateConversation` normalizes by default. Pass `normalizeVolume: false` to skip. Throws `VolumeAdjustmentUnsupportedError` if the provider has no decodable PCM/WAV mode.
256
+ `generateConversation` always normalizes; override the target with `volumeDbfs`. A warning is surfaced (and the raw mix passes through) if the provider has no decodable PCM/WAV mode.
257
+
258
+ ### Output format
259
+
260
+ By default, `generateSpeech` preserves the provider or gateway response format.
261
+ `generateConversation` returns WAV when the SDK stitches direct-provider audio.
262
+
263
+ Pass `output` to request a specific final format:
264
+
265
+ ```ts
266
+ const result = await generateSpeech({
267
+ model: createOpenAI()('tts-1'),
268
+ voice: 'alloy',
269
+ text: 'Hello',
270
+ output: { format: 'mp3', bitrate: 96 },
271
+ });
272
+
273
+ result.audio.mediaType; // "audio/mpeg"
274
+ ```
275
+
276
+ Supported explicit formats are `wav`, `mp3`, and `pcm`.
277
+
278
+ For direct providers, the SDK first asks each provider whether it can natively produce the requested format. If yes, the provider returns it directly and the SDK passes the bytes through unchanged. If the provider can return WAV/PCM but not the requested format (e.g. ElevenLabs has no native WAV output, Cartesia has no native MP3), the SDK requests a decodable format and converts via mediabunny. The SDK never decodes compressed audio (mp3/opus/aac) — providers must return wav/pcm for any local conversion to succeed.
279
+
280
+ For gateway models, the SDK forwards `output` to the gateway API unchanged.
281
+
282
+ MP3 encoding uses [`@mediabunny/mp3-encoder`](https://mediabunny.dev/guide/extensions/mp3-encoder), loaded dynamically only when MP3 output is requested and the host environment does not already provide native MP3 encoding.
235
283
 
236
284
  ## Audio tags
237
285
 
238
- Bracket syntax `[tag]` adds expressive cues. Unsupported tags are stripped with warnings in `result.warnings`.
286
+ Bracket syntax `[tag]` adds expressive cues. Each provider handles tags natively where supported, maps them to its closest equivalent, or strips them and surfaces a warning in `result.warnings`.
239
287
 
240
288
  ```ts
241
289
  await generateSpeech({
@@ -245,21 +293,47 @@ await generateSpeech({
245
293
  });
246
294
  ```
247
295
 
248
- | Provider | Behavior |
249
- |---|---|
250
- | OpenAI (`gpt-4o-mini-tts`) | Mapped to the `instructions` field |
251
- | ElevenLabs (`eleven_v3`) | Passed through natively |
252
- | Google (`gemini-3.1-flash-tts-preview`) | Passed through natively |
253
- | Cartesia (`sonic-3`) | Emotion tags → SSML; `[laughter]` passed through; unknown stripped |
254
- | All others | Stripped with warnings |
296
+ ## Pronunciations
297
+
298
+ Customize how specific words are pronounced. Rules are applied as text substitution before the request is sent to the provider; word timestamps are inverse-mapped on return so the substitution is invisible to the caller.
299
+
300
+ ```ts
301
+ import { generateSpeech } from '@speech-sdk/core';
302
+
303
+ await generateSpeech({
304
+ model: 'openai/tts-1', // gateway path; or use createOpenAI()(...)
305
+ voice: 'alloy',
306
+ text: 'What is LLM?',
307
+ pronunciations: {
308
+ rules: [{ word: 'LLM', replacement: 'el el em' }],
309
+ },
310
+ });
311
+ ```
312
+
313
+ Stored dictionaries are referenced by ID and resolved server-side (gateway path only):
314
+
315
+ ```ts
316
+ await generateSpeech({
317
+ model: 'openai/tts-1',
318
+ voice: 'alloy',
319
+ text: 'What is LLM?',
320
+ pronunciations: {
321
+ dictionaryIds: ['dict_company_terms'],
322
+ rules: [{ word: 'LLM', replacement: 'el el em' }], // overrides dict matches
323
+ },
324
+ });
325
+ ```
326
+
327
+ `dictionaryIds` requires the gateway path. On the direct-provider path, passing dictionary IDs throws `DictionaryIdsRequireGatewayError`. Inline `rules` work on both paths.
328
+
329
+ The same option is available on `streamSpeech` and `generateConversation`. On `generateConversation`, the option applies globally to every turn.
255
330
 
256
331
  ## Voice cloning
257
332
 
258
333
  Some providers support reference-audio cloning. Pass a voice object instead of a string.
259
334
 
260
335
  ```ts
261
- import { createMistral } from '@speech-sdk/core/mistral';
262
- import { createFal } from '@speech-sdk/core/fal-ai';
336
+ import { createFal, createMistral } from '@speech-sdk/core/providers';
263
337
 
264
338
  // Base64 reference:
265
339
  await generateSpeech({
@@ -282,7 +356,7 @@ Factory functions give you custom API keys, base URLs, or `fetch` implementation
282
356
 
283
357
  ```ts
284
358
  import { generateSpeech } from '@speech-sdk/core';
285
- import { createOpenAI } from '@speech-sdk/core/openai';
359
+ import { createOpenAI } from '@speech-sdk/core/providers';
286
360
 
287
361
  const myOpenAI = createOpenAI({
288
362
  apiKey: 'sk-...',
@@ -296,6 +370,43 @@ await generateSpeech({
296
370
  });
297
371
  ```
298
372
 
373
+ ## Public imports
374
+
375
+ The root package exports the main runtime APIs:
376
+
377
+ ```ts
378
+ import {
379
+ generateSpeech,
380
+ streamSpeech,
381
+ generateConversation,
382
+ timestampsToCaptions,
383
+ ApiError,
384
+ } from '@speech-sdk/core';
385
+ ```
386
+
387
+ Provider and STT factories live under `@speech-sdk/core/providers`:
388
+
389
+ ```ts
390
+ import {
391
+ createOpenAI,
392
+ createElevenLabs,
393
+ createCartesia,
394
+ createSpeechGateway,
395
+ } from '@speech-sdk/core/providers';
396
+ ```
397
+
398
+ Public types live under `@speech-sdk/core/types`:
399
+
400
+ ```ts
401
+ import type {
402
+ GenerateSpeechOptions,
403
+ SpeechResult,
404
+ ConversationResult,
405
+ Voice,
406
+ WordTimestamp,
407
+ } from '@speech-sdk/core/types';
408
+ ```
409
+
299
410
  ## API reference
300
411
 
301
412
  ```ts
@@ -305,8 +416,7 @@ generateSpeech({
305
416
  voice: Voice, // required — string | { url } | { audio }
306
417
  providerOptions?: object,
307
418
  volumeDbfs?: number, // ≤ 0
308
- timestamps?: "on" | "auto" | "off", // default "auto"
309
- timestampProvider?: ResolvedSTTModel, // override the STT fallback
419
+ timestamps?: boolean, // default false
310
420
  maxRetries?: number, // default 2
311
421
  abortSignal?: AbortSignal,
312
422
  headers?: Record<string, string>,
@@ -321,6 +431,11 @@ interface SpeechResult {
321
431
  }
322
432
 
323
433
  interface WordTimestamp { text: string; start: number; end: number } // seconds
434
+
435
+ // Returned by generateConversation — extends WordTimestamp with turnIndex
436
+ interface ConversationWordTimestamp extends WordTimestamp {
437
+ turnIndex: number; // index into the input turns[] array
438
+ }
324
439
  ```
325
440
 
326
441
  ## Error handling
@@ -333,23 +448,27 @@ try {
333
448
  } catch (error) {
334
449
  if (error instanceof ApiError) {
335
450
  error.statusCode; // 401, 429, 500, ...
336
- error.model; // "openai/gpt-4o-mini-tts"
337
451
  error.responseBody;
452
+ error.code; // stable machine-readable code (optional)
453
+ error.retryAfterMs; // parsed Retry-After header in ms (optional)
338
454
  }
339
455
  }
340
456
  ```
341
457
 
458
+ `ApiError.code` is populated from the RFC 7807 `application/problem+json` `code` extension when the upstream provides one (currently only the Speech Gateway). Match on `error.code` rather than on `error.message` text — codes are a stable contract, messages aren't.
459
+
342
460
  | Error | When |
343
461
  |---|---|
344
462
  | `ApiError` | Provider returned non-2xx |
463
+ | `MissingApiKeyError` | No `apiKey` passed and the provider's env var is unset |
345
464
  | `NoSpeechGeneratedError` | Empty input (after tag stripping) or empty provider response |
346
465
  | `StreamingNotSupportedError` | `streamSpeech()` on a non-streaming model |
347
466
  | `VolumeAdjustmentUnsupportedError` | `volumeDbfs` with no decodable output mode |
348
- | `TimestampKeyMissingError` | `timestamps: "on"` fallback key missing |
467
+ | `TimestampKeyMissingError` | `timestamps: true` with no native support, no `fallbackSTT` configured, and `OPENAI_API_KEY` not set |
349
468
  | `ConversationInputError` / `DialogueConstraintError` / `StitchUnsupportedError` | `generateConversation` validation / native caps / stitch incompatibility |
350
469
  | `SpeechSDKError` | Base class |
351
470
 
352
- Retries 5xx and network errors with exponential backoff ([p-retry](https://github.com/sindresorhus/p-retry)); does not retry 4xx. Default 2 retries; override via `maxRetries`.
471
+ Retries 5xx (except 501), 429, and network errors with jittered exponential backoff ([p-retry](https://github.com/sindresorhus/p-retry)); other 4xx and 501 are terminal. When a retriable error carries a `Retry-After` header, the SDK sleeps that long before the next attempt — capped at 60s to avoid pathological waits. The parsed value is surfaced as `ApiError.retryAfterMs` whenever the header is present, even on terminal errors that aren't retried. Default 2 retries; override via `maxRetries`.
353
472
 
354
473
  ## Development
355
474
 
@@ -1,58 +1,16 @@
1
1
  import { generateConversation as _generateConversation } from "../../generate-conversation.js";
2
2
  import { generateSpeech as _generateSpeech } from "../../generate-speech.js";
3
3
  import type { WordTimestamp } from "../../timestamps.js";
4
- /**
5
- * Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
6
- * var is set. No-op otherwise, so normal CI runs don't produce artifacts.
7
- * Usually you don't need to call this directly — use the `generateSpeech`,
8
- * `generateConversation`, and `collectStreamAndSave` helpers exported from
9
- * this module, which autosave using the current test name.
10
- *
11
- * Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
12
- * If the same test saves multiple times, subsequent files are suffixed `-2`,
13
- * `-3`, etc.
14
- */
15
4
  export declare function maybeSaveAudio(name: string, audio: {
16
5
  uint8Array: Uint8Array;
17
6
  mediaType: string;
18
7
  }): Promise<void>;
19
- /**
20
- * Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
21
- * writes the raw alignment JSON and rendered SRT/VTT caption files alongside
22
- * the audio. All four files share the same stem so they stay paired across
23
- * multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
24
- *
25
- * Output layout (when timestamps present):
26
- * ```
27
- * <dir>/<bucket>/<slug>.<audio-ext>
28
- * <dir>/<bucket>/<slug>.timestamps.json
29
- * <dir>/<bucket>/<slug>.srt
30
- * <dir>/<bucket>/<slug>.vtt
31
- * ```
32
- */
33
8
  export declare function maybeSaveResult(name: string, audio: {
34
9
  uint8Array: Uint8Array;
35
10
  mediaType: string;
36
11
  }, timestamps?: readonly WordTimestamp[]): Promise<void>;
37
- /**
38
- * Drop-in replacement for `generateSpeech` that autosaves to
39
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
40
- * result includes word timestamps, also writes paired `.timestamps.json`,
41
- * `.srt`, and `.vtt` files.
42
- */
43
12
  export declare const generateSpeech: typeof _generateSpeech;
44
- /**
45
- * Drop-in replacement for `generateConversation` that autosaves to
46
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
47
- * result includes word timestamps, also writes paired `.timestamps.json`,
48
- * `.srt`, and `.vtt` files.
49
- */
50
13
  export declare const generateConversation: typeof _generateConversation;
51
- /**
52
- * Collects a streamed `streamSpeech` result into bytes AND autosaves them to
53
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
54
- * of `collectStream(result.audio)` in e2e tests.
55
- */
56
14
  export declare function collectStreamAndSave(result: {
57
15
  audio: ReadableStream<Uint8Array>;
58
16
  mediaType: string;
@@ -1 +1 @@
1
- {"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AA8FzD;;;;;;;;;;GAUG;AACH,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD,OAAO,CAAC,IAAI,CAAC,CAEf;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,eAAe,CACnC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,EACpD,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,GACpC,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAOD;;;;;GAKG;AACH,eAAO,MAAM,cAAc,EAAE,OAAO,eAMR,CAAC;AAE7B;;;;;GAKG;AACH,eAAO,MAAM,oBAAoB,EAAE,OAAO,qBAMR,CAAC;AAEnC;;;;GAIG;AACH,wBAAsB,oBAAoB,CAAC,MAAM,EAAE;IACjD,KAAK,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;CACnB,GAAG,OAAO,CAAC,UAAU,CAAC,CAOtB"}
1
+ {"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AA6EzD,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD,OAAO,CAAC,IAAI,CAAC,CAEf;AAED,wBAAsB,eAAe,CACnC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,EACpD,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,GACpC,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAOD,eAAO,MAAM,cAAc,EAAE,OAAO,eAMR,CAAC;AAE7B,eAAO,MAAM,oBAAoB,EAAE,OAAO,qBAMR,CAAC;AAEnC,wBAAsB,oBAAoB,CAAC,MAAM,EAAE;IACjD,KAAK,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;CACnB,GAAG,OAAO,CAAC,UAAU,CAAC,CAOtB"}
@@ -49,12 +49,6 @@ function currentTestContext() {
49
49
  testPath: state.testPath,
50
50
  };
51
51
  }
52
- /**
53
- * Derives the subdirectory for a given test file. e2e tests are named like
54
- * `openai.e2e.test.ts` / `conversation-google.e2e.test.ts`; we strip the
55
- * `.e2e.test.ts` suffix and use that as the per-provider bucket so a full run
56
- * doesn't dump 100+ files into a single flat directory.
57
- */
58
52
  function providerBucket(testPath) {
59
53
  if (!testPath) {
60
54
  return "unknown";
@@ -62,18 +56,7 @@ function providerBucket(testPath) {
62
56
  const base = basename(testPath).replace(E2E_TEST_SUFFIX, "");
63
57
  return slugify(base) || "unknown";
64
58
  }
65
- // Counter keyed by `${bucket}/${slug}` so multiple generate/stream calls
66
- // within a single test don't overwrite each other. Vitest isolates modules
67
- // per file, so this resets per test file — collisions are only meaningful
68
- // within the same `it`.
69
59
  const callCounts = new Map();
70
- /**
71
- * Reserves a filename stem (without extension) for the next save call.
72
- * First call returns `slug`; subsequent calls return `slug-2`, `slug-3`, etc.
73
- * A single stem is shared across all sibling outputs from one logical save
74
- * (audio + timestamps + captions), so they remain paired even across
75
- * multiple saves within the same test.
76
- */
77
60
  function nextStem(bucket, slug) {
78
61
  const key = `${bucket}/${slug}`;
79
62
  const n = (callCounts.get(key) ?? 0) + 1;
@@ -84,34 +67,9 @@ async function writeAndLog(file, data) {
84
67
  await writeFile(file, data);
85
68
  console.log(`[e2e-save] wrote ${file}`);
86
69
  }
87
- /**
88
- * Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
89
- * var is set. No-op otherwise, so normal CI runs don't produce artifacts.
90
- * Usually you don't need to call this directly — use the `generateSpeech`,
91
- * `generateConversation`, and `collectStreamAndSave` helpers exported from
92
- * this module, which autosave using the current test name.
93
- *
94
- * Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
95
- * If the same test saves multiple times, subsequent files are suffixed `-2`,
96
- * `-3`, etc.
97
- */
98
70
  export async function maybeSaveAudio(name, audio) {
99
71
  await maybeSaveResult(name, audio);
100
72
  }
101
- /**
102
- * Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
103
- * writes the raw alignment JSON and rendered SRT/VTT caption files alongside
104
- * the audio. All four files share the same stem so they stay paired across
105
- * multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
106
- *
107
- * Output layout (when timestamps present):
108
- * ```
109
- * <dir>/<bucket>/<slug>.<audio-ext>
110
- * <dir>/<bucket>/<slug>.timestamps.json
111
- * <dir>/<bucket>/<slug>.srt
112
- * <dir>/<bucket>/<slug>.vtt
113
- * ```
114
- */
115
73
  export async function maybeSaveResult(name, audio, timestamps) {
116
74
  const dir = resolveOutputDir();
117
75
  if (!dir) {
@@ -133,33 +91,16 @@ function currentTestSlug() {
133
91
  const { currentTestName } = currentTestContext();
134
92
  return slugify(currentTestName ?? "unnamed") || "unnamed";
135
93
  }
136
- /**
137
- * Drop-in replacement for `generateSpeech` that autosaves to
138
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
139
- * result includes word timestamps, also writes paired `.timestamps.json`,
140
- * `.srt`, and `.vtt` files.
141
- */
142
94
  export const generateSpeech = (async (options) => {
143
95
  const result = await _generateSpeech(options);
144
96
  await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
145
97
  return result;
146
98
  });
147
- /**
148
- * Drop-in replacement for `generateConversation` that autosaves to
149
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
150
- * result includes word timestamps, also writes paired `.timestamps.json`,
151
- * `.srt`, and `.vtt` files.
152
- */
153
99
  export const generateConversation = (async (options) => {
154
100
  const result = await _generateConversation(options);
155
101
  await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
156
102
  return result;
157
103
  });
158
- /**
159
- * Collects a streamed `streamSpeech` result into bytes AND autosaves them to
160
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
161
- * of `collectStream(result.audio)` in e2e tests.
162
- */
163
104
  export async function collectStreamAndSave(result) {
164
105
  const bytes = await collectStream(result.audio);
165
106
  await maybeSaveAudio(currentTestSlug(), {