npm - @speech-sdk/core - Versions diffs - 0.7.0 → 0.8.0 - Mend

@speech-sdk/core 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (245)

package/README.md +227 -108
package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
package/dist/__tests__/e2e/_save-audio.js +0 -59
package/dist/__tests__/e2e/_save-audio.js.map +1 -1
package/dist/audio-decode.d.ts +7 -0
package/dist/audio-decode.d.ts.map +1 -0
package/dist/audio-decode.js +109 -0
package/dist/audio-decode.js.map +1 -0
package/dist/audio-duration.d.ts +0 -5
package/dist/audio-duration.d.ts.map +1 -1
package/dist/audio-duration.js +5 -21
package/dist/audio-duration.js.map +1 -1
package/dist/audio-output.d.ts +39 -0
package/dist/audio-output.d.ts.map +1 -0
package/dist/audio-output.js +111 -0
package/dist/audio-output.js.map +1 -0
package/dist/audio-utils.d.ts +2 -10
package/dist/audio-utils.d.ts.map +1 -1
package/dist/audio-utils.js +57 -15
package/dist/audio-utils.js.map +1 -1
package/dist/captions.d.ts +0 -108
package/dist/captions.d.ts.map +1 -1
package/dist/captions.js +8 -98
package/dist/captions.js.map +1 -1
package/dist/conversation/attribute-timestamps.d.ts +26 -0
package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
package/dist/conversation/attribute-timestamps.js +276 -0
package/dist/conversation/attribute-timestamps.js.map +1 -0
package/dist/conversation/dispatch.d.ts +5 -5
package/dist/conversation/dispatch.d.ts.map +1 -1
package/dist/conversation/dispatch.js +18 -8
package/dist/conversation/dispatch.js.map +1 -1
package/dist/conversation/errors.d.ts +3 -0
package/dist/conversation/errors.d.ts.map +1 -1
package/dist/conversation/errors.js +6 -0
package/dist/conversation/errors.js.map +1 -1
package/dist/conversation/pcm-concat.d.ts +0 -24
package/dist/conversation/pcm-concat.d.ts.map +1 -1
package/dist/conversation/pcm-concat.js +8 -183
package/dist/conversation/pcm-concat.js.map +1 -1
package/dist/conversation/proportional-fill.d.ts +10 -0
package/dist/conversation/proportional-fill.d.ts.map +1 -0
package/dist/conversation/proportional-fill.js +64 -0
package/dist/conversation/proportional-fill.js.map +1 -0
package/dist/conversation/silence-detection.d.ts +14 -0
package/dist/conversation/silence-detection.d.ts.map +1 -0
package/dist/conversation/silence-detection.js +52 -0
package/dist/conversation/silence-detection.js.map +1 -0
package/dist/conversation/stitch.d.ts +9 -6
package/dist/conversation/stitch.d.ts.map +1 -1
package/dist/conversation/stitch.js +72 -51
package/dist/conversation/stitch.js.map +1 -1
package/dist/conversation/types.d.ts +7 -37
package/dist/conversation/types.d.ts.map +1 -1
package/dist/conversation/validate.d.ts +1 -16
package/dist/conversation/validate.d.ts.map +1 -1
package/dist/conversation/validate.js +29 -29
package/dist/conversation/validate.js.map +1 -1
package/dist/default-stt-fallback.d.ts +3 -0
package/dist/default-stt-fallback.d.ts.map +1 -0
package/dist/default-stt-fallback.js +11 -0
package/dist/default-stt-fallback.js.map +1 -0
package/dist/derive-timestamps.d.ts +1 -5
package/dist/derive-timestamps.d.ts.map +1 -1
package/dist/derive-timestamps.js +1 -15
package/dist/derive-timestamps.js.map +1 -1
package/dist/encoders/mp3.d.ts +6 -0
package/dist/encoders/mp3.d.ts.map +1 -0
package/dist/encoders/mp3.js +54 -0
package/dist/encoders/mp3.js.map +1 -0
package/dist/errors.d.ts +20 -13
package/dist/errors.d.ts.map +1 -1
package/dist/errors.js +49 -15
package/dist/errors.js.map +1 -1
package/dist/generate-conversation.d.ts +5 -4
package/dist/generate-conversation.d.ts.map +1 -1
package/dist/generate-conversation.js +250 -93
package/dist/generate-conversation.js.map +1 -1
package/dist/generate-speech.d.ts +7 -28
package/dist/generate-speech.d.ts.map +1 -1
package/dist/generate-speech.js +185 -94
package/dist/generate-speech.js.map +1 -1
package/dist/index.d.ts +7 -11
package/dist/index.d.ts.map +1 -1
package/dist/index.js +6 -4
package/dist/index.js.map +1 -1
package/dist/logger.d.ts.map +1 -1
package/dist/logger.js +2 -13
package/dist/logger.js.map +1 -1
package/dist/metadata.d.ts +0 -22
package/dist/metadata.d.ts.map +1 -1
package/dist/pronunciations/errors.d.ts +5 -0
package/dist/pronunciations/errors.d.ts.map +1 -0
package/dist/pronunciations/errors.js +8 -0
package/dist/pronunciations/errors.js.map +1 -0
package/dist/pronunciations/inverse-align.d.ts +4 -0
package/dist/pronunciations/inverse-align.d.ts.map +1 -0
package/dist/pronunciations/inverse-align.js +54 -0
package/dist/pronunciations/inverse-align.js.map +1 -0
package/dist/pronunciations/merge.d.ts +4 -0
package/dist/pronunciations/merge.d.ts.map +1 -0
package/dist/pronunciations/merge.js +13 -0
package/dist/pronunciations/merge.js.map +1 -0
package/dist/pronunciations/substitute.d.ts +6 -0
package/dist/pronunciations/substitute.d.ts.map +1 -0
package/dist/pronunciations/substitute.js +67 -0
package/dist/pronunciations/substitute.js.map +1 -0
package/dist/pronunciations/types.d.ts +18 -0
package/dist/pronunciations/types.d.ts.map +1 -0
package/dist/pronunciations/types.js +2 -0
package/dist/pronunciations/types.js.map +1 -0
package/dist/pronunciations/validate.d.ts +3 -0
package/dist/pronunciations/validate.d.ts.map +1 -0
package/dist/pronunciations/validate.js +26 -0
package/dist/pronunciations/validate.js.map +1 -0
package/dist/provider-utils.d.ts +4 -9
package/dist/provider-utils.d.ts.map +1 -1
package/dist/provider-utils.js +60 -51
package/dist/provider-utils.js.map +1 -1
package/dist/providers/cartesia/alignment.d.ts +0 -16
package/dist/providers/cartesia/alignment.d.ts.map +1 -1
package/dist/providers/cartesia/alignment.js +1 -6
package/dist/providers/cartesia/alignment.js.map +1 -1
package/dist/providers/cartesia/index.d.ts +29 -19
package/dist/providers/cartesia/index.d.ts.map +1 -1
package/dist/providers/cartesia/index.js +116 -80
package/dist/providers/cartesia/index.js.map +1 -1
package/dist/providers/deepgram/index.d.ts +23 -8
package/dist/providers/deepgram/index.d.ts.map +1 -1
package/dist/providers/deepgram/index.js +51 -18
package/dist/providers/deepgram/index.js.map +1 -1
package/dist/providers/elevenlabs/alignment.d.ts +7 -21
package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
package/dist/providers/elevenlabs/alignment.js +8 -9
package/dist/providers/elevenlabs/alignment.js.map +1 -1
package/dist/providers/elevenlabs/index.d.ts +14 -38
package/dist/providers/elevenlabs/index.d.ts.map +1 -1
package/dist/providers/elevenlabs/index.js +186 -169
package/dist/providers/elevenlabs/index.js.map +1 -1
package/dist/providers/fal/index.d.ts +11 -20
package/dist/providers/fal/index.d.ts.map +1 -1
package/dist/providers/fal/index.js +49 -37
package/dist/providers/fal/index.js.map +1 -1
package/dist/providers/fish-audio/index.d.ts +14 -8
package/dist/providers/fish-audio/index.d.ts.map +1 -1
package/dist/providers/fish-audio/index.js +47 -19
package/dist/providers/fish-audio/index.js.map +1 -1
package/dist/providers/gateway/index.d.ts +76 -0
package/dist/providers/gateway/index.d.ts.map +1 -0
package/dist/providers/gateway/index.js +251 -0
package/dist/providers/gateway/index.js.map +1 -0
package/dist/providers/google/index.d.ts +12 -20
package/dist/providers/google/index.d.ts.map +1 -1
package/dist/providers/google/index.js +180 -162
package/dist/providers/google/index.js.map +1 -1
package/dist/providers/hume/alignment.d.ts +30 -35
package/dist/providers/hume/alignment.d.ts.map +1 -1
package/dist/providers/hume/alignment.js +14 -8
package/dist/providers/hume/alignment.js.map +1 -1
package/dist/providers/hume/index.d.ts +16 -16
package/dist/providers/hume/index.d.ts.map +1 -1
package/dist/providers/hume/index.js +79 -65
package/dist/providers/hume/index.js.map +1 -1
package/dist/providers/inworld/alignment.d.ts +8 -22
package/dist/providers/inworld/alignment.d.ts.map +1 -1
package/dist/providers/inworld/alignment.js +9 -8
package/dist/providers/inworld/alignment.js.map +1 -1
package/dist/providers/inworld/index.d.ts +17 -20
package/dist/providers/inworld/index.d.ts.map +1 -1
package/dist/providers/inworld/index.js +79 -47
package/dist/providers/inworld/index.js.map +1 -1
package/dist/providers/mistral/index.d.ts +14 -8
package/dist/providers/mistral/index.d.ts.map +1 -1
package/dist/providers/mistral/index.js +63 -48
package/dist/providers/mistral/index.js.map +1 -1
package/dist/providers/murf/alignment.d.ts +10 -19
package/dist/providers/murf/alignment.d.ts.map +1 -1
package/dist/providers/murf/alignment.js +10 -5
package/dist/providers/murf/alignment.js.map +1 -1
package/dist/providers/murf/index.d.ts +15 -16
package/dist/providers/murf/index.d.ts.map +1 -1
package/dist/providers/murf/index.js +105 -58
package/dist/providers/murf/index.js.map +1 -1
package/dist/providers/openai/index.d.ts +43 -29
package/dist/providers/openai/index.d.ts.map +1 -1
package/dist/providers/openai/index.js +294 -106
package/dist/providers/openai/index.js.map +1 -1
package/dist/providers/resemble/alignment.d.ts +8 -29
package/dist/providers/resemble/alignment.d.ts.map +1 -1
package/dist/providers/resemble/alignment.js +9 -12
package/dist/providers/resemble/alignment.js.map +1 -1
package/dist/providers/resemble/index.d.ts +21 -11
package/dist/providers/resemble/index.d.ts.map +1 -1
package/dist/providers/resemble/index.js +89 -49
package/dist/providers/resemble/index.js.map +1 -1
package/dist/providers/smallest-ai/index.d.ts +47 -0
package/dist/providers/smallest-ai/index.d.ts.map +1 -0
package/dist/providers/smallest-ai/index.js +107 -0
package/dist/providers/smallest-ai/index.js.map +1 -0
package/dist/providers/xai/index.d.ts +25 -9
package/dist/providers/xai/index.d.ts.map +1 -1
package/dist/providers/xai/index.js +63 -40
package/dist/providers/xai/index.js.map +1 -1
package/dist/providers.d.ts +31 -0
package/dist/providers.d.ts.map +1 -0
package/dist/providers.js +16 -0
package/dist/providers.js.map +1 -0
package/dist/resolve-provider.d.ts.map +1 -1
package/dist/resolve-provider.js +8 -51
package/dist/resolve-provider.js.map +1 -1
package/dist/retry-options.d.ts +6 -0
package/dist/retry-options.d.ts.map +1 -0
package/dist/retry-options.js +48 -0
package/dist/retry-options.js.map +1 -0
package/dist/speech-provider.d.ts +28 -53
package/dist/speech-provider.d.ts.map +1 -1
package/dist/speech-provider.js +5 -26
package/dist/speech-provider.js.map +1 -1
package/dist/speech-result.d.ts +8 -9
package/dist/speech-result.d.ts.map +1 -1
package/dist/speech-result.js.map +1 -1
package/dist/speech-to-text-provider.d.ts +0 -12
package/dist/speech-to-text-provider.d.ts.map +1 -1
package/dist/stream-speech.d.ts +4 -2
package/dist/stream-speech.d.ts.map +1 -1
package/dist/stream-speech.js +36 -22
package/dist/stream-speech.js.map +1 -1
package/dist/timestamps.d.ts +3 -17
package/dist/timestamps.d.ts.map +1 -1
package/dist/turns.d.ts +9 -0
package/dist/turns.d.ts.map +1 -0
package/dist/turns.js +21 -0
package/dist/turns.js.map +1 -0
package/dist/types.d.ts +31 -0
package/dist/types.d.ts.map +1 -1
package/dist/volume-adjust.d.ts +0 -6
package/dist/volume-adjust.d.ts.map +1 -1
package/dist/volume-adjust.js +4 -16
package/dist/volume-adjust.js.map +1 -1
package/package.json +13 -66
package/dist/stt-providers/openai/index.d.ts +0 -42
package/dist/stt-providers/openai/index.d.ts.map +0 -1
package/dist/stt-providers/openai/index.js +0 -184
package/dist/stt-providers/openai/index.js.map +0 -1

package/README.md (changed)

@@ -1,10 +1,24 @@
+<div align="center">
+<img src="https://github.com/user-attachments/assets/42d9b528-e507-4162-8120-338bb0c92650" alt="Speech SDK" width="140" />
 # Speech SDK
-[![npm version](https://img.shields.io/npm/v/@speech-sdk/core)](https://www.npmjs.com/package/@speech-sdk/core)
-[![npm downloads](https://img.shields.io/npm/dm/@speech-sdk/core)](https://www.npmjs.com/package/@speech-sdk/core)
-[![license](https://img.shields.io/npm/l/@speech-sdk/core)](https://github.com/Jellypod-Inc/speech-sdk/blob/main/LICENSE)
+**Text-to-speech across 13 providers, one API.**
+A lightweight, provider-agnostic TypeScript SDK. Zero lock-in. Runs in Node.js, Edge runtimes, and the browser.
+[![npm version](https://img.shields.io/npm/v/@speech-sdk/core?style=flat-square)](https://www.npmjs.com/package/@speech-sdk/core)
+[![npm downloads](https://img.shields.io/npm/dm/@speech-sdk/core?style=flat-square)](https://www.npmjs.com/package/@speech-sdk/core)
+[![license](https://img.shields.io/npm/l/@speech-sdk/core?style=flat-square)](https://github.com/Jellypod-Inc/speech-sdk/blob/main/LICENSE)
+[![Discord](https://img.shields.io/badge/Discord-Join-5865F2?style=flat-square&logo=discord&logoColor=white)](https://discord.gg/xcTQMU3nCV)
+[![Stars](https://img.shields.io/github/stars/Jellypod-Inc/speech-sdk?style=flat-square&logo=github&label=stars)](https://github.com/Jellypod-Inc/speech-sdk/stargazers)
+**[Quick start](#quick-start)** · **[Providers](#supported-providers)** · **[Streaming](#streaming)** · **[Multi-Speaker Conversations](#conversations)** · **[Timestamps](#timestamps)**
+</div>
-A lightweight, provider-agnostic TypeScript SDK for text-to-speech. One API, 13 providers, zero lock-in. Runs in Node.js, Edge runtimes, and the browser.
+<br />
 <img width="1200" height="630" alt="Speech SDK" src="https://github.com/user-attachments/assets/b90c0235-9405-4939-bffa-75fc82be5afb" />
@@ -12,19 +26,12 @@ Learn more at [speechsdk.dev](https://speechsdk.dev/).
 ## Features
-- **Universal** — `generateSpeech()` works across OpenAI, ElevenLabs, Deepgram, Cartesia, Hume, Google Gemini TTS, Fish Audio, Inworld, Murf, Resemble, fal, Mistral, and xAI.
+- **Universal** — one `generateSpeech()` call across every supported provider.
 - **Streaming** — `streamSpeech()` returns a standard `ReadableStream<Uint8Array>`.
-- **Conversations** — `generateConversation()` produces multi-speaker audio, using native dialogue endpoints when available and stitching locally when not.
-- **Word-level timestamps** — `timestamps: "on"` returns alignment, using the provider's native data or falling back to STT.
+- **Conversations** — `generateConversation()` produces multi-speaker audio, picking a gateway, native-dialogue, or local-stitch path automatically.
+- **Word-level timestamps** — `timestamps: true` returns alignment, using the provider's native data or falling back to STT.
 - **Volume normalization** — RMS-level outputs to an absolute loudness target.
-- **Audio tags & voice cloning** — `[laugh]`, `[sigh]`, emotion cues; reference-audio cloning where supported.
-## Contents
-- [Install](#install) · [Quick start](#quick-start) · [Supported providers](#supported-providers)
-- [Streaming](#streaming) · [Conversations](#conversations) · [Timestamps](#timestamps)
-- [Volume normalization](#volume-normalization) · [Audio tags](#audio-tags) · [Voice cloning](#voice-cloning)
-- [Custom configuration](#custom-configuration) · [API reference](#api-reference) · [Error handling](#error-handling) · [Development](#development)
+- **Audio tags & voice cloning** — bracket cues like `[laugh]` and reference-audio cloning where supported.
 ## Install
@@ -51,25 +58,51 @@ result.audio.base64;      // string (lazy)
 result.audio.mediaType;   // "audio/mpeg"
 ```
-Pass a `provider/model` string, or just the provider name to use its default model. API keys are read from env vars automatically.
+Pass a `provider/model` string, or just the provider name to use its default model. The string above is enough to get going — set one env var and you're done.
+## Gateway vs direct provider
+The SDK has two ways to reach a provider, and the choice is made by **how you pass `model`**:
+```ts
+// 1. String → routes through Speech Gateway (https://api.speechgateway.com)
+//    Needs SPEECH_GATEWAY_API_KEY (sign up at https://speechgateway.com).
+await generateSpeech({ model: 'openai/gpt-4o-mini-tts', text: '...', voice: 'alloy' });
+// 2. Factory → calls the provider directly (no proxy hop)
+//    Reads the provider's env var (e.g. OPENAI_API_KEY), or pass apiKey to the factory.
+import { createOpenAI } from '@speech-sdk/core/providers';
+await generateSpeech({ model: createOpenAI()('gpt-4o-mini-tts'), text: '...', voice: 'alloy' });
+```
+| | Speech Gateway (string) | Direct provider (factory) |
+|---|---|---|
+| When to use | You want a single endpoint and easy provider swaps | You already have provider keys, want zero-hop latency, or need provider features the gateway hasn't surfaced |
+| Setup | `SPEECH_GATEWAY_API_KEY` only | One env var per provider you use |
+| Key resolution | `apiKey` option → `SPEECH_GATEWAY_API_KEY` | `createX({ apiKey })` → `<PROVIDER>_API_KEY` |
+| Endpoint | `api.speechgateway.com` | Provider's own API |
+The gateway also accepts `createSpeechGateway({ apiKey, baseURL })` if you want to construct it explicitly (e.g. for a custom proxy URL).
 ## Supported providers
-| Provider | Prefix | Default model | Env var |
-|---|---|---|---|
-| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `gpt-4o-mini-tts` | `OPENAI_API_KEY` |
-| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `eleven_multilingual_v2` | `ELEVENLABS_API_KEY` |
-| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `aura-2` | `DEEPGRAM_API_KEY` |
-| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `sonic-3` | `CARTESIA_API_KEY` |
-| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `octave-2` | `HUME_API_KEY` |
-| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `inworld-tts-1.5-max` | `INWORLD_API_KEY` |
-| [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `gemini-2.5-flash-preview-tts` | `GOOGLE_API_KEY` |
-| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `s2-pro` | `FISH_AUDIO_API_KEY` |
-| [Murf](https://murf.ai/api/docs) | `murf` | `GEN2` | `MURF_API_KEY` |
-| [Resemble](https://docs.resemble.ai) | `resemble` | `default` | `RESEMBLE_API_KEY` |
-| [fal](https://fal.ai/models) | `fal-ai` | *(user-specified)* | `FAL_API_KEY` |
-| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `voxtral-mini-tts-2603` | `MISTRAL_API_KEY` |
-| [xAI](https://docs.x.ai/docs/models) | `xai` | `grok-tts` | `XAI_API_KEY` |
+| Provider | Prefix | Env var |
+|---|---|---|
+| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `OPENAI_API_KEY` |
+| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `ELEVENLABS_API_KEY` |
+| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `DEEPGRAM_API_KEY` |
+| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `CARTESIA_API_KEY` |
+| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `HUME_API_KEY` |
+| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `INWORLD_API_KEY` |
+| [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `GOOGLE_API_KEY` |
+| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `FISH_AUDIO_API_KEY` |
+| [Murf](https://murf.ai/api/docs) | `murf` | `MURF_API_KEY` |
+| [Resemble](https://docs.resemble.ai) | `resemble` | `RESEMBLE_API_KEY` |
+| [fal](https://fal.ai/models) | `fal-ai` | `FAL_API_KEY` |
+| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `MISTRAL_API_KEY` |
+| [xAI](https://docs.x.ai/docs/models) | `xai` | `XAI_API_KEY` |
+The env var applies when you call the provider directly via its factory. Pass a string `model` like `"openai/tts-1"` to route through Speech Gateway instead, which reads `SPEECH_GATEWAY_API_KEY` — see [Gateway vs direct provider](#gateway-vs-direct-provider). Most providers ship a default model (`createOpenAI()()`); a few (e.g. fal) require an explicit model id. See the linked docs for each provider's full model list.
 Provider-specific parameters pass through via `providerOptions` using each API's native field names.
@@ -95,13 +128,16 @@ return new Response(audio, { headers: { 'Content-Type': mediaType } });
 ## Conversations
-`generateConversation()` produces a single multi-voice clip from an ordered array of turns, picking the best path automatically:
+`generateConversation()` produces a single multi-voice clip from an ordered array of turns. The path is chosen by what the turns are:
+- **Gateway** — every turn uses a gateway-routed string model (e.g. `"openai/tts-1"`). One request to Speech Gateway; the server handles rendering, stitching, and normalization. The SDK never stitches locally on this path — clone voices on gateway models throw `StitchUnsupportedError`.
+- **Native dialogue** — every turn uses the same direct-provider model and that model exposes a multi-speaker endpoint. One API call, naturally mixed.
+- **Stitch** — direct-provider conversations that don't qualify for native dialogue (multi-provider, or no dialogue endpoint). Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
-- **Native dialogue** — one provider with a multi-speaker endpoint (ElevenLabs v3, Gemini TTS, Hume Octave, Fish Audio S2-Pro, fal Dia). One API call, natural mix.
-- **Stitch fallback** — multi-provider or no dialogue endpoint. Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
+Mixing gateway-routed turns with direct-provider turns in one call throws `MixedDispatchError`.
 ```ts
-import { generateConversation } from '@speech-sdk/core/conversation';
+import { generateConversation } from '@speech-sdk/core';
 const result = await generateConversation({
   turns: [
@@ -112,16 +148,7 @@ const result = await generateConversation({
 });
 ```
-Options: `gapMs` (default 300), `normalizeVolume` (default `true`), `volumeDbfs` (default `-20`), `maxConcurrency` (default 6), `maxRetries` (default 2), `timestamps`, `timestampProvider`, `apiKey`, `providerOptions`, `abortSignal`, `headers`. Per-turn overrides: `model`, `providerOptions` (stitch path only — throws `ConversationInputError` on native).
-**Native dialogue caps:**
-| Provider | Models | Voice constraints |
-|---|---|---|
-| ElevenLabs | `eleven_v3` | 1–10 voices, ≤ 2,000 chars |
-| Google | `gemini-2.5-{flash,pro}-preview-tts`, `gemini-3.1-flash-tts-preview` | **Exactly 2 voices** |
-| Hume | `octave-1`, `octave-2` | 1–4 voices |
-| Fish Audio | `s2-pro` | 1–4 voices |
+Options: `gapMs` (default 300), `volumeDbfs` (default `-20`), `maxConcurrency` (default 6), `maxRetries` (default 2), `timestamps`, `apiKey`, `providerOptions`, `abortSignal`, `headers`. Per-turn overrides: `model`, `providerOptions` (stitch path only — throws `ConversationInputError` on native). Native-dialogue models enforce their own voice-count and character limits; violations throw `DialogueConstraintError`.
 ## Timestamps
@@ -132,7 +159,7 @@ const result = await generateSpeech({
   model: 'elevenlabs/eleven_multilingual_v2',
   text: 'Hello from speech-sdk!',
   voice: 'JBFqnCBsd6RMkjVDRZzb',
-  timestamps: 'on',
+  timestamps: true,
 });
 result.timestamps;
@@ -143,43 +170,57 @@ result.timestamps;
 // ]
 ```
-| Mode | Behavior |
+| Value | Behavior |
 |---|---|
-| `"auto"` *(default)* | Return timestamps only if the provider supplies them natively. Free. |
-| `"on"` | Always return timestamps. Uses native alignment when available; otherwise transcribes the audio via STT (extra cost + latency). |
-| `"off"` | Never return timestamps. |
+| `true` | Always return timestamps. Uses native alignment when available; otherwise transcribes the audio via STT (extra cost + latency). |
+| `false` *(default)* | Never return timestamps. |
-On `"on"`, the fallback defaults to OpenAI Whisper (`openai/whisper-1`, needs `OPENAI_API_KEY`). Override by constructing a `ResolvedSTTModel` via a factory and passing it as `timestampProvider`:
+With `timestamps: true`, models without native alignment require an STT fallback. The SDK automatically uses OpenAI Whisper when `OPENAI_API_KEY` is set in the environment — no extra configuration needed. Gateway-routed models (string model IDs like `"openai/tts-1"`) do not need a fallback — the gateway server provides it.
+**Resolution order:** factory `fallbackSTT` → `OPENAI_API_KEY` env var (automatic Whisper fallback) → throws `TimestampKeyMissingError`.
+Configure `fallbackSTT` on the factory to use a different key or STT model (set it once, applies to all calls):
 ```ts
-import { createOpenAISTT } from '@speech-sdk/core/stt/openai';
+import { generateSpeech } from '@speech-sdk/core';
+import { createOpenAI, createElevenLabs } from '@speech-sdk/core/providers';
-await generateSpeech({
-  model: 'cartesia/sonic-3',
-  text: 'Hello!',
-  voice: 'voice-id',
-  timestamps: 'on',
-  timestampProvider: createOpenAISTT({ apiKey: process.env.MY_WHISPER_KEY })('whisper-1'),
+const elevenlabs = createElevenLabs({
+  apiKey: process.env.ELEVENLABS_API_KEY,
+  fallbackSTT: createOpenAI({ apiKey: process.env.MY_OPENAI_KEY }).stt('whisper-1'),
+});
+const result = await generateSpeech({
+  model: elevenlabs('eleven_flash_v2'),
+  voice: 'JBFqnCBsd6RMkjVDRZzb',
+  text: 'Hello, world.',
+  timestamps: true,
 });
 ```
-**Per-provider support:**
+Whether a given model returns native alignment or transcribes via the STT fallback is a provider detail — both paths produce the same `WordTimestamp[]` shape.
-| Provider | Timestamps |
-|---|---|
-| ElevenLabs (`eleven_v3`, `eleven_multilingual_v2`, `eleven_flash_v2`, `eleven_flash_v2_5`) | **Native** — returned in the TTS response, free on `"auto"` |
-| Murf (`GEN2`) | **Native** — `wordDurations` returned in the TTS response, free on `"auto"` (FALCON streaming model has no native alignment) |
-| Hume (`octave-2`) | **Native** — word alignment from the JSON `/v0/tts` endpoint, free on `"auto"` (`octave-1` has no native alignment) |
-| Inworld (`inworld-tts-1.5-max`, `inworld-tts-1.5-mini`) | **Native** — `timestampInfo.wordAlignment` returned in the TTS response, free on `"auto"` (best on English/Spanish) |
-| Cartesia (`sonic-3`, `sonic-2`) | **Native** — routed through `/tts/sse` with `add_timestamps: true`; merges interleaved chunk + timestamps events into audio + `WordTimestamp[]` |
-| Resemble (`default`) | **Native** — `audio_timestamps` always returned by `/synthesize`; SDK aggregates grapheme-level timing into words (mirrors ElevenLabs aggregator) |
-| All others (OpenAI, Deepgram, Google, Fish Audio, fal, Mistral, xAI) | No native alignment; `"on"` transcribes via the STT fallback, `"auto"` returns `undefined` |
+`generateConversation` accepts the same options and returns `ConversationWordTimestamp[]` — every word carries a `turnIndex: number` pointing back into the input `turns[]`. This is what lets you build chat-bubble UIs, speaker-attributed transcripts, and "who's speaking now?" lookups during playback without re-deriving turn boundaries.
-`generateConversation` accepts the same options and returns a flat `WordTimestamp[]` across all turns — stitch-path timings are offset by cumulative turn duration + gap.
+```ts
+import { generateConversation, timestampsToTurns } from '@speech-sdk/core';
+const result = await generateConversation({
+  model: 'elevenlabs/eleven_v3',
+  turns: [
+    { voice: 'rachel', text: 'Hi there.' },
+    { voice: 'adam',   text: 'Hello!' },
+  ],
+  timestamps: true,
+});
+// Collapse consecutive words from the same turn into per-turn timings:
+const turnTimestamps = timestampsToTurns(result.timestamps ?? []);
+```
 ### Captions (SRT / WebVTT)
-Convert word-level timestamps into a caption file. SRT is the default; pass `format: 'vtt'` for WebVTT (required for HTML `<track>`).
+`timestampsToCaptions()` converts word-level timestamps into a caption file. SRT is the default; pass `format: 'vtt'` for WebVTT.
 ```ts
 import { generateSpeech, timestampsToCaptions } from '@speech-sdk/core';
@@ -188,33 +229,14 @@ const { timestamps } = await generateSpeech({
   model: 'elevenlabs/eleven_v3',
   text: 'Hello world. This is a test.',
   voice: 'JBFqnCBsd6RMkjVDRZzb',
-  timestamps: 'on',
+  timestamps: true,
 });
 const srt = timestampsToCaptions(timestamps ?? []);
-// 1
-// 00:00:00,000 --> 00:00:01,200
-// Hello world.
-//
-// 2
-// 00:00:01,300 --> 00:00:02,800
-// This is a test.
 const vtt = timestampsToCaptions(timestamps ?? [], { format: 'vtt' });
-// WEBVTT
-//
-// 1
-// 00:00:00.000 --> 00:00:01.200
-// Hello world.
-//
-// 2
-// 00:00:01.300 --> 00:00:02.800
-// This is a test.
 ```
-Output follows the SubRip and [W3C WebVTT](https://www.w3.org/TR/webvtt1/) conventions: comma-decimal (SRT) vs period-decimal (VTT) timestamps, sequential numeric cue IDs, blank-line cue separators with a trailing blank line, and HTML-escaped body text (`&`, `<`, `>`) on the VTT path.
-Cues break on sentence boundaries (`.`, `!`, `?`), then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
+Cues break on sentence boundaries, then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
 ## Volume normalization
@@ -231,11 +253,37 @@ const result = await generateSpeech({
 result.audio.mediaType;  // "audio/wav" — re-encoded after normalization
 ```
-`generateConversation` normalizes by default. Pass `normalizeVolume: false` to skip. Throws `VolumeAdjustmentUnsupportedError` if the provider has no decodable PCM/WAV mode.
+`generateConversation` always normalizes; override the target with `volumeDbfs`. A warning is surfaced (and the raw mix passes through) if the provider has no decodable PCM/WAV mode.
+### Output format
+By default, `generateSpeech` preserves the provider or gateway response format.
+`generateConversation` returns WAV when the SDK stitches direct-provider audio.
+Pass `output` to request a specific final format:
+```ts
+const result = await generateSpeech({
+  model: createOpenAI()('tts-1'),
+  voice: 'alloy',
+  text: 'Hello',
+  output: { format: 'mp3', bitrate: 96 },
+});
+result.audio.mediaType; // "audio/mpeg"
+```
+Supported explicit formats are `wav`, `mp3`, and `pcm`.
+For direct providers, the SDK first asks each provider whether it can natively produce the requested format. If yes, the provider returns it directly and the SDK passes the bytes through unchanged. If the provider can return WAV/PCM but not the requested format (e.g. ElevenLabs has no native WAV output, Cartesia has no native MP3), the SDK requests a decodable format and converts via mediabunny. The SDK never decodes compressed audio (mp3/opus/aac) — providers must return wav/pcm for any local conversion to succeed.
+For gateway models, the SDK forwards `output` to the gateway API unchanged.
+MP3 encoding uses [`@mediabunny/mp3-encoder`](https://mediabunny.dev/guide/extensions/mp3-encoder), loaded dynamically only when MP3 output is requested and the host environment does not already provide native MP3 encoding.
 ## Audio tags
-Bracket syntax `[tag]` adds expressive cues. Unsupported tags are stripped with warnings in `result.warnings`.
+Bracket syntax `[tag]` adds expressive cues. Each provider handles tags natively where supported, maps them to its closest equivalent, or strips them and surfaces a warning in `result.warnings`.
 ```ts
 await generateSpeech({
@@ -245,21 +293,47 @@ await generateSpeech({
 });
 ```
-| Provider | Behavior |
-|---|---|
-| OpenAI (`gpt-4o-mini-tts`) | Mapped to the `instructions` field |
-| ElevenLabs (`eleven_v3`) | Passed through natively |
-| Google (`gemini-3.1-flash-tts-preview`) | Passed through natively |
-| Cartesia (`sonic-3`) | Emotion tags → SSML; `[laughter]` passed through; unknown stripped |
-| All others | Stripped with warnings |
+## Pronunciations
+Customize how specific words are pronounced. Rules are applied as text substitution before the request is sent to the provider; word timestamps are inverse-mapped on return so the substitution is invisible to the caller.
+```ts
+import { generateSpeech } from '@speech-sdk/core';
+await generateSpeech({
+  model: 'openai/tts-1', // gateway path; or use createOpenAI()(...)
+  voice: 'alloy',
+  text: 'What is LLM?',
+  pronunciations: {
+    rules: [{ word: 'LLM', replacement: 'el el em' }],
+  },
+});
+```
+Stored dictionaries are referenced by ID and resolved server-side (gateway path only):
+```ts
+await generateSpeech({
+  model: 'openai/tts-1',
+  voice: 'alloy',
+  text: 'What is LLM?',
+  pronunciations: {
+    dictionaryIds: ['dict_company_terms'],
+    rules: [{ word: 'LLM', replacement: 'el el em' }], // overrides dict matches
+  },
+});
+```
+`dictionaryIds` requires the gateway path. On the direct-provider path, passing dictionary IDs throws `DictionaryIdsRequireGatewayError`. Inline `rules` work on both paths.
+The same option is available on `streamSpeech` and `generateConversation`. On `generateConversation`, the option applies globally to every turn.
 ## Voice cloning
 Some providers support reference-audio cloning. Pass a voice object instead of a string.
 ```ts
-import { createMistral } from '@speech-sdk/core/mistral';
-import { createFal } from '@speech-sdk/core/fal-ai';
+import { createFal, createMistral } from '@speech-sdk/core/providers';
 // Base64 reference:
 await generateSpeech({
@@ -282,7 +356,7 @@ Factory functions give you custom API keys, base URLs, or `fetch` implementation
 ```ts
 import { generateSpeech } from '@speech-sdk/core';
-import { createOpenAI } from '@speech-sdk/core/openai';
+import { createOpenAI } from '@speech-sdk/core/providers';
 const myOpenAI = createOpenAI({
   apiKey: 'sk-...',
@@ -296,6 +370,43 @@ await generateSpeech({
 });
 ```
+## Public imports
+The root package exports the main runtime APIs:
+```ts
+import {
+  generateSpeech,
+  streamSpeech,
+  generateConversation,
+  timestampsToCaptions,
+  ApiError,
+} from '@speech-sdk/core';
+```
+Provider and STT factories live under `@speech-sdk/core/providers`:
+```ts
+import {
+  createOpenAI,
+  createElevenLabs,
+  createCartesia,
+  createSpeechGateway,
+} from '@speech-sdk/core/providers';
+```
+Public types live under `@speech-sdk/core/types`:
+```ts
+import type {
+  GenerateSpeechOptions,
+  SpeechResult,
+  ConversationResult,
+  Voice,
+  WordTimestamp,
+} from '@speech-sdk/core/types';
+```
 ## API reference
 ```ts
@@ -305,8 +416,7 @@ generateSpeech({
   voice: Voice,                           // required — string | { url } | { audio }
   providerOptions?: object,
   volumeDbfs?: number,                    // ≤ 0
-  timestamps?: "on" | "auto" | "off",     // default "auto"
-  timestampProvider?: ResolvedSTTModel,   // override the STT fallback
+  timestamps?: boolean,                   // default false
   maxRetries?: number,                    // default 2
   abortSignal?: AbortSignal,
   headers?: Record<string, string>,
@@ -321,6 +431,11 @@ interface SpeechResult {
 }
 interface WordTimestamp { text: string; start: number; end: number }  // seconds
+// Returned by generateConversation — extends WordTimestamp with turnIndex
+interface ConversationWordTimestamp extends WordTimestamp {
+  turnIndex: number;  // index into the input turns[] array
+}
 ```
 ## Error handling
@@ -333,23 +448,27 @@ try {
 } catch (error) {
   if (error instanceof ApiError) {
     error.statusCode;    // 401, 429, 500, ...
-    error.model;         // "openai/gpt-4o-mini-tts"
     error.responseBody;
+    error.code;          // stable machine-readable code (optional)
+    error.retryAfterMs;  // parsed Retry-After header in ms (optional)
   }
 }
 ```
+`ApiError.code` is populated from the RFC 7807 `application/problem+json` `code` extension when the upstream provides one (currently only the Speech Gateway). Match on `error.code` rather than `error.message` text — codes are a stable contract, messages aren't.
 | Error | When |
 |---|---|
 | `ApiError` | Provider returned non-2xx |
+| `MissingApiKeyError` | No `apiKey` passed and the provider's env var is unset |
 | `NoSpeechGeneratedError` | Empty input (after tag stripping) or empty provider response |
 | `StreamingNotSupportedError` | `streamSpeech()` on a non-streaming model |
 | `VolumeAdjustmentUnsupportedError` | `volumeDbfs` with no decodable output mode |
-| `TimestampKeyMissingError` | `timestamps: "on"` fallback key missing |
+| `TimestampKeyMissingError` | `timestamps: true` with no native support, no `fallbackSTT` configured, and `OPENAI_API_KEY` not set |
 | `ConversationInputError` / `DialogueConstraintError` / `StitchUnsupportedError` | `generateConversation` validation / native caps / stitch incompatibility |
 | `SpeechSDKError` | Base class |
-Retries 5xx and network errors with exponential backoff ([p-retry](https://github.com/sindresorhus/p-retry)); does not retry 4xx. Default 2 retries; override via `maxRetries`.
+Retries 5xx (except 501), 429, and network errors with jittered exponential backoff ([p-retry](https://github.com/sindresorhus/p-retry)); other 4xx and 501 are terminal. When a retriable error carries a `Retry-After` header, the SDK sleeps that long before the next attempt — capped at 60s to avoid pathological waits. The parsed value is surfaced as `ApiError.retryAfterMs` whenever the header is present, even on terminal errors that aren't retried. Default 2 retries; override via `maxRetries`.
 ## Development

package/dist/__tests__/e2e/_save-audio.d.ts CHANGED Viewed

@@ -1,58 +1,16 @@
 import { generateConversation as _generateConversation } from "../../generate-conversation.js";
 import { generateSpeech as _generateSpeech } from "../../generate-speech.js";
 import type { WordTimestamp } from "../../timestamps.js";
-/**
- * Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
- * var is set. No-op otherwise, so normal CI runs don't produce artifacts.
- * Usually you don't need to call this directly — use the `generateSpeech`,
- * `generateConversation`, and `collectStreamAndSave` helpers exported from
- * this module, which autosave using the current test name.
- *
- * Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
- * If the same test saves multiple times, subsequent files are suffixed `-2`,
- * `-3`, etc.
- */
 export declare function maybeSaveAudio(name: string, audio: {
     uint8Array: Uint8Array;
     mediaType: string;
 }): Promise<void>;
-/**
- * Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
- * writes the raw alignment JSON and rendered SRT/VTT caption files alongside
- * the audio. All four files share the same stem so they stay paired across
- * multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
- *
- * Output layout (when timestamps present):
- * ```
- * <dir>/<bucket>/<slug>.<audio-ext>
- * <dir>/<bucket>/<slug>.timestamps.json
- * <dir>/<bucket>/<slug>.srt
- * <dir>/<bucket>/<slug>.vtt
- * ```
- */
 export declare function maybeSaveResult(name: string, audio: {
     uint8Array: Uint8Array;
     mediaType: string;
 }, timestamps?: readonly WordTimestamp[]): Promise<void>;
-/**
- * Drop-in replacement for `generateSpeech` that autosaves to
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
- * result includes word timestamps, also writes paired `.timestamps.json`,
- * `.srt`, and `.vtt` files.
- */
 export declare const generateSpeech: typeof _generateSpeech;
-/**
- * Drop-in replacement for `generateConversation` that autosaves to
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
- * result includes word timestamps, also writes paired `.timestamps.json`,
- * `.srt`, and `.vtt` files.
- */
 export declare const generateConversation: typeof _generateConversation;
-/**
- * Collects a streamed `streamSpeech` result into bytes AND autosaves them to
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
- * of `collectStream(result.audio)` in e2e tests.
- */
 export declare function collectStreamAndSave(result: {
     audio: ReadableStream<Uint8Array>;
     mediaType: string;

package/dist/__tests__/e2e/_save-audio.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;~~AA8FzD;;;;;;;;;;GAUG;AACH~~,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD,OAAO,CAAC,IAAI,CAAC,CAEf;AAED~~;;;;;;;;;;;;;GAaG;AACH~~,wBAAsB,eAAe,CACnC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,EACpD,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,GACpC,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAOD~~;;;;;GAKG;AACH~~,eAAO,MAAM,cAAc,EAAE,OAAO,eAMR,CAAC;AAE7B~~;;;;;GAKG;AACH~~,eAAO,MAAM,oBAAoB,EAAE,OAAO,qBAMR,CAAC;AAEnC~~;;;;GAIG;AACH~~,wBAAsB,oBAAoB,CAAC,MAAM,EAAE;IACjD,KAAK,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;CACnB,GAAG,OAAO,CAAC,UAAU,CAAC,CAOtB"}
1	+ {"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AA6EzD,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD,OAAO,CAAC,IAAI,CAAC,CAEf;AAED,wBAAsB,eAAe,CACnC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,EACpD,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,GACpC,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAOD,eAAO,MAAM,cAAc,EAAE,OAAO,eAMR,CAAC;AAE7B,eAAO,MAAM,oBAAoB,EAAE,OAAO,qBAMR,CAAC;AAEnC,wBAAsB,oBAAoB,CAAC,MAAM,EAAE;IACjD,KAAK,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;CACnB,GAAG,OAAO,CAAC,UAAU,CAAC,CAOtB"}

package/dist/__tests__/e2e/_save-audio.js CHANGED Viewed

@@ -49,12 +49,6 @@ function currentTestContext() {
         testPath: state.testPath,
     };
 }
-/**
- * Derives the subdirectory for a given test file. e2e tests are named like
- * `openai.e2e.test.ts` / `conversation-google.e2e.test.ts`; we strip the
- * `.e2e.test.ts` suffix and use that as the per-provider bucket so a full run
- * doesn't dump 100+ files into a single flat directory.
- */
 function providerBucket(testPath) {
     if (!testPath) {
         return "unknown";
@@ -62,18 +56,7 @@ function providerBucket(testPath) {
     const base = basename(testPath).replace(E2E_TEST_SUFFIX, "");
     return slugify(base) || "unknown";
 }
-// Counter keyed by `${bucket}/${slug}` so multiple generate/stream calls
-// within a single test don't overwrite each other. Vitest isolates modules
-// per file, so this resets per test file — collisions are only meaningful
-// within the same `it`.
 const callCounts = new Map();
-/**
- * Reserves a filename stem (without extension) for the next save call.
- * First call returns `slug`; subsequent calls return `slug-2`, `slug-3`, etc.
- * A single stem is shared across all sibling outputs from one logical save
- * (audio + timestamps + captions), so they remain paired even across
- * multiple saves within the same test.
- */
 function nextStem(bucket, slug) {
     const key = `${bucket}/${slug}`;
     const n = (callCounts.get(key) ?? 0) + 1;
@@ -84,34 +67,9 @@ async function writeAndLog(file, data) {
     await writeFile(file, data);
     console.log(`[e2e-save] wrote ${file}`);
 }
-/**
- * Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
- * var is set. No-op otherwise, so normal CI runs don't produce artifacts.
- * Usually you don't need to call this directly — use the `generateSpeech`,
- * `generateConversation`, and `collectStreamAndSave` helpers exported from
- * this module, which autosave using the current test name.
- *
- * Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
- * If the same test saves multiple times, subsequent files are suffixed `-2`,
- * `-3`, etc.
- */
 export async function maybeSaveAudio(name, audio) {
     await maybeSaveResult(name, audio);
 }
-/**
- * Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
- * writes the raw alignment JSON and rendered SRT/VTT caption files alongside
- * the audio. All four files share the same stem so they stay paired across
- * multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
- *
- * Output layout (when timestamps present):
- * ```
- * <dir>/<bucket>/<slug>.<audio-ext>
- * <dir>/<bucket>/<slug>.timestamps.json
- * <dir>/<bucket>/<slug>.srt
- * <dir>/<bucket>/<slug>.vtt
- * ```
- */
 export async function maybeSaveResult(name, audio, timestamps) {
     const dir = resolveOutputDir();
     if (!dir) {
@@ -133,33 +91,16 @@ function currentTestSlug() {
     const { currentTestName } = currentTestContext();
     return slugify(currentTestName ?? "unnamed") || "unnamed";
 }
-/**
- * Drop-in replacement for `generateSpeech` that autosaves to
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
- * result includes word timestamps, also writes paired `.timestamps.json`,
- * `.srt`, and `.vtt` files.
- */
 export const generateSpeech = (async (options) => {
     const result = await _generateSpeech(options);
     await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
     return result;
 });
-/**
- * Drop-in replacement for `generateConversation` that autosaves to
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
- * result includes word timestamps, also writes paired `.timestamps.json`,
- * `.srt`, and `.vtt` files.
- */
 export const generateConversation = (async (options) => {
     const result = await _generateConversation(options);
     await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
     return result;
 });
-/**
- * Collects a streamed `streamSpeech` result into bytes AND autosaves them to
- * `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
- * of `collectStream(result.audio)` in e2e tests.
- */
 export async function collectStreamAndSave(result) {
     const bytes = await collectStream(result.audio);
     await maybeSaveAudio(currentTestSlug(), {