npm - @speech-sdk/core - Versions diffs - 0.6.1 → 0.7.0 - Mend

@speech-sdk/core 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

package/LICENSE +202 -21
package/README.md +215 -269
package/dist/__tests__/e2e/_save-audio.d.ts +51 -2
package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
package/dist/__tests__/e2e/_save-audio.js +139 -11
package/dist/__tests__/e2e/_save-audio.js.map +1 -1
package/dist/audio-utils.d.ts +2 -0
package/dist/audio-utils.d.ts.map +1 -1
package/dist/audio-utils.js +9 -0
package/dist/audio-utils.js.map +1 -1
package/dist/captions.d.ts +137 -0
package/dist/captions.d.ts.map +1 -0
package/dist/captions.js +283 -0
package/dist/captions.js.map +1 -0
package/dist/conversation/stitch.d.ts +5 -0
package/dist/conversation/stitch.d.ts.map +1 -1
package/dist/conversation/stitch.js +37 -0
package/dist/conversation/stitch.js.map +1 -1
package/dist/conversation/types.d.ts +16 -0
package/dist/conversation/types.d.ts.map +1 -1
package/dist/conversation/validate.d.ts.map +1 -1
package/dist/conversation/validate.js +0 -6
package/dist/conversation/validate.js.map +1 -1
package/dist/derive-timestamps.d.ts +14 -0
package/dist/derive-timestamps.d.ts.map +1 -0
package/dist/derive-timestamps.js +38 -0
package/dist/derive-timestamps.js.map +1 -0
package/dist/errors.d.ts +25 -0
package/dist/errors.d.ts.map +1 -1
package/dist/errors.js +28 -0
package/dist/errors.js.map +1 -1
package/dist/generate-conversation.d.ts +2 -1
package/dist/generate-conversation.d.ts.map +1 -1
package/dist/generate-conversation.js +72 -0
package/dist/generate-conversation.js.map +1 -1
package/dist/generate-speech.d.ts +18 -1
package/dist/generate-speech.d.ts.map +1 -1
package/dist/generate-speech.js +73 -16
package/dist/generate-speech.js.map +1 -1
package/dist/index.d.ts +6 -2
package/dist/index.d.ts.map +1 -1
package/dist/index.js +2 -1
package/dist/index.js.map +1 -1
package/dist/logger.d.ts +2 -0
package/dist/logger.d.ts.map +1 -0
package/dist/logger.js +40 -0
package/dist/logger.js.map +1 -0
package/dist/provider-utils.d.ts +8 -0
package/dist/provider-utils.d.ts.map +1 -1
package/dist/provider-utils.js +16 -2
package/dist/provider-utils.js.map +1 -1
package/dist/providers/cartesia/alignment.d.ts +24 -0
package/dist/providers/cartesia/alignment.d.ts.map +1 -0
package/dist/providers/cartesia/alignment.js +23 -0
package/dist/providers/cartesia/alignment.js.map +1 -0
package/dist/providers/cartesia/index.d.ts +12 -2
package/dist/providers/cartesia/index.d.ts.map +1 -1
package/dist/providers/cartesia/index.js +137 -2
package/dist/providers/cartesia/index.js.map +1 -1
package/dist/providers/elevenlabs/alignment.d.ts +24 -0
package/dist/providers/elevenlabs/alignment.d.ts.map +1 -0
package/dist/providers/elevenlabs/alignment.js +48 -0
package/dist/providers/elevenlabs/alignment.js.map +1 -0
package/dist/providers/elevenlabs/index.d.ts +19 -4
package/dist/providers/elevenlabs/index.d.ts.map +1 -1
package/dist/providers/elevenlabs/index.js +83 -13
package/dist/providers/elevenlabs/index.js.map +1 -1
package/dist/providers/fal/index.d.ts +0 -25
package/dist/providers/fal/index.d.ts.map +1 -1
package/dist/providers/fal/index.js +3 -58
package/dist/providers/fal/index.js.map +1 -1
package/dist/providers/hume/alignment.d.ts +38 -0
package/dist/providers/hume/alignment.d.ts.map +1 -0
package/dist/providers/hume/alignment.js +31 -0
package/dist/providers/hume/alignment.js.map +1 -0
package/dist/providers/hume/index.d.ts +8 -1
package/dist/providers/hume/index.d.ts.map +1 -1
package/dist/providers/hume/index.js +75 -1
package/dist/providers/hume/index.js.map +1 -1
package/dist/providers/inworld/alignment.d.ts +25 -0
package/dist/providers/inworld/alignment.d.ts.map +1 -0
package/dist/providers/inworld/alignment.js +23 -0
package/dist/providers/inworld/alignment.js.map +1 -0
package/dist/providers/inworld/index.d.ts +11 -2
package/dist/providers/inworld/index.d.ts.map +1 -1
package/dist/providers/inworld/index.js +11 -2
package/dist/providers/inworld/index.js.map +1 -1
package/dist/providers/murf/alignment.d.ts +22 -0
package/dist/providers/murf/alignment.d.ts.map +1 -0
package/dist/providers/murf/alignment.js +17 -0
package/dist/providers/murf/alignment.js.map +1 -0
package/dist/providers/murf/index.d.ts +8 -1
package/dist/providers/murf/index.d.ts.map +1 -1
package/dist/providers/murf/index.js +10 -1
package/dist/providers/murf/index.js.map +1 -1
package/dist/providers/openai/index.d.ts +12 -3
package/dist/providers/openai/index.d.ts.map +1 -1
package/dist/providers/openai/index.js +7 -3
package/dist/providers/openai/index.js.map +1 -1
package/dist/providers/resemble/alignment.d.ts +32 -0
package/dist/providers/resemble/alignment.d.ts.map +1 -0
package/dist/providers/resemble/alignment.js +57 -0
package/dist/providers/resemble/alignment.js.map +1 -0
package/dist/providers/resemble/index.d.ts +7 -1
package/dist/providers/resemble/index.d.ts.map +1 -1
package/dist/providers/resemble/index.js +13 -1
package/dist/providers/resemble/index.js.map +1 -1
package/dist/resolve-provider.d.ts.map +1 -1
package/dist/resolve-provider.js +3 -12
package/dist/resolve-provider.js.map +1 -1
package/dist/speech-provider.d.ts +48 -4
package/dist/speech-provider.d.ts.map +1 -1
package/dist/speech-provider.js +16 -0
package/dist/speech-provider.js.map +1 -1
package/dist/speech-result.d.ts +10 -0
package/dist/speech-result.d.ts.map +1 -1
package/dist/speech-result.js.map +1 -1
package/dist/speech-to-text-provider.d.ts +40 -0
package/dist/speech-to-text-provider.d.ts.map +1 -0
package/dist/speech-to-text-provider.js +2 -0
package/dist/speech-to-text-provider.js.map +1 -0
package/dist/stt-providers/openai/index.d.ts +42 -0
package/dist/stt-providers/openai/index.d.ts.map +1 -0
package/dist/stt-providers/openai/index.js +184 -0
package/dist/stt-providers/openai/index.js.map +1 -0
package/dist/timestamps.d.ts +23 -0
package/dist/timestamps.d.ts.map +1 -0
package/dist/timestamps.js +2 -0
package/dist/timestamps.js.map +1 -0
package/package.json +6 -2

package/README.md CHANGED Viewed

@@ -4,28 +4,38 @@
 [![npm downloads](https://img.shields.io/npm/dm/@speech-sdk/core)](https://www.npmjs.com/package/@speech-sdk/core)
 [![license](https://img.shields.io/npm/l/@speech-sdk/core)](https://github.com/Jellypod-Inc/speech-sdk/blob/main/LICENSE)
-The Speech SDK is a lightweight, provider-agnostic TypeScript toolkit designed to help build text-to-speech powered applications using popular providers like OpenAI, ElevenLabs, Deepgram, Cartesia, Google, and more. Cross-platform (Node.js, Edge, Browser) with minimal dependencies.
+A lightweight, provider-agnostic TypeScript SDK for text-to-speech. One API, 13 providers, zero lock-in. Runs in Node.js, Edge runtimes, and the browser.
-To learn more about the Speech SDK, check out [https://speechsdk.dev/](https://speechsdk.dev/).
+<img width="1200" height="630" alt="Speech SDK" src="https://github.com/user-attachments/assets/b90c0235-9405-4939-bffa-75fc82be5afb" />
-<img width="1200" height="630" alt="og-3" src="https://github.com/user-attachments/assets/b90c0235-9405-4939-bffa-75fc82be5afb" />
+Learn more at [speechsdk.dev](https://speechsdk.dev/).
+## Features
-## Install
+- **Universal** — `generateSpeech()` works across OpenAI, ElevenLabs, Deepgram, Cartesia, Hume, Google Gemini TTS, Fish Audio, Inworld, Murf, Resemble, fal, Mistral, and xAI.
+- **Streaming** — `streamSpeech()` returns a standard `ReadableStream<Uint8Array>`.
+- **Conversations** — `generateConversation()` produces multi-speaker audio, using native dialogue endpoints when available and stitching locally when not.
+- **Word-level timestamps** — `timestamps: "on"` returns alignment, using the provider's native data or falling back to STT.
+- **Volume normalization** — RMS-normalizes output to an absolute loudness target.
+- **Audio tags & voice cloning** — `[laugh]`, `[sigh]`, emotion cues; reference-audio cloning where supported.
-```bash
-npm install @speech-sdk/core
-```
+## Contents
-### Using an AI Coding Assistant?
+- [Install](#install) · [Quick start](#quick-start) · [Supported providers](#supported-providers)
+- [Streaming](#streaming) · [Conversations](#conversations) · [Timestamps](#timestamps)
+- [Volume normalization](#volume-normalization) · [Audio tags](#audio-tags) · [Voice cloning](#voice-cloning)
+- [Custom configuration](#custom-configuration) · [API reference](#api-reference) · [Error handling](#error-handling) · [Development](#development)
-Add the speech-sdk skill to give your AI assistant full knowledge of this library:
+## Install
 ```bash
-npx skills add Jellypod-Inc/speech-sdk --skill speech-sdk
+npm install @speech-sdk/core
 ```
-## Quick Start
+> [!TIP]
+> Using an AI coding assistant? Add the speech-sdk skill to give it full knowledge of this library: `npx skills add Jellypod-Inc/speech-sdk --skill speech-sdk`.
+## Quick start
 ```ts
 import { generateSpeech } from '@speech-sdk/core';
@@ -36,383 +46,319 @@ const result = await generateSpeech({
   voice: 'alloy',
 });
-// Access the audio
 result.audio.uint8Array;  // Uint8Array
-result.audio.base64;      // string (lazy-computed)
+result.audio.base64;      // string (lazy)
 result.audio.mediaType;   // "audio/mpeg"
 ```
-### Volume normalization
-Pass `volumeDbfs` to RMS-normalize the output to an absolute target loudness (must be ≤ 0; lower is quieter; -20 is the broadcast/podcast voice convention with ~20 dB of peak headroom):
+Pass a `provider/model` string, or just the provider name to use its default model. API keys are read from env vars automatically.
-```ts
-const result = await generateSpeech({
-  model: 'openai/gpt-4o-mini-tts',
-  text: 'Hello from speech-sdk!',
-  voice: 'alloy',
-  volumeDbfs: -20,
-});
+## Supported providers
-result.audio.mediaType;   // "audio/wav" — re-encoded after normalization
-```
+| Provider | Prefix | Default model | Env var |
+|---|---|---|---|
+| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `gpt-4o-mini-tts` | `OPENAI_API_KEY` |
+| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `eleven_multilingual_v2` | `ELEVENLABS_API_KEY` |
+| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `aura-2` | `DEEPGRAM_API_KEY` |
+| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `sonic-3` | `CARTESIA_API_KEY` |
+| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `octave-2` | `HUME_API_KEY` |
+| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `inworld-tts-1.5-max` | `INWORLD_API_KEY` |
+| [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `gemini-2.5-flash-preview-tts` | `GOOGLE_API_KEY` |
+| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `s2-pro` | `FISH_AUDIO_API_KEY` |
+| [Murf](https://murf.ai/api/docs) | `murf` | `GEN2` | `MURF_API_KEY` |
+| [Resemble](https://docs.resemble.ai) | `resemble` | `default` | `RESEMBLE_API_KEY` |
+| [fal](https://fal.ai/models) | `fal-ai` | *(user-specified)* | `FAL_API_KEY` |
+| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `voxtral-mini-tts-2603` | `MISTRAL_API_KEY` |
+| [xAI](https://docs.x.ai/docs/models) | `xai` | `grok-tts` | `XAI_API_KEY` |
-When `volumeDbfs` is set the SDK transparently asks the provider for its decodable PCM/WAV mode, normalizes the samples, and returns 16-bit mono WAV — so the response `mediaType` switches to `audio/wav` regardless of the provider's native default. Throws `VolumeAdjustmentUnsupportedError` if the provider has no decodable output mode.
+Provider-specific parameters pass through via `providerOptions` using each API's native field names.
 ## Streaming
-Use `streamSpeech()` instead of `generateSpeech()` to receive audio bytes incrementally as the provider produces them. The result's `audio` field is a standard `ReadableStream<Uint8Array>` that works in Node, Edge runtimes, and browsers.
+`streamSpeech()` returns audio incrementally as a `ReadableStream<Uint8Array>`.
 ```ts
-import { streamSpeech } from "@speech-sdk/core";
+import { streamSpeech } from '@speech-sdk/core';
 const { audio, mediaType } = await streamSpeech({
-  model: "openai/tts-1",
-  text: "Hello from the speech SDK!",
-  voice: "alloy",
-});
-```
-### Pipe to a file (Node)
-```ts
-import { createWriteStream } from "node:fs";
-import { Readable } from "node:stream";
-const { audio } = await streamSpeech({
-  model: "elevenlabs/eleven_flash_v2_5",
-  text: "Hello world",
-  voice: "JBFqnCBsd6RMkjVDRZzb",
-});
-await new Promise((resolve, reject) => {
-  Readable.fromWeb(audio).pipe(createWriteStream("out.mp3")).on("finish", resolve).on("error", reject);
+  model: 'cartesia/sonic-3',
+  text: 'Streaming straight to the client.',
+  voice: 'voice-id',
 });
-```
-### Forward to an HTTP response (Edge / Workers / Next.js Route Handler)
-```ts
-export async function GET() {
-  const { audio, mediaType } = await streamSpeech({
-    model: "cartesia/sonic-3",
-    text: "Streaming straight to the client.",
-    voice: "voice-id",
-  });
-  return new Response(audio, { headers: { "Content-Type": mediaType } });
-}
-```
-### Read chunks manually
-```ts
-const reader = audio.getReader();
-while (true) {
-  const { value, done } = await reader.read();
-  if (done) break;
-  // value is a Uint8Array of audio bytes
-}
-```
-### Capability check
-Check whether a model supports streaming before calling `streamSpeech()`:
-```ts
-import { hasFeature } from "@speech-sdk/core";
-const model = provider.models.find((m) => m.id === "tts-1");
-if (hasFeature(model, "streaming")) {
-  // safe to call streamSpeech()
-}
+// Forward to an HTTP response:
+return new Response(audio, { headers: { 'Content-Type': mediaType } });
 ```
-Calling `streamSpeech()` on a model that doesn't declare the `"streaming"` feature throws `StreamingNotSupportedError`.
-### Errors and retries
-Retries apply only to the initial request, until response headers arrive. Once bytes start flowing, mid-stream errors propagate to the `ReadableStream` consumer as a stream error and are not retried. Pass `maxRetries` (default `2`) and an `abortSignal` the same way as `generateSpeech()`.
+> [!NOTE]
+> Retries apply only until response headers arrive; mid-stream errors propagate to the consumer. Calling `streamSpeech()` on a non-streaming model throws `StreamingNotSupportedError`.
 ## Conversations
-`generateConversation()` produces a single multi-voice audio clip from an ordered array of turns. It picks the best path automatically:
+`generateConversation()` produces a single multi-voice clip from an ordered array of turns, picking the best path automatically:
-- **Native dialogue** — when every turn shares one model and that provider has a real multi-speaker dialogue endpoint, the SDK makes a single API call and returns the provider's natural mix. Works with **ElevenLabs v3**, **Google Gemini TTS** (exactly 2 voices), **Hume Octave**, **Fish Audio S2-Pro**, and **fal Dia**.
-- **Stitch fallback** — when turns span multiple providers, or the chosen model has no native dialogue endpoint, the SDK calls `generateSpeech()` per turn in parallel, normalizes each result to PCM, RMS-levels them so quieter providers don't get drowned out, inserts a configurable silence between turns, and returns a single WAV.
+- **Native dialogue** — one provider with a multi-speaker endpoint (ElevenLabs v3, Gemini TTS, Hume Octave, Fish Audio S2-Pro, fal Dia). One API call, natural mix.
+- **Stitch fallback** — multi-provider or no dialogue endpoint. Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
 ```ts
-import { generateConversation } from "@speech-sdk/core/conversation";
+import { generateConversation } from '@speech-sdk/core/conversation';
 const result = await generateConversation({
   turns: [
-    { model: "openai/tts-1", voice: "nova", text: "Hi, I'm hosted by OpenAI." },
-    { model: "elevenlabs/eleven_multilingual_v2", voice: "JBFqnCBsd6RMkjVDRZzb", text: "And I'm hosted by ElevenLabs." },
-    { model: "google/gemini-3.1-flash-tts-preview", voice: "Kore", text: "I'm Gemini three-point-one flash TTS." },
-    { model: "hume/octave-2", voice: "Kora", text: "And I'm Hume Octave. Thanks for listening." },
+    { model: 'openai/tts-1',                     voice: 'nova',                 text: "Hi, I'm hosted by OpenAI." },
+    { model: 'elevenlabs/eleven_multilingual_v2', voice: 'JBFqnCBsd6RMkjVDRZzb', text: "And I'm hosted by ElevenLabs." },
+    { model: 'hume/octave-2',                    voice: 'Kora',                 text: "I'm Hume Octave. Thanks for listening." },
   ],
 });
-result.audio.uint8Array;  // Uint8Array of one combined WAV
-result.audio.mediaType;   // "audio/wav"
 ```
-The return type is the standard `SpeechResult`, so it composes with everything else in the SDK.
+Options: `gapMs` (default 300), `normalizeVolume` (default `true`), `volumeDbfs` (default `-20`), `maxConcurrency` (default 6), `maxRetries` (default 2), `timestamps`, `timestampProvider`, `apiKey`, `providerOptions`, `abortSignal`, `headers`. Per-turn overrides: `model`, `providerOptions` (stitch path only — throws `ConversationInputError` on native).
+**Native dialogue caps:**
+| Provider | Models | Voice constraints |
+|---|---|---|
+| ElevenLabs | `eleven_v3` | 1–10 voices, ≤ 2,000 chars |
+| Google | `gemini-2.5-{flash,pro}-preview-tts`, `gemini-3.1-flash-tts-preview` | **Exactly 2 voices** |
+| Hume | `octave-1`, `octave-2` | 1–4 voices |
+| Fish Audio | `s2-pro` | 1–4 voices |
+## Timestamps
-### Conversation options
+Pass `timestamps` to get word-level alignment. Timings are in seconds from the start of the audio.
 ```ts
-generateConversation({
-  model?: string | ResolvedModel,                 // default model for all turns
-  turns: ConversationTurn[],                      // 1..N turns; up to 4 unique voices
-  gapMs?: number,                                 // silence between turns (stitch path), default 300
-  normalizeVolume?: boolean,                      // RMS-level the output, default true
-  volumeDbfs?: number,                            // RMS target loudness in dBFS (≤0), default -20
-  maxConcurrency?: number,                        // cap parallel generateSpeech calls, default 6
-  maxRetries?: number,                            // per-turn retries, default 2
-  apiKey?: string,
-  providerOptions?: Record<string, unknown>,      // forwarded to every provider; per-turn override available
-  abortSignal?: AbortSignal,
-  headers?: Record<string, string>,
+const result = await generateSpeech({
+  model: 'elevenlabs/eleven_multilingual_v2',
+  text: 'Hello from speech-sdk!',
+  voice: 'JBFqnCBsd6RMkjVDRZzb',
+  timestamps: 'on',
 });
-interface ConversationTurn {
-  voice: Voice;                                   // required
-  text: string;                                   // required, non-empty
-  model?: string | ResolvedModel;                 // per-turn override of the top-level model
-  providerOptions?: Record<string, unknown>,
-}
+result.timestamps;
+// [
+//   { text: "Hello",  start: 0.00, end: 0.32 },
+//   { text: "from",   start: 0.36, end: 0.55 },
+//   ...
+// ]
 ```
-### Volume normalization
+| Mode | Behavior |
+|---|---|
+| `"auto"` *(default)* | Return timestamps only if the provider supplies them natively. Free. |
+| `"on"` | Always return timestamps. Uses native alignment when available; otherwise transcribes the audio via STT (extra cost + latency). |
+| `"off"` | Never return timestamps. |
-`normalizeVolume: true` (the default) RMS-normalizes the output to an absolute target loudness — broadcast/podcast voice convention — so two `generateConversation` calls produce comparable levels regardless of provider mix or content. The target defaults to **−20 dBFS** (~20 dB of peak headroom), and is configurable via `volumeDbfs` (must be ≤ 0; lower is quieter).
+With `"on"`, the STT fallback defaults to OpenAI Whisper (`openai/whisper-1`, needs `OPENAI_API_KEY`). To override it, construct a `ResolvedSTTModel` via a factory and pass it as `timestampProvider`:
 ```ts
-await generateConversation({
-  turns: [...],
-  volumeDbfs: -16,           // a touch louder than the default
+import { createOpenAISTT } from '@speech-sdk/core/stt/openai';
+await generateSpeech({
+  model: 'cartesia/sonic-3',
+  text: 'Hello!',
+  voice: 'voice-id',
+  timestamps: 'on',
+  timestampProvider: createOpenAISTT({ apiKey: process.env.MY_WHISPER_KEY })('whisper-1'),
 });
 ```
-Normalization runs on **both paths** — stitched multi-provider conversations and single-provider native dialogue. On the native path the SDK transparently asks the provider for its decodable PCM/WAV mode (via `getStitchOptions`), levels the result, and re-encodes as 16-bit mono WAV — so the response `mediaType` becomes `audio/wav` whenever normalization runs. If a native dialogue provider can't emit decodable audio, the request still succeeds but a `warning` is appended explaining that volume normalization was skipped.
+**Per-provider support:**
-Pass `normalizeVolume: false` to skip normalization entirely (zero work) and keep the raw provider audio bytes and `mediaType` untouched.
+| Provider | Timestamps |
+|---|---|
+| ElevenLabs (`eleven_v3`, `eleven_multilingual_v2`, `eleven_flash_v2`, `eleven_flash_v2_5`) | **Native** — returned in the TTS response, free on `"auto"` |
+| Murf (`GEN2`) | **Native** — `wordDurations` returned in the TTS response, free on `"auto"` (FALCON streaming model has no native alignment) |
+| Hume (`octave-2`) | **Native** — word alignment from the JSON `/v0/tts` endpoint, free on `"auto"` (`octave-1` has no native alignment) |
+| Inworld (`inworld-tts-1.5-max`, `inworld-tts-1.5-mini`) | **Native** — `timestampInfo.wordAlignment` returned in the TTS response, free on `"auto"` (best on English/Spanish) |
+| Cartesia (`sonic-3`, `sonic-2`) | **Native** — routed through `/tts/sse` with `add_timestamps: true`; merges interleaved chunk + timestamps events into audio + `WordTimestamp[]` |
+| Resemble (`default`) | **Native** — `audio_timestamps` always returned by `/synthesize`; SDK aggregates grapheme-level timing into words (mirrors ElevenLabs aggregator) |
+| All others (OpenAI, Deepgram, Google, Fish Audio, fal, Mistral, xAI) | No native alignment; `"on"` transcribes via the STT fallback, `"auto"` returns `undefined` |
-### Errors
+`generateConversation` accepts the same options and returns a flat `WordTimestamp[]` across all turns — stitch-path timings are offset by cumulative turn duration + gap.
-Conversation-specific errors (importable from `@speech-sdk/core/conversation/errors`):
+### Captions (SRT / WebVTT)
-| Error | When |
-|---|---|
-| `ConversationInputError` | Validation failure — empty turns, blank text, more than 4 unique voices, or a turn missing a model |
-| `DialogueConstraintError` | A native-dialogue provider was selected but the conversation violates its constraints (e.g. 3 voices on Gemini, which requires exactly 2) |
-| `StitchUnsupportedError` | The stitch path was selected but a chosen provider/model can't emit PCM/WAV |
+Convert word-level timestamps into a caption file. SRT is the default; pass `format: 'vtt'` for WebVTT (required for HTML `<track>`).
-### Native dialogue caps
+```ts
+import { generateSpeech, timestampsToCaptions } from '@speech-sdk/core';
-| Provider | Native dialogue model | Voice constraints |
-|---|---|---|
-| ElevenLabs | `eleven_v3` | 1–10 voices, ≤ 2,000 total chars |
-| Google | `gemini-2.5-flash-preview-tts`, `gemini-2.5-pro-preview-tts`, `gemini-3.1-flash-tts-preview` | **Exactly 2 voices** (API requirement) |
-| Hume | `octave-1`, `octave-2` | 1–4 voices |
-| Fish Audio | `s2-pro` | 1–4 voices |
-| fal | `dia-tts` | 1–2 voices |
-Across the SDK, conversations are capped at **4 unique voices** total regardless of provider.
-## Supported Providers
-Use `provider/model` strings. Passing just the provider name uses its default model.
-| Provider | String Prefix | Default Model | Env Var | Docs |
-|---|---|---|---|---|
-| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `gpt-4o-mini-tts` | `OPENAI_API_KEY` | [API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech) |
-| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `eleven_multilingual_v2` | `ELEVENLABS_API_KEY` | [API Reference](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) |
-| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `aura-2` | `DEEPGRAM_API_KEY` | [API Reference](https://developers.deepgram.com/docs/tts-models) |
-| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `sonic-3` | `CARTESIA_API_KEY` | [API Reference](https://docs.cartesia.ai/api-reference/tts/bytes) |
-| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `octave-2` | `HUME_API_KEY` | [API Reference](https://dev.hume.ai/reference/text-to-speech-tts/synthesize-json) |
-| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `inworld-tts-1.5-max` | `INWORLD_API_KEY` | [API Reference](https://docs.inworld.ai/tts/api-reference) |
-| [Google (Gemini TTS)](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `gemini-2.5-flash-preview-tts` | `GOOGLE_API_KEY` | [API Reference](https://ai.google.dev/gemini-api/docs/text-generation) |
-| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `s2-pro` | `FISH_AUDIO_API_KEY` | [API Reference](https://docs.fish.audio/developer-guide/core-features/text-to-speech) |
-| [Murf](https://murf.ai/api/docs) | `murf` | `GEN2` | `MURF_API_KEY` | [API Reference](https://murf.ai/api/docs/api-reference/text-to-speech/generate) |
-| [Resemble](https://docs.resemble.ai) | `resemble` | `default` | `RESEMBLE_API_KEY` | [API Reference](https://docs.resemble.ai/api-reference/text-to-speech/synthesize) |
-| [fal](https://fal.ai/models) | `fal-ai` | *(user-specified)* | `FAL_API_KEY` | [API Reference](https://fal.ai/models) |
-| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `voxtral-mini-tts-2603` | `MISTRAL_API_KEY` | [API Reference](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) |
-| [xAI](https://docs.x.ai/docs/models) | `xai` | `grok-tts` | `XAI_API_KEY` | [API Reference](https://docs.x.ai/docs/api-reference#text-to-speech) |
+const { timestamps } = await generateSpeech({
+  model: 'elevenlabs/eleven_v3',
+  text: 'Hello world. This is a test.',
+  voice: 'JBFqnCBsd6RMkjVDRZzb',
+  timestamps: 'on',
+});
-```ts
-generateSpeech({ model: 'openai/tts-1', text: '...', voice: 'alloy' });
-generateSpeech({ model: 'elevenlabs/eleven_v3', text: '...', voice: 'voice-id' });
-generateSpeech({ model: 'deepgram/aura-2', text: '...', voice: 'thalia-en' });
-generateSpeech({ model: 'inworld/inworld-tts-1.5-max', text: '...', voice: 'Ashley' });
-generateSpeech({ model: 'openai', text: '...', voice: 'alloy' });  // uses default model
+const srt = timestampsToCaptions(timestamps ?? []);
+// 1
+// 00:00:00,000 --> 00:00:01,200
+// Hello world.
+//
+// 2
+// 00:00:01,300 --> 00:00:02,800
+// This is a test.
+const vtt = timestampsToCaptions(timestamps ?? [], { format: 'vtt' });
+// WEBVTT
+//
+// 1
+// 00:00:00.000 --> 00:00:01.200
+// Hello world.
+//
+// 2
+// 00:00:01.300 --> 00:00:02.800
+// This is a test.
 ```
-Provider-specific API parameters can be passed via `providerOptions` — these are sent directly to the provider's API using the API's own field names.
-## Custom Configuration
+Output follows the SubRip and [W3C WebVTT](https://www.w3.org/TR/webvtt1/) conventions: comma-decimal (SRT) vs period-decimal (VTT) timestamps, sequential numeric cue IDs, blank-line cue separators with a trailing blank line, and HTML-escaped body text (`&`, `<`, `>`) on the VTT path.
-Use factory functions when you need custom API keys, base URLs, or fetch implementations:
+Cues break on sentence boundaries (`.`, `!`, `?`), then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
-```ts
-import { generateSpeech } from '@speech-sdk/core';
-import { createOpenAI } from '@speech-sdk/core/openai';
-import { createElevenLabs } from '@speech-sdk/core/elevenlabs';
+## Volume normalization
-const myOpenAI = createOpenAI({
-  apiKey: 'sk-...',
-  baseURL: 'https://my-proxy.com/v1',
-});
+Pass `volumeDbfs` to RMS-normalize to an absolute target loudness (must be ≤ 0; `-20` is the broadcast/podcast convention).
+```ts
 const result = await generateSpeech({
-  model: myOpenAI('gpt-4o-mini-tts'),
+  model: 'openai/gpt-4o-mini-tts',
   text: 'Hello!',
   voice: 'alloy',
+  volumeDbfs: -20,
 });
-```
-### API Key Resolution
+result.audio.mediaType;  // "audio/wav" — re-encoded after normalization
+```
-When using string models (e.g., `'openai/tts-1'`), API keys are resolved from environment variables (see table above). Factory functions accept an explicit `apiKey` option which takes precedence.
+`generateConversation` normalizes by default; pass `normalizeVolume: false` to skip. `generateSpeech` throws `VolumeAdjustmentUnsupportedError` if `volumeDbfs` is set and the provider has no decodable PCM/WAV mode.
-## Audio Tags
+## Audio tags
-Use bracket syntax `[tag]` to add expressive audio cues like laughter, sighs, or emotions. Provider support varies — unsupported tags are automatically stripped with warnings returned in `result.warnings`.
+Bracket syntax `[tag]` adds expressive cues. Unsupported tags are stripped, and a warning for each is returned in `result.warnings`.
 ```ts
-const result = await generateSpeech({
+await generateSpeech({
   model: 'elevenlabs/eleven_v3',
   text: '[laugh] Oh that is so funny! [sigh] But seriously though.',
   voice: 'voice-id',
 });
-console.log(result.warnings); // undefined — eleven_v3 supports all tags
 ```
-### Provider behavior
 | Provider | Behavior |
 |---|---|
-| OpenAI (`gpt-4o-mini-tts`) | Tags mapped to the `instructions` field for expressive delivery control |
-| ElevenLabs (`eleven_v3`) | All `[tag]` passed through natively |
-| Google (`gemini-3.1-flash-tts-preview`) | All `[tag]` passed through natively (e.g. `[whispers]`, `[shouting]`, `[sighs]`, `[laugh]`) |
-| Cartesia (`sonic-3`) | Emotion tags (`[happy]`, `[sad]`, `[angry]`, etc.) converted to SSML; `[laughter]` passed through; unknown tags stripped |
-| All others | Tags stripped and warnings returned |
-```ts
-// OpenAI gpt-4o-mini-tts — tags are mapped to the `instructions` field
-const result = await generateSpeech({
-  model: 'openai/gpt-4o-mini-tts',
-  text: '[cheerfully] Hi John how are you? [soft] I\'m feeling great',
-  voice: 'alloy',
-});
-// Sent to OpenAI:
-//   input: "Hi John how are you? I'm feeling great"
-//   instructions: "Delivery shifts through the text in order: begin cheerfully, then soft."
-console.log(result.warnings); // undefined
-```
+| OpenAI (`gpt-4o-mini-tts`) | Mapped to the `instructions` field |
+| ElevenLabs (`eleven_v3`) | Passed through natively |
+| Google (`gemini-3.1-flash-tts-preview`) | Passed through natively |
+| Cartesia (`sonic-3`) | Emotion tags → SSML; `[laughter]` passed through; unknown stripped |
+| All others | Stripped with warnings |
-## Voice Cloning
+## Voice cloning
-Some providers support voice cloning via reference audio. Pass a voice object instead of a string:
+Some providers support reference-audio cloning. Pass a voice object instead of a string.
 ```ts
 import { createMistral } from '@speech-sdk/core/mistral';
+import { createFal } from '@speech-sdk/core/fal-ai';
-const mistral = createMistral();
-// Clone from base64 audio
-const result = await generateSpeech({
-  model: mistral(),
+// Base64 reference:
+await generateSpeech({
+  model: createMistral()(),
   text: 'Hello!',
   voice: { audio: 'base64-encoded-audio...' },
 });
-```
-Clone from a URL (fal):
-```ts
-import { createFal } from '@speech-sdk/core/fal-ai';
-const fal = createFal();
-const result = await generateSpeech({
-  model: fal('fal-ai/chatterbox'),
+// URL reference:
+await generateSpeech({
+  model: createFal()('fal-ai/f5-tts'),
   text: 'Hello!',
   voice: { url: 'https://example.com/reference.wav' },
 });
 ```
-## Options
+## Custom configuration
+Factory functions let you supply a custom API key, base URL, or `fetch` implementation:
 ```ts
-generateSpeech({
-  model: string | ResolvedModel,  // required
-  text: string,                   // required
-  voice: Voice,                   // required
-  providerOptions?: object,       // provider-specific API params
-  maxRetries?: number,            // default: 2 (retries on 5xx/network errors)
-  abortSignal?: AbortSignal,      // cancel the request
-  headers?: Record<string, string>, // additional HTTP headers
+import { generateSpeech } from '@speech-sdk/core';
+import { createOpenAI } from '@speech-sdk/core/openai';
+const myOpenAI = createOpenAI({
+  apiKey: 'sk-...',
+  baseURL: 'https://my-proxy.com/v1',
+});
+await generateSpeech({
+  model: myOpenAI('gpt-4o-mini-tts'),
+  text: 'Hello!',
+  voice: 'alloy',
 });
 ```
-## Result
+## API reference
 ```ts
+generateSpeech({
+  model: string | ResolvedModel,          // required
+  text: string,                           // required
+  voice: Voice,                           // required — string | { url } | { audio }
+  providerOptions?: object,
+  volumeDbfs?: number,                    // ≤ 0
+  timestamps?: "on" | "auto" | "off",     // default "auto"
+  timestampProvider?: ResolvedSTTModel,   // override the STT fallback
+  maxRetries?: number,                    // default 2
+  abortSignal?: AbortSignal,
+  headers?: Record<string, string>,
+}): Promise<SpeechResult>
 interface SpeechResult {
-  audio: {
-    uint8Array: Uint8Array;   // raw audio bytes
-    base64: string;           // base64 encoded (lazy)
-    mediaType: string;        // e.g. "audio/mpeg"
-  };
+  audio: { uint8Array: Uint8Array; base64: string; mediaType: string };
+  metadata: { latencyMs: number; inputChars: number; provider: string; model: string; audioDurationMs?: number; ttfbMs?: number };
+  timestamps?: WordTimestamp[];
   providerMetadata?: Record<string, unknown>;
+  warnings?: string[];
 }
+interface WordTimestamp { text: string; start: number; end: number }  // seconds
 ```
-## Error Handling
+## Error handling
 ```ts
-import { generateSpeech, ApiError, SpeechSDKError } from '@speech-sdk/core';
+import { generateSpeech, ApiError } from '@speech-sdk/core';
 try {
-  const result = await generateSpeech({ ... });
+  await generateSpeech({ /* ... */ });
 } catch (error) {
   if (error instanceof ApiError) {
-    console.log(error.statusCode);  // 401
-    console.log(error.model);       // "openai/gpt-4o-mini-tts"
-    console.log(error.responseBody);
+    error.statusCode;    // 401, 429, 500, ...
+    error.model;         // "openai/gpt-4o-mini-tts"
+    error.responseBody;
   }
 }
 ```
 | Error | When |
 |---|---|
-| `ApiError` | Provider API returns a non-2xx response |
-| `NoSpeechGeneratedError` | Provider returned empty audio |
-| `SpeechSDKError` | Base class for all errors |
+| `ApiError` | Provider returned non-2xx |
+| `NoSpeechGeneratedError` | Empty input (after tag stripping) or empty provider response |
+| `StreamingNotSupportedError` | `streamSpeech()` on a non-streaming model |
+| `VolumeAdjustmentUnsupportedError` | `volumeDbfs` with no decodable output mode |
+| `TimestampKeyMissingError` | `timestamps: "on"` was requested but the key required by the timestamp fallback is missing |
+| `ConversationInputError` / `DialogueConstraintError` / `StitchUnsupportedError` | `generateConversation` failures: input validation / native caps / stitch incompatibility, respectively |
+| `SpeechSDKError` | Base class |
-## Retry
-Built-in retry with exponential backoff via [p-retry](https://github.com/sindresorhus/p-retry). Retries on 5xx and network errors. Does not retry 4xx errors. Default: 2 retries.
+Requests that fail with a 5xx response or a network error are retried with exponential backoff ([p-retry](https://github.com/sindresorhus/p-retry)); 4xx responses are not retried. The default is 2 retries; override it via `maxRetries`.
 ## Development
 ```bash
 pnpm install
-pnpm test                       # unit tests
-pnpm run test:e2e               # e2e tests (requires API keys)
-pnpm run typecheck              # type-check without emitting
+pnpm test              # unit tests
+pnpm run test:e2e      # e2e tests (requires provider API keys)
+pnpm run typecheck
+pnpm fix               # format + lint
 ```
-E2E tests hit real provider APIs. Set the relevant API key environment variables in a `.env` file or export them in your shell.
-Set `SPEECH_SDK_E2E_OUTPUT_DIR` to have the conversation e2e tests write their generated audio to disk (useful for sampling/comparing provider output):
-```bash
-SPEECH_SDK_E2E_OUTPUT_DIR=~/Downloads/convos pnpm run test:e2e
-```
-## License
-MIT
+E2E tests hit real provider APIs. Set the relevant API keys in a `.env` file or export them in your shell. Set `SPEECH_SDK_E2E_OUTPUT_DIR` (e.g. `~/Downloads/convos`) to write the conversation e2e audio to disk.