@speech-sdk/core 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +227 -108
- package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
- package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
- package/dist/__tests__/e2e/_save-audio.js +0 -59
- package/dist/__tests__/e2e/_save-audio.js.map +1 -1
- package/dist/audio-decode.d.ts +7 -0
- package/dist/audio-decode.d.ts.map +1 -0
- package/dist/audio-decode.js +109 -0
- package/dist/audio-decode.js.map +1 -0
- package/dist/audio-duration.d.ts +0 -5
- package/dist/audio-duration.d.ts.map +1 -1
- package/dist/audio-duration.js +5 -21
- package/dist/audio-duration.js.map +1 -1
- package/dist/audio-output.d.ts +39 -0
- package/dist/audio-output.d.ts.map +1 -0
- package/dist/audio-output.js +111 -0
- package/dist/audio-output.js.map +1 -0
- package/dist/audio-utils.d.ts +2 -10
- package/dist/audio-utils.d.ts.map +1 -1
- package/dist/audio-utils.js +57 -15
- package/dist/audio-utils.js.map +1 -1
- package/dist/captions.d.ts +0 -108
- package/dist/captions.d.ts.map +1 -1
- package/dist/captions.js +8 -98
- package/dist/captions.js.map +1 -1
- package/dist/conversation/attribute-timestamps.d.ts +26 -0
- package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
- package/dist/conversation/attribute-timestamps.js +276 -0
- package/dist/conversation/attribute-timestamps.js.map +1 -0
- package/dist/conversation/dispatch.d.ts +5 -5
- package/dist/conversation/dispatch.d.ts.map +1 -1
- package/dist/conversation/dispatch.js +18 -8
- package/dist/conversation/dispatch.js.map +1 -1
- package/dist/conversation/errors.d.ts +3 -0
- package/dist/conversation/errors.d.ts.map +1 -1
- package/dist/conversation/errors.js +6 -0
- package/dist/conversation/errors.js.map +1 -1
- package/dist/conversation/pcm-concat.d.ts +0 -24
- package/dist/conversation/pcm-concat.d.ts.map +1 -1
- package/dist/conversation/pcm-concat.js +8 -183
- package/dist/conversation/pcm-concat.js.map +1 -1
- package/dist/conversation/proportional-fill.d.ts +10 -0
- package/dist/conversation/proportional-fill.d.ts.map +1 -0
- package/dist/conversation/proportional-fill.js +64 -0
- package/dist/conversation/proportional-fill.js.map +1 -0
- package/dist/conversation/silence-detection.d.ts +14 -0
- package/dist/conversation/silence-detection.d.ts.map +1 -0
- package/dist/conversation/silence-detection.js +52 -0
- package/dist/conversation/silence-detection.js.map +1 -0
- package/dist/conversation/stitch.d.ts +9 -6
- package/dist/conversation/stitch.d.ts.map +1 -1
- package/dist/conversation/stitch.js +72 -51
- package/dist/conversation/stitch.js.map +1 -1
- package/dist/conversation/types.d.ts +7 -37
- package/dist/conversation/types.d.ts.map +1 -1
- package/dist/conversation/validate.d.ts +1 -16
- package/dist/conversation/validate.d.ts.map +1 -1
- package/dist/conversation/validate.js +29 -29
- package/dist/conversation/validate.js.map +1 -1
- package/dist/default-stt-fallback.d.ts +3 -0
- package/dist/default-stt-fallback.d.ts.map +1 -0
- package/dist/default-stt-fallback.js +11 -0
- package/dist/default-stt-fallback.js.map +1 -0
- package/dist/derive-timestamps.d.ts +1 -5
- package/dist/derive-timestamps.d.ts.map +1 -1
- package/dist/derive-timestamps.js +1 -15
- package/dist/derive-timestamps.js.map +1 -1
- package/dist/encoders/mp3.d.ts +6 -0
- package/dist/encoders/mp3.d.ts.map +1 -0
- package/dist/encoders/mp3.js +54 -0
- package/dist/encoders/mp3.js.map +1 -0
- package/dist/errors.d.ts +20 -13
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +49 -15
- package/dist/errors.js.map +1 -1
- package/dist/generate-conversation.d.ts +5 -4
- package/dist/generate-conversation.d.ts.map +1 -1
- package/dist/generate-conversation.js +250 -93
- package/dist/generate-conversation.js.map +1 -1
- package/dist/generate-speech.d.ts +7 -28
- package/dist/generate-speech.d.ts.map +1 -1
- package/dist/generate-speech.js +185 -94
- package/dist/generate-speech.js.map +1 -1
- package/dist/index.d.ts +7 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -4
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/logger.js +2 -13
- package/dist/logger.js.map +1 -1
- package/dist/metadata.d.ts +0 -22
- package/dist/metadata.d.ts.map +1 -1
- package/dist/pronunciations/errors.d.ts +5 -0
- package/dist/pronunciations/errors.d.ts.map +1 -0
- package/dist/pronunciations/errors.js +8 -0
- package/dist/pronunciations/errors.js.map +1 -0
- package/dist/pronunciations/inverse-align.d.ts +4 -0
- package/dist/pronunciations/inverse-align.d.ts.map +1 -0
- package/dist/pronunciations/inverse-align.js +54 -0
- package/dist/pronunciations/inverse-align.js.map +1 -0
- package/dist/pronunciations/merge.d.ts +4 -0
- package/dist/pronunciations/merge.d.ts.map +1 -0
- package/dist/pronunciations/merge.js +13 -0
- package/dist/pronunciations/merge.js.map +1 -0
- package/dist/pronunciations/substitute.d.ts +6 -0
- package/dist/pronunciations/substitute.d.ts.map +1 -0
- package/dist/pronunciations/substitute.js +67 -0
- package/dist/pronunciations/substitute.js.map +1 -0
- package/dist/pronunciations/types.d.ts +18 -0
- package/dist/pronunciations/types.d.ts.map +1 -0
- package/dist/pronunciations/types.js +2 -0
- package/dist/pronunciations/types.js.map +1 -0
- package/dist/pronunciations/validate.d.ts +3 -0
- package/dist/pronunciations/validate.d.ts.map +1 -0
- package/dist/pronunciations/validate.js +26 -0
- package/dist/pronunciations/validate.js.map +1 -0
- package/dist/provider-utils.d.ts +4 -9
- package/dist/provider-utils.d.ts.map +1 -1
- package/dist/provider-utils.js +60 -51
- package/dist/provider-utils.js.map +1 -1
- package/dist/providers/cartesia/alignment.d.ts +0 -16
- package/dist/providers/cartesia/alignment.d.ts.map +1 -1
- package/dist/providers/cartesia/alignment.js +1 -6
- package/dist/providers/cartesia/alignment.js.map +1 -1
- package/dist/providers/cartesia/index.d.ts +29 -19
- package/dist/providers/cartesia/index.d.ts.map +1 -1
- package/dist/providers/cartesia/index.js +116 -80
- package/dist/providers/cartesia/index.js.map +1 -1
- package/dist/providers/deepgram/index.d.ts +23 -8
- package/dist/providers/deepgram/index.d.ts.map +1 -1
- package/dist/providers/deepgram/index.js +51 -18
- package/dist/providers/deepgram/index.js.map +1 -1
- package/dist/providers/elevenlabs/alignment.d.ts +7 -21
- package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
- package/dist/providers/elevenlabs/alignment.js +8 -9
- package/dist/providers/elevenlabs/alignment.js.map +1 -1
- package/dist/providers/elevenlabs/index.d.ts +14 -38
- package/dist/providers/elevenlabs/index.d.ts.map +1 -1
- package/dist/providers/elevenlabs/index.js +186 -169
- package/dist/providers/elevenlabs/index.js.map +1 -1
- package/dist/providers/fal/index.d.ts +11 -20
- package/dist/providers/fal/index.d.ts.map +1 -1
- package/dist/providers/fal/index.js +49 -37
- package/dist/providers/fal/index.js.map +1 -1
- package/dist/providers/fish-audio/index.d.ts +14 -8
- package/dist/providers/fish-audio/index.d.ts.map +1 -1
- package/dist/providers/fish-audio/index.js +47 -19
- package/dist/providers/fish-audio/index.js.map +1 -1
- package/dist/providers/gateway/index.d.ts +76 -0
- package/dist/providers/gateway/index.d.ts.map +1 -0
- package/dist/providers/gateway/index.js +251 -0
- package/dist/providers/gateway/index.js.map +1 -0
- package/dist/providers/google/index.d.ts +12 -20
- package/dist/providers/google/index.d.ts.map +1 -1
- package/dist/providers/google/index.js +180 -162
- package/dist/providers/google/index.js.map +1 -1
- package/dist/providers/hume/alignment.d.ts +30 -35
- package/dist/providers/hume/alignment.d.ts.map +1 -1
- package/dist/providers/hume/alignment.js +14 -8
- package/dist/providers/hume/alignment.js.map +1 -1
- package/dist/providers/hume/index.d.ts +16 -16
- package/dist/providers/hume/index.d.ts.map +1 -1
- package/dist/providers/hume/index.js +79 -65
- package/dist/providers/hume/index.js.map +1 -1
- package/dist/providers/inworld/alignment.d.ts +8 -22
- package/dist/providers/inworld/alignment.d.ts.map +1 -1
- package/dist/providers/inworld/alignment.js +9 -8
- package/dist/providers/inworld/alignment.js.map +1 -1
- package/dist/providers/inworld/index.d.ts +17 -20
- package/dist/providers/inworld/index.d.ts.map +1 -1
- package/dist/providers/inworld/index.js +79 -47
- package/dist/providers/inworld/index.js.map +1 -1
- package/dist/providers/mistral/index.d.ts +14 -8
- package/dist/providers/mistral/index.d.ts.map +1 -1
- package/dist/providers/mistral/index.js +63 -48
- package/dist/providers/mistral/index.js.map +1 -1
- package/dist/providers/murf/alignment.d.ts +10 -19
- package/dist/providers/murf/alignment.d.ts.map +1 -1
- package/dist/providers/murf/alignment.js +10 -5
- package/dist/providers/murf/alignment.js.map +1 -1
- package/dist/providers/murf/index.d.ts +15 -16
- package/dist/providers/murf/index.d.ts.map +1 -1
- package/dist/providers/murf/index.js +105 -58
- package/dist/providers/murf/index.js.map +1 -1
- package/dist/providers/openai/index.d.ts +43 -29
- package/dist/providers/openai/index.d.ts.map +1 -1
- package/dist/providers/openai/index.js +294 -106
- package/dist/providers/openai/index.js.map +1 -1
- package/dist/providers/resemble/alignment.d.ts +8 -29
- package/dist/providers/resemble/alignment.d.ts.map +1 -1
- package/dist/providers/resemble/alignment.js +9 -12
- package/dist/providers/resemble/alignment.js.map +1 -1
- package/dist/providers/resemble/index.d.ts +21 -11
- package/dist/providers/resemble/index.d.ts.map +1 -1
- package/dist/providers/resemble/index.js +89 -49
- package/dist/providers/resemble/index.js.map +1 -1
- package/dist/providers/smallest-ai/index.d.ts +47 -0
- package/dist/providers/smallest-ai/index.d.ts.map +1 -0
- package/dist/providers/smallest-ai/index.js +107 -0
- package/dist/providers/smallest-ai/index.js.map +1 -0
- package/dist/providers/xai/index.d.ts +25 -9
- package/dist/providers/xai/index.d.ts.map +1 -1
- package/dist/providers/xai/index.js +63 -40
- package/dist/providers/xai/index.js.map +1 -1
- package/dist/providers.d.ts +31 -0
- package/dist/providers.d.ts.map +1 -0
- package/dist/providers.js +16 -0
- package/dist/providers.js.map +1 -0
- package/dist/resolve-provider.d.ts.map +1 -1
- package/dist/resolve-provider.js +8 -51
- package/dist/resolve-provider.js.map +1 -1
- package/dist/retry-options.d.ts +6 -0
- package/dist/retry-options.d.ts.map +1 -0
- package/dist/retry-options.js +48 -0
- package/dist/retry-options.js.map +1 -0
- package/dist/speech-provider.d.ts +28 -53
- package/dist/speech-provider.d.ts.map +1 -1
- package/dist/speech-provider.js +5 -26
- package/dist/speech-provider.js.map +1 -1
- package/dist/speech-result.d.ts +8 -9
- package/dist/speech-result.d.ts.map +1 -1
- package/dist/speech-result.js.map +1 -1
- package/dist/speech-to-text-provider.d.ts +0 -12
- package/dist/speech-to-text-provider.d.ts.map +1 -1
- package/dist/stream-speech.d.ts +4 -2
- package/dist/stream-speech.d.ts.map +1 -1
- package/dist/stream-speech.js +36 -22
- package/dist/stream-speech.js.map +1 -1
- package/dist/timestamps.d.ts +3 -17
- package/dist/timestamps.d.ts.map +1 -1
- package/dist/turns.d.ts +9 -0
- package/dist/turns.d.ts.map +1 -0
- package/dist/turns.js +21 -0
- package/dist/turns.js.map +1 -0
- package/dist/types.d.ts +31 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/volume-adjust.d.ts +0 -6
- package/dist/volume-adjust.d.ts.map +1 -1
- package/dist/volume-adjust.js +4 -16
- package/dist/volume-adjust.js.map +1 -1
- package/package.json +13 -66
- package/dist/stt-providers/openai/index.d.ts +0 -42
- package/dist/stt-providers/openai/index.d.ts.map +0 -1
- package/dist/stt-providers/openai/index.js +0 -184
- package/dist/stt-providers/openai/index.js.map +0 -1
package/README.md
CHANGED
|
@@ -1,10 +1,24 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="https://github.com/user-attachments/assets/42d9b528-e507-4162-8120-338bb0c92650" alt="Speech SDK" width="140" />
|
|
4
|
+
|
|
1
5
|
# Speech SDK
|
|
2
6
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
7
|
+
**Text-to-speech across 13 providers, one API.**
|
|
8
|
+
|
|
9
|
+
A lightweight, provider-agnostic TypeScript SDK. Zero lock-in. Runs in Node.js, Edge runtimes, and the browser.
|
|
10
|
+
|
|
11
|
+
[npm version](https://www.npmjs.com/package/@speech-sdk/core)
|
|
12
|
+
[npm downloads](https://www.npmjs.com/package/@speech-sdk/core)
|
|
13
|
+
[License](https://github.com/Jellypod-Inc/speech-sdk/blob/main/LICENSE)
|
|
14
|
+
[Discord](https://discord.gg/xcTQMU3nCV)
|
|
15
|
+
[GitHub stars](https://github.com/Jellypod-Inc/speech-sdk/stargazers)
|
|
16
|
+
|
|
17
|
+
**[Quick start](#quick-start)** · **[Providers](#supported-providers)** · **[Streaming](#streaming)** · **[Multi-Speaker Conversations](#conversations)** · **[Timestamps](#timestamps)**
|
|
18
|
+
|
|
19
|
+
</div>
|
|
6
20
|
|
|
7
|
-
|
|
21
|
+
<br />
|
|
8
22
|
|
|
9
23
|
<img width="1200" height="630" alt="Speech SDK" src="https://github.com/user-attachments/assets/b90c0235-9405-4939-bffa-75fc82be5afb" />
|
|
10
24
|
|
|
@@ -12,19 +26,12 @@ Learn more at [speechsdk.dev](https://speechsdk.dev/).
|
|
|
12
26
|
|
|
13
27
|
## Features
|
|
14
28
|
|
|
15
|
-
- **Universal** — `generateSpeech()`
|
|
29
|
+
- **Universal** — one `generateSpeech()` call across every supported provider.
|
|
16
30
|
- **Streaming** — `streamSpeech()` returns a standard `ReadableStream<Uint8Array>`.
|
|
17
|
-
- **Conversations** — `generateConversation()` produces multi-speaker audio,
|
|
18
|
-
- **Word-level timestamps** — `timestamps:
|
|
31
|
+
- **Conversations** — `generateConversation()` produces multi-speaker audio, picking a gateway, native-dialogue, or local-stitch path automatically.
|
|
32
|
+
- **Word-level timestamps** — `timestamps: true` returns alignment, using the provider's native data or falling back to STT.
|
|
19
33
|
- **Volume normalization** — RMS-level outputs to an absolute loudness target.
|
|
20
|
-
- **Audio tags & voice cloning** — `[laugh]
|
|
21
|
-
|
|
22
|
-
## Contents
|
|
23
|
-
|
|
24
|
-
- [Install](#install) · [Quick start](#quick-start) · [Supported providers](#supported-providers)
|
|
25
|
-
- [Streaming](#streaming) · [Conversations](#conversations) · [Timestamps](#timestamps)
|
|
26
|
-
- [Volume normalization](#volume-normalization) · [Audio tags](#audio-tags) · [Voice cloning](#voice-cloning)
|
|
27
|
-
- [Custom configuration](#custom-configuration) · [API reference](#api-reference) · [Error handling](#error-handling) · [Development](#development)
|
|
34
|
+
- **Audio tags & voice cloning** — bracket cues like `[laugh]` and reference-audio cloning where supported.
|
|
28
35
|
|
|
29
36
|
## Install
|
|
30
37
|
|
|
@@ -51,25 +58,51 @@ result.audio.base64; // string (lazy)
|
|
|
51
58
|
result.audio.mediaType; // "audio/mpeg"
|
|
52
59
|
```
|
|
53
60
|
|
|
54
|
-
Pass a `provider/model` string, or just the provider name to use its default model.
|
|
61
|
+
Pass a `provider/model` string, or just the provider name to use its default model. The string above is enough to get going — set one env var and you're done.
|
|
62
|
+
|
|
63
|
+
## Gateway vs direct provider
|
|
64
|
+
|
|
65
|
+
The SDK has two ways to reach a provider, and the choice is made by **how you pass `model`**:
|
|
66
|
+
|
|
67
|
+
```ts
|
|
68
|
+
// 1. String → routes through Speech Gateway (https://api.speechgateway.com)
|
|
69
|
+
// Needs SPEECH_GATEWAY_API_KEY (sign up at https://speechgateway.com).
|
|
70
|
+
await generateSpeech({ model: 'openai/gpt-4o-mini-tts', text: '...', voice: 'alloy' });
|
|
71
|
+
|
|
72
|
+
// 2. Factory → calls the provider directly (no proxy hop)
|
|
73
|
+
// Reads the provider's env var (e.g. OPENAI_API_KEY), or pass apiKey to the factory.
|
|
74
|
+
import { createOpenAI } from '@speech-sdk/core/providers';
|
|
75
|
+
await generateSpeech({ model: createOpenAI()('gpt-4o-mini-tts'), text: '...', voice: 'alloy' });
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
| | Speech Gateway (string) | Direct provider (factory) |
|
|
79
|
+
|---|---|---|
|
|
80
|
+
| When to use | You want a single endpoint and easy provider swaps | You already have provider keys, want zero-hop latency, or need provider features the gateway hasn't surfaced |
|
|
81
|
+
| Setup | `SPEECH_GATEWAY_API_KEY` only | One env var per provider you use |
|
|
82
|
+
| Key resolution | `apiKey` option → `SPEECH_GATEWAY_API_KEY` | `createX({ apiKey })` → `<PROVIDER>_API_KEY` |
|
|
83
|
+
| Endpoint | `api.speechgateway.com` | Provider's own API |
|
|
84
|
+
|
|
85
|
+
The gateway also accepts `createSpeechGateway({ apiKey, baseURL })` if you want to construct it explicitly (e.g. for a custom proxy URL).
|
|
55
86
|
|
|
56
87
|
## Supported providers
|
|
57
88
|
|
|
58
|
-
| Provider | Prefix |
|
|
59
|
-
|
|
60
|
-
| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `
|
|
61
|
-
| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `
|
|
62
|
-
| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `
|
|
63
|
-
| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `
|
|
64
|
-
| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `
|
|
65
|
-
| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `
|
|
66
|
-
| [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `
|
|
67
|
-
| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `
|
|
68
|
-
| [Murf](https://murf.ai/api/docs) | `murf` | `
|
|
69
|
-
| [Resemble](https://docs.resemble.ai) | `resemble` | `
|
|
70
|
-
| [fal](https://fal.ai/models) | `fal-ai` |
|
|
71
|
-
| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `
|
|
72
|
-
| [xAI](https://docs.x.ai/docs/models) | `xai` | `
|
|
89
|
+
| Provider | Prefix | Env var |
|
|
90
|
+
|---|---|---|
|
|
91
|
+
| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `OPENAI_API_KEY` |
|
|
92
|
+
| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `ELEVENLABS_API_KEY` |
|
|
93
|
+
| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `DEEPGRAM_API_KEY` |
|
|
94
|
+
| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `CARTESIA_API_KEY` |
|
|
95
|
+
| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `HUME_API_KEY` |
|
|
96
|
+
| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `INWORLD_API_KEY` |
|
|
97
|
+
| [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `GOOGLE_API_KEY` |
|
|
98
|
+
| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `FISH_AUDIO_API_KEY` |
|
|
99
|
+
| [Murf](https://murf.ai/api/docs) | `murf` | `MURF_API_KEY` |
|
|
100
|
+
| [Resemble](https://docs.resemble.ai) | `resemble` | `RESEMBLE_API_KEY` |
|
|
101
|
+
| [fal](https://fal.ai/models) | `fal-ai` | `FAL_API_KEY` |
|
|
102
|
+
| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `MISTRAL_API_KEY` |
|
|
103
|
+
| [xAI](https://docs.x.ai/docs/models) | `xai` | `XAI_API_KEY` |
|
|
104
|
+
|
|
105
|
+
The env var applies when you call the provider directly via its factory. Pass a string `model` like `"openai/tts-1"` to route through Speech Gateway instead, which reads `SPEECH_GATEWAY_API_KEY` — see [Gateway vs direct provider](#gateway-vs-direct-provider). Most providers ship a default model (`createOpenAI()()`); a few (e.g. fal) require an explicit model id. See the linked docs for each provider's full model list.
|
|
73
106
|
|
|
74
107
|
Provider-specific parameters pass through via `providerOptions` using each API's native field names.
|
|
75
108
|
|
|
@@ -95,13 +128,16 @@ return new Response(audio, { headers: { 'Content-Type': mediaType } });
|
|
|
95
128
|
|
|
96
129
|
## Conversations
|
|
97
130
|
|
|
98
|
-
`generateConversation()` produces a single multi-voice clip from an ordered array of turns
|
|
131
|
+
`generateConversation()` produces a single multi-voice clip from an ordered array of turns. The path is chosen by what the turns are:
|
|
132
|
+
|
|
133
|
+
- **Gateway** — every turn uses a gateway-routed string model (e.g. `"openai/tts-1"`). One request to Speech Gateway; the server handles rendering, stitching, and normalization. The SDK never stitches locally on this path — clone voices on gateway models throw `StitchUnsupportedError`.
|
|
134
|
+
- **Native dialogue** — every turn uses the same direct-provider model and that model exposes a multi-speaker endpoint. One API call, naturally mixed.
|
|
135
|
+
- **Stitch** — direct-provider conversations that don't qualify for native dialogue (multi-provider, or no dialogue endpoint). Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
|
|
99
136
|
|
|
100
|
-
-
|
|
101
|
-
- **Stitch fallback** — multi-provider or no dialogue endpoint. Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
|
|
137
|
+
Mixing gateway-routed turns with direct-provider turns in one call throws `MixedDispatchError`.
|
|
102
138
|
|
|
103
139
|
```ts
|
|
104
|
-
import { generateConversation } from '@speech-sdk/core
|
|
140
|
+
import { generateConversation } from '@speech-sdk/core';
|
|
105
141
|
|
|
106
142
|
const result = await generateConversation({
|
|
107
143
|
turns: [
|
|
@@ -112,16 +148,7 @@ const result = await generateConversation({
|
|
|
112
148
|
});
|
|
113
149
|
```
|
|
114
150
|
|
|
115
|
-
Options: `gapMs` (default 300), `
|
|
116
|
-
|
|
117
|
-
**Native dialogue caps:**
|
|
118
|
-
|
|
119
|
-
| Provider | Models | Voice constraints |
|
|
120
|
-
|---|---|---|
|
|
121
|
-
| ElevenLabs | `eleven_v3` | 1–10 voices, ≤ 2,000 chars |
|
|
122
|
-
| Google | `gemini-2.5-{flash,pro}-preview-tts`, `gemini-3.1-flash-tts-preview` | **Exactly 2 voices** |
|
|
123
|
-
| Hume | `octave-1`, `octave-2` | 1–4 voices |
|
|
124
|
-
| Fish Audio | `s2-pro` | 1–4 voices |
|
|
151
|
+
Options: `gapMs` (default 300), `volumeDbfs` (default `-20`), `maxConcurrency` (default 6), `maxRetries` (default 2), `timestamps`, `apiKey`, `providerOptions`, `abortSignal`, `headers`. Per-turn overrides: `model`, `providerOptions` (stitch path only — throws `ConversationInputError` on native). Native-dialogue models enforce their own voice-count and character limits; violations throw `DialogueConstraintError`.
|
|
125
152
|
|
|
126
153
|
## Timestamps
|
|
127
154
|
|
|
@@ -132,7 +159,7 @@ const result = await generateSpeech({
|
|
|
132
159
|
model: 'elevenlabs/eleven_multilingual_v2',
|
|
133
160
|
text: 'Hello from speech-sdk!',
|
|
134
161
|
voice: 'JBFqnCBsd6RMkjVDRZzb',
|
|
135
|
-
timestamps:
|
|
162
|
+
timestamps: true,
|
|
136
163
|
});
|
|
137
164
|
|
|
138
165
|
result.timestamps;
|
|
@@ -143,43 +170,57 @@ result.timestamps;
|
|
|
143
170
|
// ]
|
|
144
171
|
```
|
|
145
172
|
|
|
146
|
-
|
|
|
173
|
+
| Value | Behavior |
|
|
147
174
|
|---|---|
|
|
148
|
-
| `
|
|
149
|
-
| `
|
|
150
|
-
| `"off"` | Never return timestamps. |
|
|
175
|
+
| `true` | Always return timestamps. Uses native alignment when available; otherwise transcribes the audio via STT (extra cost + latency). |
|
|
176
|
+
| `false` *(default)* | Never return timestamps. |
|
|
151
177
|
|
|
152
|
-
|
|
178
|
+
With `timestamps: true`, models without native alignment require an STT fallback. The SDK automatically uses OpenAI Whisper when `OPENAI_API_KEY` is set in the environment — no extra configuration needed. Gateway-routed models (string model IDs like `"openai/tts-1"`) do not need a fallback — the gateway server provides it.
|
|
179
|
+
|
|
180
|
+
**Resolution order:** factory `fallbackSTT` → `OPENAI_API_KEY` env var (automatic Whisper fallback). If neither is available, the call throws `TimestampKeyMissingError`.
|
|
181
|
+
|
|
182
|
+
Configure `fallbackSTT` on the factory to use a different key or STT model (set it once, applies to all calls):
|
|
153
183
|
|
|
154
184
|
```ts
|
|
155
|
-
import {
|
|
185
|
+
import { generateSpeech } from '@speech-sdk/core';
|
|
186
|
+
import { createOpenAI, createElevenLabs } from '@speech-sdk/core/providers';
|
|
156
187
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
188
|
+
const elevenlabs = createElevenLabs({
|
|
189
|
+
apiKey: process.env.ELEVENLABS_API_KEY,
|
|
190
|
+
fallbackSTT: createOpenAI({ apiKey: process.env.MY_OPENAI_KEY }).stt('whisper-1'),
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
const result = await generateSpeech({
|
|
194
|
+
model: elevenlabs('eleven_flash_v2'),
|
|
195
|
+
voice: 'JBFqnCBsd6RMkjVDRZzb',
|
|
196
|
+
text: 'Hello, world.',
|
|
197
|
+
timestamps: true,
|
|
163
198
|
});
|
|
164
199
|
```
|
|
165
200
|
|
|
166
|
-
|
|
201
|
+
Whether a given model returns native alignment or transcribes via the STT fallback is a provider detail — both paths produce the same `WordTimestamp[]` shape.
|
|
167
202
|
|
|
168
|
-
|
|
169
|
-
|---|---|
|
|
170
|
-
| ElevenLabs (`eleven_v3`, `eleven_multilingual_v2`, `eleven_flash_v2`, `eleven_flash_v2_5`) | **Native** — returned in the TTS response, free on `"auto"` |
|
|
171
|
-
| Murf (`GEN2`) | **Native** — `wordDurations` returned in the TTS response, free on `"auto"` (FALCON streaming model has no native alignment) |
|
|
172
|
-
| Hume (`octave-2`) | **Native** — word alignment from the JSON `/v0/tts` endpoint, free on `"auto"` (`octave-1` has no native alignment) |
|
|
173
|
-
| Inworld (`inworld-tts-1.5-max`, `inworld-tts-1.5-mini`) | **Native** — `timestampInfo.wordAlignment` returned in the TTS response, free on `"auto"` (best on English/Spanish) |
|
|
174
|
-
| Cartesia (`sonic-3`, `sonic-2`) | **Native** — routed through `/tts/sse` with `add_timestamps: true`; merges interleaved chunk + timestamps events into audio + `WordTimestamp[]` |
|
|
175
|
-
| Resemble (`default`) | **Native** — `audio_timestamps` always returned by `/synthesize`; SDK aggregates grapheme-level timing into words (mirrors ElevenLabs aggregator) |
|
|
176
|
-
| All others (OpenAI, Deepgram, Google, Fish Audio, fal, Mistral, xAI) | No native alignment; `"on"` transcribes via the STT fallback, `"auto"` returns `undefined` |
|
|
203
|
+
`generateConversation` accepts the same options and returns `ConversationWordTimestamp[]` — every word carries a `turnIndex: number` pointing back into the input `turns[]`. This is what lets you build chat-bubble UIs, speaker-attributed transcripts, and "who's speaking now?" lookups during playback without re-deriving turn boundaries.
|
|
177
204
|
|
|
178
|
-
|
|
205
|
+
```ts
|
|
206
|
+
import { generateConversation, timestampsToTurns } from '@speech-sdk/core';
|
|
207
|
+
|
|
208
|
+
const result = await generateConversation({
|
|
209
|
+
model: 'elevenlabs/eleven_v3',
|
|
210
|
+
turns: [
|
|
211
|
+
{ voice: 'rachel', text: 'Hi there.' },
|
|
212
|
+
{ voice: 'adam', text: 'Hello!' },
|
|
213
|
+
],
|
|
214
|
+
timestamps: true,
|
|
215
|
+
});
|
|
216
|
+
|
|
217
|
+
// Collapse consecutive words from the same turn into per-turn timings:
|
|
218
|
+
const turnTimestamps = timestampsToTurns(result.timestamps ?? []);
|
|
219
|
+
```
|
|
179
220
|
|
|
180
221
|
### Captions (SRT / WebVTT)
|
|
181
222
|
|
|
182
|
-
|
|
223
|
+
`timestampsToCaptions()` converts word-level timestamps into a caption file. SRT is the default; pass `format: 'vtt'` for WebVTT.
|
|
183
224
|
|
|
184
225
|
```ts
|
|
185
226
|
import { generateSpeech, timestampsToCaptions } from '@speech-sdk/core';
|
|
@@ -188,33 +229,14 @@ const { timestamps } = await generateSpeech({
|
|
|
188
229
|
model: 'elevenlabs/eleven_v3',
|
|
189
230
|
text: 'Hello world. This is a test.',
|
|
190
231
|
voice: 'JBFqnCBsd6RMkjVDRZzb',
|
|
191
|
-
timestamps:
|
|
232
|
+
timestamps: true,
|
|
192
233
|
});
|
|
193
234
|
|
|
194
235
|
const srt = timestampsToCaptions(timestamps ?? []);
|
|
195
|
-
// 1
|
|
196
|
-
// 00:00:00,000 --> 00:00:01,200
|
|
197
|
-
// Hello world.
|
|
198
|
-
//
|
|
199
|
-
// 2
|
|
200
|
-
// 00:00:01,300 --> 00:00:02,800
|
|
201
|
-
// This is a test.
|
|
202
|
-
|
|
203
236
|
const vtt = timestampsToCaptions(timestamps ?? [], { format: 'vtt' });
|
|
204
|
-
// WEBVTT
|
|
205
|
-
//
|
|
206
|
-
// 1
|
|
207
|
-
// 00:00:00.000 --> 00:00:01.200
|
|
208
|
-
// Hello world.
|
|
209
|
-
//
|
|
210
|
-
// 2
|
|
211
|
-
// 00:00:01.300 --> 00:00:02.800
|
|
212
|
-
// This is a test.
|
|
213
237
|
```
|
|
214
238
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
Cues break on sentence boundaries (`.`, `!`, `?`), then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
|
|
239
|
+
Cues break on sentence boundaries, then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
|
|
218
240
|
|
|
219
241
|
## Volume normalization
|
|
220
242
|
|
|
@@ -231,11 +253,37 @@ const result = await generateSpeech({
|
|
|
231
253
|
result.audio.mediaType; // "audio/wav" — re-encoded after normalization
|
|
232
254
|
```
|
|
233
255
|
|
|
234
|
-
`generateConversation` normalizes
|
|
256
|
+
`generateConversation` always normalizes; override the target with `volumeDbfs`. A warning is surfaced (and the raw mix passes through) if the provider has no decodable PCM/WAV mode.
|
|
257
|
+
|
|
258
|
+
### Output format
|
|
259
|
+
|
|
260
|
+
By default, `generateSpeech` preserves the provider or gateway response format.
|
|
261
|
+
`generateConversation` returns WAV when the SDK stitches direct-provider audio.
|
|
262
|
+
|
|
263
|
+
Pass `output` to request a specific final format:
|
|
264
|
+
|
|
265
|
+
```ts
|
|
266
|
+
const result = await generateSpeech({
|
|
267
|
+
model: createOpenAI()('tts-1'),
|
|
268
|
+
voice: 'alloy',
|
|
269
|
+
text: 'Hello',
|
|
270
|
+
output: { format: 'mp3', bitrate: 96 },
|
|
271
|
+
});
|
|
272
|
+
|
|
273
|
+
result.audio.mediaType; // "audio/mpeg"
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
Supported explicit formats are `wav`, `mp3`, and `pcm`.
|
|
277
|
+
|
|
278
|
+
For direct providers, the SDK first asks each provider whether it can natively produce the requested format. If yes, the provider returns it directly and the SDK passes the bytes through unchanged. If the provider can return WAV/PCM but not the requested format (e.g. ElevenLabs has no native WAV output, Cartesia has no native MP3), the SDK requests a decodable format and converts via mediabunny. The SDK never decodes compressed audio (MP3/Opus/AAC) — providers must return WAV or PCM for any local conversion to succeed.
|
|
279
|
+
|
|
280
|
+
For gateway models, the SDK forwards `output` to the gateway API unchanged.
|
|
281
|
+
|
|
282
|
+
MP3 encoding uses [`@mediabunny/mp3-encoder`](https://mediabunny.dev/guide/extensions/mp3-encoder), loaded dynamically only when MP3 output is requested and the host environment does not already provide native MP3 encoding.
|
|
235
283
|
|
|
236
284
|
## Audio tags
|
|
237
285
|
|
|
238
|
-
Bracket syntax `[tag]` adds expressive cues.
|
|
286
|
+
Bracket syntax `[tag]` adds expressive cues. Each provider handles tags natively where supported, maps them to its closest equivalent, or strips them and surfaces a warning in `result.warnings`.
|
|
239
287
|
|
|
240
288
|
```ts
|
|
241
289
|
await generateSpeech({
|
|
@@ -245,21 +293,47 @@ await generateSpeech({
|
|
|
245
293
|
});
|
|
246
294
|
```
|
|
247
295
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
296
|
+
## Pronunciations
|
|
297
|
+
|
|
298
|
+
Customize how specific words are pronounced. Rules are applied as text substitution before the request is sent to the provider; word timestamps are inverse-mapped on return so the substitution is invisible to the caller.
|
|
299
|
+
|
|
300
|
+
```ts
|
|
301
|
+
import { generateSpeech } from '@speech-sdk/core';
|
|
302
|
+
|
|
303
|
+
await generateSpeech({
|
|
304
|
+
model: 'openai/tts-1', // gateway path; or use createOpenAI()(...)
|
|
305
|
+
voice: 'alloy',
|
|
306
|
+
text: 'What is LLM?',
|
|
307
|
+
pronunciations: {
|
|
308
|
+
rules: [{ word: 'LLM', replacement: 'el el em' }],
|
|
309
|
+
},
|
|
310
|
+
});
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
Stored dictionaries are referenced by ID and resolved server-side (gateway path only):
|
|
314
|
+
|
|
315
|
+
```ts
|
|
316
|
+
await generateSpeech({
|
|
317
|
+
model: 'openai/tts-1',
|
|
318
|
+
voice: 'alloy',
|
|
319
|
+
text: 'What is LLM?',
|
|
320
|
+
pronunciations: {
|
|
321
|
+
dictionaryIds: ['dict_company_terms'],
|
|
322
|
+
rules: [{ word: 'LLM', replacement: 'el el em' }], // overrides dict matches
|
|
323
|
+
},
|
|
324
|
+
});
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
`dictionaryIds` requires the gateway path. On the direct-provider path, passing dictionary IDs throws `DictionaryIdsRequireGatewayError`. Inline `rules` work on both paths.
|
|
328
|
+
|
|
329
|
+
The same option is available on `streamSpeech` and `generateConversation`. On `generateConversation`, the option applies globally to every turn.
|
|
255
330
|
|
|
256
331
|
## Voice cloning
|
|
257
332
|
|
|
258
333
|
Some providers support reference-audio cloning. Pass a voice object instead of a string.
|
|
259
334
|
|
|
260
335
|
```ts
|
|
261
|
-
import { createMistral } from '@speech-sdk/core/
|
|
262
|
-
import { createFal } from '@speech-sdk/core/fal-ai';
|
|
336
|
+
import { createFal, createMistral } from '@speech-sdk/core/providers';
|
|
263
337
|
|
|
264
338
|
// Base64 reference:
|
|
265
339
|
await generateSpeech({
|
|
@@ -282,7 +356,7 @@ Factory functions give you custom API keys, base URLs, or `fetch` implementation
|
|
|
282
356
|
|
|
283
357
|
```ts
|
|
284
358
|
import { generateSpeech } from '@speech-sdk/core';
|
|
285
|
-
import { createOpenAI } from '@speech-sdk/core/
|
|
359
|
+
import { createOpenAI } from '@speech-sdk/core/providers';
|
|
286
360
|
|
|
287
361
|
const myOpenAI = createOpenAI({
|
|
288
362
|
apiKey: 'sk-...',
|
|
@@ -296,6 +370,43 @@ await generateSpeech({
|
|
|
296
370
|
});
|
|
297
371
|
```
|
|
298
372
|
|
|
373
|
+
## Public imports
|
|
374
|
+
|
|
375
|
+
The root package exports the main runtime APIs:
|
|
376
|
+
|
|
377
|
+
```ts
|
|
378
|
+
import {
|
|
379
|
+
generateSpeech,
|
|
380
|
+
streamSpeech,
|
|
381
|
+
generateConversation,
|
|
382
|
+
timestampsToCaptions,
|
|
383
|
+
ApiError,
|
|
384
|
+
} from '@speech-sdk/core';
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
Provider and STT factories live under `@speech-sdk/core/providers`:
|
|
388
|
+
|
|
389
|
+
```ts
|
|
390
|
+
import {
|
|
391
|
+
createOpenAI,
|
|
392
|
+
createElevenLabs,
|
|
393
|
+
createCartesia,
|
|
394
|
+
createSpeechGateway,
|
|
395
|
+
} from '@speech-sdk/core/providers';
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
Public types live under `@speech-sdk/core/types`:
|
|
399
|
+
|
|
400
|
+
```ts
|
|
401
|
+
import type {
|
|
402
|
+
GenerateSpeechOptions,
|
|
403
|
+
SpeechResult,
|
|
404
|
+
ConversationResult,
|
|
405
|
+
Voice,
|
|
406
|
+
WordTimestamp,
|
|
407
|
+
} from '@speech-sdk/core/types';
|
|
408
|
+
```
|
|
409
|
+
|
|
299
410
|
## API reference
|
|
300
411
|
|
|
301
412
|
```ts
|
|
@@ -305,8 +416,7 @@ generateSpeech({
|
|
|
305
416
|
voice: Voice, // required — string | { url } | { audio }
|
|
306
417
|
providerOptions?: object,
|
|
307
418
|
volumeDbfs?: number, // ≤ 0
|
|
308
|
-
timestamps?:
|
|
309
|
-
timestampProvider?: ResolvedSTTModel, // override the STT fallback
|
|
419
|
+
timestamps?: boolean, // default false
|
|
310
420
|
maxRetries?: number, // default 2
|
|
311
421
|
abortSignal?: AbortSignal,
|
|
312
422
|
headers?: Record<string, string>,
|
|
@@ -321,6 +431,11 @@ interface SpeechResult {
|
|
|
321
431
|
}
|
|
322
432
|
|
|
323
433
|
interface WordTimestamp { text: string; start: number; end: number } // seconds
|
|
434
|
+
|
|
435
|
+
// Returned by generateConversation — extends WordTimestamp with turnIndex
|
|
436
|
+
interface ConversationWordTimestamp extends WordTimestamp {
|
|
437
|
+
turnIndex: number; // index into the input turns[] array
|
|
438
|
+
}
|
|
324
439
|
```
|
|
325
440
|
|
|
326
441
|
## Error handling
|
|
@@ -333,23 +448,27 @@ try {
|
|
|
333
448
|
} catch (error) {
|
|
334
449
|
if (error instanceof ApiError) {
|
|
335
450
|
error.statusCode; // 401, 429, 500, ...
|
|
336
|
-
error.model; // "openai/gpt-4o-mini-tts"
|
|
337
451
|
error.responseBody;
|
|
452
|
+
error.code; // stable machine-readable code (optional)
|
|
453
|
+
error.retryAfterMs; // parsed Retry-After header in ms (optional)
|
|
338
454
|
}
|
|
339
455
|
}
|
|
340
456
|
```
|
|
341
457
|
|
|
458
|
+
`ApiError.code` is populated from the RFC 7807 `application/problem+json` `code` extension member when the upstream provides one (currently only the Speech Gateway). Match on `error.code` rather than on `error.message` text — codes are a stable contract; messages aren't.
|
|
459
|
+
|
|
342
460
|
| Error | When |
|
|
343
461
|
|---|---|
|
|
344
462
|
| `ApiError` | Provider returned non-2xx |
|
|
463
|
+
| `MissingApiKeyError` | No `apiKey` passed and the provider's env var is unset |
|
|
345
464
|
| `NoSpeechGeneratedError` | Empty input (after tag stripping) or empty provider response |
|
|
346
465
|
| `StreamingNotSupportedError` | `streamSpeech()` on a non-streaming model |
|
|
347
466
|
| `VolumeAdjustmentUnsupportedError` | `volumeDbfs` with no decodable output mode |
|
|
348
|
-
| `TimestampKeyMissingError` | `timestamps:
|
|
467
|
+
| `TimestampKeyMissingError` | `timestamps: true` with no native support, no `fallbackSTT` configured, and `OPENAI_API_KEY` not set |
|
|
349
468
|
| `ConversationInputError` / `DialogueConstraintError` / `StitchUnsupportedError` | `generateConversation` validation / native caps / stitch incompatibility |
|
|
350
469
|
| `SpeechSDKError` | Base class |
|
|
351
470
|
|
|
352
|
-
Retries 5xx and network errors with exponential backoff ([p-retry](https://github.com/sindresorhus/p-retry));
|
|
471
|
+
Retries 5xx (except 501), 429, and network errors with jittered exponential backoff ([p-retry](https://github.com/sindresorhus/p-retry)); other 4xx and 501 are terminal. When a retriable error carries a `Retry-After` header, the SDK sleeps that long before the next attempt — capped at 60s to avoid pathological waits. The parsed value is surfaced as `ApiError.retryAfterMs` whenever the header is present, even on terminal errors that aren't retried. Default 2 retries; override via `maxRetries`.
|
|
353
472
|
|
|
354
473
|
## Development
|
|
355
474
|
|
|
@@ -1,58 +1,16 @@
|
|
|
1
1
|
import { generateConversation as _generateConversation } from "../../generate-conversation.js";
|
|
2
2
|
import { generateSpeech as _generateSpeech } from "../../generate-speech.js";
|
|
3
3
|
import type { WordTimestamp } from "../../timestamps.js";
|
|
4
|
-
/**
|
|
5
|
-
* Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
|
|
6
|
-
* var is set. No-op otherwise, so normal CI runs don't produce artifacts.
|
|
7
|
-
* Usually you don't need to call this directly — use the `generateSpeech`,
|
|
8
|
-
* `generateConversation`, and `collectStreamAndSave` helpers exported from
|
|
9
|
-
* this module, which autosave using the current test name.
|
|
10
|
-
*
|
|
11
|
-
* Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
|
|
12
|
-
* If the same test saves multiple times, subsequent files are suffixed `-2`,
|
|
13
|
-
* `-3`, etc.
|
|
14
|
-
*/
|
|
15
4
|
export declare function maybeSaveAudio(name: string, audio: {
|
|
16
5
|
uint8Array: Uint8Array;
|
|
17
6
|
mediaType: string;
|
|
18
7
|
}): Promise<void>;
|
|
19
|
-
/**
|
|
20
|
-
* Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
|
|
21
|
-
* writes the raw alignment JSON and rendered SRT/VTT caption files alongside
|
|
22
|
-
* the audio. All four files share the same stem so they stay paired across
|
|
23
|
-
* multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
|
|
24
|
-
*
|
|
25
|
-
* Output layout (when timestamps present):
|
|
26
|
-
* ```
|
|
27
|
-
* <dir>/<bucket>/<slug>.<audio-ext>
|
|
28
|
-
* <dir>/<bucket>/<slug>.timestamps.json
|
|
29
|
-
* <dir>/<bucket>/<slug>.srt
|
|
30
|
-
* <dir>/<bucket>/<slug>.vtt
|
|
31
|
-
* ```
|
|
32
|
-
*/
|
|
33
8
|
export declare function maybeSaveResult(name: string, audio: {
|
|
34
9
|
uint8Array: Uint8Array;
|
|
35
10
|
mediaType: string;
|
|
36
11
|
}, timestamps?: readonly WordTimestamp[]): Promise<void>;
|
|
37
|
-
/**
|
|
38
|
-
* Drop-in replacement for `generateSpeech` that autosaves to
|
|
39
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
40
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
41
|
-
* `.srt`, and `.vtt` files.
|
|
42
|
-
*/
|
|
43
12
|
export declare const generateSpeech: typeof _generateSpeech;
|
|
44
|
-
/**
|
|
45
|
-
* Drop-in replacement for `generateConversation` that autosaves to
|
|
46
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
47
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
48
|
-
* `.srt`, and `.vtt` files.
|
|
49
|
-
*/
|
|
50
13
|
export declare const generateConversation: typeof _generateConversation;
|
|
51
|
-
/**
|
|
52
|
-
* Collects a streamed `streamSpeech` result into bytes AND autosaves them to
|
|
53
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
|
|
54
|
-
* of `collectStream(result.audio)` in e2e tests.
|
|
55
|
-
*/
|
|
56
14
|
export declare function collectStreamAndSave(result: {
|
|
57
15
|
audio: ReadableStream<Uint8Array>;
|
|
58
16
|
mediaType: string;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;
|
|
1
|
+
{"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AA6EzD,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD,OAAO,CAAC,IAAI,CAAC,CAEf;AAED,wBAAsB,eAAe,CACnC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,EACpD,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,GACpC,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAOD,eAAO,MAAM,cAAc,EAAE,OAAO,eAMR,CAAC;AAE7B,eAAO,MAAM,oBAAoB,EAAE,OAAO,qBAMR,CAAC;AAEnC,wBAAsB,oBAAoB,CAAC,MAAM,EAAE;IACjD,KAAK,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;CACnB,GAAG,OAAO,CAAC,UAAU,CAAC,CAOtB"}
|
|
@@ -49,12 +49,6 @@ function currentTestContext() {
|
|
|
49
49
|
testPath: state.testPath,
|
|
50
50
|
};
|
|
51
51
|
}
|
|
52
|
-
/**
|
|
53
|
-
* Derives the subdirectory for a given test file. e2e tests are named like
|
|
54
|
-
* `openai.e2e.test.ts` / `conversation-google.e2e.test.ts`; we strip the
|
|
55
|
-
* `.e2e.test.ts` suffix and use that as the per-provider bucket so a full run
|
|
56
|
-
* doesn't dump 100+ files into a single flat directory.
|
|
57
|
-
*/
|
|
58
52
|
function providerBucket(testPath) {
|
|
59
53
|
if (!testPath) {
|
|
60
54
|
return "unknown";
|
|
@@ -62,18 +56,7 @@ function providerBucket(testPath) {
|
|
|
62
56
|
const base = basename(testPath).replace(E2E_TEST_SUFFIX, "");
|
|
63
57
|
return slugify(base) || "unknown";
|
|
64
58
|
}
|
|
65
|
-
// Counter keyed by `${bucket}/${slug}` so multiple generate/stream calls
|
|
66
|
-
// within a single test don't overwrite each other. Vitest isolates modules
|
|
67
|
-
// per file, so this resets per test file — collisions are only meaningful
|
|
68
|
-
// within the same `it`.
|
|
69
59
|
const callCounts = new Map();
|
|
70
|
-
/**
|
|
71
|
-
* Reserves a filename stem (without extension) for the next save call.
|
|
72
|
-
* First call returns `slug`; subsequent calls return `slug-2`, `slug-3`, etc.
|
|
73
|
-
* A single stem is shared across all sibling outputs from one logical save
|
|
74
|
-
* (audio + timestamps + captions), so they remain paired even across
|
|
75
|
-
* multiple saves within the same test.
|
|
76
|
-
*/
|
|
77
60
|
function nextStem(bucket, slug) {
|
|
78
61
|
const key = `${bucket}/${slug}`;
|
|
79
62
|
const n = (callCounts.get(key) ?? 0) + 1;
|
|
@@ -84,34 +67,9 @@ async function writeAndLog(file, data) {
|
|
|
84
67
|
await writeFile(file, data);
|
|
85
68
|
console.log(`[e2e-save] wrote ${file}`);
|
|
86
69
|
}
|
|
87
|
-
/**
|
|
88
|
-
* Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
|
|
89
|
-
* var is set. No-op otherwise, so normal CI runs don't produce artifacts.
|
|
90
|
-
* Usually you don't need to call this directly — use the `generateSpeech`,
|
|
91
|
-
* `generateConversation`, and `collectStreamAndSave` helpers exported from
|
|
92
|
-
* this module, which autosave using the current test name.
|
|
93
|
-
*
|
|
94
|
-
* Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
|
|
95
|
-
* If the same test saves multiple times, subsequent files are suffixed `-2`,
|
|
96
|
-
* `-3`, etc.
|
|
97
|
-
*/
|
|
98
70
|
export async function maybeSaveAudio(name, audio) {
|
|
99
71
|
await maybeSaveResult(name, audio);
|
|
100
72
|
}
|
|
101
|
-
/**
|
|
102
|
-
* Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
|
|
103
|
-
* writes the raw alignment JSON and rendered SRT/VTT caption files alongside
|
|
104
|
-
* the audio. All four files share the same stem so they stay paired across
|
|
105
|
-
* multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
|
|
106
|
-
*
|
|
107
|
-
* Output layout (when timestamps present):
|
|
108
|
-
* ```
|
|
109
|
-
* <dir>/<bucket>/<slug>.<audio-ext>
|
|
110
|
-
* <dir>/<bucket>/<slug>.timestamps.json
|
|
111
|
-
* <dir>/<bucket>/<slug>.srt
|
|
112
|
-
* <dir>/<bucket>/<slug>.vtt
|
|
113
|
-
* ```
|
|
114
|
-
*/
|
|
115
73
|
export async function maybeSaveResult(name, audio, timestamps) {
|
|
116
74
|
const dir = resolveOutputDir();
|
|
117
75
|
if (!dir) {
|
|
@@ -133,33 +91,16 @@ function currentTestSlug() {
|
|
|
133
91
|
const { currentTestName } = currentTestContext();
|
|
134
92
|
return slugify(currentTestName ?? "unnamed") || "unnamed";
|
|
135
93
|
}
|
|
136
|
-
/**
|
|
137
|
-
* Drop-in replacement for `generateSpeech` that autosaves to
|
|
138
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
139
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
140
|
-
* `.srt`, and `.vtt` files.
|
|
141
|
-
*/
|
|
142
94
|
export const generateSpeech = (async (options) => {
|
|
143
95
|
const result = await _generateSpeech(options);
|
|
144
96
|
await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
|
|
145
97
|
return result;
|
|
146
98
|
});
|
|
147
|
-
/**
|
|
148
|
-
* Drop-in replacement for `generateConversation` that autosaves to
|
|
149
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
150
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
151
|
-
* `.srt`, and `.vtt` files.
|
|
152
|
-
*/
|
|
153
99
|
export const generateConversation = (async (options) => {
|
|
154
100
|
const result = await _generateConversation(options);
|
|
155
101
|
await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
|
|
156
102
|
return result;
|
|
157
103
|
});
|
|
158
|
-
/**
|
|
159
|
-
* Collects a streamed `streamSpeech` result into bytes AND autosaves them to
|
|
160
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
|
|
161
|
-
* of `collectStream(result.audio)` in e2e tests.
|
|
162
|
-
*/
|
|
163
104
|
export async function collectStreamAndSave(result) {
|
|
164
105
|
const bytes = await collectStream(result.audio);
|
|
165
106
|
await maybeSaveAudio(currentTestSlug(), {
|