@speech-sdk/core 0.7.0 → 0.8.1-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -108
- package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
- package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
- package/dist/__tests__/e2e/_save-audio.js +0 -59
- package/dist/__tests__/e2e/_save-audio.js.map +1 -1
- package/dist/audio-duration.d.ts +0 -5
- package/dist/audio-duration.d.ts.map +1 -1
- package/dist/audio-duration.js +3 -10
- package/dist/audio-duration.js.map +1 -1
- package/dist/audio-utils.d.ts +0 -10
- package/dist/audio-utils.d.ts.map +1 -1
- package/dist/audio-utils.js +2 -14
- package/dist/audio-utils.js.map +1 -1
- package/dist/captions.d.ts +0 -108
- package/dist/captions.d.ts.map +1 -1
- package/dist/captions.js +8 -98
- package/dist/captions.js.map +1 -1
- package/dist/conversation/attribute-timestamps.d.ts +26 -0
- package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
- package/dist/conversation/attribute-timestamps.js +276 -0
- package/dist/conversation/attribute-timestamps.js.map +1 -0
- package/dist/conversation/dispatch.d.ts +5 -5
- package/dist/conversation/dispatch.d.ts.map +1 -1
- package/dist/conversation/dispatch.js +18 -8
- package/dist/conversation/dispatch.js.map +1 -1
- package/dist/conversation/errors.d.ts +3 -0
- package/dist/conversation/errors.d.ts.map +1 -1
- package/dist/conversation/errors.js +6 -0
- package/dist/conversation/errors.js.map +1 -1
- package/dist/conversation/pcm-concat.d.ts +0 -23
- package/dist/conversation/pcm-concat.d.ts.map +1 -1
- package/dist/conversation/pcm-concat.js +5 -43
- package/dist/conversation/pcm-concat.js.map +1 -1
- package/dist/conversation/proportional-fill.d.ts +10 -0
- package/dist/conversation/proportional-fill.d.ts.map +1 -0
- package/dist/conversation/proportional-fill.js +64 -0
- package/dist/conversation/proportional-fill.js.map +1 -0
- package/dist/conversation/silence-detection.d.ts +14 -0
- package/dist/conversation/silence-detection.d.ts.map +1 -0
- package/dist/conversation/silence-detection.js +52 -0
- package/dist/conversation/silence-detection.js.map +1 -0
- package/dist/conversation/stitch.d.ts +5 -6
- package/dist/conversation/stitch.d.ts.map +1 -1
- package/dist/conversation/stitch.js +42 -36
- package/dist/conversation/stitch.js.map +1 -1
- package/dist/conversation/types.d.ts +1 -35
- package/dist/conversation/types.d.ts.map +1 -1
- package/dist/conversation/validate.d.ts +1 -16
- package/dist/conversation/validate.d.ts.map +1 -1
- package/dist/conversation/validate.js +29 -29
- package/dist/conversation/validate.js.map +1 -1
- package/dist/default-stt-fallback.d.ts +3 -0
- package/dist/default-stt-fallback.d.ts.map +1 -0
- package/dist/default-stt-fallback.js +11 -0
- package/dist/default-stt-fallback.js.map +1 -0
- package/dist/derive-timestamps.d.ts +1 -5
- package/dist/derive-timestamps.d.ts.map +1 -1
- package/dist/derive-timestamps.js +1 -15
- package/dist/derive-timestamps.js.map +1 -1
- package/dist/errors.d.ts +5 -12
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +12 -14
- package/dist/errors.js.map +1 -1
- package/dist/generate-conversation.d.ts +4 -3
- package/dist/generate-conversation.d.ts.map +1 -1
- package/dist/generate-conversation.js +162 -67
- package/dist/generate-conversation.js.map +1 -1
- package/dist/generate-speech.d.ts +1 -26
- package/dist/generate-speech.d.ts.map +1 -1
- package/dist/generate-speech.js +85 -64
- package/dist/generate-speech.js.map +1 -1
- package/dist/index.d.ts +4 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -4
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/logger.js +2 -13
- package/dist/logger.js.map +1 -1
- package/dist/metadata.d.ts +0 -22
- package/dist/metadata.d.ts.map +1 -1
- package/dist/provider-utils.d.ts +3 -9
- package/dist/provider-utils.d.ts.map +1 -1
- package/dist/provider-utils.js +34 -51
- package/dist/provider-utils.js.map +1 -1
- package/dist/providers/cartesia/alignment.d.ts +0 -16
- package/dist/providers/cartesia/alignment.d.ts.map +1 -1
- package/dist/providers/cartesia/alignment.js +1 -6
- package/dist/providers/cartesia/alignment.js.map +1 -1
- package/dist/providers/cartesia/index.d.ts +7 -19
- package/dist/providers/cartesia/index.d.ts.map +1 -1
- package/dist/providers/cartesia/index.js +68 -80
- package/dist/providers/cartesia/index.js.map +1 -1
- package/dist/providers/deepgram/index.d.ts +7 -8
- package/dist/providers/deepgram/index.d.ts.map +1 -1
- package/dist/providers/deepgram/index.js +17 -18
- package/dist/providers/deepgram/index.js.map +1 -1
- package/dist/providers/elevenlabs/alignment.d.ts +7 -21
- package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
- package/dist/providers/elevenlabs/alignment.js +8 -9
- package/dist/providers/elevenlabs/alignment.js.map +1 -1
- package/dist/providers/elevenlabs/index.d.ts +7 -38
- package/dist/providers/elevenlabs/index.d.ts.map +1 -1
- package/dist/providers/elevenlabs/index.js +161 -169
- package/dist/providers/elevenlabs/index.js.map +1 -1
- package/dist/providers/fal/index.d.ts +7 -18
- package/dist/providers/fal/index.d.ts.map +1 -1
- package/dist/providers/fal/index.js +37 -31
- package/dist/providers/fal/index.js.map +1 -1
- package/dist/providers/fish-audio/index.d.ts +7 -8
- package/dist/providers/fish-audio/index.d.ts.map +1 -1
- package/dist/providers/fish-audio/index.js +23 -19
- package/dist/providers/fish-audio/index.js.map +1 -1
- package/dist/providers/gateway/index.d.ts +68 -0
- package/dist/providers/gateway/index.d.ts.map +1 -0
- package/dist/providers/gateway/index.js +236 -0
- package/dist/providers/gateway/index.js.map +1 -0
- package/dist/providers/google/index.d.ts +7 -20
- package/dist/providers/google/index.d.ts.map +1 -1
- package/dist/providers/google/index.js +161 -151
- package/dist/providers/google/index.js.map +1 -1
- package/dist/providers/hume/alignment.d.ts +30 -35
- package/dist/providers/hume/alignment.d.ts.map +1 -1
- package/dist/providers/hume/alignment.js +14 -8
- package/dist/providers/hume/alignment.js.map +1 -1
- package/dist/providers/hume/index.d.ts +7 -16
- package/dist/providers/hume/index.d.ts.map +1 -1
- package/dist/providers/hume/index.js +55 -65
- package/dist/providers/hume/index.js.map +1 -1
- package/dist/providers/inworld/alignment.d.ts +8 -22
- package/dist/providers/inworld/alignment.d.ts.map +1 -1
- package/dist/providers/inworld/alignment.js +9 -8
- package/dist/providers/inworld/alignment.js.map +1 -1
- package/dist/providers/inworld/index.d.ts +7 -20
- package/dist/providers/inworld/index.d.ts.map +1 -1
- package/dist/providers/inworld/index.js +47 -39
- package/dist/providers/inworld/index.js.map +1 -1
- package/dist/providers/mistral/index.d.ts +7 -8
- package/dist/providers/mistral/index.d.ts.map +1 -1
- package/dist/providers/mistral/index.js +39 -38
- package/dist/providers/mistral/index.js.map +1 -1
- package/dist/providers/murf/alignment.d.ts +10 -19
- package/dist/providers/murf/alignment.d.ts.map +1 -1
- package/dist/providers/murf/alignment.js +10 -5
- package/dist/providers/murf/alignment.js.map +1 -1
- package/dist/providers/murf/index.d.ts +7 -16
- package/dist/providers/murf/index.d.ts.map +1 -1
- package/dist/providers/murf/index.js +65 -57
- package/dist/providers/murf/index.js.map +1 -1
- package/dist/providers/openai/index.d.ts +36 -29
- package/dist/providers/openai/index.d.ts.map +1 -1
- package/dist/providers/openai/index.js +270 -106
- package/dist/providers/openai/index.js.map +1 -1
- package/dist/providers/resemble/alignment.d.ts +8 -29
- package/dist/providers/resemble/alignment.d.ts.map +1 -1
- package/dist/providers/resemble/alignment.js +9 -12
- package/dist/providers/resemble/alignment.js.map +1 -1
- package/dist/providers/resemble/index.d.ts +7 -11
- package/dist/providers/resemble/index.d.ts.map +1 -1
- package/dist/providers/resemble/index.js +54 -48
- package/dist/providers/resemble/index.js.map +1 -1
- package/dist/providers/xai/index.d.ts +7 -9
- package/dist/providers/xai/index.d.ts.map +1 -1
- package/dist/providers/xai/index.js +37 -40
- package/dist/providers/xai/index.js.map +1 -1
- package/dist/providers.d.ts +29 -0
- package/dist/providers.d.ts.map +1 -0
- package/dist/providers.js +15 -0
- package/dist/providers.js.map +1 -0
- package/dist/resolve-provider.d.ts.map +1 -1
- package/dist/resolve-provider.js +8 -51
- package/dist/resolve-provider.js.map +1 -1
- package/dist/speech-provider.d.ts +13 -53
- package/dist/speech-provider.d.ts.map +1 -1
- package/dist/speech-provider.js +5 -26
- package/dist/speech-provider.js.map +1 -1
- package/dist/speech-result.d.ts +8 -9
- package/dist/speech-result.d.ts.map +1 -1
- package/dist/speech-result.js.map +1 -1
- package/dist/speech-to-text-provider.d.ts +0 -12
- package/dist/speech-to-text-provider.d.ts.map +1 -1
- package/dist/stream-speech.d.ts.map +1 -1
- package/dist/stream-speech.js +2 -3
- package/dist/stream-speech.js.map +1 -1
- package/dist/timestamps.d.ts +3 -17
- package/dist/timestamps.d.ts.map +1 -1
- package/dist/turns.d.ts +9 -0
- package/dist/turns.d.ts.map +1 -0
- package/dist/turns.js +21 -0
- package/dist/turns.js.map +1 -0
- package/dist/types.d.ts +25 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/volume-adjust.d.ts +0 -6
- package/dist/volume-adjust.d.ts.map +1 -1
- package/dist/volume-adjust.js +0 -6
- package/dist/volume-adjust.js.map +1 -1
- package/package.json +11 -66
- package/dist/stt-providers/openai/index.d.ts +0 -42
- package/dist/stt-providers/openai/index.d.ts.map +0 -1
- package/dist/stt-providers/openai/index.js +0 -184
- package/dist/stt-providers/openai/index.js.map +0 -1
package/README.md
CHANGED
|
@@ -1,10 +1,24 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="https://github.com/user-attachments/assets/42d9b528-e507-4162-8120-338bb0c92650" alt="Speech SDK" width="140" />
|
|
4
|
+
|
|
1
5
|
# Speech SDK
|
|
2
6
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
7
|
+
**Text-to-speech across 13 providers, one API.**
|
|
8
|
+
|
|
9
|
+
A lightweight, provider-agnostic TypeScript SDK. Zero lock-in. Runs in Node.js, Edge runtimes, and the browser.
|
|
10
|
+
|
|
11
|
+
[npm package](https://www.npmjs.com/package/@speech-sdk/core)
|
|
12
|
+
[](https://www.npmjs.com/package/@speech-sdk/core)
|
|
13
|
+
[License](https://github.com/Jellypod-Inc/speech-sdk/blob/main/LICENSE)
|
|
14
|
+
[Discord](https://discord.gg/xcTQMU3nCV)
|
|
15
|
+
[GitHub stars](https://github.com/Jellypod-Inc/speech-sdk/stargazers)
|
|
6
16
|
|
|
7
|
-
|
|
17
|
+
**[Quick start](#quick-start)** · **[Providers](#supported-providers)** · **[Streaming](#streaming)** · **[Multi-Speaker Conversations](#conversations)** · **[Timestamps](#timestamps)**
|
|
18
|
+
|
|
19
|
+
</div>
|
|
20
|
+
|
|
21
|
+
<br />
|
|
8
22
|
|
|
9
23
|
<img width="1200" height="630" alt="Speech SDK" src="https://github.com/user-attachments/assets/b90c0235-9405-4939-bffa-75fc82be5afb" />
|
|
10
24
|
|
|
@@ -12,19 +26,12 @@ Learn more at [speechsdk.dev](https://speechsdk.dev/).
|
|
|
12
26
|
|
|
13
27
|
## Features
|
|
14
28
|
|
|
15
|
-
- **Universal** — `generateSpeech()`
|
|
29
|
+
- **Universal** — one `generateSpeech()` call across every supported provider.
|
|
16
30
|
- **Streaming** — `streamSpeech()` returns a standard `ReadableStream<Uint8Array>`.
|
|
17
|
-
- **Conversations** — `generateConversation()` produces multi-speaker audio,
|
|
18
|
-
- **Word-level timestamps** — `timestamps:
|
|
31
|
+
- **Conversations** — `generateConversation()` produces multi-speaker audio, picking a gateway, native-dialogue, or local-stitch path automatically.
|
|
32
|
+
- **Word-level timestamps** — `timestamps: true` returns alignment, using the provider's native data or falling back to STT.
|
|
19
33
|
- **Volume normalization** — RMS-level outputs to an absolute loudness target.
|
|
20
|
-
- **Audio tags & voice cloning** — `[laugh]
|
|
21
|
-
|
|
22
|
-
## Contents
|
|
23
|
-
|
|
24
|
-
- [Install](#install) · [Quick start](#quick-start) · [Supported providers](#supported-providers)
|
|
25
|
-
- [Streaming](#streaming) · [Conversations](#conversations) · [Timestamps](#timestamps)
|
|
26
|
-
- [Volume normalization](#volume-normalization) · [Audio tags](#audio-tags) · [Voice cloning](#voice-cloning)
|
|
27
|
-
- [Custom configuration](#custom-configuration) · [API reference](#api-reference) · [Error handling](#error-handling) · [Development](#development)
|
|
34
|
+
- **Audio tags & voice cloning** — bracket cues like `[laugh]` and reference-audio cloning where supported.
|
|
28
35
|
|
|
29
36
|
## Install
|
|
30
37
|
|
|
@@ -51,25 +58,51 @@ result.audio.base64; // string (lazy)
|
|
|
51
58
|
result.audio.mediaType; // "audio/mpeg"
|
|
52
59
|
```
|
|
53
60
|
|
|
54
|
-
Pass a `provider/model` string, or just the provider name to use its default model.
|
|
61
|
+
Pass a `provider/model` string, or just the provider name to use its default model. The string above is enough to get going — set one env var and you're done.
|
|
62
|
+
|
|
63
|
+
## Gateway vs direct provider
|
|
64
|
+
|
|
65
|
+
The SDK has two ways to reach a provider, and the choice is made by **how you pass `model`**:
|
|
66
|
+
|
|
67
|
+
```ts
|
|
68
|
+
// 1. String → routes through Speech Gateway (https://api.speechgateway.com)
|
|
69
|
+
// Needs SPEECH_GATEWAY_API_KEY (sign up at https://speechgateway.com).
|
|
70
|
+
await generateSpeech({ model: 'openai/gpt-4o-mini-tts', text: '...', voice: 'alloy' });
|
|
71
|
+
|
|
72
|
+
// 2. Factory → calls the provider directly (no proxy hop)
|
|
73
|
+
// Reads the provider's env var (e.g. OPENAI_API_KEY), or pass apiKey to the factory.
|
|
74
|
+
import { createOpenAI } from '@speech-sdk/core/providers';
|
|
75
|
+
await generateSpeech({ model: createOpenAI()('gpt-4o-mini-tts'), text: '...', voice: 'alloy' });
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
| | Speech Gateway (string) | Direct provider (factory) |
|
|
79
|
+
|---|---|---|
|
|
80
|
+
| When to use | You want a single endpoint and easy provider swaps | You already have provider keys, want zero-hop latency, or need provider features the gateway hasn't surfaced |
|
|
81
|
+
| Setup | `SPEECH_GATEWAY_API_KEY` only | One env var per provider you use |
|
|
82
|
+
| Key resolution | `apiKey` option → `SPEECH_GATEWAY_API_KEY` | `createX({ apiKey })` → `<PROVIDER>_API_KEY` |
|
|
83
|
+
| Endpoint | `api.speechgateway.com` | Provider's own API |
|
|
84
|
+
|
|
85
|
+
The gateway also accepts `createSpeechGateway({ apiKey, baseURL })` if you want to construct it explicitly (e.g. for a custom proxy URL).
|
|
55
86
|
|
|
56
87
|
## Supported providers
|
|
57
88
|
|
|
58
|
-
| Provider | Prefix |
|
|
59
|
-
|
|
60
|
-
| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `
|
|
61
|
-
| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `
|
|
62
|
-
| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `
|
|
63
|
-
| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `
|
|
64
|
-
| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `
|
|
65
|
-
| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `
|
|
66
|
-
| [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `
|
|
67
|
-
| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `
|
|
68
|
-
| [Murf](https://murf.ai/api/docs) | `murf` | `
|
|
69
|
-
| [Resemble](https://docs.resemble.ai) | `resemble` | `
|
|
70
|
-
| [fal](https://fal.ai/models) | `fal-ai` |
|
|
71
|
-
| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `
|
|
72
|
-
| [xAI](https://docs.x.ai/docs/models) | `xai` | `
|
|
89
|
+
| Provider | Prefix | Env var |
|
|
90
|
+
|---|---|---|
|
|
91
|
+
| [OpenAI](https://platform.openai.com/docs/guides/text-to-speech) | `openai` | `OPENAI_API_KEY` |
|
|
92
|
+
| [ElevenLabs](https://elevenlabs.io/docs) | `elevenlabs` | `ELEVENLABS_API_KEY` |
|
|
93
|
+
| [Deepgram](https://developers.deepgram.com/docs/text-to-speech) | `deepgram` | `DEEPGRAM_API_KEY` |
|
|
94
|
+
| [Cartesia](https://docs.cartesia.ai) | `cartesia` | `CARTESIA_API_KEY` |
|
|
95
|
+
| [Hume](https://dev.hume.ai/docs/text-to-speech-tts/overview) | `hume` | `HUME_API_KEY` |
|
|
96
|
+
| [Inworld](https://docs.inworld.ai/tts) | `inworld` | `INWORLD_API_KEY` |
|
|
97
|
+
| [Google Gemini TTS](https://docs.cloud.google.com/text-to-speech/docs/gemini-tts) | `google` | `GOOGLE_API_KEY` |
|
|
98
|
+
| [Fish Audio](https://docs.fish.audio) | `fish-audio` | `FISH_AUDIO_API_KEY` |
|
|
99
|
+
| [Murf](https://murf.ai/api/docs) | `murf` | `MURF_API_KEY` |
|
|
100
|
+
| [Resemble](https://docs.resemble.ai) | `resemble` | `RESEMBLE_API_KEY` |
|
|
101
|
+
| [fal](https://fal.ai/models) | `fal-ai` | `FAL_API_KEY` |
|
|
102
|
+
| [Mistral](https://docs.mistral.ai/capabilities/audio/text_to_speech/speech) | `mistral` | `MISTRAL_API_KEY` |
|
|
103
|
+
| [xAI](https://docs.x.ai/docs/models) | `xai` | `XAI_API_KEY` |
|
|
104
|
+
|
|
105
|
+
The env var applies when you call the provider directly via its factory. Pass a string `model` like `"openai/tts-1"` to route through Speech Gateway instead, which reads `SPEECH_GATEWAY_API_KEY` — see [Gateway vs direct provider](#gateway-vs-direct-provider). Most providers ship a default model (`createOpenAI()()`); a few (e.g. fal) require an explicit model id. See the linked docs for each provider's full model list.
|
|
73
106
|
|
|
74
107
|
Provider-specific parameters pass through via `providerOptions` using each API's native field names.
|
|
75
108
|
|
|
@@ -95,13 +128,16 @@ return new Response(audio, { headers: { 'Content-Type': mediaType } });
|
|
|
95
128
|
|
|
96
129
|
## Conversations
|
|
97
130
|
|
|
98
|
-
`generateConversation()` produces a single multi-voice clip from an ordered array of turns
|
|
131
|
+
`generateConversation()` produces a single multi-voice clip from an ordered array of turns. The path is chosen by what the turns are:
|
|
132
|
+
|
|
133
|
+
- **Gateway** — every turn uses a gateway-routed string model (e.g. `"openai/tts-1"`). One request to Speech Gateway; the server handles rendering, stitching, and normalization. The SDK never stitches locally on this path — clone voices on gateway models throw `StitchUnsupportedError`.
|
|
134
|
+
- **Native dialogue** — every turn uses the same direct-provider model and that model exposes a multi-speaker endpoint. One API call, naturally mixed.
|
|
135
|
+
- **Stitch** — direct-provider conversations that don't qualify for native dialogue (multi-provider, or no dialogue endpoint). Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
|
|
99
136
|
|
|
100
|
-
-
|
|
101
|
-
- **Stitch fallback** — multi-provider or no dialogue endpoint. Runs turns in parallel, RMS-levels each, inserts silence, returns a single WAV.
|
|
137
|
+
Mixing gateway-routed turns with direct-provider turns in one call throws `MixedDispatchError`.
|
|
102
138
|
|
|
103
139
|
```ts
|
|
104
|
-
import { generateConversation } from '@speech-sdk/core
|
|
140
|
+
import { generateConversation } from '@speech-sdk/core';
|
|
105
141
|
|
|
106
142
|
const result = await generateConversation({
|
|
107
143
|
turns: [
|
|
@@ -112,16 +148,7 @@ const result = await generateConversation({
|
|
|
112
148
|
});
|
|
113
149
|
```
|
|
114
150
|
|
|
115
|
-
Options: `gapMs` (default 300), `
|
|
116
|
-
|
|
117
|
-
**Native dialogue caps:**
|
|
118
|
-
|
|
119
|
-
| Provider | Models | Voice constraints |
|
|
120
|
-
|---|---|---|
|
|
121
|
-
| ElevenLabs | `eleven_v3` | 1–10 voices, ≤ 2,000 chars |
|
|
122
|
-
| Google | `gemini-2.5-{flash,pro}-preview-tts`, `gemini-3.1-flash-tts-preview` | **Exactly 2 voices** |
|
|
123
|
-
| Hume | `octave-1`, `octave-2` | 1–4 voices |
|
|
124
|
-
| Fish Audio | `s2-pro` | 1–4 voices |
|
|
151
|
+
Options: `gapMs` (default 300), `volumeDbfs` (default `-20`), `maxConcurrency` (default 6), `maxRetries` (default 2), `timestamps`, `apiKey`, `providerOptions`, `abortSignal`, `headers`. Per-turn overrides: `model`, `providerOptions` (stitch path only — throws `ConversationInputError` on native). Native-dialogue models enforce their own voice-count and character limits; violations throw `DialogueConstraintError`.
|
|
125
152
|
|
|
126
153
|
## Timestamps
|
|
127
154
|
|
|
@@ -132,7 +159,7 @@ const result = await generateSpeech({
|
|
|
132
159
|
model: 'elevenlabs/eleven_multilingual_v2',
|
|
133
160
|
text: 'Hello from speech-sdk!',
|
|
134
161
|
voice: 'JBFqnCBsd6RMkjVDRZzb',
|
|
135
|
-
timestamps:
|
|
162
|
+
timestamps: true,
|
|
136
163
|
});
|
|
137
164
|
|
|
138
165
|
result.timestamps;
|
|
@@ -143,43 +170,57 @@ result.timestamps;
|
|
|
143
170
|
// ]
|
|
144
171
|
```
|
|
145
172
|
|
|
146
|
-
|
|
|
173
|
+
| Value | Behavior |
|
|
147
174
|
|---|---|
|
|
148
|
-
| `
|
|
149
|
-
| `
|
|
150
|
-
|
|
175
|
+
| `true` | Always return timestamps. Uses native alignment when available; otherwise transcribes the audio via STT (extra cost + latency). |
|
|
176
|
+
| `false` *(default)* | Never return timestamps. |
|
|
177
|
+
|
|
178
|
+
With `timestamps: true`, models without native alignment require an STT fallback. The SDK automatically uses OpenAI Whisper when `OPENAI_API_KEY` is set in the environment — no extra configuration needed. Gateway-routed models (string model IDs like `"openai/tts-1"`) do not need a fallback — the gateway server provides it.
|
|
151
179
|
|
|
152
|
-
|
|
180
|
+
**Resolution order:** factory `fallbackSTT` → `OPENAI_API_KEY` env var (automatic Whisper fallback) → throws `TimestampKeyMissingError`.
|
|
181
|
+
|
|
182
|
+
Configure `fallbackSTT` on the factory to use a different key or STT model (set it once, applies to all calls):
|
|
153
183
|
|
|
154
184
|
```ts
|
|
155
|
-
import {
|
|
185
|
+
import { generateSpeech } from '@speech-sdk/core';
|
|
186
|
+
import { createOpenAI, createElevenLabs } from '@speech-sdk/core/providers';
|
|
156
187
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
188
|
+
const elevenlabs = createElevenLabs({
|
|
189
|
+
apiKey: process.env.ELEVENLABS_API_KEY,
|
|
190
|
+
fallbackSTT: createOpenAI({ apiKey: process.env.MY_OPENAI_KEY }).stt('whisper-1'),
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
const result = await generateSpeech({
|
|
194
|
+
model: elevenlabs('eleven_flash_v2'),
|
|
195
|
+
voice: 'JBFqnCBsd6RMkjVDRZzb',
|
|
196
|
+
text: 'Hello, world.',
|
|
197
|
+
timestamps: true,
|
|
163
198
|
});
|
|
164
199
|
```
|
|
165
200
|
|
|
166
|
-
|
|
201
|
+
Whether a given model returns native alignment or transcribes via the STT fallback is a provider detail — both paths produce the same `WordTimestamp[]` shape.
|
|
167
202
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
203
|
+
`generateConversation` accepts the same options and returns `ConversationWordTimestamp[]` in `result.timestamps` — every word carries a `turnIndex: number` pointing back into the input `turns[]`. This is what lets you build chat-bubble UIs, speaker-attributed transcripts, and "who's speaking now?" lookups during playback without re-deriving turn boundaries.
|
|
204
|
+
|
|
205
|
+
```ts
|
|
206
|
+
import { generateConversation, timestampsToTurns } from '@speech-sdk/core';
|
|
207
|
+
|
|
208
|
+
const result = await generateConversation({
|
|
209
|
+
model: 'elevenlabs/eleven_v3',
|
|
210
|
+
turns: [
|
|
211
|
+
{ voice: 'rachel', text: 'Hi there.' },
|
|
212
|
+
{ voice: 'adam', text: 'Hello!' },
|
|
213
|
+
],
|
|
214
|
+
timestamps: true,
|
|
215
|
+
});
|
|
177
216
|
|
|
178
|
-
|
|
217
|
+
// Collapse consecutive words from the same turn into per-turn timings:
|
|
218
|
+
const turnTimestamps = timestampsToTurns(result.timestamps ?? []);
|
|
219
|
+
```
|
|
179
220
|
|
|
180
221
|
### Captions (SRT / WebVTT)
|
|
181
222
|
|
|
182
|
-
|
|
223
|
+
`timestampsToCaptions()` converts word-level timestamps into a caption file. SRT is the default; pass `format: 'vtt'` for WebVTT.
|
|
183
224
|
|
|
184
225
|
```ts
|
|
185
226
|
import { generateSpeech, timestampsToCaptions } from '@speech-sdk/core';
|
|
@@ -188,33 +229,14 @@ const { timestamps } = await generateSpeech({
|
|
|
188
229
|
model: 'elevenlabs/eleven_v3',
|
|
189
230
|
text: 'Hello world. This is a test.',
|
|
190
231
|
voice: 'JBFqnCBsd6RMkjVDRZzb',
|
|
191
|
-
timestamps:
|
|
232
|
+
timestamps: true,
|
|
192
233
|
});
|
|
193
234
|
|
|
194
235
|
const srt = timestampsToCaptions(timestamps ?? []);
|
|
195
|
-
// 1
|
|
196
|
-
// 00:00:00,000 --> 00:00:01,200
|
|
197
|
-
// Hello world.
|
|
198
|
-
//
|
|
199
|
-
// 2
|
|
200
|
-
// 00:00:01,300 --> 00:00:02,800
|
|
201
|
-
// This is a test.
|
|
202
|
-
|
|
203
236
|
const vtt = timestampsToCaptions(timestamps ?? [], { format: 'vtt' });
|
|
204
|
-
// WEBVTT
|
|
205
|
-
//
|
|
206
|
-
// 1
|
|
207
|
-
// 00:00:00.000 --> 00:00:01.200
|
|
208
|
-
// Hello world.
|
|
209
|
-
//
|
|
210
|
-
// 2
|
|
211
|
-
// 00:00:01.300 --> 00:00:02.800
|
|
212
|
-
// This is a test.
|
|
213
237
|
```
|
|
214
238
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
Cues break on sentence boundaries (`.`, `!`, `?`), then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
|
|
239
|
+
Cues break on sentence boundaries, then subdivide long sentences by character count, cue duration, and soft comma breaks. Pass `CaptionsOptions` to customize `format`, `maxLineLength`, `maxLinesPerCue`, `maxCharsPerCue`, `maxCueDurationMs`, or `longPhraseCommaBreakChars`.
|
|
218
240
|
|
|
219
241
|
## Volume normalization
|
|
220
242
|
|
|
@@ -231,11 +253,11 @@ const result = await generateSpeech({
|
|
|
231
253
|
result.audio.mediaType; // "audio/wav" — re-encoded after normalization
|
|
232
254
|
```
|
|
233
255
|
|
|
234
|
-
`generateConversation` normalizes
|
|
256
|
+
`generateConversation` always normalizes; override the target with `volumeDbfs`. A warning is surfaced (and the raw mix passes through) if the provider has no decodable PCM/WAV mode.
|
|
235
257
|
|
|
236
258
|
## Audio tags
|
|
237
259
|
|
|
238
|
-
Bracket syntax `[tag]` adds expressive cues.
|
|
260
|
+
Bracket syntax `[tag]` adds expressive cues. Each provider handles tags natively where supported, maps them to its closest equivalent, or strips them and surfaces a warning in `result.warnings`.
|
|
239
261
|
|
|
240
262
|
```ts
|
|
241
263
|
await generateSpeech({
|
|
@@ -245,21 +267,12 @@ await generateSpeech({
|
|
|
245
267
|
});
|
|
246
268
|
```
|
|
247
269
|
|
|
248
|
-
| Provider | Behavior |
|
|
249
|
-
|---|---|
|
|
250
|
-
| OpenAI (`gpt-4o-mini-tts`) | Mapped to the `instructions` field |
|
|
251
|
-
| ElevenLabs (`eleven_v3`) | Passed through natively |
|
|
252
|
-
| Google (`gemini-3.1-flash-tts-preview`) | Passed through natively |
|
|
253
|
-
| Cartesia (`sonic-3`) | Emotion tags → SSML; `[laughter]` passed through; unknown stripped |
|
|
254
|
-
| All others | Stripped with warnings |
|
|
255
|
-
|
|
256
270
|
## Voice cloning
|
|
257
271
|
|
|
258
272
|
Some providers support reference-audio cloning. Pass a voice object instead of a string.
|
|
259
273
|
|
|
260
274
|
```ts
|
|
261
|
-
import { createMistral } from '@speech-sdk/core/
|
|
262
|
-
import { createFal } from '@speech-sdk/core/fal-ai';
|
|
275
|
+
import { createFal, createMistral } from '@speech-sdk/core/providers';
|
|
263
276
|
|
|
264
277
|
// Base64 reference:
|
|
265
278
|
await generateSpeech({
|
|
@@ -282,7 +295,7 @@ Factory functions give you custom API keys, base URLs, or `fetch` implementation
|
|
|
282
295
|
|
|
283
296
|
```ts
|
|
284
297
|
import { generateSpeech } from '@speech-sdk/core';
|
|
285
|
-
import { createOpenAI } from '@speech-sdk/core/
|
|
298
|
+
import { createOpenAI } from '@speech-sdk/core/providers';
|
|
286
299
|
|
|
287
300
|
const myOpenAI = createOpenAI({
|
|
288
301
|
apiKey: 'sk-...',
|
|
@@ -296,6 +309,43 @@ await generateSpeech({
|
|
|
296
309
|
});
|
|
297
310
|
```
|
|
298
311
|
|
|
312
|
+
## Public imports
|
|
313
|
+
|
|
314
|
+
The root package exports the main runtime APIs:
|
|
315
|
+
|
|
316
|
+
```ts
|
|
317
|
+
import {
|
|
318
|
+
generateSpeech,
|
|
319
|
+
streamSpeech,
|
|
320
|
+
generateConversation,
|
|
321
|
+
timestampsToCaptions,
|
|
322
|
+
ApiError,
|
|
323
|
+
} from '@speech-sdk/core';
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
Provider and STT factories live under `@speech-sdk/core/providers`:
|
|
327
|
+
|
|
328
|
+
```ts
|
|
329
|
+
import {
|
|
330
|
+
createOpenAI,
|
|
331
|
+
createElevenLabs,
|
|
332
|
+
createCartesia,
|
|
333
|
+
createSpeechGateway,
|
|
334
|
+
} from '@speech-sdk/core/providers';
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
Public types live under `@speech-sdk/core/types`:
|
|
338
|
+
|
|
339
|
+
```ts
|
|
340
|
+
import type {
|
|
341
|
+
GenerateSpeechOptions,
|
|
342
|
+
SpeechResult,
|
|
343
|
+
ConversationResult,
|
|
344
|
+
Voice,
|
|
345
|
+
WordTimestamp,
|
|
346
|
+
} from '@speech-sdk/core/types';
|
|
347
|
+
```
|
|
348
|
+
|
|
299
349
|
## API reference
|
|
300
350
|
|
|
301
351
|
```ts
|
|
@@ -305,8 +355,7 @@ generateSpeech({
|
|
|
305
355
|
voice: Voice, // required — string | { url } | { audio }
|
|
306
356
|
providerOptions?: object,
|
|
307
357
|
volumeDbfs?: number, // ≤ 0
|
|
308
|
-
timestamps?:
|
|
309
|
-
timestampProvider?: ResolvedSTTModel, // override the STT fallback
|
|
358
|
+
timestamps?: boolean, // default false
|
|
310
359
|
maxRetries?: number, // default 2
|
|
311
360
|
abortSignal?: AbortSignal,
|
|
312
361
|
headers?: Record<string, string>,
|
|
@@ -321,6 +370,11 @@ interface SpeechResult {
|
|
|
321
370
|
}
|
|
322
371
|
|
|
323
372
|
interface WordTimestamp { text: string; start: number; end: number } // seconds
|
|
373
|
+
|
|
374
|
+
// Returned by generateConversation — extends WordTimestamp with turnIndex
|
|
375
|
+
interface ConversationWordTimestamp extends WordTimestamp {
|
|
376
|
+
turnIndex: number; // index into the input turns[] array
|
|
377
|
+
}
|
|
324
378
|
```
|
|
325
379
|
|
|
326
380
|
## Error handling
|
|
@@ -333,19 +387,22 @@ try {
|
|
|
333
387
|
} catch (error) {
|
|
334
388
|
if (error instanceof ApiError) {
|
|
335
389
|
error.statusCode; // 401, 429, 500, ...
|
|
336
|
-
error.model; // "openai/gpt-4o-mini-tts"
|
|
337
390
|
error.responseBody;
|
|
391
|
+
error.code; // stable machine-readable code (optional)
|
|
338
392
|
}
|
|
339
393
|
}
|
|
340
394
|
```
|
|
341
395
|
|
|
396
|
+
`ApiError.code` is populated from the RFC 7807 `application/problem+json` `code` extension when the upstream provides one (currently only the Speech Gateway). Match on `error.code` rather than `error.message` text — codes are a stable contract, messages aren't.
|
|
397
|
+
|
|
342
398
|
| Error | When |
|
|
343
399
|
|---|---|
|
|
344
400
|
| `ApiError` | Provider returned non-2xx |
|
|
401
|
+
| `MissingApiKeyError` | No `apiKey` passed and the provider's env var is unset |
|
|
345
402
|
| `NoSpeechGeneratedError` | Empty input (after tag stripping) or empty provider response |
|
|
346
403
|
| `StreamingNotSupportedError` | `streamSpeech()` on a non-streaming model |
|
|
347
404
|
| `VolumeAdjustmentUnsupportedError` | `volumeDbfs` with no decodable output mode |
|
|
348
|
-
| `TimestampKeyMissingError` | `timestamps:
|
|
405
|
+
| `TimestampKeyMissingError` | `timestamps: true` with no native support, no `fallbackSTT` configured, and `OPENAI_API_KEY` not set |
|
|
349
406
|
| `ConversationInputError` / `DialogueConstraintError` / `StitchUnsupportedError` | `generateConversation` validation / native caps / stitch incompatibility |
|
|
350
407
|
| `SpeechSDKError` | Base class |
|
|
351
408
|
|
|
@@ -1,58 +1,16 @@
|
|
|
1
1
|
import { generateConversation as _generateConversation } from "../../generate-conversation.js";
|
|
2
2
|
import { generateSpeech as _generateSpeech } from "../../generate-speech.js";
|
|
3
3
|
import type { WordTimestamp } from "../../timestamps.js";
|
|
4
|
-
/**
|
|
5
|
-
* Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
|
|
6
|
-
* var is set. No-op otherwise, so normal CI runs don't produce artifacts.
|
|
7
|
-
* Usually you don't need to call this directly — use the `generateSpeech`,
|
|
8
|
-
* `generateConversation`, and `collectStreamAndSave` helpers exported from
|
|
9
|
-
* this module, which autosave using the current test name.
|
|
10
|
-
*
|
|
11
|
-
* Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
|
|
12
|
-
* If the same test saves multiple times, subsequent files are suffixed `-2`,
|
|
13
|
-
* `-3`, etc.
|
|
14
|
-
*/
|
|
15
4
|
export declare function maybeSaveAudio(name: string, audio: {
|
|
16
5
|
uint8Array: Uint8Array;
|
|
17
6
|
mediaType: string;
|
|
18
7
|
}): Promise<void>;
|
|
19
|
-
/**
|
|
20
|
-
* Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
|
|
21
|
-
* writes the raw alignment JSON and rendered SRT/VTT caption files alongside
|
|
22
|
-
* the audio. All four files share the same stem so they stay paired across
|
|
23
|
-
* multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
|
|
24
|
-
*
|
|
25
|
-
* Output layout (when timestamps present):
|
|
26
|
-
* ```
|
|
27
|
-
* <dir>/<bucket>/<slug>.<audio-ext>
|
|
28
|
-
* <dir>/<bucket>/<slug>.timestamps.json
|
|
29
|
-
* <dir>/<bucket>/<slug>.srt
|
|
30
|
-
* <dir>/<bucket>/<slug>.vtt
|
|
31
|
-
* ```
|
|
32
|
-
*/
|
|
33
8
|
export declare function maybeSaveResult(name: string, audio: {
|
|
34
9
|
uint8Array: Uint8Array;
|
|
35
10
|
mediaType: string;
|
|
36
11
|
}, timestamps?: readonly WordTimestamp[]): Promise<void>;
|
|
37
|
-
/**
|
|
38
|
-
* Drop-in replacement for `generateSpeech` that autosaves to
|
|
39
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
40
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
41
|
-
* `.srt`, and `.vtt` files.
|
|
42
|
-
*/
|
|
43
12
|
export declare const generateSpeech: typeof _generateSpeech;
|
|
44
|
-
/**
|
|
45
|
-
* Drop-in replacement for `generateConversation` that autosaves to
|
|
46
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
47
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
48
|
-
* `.srt`, and `.vtt` files.
|
|
49
|
-
*/
|
|
50
13
|
export declare const generateConversation: typeof _generateConversation;
|
|
51
|
-
/**
|
|
52
|
-
* Collects a streamed `streamSpeech` result into bytes AND autosaves them to
|
|
53
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
|
|
54
|
-
* of `collectStream(result.audio)` in e2e tests.
|
|
55
|
-
*/
|
|
56
14
|
export declare function collectStreamAndSave(result: {
|
|
57
15
|
audio: ReadableStream<Uint8Array>;
|
|
58
16
|
mediaType: string;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;
|
|
1
|
+
{"version":3,"file":"_save-audio.d.ts","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC7E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AA6EzD,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,GACnD,OAAO,CAAC,IAAI,CAAC,CAEf;AAED,wBAAsB,eAAe,CACnC,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;IAAE,UAAU,EAAE,UAAU,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,EACpD,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,GACpC,OAAO,CAAC,IAAI,CAAC,CA8Bf;AAOD,eAAO,MAAM,cAAc,EAAE,OAAO,eAMR,CAAC;AAE7B,eAAO,MAAM,oBAAoB,EAAE,OAAO,qBAMR,CAAC;AAEnC,wBAAsB,oBAAoB,CAAC,MAAM,EAAE;IACjD,KAAK,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;CACnB,GAAG,OAAO,CAAC,UAAU,CAAC,CAOtB"}
|
|
@@ -49,12 +49,6 @@ function currentTestContext() {
|
|
|
49
49
|
testPath: state.testPath,
|
|
50
50
|
};
|
|
51
51
|
}
|
|
52
|
-
/**
|
|
53
|
-
* Derives the subdirectory for a given test file. e2e tests are named like
|
|
54
|
-
* `openai.e2e.test.ts` / `conversation-google.e2e.test.ts`; we strip the
|
|
55
|
-
* `.e2e.test.ts` suffix and use that as the per-provider bucket so a full run
|
|
56
|
-
* doesn't dump 100+ files into a single flat directory.
|
|
57
|
-
*/
|
|
58
52
|
function providerBucket(testPath) {
|
|
59
53
|
if (!testPath) {
|
|
60
54
|
return "unknown";
|
|
@@ -62,18 +56,7 @@ function providerBucket(testPath) {
|
|
|
62
56
|
const base = basename(testPath).replace(E2E_TEST_SUFFIX, "");
|
|
63
57
|
return slugify(base) || "unknown";
|
|
64
58
|
}
|
|
65
|
-
// Counter keyed by `${bucket}/${slug}` so multiple generate/stream calls
|
|
66
|
-
// within a single test don't overwrite each other. Vitest isolates modules
|
|
67
|
-
// per file, so this resets per test file — collisions are only meaningful
|
|
68
|
-
// within the same `it`.
|
|
69
59
|
const callCounts = new Map();
|
|
70
|
-
/**
|
|
71
|
-
* Reserves a filename stem (without extension) for the next save call.
|
|
72
|
-
* First call returns `slug`; subsequent calls return `slug-2`, `slug-3`, etc.
|
|
73
|
-
* A single stem is shared across all sibling outputs from one logical save
|
|
74
|
-
* (audio + timestamps + captions), so they remain paired even across
|
|
75
|
-
* multiple saves within the same test.
|
|
76
|
-
*/
|
|
77
60
|
function nextStem(bucket, slug) {
|
|
78
61
|
const key = `${bucket}/${slug}`;
|
|
79
62
|
const n = (callCounts.get(key) ?? 0) + 1;
|
|
@@ -84,34 +67,9 @@ async function writeAndLog(file, data) {
|
|
|
84
67
|
await writeFile(file, data);
|
|
85
68
|
console.log(`[e2e-save] wrote ${file}`);
|
|
86
69
|
}
|
|
87
|
-
/**
|
|
88
|
-
* Write a test-generated audio file to `SPEECH_SDK_E2E_OUTPUT_DIR` if the env
|
|
89
|
-
* var is set. No-op otherwise, so normal CI runs don't produce artifacts.
|
|
90
|
-
* Usually you don't need to call this directly — use the `generateSpeech`,
|
|
91
|
-
* `generateConversation`, and `collectStreamAndSave` helpers exported from
|
|
92
|
-
* this module, which autosave using the current test name.
|
|
93
|
-
*
|
|
94
|
-
* Output layout: `$SPEECH_SDK_E2E_OUTPUT_DIR/<provider-file>/<test-slug>.<ext>`.
|
|
95
|
-
* If the same test saves multiple times, subsequent files are suffixed `-2`,
|
|
96
|
-
* `-3`, etc.
|
|
97
|
-
*/
|
|
98
70
|
export async function maybeSaveAudio(name, audio) {
|
|
99
71
|
await maybeSaveResult(name, audio);
|
|
100
72
|
}
|
|
101
|
-
/**
|
|
102
|
-
* Like {@link maybeSaveAudio}, plus — when `timestamps` is non-empty — also
|
|
103
|
-
* writes the raw alignment JSON and rendered SRT/VTT caption files alongside
|
|
104
|
-
* the audio. All four files share the same stem so they stay paired across
|
|
105
|
-
* multi-call tests. Still a no-op when `SPEECH_SDK_E2E_OUTPUT_DIR` is unset.
|
|
106
|
-
*
|
|
107
|
-
* Output layout (when timestamps present):
|
|
108
|
-
* ```
|
|
109
|
-
* <dir>/<bucket>/<slug>.<audio-ext>
|
|
110
|
-
* <dir>/<bucket>/<slug>.timestamps.json
|
|
111
|
-
* <dir>/<bucket>/<slug>.srt
|
|
112
|
-
* <dir>/<bucket>/<slug>.vtt
|
|
113
|
-
* ```
|
|
114
|
-
*/
|
|
115
73
|
export async function maybeSaveResult(name, audio, timestamps) {
|
|
116
74
|
const dir = resolveOutputDir();
|
|
117
75
|
if (!dir) {
|
|
@@ -133,33 +91,16 @@ function currentTestSlug() {
|
|
|
133
91
|
const { currentTestName } = currentTestContext();
|
|
134
92
|
return slugify(currentTestName ?? "unnamed") || "unnamed";
|
|
135
93
|
}
|
|
136
|
-
/**
|
|
137
|
-
* Drop-in replacement for `generateSpeech` that autosaves to
|
|
138
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
139
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
140
|
-
* `.srt`, and `.vtt` files.
|
|
141
|
-
*/
|
|
142
94
|
export const generateSpeech = (async (options) => {
|
|
143
95
|
const result = await _generateSpeech(options);
|
|
144
96
|
await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
|
|
145
97
|
return result;
|
|
146
98
|
});
|
|
147
|
-
/**
|
|
148
|
-
* Drop-in replacement for `generateConversation` that autosaves to
|
|
149
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. When the
|
|
150
|
-
* result includes word timestamps, also writes paired `.timestamps.json`,
|
|
151
|
-
* `.srt`, and `.vtt` files.
|
|
152
|
-
*/
|
|
153
99
|
export const generateConversation = (async (options) => {
|
|
154
100
|
const result = await _generateConversation(options);
|
|
155
101
|
await maybeSaveResult(currentTestSlug(), result.audio, result.timestamps);
|
|
156
102
|
return result;
|
|
157
103
|
});
|
|
158
|
-
/**
|
|
159
|
-
* Collects a streamed `streamSpeech` result into bytes AND autosaves them to
|
|
160
|
-
* `SPEECH_SDK_E2E_OUTPUT_DIR` using the current vitest test name. Use in place
|
|
161
|
-
* of `collectStream(result.audio)` in e2e tests.
|
|
162
|
-
*/
|
|
163
104
|
export async function collectStreamAndSave(result) {
|
|
164
105
|
const bytes = await collectStream(result.audio);
|
|
165
106
|
await maybeSaveAudio(currentTestSlug(), {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"_save-audio.js","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAC3C,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,EAAE,oBAAoB,EAAE,MAAM,mBAAmB,CAAC;AACzD,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAE7E,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAErD,SAAS,MAAM,CAAC,SAAiB;IAC/B,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC5D,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,cAAc,GAAG,mBAAmB,CAAC;AAC3C,MAAM,+BAA+B,GAAG,UAAU,CAAC;AACnD,MAAM,eAAe,GAAG,+BAA+B,CAAC;AAExD,SAAS,OAAO,CAAC,IAAY;IAC3B,OAAO,IAAI;SACR,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC;SAC5B,OAAO,CAAC,+BAA+B,EAAE,EAAE,CAAC,CAAC;AAClD,CAAC;AAED,SAAS,gBAAgB;IACvB,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC;IAClD,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,IAAI,EAAE,EAAE,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AAChF,CAAC;AAED,SAAS,kBAAkB;IAIzB,yFAAyF;IACzF,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC;IAChC,OAAO;QACL,eAAe,EAAE,KAAK,CAAC,eAAe;QACtC,QAAQ,EAAE,KAAK,CAAC,QAAQ;KACzB,CAAC;AACJ,CAAC;AAED
|
|
1
|
+
{"version":3,"file":"_save-audio.js","sourceRoot":"","sources":["../../../src/__tests__/e2e/_save-audio.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAC3C,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,EAAE,oBAAoB,EAAE,MAAM,mBAAmB,CAAC;AACzD,OAAO,EAAE,oBAAoB,IAAI,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAC/F,OAAO,EAAE,cAAc,IAAI,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAE7E,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAErD,SAAS,MAAM,CAAC,SAAiB;IAC/B,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC5D,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC/B,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,cAAc,GAAG,mBAAmB,CAAC;AAC3C,MAAM,+BAA+B,GAAG,UAAU,CAAC;AACnD,MAAM,eAAe,GAAG,+BAA+B,CAAC;AAExD,SAAS,OAAO,CAAC,IAAY;IAC3B,OAAO,IAAI;SACR,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC;SAC5B,OAAO,CAAC,+BAA+B,EAAE,EAAE,CAAC,CAAC;AAClD,CAAC;AAED,SAAS,gBAAgB;IACvB,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC;IAClD,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,IAAI,CAAC;IACd,CAAC;IACD,OAAO,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,IAAI,EAAE,EAAE,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;AAChF,CAAC;AAED,SAAS,kBAAkB;IAIzB,yFAAyF;IACzF,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC;IAChC,OAAO;QACL,eAAe,EAAE,KAAK,CAAC,eAAe;QACtC,QAAQ,EAAE,KAAK,CAAC,QAAQ;KACzB,CAAC;AACJ,CAAC;AAED,SAAS,cAAc,CAAC,QAA4B;IAClD,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC,CAAC;IAC7D,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,SAAS,CAAC;AACpC,CAAC;AAED,MAAM,UAAU,
GAAG,IAAI,GAAG,EAAkB,CAAC;AAE7C,SAAS,QAAQ,CAAC,MAAc,EAAE,IAAY;IAC5C,MAAM,GAAG,GAAG,GAAG,MAAM,IAAI,IAAI,EAAE,CAAC;IAChC,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;IACzC,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;IACvB,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,IAAI,CAAC,EAAE,CAAC;AACzC,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,IAAY,EAAE,IAAyB;IAChE,MAAM,SAAS,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,EAAE,CAAC,CAAC;AAC1C,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAAY,EACZ,KAAoD;IAEpD,MAAM,eAAe,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;AACrC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,IAAY,EACZ,KAAoD,EACpD,UAAqC;IAErC,MAAM,GAAG,GAAG,gBAAgB,EAAE,CAAC;IAC/B,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO;IACT,CAAC;IACD,MAAM,EAAE,QAAQ,EAAE,GAAG,kBAAkB,EAAE,CAAC;IAC1C,MAAM,MAAM,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;IACxC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;IACpC,MAAM,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC5C,MAAM,IAAI,GAAG,QAAQ,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC;IAE7C,MAAM,WAAW,CACf,IAAI,CAAC,SAAS,EAAE,GAAG,IAAI,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,EAAE,CAAC,EACrD,KAAK,CAAC,UAAU,CACjB,CAAC;IAEF,IAAI,UAAU,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxC,MAAM,WAAW,CACf,IAAI,CAAC,SAAS,EAAE,GAAG,IAAI,kBAAkB,CAAC,EAC1C,GAAG,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAC3C,CAAC;QACF,MAAM,WAAW,CACf,IAAI,CAAC,SAAS,EAAE,GAAG,IAAI,MAAM,CAAC,EAC9B,oBAAoB,CAAC,UAAU,CAAC,CACjC,CAAC;QACF,MAAM,WAAW,CACf,IAAI,CAAC,SAAS,EAAE,GAAG,IAAI,MAAM,CAAC,EAC9B,oBAAoB,CAAC,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,CACpD,CAAC;IACJ,CAAC;AACH,CAAC;AAED,SAAS,eAAe;IACtB,MAAM,EAAE,eAAe,EAAE,GAAG,kBAAkB,EAAE,CAAC;IACjD,OAAO,OAAO,CAAC,eAAe,IAAI,SAAS,CAAC,IAAI,SAAS,CAAC;AAC5D,CAAC;AAED,MAAM,CAAC,MAAM,cAAc,GAA2B,CAAC,KAAK,EAC1D,OAA8C,EAC9C,EAAE;IACF,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,OAAO,CAAC,CAAC;IAC9C,MAAM,eAAe,CAAC,eAAe,EAAE,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IAC1E,OAAO,MAAM,CAAC;AAChB,CAAC,CAA2B,CAAC;AAE7B,MAAM
,CAAC,MAAM,oBAAoB,GAAiC,CAAC,KAAK,EACtE,OAAoD,EACpD,EAAE;IACF,MAAM,MAAM,GAAG,MAAM,qBAAqB,CAAC,OAAO,CAAC,CAAC;IACpD,MAAM,eAAe,CAAC,eAAe,EAAE,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IAC1E,OAAO,MAAM,CAAC;AAChB,CAAC,CAAiC,CAAC;AAEnC,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,MAG1C;IACC,MAAM,KAAK,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAChD,MAAM,cAAc,CAAC,eAAe,EAAE,EAAE;QACtC,UAAU,EAAE,KAAK;QACjB,SAAS,EAAE,MAAM,CAAC,SAAS;KAC5B,CAAC,CAAC;IACH,OAAO,KAAK,CAAC;AACf,CAAC"}
|
package/dist/audio-duration.d.ts
CHANGED
|
@@ -1,7 +1,2 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Compute audio duration in milliseconds from raw audio bytes.
|
|
3
|
-
* Uses mediabunny to parse the audio container (MP3, WAV, Ogg, FLAC, etc.)
|
|
4
|
-
* and extract duration. Returns undefined if parsing fails.
|
|
5
|
-
*/
|
|
6
1
|
export declare function computeAudioDuration(data: Uint8Array | string, mediaType: string): Promise<number | undefined>;
|
|
7
2
|
//# sourceMappingURL=audio-duration.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"audio-duration.d.ts","sourceRoot":"","sources":["../src/audio-duration.ts"],"names":[],"mappings":"AAEA
|
|
1
|
+
{"version":3,"file":"audio-duration.d.ts","sourceRoot":"","sources":["../src/audio-duration.ts"],"names":[],"mappings":"AAEA,wBAAsB,oBAAoB,CACxC,IAAI,EAAE,UAAU,GAAG,MAAM,EACzB,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,CAoB7B"}
|