@speech-sdk/core 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +227 -108
- package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
- package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
- package/dist/__tests__/e2e/_save-audio.js +0 -59
- package/dist/__tests__/e2e/_save-audio.js.map +1 -1
- package/dist/audio-decode.d.ts +7 -0
- package/dist/audio-decode.d.ts.map +1 -0
- package/dist/audio-decode.js +109 -0
- package/dist/audio-decode.js.map +1 -0
- package/dist/audio-duration.d.ts +0 -5
- package/dist/audio-duration.d.ts.map +1 -1
- package/dist/audio-duration.js +5 -21
- package/dist/audio-duration.js.map +1 -1
- package/dist/audio-output.d.ts +39 -0
- package/dist/audio-output.d.ts.map +1 -0
- package/dist/audio-output.js +111 -0
- package/dist/audio-output.js.map +1 -0
- package/dist/audio-utils.d.ts +2 -10
- package/dist/audio-utils.d.ts.map +1 -1
- package/dist/audio-utils.js +57 -15
- package/dist/audio-utils.js.map +1 -1
- package/dist/captions.d.ts +0 -108
- package/dist/captions.d.ts.map +1 -1
- package/dist/captions.js +8 -98
- package/dist/captions.js.map +1 -1
- package/dist/conversation/attribute-timestamps.d.ts +26 -0
- package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
- package/dist/conversation/attribute-timestamps.js +276 -0
- package/dist/conversation/attribute-timestamps.js.map +1 -0
- package/dist/conversation/dispatch.d.ts +5 -5
- package/dist/conversation/dispatch.d.ts.map +1 -1
- package/dist/conversation/dispatch.js +18 -8
- package/dist/conversation/dispatch.js.map +1 -1
- package/dist/conversation/errors.d.ts +3 -0
- package/dist/conversation/errors.d.ts.map +1 -1
- package/dist/conversation/errors.js +6 -0
- package/dist/conversation/errors.js.map +1 -1
- package/dist/conversation/pcm-concat.d.ts +0 -24
- package/dist/conversation/pcm-concat.d.ts.map +1 -1
- package/dist/conversation/pcm-concat.js +8 -183
- package/dist/conversation/pcm-concat.js.map +1 -1
- package/dist/conversation/proportional-fill.d.ts +10 -0
- package/dist/conversation/proportional-fill.d.ts.map +1 -0
- package/dist/conversation/proportional-fill.js +64 -0
- package/dist/conversation/proportional-fill.js.map +1 -0
- package/dist/conversation/silence-detection.d.ts +14 -0
- package/dist/conversation/silence-detection.d.ts.map +1 -0
- package/dist/conversation/silence-detection.js +52 -0
- package/dist/conversation/silence-detection.js.map +1 -0
- package/dist/conversation/stitch.d.ts +9 -6
- package/dist/conversation/stitch.d.ts.map +1 -1
- package/dist/conversation/stitch.js +72 -51
- package/dist/conversation/stitch.js.map +1 -1
- package/dist/conversation/types.d.ts +7 -37
- package/dist/conversation/types.d.ts.map +1 -1
- package/dist/conversation/validate.d.ts +1 -16
- package/dist/conversation/validate.d.ts.map +1 -1
- package/dist/conversation/validate.js +29 -29
- package/dist/conversation/validate.js.map +1 -1
- package/dist/default-stt-fallback.d.ts +3 -0
- package/dist/default-stt-fallback.d.ts.map +1 -0
- package/dist/default-stt-fallback.js +11 -0
- package/dist/default-stt-fallback.js.map +1 -0
- package/dist/derive-timestamps.d.ts +1 -5
- package/dist/derive-timestamps.d.ts.map +1 -1
- package/dist/derive-timestamps.js +1 -15
- package/dist/derive-timestamps.js.map +1 -1
- package/dist/encoders/mp3.d.ts +6 -0
- package/dist/encoders/mp3.d.ts.map +1 -0
- package/dist/encoders/mp3.js +54 -0
- package/dist/encoders/mp3.js.map +1 -0
- package/dist/errors.d.ts +20 -13
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +49 -15
- package/dist/errors.js.map +1 -1
- package/dist/generate-conversation.d.ts +5 -4
- package/dist/generate-conversation.d.ts.map +1 -1
- package/dist/generate-conversation.js +250 -93
- package/dist/generate-conversation.js.map +1 -1
- package/dist/generate-speech.d.ts +7 -28
- package/dist/generate-speech.d.ts.map +1 -1
- package/dist/generate-speech.js +185 -94
- package/dist/generate-speech.js.map +1 -1
- package/dist/index.d.ts +7 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -4
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/logger.js +2 -13
- package/dist/logger.js.map +1 -1
- package/dist/metadata.d.ts +0 -22
- package/dist/metadata.d.ts.map +1 -1
- package/dist/pronunciations/errors.d.ts +5 -0
- package/dist/pronunciations/errors.d.ts.map +1 -0
- package/dist/pronunciations/errors.js +8 -0
- package/dist/pronunciations/errors.js.map +1 -0
- package/dist/pronunciations/inverse-align.d.ts +4 -0
- package/dist/pronunciations/inverse-align.d.ts.map +1 -0
- package/dist/pronunciations/inverse-align.js +54 -0
- package/dist/pronunciations/inverse-align.js.map +1 -0
- package/dist/pronunciations/merge.d.ts +4 -0
- package/dist/pronunciations/merge.d.ts.map +1 -0
- package/dist/pronunciations/merge.js +13 -0
- package/dist/pronunciations/merge.js.map +1 -0
- package/dist/pronunciations/substitute.d.ts +6 -0
- package/dist/pronunciations/substitute.d.ts.map +1 -0
- package/dist/pronunciations/substitute.js +67 -0
- package/dist/pronunciations/substitute.js.map +1 -0
- package/dist/pronunciations/types.d.ts +18 -0
- package/dist/pronunciations/types.d.ts.map +1 -0
- package/dist/pronunciations/types.js +2 -0
- package/dist/pronunciations/types.js.map +1 -0
- package/dist/pronunciations/validate.d.ts +3 -0
- package/dist/pronunciations/validate.d.ts.map +1 -0
- package/dist/pronunciations/validate.js +26 -0
- package/dist/pronunciations/validate.js.map +1 -0
- package/dist/provider-utils.d.ts +4 -9
- package/dist/provider-utils.d.ts.map +1 -1
- package/dist/provider-utils.js +60 -51
- package/dist/provider-utils.js.map +1 -1
- package/dist/providers/cartesia/alignment.d.ts +0 -16
- package/dist/providers/cartesia/alignment.d.ts.map +1 -1
- package/dist/providers/cartesia/alignment.js +1 -6
- package/dist/providers/cartesia/alignment.js.map +1 -1
- package/dist/providers/cartesia/index.d.ts +29 -19
- package/dist/providers/cartesia/index.d.ts.map +1 -1
- package/dist/providers/cartesia/index.js +116 -80
- package/dist/providers/cartesia/index.js.map +1 -1
- package/dist/providers/deepgram/index.d.ts +23 -8
- package/dist/providers/deepgram/index.d.ts.map +1 -1
- package/dist/providers/deepgram/index.js +51 -18
- package/dist/providers/deepgram/index.js.map +1 -1
- package/dist/providers/elevenlabs/alignment.d.ts +7 -21
- package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
- package/dist/providers/elevenlabs/alignment.js +8 -9
- package/dist/providers/elevenlabs/alignment.js.map +1 -1
- package/dist/providers/elevenlabs/index.d.ts +14 -38
- package/dist/providers/elevenlabs/index.d.ts.map +1 -1
- package/dist/providers/elevenlabs/index.js +186 -169
- package/dist/providers/elevenlabs/index.js.map +1 -1
- package/dist/providers/fal/index.d.ts +11 -20
- package/dist/providers/fal/index.d.ts.map +1 -1
- package/dist/providers/fal/index.js +49 -37
- package/dist/providers/fal/index.js.map +1 -1
- package/dist/providers/fish-audio/index.d.ts +14 -8
- package/dist/providers/fish-audio/index.d.ts.map +1 -1
- package/dist/providers/fish-audio/index.js +47 -19
- package/dist/providers/fish-audio/index.js.map +1 -1
- package/dist/providers/gateway/index.d.ts +76 -0
- package/dist/providers/gateway/index.d.ts.map +1 -0
- package/dist/providers/gateway/index.js +251 -0
- package/dist/providers/gateway/index.js.map +1 -0
- package/dist/providers/google/index.d.ts +12 -20
- package/dist/providers/google/index.d.ts.map +1 -1
- package/dist/providers/google/index.js +180 -162
- package/dist/providers/google/index.js.map +1 -1
- package/dist/providers/hume/alignment.d.ts +30 -35
- package/dist/providers/hume/alignment.d.ts.map +1 -1
- package/dist/providers/hume/alignment.js +14 -8
- package/dist/providers/hume/alignment.js.map +1 -1
- package/dist/providers/hume/index.d.ts +16 -16
- package/dist/providers/hume/index.d.ts.map +1 -1
- package/dist/providers/hume/index.js +79 -65
- package/dist/providers/hume/index.js.map +1 -1
- package/dist/providers/inworld/alignment.d.ts +8 -22
- package/dist/providers/inworld/alignment.d.ts.map +1 -1
- package/dist/providers/inworld/alignment.js +9 -8
- package/dist/providers/inworld/alignment.js.map +1 -1
- package/dist/providers/inworld/index.d.ts +17 -20
- package/dist/providers/inworld/index.d.ts.map +1 -1
- package/dist/providers/inworld/index.js +79 -47
- package/dist/providers/inworld/index.js.map +1 -1
- package/dist/providers/mistral/index.d.ts +14 -8
- package/dist/providers/mistral/index.d.ts.map +1 -1
- package/dist/providers/mistral/index.js +63 -48
- package/dist/providers/mistral/index.js.map +1 -1
- package/dist/providers/murf/alignment.d.ts +10 -19
- package/dist/providers/murf/alignment.d.ts.map +1 -1
- package/dist/providers/murf/alignment.js +10 -5
- package/dist/providers/murf/alignment.js.map +1 -1
- package/dist/providers/murf/index.d.ts +15 -16
- package/dist/providers/murf/index.d.ts.map +1 -1
- package/dist/providers/murf/index.js +105 -58
- package/dist/providers/murf/index.js.map +1 -1
- package/dist/providers/openai/index.d.ts +43 -29
- package/dist/providers/openai/index.d.ts.map +1 -1
- package/dist/providers/openai/index.js +294 -106
- package/dist/providers/openai/index.js.map +1 -1
- package/dist/providers/resemble/alignment.d.ts +8 -29
- package/dist/providers/resemble/alignment.d.ts.map +1 -1
- package/dist/providers/resemble/alignment.js +9 -12
- package/dist/providers/resemble/alignment.js.map +1 -1
- package/dist/providers/resemble/index.d.ts +21 -11
- package/dist/providers/resemble/index.d.ts.map +1 -1
- package/dist/providers/resemble/index.js +89 -49
- package/dist/providers/resemble/index.js.map +1 -1
- package/dist/providers/smallest-ai/index.d.ts +47 -0
- package/dist/providers/smallest-ai/index.d.ts.map +1 -0
- package/dist/providers/smallest-ai/index.js +107 -0
- package/dist/providers/smallest-ai/index.js.map +1 -0
- package/dist/providers/xai/index.d.ts +25 -9
- package/dist/providers/xai/index.d.ts.map +1 -1
- package/dist/providers/xai/index.js +63 -40
- package/dist/providers/xai/index.js.map +1 -1
- package/dist/providers.d.ts +31 -0
- package/dist/providers.d.ts.map +1 -0
- package/dist/providers.js +16 -0
- package/dist/providers.js.map +1 -0
- package/dist/resolve-provider.d.ts.map +1 -1
- package/dist/resolve-provider.js +8 -51
- package/dist/resolve-provider.js.map +1 -1
- package/dist/retry-options.d.ts +6 -0
- package/dist/retry-options.d.ts.map +1 -0
- package/dist/retry-options.js +48 -0
- package/dist/retry-options.js.map +1 -0
- package/dist/speech-provider.d.ts +28 -53
- package/dist/speech-provider.d.ts.map +1 -1
- package/dist/speech-provider.js +5 -26
- package/dist/speech-provider.js.map +1 -1
- package/dist/speech-result.d.ts +8 -9
- package/dist/speech-result.d.ts.map +1 -1
- package/dist/speech-result.js.map +1 -1
- package/dist/speech-to-text-provider.d.ts +0 -12
- package/dist/speech-to-text-provider.d.ts.map +1 -1
- package/dist/stream-speech.d.ts +4 -2
- package/dist/stream-speech.d.ts.map +1 -1
- package/dist/stream-speech.js +36 -22
- package/dist/stream-speech.js.map +1 -1
- package/dist/timestamps.d.ts +3 -17
- package/dist/timestamps.d.ts.map +1 -1
- package/dist/turns.d.ts +9 -0
- package/dist/turns.d.ts.map +1 -0
- package/dist/turns.js +21 -0
- package/dist/turns.js.map +1 -0
- package/dist/types.d.ts +31 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/volume-adjust.d.ts +0 -6
- package/dist/volume-adjust.d.ts.map +1 -1
- package/dist/volume-adjust.js +4 -16
- package/dist/volume-adjust.js.map +1 -1
- package/package.json +13 -66
- package/dist/stt-providers/openai/index.d.ts +0 -42
- package/dist/stt-providers/openai/index.d.ts.map +0 -1
- package/dist/stt-providers/openai/index.js +0 -184
- package/dist/stt-providers/openai/index.js.map +0 -1
|
@@ -1,40 +1,47 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
1
2
|
import { base64ToUint8Array } from "../../audio-utils.js";
|
|
2
3
|
import { SpeechSDKError } from "../../errors.js";
|
|
3
4
|
import { handleErrorResponse, resolveApiKey, SDK_USER_AGENT, } from "../../provider-utils.js";
|
|
4
|
-
import { snippetsToWordTimestamps } from "./alignment.js";
|
|
5
|
+
import { humeSnippetSchema, snippetsToWordTimestamps } from "./alignment.js";
|
|
6
|
+
const ttsResponseSchema = z.object({
|
|
7
|
+
generations: z
|
|
8
|
+
.array(z.object({
|
|
9
|
+
audio: z.string().optional(),
|
|
10
|
+
snippets: z.array(z.array(humeSnippetSchema)).optional(),
|
|
11
|
+
}))
|
|
12
|
+
.optional(),
|
|
13
|
+
});
|
|
14
|
+
export const HUME_PROVIDER_ID = "hume";
|
|
15
|
+
export const HUME_MODELS = [
|
|
16
|
+
{
|
|
17
|
+
id: "octave-2",
|
|
18
|
+
releaseDate: "2025-10-01",
|
|
19
|
+
languages: [
|
|
20
|
+
"en",
|
|
21
|
+
"fr",
|
|
22
|
+
"de",
|
|
23
|
+
"es",
|
|
24
|
+
"pt",
|
|
25
|
+
"ja",
|
|
26
|
+
"ko",
|
|
27
|
+
"hi",
|
|
28
|
+
"it",
|
|
29
|
+
"ar",
|
|
30
|
+
"ru",
|
|
31
|
+
],
|
|
32
|
+
features: ["streaming", "inline-voice-cloning", "timestamps"],
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
id: "octave-1",
|
|
36
|
+
releaseDate: "2025-03-01",
|
|
37
|
+
languages: ["en"],
|
|
38
|
+
features: ["streaming"],
|
|
39
|
+
},
|
|
40
|
+
];
|
|
5
41
|
export class HumeSpeechProvider {
|
|
6
|
-
id =
|
|
42
|
+
id = HUME_PROVIDER_ID;
|
|
7
43
|
defaultModel = "octave-2";
|
|
8
|
-
models =
|
|
9
|
-
{
|
|
10
|
-
id: "octave-2",
|
|
11
|
-
releaseDate: "2025-10-01",
|
|
12
|
-
languages: [
|
|
13
|
-
"en",
|
|
14
|
-
"fr",
|
|
15
|
-
"de",
|
|
16
|
-
"es",
|
|
17
|
-
"pt",
|
|
18
|
-
"ja",
|
|
19
|
-
"ko",
|
|
20
|
-
"hi",
|
|
21
|
-
"it",
|
|
22
|
-
"ar",
|
|
23
|
-
"ru",
|
|
24
|
-
],
|
|
25
|
-
features: [
|
|
26
|
-
"streaming",
|
|
27
|
-
"inline-voice-cloning",
|
|
28
|
-
{ id: "timestamps", mode: "native" },
|
|
29
|
-
],
|
|
30
|
-
},
|
|
31
|
-
{
|
|
32
|
-
id: "octave-1",
|
|
33
|
-
releaseDate: "2025-03-01",
|
|
34
|
-
languages: ["en"],
|
|
35
|
-
features: ["streaming"],
|
|
36
|
-
},
|
|
37
|
-
];
|
|
44
|
+
models = HUME_MODELS;
|
|
38
45
|
apiKey;
|
|
39
46
|
baseURL;
|
|
40
47
|
fetchFn;
|
|
@@ -50,7 +57,7 @@ export class HumeSpeechProvider {
|
|
|
50
57
|
if (modelId === "octave-1") {
|
|
51
58
|
return "1";
|
|
52
59
|
}
|
|
53
|
-
return
|
|
60
|
+
return;
|
|
54
61
|
}
|
|
55
62
|
async generate(options) {
|
|
56
63
|
const utterance = { text: options.text };
|
|
@@ -65,10 +72,7 @@ export class HumeSpeechProvider {
|
|
|
65
72
|
if (version != null) {
|
|
66
73
|
body.version = version;
|
|
67
74
|
}
|
|
68
|
-
// Native timestamps are only
|
|
69
|
-
// Hume returns alignment from the JSON `/v0/tts` endpoint — `/v0/tts/file`
|
|
70
|
-
// is bytes-only — so we route through it whenever the caller asks for
|
|
71
|
-
// word timing on a model that supports it.
|
|
75
|
+
// Native timestamps are Octave-2 only and only on JSON /v0/tts (/v0/tts/file is bytes-only).
|
|
72
76
|
if (options.includeTimestamps && version === "2") {
|
|
73
77
|
return this.generateWithTimestamps(options, body);
|
|
74
78
|
}
|
|
@@ -84,7 +88,7 @@ export class HumeSpeechProvider {
|
|
|
84
88
|
body: JSON.stringify(body),
|
|
85
89
|
signal: options.abortSignal,
|
|
86
90
|
});
|
|
87
|
-
await handleErrorResponse(response
|
|
91
|
+
await handleErrorResponse(response);
|
|
88
92
|
const arrayBuffer = await response.arrayBuffer();
|
|
89
93
|
const mediaType = response.headers.get("content-type") ?? "audio/mpeg";
|
|
90
94
|
return {
|
|
@@ -93,11 +97,7 @@ export class HumeSpeechProvider {
|
|
|
93
97
|
};
|
|
94
98
|
}
|
|
95
99
|
async generateWithTimestamps(options, baseBody) {
|
|
96
|
-
//
|
|
97
|
-
// utterance — its audio matches the top-level `generations[0].audio`
|
|
98
|
-
// byte-for-byte, so segment-relative timestamps line up with the audio
|
|
99
|
-
// we return. `include_timestamp_types: ["word"]` opts into word-level
|
|
100
|
-
// alignment (Hume defaults to none).
|
|
100
|
+
// split_utterances:false keeps one snippet per utterance so timestamps line up byte-for-byte with returned audio.
|
|
101
101
|
const body = {
|
|
102
102
|
...baseBody,
|
|
103
103
|
include_timestamp_types: ["word"],
|
|
@@ -115,8 +115,8 @@ export class HumeSpeechProvider {
|
|
|
115
115
|
body: JSON.stringify(body),
|
|
116
116
|
signal: options.abortSignal,
|
|
117
117
|
});
|
|
118
|
-
await handleErrorResponse(response
|
|
119
|
-
const payload = (await response.json());
|
|
118
|
+
await handleErrorResponse(response);
|
|
119
|
+
const payload = ttsResponseSchema.parse(await response.json());
|
|
120
120
|
const gen = payload.generations?.[0];
|
|
121
121
|
if (!gen?.audio) {
|
|
122
122
|
throw new SpeechSDKError(`hume/${options.modelId}: /v0/tts response missing generations[0].audio`);
|
|
@@ -125,8 +125,7 @@ export class HumeSpeechProvider {
|
|
|
125
125
|
const timestamps = gen.snippets
|
|
126
126
|
? snippetsToWordTimestamps(gen.snippets)
|
|
127
127
|
: undefined;
|
|
128
|
-
//
|
|
129
|
-
// Content-Type for the bytes — derive it from the requested format.
|
|
128
|
+
// No Content-Type for base64-in-JSON audio bytes; derive from requested format.
|
|
130
129
|
const format = (baseBody.format ?? {});
|
|
131
130
|
return {
|
|
132
131
|
audio,
|
|
@@ -159,7 +158,7 @@ export class HumeSpeechProvider {
|
|
|
159
158
|
body: JSON.stringify(body),
|
|
160
159
|
signal: options.abortSignal,
|
|
161
160
|
});
|
|
162
|
-
await handleErrorResponse(response
|
|
161
|
+
await handleErrorResponse(response);
|
|
163
162
|
if (!response.body) {
|
|
164
163
|
throw new Error(`hume/${options.modelId}: response has no body`);
|
|
165
164
|
}
|
|
@@ -170,26 +169,44 @@ export class HumeSpeechProvider {
|
|
|
170
169
|
}
|
|
171
170
|
getStitchOptions(modelId) {
|
|
172
171
|
if (this.models.some((m) => m.id === modelId)) {
|
|
173
|
-
// Hume Octave always
|
|
174
|
-
// API only accepts { type: "mp3" | "wav" | "pcm" } — there is no
|
|
175
|
-
// sample-rate option (verified against the Hume TS SDK's FormatPcm
|
|
176
|
-
// type and Hume's own 48 kHz "professional audio" claim). The
|
|
177
|
-
// response content-type omits the rate, so we declare it here for
|
|
178
|
-
// the stitch decoder.
|
|
172
|
+
// Hume Octave is always 48 kHz mono s16 PCM; /v0/tts/file has no rate option and the response omits it.
|
|
179
173
|
return {
|
|
180
174
|
providerOptions: { format: { type: "pcm" } },
|
|
181
175
|
mediaType: "audio/pcm;rate=48000",
|
|
182
176
|
};
|
|
183
177
|
}
|
|
184
|
-
return
|
|
178
|
+
return;
|
|
179
|
+
}
|
|
180
|
+
resolveOutputFormat(modelId, output) {
|
|
181
|
+
if (!this.models.some((m) => m.id === modelId)) {
|
|
182
|
+
return;
|
|
183
|
+
}
|
|
184
|
+
switch (output.format) {
|
|
185
|
+
case "wav":
|
|
186
|
+
return {
|
|
187
|
+
providerOptions: { format: { type: "wav" } },
|
|
188
|
+
expectedMediaType: "audio/wav",
|
|
189
|
+
};
|
|
190
|
+
case "mp3":
|
|
191
|
+
return {
|
|
192
|
+
providerOptions: { format: { type: "mp3" } },
|
|
193
|
+
expectedMediaType: "audio/mpeg",
|
|
194
|
+
};
|
|
195
|
+
case "pcm":
|
|
196
|
+
return {
|
|
197
|
+
providerOptions: { format: { type: "pcm" } },
|
|
198
|
+
expectedMediaType: "audio/pcm;rate=48000",
|
|
199
|
+
};
|
|
200
|
+
default:
|
|
201
|
+
return;
|
|
202
|
+
}
|
|
185
203
|
}
|
|
186
204
|
dialogueCapabilities(modelId) {
|
|
187
205
|
if (this.models.some((m) => m.id === modelId)) {
|
|
188
|
-
// Hume
|
|
189
|
-
// voice ceiling (4) to stay conservative.
|
|
206
|
+
// Hume publishes no hard maximum; cap conservatively at SDK-wide unique-voice ceiling of 4.
|
|
190
207
|
return { minVoices: 1, maxVoices: 4 };
|
|
191
208
|
}
|
|
192
|
-
return
|
|
209
|
+
return;
|
|
193
210
|
}
|
|
194
211
|
async generateDialogue(options) {
|
|
195
212
|
const utterances = options.turns.map((t) => ({
|
|
@@ -216,7 +233,7 @@ export class HumeSpeechProvider {
|
|
|
216
233
|
body: JSON.stringify(body),
|
|
217
234
|
signal: options.abortSignal,
|
|
218
235
|
});
|
|
219
|
-
await handleErrorResponse(response
|
|
236
|
+
await handleErrorResponse(response);
|
|
220
237
|
const arrayBuffer = await response.arrayBuffer();
|
|
221
238
|
return {
|
|
222
239
|
audio: new Uint8Array(arrayBuffer),
|
|
@@ -226,19 +243,16 @@ export class HumeSpeechProvider {
|
|
|
226
243
|
}
|
|
227
244
|
export function createHume(config = {}) {
|
|
228
245
|
const provider = new HumeSpeechProvider(config);
|
|
246
|
+
const fallbackSTT = config.fallbackSTT;
|
|
229
247
|
return function hume(modelId) {
|
|
230
248
|
return {
|
|
231
249
|
provider,
|
|
232
250
|
modelId: modelId ?? provider.defaultModel,
|
|
251
|
+
...(fallbackSTT && { fallbackSTT }),
|
|
233
252
|
};
|
|
234
253
|
};
|
|
235
254
|
}
|
|
236
|
-
|
|
237
|
-
* Map a Hume `format.type` value to a standard media type. Used when decoding
|
|
238
|
-
* base64 audio from `/v0/tts`, which delivers bytes inside a JSON body with
|
|
239
|
-
* no Content-Type hint for the audio itself. PCM is always 48 kHz mono s16
|
|
240
|
-
* (Hume's only documented PCM mode).
|
|
241
|
-
*/
|
|
255
|
+
// Hume's PCM mode is always 48 kHz mono s16.
|
|
242
256
|
function humeFormatToMediaType(formatType) {
|
|
243
257
|
if (!formatType) {
|
|
244
258
|
return "audio/mpeg";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/providers/hume/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EACL,mBAAmB,EACnB,aAAa,EACb,cAAc,GACf,MAAM,yBAAyB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/providers/hume/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EACL,mBAAmB,EACnB,aAAa,EACb,cAAc,GACf,MAAM,yBAAyB,CAAC;AAQjC,OAAO,EAAE,iBAAiB,EAAE,wBAAwB,EAAE,MAAM,gBAAgB,CAAC;AAE7E,MAAM,iBAAiB,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,WAAW,EAAE,CAAC;SACX,KAAK,CACJ,CAAC,CAAC,MAAM,CAAC;QACP,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;QAC5B,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC,CAAC,QAAQ,EAAE;KACzD,CAAC,CACH;SACA,QAAQ,EAAE;CACd,CAAC,CAAC;AASH,MAAM,CAAC,MAAM,gBAAgB,GAAG,MAAe,CAAC;AAEhD,MAAM,CAAC,MAAM,WAAW,GAAyB;IAC/C;QACE,EAAE,EAAE,UAAU;QACd,WAAW,EAAE,YAAY;QACzB,SAAS,EAAE;YACT,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;SACI;QACV,QAAQ,EAAE,CAAC,WAAW,EAAE,sBAAsB,EAAE,YAAY,CAAC;KAC9D;IACD;QACE,EAAE,EAAE,UAAU;QACd,WAAW,EAAE,YAAY;QACzB,SAAS,EAAE,CAAC,IAAI,CAAU;QAC1B,QAAQ,EAAE,CAAC,WAAW,CAAC;KACxB;CACO,CAAC;AAEX,MAAM,OAAO,kBAAkB;IACpB,EAAE,GAAG,gBAAgB,CAAC;IACtB,YAAY,GAAG,UAAU,CAAC;IAE1B,MAAM,GAAG,WAAW,CAAC;IAEb,MAAM,CAAqB;IAC3B,OAAO,CAAS;IAChB,OAAO,CAA0B;IAElD,YAAY,MAAgC;QAC1C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QAC5B,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,OAAO,IAAI,wBAAwB,CAAC;QAC1D,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,KAAK,IAAI,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IACnE,CAAC;IAEO,cAAc,CAAC,OAAe;QACpC,IAAI,OAAO,KAAK,UAAU,EAAE,CAAC;YAC3B,OAAO,GAAG,CAAC;QACb,CAAC;QACD,IAAI,OAAO,KAAK,UAAU,EAAE,CAAC;YAC3B,OAAO,GAAG,CAAC;QACb,CAAC;QACD,OAAO;IACT,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAQd;QAMC,MAAM,SAAS,GAA4B,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,EAAE,CAAC;QAClE,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,SAAS,CAAC,KAAK,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC;QACjE,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAErD,MAAM,IAAI,GAA4B;YACpC,GAAG,OAAO,CAAC,eAAe;YAC1B,UAAU,EAAE,CAAC,SAAS,CAAC;SACxB,CAAC;QAEF,IAAI,OAAO,IAAI,IAAI,EAAE,CAAC;YACpB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACzB,CAAC;QAED,6FAA6F;QAC7F,IAAI,OAAO,CAAC,iBAAiB,IAAI,OAAO,KAAK,GAAG,EAAE,CAAC;YACjD,OAAO,IAAI,CAAC,sBAAsB,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;QACpD,CAAC;QAED,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,OAAO,WAAW,CAAC;QAEvC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE;YACvC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,MAAM,CAAC;gBACpE,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;YAC1B,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CAAC,CAAC;QAEH,MAAM,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QAEpC,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC;QACjD,MAAM,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,YAAY,CAAC;QAEvE,OAAO;YACL,KAAK,EAAE,IAAI,UAAU,CAAC,WAAW,CAAC;YAClC,SAAS;SACV,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,sBAAsB,CAClC,OAKC,EACD,QAAiC;QAOjC,kHAAkH;QAClH,MAAM,IAAI,GAA4B;YACpC,GAAG,QAAQ;YACX,uBAAuB,EAAE,CAAC,MAAM,CAAC;YACjC,gBAAgB,EAAE,KAAK;SACxB,CAAC;QAEF,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,OAAO,MAAM,CAAC;QAClC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE;YACvC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,MAAM,CAAC;gBACpE,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;YAC1B,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CAAC,CAAC;QAEH,MAAM,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QAEpC,MAAM,OAAO,GAAG,iBAAiB,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/D,MAAM,GAAG,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC;QACrC,IAAI,CAAC,GAAG,EAAE,KAAK,EAAE,CAAC;YAChB,MAAM,IAAI,cAAc,CACtB,QAAQ,OAAO,CAAC,OAAO,iDAAiD,CACzE,CAAC;QACJ,CAAC;QAED,MAAM,KAAK,GAAG,kBAAkB,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAC5C,MAAM,UAAU,GAAG,GAAG,CAAC,QAAQ;YAC7B,CAAC,CAAC,wBAAwB,CAAC,GAAG,CAAC,QAAQ,CAAC;YACxC,CAAC,CAAC,SAAS,CAAC;QAEd,gFAAgF;QAChF,MAAM,MAAM,GAAG,CAAC,QAAQ,CAAC,MAAM,IAAI,EAAE,CAAsB,CAAC;QAC5D,OAAO;YACL,KAAK;YACL,SAAS,EAAE,qBAAqB,CAAC,MAAM,CAAC,IAAI,CAAC;YAC7C,UAAU;SACX,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,OAOZ;QAKC,MAAM,SAAS,GAA4B,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,EAAE,CAAC;QAClE,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,SAAS,CAAC,KAAK,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC;QACjE,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAErD,MAAM,IAAI,GAA4B;YACpC,GAAG,OAAO,CAAC,eAAe;YAC1B,UAAU,EAAE,CAAC,SAAS,CAAC;SACxB,CAAC;QACF,IAAI,OAAO,IAAI,IAAI,EAAE,CAAC;YACpB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACzB,CAAC;QAED,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,OAAO,kBAAkB,CAAC;QAE9C,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE;YACvC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,MAAM,CAAC;gBACpE,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;YAC1B,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CAAC,CAAC;QAEH,MAAM,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QAEpC,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnB,MAAM,IAAI,KAAK,CAAC,QAAQ,OAAO,CAAC,OAAO,wBAAwB,CAAC,CAAC;QACnE,CAAC;QAED,OAAO;YACL,MAAM,EAAE,QAAQ,CAAC,IAAI;YACrB,SAAS,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,YAAY;SAChE,CAAC;IACJ,CAAC;IAED,gBAAgB,CAAC,OAAe;QAC9B,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,EAAE,CAAC;YAC9C,wGAAwG;YACxG,OAAO;gBACL,eAAe,EAAE,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;gBAC5C,SAAS,EAAE,sBAAsB;aAClC,CAAC;QACJ,CAAC;QACD,OAAO;IACT,CAAC;IAED,mBAAmB,CAAC,OAAe,EAAE,MAAmB;QACtD,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,EAAE,CAAC;YAC/C,OAAO;QACT,CAAC;QACD,QAAQ,MAAM,CAAC,MAAM,EAAE,CAAC;YACtB,KAAK,KAAK;gBACR,OAAO;oBACL,eAAe,EAAE,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;oBAC5C,iBAAiB,EAAE,WAAW;iBAC/B,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,eAAe,EAAE,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;oBAC5C,iBAAiB,EAAE,YAAY;iBAChC,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,eAAe,EAAE,EAAE,MAAM,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;oBAC5C,iBAAiB,EAAE,sBAAsB;iBAC1C,CAAC;YACJ;gBACE,OAAO;QACX,CAAC;IACH,CAAC;IAED,oBAAoB,CAAC,OAAe;QAClC,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,EAAE,CAAC;YAC9C,4FAA4F;YAC5F,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;QACxC,CAAC;QACD,OAAO;IACT,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,OAMtB;QAKC,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC3C,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,KAAK,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE;SAC9C,CAAC,CAAC,CAAC;QAEJ,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QACrD,MAAM,IAAI,GAA4B;YACpC,GAAG,OAAO,CAAC,eAAe;YAC1B,UAAU;SACX,CAAC;QACF,IAAI,OAAO,IAAI,IAAI,EAAE,CAAC;YACpB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACzB,CAAC;QAED,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,OAAO,WAAW,CAAC;QACvC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE;YACvC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,gBAAgB,EAAE,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,MAAM,CAAC;gBACpE,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;YAC1B,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CAAC,CAAC;QAEH,MAAM,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QAEpC,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC;QACjD,OAAO;YACL,KAAK,EAAE,IAAI,UAAU,CAAC,WAAW,CAAC;YAClC,SAAS,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,YAAY;SAChE,CAAC;IACJ,CAAC;CACF;AAED,MAAM,UAAU,UAAU,CAAC,SAAmC,EAAE;IAC9D,MAAM,QAAQ,GAAG,IAAI,kBAAkB,CAAC,MAAM,CAAC,CAAC;IAChD,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;IAEvC,OAAO,SAAS,IAAI,CAAC,OAAgB;QACnC,OAAO;YACL,QAAQ;YACR,OAAO,EAAE,OAAO,IAAI,QAAQ,CAAC,YAAY;YACzC,GAAG,CAAC,WAAW,IAAI,EAAE,WAAW,EAAE,CAAC;SACpC,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC;AAED,6CAA6C;AAC7C,SAAS,qBAAqB,CAAC,UAA8B;IAC3D,IAAI,CAAC,UAAU,EAAE,CAAC;QAChB,OAAO,YAAY,CAAC;IACtB,CAAC;IACD,IAAI,UAAU,KAAK,KAAK,EAAE,CAAC;QACzB,OAAO,WAAW,CAAC;IACrB,CAAC;IACD,IAAI,UAAU,KAAK,KAAK,EAAE,CAAC;QACzB,OAAO,sBAAsB,CAAC;IAChC,CAAC;IACD,OAAO,YAAY,CAAC;AACtB,CAAC"}
|
|
@@ -1,25 +1,11 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
1
2
|
import type { WordTimestamp } from "../../timestamps.js";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
*
|
|
10
|
-
* `phoneticDetails` is only emitted by the 1.5 family and is unused by the
|
|
11
|
-
* SDK today — kept here for typing only.
|
|
12
|
-
*/
|
|
13
|
-
export interface InworldWordAlignment {
|
|
14
|
-
readonly phoneticDetails?: readonly unknown[];
|
|
15
|
-
readonly wordEndTimeSeconds: readonly number[];
|
|
16
|
-
readonly wordStartTimeSeconds: readonly number[];
|
|
17
|
-
readonly words: readonly string[];
|
|
18
|
-
}
|
|
19
|
-
/**
|
|
20
|
-
* Project Inworld's parallel word alignment arrays into the SDK's
|
|
21
|
-
* `WordTimestamp[]`. Skips entries past the shortest array length so a
|
|
22
|
-
* malformed response can't produce undefined start/end values.
|
|
23
|
-
*/
|
|
3
|
+
export declare const inworldWordAlignmentSchema: z.ZodObject<{
|
|
4
|
+
phoneticDetails: z.ZodOptional<z.ZodArray<z.ZodUnknown>>;
|
|
5
|
+
wordEndTimeSeconds: z.ZodArray<z.ZodNumber>;
|
|
6
|
+
wordStartTimeSeconds: z.ZodArray<z.ZodNumber>;
|
|
7
|
+
words: z.ZodArray<z.ZodString>;
|
|
8
|
+
}, z.core.$strip>;
|
|
9
|
+
export type InworldWordAlignment = z.infer<typeof inworldWordAlignmentSchema>;
|
|
24
10
|
export declare function wordAlignmentToWordTimestamps(alignment: InworldWordAlignment): WordTimestamp[];
|
|
25
11
|
//# sourceMappingURL=alignment.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"alignment.d.ts","sourceRoot":"","sources":["../../../src/providers/inworld/alignment.ts"],"names":[],"mappings":"AAAA,OAAO,
|
|
1
|
+
{"version":3,"file":"alignment.d.ts","sourceRoot":"","sources":["../../../src/providers/inworld/alignment.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAGzD,eAAO,MAAM,0BAA0B;;;;;iBAKrC,CAAC;AACH,MAAM,MAAM,oBAAoB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,0BAA0B,CAAC,CAAC;AAE9E,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,oBAAoB,GAC9B,aAAa,EAAE,CAkBjB"}
|
|
@@ -1,8 +1,11 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
// Inworld `timestampInfo.wordAlignment` (timestamp_type: "WORD"). Times in seconds.
|
|
3
|
+
export const inworldWordAlignmentSchema = z.object({
|
|
4
|
+
phoneticDetails: z.array(z.unknown()).optional(),
|
|
5
|
+
wordEndTimeSeconds: z.array(z.number()),
|
|
6
|
+
wordStartTimeSeconds: z.array(z.number()),
|
|
7
|
+
words: z.array(z.string()),
|
|
8
|
+
});
|
|
6
9
|
export function wordAlignmentToWordTimestamps(alignment) {
|
|
7
10
|
const len = Math.min(alignment.words.length, alignment.wordStartTimeSeconds.length, alignment.wordEndTimeSeconds.length);
|
|
8
11
|
const out = [];
|
|
@@ -10,9 +13,7 @@ export function wordAlignmentToWordTimestamps(alignment) {
|
|
|
10
13
|
const text = alignment.words[i];
|
|
11
14
|
const start = alignment.wordStartTimeSeconds[i];
|
|
12
15
|
const end = alignment.wordEndTimeSeconds[i];
|
|
13
|
-
//
|
|
14
|
-
// silence or punctuation — drop them so the SDK's word list only
|
|
15
|
-
// contains real words.
|
|
16
|
+
// Drop empty-string entries (Inworld emits them for silence/punctuation).
|
|
16
17
|
if (text == null || start == null || end == null || text.length === 0) {
|
|
17
18
|
continue;
|
|
18
19
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"alignment.js","sourceRoot":"","sources":["../../../src/providers/inworld/alignment.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"alignment.js","sourceRoot":"","sources":["../../../src/providers/inworld/alignment.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,oFAAoF;AACpF,MAAM,CAAC,MAAM,0BAA0B,GAAG,CAAC,CAAC,MAAM,CAAC;IACjD,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,QAAQ,EAAE;IAChD,kBAAkB,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;IACvC,oBAAoB,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;IACzC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;CAC3B,CAAC,CAAC;AAGH,MAAM,UAAU,6BAA6B,CAC3C,SAA+B;IAE/B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAClB,SAAS,CAAC,KAAK,CAAC,MAAM,EACtB,SAAS,CAAC,oBAAoB,CAAC,MAAM,EACrC,SAAS,CAAC,kBAAkB,CAAC,MAAM,CACpC,CAAC;IACF,MAAM,GAAG,GAAoB,EAAE,CAAC;IAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAChC,MAAM,KAAK,GAAG,SAAS,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC;QAChD,MAAM,GAAG,GAAG,SAAS,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC;QAC5C,0EAA0E;QAC1E,IAAI,IAAI,IAAI,IAAI,IAAI,KAAK,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtE,SAAS;QACX,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;IACjC,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -1,31 +1,19 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type { AudioOutput } from "../../audio-output.js";
|
|
2
|
+
import type { ModelInfo, ResolvedModel, SpeechProvider } from "../../speech-provider.js";
|
|
3
|
+
import type { ResolvedSTTModel } from "../../speech-to-text-provider.js";
|
|
2
4
|
import type { WordTimestamp } from "../../timestamps.js";
|
|
3
5
|
export interface InworldSpeechProviderConfig {
|
|
4
6
|
apiKey?: string;
|
|
5
7
|
baseURL?: string;
|
|
8
|
+
fallbackSTT?: ResolvedSTTModel;
|
|
6
9
|
fetch?: typeof globalThis.fetch;
|
|
7
10
|
}
|
|
11
|
+
export declare const INWORLD_PROVIDER_ID: "inworld";
|
|
12
|
+
export declare const INWORLD_MODELS: readonly ModelInfo[];
|
|
8
13
|
export declare class InworldSpeechProvider implements SpeechProvider<string, string> {
|
|
9
|
-
readonly id
|
|
14
|
+
readonly id: "inworld";
|
|
10
15
|
readonly defaultModel = "inworld-tts-1.5-max";
|
|
11
|
-
|
|
12
|
-
readonly models: readonly [{
|
|
13
|
-
readonly id: "inworld-tts-1.5-max";
|
|
14
|
-
readonly releaseDate: "2025-08-15";
|
|
15
|
-
readonly languages: readonly ["en", "es", "fr", "de", "it", "pt", "ja", "ko", "nl", "pl", "zh"];
|
|
16
|
-
readonly features: readonly ["streaming", {
|
|
17
|
-
readonly id: "timestamps";
|
|
18
|
-
readonly mode: "native";
|
|
19
|
-
}];
|
|
20
|
-
}, {
|
|
21
|
-
readonly id: "inworld-tts-1.5-mini";
|
|
22
|
-
readonly releaseDate: "2025-08-15";
|
|
23
|
-
readonly languages: readonly ["en", "es", "fr", "de", "it", "pt", "ja", "ko", "nl", "pl", "zh"];
|
|
24
|
-
readonly features: readonly ["streaming", {
|
|
25
|
-
readonly id: "timestamps";
|
|
26
|
-
readonly mode: "native";
|
|
27
|
-
}];
|
|
28
|
-
}];
|
|
16
|
+
readonly models: readonly ModelInfo[];
|
|
29
17
|
private readonly apiKey;
|
|
30
18
|
private readonly baseURL;
|
|
31
19
|
private readonly fetchFn;
|
|
@@ -66,6 +54,15 @@ export declare class InworldSpeechProvider implements SpeechProvider<string, str
|
|
|
66
54
|
};
|
|
67
55
|
mediaType: string;
|
|
68
56
|
} | undefined;
|
|
57
|
+
resolveOutputFormat(modelId: string, output: AudioOutput): {
|
|
58
|
+
providerOptions: {
|
|
59
|
+
audio_config: {
|
|
60
|
+
audio_encoding: string;
|
|
61
|
+
sample_rate_hertz: number;
|
|
62
|
+
};
|
|
63
|
+
};
|
|
64
|
+
expectedMediaType: string;
|
|
65
|
+
} | undefined;
|
|
69
66
|
}
|
|
70
67
|
export declare function createInworld(config?: InworldSpeechProviderConfig): (modelId?: string) => ResolvedModel<string>;
|
|
71
68
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/providers/inworld/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/providers/inworld/index.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAOzD,OAAO,KAAK,EACV,SAAS,EACT,aAAa,EACb,cAAc,EACf,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kCAAkC,CAAC;AACzE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAazD,MAAM,WAAW,2BAA2B;IAC1C,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,gBAAgB,CAAC;IAC/B,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;CACjC;AA0BD,eAAO,MAAM,mBAAmB,EAAG,SAAkB,CAAC;AAiBtD,eAAO,MAAM,cAAc,EAAE,SAAS,SAAS,EAarC,CAAC;AAEX,qBAAa,qBAAsB,YAAW,cAAc,CAAC,MAAM,EAAE,MAAM,CAAC;IAC1E,QAAQ,CAAC,EAAE,YAAuB;IAClC,QAAQ,CAAC,YAAY,yBAAyB;IAE9C,QAAQ,CAAC,MAAM,uBAAkB;IAEjC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAqB;IAC5C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA0B;gBAEtC,MAAM,EAAE,2BAA2B;IAM/C,OAAO,CAAC,SAAS;IA4BX,QAAQ,CAAC,OAAO,EAAE;QACtB,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,iBAAiB,CAAC,EAAE,OAAO,CAAC;KAC7B,GAAG,OAAO,CAAC;QACV,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC3C,UAAU,CAAC,EAAE,aAAa,EAAE,CAAC;KAC9B,CAAC;IAgDI,MAAM,CAAC,OAAO,EAAE;QACpB,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,MAAM,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;QACnC,SAAS,EAAE,MAAM,CAAC;QAClB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC;IAqCF,gBAAgB,CAAC,OAAO,EAAE,MAAM;;;;;;;;;IAehC,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW;;;;;;;;;CA+BzD;AAmGD,wBAAgB,aAAa,CAAC,MAAM,GAAE,2BAAgC,IAI5C,UAAU,MAAM,KAAG,aAAa,CAAC,MAAM,CAAC,CAOjE"}
|
|
@@ -1,5 +1,13 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import { base64ToUint8Array } from "../../audio-utils.js";
|
|
1
3
|
import { handleErrorResponse, resolveApiKey, SDK_USER_AGENT, } from "../../provider-utils.js";
|
|
2
|
-
import { wordAlignmentToWordTimestamps, } from "./alignment.js";
|
|
4
|
+
import { inworldWordAlignmentSchema, wordAlignmentToWordTimestamps, } from "./alignment.js";
|
|
5
|
+
const ttsResponseSchema = z.object({
|
|
6
|
+
audioContent: z.string().optional(),
|
|
7
|
+
timestampInfo: z
|
|
8
|
+
.object({ wordAlignment: inworldWordAlignmentSchema.optional() })
|
|
9
|
+
.optional(),
|
|
10
|
+
});
|
|
3
11
|
const DEFAULT_AUDIO_ENCODING = "MP3";
|
|
4
12
|
const DEFAULT_SAMPLE_RATE_HERTZ = 48_000;
|
|
5
13
|
function mediaTypeForEncoding(encoding) {
|
|
@@ -16,38 +24,39 @@ function mediaTypeForEncoding(encoding) {
|
|
|
16
24
|
return "audio/mpeg";
|
|
17
25
|
}
|
|
18
26
|
}
|
|
27
|
+
export const INWORLD_PROVIDER_ID = "inworld";
|
|
28
|
+
// https://docs.inworld.ai/tts/overview#supported-languages
|
|
29
|
+
const INWORLD_LANGUAGES = [
|
|
30
|
+
"en",
|
|
31
|
+
"es",
|
|
32
|
+
"fr",
|
|
33
|
+
"de",
|
|
34
|
+
"it",
|
|
35
|
+
"pt",
|
|
36
|
+
"ja",
|
|
37
|
+
"ko",
|
|
38
|
+
"nl",
|
|
39
|
+
"pl",
|
|
40
|
+
"zh",
|
|
41
|
+
];
|
|
42
|
+
export const INWORLD_MODELS = [
|
|
43
|
+
{
|
|
44
|
+
id: "inworld-tts-1.5-max",
|
|
45
|
+
releaseDate: "2025-08-15",
|
|
46
|
+
languages: INWORLD_LANGUAGES,
|
|
47
|
+
features: ["streaming", "timestamps"],
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
id: "inworld-tts-1.5-mini",
|
|
51
|
+
releaseDate: "2025-08-15",
|
|
52
|
+
languages: INWORLD_LANGUAGES,
|
|
53
|
+
features: ["streaming", "timestamps"],
|
|
54
|
+
},
|
|
55
|
+
];
|
|
19
56
|
export class InworldSpeechProvider {
|
|
20
|
-
id =
|
|
57
|
+
id = INWORLD_PROVIDER_ID;
|
|
21
58
|
defaultModel = "inworld-tts-1.5-max";
|
|
22
|
-
|
|
23
|
-
// https://docs.inworld.ai/tts/overview#supported-languages
|
|
24
|
-
static LANGUAGES = [
|
|
25
|
-
"en",
|
|
26
|
-
"es",
|
|
27
|
-
"fr",
|
|
28
|
-
"de",
|
|
29
|
-
"it",
|
|
30
|
-
"pt",
|
|
31
|
-
"ja",
|
|
32
|
-
"ko",
|
|
33
|
-
"nl",
|
|
34
|
-
"pl",
|
|
35
|
-
"zh",
|
|
36
|
-
];
|
|
37
|
-
models = [
|
|
38
|
-
{
|
|
39
|
-
id: "inworld-tts-1.5-max",
|
|
40
|
-
releaseDate: "2025-08-15",
|
|
41
|
-
languages: InworldSpeechProvider.LANGUAGES,
|
|
42
|
-
features: ["streaming", { id: "timestamps", mode: "native" }],
|
|
43
|
-
},
|
|
44
|
-
{
|
|
45
|
-
id: "inworld-tts-1.5-mini",
|
|
46
|
-
releaseDate: "2025-08-15",
|
|
47
|
-
languages: InworldSpeechProvider.LANGUAGES,
|
|
48
|
-
features: ["streaming", { id: "timestamps", mode: "native" }],
|
|
49
|
-
},
|
|
50
|
-
];
|
|
59
|
+
models = INWORLD_MODELS;
|
|
51
60
|
apiKey;
|
|
52
61
|
baseURL;
|
|
53
62
|
fetchFn;
|
|
@@ -92,8 +101,8 @@ export class InworldSpeechProvider {
|
|
|
92
101
|
body: JSON.stringify(body),
|
|
93
102
|
signal: options.abortSignal,
|
|
94
103
|
});
|
|
95
|
-
await handleErrorResponse(response
|
|
96
|
-
const json = (await response.json());
|
|
104
|
+
await handleErrorResponse(response);
|
|
105
|
+
const json = ttsResponseSchema.parse(await response.json());
|
|
97
106
|
if (!json.audioContent) {
|
|
98
107
|
throw new Error(`inworld/${options.modelId}: response missing audioContent`);
|
|
99
108
|
}
|
|
@@ -121,7 +130,7 @@ export class InworldSpeechProvider {
|
|
|
121
130
|
body: JSON.stringify(body),
|
|
122
131
|
signal: options.abortSignal,
|
|
123
132
|
});
|
|
124
|
-
await handleErrorResponse(response
|
|
133
|
+
await handleErrorResponse(response);
|
|
125
134
|
if (!response.body) {
|
|
126
135
|
throw new Error(`inworld/${options.modelId}: response has no body`);
|
|
127
136
|
}
|
|
@@ -142,16 +151,39 @@ export class InworldSpeechProvider {
|
|
|
142
151
|
mediaType: "audio/wav",
|
|
143
152
|
};
|
|
144
153
|
}
|
|
145
|
-
return
|
|
154
|
+
return;
|
|
146
155
|
}
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
156
|
+
resolveOutputFormat(modelId, output) {
|
|
157
|
+
if (!this.models.some((m) => m.id === modelId)) {
|
|
158
|
+
return;
|
|
159
|
+
}
|
|
160
|
+
switch (output.format) {
|
|
161
|
+
case "wav":
|
|
162
|
+
case "pcm":
|
|
163
|
+
// LINEAR16 returns 16-bit PCM with a WAV header; SDK unwraps for pcm.
|
|
164
|
+
return {
|
|
165
|
+
providerOptions: {
|
|
166
|
+
audio_config: {
|
|
167
|
+
audio_encoding: "LINEAR16",
|
|
168
|
+
sample_rate_hertz: 24_000,
|
|
169
|
+
},
|
|
170
|
+
},
|
|
171
|
+
expectedMediaType: "audio/wav",
|
|
172
|
+
};
|
|
173
|
+
case "mp3":
|
|
174
|
+
return {
|
|
175
|
+
providerOptions: {
|
|
176
|
+
audio_config: {
|
|
177
|
+
audio_encoding: "MP3",
|
|
178
|
+
sample_rate_hertz: 48_000,
|
|
179
|
+
},
|
|
180
|
+
},
|
|
181
|
+
expectedMediaType: "audio/mpeg",
|
|
182
|
+
};
|
|
183
|
+
default:
|
|
184
|
+
return;
|
|
185
|
+
}
|
|
153
186
|
}
|
|
154
|
-
return bytes;
|
|
155
187
|
}
|
|
156
188
|
function extractAudio(line) {
|
|
157
189
|
const trimmed = line.trim();
|
|
@@ -169,7 +201,7 @@ function extractAudio(line) {
|
|
|
169
201
|
if (!b64) {
|
|
170
202
|
return null;
|
|
171
203
|
}
|
|
172
|
-
return
|
|
204
|
+
return base64ToUint8Array(b64);
|
|
173
205
|
}
|
|
174
206
|
function emitLine(line, controller) {
|
|
175
207
|
const audio = extractAudio(line);
|
|
@@ -212,8 +244,7 @@ function parseInworldNdjsonStream(source, model) {
|
|
|
212
244
|
}
|
|
213
245
|
catch (err) {
|
|
214
246
|
const error = err instanceof Error ? err : new Error(`${model}: ${String(err)}`);
|
|
215
|
-
// Cancel
|
|
216
|
-
// locked / hanging. Swallow cancel errors — we already have `error`.
|
|
247
|
+
// Cancel upstream so the fetch body isn't left locked; swallow cancel errors.
|
|
217
248
|
reader.cancel(error).catch(() => {
|
|
218
249
|
/* noop */
|
|
219
250
|
});
|
|
@@ -221,18 +252,19 @@ function parseInworldNdjsonStream(source, model) {
|
|
|
221
252
|
}
|
|
222
253
|
},
|
|
223
254
|
cancel(reason) {
|
|
224
|
-
//
|
|
225
|
-
// body so the HTTP connection can be released.
|
|
255
|
+
// Propagate cancel to upstream fetch so the HTTP connection is released.
|
|
226
256
|
return reader.cancel(reason);
|
|
227
257
|
},
|
|
228
258
|
});
|
|
229
259
|
}
|
|
230
260
|
export function createInworld(config = {}) {
|
|
231
261
|
const provider = new InworldSpeechProvider(config);
|
|
262
|
+
const fallbackSTT = config.fallbackSTT;
|
|
232
263
|
return function inworld(modelId) {
|
|
233
264
|
return {
|
|
234
265
|
provider,
|
|
235
266
|
modelId: modelId ?? provider.defaultModel,
|
|
267
|
+
...(fallbackSTT && { fallbackSTT }),
|
|
236
268
|
};
|
|
237
269
|
};
|
|
238
270
|
}
|