@speech-sdk/core 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +227 -108
- package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
- package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
- package/dist/__tests__/e2e/_save-audio.js +0 -59
- package/dist/__tests__/e2e/_save-audio.js.map +1 -1
- package/dist/audio-decode.d.ts +7 -0
- package/dist/audio-decode.d.ts.map +1 -0
- package/dist/audio-decode.js +109 -0
- package/dist/audio-decode.js.map +1 -0
- package/dist/audio-duration.d.ts +0 -5
- package/dist/audio-duration.d.ts.map +1 -1
- package/dist/audio-duration.js +5 -21
- package/dist/audio-duration.js.map +1 -1
- package/dist/audio-output.d.ts +39 -0
- package/dist/audio-output.d.ts.map +1 -0
- package/dist/audio-output.js +111 -0
- package/dist/audio-output.js.map +1 -0
- package/dist/audio-utils.d.ts +2 -10
- package/dist/audio-utils.d.ts.map +1 -1
- package/dist/audio-utils.js +57 -15
- package/dist/audio-utils.js.map +1 -1
- package/dist/captions.d.ts +0 -108
- package/dist/captions.d.ts.map +1 -1
- package/dist/captions.js +8 -98
- package/dist/captions.js.map +1 -1
- package/dist/conversation/attribute-timestamps.d.ts +26 -0
- package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
- package/dist/conversation/attribute-timestamps.js +276 -0
- package/dist/conversation/attribute-timestamps.js.map +1 -0
- package/dist/conversation/dispatch.d.ts +5 -5
- package/dist/conversation/dispatch.d.ts.map +1 -1
- package/dist/conversation/dispatch.js +18 -8
- package/dist/conversation/dispatch.js.map +1 -1
- package/dist/conversation/errors.d.ts +3 -0
- package/dist/conversation/errors.d.ts.map +1 -1
- package/dist/conversation/errors.js +6 -0
- package/dist/conversation/errors.js.map +1 -1
- package/dist/conversation/pcm-concat.d.ts +0 -24
- package/dist/conversation/pcm-concat.d.ts.map +1 -1
- package/dist/conversation/pcm-concat.js +8 -183
- package/dist/conversation/pcm-concat.js.map +1 -1
- package/dist/conversation/proportional-fill.d.ts +10 -0
- package/dist/conversation/proportional-fill.d.ts.map +1 -0
- package/dist/conversation/proportional-fill.js +64 -0
- package/dist/conversation/proportional-fill.js.map +1 -0
- package/dist/conversation/silence-detection.d.ts +14 -0
- package/dist/conversation/silence-detection.d.ts.map +1 -0
- package/dist/conversation/silence-detection.js +52 -0
- package/dist/conversation/silence-detection.js.map +1 -0
- package/dist/conversation/stitch.d.ts +9 -6
- package/dist/conversation/stitch.d.ts.map +1 -1
- package/dist/conversation/stitch.js +72 -51
- package/dist/conversation/stitch.js.map +1 -1
- package/dist/conversation/types.d.ts +7 -37
- package/dist/conversation/types.d.ts.map +1 -1
- package/dist/conversation/validate.d.ts +1 -16
- package/dist/conversation/validate.d.ts.map +1 -1
- package/dist/conversation/validate.js +29 -29
- package/dist/conversation/validate.js.map +1 -1
- package/dist/default-stt-fallback.d.ts +3 -0
- package/dist/default-stt-fallback.d.ts.map +1 -0
- package/dist/default-stt-fallback.js +11 -0
- package/dist/default-stt-fallback.js.map +1 -0
- package/dist/derive-timestamps.d.ts +1 -5
- package/dist/derive-timestamps.d.ts.map +1 -1
- package/dist/derive-timestamps.js +1 -15
- package/dist/derive-timestamps.js.map +1 -1
- package/dist/encoders/mp3.d.ts +6 -0
- package/dist/encoders/mp3.d.ts.map +1 -0
- package/dist/encoders/mp3.js +54 -0
- package/dist/encoders/mp3.js.map +1 -0
- package/dist/errors.d.ts +20 -13
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +49 -15
- package/dist/errors.js.map +1 -1
- package/dist/generate-conversation.d.ts +5 -4
- package/dist/generate-conversation.d.ts.map +1 -1
- package/dist/generate-conversation.js +250 -93
- package/dist/generate-conversation.js.map +1 -1
- package/dist/generate-speech.d.ts +7 -28
- package/dist/generate-speech.d.ts.map +1 -1
- package/dist/generate-speech.js +185 -94
- package/dist/generate-speech.js.map +1 -1
- package/dist/index.d.ts +7 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -4
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/logger.js +2 -13
- package/dist/logger.js.map +1 -1
- package/dist/metadata.d.ts +0 -22
- package/dist/metadata.d.ts.map +1 -1
- package/dist/pronunciations/errors.d.ts +5 -0
- package/dist/pronunciations/errors.d.ts.map +1 -0
- package/dist/pronunciations/errors.js +8 -0
- package/dist/pronunciations/errors.js.map +1 -0
- package/dist/pronunciations/inverse-align.d.ts +4 -0
- package/dist/pronunciations/inverse-align.d.ts.map +1 -0
- package/dist/pronunciations/inverse-align.js +54 -0
- package/dist/pronunciations/inverse-align.js.map +1 -0
- package/dist/pronunciations/merge.d.ts +4 -0
- package/dist/pronunciations/merge.d.ts.map +1 -0
- package/dist/pronunciations/merge.js +13 -0
- package/dist/pronunciations/merge.js.map +1 -0
- package/dist/pronunciations/substitute.d.ts +6 -0
- package/dist/pronunciations/substitute.d.ts.map +1 -0
- package/dist/pronunciations/substitute.js +67 -0
- package/dist/pronunciations/substitute.js.map +1 -0
- package/dist/pronunciations/types.d.ts +18 -0
- package/dist/pronunciations/types.d.ts.map +1 -0
- package/dist/pronunciations/types.js +2 -0
- package/dist/pronunciations/types.js.map +1 -0
- package/dist/pronunciations/validate.d.ts +3 -0
- package/dist/pronunciations/validate.d.ts.map +1 -0
- package/dist/pronunciations/validate.js +26 -0
- package/dist/pronunciations/validate.js.map +1 -0
- package/dist/provider-utils.d.ts +4 -9
- package/dist/provider-utils.d.ts.map +1 -1
- package/dist/provider-utils.js +60 -51
- package/dist/provider-utils.js.map +1 -1
- package/dist/providers/cartesia/alignment.d.ts +0 -16
- package/dist/providers/cartesia/alignment.d.ts.map +1 -1
- package/dist/providers/cartesia/alignment.js +1 -6
- package/dist/providers/cartesia/alignment.js.map +1 -1
- package/dist/providers/cartesia/index.d.ts +29 -19
- package/dist/providers/cartesia/index.d.ts.map +1 -1
- package/dist/providers/cartesia/index.js +116 -80
- package/dist/providers/cartesia/index.js.map +1 -1
- package/dist/providers/deepgram/index.d.ts +23 -8
- package/dist/providers/deepgram/index.d.ts.map +1 -1
- package/dist/providers/deepgram/index.js +51 -18
- package/dist/providers/deepgram/index.js.map +1 -1
- package/dist/providers/elevenlabs/alignment.d.ts +7 -21
- package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
- package/dist/providers/elevenlabs/alignment.js +8 -9
- package/dist/providers/elevenlabs/alignment.js.map +1 -1
- package/dist/providers/elevenlabs/index.d.ts +14 -38
- package/dist/providers/elevenlabs/index.d.ts.map +1 -1
- package/dist/providers/elevenlabs/index.js +186 -169
- package/dist/providers/elevenlabs/index.js.map +1 -1
- package/dist/providers/fal/index.d.ts +11 -20
- package/dist/providers/fal/index.d.ts.map +1 -1
- package/dist/providers/fal/index.js +49 -37
- package/dist/providers/fal/index.js.map +1 -1
- package/dist/providers/fish-audio/index.d.ts +14 -8
- package/dist/providers/fish-audio/index.d.ts.map +1 -1
- package/dist/providers/fish-audio/index.js +47 -19
- package/dist/providers/fish-audio/index.js.map +1 -1
- package/dist/providers/gateway/index.d.ts +76 -0
- package/dist/providers/gateway/index.d.ts.map +1 -0
- package/dist/providers/gateway/index.js +251 -0
- package/dist/providers/gateway/index.js.map +1 -0
- package/dist/providers/google/index.d.ts +12 -20
- package/dist/providers/google/index.d.ts.map +1 -1
- package/dist/providers/google/index.js +180 -162
- package/dist/providers/google/index.js.map +1 -1
- package/dist/providers/hume/alignment.d.ts +30 -35
- package/dist/providers/hume/alignment.d.ts.map +1 -1
- package/dist/providers/hume/alignment.js +14 -8
- package/dist/providers/hume/alignment.js.map +1 -1
- package/dist/providers/hume/index.d.ts +16 -16
- package/dist/providers/hume/index.d.ts.map +1 -1
- package/dist/providers/hume/index.js +79 -65
- package/dist/providers/hume/index.js.map +1 -1
- package/dist/providers/inworld/alignment.d.ts +8 -22
- package/dist/providers/inworld/alignment.d.ts.map +1 -1
- package/dist/providers/inworld/alignment.js +9 -8
- package/dist/providers/inworld/alignment.js.map +1 -1
- package/dist/providers/inworld/index.d.ts +17 -20
- package/dist/providers/inworld/index.d.ts.map +1 -1
- package/dist/providers/inworld/index.js +79 -47
- package/dist/providers/inworld/index.js.map +1 -1
- package/dist/providers/mistral/index.d.ts +14 -8
- package/dist/providers/mistral/index.d.ts.map +1 -1
- package/dist/providers/mistral/index.js +63 -48
- package/dist/providers/mistral/index.js.map +1 -1
- package/dist/providers/murf/alignment.d.ts +10 -19
- package/dist/providers/murf/alignment.d.ts.map +1 -1
- package/dist/providers/murf/alignment.js +10 -5
- package/dist/providers/murf/alignment.js.map +1 -1
- package/dist/providers/murf/index.d.ts +15 -16
- package/dist/providers/murf/index.d.ts.map +1 -1
- package/dist/providers/murf/index.js +105 -58
- package/dist/providers/murf/index.js.map +1 -1
- package/dist/providers/openai/index.d.ts +43 -29
- package/dist/providers/openai/index.d.ts.map +1 -1
- package/dist/providers/openai/index.js +294 -106
- package/dist/providers/openai/index.js.map +1 -1
- package/dist/providers/resemble/alignment.d.ts +8 -29
- package/dist/providers/resemble/alignment.d.ts.map +1 -1
- package/dist/providers/resemble/alignment.js +9 -12
- package/dist/providers/resemble/alignment.js.map +1 -1
- package/dist/providers/resemble/index.d.ts +21 -11
- package/dist/providers/resemble/index.d.ts.map +1 -1
- package/dist/providers/resemble/index.js +89 -49
- package/dist/providers/resemble/index.js.map +1 -1
- package/dist/providers/smallest-ai/index.d.ts +47 -0
- package/dist/providers/smallest-ai/index.d.ts.map +1 -0
- package/dist/providers/smallest-ai/index.js +107 -0
- package/dist/providers/smallest-ai/index.js.map +1 -0
- package/dist/providers/xai/index.d.ts +25 -9
- package/dist/providers/xai/index.d.ts.map +1 -1
- package/dist/providers/xai/index.js +63 -40
- package/dist/providers/xai/index.js.map +1 -1
- package/dist/providers.d.ts +31 -0
- package/dist/providers.d.ts.map +1 -0
- package/dist/providers.js +16 -0
- package/dist/providers.js.map +1 -0
- package/dist/resolve-provider.d.ts.map +1 -1
- package/dist/resolve-provider.js +8 -51
- package/dist/resolve-provider.js.map +1 -1
- package/dist/retry-options.d.ts +6 -0
- package/dist/retry-options.d.ts.map +1 -0
- package/dist/retry-options.js +48 -0
- package/dist/retry-options.js.map +1 -0
- package/dist/speech-provider.d.ts +28 -53
- package/dist/speech-provider.d.ts.map +1 -1
- package/dist/speech-provider.js +5 -26
- package/dist/speech-provider.js.map +1 -1
- package/dist/speech-result.d.ts +8 -9
- package/dist/speech-result.d.ts.map +1 -1
- package/dist/speech-result.js.map +1 -1
- package/dist/speech-to-text-provider.d.ts +0 -12
- package/dist/speech-to-text-provider.d.ts.map +1 -1
- package/dist/stream-speech.d.ts +4 -2
- package/dist/stream-speech.d.ts.map +1 -1
- package/dist/stream-speech.js +36 -22
- package/dist/stream-speech.js.map +1 -1
- package/dist/timestamps.d.ts +3 -17
- package/dist/timestamps.d.ts.map +1 -1
- package/dist/turns.d.ts +9 -0
- package/dist/turns.d.ts.map +1 -0
- package/dist/turns.js +21 -0
- package/dist/turns.js.map +1 -0
- package/dist/types.d.ts +31 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/volume-adjust.d.ts +0 -6
- package/dist/volume-adjust.d.ts.map +1 -1
- package/dist/volume-adjust.js +4 -16
- package/dist/volume-adjust.js.map +1 -1
- package/package.json +13 -66
- package/dist/stt-providers/openai/index.d.ts +0 -42
- package/dist/stt-providers/openai/index.d.ts.map +0 -1
- package/dist/stt-providers/openai/index.js +0 -184
- package/dist/stt-providers/openai/index.js.map +0 -1
|
@@ -1,146 +1,159 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
1
2
|
import { stripAudioTags } from "../../audio-tags.js";
|
|
2
|
-
import { parseMediaTypeParam, wrapPcm16Mono } from "../../audio-utils.js";
|
|
3
|
+
import { base64ToUint8Array, parseMediaTypeParam, wrapPcm16Mono, } from "../../audio-utils.js";
|
|
3
4
|
import { SpeechSDKError } from "../../errors.js";
|
|
4
5
|
import { handleErrorResponse, resolveApiKey, SDK_USER_AGENT, } from "../../provider-utils.js";
|
|
5
6
|
import { hasFeature, } from "../../speech-provider.js";
|
|
7
|
+
// Both /generateContent endpoints share the same shape; tolerate missing intermediate fields for nullability differences.
|
|
8
|
+
const generateContentResponseSchema = z.object({
|
|
9
|
+
candidates: z
|
|
10
|
+
.array(z.object({
|
|
11
|
+
content: z
|
|
12
|
+
.object({
|
|
13
|
+
parts: z
|
|
14
|
+
.array(z.object({
|
|
15
|
+
inlineData: z
|
|
16
|
+
.object({ data: z.string(), mimeType: z.string() })
|
|
17
|
+
.optional(),
|
|
18
|
+
}))
|
|
19
|
+
.optional(),
|
|
20
|
+
})
|
|
21
|
+
.optional(),
|
|
22
|
+
}))
|
|
23
|
+
.optional(),
|
|
24
|
+
});
|
|
6
25
|
const DEFAULT_GEMINI_SAMPLE_RATE = 24_000;
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
26
|
+
export const GOOGLE_PROVIDER_ID = "google";
|
|
27
|
+
const GOOGLE_GEMINI_2_5_LANGUAGES = [
|
|
28
|
+
"en",
|
|
29
|
+
"fr",
|
|
30
|
+
"de",
|
|
31
|
+
"es",
|
|
32
|
+
"pt",
|
|
33
|
+
"zh",
|
|
34
|
+
"ja",
|
|
35
|
+
"ko",
|
|
36
|
+
"hi",
|
|
37
|
+
"it",
|
|
38
|
+
"nl",
|
|
39
|
+
"pl",
|
|
40
|
+
"ru",
|
|
41
|
+
"sv",
|
|
42
|
+
"tr",
|
|
43
|
+
"id",
|
|
44
|
+
"ar",
|
|
45
|
+
"cs",
|
|
46
|
+
"da",
|
|
47
|
+
"fi",
|
|
48
|
+
"el",
|
|
49
|
+
"hu",
|
|
50
|
+
"ro",
|
|
51
|
+
"uk",
|
|
52
|
+
];
|
|
53
|
+
const GOOGLE_GEMINI_3_1_LANGUAGES = [
|
|
54
|
+
"af",
|
|
55
|
+
"am",
|
|
56
|
+
"ar",
|
|
57
|
+
"az",
|
|
58
|
+
"be",
|
|
59
|
+
"bg",
|
|
60
|
+
"bn",
|
|
61
|
+
"ca",
|
|
62
|
+
"ceb",
|
|
63
|
+
"cmn",
|
|
64
|
+
"cs",
|
|
65
|
+
"da",
|
|
66
|
+
"de",
|
|
67
|
+
"el",
|
|
68
|
+
"en",
|
|
69
|
+
"es",
|
|
70
|
+
"et",
|
|
71
|
+
"eu",
|
|
72
|
+
"fa",
|
|
73
|
+
"fi",
|
|
74
|
+
"fil",
|
|
75
|
+
"fr",
|
|
76
|
+
"gl",
|
|
77
|
+
"gu",
|
|
78
|
+
"he",
|
|
79
|
+
"hi",
|
|
80
|
+
"hr",
|
|
81
|
+
"ht",
|
|
82
|
+
"hu",
|
|
83
|
+
"hy",
|
|
84
|
+
"id",
|
|
85
|
+
"is",
|
|
86
|
+
"it",
|
|
87
|
+
"ja",
|
|
88
|
+
"jv",
|
|
89
|
+
"ka",
|
|
90
|
+
"kn",
|
|
91
|
+
"ko",
|
|
92
|
+
"kok",
|
|
93
|
+
"la",
|
|
94
|
+
"lb",
|
|
95
|
+
"lo",
|
|
96
|
+
"lt",
|
|
97
|
+
"lv",
|
|
98
|
+
"mai",
|
|
99
|
+
"mg",
|
|
100
|
+
"mk",
|
|
101
|
+
"ml",
|
|
102
|
+
"mn",
|
|
103
|
+
"mr",
|
|
104
|
+
"ms",
|
|
105
|
+
"my",
|
|
106
|
+
"nb",
|
|
107
|
+
"ne",
|
|
108
|
+
"nl",
|
|
109
|
+
"nn",
|
|
110
|
+
"or",
|
|
111
|
+
"pa",
|
|
112
|
+
"pl",
|
|
113
|
+
"ps",
|
|
114
|
+
"pt",
|
|
115
|
+
"ro",
|
|
116
|
+
"ru",
|
|
117
|
+
"sd",
|
|
118
|
+
"si",
|
|
119
|
+
"sk",
|
|
120
|
+
"sl",
|
|
121
|
+
"sq",
|
|
122
|
+
"sr",
|
|
123
|
+
"sv",
|
|
124
|
+
"sw",
|
|
125
|
+
"ta",
|
|
126
|
+
"te",
|
|
127
|
+
"th",
|
|
128
|
+
"tr",
|
|
129
|
+
"uk",
|
|
130
|
+
"ur",
|
|
131
|
+
"vi",
|
|
132
|
+
];
|
|
133
|
+
export const GOOGLE_MODELS = [
|
|
134
|
+
{
|
|
135
|
+
id: "gemini-3.1-flash-tts-preview",
|
|
136
|
+
releaseDate: "2026-04-15",
|
|
137
|
+
languages: GOOGLE_GEMINI_3_1_LANGUAGES,
|
|
138
|
+
features: ["streaming", "audio-tags"],
|
|
139
|
+
},
|
|
140
|
+
{
|
|
141
|
+
id: "gemini-2.5-flash-preview-tts",
|
|
142
|
+
releaseDate: "2025-05-01",
|
|
143
|
+
languages: GOOGLE_GEMINI_2_5_LANGUAGES,
|
|
144
|
+
features: ["streaming"],
|
|
145
|
+
},
|
|
146
|
+
{
|
|
147
|
+
id: "gemini-2.5-pro-preview-tts",
|
|
148
|
+
releaseDate: "2025-05-01",
|
|
149
|
+
languages: GOOGLE_GEMINI_2_5_LANGUAGES,
|
|
150
|
+
features: ["streaming"],
|
|
151
|
+
},
|
|
152
|
+
];
|
|
15
153
|
export class GoogleSpeechProvider {
|
|
16
|
-
id =
|
|
154
|
+
id = GOOGLE_PROVIDER_ID;
|
|
17
155
|
defaultModel = "gemini-2.5-flash-preview-tts";
|
|
18
|
-
|
|
19
|
-
"en",
|
|
20
|
-
"fr",
|
|
21
|
-
"de",
|
|
22
|
-
"es",
|
|
23
|
-
"pt",
|
|
24
|
-
"zh",
|
|
25
|
-
"ja",
|
|
26
|
-
"ko",
|
|
27
|
-
"hi",
|
|
28
|
-
"it",
|
|
29
|
-
"nl",
|
|
30
|
-
"pl",
|
|
31
|
-
"ru",
|
|
32
|
-
"sv",
|
|
33
|
-
"tr",
|
|
34
|
-
"id",
|
|
35
|
-
"ar",
|
|
36
|
-
"cs",
|
|
37
|
-
"da",
|
|
38
|
-
"fi",
|
|
39
|
-
"el",
|
|
40
|
-
"hu",
|
|
41
|
-
"ro",
|
|
42
|
-
"uk",
|
|
43
|
-
];
|
|
44
|
-
static GEMINI_3_1_LANGUAGES = [
|
|
45
|
-
"af",
|
|
46
|
-
"am",
|
|
47
|
-
"ar",
|
|
48
|
-
"az",
|
|
49
|
-
"be",
|
|
50
|
-
"bg",
|
|
51
|
-
"bn",
|
|
52
|
-
"ca",
|
|
53
|
-
"ceb",
|
|
54
|
-
"cmn",
|
|
55
|
-
"cs",
|
|
56
|
-
"da",
|
|
57
|
-
"de",
|
|
58
|
-
"el",
|
|
59
|
-
"en",
|
|
60
|
-
"es",
|
|
61
|
-
"et",
|
|
62
|
-
"eu",
|
|
63
|
-
"fa",
|
|
64
|
-
"fi",
|
|
65
|
-
"fil",
|
|
66
|
-
"fr",
|
|
67
|
-
"gl",
|
|
68
|
-
"gu",
|
|
69
|
-
"he",
|
|
70
|
-
"hi",
|
|
71
|
-
"hr",
|
|
72
|
-
"ht",
|
|
73
|
-
"hu",
|
|
74
|
-
"hy",
|
|
75
|
-
"id",
|
|
76
|
-
"is",
|
|
77
|
-
"it",
|
|
78
|
-
"ja",
|
|
79
|
-
"jv",
|
|
80
|
-
"ka",
|
|
81
|
-
"kn",
|
|
82
|
-
"ko",
|
|
83
|
-
"kok",
|
|
84
|
-
"la",
|
|
85
|
-
"lb",
|
|
86
|
-
"lo",
|
|
87
|
-
"lt",
|
|
88
|
-
"lv",
|
|
89
|
-
"mai",
|
|
90
|
-
"mg",
|
|
91
|
-
"mk",
|
|
92
|
-
"ml",
|
|
93
|
-
"mn",
|
|
94
|
-
"mr",
|
|
95
|
-
"ms",
|
|
96
|
-
"my",
|
|
97
|
-
"nb",
|
|
98
|
-
"ne",
|
|
99
|
-
"nl",
|
|
100
|
-
"nn",
|
|
101
|
-
"or",
|
|
102
|
-
"pa",
|
|
103
|
-
"pl",
|
|
104
|
-
"ps",
|
|
105
|
-
"pt",
|
|
106
|
-
"ro",
|
|
107
|
-
"ru",
|
|
108
|
-
"sd",
|
|
109
|
-
"si",
|
|
110
|
-
"sk",
|
|
111
|
-
"sl",
|
|
112
|
-
"sq",
|
|
113
|
-
"sr",
|
|
114
|
-
"sv",
|
|
115
|
-
"sw",
|
|
116
|
-
"ta",
|
|
117
|
-
"te",
|
|
118
|
-
"th",
|
|
119
|
-
"tr",
|
|
120
|
-
"uk",
|
|
121
|
-
"ur",
|
|
122
|
-
"vi",
|
|
123
|
-
];
|
|
124
|
-
models = [
|
|
125
|
-
{
|
|
126
|
-
id: "gemini-3.1-flash-tts-preview",
|
|
127
|
-
releaseDate: "2026-04-15",
|
|
128
|
-
languages: GoogleSpeechProvider.GEMINI_3_1_LANGUAGES,
|
|
129
|
-
features: ["streaming", "audio-tags"],
|
|
130
|
-
},
|
|
131
|
-
{
|
|
132
|
-
id: "gemini-2.5-flash-preview-tts",
|
|
133
|
-
releaseDate: "2025-05-01",
|
|
134
|
-
languages: GoogleSpeechProvider.GEMINI_2_5_LANGUAGES,
|
|
135
|
-
features: ["streaming"],
|
|
136
|
-
},
|
|
137
|
-
{
|
|
138
|
-
id: "gemini-2.5-pro-preview-tts",
|
|
139
|
-
releaseDate: "2025-05-01",
|
|
140
|
-
languages: GoogleSpeechProvider.GEMINI_2_5_LANGUAGES,
|
|
141
|
-
features: ["streaming"],
|
|
142
|
-
},
|
|
143
|
-
];
|
|
156
|
+
models = GOOGLE_MODELS;
|
|
144
157
|
apiKey;
|
|
145
158
|
baseURL;
|
|
146
159
|
fetchFn;
|
|
@@ -150,9 +163,7 @@ export class GoogleSpeechProvider {
|
|
|
150
163
|
config.baseURL ?? "https://generativelanguage.googleapis.com/v1beta";
|
|
151
164
|
this.fetchFn = config.fetch ?? globalThis.fetch.bind(globalThis);
|
|
152
165
|
}
|
|
153
|
-
// Gemini 3.1 Flash TTS supports inline audio tags
|
|
154
|
-
// [shouting], [sighs], [laugh]) natively — pass them through verbatim.
|
|
155
|
-
// Older Gemini TTS models do not, so strip them with a warning.
|
|
166
|
+
// Gemini 3.1 Flash TTS supports inline audio tags natively; older models don't and need stripping.
|
|
156
167
|
processAudioTags(text, modelId) {
|
|
157
168
|
if (this.models.some((m) => m.id === modelId && hasFeature(m, "audio-tags"))) {
|
|
158
169
|
return { text, warnings: [] };
|
|
@@ -193,32 +204,23 @@ export class GoogleSpeechProvider {
|
|
|
193
204
|
body: JSON.stringify(body),
|
|
194
205
|
signal: options.abortSignal,
|
|
195
206
|
});
|
|
196
|
-
await handleErrorResponse(response
|
|
197
|
-
const json = (await response.json());
|
|
207
|
+
await handleErrorResponse(response);
|
|
208
|
+
const json = generateContentResponseSchema.parse(await response.json());
|
|
198
209
|
const part = json.candidates?.[0]?.content?.parts?.find((p) => p.inlineData != null);
|
|
199
210
|
if (!part?.inlineData) {
|
|
200
211
|
throw new Error("No audio data in Gemini TTS response");
|
|
201
212
|
}
|
|
202
|
-
// Gemini returns raw 16-bit mono PCM
|
|
203
|
-
// the audio is directly playable by any client.
|
|
213
|
+
// Gemini returns raw 16-bit mono PCM; wrap as WAV so callers can play it directly.
|
|
204
214
|
const sampleRate = parseMediaTypeParam(part.inlineData.mimeType ?? "", "rate") ??
|
|
205
215
|
DEFAULT_GEMINI_SAMPLE_RATE;
|
|
206
|
-
const pcm =
|
|
216
|
+
const pcm = base64ToUint8Array(part.inlineData.data);
|
|
207
217
|
const wav = await wrapPcm16Mono(pcm, sampleRate);
|
|
208
218
|
return {
|
|
209
219
|
audio: wav,
|
|
210
220
|
mediaType: "audio/wav",
|
|
211
221
|
};
|
|
212
222
|
}
|
|
213
|
-
// Gemini
|
|
214
|
-
// audio progressively — the server synthesizes the full clip, then flushes
|
|
215
|
-
// it in a single burst. Time-to-first-byte matches `generateContent`, and
|
|
216
|
-
// the user-perceived behavior is identical. Rather than duplicate the
|
|
217
|
-
// request logic and deal with SSE parsing + chunked WAV assembly, we
|
|
218
|
-
// delegate to `generate()` and wrap the result in a single-chunk
|
|
219
|
-
// ReadableStream. True progressive Gemini TTS is only available via the
|
|
220
|
-
// Live API (`bidiGenerateContent`, WebSocket) on native-audio models,
|
|
221
|
-
// which is a separate integration not wired up in this SDK.
|
|
223
|
+
// streamGenerateContent flushes the full clip in one burst; we wrap generate() output as a single-chunk stream. Progressive Gemini TTS requires the Live API (not wired up here).
|
|
222
224
|
async stream(options) {
|
|
223
225
|
const { audio, mediaType, providerMetadata } = await this.generate(options);
|
|
224
226
|
const stream = new ReadableStream({
|
|
@@ -231,22 +233,36 @@ export class GoogleSpeechProvider {
|
|
|
231
233
|
}
|
|
232
234
|
getStitchOptions(modelId) {
|
|
233
235
|
if (this.models.some((m) => m.id === modelId)) {
|
|
234
|
-
//
|
|
235
|
-
// returning to callers, so stitch decoding uses the WAV codepath.
|
|
236
|
+
// Provider wraps Gemini's raw PCM as WAV before returning; stitch decoding uses the WAV codepath.
|
|
236
237
|
return {
|
|
237
238
|
providerOptions: {},
|
|
238
239
|
mediaType: "audio/wav",
|
|
239
240
|
};
|
|
240
241
|
}
|
|
241
|
-
return
|
|
242
|
+
return;
|
|
243
|
+
}
|
|
244
|
+
resolveOutputFormat(modelId, output) {
|
|
245
|
+
if (!this.models.some((m) => m.id === modelId)) {
|
|
246
|
+
return;
|
|
247
|
+
}
|
|
248
|
+
// Gemini TTS endpoint has no format parameter — provider always wraps raw PCM as WAV.
|
|
249
|
+
// SDK conversion path handles pcm-unwrap and mp3-encode from the wav baseline.
|
|
250
|
+
if (output.format === "wav" ||
|
|
251
|
+
output.format === "pcm" ||
|
|
252
|
+
output.format === "mp3") {
|
|
253
|
+
return {
|
|
254
|
+
providerOptions: {},
|
|
255
|
+
expectedMediaType: "audio/wav",
|
|
256
|
+
};
|
|
257
|
+
}
|
|
258
|
+
return;
|
|
242
259
|
}
|
|
243
260
|
dialogueCapabilities(modelId) {
|
|
244
261
|
if (this.models.some((m) => m.id === modelId)) {
|
|
245
|
-
// Gemini multi-speaker TTS requires exactly 2 unique voices
|
|
246
|
-
// (empirically verified — API validator: "enabled_voices must equal 2").
|
|
262
|
+
// Gemini multi-speaker TTS requires exactly 2 unique voices (API validator: "enabled_voices must equal 2").
|
|
247
263
|
return { minVoices: 2, maxVoices: 2 };
|
|
248
264
|
}
|
|
249
|
-
return
|
|
265
|
+
return;
|
|
250
266
|
}
|
|
251
267
|
async generateDialogue(options) {
|
|
252
268
|
const apiKey = resolveApiKey(this.apiKey, "GOOGLE_API_KEY", "Google");
|
|
@@ -290,13 +306,13 @@ export class GoogleSpeechProvider {
|
|
|
290
306
|
body: JSON.stringify(body),
|
|
291
307
|
signal: options.abortSignal,
|
|
292
308
|
});
|
|
293
|
-
await handleErrorResponse(response
|
|
294
|
-
const json = (await response.json());
|
|
309
|
+
await handleErrorResponse(response);
|
|
310
|
+
const json = generateContentResponseSchema.parse(await response.json());
|
|
295
311
|
const part = json.candidates?.[0]?.content?.parts?.find((p) => p.inlineData?.data);
|
|
296
312
|
if (!part?.inlineData) {
|
|
297
313
|
throw new SpeechSDKError(`google/${options.modelId}: no inline audio in response`);
|
|
298
314
|
}
|
|
299
|
-
const pcm =
|
|
315
|
+
const pcm = base64ToUint8Array(part.inlineData.data);
|
|
300
316
|
const sampleRate = parseMediaTypeParam(part.inlineData.mimeType ?? "", "rate") ??
|
|
301
317
|
DEFAULT_GEMINI_SAMPLE_RATE;
|
|
302
318
|
const wav = await wrapPcm16Mono(pcm, sampleRate);
|
|
@@ -308,10 +324,12 @@ export class GoogleSpeechProvider {
|
|
|
308
324
|
}
|
|
309
325
|
export function createGoogle(config = {}) {
|
|
310
326
|
const provider = new GoogleSpeechProvider(config);
|
|
327
|
+
const fallbackSTT = config.fallbackSTT;
|
|
311
328
|
return function google(modelId) {
|
|
312
329
|
return {
|
|
313
330
|
provider,
|
|
314
331
|
modelId: modelId ?? provider.defaultModel,
|
|
332
|
+
...(fallbackSTT && { fallbackSTT }),
|
|
315
333
|
};
|
|
316
334
|
};
|
|
317
335
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/providers/google/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/providers/google/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EACL,kBAAkB,EAClB,mBAAmB,EACnB,aAAa,GACd,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EACL,mBAAmB,EACnB,aAAa,EACb,cAAc,GACf,MAAM,yBAAyB,CAAC;AACjC,OAAO,EACL,UAAU,GAIX,MAAM,0BAA0B,CAAC;AAGlC,0HAA0H;AAC1H,MAAM,6BAA6B,GAAG,CAAC,CAAC,MAAM,CAAC;IAC7C,UAAU,EAAE,CAAC;SACV,KAAK,CACJ,CAAC,CAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;aACP,MAAM,CAAC;YACN,KAAK,EAAE,CAAC;iBACL,KAAK,CACJ,CAAC,CAAC,MAAM,CAAC;gBACP,UAAU,EAAE,CAAC;qBACV,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;qBAClD,QAAQ,EAAE;aACd,CAAC,CACH;iBACA,QAAQ,EAAE;SACd,CAAC;aACD,QAAQ,EAAE;KACd,CAAC,CACH;SACA,QAAQ,EAAE;CACd,CAAC,CAAC;AAEH,MAAM,0BAA0B,GAAG,MAAM,CAAC;AAS1C,MAAM,CAAC,MAAM,kBAAkB,GAAG,QAAiB,CAAC;AAEpD,MAAM,2BAA2B,GAAG;IAClC,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;CACI,CAAC;AAEX,MAAM,2BAA2B,GAAG;IAClC,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;CACI,CAAC;AAEX,MAAM,CAAC,MAAM,aAAa,GAAyB;IACjD;QACE,EAAE,EAAE,8BAA8B;QAClC,WAAW,EAAE,YAAY;QACzB,SAAS,EAAE,2BAA2B;QACtC,QAAQ,EAAE,CAAC,WAAW,EAAE,YAAY,CAAC;KACtC;IACD;QACE,EAAE,EAAE,8BAA8B;QAClC,WAAW,EAAE,YAAY;QACzB,SAAS,EAAE,2BAA2B;QACtC,QAAQ,EAAE,CAAC,WAAW,CAAC;KACxB;IACD;QACE,EAAE,EAAE,4BAA4B;QAChC,WAAW,EAAE,YAAY;QACzB,SAAS,EAAE,2BAA2B;QACtC,QAAQ,EAAE,CAAC,WAAW,CAAC;KACxB;CACO,CAAC;AAEX,MAAM,OAAO,oBAAoB;IACtB,EAAE,GAAG,kBAAkB,CAAC;IACxB,YAAY,GAAG,8BAA8B,CAAC;IAE9C,MAAM,GAAG,aAAa,CAAC;IAEf,MAAM,CAAqB;IAC3B,OAAO,CAAS;IAChB,OAAO,CAA0B;IAElD,YAAY,MAAkC;QAC5C,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QAC5B,IAAI,CAAC,OAAO;YACV,MAAM,CAAC,OAAO,IAAI,kDAAkD,CAAC;QACvE,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,KAAK,IAAI,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IACnE,CAAC;IAED,mGAAmG;IACnG,gBAAgB,CACd,IAAY,EACZ,OAAe;QAEf,IACE,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,IAAI,UAAU,CAAC,CAAC,EAAE,YAAY,CAAC,CAAC,EACxE,CAAC;YACD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;QAChC,CAAC;QACD,OAAO,cAAc,CAAC,IAAI,EAAE,UAAU,OAAO,EAAE,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAOd;QAMC,MAAM,MAAM,GAAG,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,gBAAgB,EAAE,QAAQ,CAAC,CAAC;QAEtE,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,IAAI,MAAM,CAAC;QAE1C,MAAM,YAAY,GAA4B;YAC5C,YAAY,EAAE;gBACZ,qBAAqB,EAAE;oBACrB,UAAU,EAAE,SAAS;iBACtB;aACF;SACF,CAAC;QAEF,MAAM,IAAI,GAA4B;YACpC,QAAQ,EAAE;gBACR;oBACE,IAAI,EAAE,MAAM;oBACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,EAAE,CAAC;iBAChC;aACF;YACD,gBAAgB,EAAE;gBAChB,kBAAkB,EAAE,CAAC,OAAO,CAAC;gBAC7B,aAAa,EAAE,YAAY;gBAC3B,GAAG,OAAO,CAAC,eAAe;aAC3B;SACF,CAAC;QAEF,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,OAAO,WAAW,OAAO,CAAC,OAAO,wBAAwB,MAAM,EAAE,CAAC;QAEtF,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE;YACvC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;YAC1B,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CAAC,CAAC;QAEH,MAAM,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QAEpC,MAAM,IAAI,GAAG,6BAA6B,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;QAExE,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,CACrD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,IAAI,IAAI,CAC5B,CAAC;QAEF,IAAI,CAAC,IAAI,EAAE,UAAU,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,mFAAmF;QACnF,MAAM,UAAU,GACd,mBAAmB,CAAC,IAAI,CAAC,UAAU,CAAC,QAAQ,IAAI,EAAE,EAAE,MAAM,CAAC;YAC3D,0BAA0B,CAAC;QAC7B,MAAM,GAAG,GAAG,kBAAkB,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QACrD,MAAM,GAAG,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;QAEjD,OAAO;YACL,KAAK,EAAE,GAAG;YACV,SAAS,EAAE,WAAW;SACvB,CAAC;IACJ,CAAC;IAED,kLAAkL;IAClL,KAAK,CAAC,MAAM,CAAC,OAOZ;QAKC,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QAC5E,MAAM,MAAM,GAAG,IAAI,cAAc,CAAa;YAC5C,KAAK,CAAC,UAAU;gBACd,UAAU,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;gBAC1B,UAAU,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;SACF,CAAC,CAAC;QACH,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,gBAAgB,EAAE,CAAC;IACjD,CAAC;IAED,gBAAgB,CAAC,OAAe;QAC9B,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,EAAE,CAAC;YAC9C,kGAAkG;YAClG,OAAO;gBACL,eAAe,EAAE,EAAE;gBACnB,SAAS,EAAE,WAAW;aACvB,CAAC;QACJ,CAAC;QACD,OAAO;IACT,CAAC;IAED,mBAAmB,CAAC,OAAe,EAAE,MAAmB;QACtD,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,EAAE,CAAC;YAC/C,OAAO;QACT,CAAC;QACD,sFAAsF;QACtF,+EAA+E;QAC/E,IACE,MAAM,CAAC,MAAM,KAAK,KAAK;YACvB,MAAM,CAAC,MAAM,KAAK,KAAK;YACvB,MAAM,CAAC,MAAM,KAAK,KAAK,EACvB,CAAC;YACD,OAAO;gBACL,eAAe,EAAE,EAAE;gBACnB,iBAAiB,EAAE,WAAW;aAC/B,CAAC;QACJ,CAAC;QACD,OAAO;IACT,CAAC;IAED,oBAAoB,CAAC,OAAe;QAClC,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,EAAE,CAAC;YAC9C,4GAA4G;YAC5G,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;QACxC,CAAC;QACD,OAAO;IACT,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,OAMtB;QAKC,MAAM,MAAM,GAAG,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,gBAAgB,EAAE,QAAQ,CAAC,CAAC;QAEtE,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC;QAC/C,MAAM,QAAQ,GAAa,EAAE,CAAC;QAC9B,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YACjC,IAAI,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACzC,IAAI,CAAC,KAAK,EAAE,CAAC;gBACX,KAAK,GAAG,UAAU,YAAY,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBAC1C,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YACtC,CAAC;YACD,QAAQ,CAAC,IAAI,CAAC,GAAG,KAAK,KAAK,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;QAC1C,CAAC;QACD,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEjC,MAAM,mBAAmB,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAChE,CAAC,CAAC,SAAS,EAAE,OAAO,CAAC,EAAE,EAAE,CAAC,CAAC;YACzB,OAAO;YACP,YAAY,EAAE;gBACZ,qBAAqB,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE;aACjD;SACF,CAAC,CACH,CAAC;QAEF,MAAM,IAAI,GAA4B;YACpC,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;YAC/C,gBAAgB,EAAE;gBAChB,kBAAkB,EAAE,CAAC,OAAO,CAAC;gBAC7B,aAAa,EAAE;oBACb,0BAA0B,EAAE;wBAC1B,qBAAqB,EAAE,mBAAmB;qBAC3C;iBACF;gBACD,GAAG,OAAO,CAAC,eAAe;aAC3B;SACF,CAAC;QAEF,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,OAAO,WAAW,OAAO,CAAC,OAAO,wBAAwB,MAAM,EAAE,CAAC;QAEtF,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE;YACvC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC;YAC1B,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CAAC,CAAC;QAEH,MAAM,mBAAmB,CAAC,QAAQ,CAAC,CAAC;QAEpC,MAAM,IAAI,GAAG,6BAA6B,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;QACxE,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,CACrD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,EAAE,IAAI,CAC1B,CAAC;QACF,IAAI,CAAC,IAAI,EAAE,UAAU,EAAE,CAAC;YACtB,MAAM,IAAI,cAAc,CACtB,UAAU,OAAO,CAAC,OAAO,+BAA+B,CACzD,CAAC;QACJ,CAAC;QAED,MAAM,GAAG,GAAG,kBAAkB,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QACrD,MAAM,UAAU,GACd,mBAAmB,CAAC,IAAI,CAAC,UAAU,CAAC,QAAQ,IAAI,EAAE,EAAE,MAAM,CAAC;YAC3D,0BAA0B,CAAC;QAC7B,MAAM,GAAG,GAAG,MAAM,aAAa,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;QAEjD,OAAO;YACL,KAAK,EAAE,GAAG;YACV,SAAS,EAAE,WAAW;SACvB,CAAC;IACJ,CAAC;CACF;AAED,MAAM,UAAU,YAAY,CAAC,SAAqC,EAAE;IAClE,MAAM,QAAQ,GAAG,IAAI,oBAAoB,CAAC,MAAM,CAAC,CAAC;IAClD,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;IAEvC,OAAO,SAAS,MAAM,CAAC,OAAgB;QACrC,OAAO;YACL,QAAQ;YACR,OAAO,EAAE,OAAO,IAAI,QAAQ,CAAC,YAAY;YACzC,GAAG,CAAC,WAAW,IAAI,EAAE,WAAW,EAAE,CAAC;SACpC,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC"}
|
|
@@ -1,38 +1,33 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
1
2
|
import type { WordTimestamp } from "../../timestamps.js";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
* Hume `/v0/tts` response into a single word-level alignment array, filtering
|
|
32
|
-
* to `type: "word"` entries and converting milliseconds to seconds.
|
|
33
|
-
*
|
|
34
|
-
* Assumes the caller set `split_utterances: false` (and a single utterance),
|
|
35
|
-
* so segment-relative offsets don't need to be re-based against the full audio.
|
|
36
|
-
*/
|
|
3
|
+
export declare const humeTimestampSchema: z.ZodObject<{
|
|
4
|
+
text: z.ZodString;
|
|
5
|
+
time: z.ZodObject<{
|
|
6
|
+
begin: z.ZodNumber;
|
|
7
|
+
end: z.ZodNumber;
|
|
8
|
+
}, z.core.$strip>;
|
|
9
|
+
type: z.ZodEnum<{
|
|
10
|
+
word: "word";
|
|
11
|
+
phoneme: "phoneme";
|
|
12
|
+
}>;
|
|
13
|
+
}, z.core.$strip>;
|
|
14
|
+
export type HumeTimestamp = z.infer<typeof humeTimestampSchema>;
|
|
15
|
+
export declare const humeSnippetSchema: z.ZodObject<{
|
|
16
|
+
audio: z.ZodOptional<z.ZodString>;
|
|
17
|
+
id: z.ZodOptional<z.ZodString>;
|
|
18
|
+
text: z.ZodOptional<z.ZodString>;
|
|
19
|
+
timestamps: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
20
|
+
text: z.ZodString;
|
|
21
|
+
time: z.ZodObject<{
|
|
22
|
+
begin: z.ZodNumber;
|
|
23
|
+
end: z.ZodNumber;
|
|
24
|
+
}, z.core.$strip>;
|
|
25
|
+
type: z.ZodEnum<{
|
|
26
|
+
word: "word";
|
|
27
|
+
phoneme: "phoneme";
|
|
28
|
+
}>;
|
|
29
|
+
}, z.core.$strip>>>;
|
|
30
|
+
}, z.core.$strip>;
|
|
31
|
+
export type HumeSnippet = z.infer<typeof humeSnippetSchema>;
|
|
37
32
|
export declare function snippetsToWordTimestamps(snippets: readonly (readonly HumeSnippet[])[]): WordTimestamp[];
|
|
38
33
|
//# sourceMappingURL=alignment.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"alignment.d.ts","sourceRoot":"","sources":["../../../src/providers/hume/alignment.ts"],"names":[],"mappings":"AAAA,OAAO,
|
|
1
|
+
{"version":3,"file":"alignment.d.ts","sourceRoot":"","sources":["../../../src/providers/hume/alignment.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAGzD,eAAO,MAAM,mBAAmB;;;;;;;;;;iBAI9B,CAAC;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAEhE,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;iBAK5B,CAAC;AACH,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAG5D,wBAAgB,wBAAwB,CACtC,QAAQ,EAAE,SAAS,CAAC,SAAS,WAAW,EAAE,CAAC,EAAE,GAC5C,aAAa,EAAE,CAqBjB"}
|
|
@@ -1,11 +1,17 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
// Hume Octave-2 timestamp entry. time.begin/end are integer ms.
|
|
3
|
+
export const humeTimestampSchema = z.object({
|
|
4
|
+
text: z.string(),
|
|
5
|
+
time: z.object({ begin: z.number(), end: z.number() }),
|
|
6
|
+
type: z.enum(["word", "phoneme"]),
|
|
7
|
+
});
|
|
8
|
+
export const humeSnippetSchema = z.object({
|
|
9
|
+
audio: z.string().optional(),
|
|
10
|
+
id: z.string().optional(),
|
|
11
|
+
text: z.string().optional(),
|
|
12
|
+
timestamps: z.array(humeTimestampSchema).optional(),
|
|
13
|
+
});
|
|
14
|
+
// Assumes split_utterances: false so timestamps are relative to the full audio.
|
|
9
15
|
export function snippetsToWordTimestamps(snippets) {
|
|
10
16
|
const out = [];
|
|
11
17
|
for (const utterance of snippets) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"alignment.js","sourceRoot":"","sources":["../../../src/providers/hume/alignment.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"alignment.js","sourceRoot":"","sources":["../../../src/providers/hume/alignment.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,gEAAgE;AAChE,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC1C,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC;IACtD,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;CAClC,CAAC,CAAC;AAGH,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC,CAAC,MAAM,CAAC;IACxC,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC5B,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACzB,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC3B,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC,QAAQ,EAAE;CACpD,CAAC,CAAC;AAGH,gFAAgF;AAChF,MAAM,UAAU,wBAAwB,CACtC,QAA6C;IAE7C,MAAM,GAAG,GAAoB,EAAE,CAAC;IAChC,KAAK,MAAM,SAAS,IAAI,QAAQ,EAAE,CAAC;QACjC,KAAK,MAAM,OAAO,IAAI,SAAS,EAAE,CAAC;YAChC,MAAM,EAAE,GAAG,OAAO,CAAC,UAAU,CAAC;YAC9B,IAAI,CAAC,EAAE,EAAE,CAAC;gBACR,SAAS;YACX,CAAC;YACD,KAAK,MAAM,KAAK,IAAI,EAAE,EAAE,CAAC;gBACvB,IAAI,KAAK,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;oBAC1B,SAAS;gBACX,CAAC;gBACD,GAAG,CAAC,IAAI,CAAC;oBACP,IAAI,EAAE,KAAK,CAAC,IAAI;oBAChB,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,GAAG,IAAI;oBAC9B,GAAG,EAAE,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,IAAI;iBAC3B,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -1,27 +1,19 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type { AudioOutput } from "../../audio-output.js";
|
|
2
|
+
import type { ModelInfo, ResolvedModel, SpeechProvider } from "../../speech-provider.js";
|
|
3
|
+
import type { ResolvedSTTModel } from "../../speech-to-text-provider.js";
|
|
2
4
|
import type { WordTimestamp } from "../../timestamps.js";
|
|
3
5
|
export interface HumeSpeechProviderConfig {
|
|
4
6
|
apiKey?: string;
|
|
5
7
|
baseURL?: string;
|
|
8
|
+
fallbackSTT?: ResolvedSTTModel;
|
|
6
9
|
fetch?: typeof globalThis.fetch;
|
|
7
10
|
}
|
|
11
|
+
export declare const HUME_PROVIDER_ID: "hume";
|
|
12
|
+
export declare const HUME_MODELS: readonly ModelInfo[];
|
|
8
13
|
export declare class HumeSpeechProvider implements SpeechProvider<string, string> {
|
|
9
|
-
readonly id
|
|
14
|
+
readonly id: "hume";
|
|
10
15
|
readonly defaultModel = "octave-2";
|
|
11
|
-
readonly models: readonly [
|
|
12
|
-
readonly id: "octave-2";
|
|
13
|
-
readonly releaseDate: "2025-10-01";
|
|
14
|
-
readonly languages: readonly ["en", "fr", "de", "es", "pt", "ja", "ko", "hi", "it", "ar", "ru"];
|
|
15
|
-
readonly features: readonly ["streaming", "inline-voice-cloning", {
|
|
16
|
-
readonly id: "timestamps";
|
|
17
|
-
readonly mode: "native";
|
|
18
|
-
}];
|
|
19
|
-
}, {
|
|
20
|
-
readonly id: "octave-1";
|
|
21
|
-
readonly releaseDate: "2025-03-01";
|
|
22
|
-
readonly languages: readonly ["en"];
|
|
23
|
-
readonly features: readonly ["streaming"];
|
|
24
|
-
}];
|
|
16
|
+
readonly models: readonly ModelInfo[];
|
|
25
17
|
private readonly apiKey;
|
|
26
18
|
private readonly baseURL;
|
|
27
19
|
private readonly fetchFn;
|
|
@@ -62,6 +54,14 @@ export declare class HumeSpeechProvider implements SpeechProvider<string, string
|
|
|
62
54
|
};
|
|
63
55
|
mediaType: string;
|
|
64
56
|
} | undefined;
|
|
57
|
+
resolveOutputFormat(modelId: string, output: AudioOutput): {
|
|
58
|
+
providerOptions: {
|
|
59
|
+
format: {
|
|
60
|
+
type: string;
|
|
61
|
+
};
|
|
62
|
+
};
|
|
63
|
+
expectedMediaType: string;
|
|
64
|
+
} | undefined;
|
|
65
65
|
dialogueCapabilities(modelId: string): {
|
|
66
66
|
minVoices: number;
|
|
67
67
|
maxVoices: number;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/providers/hume/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/providers/hume/index.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAQzD,OAAO,KAAK,EACV,SAAS,EACT,aAAa,EACb,cAAc,EACf,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,kCAAkC,CAAC;AACzE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAczD,MAAM,WAAW,wBAAwB;IACvC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,gBAAgB,CAAC;IAC/B,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;CACjC;AAED,eAAO,MAAM,gBAAgB,EAAG,MAAe,CAAC;AAEhD,eAAO,MAAM,WAAW,EAAE,SAAS,SAAS,EAyBlC,CAAC;AAEX,qBAAa,kBAAmB,YAAW,cAAc,CAAC,MAAM,EAAE,MAAM,CAAC;IACvE,QAAQ,CAAC,EAAE,SAAoB;IAC/B,QAAQ,CAAC,YAAY,cAAc;IAEnC,QAAQ,CAAC,MAAM,uBAAe;IAE9B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAqB;IAC5C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA0B;gBAEtC,MAAM,EAAE,wBAAwB;IAM5C,OAAO,CAAC,cAAc;IAUhB,QAAQ,CAAC,OAAO,EAAE;QACtB,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,iBAAiB,CAAC,EAAE,OAAO,CAAC;KAC7B,GAAG,OAAO,CAAC;QACV,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC3C,UAAU,CAAC,EAAE,aAAa,EAAE,CAAC;KAC9B,CAAC;YA+CY,sBAAsB;IA0D9B,MAAM,CAAC,OAAO,EAAE;QACpB,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,MAAM,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;QACnC,SAAS,EAAE,MAAM,CAAC;QAClB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC;IA0CF,gBAAgB,CAAC,OAAO,EAAE,MAAM;;;;;;;;IAWhC,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW;;;;;;;;IAyBxD,oBAAoB,CAAC,OAAO,EAAE,MAAM;;;;IAQ9B,gBAAgB,CAAC,OAAO,EAAE;QAC9B,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,SAAS;YAAE,KAAK,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,EAAE,CAAC;QAClD,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC;CAoCH;AAED,wBAAgB,UAAU,CAAC,MAAM,GAAE,wBAA6B,IAIzC,UAAU,MAAM,KAAG,aAAa,CAAC,MAAM,CAAC,CAO9D"}
|