@pie-players/tts-server-core 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +18 -15
- package/.turbo/turbo-build.log +0 -1
- package/src/cache.ts +0 -273
- package/src/index.ts +0 -50
- package/src/provider.ts +0 -200
- package/src/speech-marks.ts +0 -243
- package/src/types.ts +0 -425
- package/tsconfig.json +0 -20
package/package.json
CHANGED
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pie-players/tts-server-core",
|
|
3
|
-
"version": "0.1.0",
|
|
4
|
-
"
|
|
5
|
-
"type": "module",
|
|
3
|
+
"version": "0.1.3",
|
|
4
|
+
"author": "PIE Framework",
|
|
6
5
|
"main": "./dist/index.js",
|
|
7
|
-
"
|
|
6
|
+
"devDependencies": {
|
|
7
|
+
"typescript": "^5.3.3",
|
|
8
|
+
"vitest": "^1.0.4"
|
|
9
|
+
},
|
|
8
10
|
"exports": {
|
|
9
11
|
".": {
|
|
10
12
|
"types": "./dist/index.d.ts",
|
|
11
13
|
"import": "./dist/index.js"
|
|
12
14
|
}
|
|
13
15
|
},
|
|
14
|
-
"
|
|
15
|
-
|
|
16
|
-
"
|
|
17
|
-
|
|
18
|
-
"test:coverage": "vitest --coverage"
|
|
19
|
-
},
|
|
16
|
+
"description": "Core interfaces and types for server-side TTS providers",
|
|
17
|
+
"files": [
|
|
18
|
+
"dist"
|
|
19
|
+
],
|
|
20
20
|
"keywords": [
|
|
21
21
|
"tts",
|
|
22
22
|
"text-to-speech",
|
|
@@ -24,10 +24,13 @@
|
|
|
24
24
|
"server-side",
|
|
25
25
|
"speech-marks"
|
|
26
26
|
],
|
|
27
|
-
"author": "PIE Framework",
|
|
28
27
|
"license": "MIT",
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
|
|
28
|
+
"scripts": {
|
|
29
|
+
"build": "tsc",
|
|
30
|
+
"dev": "tsc --watch",
|
|
31
|
+
"test": "vitest run --passWithNoTests",
|
|
32
|
+
"test:coverage": "vitest --coverage"
|
|
33
|
+
},
|
|
34
|
+
"type": "module",
|
|
35
|
+
"types": "./dist/index.d.ts"
|
|
33
36
|
}
|
package/.turbo/turbo-build.log
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
$ tsc
|
package/src/cache.ts
DELETED
|
@@ -1,273 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Caching interface for TTS results
|
|
3
|
-
* @module @pie-players/tts-server-core
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import type { SynthesizeResponse } from "./types.js";
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Cache key components for TTS synthesis
|
|
10
|
-
*/
|
|
11
|
-
export interface CacheKeyComponents {
|
|
12
|
-
/** Provider identifier */
|
|
13
|
-
providerId: string;
|
|
14
|
-
|
|
15
|
-
/** Text to synthesize */
|
|
16
|
-
text: string;
|
|
17
|
-
|
|
18
|
-
/** Voice ID */
|
|
19
|
-
voice: string;
|
|
20
|
-
|
|
21
|
-
/** Language code */
|
|
22
|
-
language?: string;
|
|
23
|
-
|
|
24
|
-
/** Speech rate */
|
|
25
|
-
rate?: number;
|
|
26
|
-
|
|
27
|
-
/** Audio format */
|
|
28
|
-
format?: string;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
/**
|
|
32
|
-
* Cache interface for TTS providers
|
|
33
|
-
*/
|
|
34
|
-
export interface ITTSCache {
|
|
35
|
-
/**
|
|
36
|
-
* Get cached synthesis result
|
|
37
|
-
*
|
|
38
|
-
* @param key - Cache key
|
|
39
|
-
* @returns Cached result or null if not found
|
|
40
|
-
*/
|
|
41
|
-
get(key: string): Promise<SynthesizeResponse | null>;
|
|
42
|
-
|
|
43
|
-
/**
|
|
44
|
-
* Store synthesis result in cache
|
|
45
|
-
*
|
|
46
|
-
* @param key - Cache key
|
|
47
|
-
* @param value - Synthesis response to cache
|
|
48
|
-
* @param ttl - Time to live in seconds (optional)
|
|
49
|
-
*/
|
|
50
|
-
set(key: string, value: SynthesizeResponse, ttl?: number): Promise<void>;
|
|
51
|
-
|
|
52
|
-
/**
|
|
53
|
-
* Check if key exists in cache
|
|
54
|
-
*
|
|
55
|
-
* @param key - Cache key
|
|
56
|
-
* @returns True if key exists
|
|
57
|
-
*/
|
|
58
|
-
has(key: string): Promise<boolean>;
|
|
59
|
-
|
|
60
|
-
/**
|
|
61
|
-
* Delete cached result
|
|
62
|
-
*
|
|
63
|
-
* @param key - Cache key
|
|
64
|
-
*/
|
|
65
|
-
delete(key: string): Promise<void>;
|
|
66
|
-
|
|
67
|
-
/**
|
|
68
|
-
* Clear all cached results
|
|
69
|
-
*/
|
|
70
|
-
clear(): Promise<void>;
|
|
71
|
-
|
|
72
|
-
/**
|
|
73
|
-
* Get cache statistics
|
|
74
|
-
*/
|
|
75
|
-
getStats?(): Promise<CacheStats>;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Cache statistics
|
|
80
|
-
*/
|
|
81
|
-
export interface CacheStats {
|
|
82
|
-
/** Total cache hits */
|
|
83
|
-
hits: number;
|
|
84
|
-
|
|
85
|
-
/** Total cache misses */
|
|
86
|
-
misses: number;
|
|
87
|
-
|
|
88
|
-
/** Hit rate (0.0 to 1.0) */
|
|
89
|
-
hitRate: number;
|
|
90
|
-
|
|
91
|
-
/** Number of keys in cache */
|
|
92
|
-
keyCount: number;
|
|
93
|
-
|
|
94
|
-
/** Total size in bytes (if available) */
|
|
95
|
-
sizeBytes?: number;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
/**
|
|
99
|
-
* Generate cache key from components
|
|
100
|
-
*
|
|
101
|
-
* @param components - Cache key components
|
|
102
|
-
* @returns Cache key string
|
|
103
|
-
*/
|
|
104
|
-
export function generateCacheKey(components: CacheKeyComponents): string {
|
|
105
|
-
const {
|
|
106
|
-
providerId,
|
|
107
|
-
text,
|
|
108
|
-
voice,
|
|
109
|
-
language = "",
|
|
110
|
-
rate = 1.0,
|
|
111
|
-
format = "mp3",
|
|
112
|
-
} = components;
|
|
113
|
-
|
|
114
|
-
// Create deterministic key from components
|
|
115
|
-
const keyParts = [
|
|
116
|
-
"tts",
|
|
117
|
-
providerId,
|
|
118
|
-
voice,
|
|
119
|
-
language,
|
|
120
|
-
rate.toFixed(2),
|
|
121
|
-
format,
|
|
122
|
-
text,
|
|
123
|
-
];
|
|
124
|
-
|
|
125
|
-
// Use simple concatenation with delimiter
|
|
126
|
-
// In production, consider using a hash function for shorter keys
|
|
127
|
-
return keyParts.join(":");
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
/**
|
|
131
|
-
* Generate SHA-256 hash for cache key
|
|
132
|
-
* Useful for creating shorter keys from long text
|
|
133
|
-
*
|
|
134
|
-
* @param text - Text to hash
|
|
135
|
-
* @returns Hex string hash
|
|
136
|
-
*/
|
|
137
|
-
export async function hashText(text: string): Promise<string> {
|
|
138
|
-
// Use Web Crypto API (available in modern Node.js and browsers)
|
|
139
|
-
const encoder = new TextEncoder();
|
|
140
|
-
const data = encoder.encode(text);
|
|
141
|
-
const hashBuffer = await crypto.subtle.digest("SHA-256", data);
|
|
142
|
-
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
|
143
|
-
return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
/**
|
|
147
|
-
* Generate short cache key using hash
|
|
148
|
-
*
|
|
149
|
-
* @param components - Cache key components
|
|
150
|
-
* @returns Promise resolving to cache key
|
|
151
|
-
*/
|
|
152
|
-
export async function generateHashedCacheKey(
|
|
153
|
-
components: CacheKeyComponents,
|
|
154
|
-
): Promise<string> {
|
|
155
|
-
const {
|
|
156
|
-
providerId,
|
|
157
|
-
text,
|
|
158
|
-
voice,
|
|
159
|
-
language = "",
|
|
160
|
-
rate = 1.0,
|
|
161
|
-
format = "mp3",
|
|
162
|
-
} = components;
|
|
163
|
-
|
|
164
|
-
// Hash the text to keep key length reasonable
|
|
165
|
-
const textHash = await hashText(text);
|
|
166
|
-
|
|
167
|
-
const keyParts = [
|
|
168
|
-
"tts",
|
|
169
|
-
providerId,
|
|
170
|
-
voice,
|
|
171
|
-
language,
|
|
172
|
-
rate.toFixed(2),
|
|
173
|
-
format,
|
|
174
|
-
textHash,
|
|
175
|
-
];
|
|
176
|
-
|
|
177
|
-
return keyParts.join(":");
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
/**
|
|
181
|
-
* In-memory cache implementation
|
|
182
|
-
* Simple LRU cache for development/testing
|
|
183
|
-
*/
|
|
184
|
-
export class MemoryCache implements ITTSCache {
|
|
185
|
-
private cache = new Map<
|
|
186
|
-
string,
|
|
187
|
-
{ value: SynthesizeResponse; expires: number }
|
|
188
|
-
>();
|
|
189
|
-
private hits = 0;
|
|
190
|
-
private misses = 0;
|
|
191
|
-
private maxSize: number;
|
|
192
|
-
|
|
193
|
-
constructor(maxSize = 100) {
|
|
194
|
-
this.maxSize = maxSize;
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
async get(key: string): Promise<SynthesizeResponse | null> {
|
|
198
|
-
const entry = this.cache.get(key);
|
|
199
|
-
|
|
200
|
-
if (!entry) {
|
|
201
|
-
this.misses++;
|
|
202
|
-
return null;
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
// Check expiration
|
|
206
|
-
if (Date.now() > entry.expires) {
|
|
207
|
-
this.cache.delete(key);
|
|
208
|
-
this.misses++;
|
|
209
|
-
return null;
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
this.hits++;
|
|
213
|
-
|
|
214
|
-
// Update metadata to mark as served from cache
|
|
215
|
-
const result = { ...entry.value };
|
|
216
|
-
result.metadata = { ...result.metadata, cached: true };
|
|
217
|
-
|
|
218
|
-
return result;
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
async set(
|
|
222
|
-
key: string,
|
|
223
|
-
value: SynthesizeResponse,
|
|
224
|
-
ttl = 86400,
|
|
225
|
-
): Promise<void> {
|
|
226
|
-
// Enforce max size (simple LRU)
|
|
227
|
-
if (this.cache.size >= this.maxSize) {
|
|
228
|
-
// Delete oldest entry (first key)
|
|
229
|
-
const firstKey = this.cache.keys().next().value;
|
|
230
|
-
if (firstKey) {
|
|
231
|
-
this.cache.delete(firstKey);
|
|
232
|
-
}
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
this.cache.set(key, {
|
|
236
|
-
value,
|
|
237
|
-
expires: Date.now() + ttl * 1000,
|
|
238
|
-
});
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
async has(key: string): Promise<boolean> {
|
|
242
|
-
const entry = this.cache.get(key);
|
|
243
|
-
if (!entry) return false;
|
|
244
|
-
|
|
245
|
-
// Check expiration
|
|
246
|
-
if (Date.now() > entry.expires) {
|
|
247
|
-
this.cache.delete(key);
|
|
248
|
-
return false;
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
return true;
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
async delete(key: string): Promise<void> {
|
|
255
|
-
this.cache.delete(key);
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
async clear(): Promise<void> {
|
|
259
|
-
this.cache.clear();
|
|
260
|
-
this.hits = 0;
|
|
261
|
-
this.misses = 0;
|
|
262
|
-
}
|
|
263
|
-
|
|
264
|
-
async getStats(): Promise<CacheStats> {
|
|
265
|
-
const total = this.hits + this.misses;
|
|
266
|
-
return {
|
|
267
|
-
hits: this.hits,
|
|
268
|
-
misses: this.misses,
|
|
269
|
-
hitRate: total > 0 ? this.hits / total : 0,
|
|
270
|
-
keyCount: this.cache.size,
|
|
271
|
-
};
|
|
272
|
-
}
|
|
273
|
-
}
|
package/src/index.ts
DELETED
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Core types and interfaces for server-side TTS providers
|
|
3
|
-
* @module @pie-players/tts-server-core
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
// Export cache interfaces
|
|
7
|
-
export type {
|
|
8
|
-
CacheKeyComponents,
|
|
9
|
-
CacheStats,
|
|
10
|
-
ITTSCache,
|
|
11
|
-
} from "./cache.js";
|
|
12
|
-
export {
|
|
13
|
-
generateCacheKey,
|
|
14
|
-
generateHashedCacheKey,
|
|
15
|
-
hashText,
|
|
16
|
-
MemoryCache,
|
|
17
|
-
} from "./cache.js";
|
|
18
|
-
|
|
19
|
-
// Export provider interfaces
|
|
20
|
-
export type {
|
|
21
|
-
ITTSServerProvider,
|
|
22
|
-
TTSServerConfig,
|
|
23
|
-
} from "./provider.js";
|
|
24
|
-
|
|
25
|
-
export { BaseTTSProvider } from "./provider.js";
|
|
26
|
-
|
|
27
|
-
// Export speech marks utilities
|
|
28
|
-
export {
|
|
29
|
-
adjustSpeechMarksForRate,
|
|
30
|
-
estimateSpeechMarks,
|
|
31
|
-
filterSpeechMarksByType,
|
|
32
|
-
getSpeechMarkAtTime,
|
|
33
|
-
getSpeechMarksStats,
|
|
34
|
-
mergeSpeechMarks,
|
|
35
|
-
validateSpeechMarks,
|
|
36
|
-
} from "./speech-marks.js";
|
|
37
|
-
// Export types
|
|
38
|
-
export type {
|
|
39
|
-
GetVoicesOptions,
|
|
40
|
-
ServerProviderCapabilities,
|
|
41
|
-
SpeechMark,
|
|
42
|
-
StandardTTSParameters,
|
|
43
|
-
SynthesizeMetadata,
|
|
44
|
-
SynthesizeRequest,
|
|
45
|
-
SynthesizeResponse,
|
|
46
|
-
TTSProviderExtensions,
|
|
47
|
-
Voice,
|
|
48
|
-
VoiceFeatures,
|
|
49
|
-
} from "./types.js";
|
|
50
|
-
export { TTSError, TTSErrorCode } from "./types.js";
|
package/src/provider.ts
DELETED
|
@@ -1,200 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Server-side TTS Provider interface
|
|
3
|
-
* @module @pie-players/tts-server-core
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import type {
|
|
7
|
-
GetVoicesOptions,
|
|
8
|
-
ServerProviderCapabilities,
|
|
9
|
-
SynthesizeRequest,
|
|
10
|
-
SynthesizeResponse,
|
|
11
|
-
Voice,
|
|
12
|
-
} from "./types.js";
|
|
13
|
-
|
|
14
|
-
/**
|
|
15
|
-
* Base configuration for TTS providers
|
|
16
|
-
*/
|
|
17
|
-
export interface TTSServerConfig {
|
|
18
|
-
/** Provider-specific configuration */
|
|
19
|
-
[key: string]: unknown;
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
/**
|
|
23
|
-
* Server-side TTS Provider interface
|
|
24
|
-
*
|
|
25
|
-
* All server-side TTS providers must implement this interface.
|
|
26
|
-
* Providers handle synthesis requests and return audio with speech marks.
|
|
27
|
-
*
|
|
28
|
-
* ## Initialization Performance
|
|
29
|
-
*
|
|
30
|
-
* The `initialize()` method MUST be fast and lightweight:
|
|
31
|
-
* - Should only validate config and create API clients
|
|
32
|
-
* - MUST NOT fetch voices or make expensive API calls
|
|
33
|
-
* - MUST NOT perform test synthesis requests
|
|
34
|
-
*
|
|
35
|
-
* Use `getVoices()` explicitly when voice discovery is needed (e.g., in demo/admin UIs).
|
|
36
|
-
* Runtime synthesis should work with hardcoded voice IDs without querying available voices.
|
|
37
|
-
*
|
|
38
|
-
* @example Fast initialization (runtime)
|
|
39
|
-
* ```typescript
|
|
40
|
-
* const provider = new PollyServerProvider();
|
|
41
|
-
* await provider.initialize({ region: 'us-east-1', defaultVoice: 'Joanna' });
|
|
42
|
-
* // Ready to synthesize immediately - no voices query
|
|
43
|
-
* await provider.synthesize({ text: 'Hello', voice: 'Joanna' });
|
|
44
|
-
* ```
|
|
45
|
-
*
|
|
46
|
-
* @example Explicit voice discovery (admin/demo UIs)
|
|
47
|
-
* ```typescript
|
|
48
|
-
* const provider = new PollyServerProvider();
|
|
49
|
-
* await provider.initialize({ region: 'us-east-1' });
|
|
50
|
-
* const voices = await provider.getVoices(); // Explicit, separate call
|
|
51
|
-
* ```
|
|
52
|
-
*/
|
|
53
|
-
export interface ITTSServerProvider {
|
|
54
|
-
/**
|
|
55
|
-
* Unique provider identifier (e.g., 'aws-polly', 'google-cloud-tts')
|
|
56
|
-
*/
|
|
57
|
-
readonly providerId: string;
|
|
58
|
-
|
|
59
|
-
/**
|
|
60
|
-
* Human-readable provider name
|
|
61
|
-
*/
|
|
62
|
-
readonly providerName: string;
|
|
63
|
-
|
|
64
|
-
/**
|
|
65
|
-
* Provider version
|
|
66
|
-
*/
|
|
67
|
-
readonly version: string;
|
|
68
|
-
|
|
69
|
-
/**
|
|
70
|
-
* Initialize the provider with configuration.
|
|
71
|
-
*
|
|
72
|
-
* MUST be fast and lightweight - only validates config and creates clients.
|
|
73
|
-
* MUST NOT fetch voices or make expensive API calls during initialization.
|
|
74
|
-
*
|
|
75
|
-
* @param config - Provider-specific configuration
|
|
76
|
-
* @throws {TTSError} If initialization fails
|
|
77
|
-
* @performance Should complete in <100ms
|
|
78
|
-
*/
|
|
79
|
-
initialize(config: TTSServerConfig): Promise<void>;
|
|
80
|
-
|
|
81
|
-
/**
|
|
82
|
-
* Synthesize speech from text
|
|
83
|
-
*
|
|
84
|
-
* @param request - Synthesis request parameters
|
|
85
|
-
* @returns Audio data and speech marks
|
|
86
|
-
* @throws {TTSError} If synthesis fails
|
|
87
|
-
*/
|
|
88
|
-
synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse>;
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* Get available voices (explicit, secondary query).
|
|
92
|
-
*
|
|
93
|
-
* This is an EXPLICIT operation for voice discovery in demo/admin UIs.
|
|
94
|
-
* NOT called during initialization - call separately when needed.
|
|
95
|
-
*
|
|
96
|
-
* @param options - Optional filters for voices
|
|
97
|
-
* @returns List of available voices
|
|
98
|
-
* @throws {TTSError} If voice listing fails
|
|
99
|
-
* @note May take 200-500ms depending on provider
|
|
100
|
-
*/
|
|
101
|
-
getVoices(options?: GetVoicesOptions): Promise<Voice[]>;
|
|
102
|
-
|
|
103
|
-
/**
|
|
104
|
-
* Get provider capabilities (synchronous, fast).
|
|
105
|
-
*
|
|
106
|
-
* Returns static capability information without API calls.
|
|
107
|
-
*
|
|
108
|
-
* @returns Provider feature support
|
|
109
|
-
* @performance Should complete in <1ms (synchronous)
|
|
110
|
-
*/
|
|
111
|
-
getCapabilities(): ServerProviderCapabilities;
|
|
112
|
-
|
|
113
|
-
/**
|
|
114
|
-
* Clean up provider resources
|
|
115
|
-
* Called when provider is no longer needed
|
|
116
|
-
*/
|
|
117
|
-
destroy(): Promise<void>;
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
/**
|
|
121
|
-
* Abstract base class for TTS providers
|
|
122
|
-
* Provides common functionality and helpers
|
|
123
|
-
*/
|
|
124
|
-
export abstract class BaseTTSProvider implements ITTSServerProvider {
|
|
125
|
-
abstract readonly providerId: string;
|
|
126
|
-
abstract readonly providerName: string;
|
|
127
|
-
abstract readonly version: string;
|
|
128
|
-
|
|
129
|
-
protected config: TTSServerConfig = {};
|
|
130
|
-
protected initialized = false;
|
|
131
|
-
|
|
132
|
-
abstract initialize(config: TTSServerConfig): Promise<void>;
|
|
133
|
-
abstract synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse>;
|
|
134
|
-
abstract getVoices(options?: GetVoicesOptions): Promise<Voice[]>;
|
|
135
|
-
abstract getCapabilities(): ServerProviderCapabilities;
|
|
136
|
-
|
|
137
|
-
async destroy(): Promise<void> {
|
|
138
|
-
this.initialized = false;
|
|
139
|
-
this.config = {};
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
/**
|
|
143
|
-
* Ensure provider is initialized before operations
|
|
144
|
-
* @throws {TTSError} If provider not initialized
|
|
145
|
-
*/
|
|
146
|
-
protected ensureInitialized(): void {
|
|
147
|
-
if (!this.initialized) {
|
|
148
|
-
throw new Error(`Provider ${this.providerId} not initialized`);
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
/**
|
|
153
|
-
* Validate synthesis request
|
|
154
|
-
* @throws {TTSError} If request is invalid
|
|
155
|
-
*/
|
|
156
|
-
protected validateRequest(
|
|
157
|
-
request: SynthesizeRequest,
|
|
158
|
-
capabilities: ServerProviderCapabilities,
|
|
159
|
-
): void {
|
|
160
|
-
if (!request.text || request.text.trim().length === 0) {
|
|
161
|
-
throw new Error("Text is required and cannot be empty");
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
if (request.text.length > capabilities.standard.maxTextLength) {
|
|
165
|
-
throw new Error(
|
|
166
|
-
`Text length (${request.text.length}) exceeds maximum (${capabilities.standard.maxTextLength})`,
|
|
167
|
-
);
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
if (
|
|
171
|
-
request.format &&
|
|
172
|
-
!capabilities.extensions.supportedFormats.includes(request.format)
|
|
173
|
-
) {
|
|
174
|
-
throw new Error(
|
|
175
|
-
`Format '${request.format}' not supported. Supported formats: ${capabilities.extensions.supportedFormats.join(", ")}`,
|
|
176
|
-
);
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
if (
|
|
180
|
-
request.rate !== undefined &&
|
|
181
|
-
(request.rate < 0.25 || request.rate > 4.0)
|
|
182
|
-
) {
|
|
183
|
-
throw new Error("Rate must be between 0.25 and 4.0");
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
if (
|
|
187
|
-
request.pitch !== undefined &&
|
|
188
|
-
(request.pitch < -20 || request.pitch > 20)
|
|
189
|
-
) {
|
|
190
|
-
throw new Error("Pitch must be between -20 and 20");
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
if (
|
|
194
|
-
request.volume !== undefined &&
|
|
195
|
-
(request.volume < 0 || request.volume > 1)
|
|
196
|
-
) {
|
|
197
|
-
throw new Error("Volume must be between 0 and 1");
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
}
|
package/src/speech-marks.ts
DELETED
|
@@ -1,243 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Speech marks utilities
|
|
3
|
-
* @module @pie-players/tts-server-core
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import type { SpeechMark } from "./types.js";
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Estimate speech marks for text when provider doesn't support them
|
|
10
|
-
*
|
|
11
|
-
* Uses average speaking rate to estimate word timing.
|
|
12
|
-
* Not as accurate as provider-generated marks, but better than nothing.
|
|
13
|
-
*
|
|
14
|
-
* @param text - Text to generate marks for
|
|
15
|
-
* @param avgWordsPerMinute - Average speaking rate (default 150)
|
|
16
|
-
* @returns Estimated speech marks
|
|
17
|
-
*/
|
|
18
|
-
export function estimateSpeechMarks(
|
|
19
|
-
text: string,
|
|
20
|
-
avgWordsPerMinute = 150,
|
|
21
|
-
): SpeechMark[] {
|
|
22
|
-
const words = text.split(/\s+/).filter((w) => w.length > 0);
|
|
23
|
-
const msPerWord = (60 * 1000) / avgWordsPerMinute;
|
|
24
|
-
|
|
25
|
-
const marks: SpeechMark[] = [];
|
|
26
|
-
let charIndex = 0;
|
|
27
|
-
|
|
28
|
-
for (let i = 0; i < words.length; i++) {
|
|
29
|
-
const word = words[i];
|
|
30
|
-
|
|
31
|
-
// Find word position in original text (preserves spacing)
|
|
32
|
-
const wordStart = text.indexOf(word, charIndex);
|
|
33
|
-
if (wordStart === -1) {
|
|
34
|
-
// Word not found (shouldn't happen), skip
|
|
35
|
-
charIndex += word.length + 1;
|
|
36
|
-
continue;
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
marks.push({
|
|
40
|
-
time: Math.round(i * msPerWord),
|
|
41
|
-
type: "word",
|
|
42
|
-
start: wordStart,
|
|
43
|
-
end: wordStart + word.length,
|
|
44
|
-
value: word,
|
|
45
|
-
});
|
|
46
|
-
|
|
47
|
-
charIndex = wordStart + word.length;
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
return marks;
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
/**
|
|
54
|
-
* Adjust speech marks timing for different speaking rates
|
|
55
|
-
*
|
|
56
|
-
* @param marks - Original speech marks
|
|
57
|
-
* @param rate - Speech rate multiplier (0.25 to 4.0)
|
|
58
|
-
* @returns Adjusted speech marks
|
|
59
|
-
*/
|
|
60
|
-
export function adjustSpeechMarksForRate(
|
|
61
|
-
marks: SpeechMark[],
|
|
62
|
-
rate: number,
|
|
63
|
-
): SpeechMark[] {
|
|
64
|
-
if (rate === 1.0) {
|
|
65
|
-
return marks; // No adjustment needed
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
return marks.map((mark) => ({
|
|
69
|
-
...mark,
|
|
70
|
-
time: Math.round(mark.time / rate),
|
|
71
|
-
}));
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
/**
|
|
75
|
-
* Validate speech marks
|
|
76
|
-
* Ensures marks are properly ordered and have valid data
|
|
77
|
-
*
|
|
78
|
-
* @param marks - Speech marks to validate
|
|
79
|
-
* @returns Validation errors (empty array if valid)
|
|
80
|
-
*/
|
|
81
|
-
export function validateSpeechMarks(marks: SpeechMark[]): string[] {
|
|
82
|
-
const errors: string[] = [];
|
|
83
|
-
|
|
84
|
-
if (!marks || marks.length === 0) {
|
|
85
|
-
return errors; // Empty is valid
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
for (let i = 0; i < marks.length; i++) {
|
|
89
|
-
const mark = marks[i];
|
|
90
|
-
|
|
91
|
-
// Check required fields
|
|
92
|
-
if (typeof mark.time !== "number" || mark.time < 0) {
|
|
93
|
-
errors.push(`Mark ${i}: invalid time (${mark.time})`);
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
if (typeof mark.start !== "number" || mark.start < 0) {
|
|
97
|
-
errors.push(`Mark ${i}: invalid start (${mark.start})`);
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
if (typeof mark.end !== "number" || mark.end <= mark.start) {
|
|
101
|
-
errors.push(`Mark ${i}: invalid end (${mark.end}, start: ${mark.start})`);
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
if (!mark.value || typeof mark.value !== "string") {
|
|
105
|
-
errors.push(`Mark ${i}: invalid value`);
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
// Check ordering (time should be monotonically increasing)
|
|
109
|
-
if (i > 0 && mark.time < marks[i - 1].time) {
|
|
110
|
-
errors.push(
|
|
111
|
-
`Mark ${i}: time (${mark.time}) is less than previous mark (${marks[i - 1].time})`,
|
|
112
|
-
);
|
|
113
|
-
}
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
return errors;
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
/**
|
|
120
|
-
* Merge overlapping or adjacent speech marks
|
|
121
|
-
* Useful when combining marks from multiple sources
|
|
122
|
-
*
|
|
123
|
-
* @param marks - Speech marks to merge
|
|
124
|
-
* @returns Merged speech marks
|
|
125
|
-
*/
|
|
126
|
-
export function mergeSpeechMarks(marks: SpeechMark[]): SpeechMark[] {
|
|
127
|
-
if (marks.length <= 1) {
|
|
128
|
-
return marks;
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
// Sort by start position
|
|
132
|
-
const sorted = [...marks].sort((a, b) => a.start - b.start);
|
|
133
|
-
const merged: SpeechMark[] = [sorted[0]];
|
|
134
|
-
|
|
135
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
136
|
-
const current = sorted[i];
|
|
137
|
-
const previous = merged[merged.length - 1];
|
|
138
|
-
|
|
139
|
-
// Check if marks overlap or are adjacent
|
|
140
|
-
if (current.start <= previous.end) {
|
|
141
|
-
// Merge with previous mark
|
|
142
|
-
previous.end = Math.max(previous.end, current.end);
|
|
143
|
-
previous.value = previous.value + " " + current.value;
|
|
144
|
-
previous.time = Math.min(previous.time, current.time); // Use earlier time
|
|
145
|
-
} else {
|
|
146
|
-
// No overlap, add as new mark
|
|
147
|
-
merged.push(current);
|
|
148
|
-
}
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
return merged;
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
/**
|
|
155
|
-
* Filter speech marks by type
|
|
156
|
-
*
|
|
157
|
-
* @param marks - Speech marks to filter
|
|
158
|
-
* @param type - Type to filter by
|
|
159
|
-
* @returns Filtered speech marks
|
|
160
|
-
*/
|
|
161
|
-
export function filterSpeechMarksByType(
|
|
162
|
-
marks: SpeechMark[],
|
|
163
|
-
type: "word" | "sentence" | "ssml",
|
|
164
|
-
): SpeechMark[] {
|
|
165
|
-
return marks.filter((mark) => mark.type === type);
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
/**
|
|
169
|
-
* Get speech mark at specific time
|
|
170
|
-
*
|
|
171
|
-
* @param marks - Speech marks
|
|
172
|
-
* @param time - Time in milliseconds
|
|
173
|
-
* @returns Speech mark at time, or null if none found
|
|
174
|
-
*/
|
|
175
|
-
export function getSpeechMarkAtTime(
|
|
176
|
-
marks: SpeechMark[],
|
|
177
|
-
time: number,
|
|
178
|
-
): SpeechMark | null {
|
|
179
|
-
// Binary search for efficiency
|
|
180
|
-
let left = 0;
|
|
181
|
-
let right = marks.length - 1;
|
|
182
|
-
let closest: SpeechMark | null = null;
|
|
183
|
-
|
|
184
|
-
while (left <= right) {
|
|
185
|
-
const mid = Math.floor((left + right) / 2);
|
|
186
|
-
const mark = marks[mid];
|
|
187
|
-
|
|
188
|
-
if (mark.time === time) {
|
|
189
|
-
return mark;
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
// Track closest mark
|
|
193
|
-
if (
|
|
194
|
-
!closest ||
|
|
195
|
-
Math.abs(mark.time - time) < Math.abs(closest.time - time)
|
|
196
|
-
) {
|
|
197
|
-
closest = mark;
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
if (mark.time < time) {
|
|
201
|
-
left = mid + 1;
|
|
202
|
-
} else {
|
|
203
|
-
right = mid - 1;
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
// Return closest mark if within reasonable threshold (500ms)
|
|
208
|
-
if (closest && Math.abs(closest.time - time) <= 500) {
|
|
209
|
-
return closest;
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
return null;
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
/**
|
|
216
|
-
* Calculate statistics for speech marks
|
|
217
|
-
*
|
|
218
|
-
* @param marks - Speech marks
|
|
219
|
-
* @returns Statistics about the marks
|
|
220
|
-
*/
|
|
221
|
-
export function getSpeechMarksStats(marks: SpeechMark[]) {
|
|
222
|
-
if (marks.length === 0) {
|
|
223
|
-
return {
|
|
224
|
-
count: 0,
|
|
225
|
-
totalDuration: 0,
|
|
226
|
-
avgWordDuration: 0,
|
|
227
|
-
wordsPerMinute: 0,
|
|
228
|
-
};
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
const wordMarks = filterSpeechMarksByType(marks, "word");
|
|
232
|
-
const totalDuration = marks[marks.length - 1].time;
|
|
233
|
-
const avgWordDuration = totalDuration / wordMarks.length;
|
|
234
|
-
const wordsPerMinute = (wordMarks.length / totalDuration) * 60 * 1000;
|
|
235
|
-
|
|
236
|
-
return {
|
|
237
|
-
count: marks.length,
|
|
238
|
-
wordCount: wordMarks.length,
|
|
239
|
-
totalDuration,
|
|
240
|
-
avgWordDuration,
|
|
241
|
-
wordsPerMinute,
|
|
242
|
-
};
|
|
243
|
-
}
|
package/src/types.ts
DELETED
|
@@ -1,425 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Core types for server-side TTS providers
|
|
3
|
-
* @module @pie-players/tts-server-core
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
/**
|
|
7
|
-
* Speech mark representing a timing event in synthesized speech
|
|
8
|
-
* Unified format across all TTS providers
|
|
9
|
-
*/
|
|
10
|
-
export interface SpeechMark {
|
|
11
|
-
/** Milliseconds from start of audio */
|
|
12
|
-
time: number;
|
|
13
|
-
|
|
14
|
-
/** Type of speech mark */
|
|
15
|
-
type: "word" | "sentence" | "ssml";
|
|
16
|
-
|
|
17
|
-
/** Character index in original text (inclusive) */
|
|
18
|
-
start: number;
|
|
19
|
-
|
|
20
|
-
/** Character index in original text (exclusive) */
|
|
21
|
-
end: number;
|
|
22
|
-
|
|
23
|
-
/** The actual word or text */
|
|
24
|
-
value: string;
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
/**
|
|
28
|
-
* Standard TTS parameters based on W3C Web Speech API and SSML specifications.
|
|
29
|
-
*
|
|
30
|
-
* These parameters are widely supported across TTS providers (browsers, cloud services)
|
|
31
|
-
* and follow established standards:
|
|
32
|
-
* - W3C Web Speech API (SpeechSynthesisUtterance)
|
|
33
|
-
* - W3C SSML 1.1 specification
|
|
34
|
-
* - BCP47 language tags (RFC 5646)
|
|
35
|
-
*
|
|
36
|
-
* @see https://w3c.github.io/speech-api/
|
|
37
|
-
* @see https://www.w3.org/TR/speech-synthesis/
|
|
38
|
-
*/
|
|
39
|
-
export interface StandardTTSParameters {
|
|
40
|
-
/**
|
|
41
|
-
* Text to synthesize (plain text or SSML markup)
|
|
42
|
-
*
|
|
43
|
-
* @standard W3C Web Speech API
|
|
44
|
-
*/
|
|
45
|
-
text: string;
|
|
46
|
-
|
|
47
|
-
/**
|
|
48
|
-
* Voice identifier (provider-specific voice names)
|
|
49
|
-
* Examples: "Joanna" (Polly), "en-US-Standard-A" (Google), browser voice names
|
|
50
|
-
*
|
|
51
|
-
* @standard W3C Web Speech API (concept)
|
|
52
|
-
* @note Voice names are provider-specific but the concept is standard
|
|
53
|
-
*/
|
|
54
|
-
voice?: string;
|
|
55
|
-
|
|
56
|
-
/**
|
|
57
|
-
* Language code using BCP47 format (e.g., 'en-US', 'es-ES', 'fr-FR')
|
|
58
|
-
*
|
|
59
|
-
* @standard BCP47 (RFC 5646), W3C Web Speech API
|
|
60
|
-
* @see https://tools.ietf.org/html/rfc5646
|
|
61
|
-
*/
|
|
62
|
-
language?: string;
|
|
63
|
-
|
|
64
|
-
/**
|
|
65
|
-
* Speech rate (speed multiplier)
|
|
66
|
-
* - Range: 0.25 to 4.0
|
|
67
|
-
* - Default: 1.0 (normal speed)
|
|
68
|
-
* - 0.5 = half speed, 2.0 = double speed
|
|
69
|
-
*
|
|
70
|
-
* @standard W3C Web Speech API, SSML <prosody rate>
|
|
71
|
-
*/
|
|
72
|
-
rate?: number;
|
|
73
|
-
|
|
74
|
-
/**
|
|
75
|
-
* Pitch adjustment
|
|
76
|
-
* - Range: -20 to +20 semitones (or 0 to 2 as multiplier depending on provider)
|
|
77
|
-
* - Default: 0 (or 1.0 as multiplier)
|
|
78
|
-
* - Negative values = lower pitch, positive = higher pitch
|
|
79
|
-
*
|
|
80
|
-
* @standard W3C Web Speech API, SSML <prosody pitch>
|
|
81
|
-
* @note Some providers use semitones (-20 to +20), others use multipliers (0 to 2)
|
|
82
|
-
*/
|
|
83
|
-
pitch?: number;
|
|
84
|
-
|
|
85
|
-
/**
|
|
86
|
-
* Volume level
|
|
87
|
-
* - Range: 0.0 to 1.0
|
|
88
|
-
* - Default: 1.0 (full volume)
|
|
89
|
-
* - 0.0 = silent, 0.5 = half volume
|
|
90
|
-
*
|
|
91
|
-
* @standard W3C Web Speech API, SSML <prosody volume>
|
|
92
|
-
*/
|
|
93
|
-
volume?: number;
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
/**
|
|
97
|
-
* Provider-specific extensions for advanced TTS control.
|
|
98
|
-
*
|
|
99
|
-
* These parameters are NOT part of W3C standards and have varying support
|
|
100
|
-
* across providers. Use with caution for portability.
|
|
101
|
-
*
|
|
102
|
-
* Common extensions include:
|
|
103
|
-
* - Audio format selection (mp3, wav, ogg)
|
|
104
|
-
* - Sample rate control
|
|
105
|
-
* - Engine selection (neural vs standard)
|
|
106
|
-
* - Regional endpoints
|
|
107
|
-
* - Speech marks / word timing
|
|
108
|
-
*
|
|
109
|
-
* @note Providers may ignore unsupported extensions silently or throw errors
|
|
110
|
-
*/
|
|
111
|
-
export interface TTSProviderExtensions {
|
|
112
|
-
/**
|
|
113
|
-
* Audio format for output
|
|
114
|
-
*
|
|
115
|
-
* @extension Common across providers but values vary
|
|
116
|
-
* @support AWS Polly (mp3, ogg, pcm), Google Cloud TTS (mp3, wav, ogg), Azure (mp3, wav, ogg)
|
|
117
|
-
*/
|
|
118
|
-
format?: "mp3" | "wav" | "ogg" | "pcm";
|
|
119
|
-
|
|
120
|
-
/**
|
|
121
|
-
* Sample rate in Hz (e.g., 8000, 16000, 22050, 24000)
|
|
122
|
-
*
|
|
123
|
-
* @extension Common audio parameter
|
|
124
|
-
* @note Higher sample rates = better quality but larger file sizes
|
|
125
|
-
*/
|
|
126
|
-
sampleRate?: number;
|
|
127
|
-
|
|
128
|
-
/**
|
|
129
|
-
* Request word-level timing data (speech marks)
|
|
130
|
-
*
|
|
131
|
-
* @extension Provider-specific but common pattern
|
|
132
|
-
* @support AWS Polly (SpeechMarks), Google Cloud TTS (timepoints), Azure (word boundaries)
|
|
133
|
-
* @default true
|
|
134
|
-
*/
|
|
135
|
-
includeSpeechMarks?: boolean;
|
|
136
|
-
|
|
137
|
-
/**
|
|
138
|
-
* Provider-specific options (extensibility point)
|
|
139
|
-
*
|
|
140
|
-
* Examples:
|
|
141
|
-
* - AWS Polly: { engine: 'neural' | 'standard', lexiconNames: string[] }
|
|
142
|
-
* - Google Cloud TTS: { audioEncoding: string, effectsProfileId: string[] }
|
|
143
|
-
* - Azure: { voiceType: string, stylesList: string[] }
|
|
144
|
-
*
|
|
145
|
-
* @extension Arbitrary provider-specific data
|
|
146
|
-
*/
|
|
147
|
-
providerOptions?: Record<string, unknown>;
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
/**
|
|
151
|
-
* Complete synthesis request combining standard parameters and extensions.
|
|
152
|
-
*
|
|
153
|
-
* This interface provides the full set of options for text-to-speech synthesis,
|
|
154
|
-
* clearly separating W3C-standard parameters from provider-specific extensions.
|
|
155
|
-
*
|
|
156
|
-
* @example Basic usage (portable across providers)
|
|
157
|
-
* ```typescript
|
|
158
|
-
* const request: SynthesizeRequest = {
|
|
159
|
-
* text: "Hello world",
|
|
160
|
-
* voice: "Joanna",
|
|
161
|
-
* rate: 1.0,
|
|
162
|
-
* language: "en-US"
|
|
163
|
-
* };
|
|
164
|
-
* ```
|
|
165
|
-
*
|
|
166
|
-
* @example Advanced usage with extensions (provider-specific)
|
|
167
|
-
* ```typescript
|
|
168
|
-
* const request: SynthesizeRequest = {
|
|
169
|
-
* text: "Hello world",
|
|
170
|
-
* voice: "Joanna",
|
|
171
|
-
* rate: 1.0,
|
|
172
|
-
* // Extensions - may not be portable
|
|
173
|
-
* format: 'mp3',
|
|
174
|
-
* sampleRate: 24000,
|
|
175
|
-
* includeSpeechMarks: true,
|
|
176
|
-
* providerOptions: {
|
|
177
|
-
* engine: 'neural' // AWS Polly specific
|
|
178
|
-
* }
|
|
179
|
-
* };
|
|
180
|
-
* ```
|
|
181
|
-
*/
|
|
182
|
-
export interface SynthesizeRequest
|
|
183
|
-
extends StandardTTSParameters,
|
|
184
|
-
TTSProviderExtensions {}
|
|
185
|
-
|
|
186
|
-
/**
|
|
187
|
-
* Response from speech synthesis
|
|
188
|
-
*/
|
|
189
|
-
export interface SynthesizeResponse {
|
|
190
|
-
/** Audio data (Buffer for server, base64 string for client) */
|
|
191
|
-
audio: Buffer | string;
|
|
192
|
-
|
|
193
|
-
/** MIME type of audio (e.g., 'audio/mpeg') */
|
|
194
|
-
contentType: string;
|
|
195
|
-
|
|
196
|
-
/** Speech marks for word-level timing */
|
|
197
|
-
speechMarks: SpeechMark[];
|
|
198
|
-
|
|
199
|
-
/** Metadata about the synthesis */
|
|
200
|
-
metadata: SynthesizeMetadata;
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
/**
|
|
204
|
-
* Metadata about synthesized speech
|
|
205
|
-
*/
|
|
206
|
-
export interface SynthesizeMetadata {
|
|
207
|
-
/** Provider that generated the audio */
|
|
208
|
-
providerId: string;
|
|
209
|
-
|
|
210
|
-
/** Voice ID used */
|
|
211
|
-
voice: string;
|
|
212
|
-
|
|
213
|
-
/** Audio duration in seconds */
|
|
214
|
-
duration: number;
|
|
215
|
-
|
|
216
|
-
/** Character count of input text */
|
|
217
|
-
charCount: number;
|
|
218
|
-
|
|
219
|
-
/** Whether response was served from cache */
|
|
220
|
-
cached: boolean;
|
|
221
|
-
|
|
222
|
-
/** ISO timestamp of synthesis */
|
|
223
|
-
timestamp?: string;
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
/**
|
|
227
|
-
* Voice definition
|
|
228
|
-
*/
|
|
229
|
-
export interface Voice {
|
|
230
|
-
/** Unique voice identifier */
|
|
231
|
-
id: string;
|
|
232
|
-
|
|
233
|
-
/** Human-readable name */
|
|
234
|
-
name: string;
|
|
235
|
-
|
|
236
|
-
/** Language name (e.g., "English", "Spanish") */
|
|
237
|
-
language: string;
|
|
238
|
-
|
|
239
|
-
/** Language code (e.g., "en-US", "es-ES") */
|
|
240
|
-
languageCode: string;
|
|
241
|
-
|
|
242
|
-
/** Gender of voice */
|
|
243
|
-
gender?: "male" | "female" | "neutral";
|
|
244
|
-
|
|
245
|
-
/** Voice quality level */
|
|
246
|
-
quality: "standard" | "premium" | "neural";
|
|
247
|
-
|
|
248
|
-
/** Supported features */
|
|
249
|
-
supportedFeatures: VoiceFeatures;
|
|
250
|
-
|
|
251
|
-
/** Provider-specific metadata */
|
|
252
|
-
providerMetadata?: Record<string, unknown>;
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
/**
|
|
256
|
-
* Voice feature flags
|
|
257
|
-
*/
|
|
258
|
-
export interface VoiceFeatures {
|
|
259
|
-
/** Supports SSML markup */
|
|
260
|
-
ssml: boolean;
|
|
261
|
-
|
|
262
|
-
/** Supports emotional expression */
|
|
263
|
-
emotions: boolean;
|
|
264
|
-
|
|
265
|
-
/** Supports speaking styles */
|
|
266
|
-
styles: boolean;
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
/**
|
|
270
|
-
* Options for listing voices
|
|
271
|
-
*/
|
|
272
|
-
export interface GetVoicesOptions {
|
|
273
|
-
/** Filter by language code */
|
|
274
|
-
language?: string;
|
|
275
|
-
|
|
276
|
-
/** Filter by quality level */
|
|
277
|
-
quality?: "standard" | "premium" | "neural";
|
|
278
|
-
|
|
279
|
-
/** Filter by gender */
|
|
280
|
-
gender?: "male" | "female" | "neutral";
|
|
281
|
-
}
|
|
282
|
-
|
|
283
|
-
/**
|
|
284
|
-
* Provider capabilities split into standard features and extensions.
|
|
285
|
-
*
|
|
286
|
-
* This interface helps consumers understand what features are universally
|
|
287
|
-
* supported (W3C standards) vs provider-specific extensions.
|
|
288
|
-
*/
|
|
289
|
-
export interface ServerProviderCapabilities {
|
|
290
|
-
/**
|
|
291
|
-
* Standard W3C features that should be widely supported
|
|
292
|
-
*/
|
|
293
|
-
standard: {
|
|
294
|
-
/**
|
|
295
|
-
* Supports SSML markup (W3C SSML 1.1)
|
|
296
|
-
*
|
|
297
|
-
* @standard W3C SSML 1.1
|
|
298
|
-
* @support Most cloud TTS providers, limited browser support
|
|
299
|
-
*/
|
|
300
|
-
supportsSSML: boolean;
|
|
301
|
-
|
|
302
|
-
/**
|
|
303
|
-
* Supports pitch control via pitch parameter or SSML <prosody>
|
|
304
|
-
*
|
|
305
|
-
* @standard W3C Web Speech API, SSML <prosody pitch>
|
|
306
|
-
* @note May be via API parameter or SSML only
|
|
307
|
-
*/
|
|
308
|
-
supportsPitch: boolean;
|
|
309
|
-
|
|
310
|
-
/**
|
|
311
|
-
* Supports rate (speed) control via rate parameter or SSML <prosody>
|
|
312
|
-
*
|
|
313
|
-
* @standard W3C Web Speech API, SSML <prosody rate>
|
|
314
|
-
*/
|
|
315
|
-
supportsRate: boolean;
|
|
316
|
-
|
|
317
|
-
/**
|
|
318
|
-
* Supports volume control via volume parameter or SSML <prosody>
|
|
319
|
-
*
|
|
320
|
-
* @standard W3C Web Speech API, SSML <prosody volume>
|
|
321
|
-
* @note Often better handled client-side for server TTS
|
|
322
|
-
*/
|
|
323
|
-
supportsVolume: boolean;
|
|
324
|
-
|
|
325
|
-
/**
|
|
326
|
-
* Supports multiple voices (voice selection)
|
|
327
|
-
*
|
|
328
|
-
* @standard W3C Web Speech API (concept)
|
|
329
|
-
*/
|
|
330
|
-
supportsMultipleVoices: boolean;
|
|
331
|
-
|
|
332
|
-
/**
|
|
333
|
-
* Maximum text length in characters
|
|
334
|
-
*
|
|
335
|
-
* @note Varies by provider: Polly=3000, Google=5000, browser=~32k
|
|
336
|
-
*/
|
|
337
|
-
maxTextLength: number;
|
|
338
|
-
};
|
|
339
|
-
|
|
340
|
-
/**
|
|
341
|
-
* Provider-specific extensions
|
|
342
|
-
*/
|
|
343
|
-
extensions: {
|
|
344
|
-
/**
|
|
345
|
-
* Supports word-level timing data (speech marks)
|
|
346
|
-
*
|
|
347
|
-
* @extension Provider-specific but common
|
|
348
|
-
* @support AWS Polly ✅, Google Cloud TTS ✅, Azure TTS ✅, Browser ⚠️
|
|
349
|
-
* @note Format and precision vary by provider
|
|
350
|
-
*/
|
|
351
|
-
supportsSpeechMarks: boolean;
|
|
352
|
-
|
|
353
|
-
/**
|
|
354
|
-
* Supported audio output formats
|
|
355
|
-
*
|
|
356
|
-
* @extension Common but not standardized
|
|
357
|
-
*/
|
|
358
|
-
supportedFormats: ("mp3" | "wav" | "ogg" | "pcm")[];
|
|
359
|
-
|
|
360
|
-
/**
|
|
361
|
-
* Supports sample rate configuration
|
|
362
|
-
*
|
|
363
|
-
* @extension Common audio parameter
|
|
364
|
-
*/
|
|
365
|
-
supportsSampleRate: boolean;
|
|
366
|
-
|
|
367
|
-
/**
|
|
368
|
-
* Provider-specific features (extensibility point)
|
|
369
|
-
*
|
|
370
|
-
* Examples:
|
|
371
|
-
* - AWS Polly: { engines: ['neural', 'standard'], lexicons: true }
|
|
372
|
-
* - Google Cloud TTS: { audioProfiles: true, voiceEffects: true }
|
|
373
|
-
* - Azure: { styles: true, emotions: true }
|
|
374
|
-
*
|
|
375
|
-
* @extension Arbitrary provider capabilities
|
|
376
|
-
*/
|
|
377
|
-
providerSpecific?: Record<string, unknown>;
|
|
378
|
-
};
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
/**
|
|
382
|
-
* TTS error codes
|
|
383
|
-
*/
|
|
384
|
-
export enum TTSErrorCode {
|
|
385
|
-
INVALID_REQUEST = "INVALID_REQUEST",
|
|
386
|
-
INVALID_VOICE = "INVALID_VOICE",
|
|
387
|
-
INVALID_PROVIDER = "INVALID_PROVIDER",
|
|
388
|
-
TEXT_TOO_LONG = "TEXT_TOO_LONG",
|
|
389
|
-
PROVIDER_ERROR = "PROVIDER_ERROR",
|
|
390
|
-
NETWORK_ERROR = "NETWORK_ERROR",
|
|
391
|
-
AUTHENTICATION_ERROR = "AUTHENTICATION_ERROR",
|
|
392
|
-
RATE_LIMIT_EXCEEDED = "RATE_LIMIT_EXCEEDED",
|
|
393
|
-
INITIALIZATION_ERROR = "INITIALIZATION_ERROR",
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
/**
|
|
397
|
-
* TTS error with structured information
|
|
398
|
-
*/
|
|
399
|
-
export class TTSError extends Error {
|
|
400
|
-
constructor(
|
|
401
|
-
public code: TTSErrorCode,
|
|
402
|
-
message: string,
|
|
403
|
-
public details?: Record<string, unknown>,
|
|
404
|
-
public providerId?: string,
|
|
405
|
-
) {
|
|
406
|
-
super(message);
|
|
407
|
-
this.name = "TTSError";
|
|
408
|
-
|
|
409
|
-
// Maintains proper stack trace for where error was thrown (V8 only)
|
|
410
|
-
if (Error.captureStackTrace) {
|
|
411
|
-
Error.captureStackTrace(this, TTSError);
|
|
412
|
-
}
|
|
413
|
-
}
|
|
414
|
-
|
|
415
|
-
toJSON() {
|
|
416
|
-
return {
|
|
417
|
-
error: {
|
|
418
|
-
code: this.code,
|
|
419
|
-
message: this.message,
|
|
420
|
-
details: this.details,
|
|
421
|
-
provider: this.providerId,
|
|
422
|
-
},
|
|
423
|
-
};
|
|
424
|
-
}
|
|
425
|
-
}
|
package/tsconfig.json
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"compilerOptions": {
|
|
3
|
-
"target": "ES2022",
|
|
4
|
-
"module": "ESNext",
|
|
5
|
-
"lib": ["ES2022"],
|
|
6
|
-
"moduleResolution": "bundler",
|
|
7
|
-
"outDir": "./dist",
|
|
8
|
-
"rootDir": "./src",
|
|
9
|
-
"declaration": true,
|
|
10
|
-
"declarationMap": true,
|
|
11
|
-
"sourceMap": true,
|
|
12
|
-
"strict": true,
|
|
13
|
-
"esModuleInterop": true,
|
|
14
|
-
"skipLibCheck": true,
|
|
15
|
-
"forceConsistentCasingInFileNames": true,
|
|
16
|
-
"resolveJsonModule": true
|
|
17
|
-
},
|
|
18
|
-
"include": ["src/**/*"],
|
|
19
|
-
"exclude": ["node_modules", "dist", "**/*.test.ts"]
|
|
20
|
-
}
|