@pie-players/tts-server-polly 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -3
- package/.turbo/turbo-build.log +0 -1
- package/examples/INTEGRATION-GUIDE.md +0 -603
- package/examples/sveltekit/synthesize-server.ts +0 -110
- package/examples/sveltekit/voices-server.ts +0 -65
- package/src/PollyServerProvider.ts +0 -426
- package/src/index.ts +0 -7
- package/tsconfig.json +0 -9
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pie-players/tts-server-polly",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"description": "AWS Polly provider for server-side TTS with speech marks support",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -11,10 +11,13 @@
|
|
|
11
11
|
"import": "./dist/index.js"
|
|
12
12
|
}
|
|
13
13
|
},
|
|
14
|
+
"files": [
|
|
15
|
+
"dist"
|
|
16
|
+
],
|
|
14
17
|
"scripts": {
|
|
15
18
|
"build": "tsc",
|
|
16
19
|
"dev": "tsc --watch",
|
|
17
|
-
"test": "vitest",
|
|
20
|
+
"test": "vitest run --passWithNoTests",
|
|
18
21
|
"test:coverage": "vitest --coverage"
|
|
19
22
|
},
|
|
20
23
|
"keywords": [
|
|
@@ -28,7 +31,7 @@
|
|
|
28
31
|
"license": "MIT",
|
|
29
32
|
"dependencies": {
|
|
30
33
|
"@aws-sdk/client-polly": "^3.700.0",
|
|
31
|
-
"@pie-players/tts-server-core": "
|
|
34
|
+
"@pie-players/tts-server-core": "0.1.3"
|
|
32
35
|
},
|
|
33
36
|
"devDependencies": {
|
|
34
37
|
"typescript": "^5.3.3",
|
package/.turbo/turbo-build.log
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
$ tsc
|
package/examples/INTEGRATION-GUIDE.md
DELETED
|
@@ -1,603 +0,0 @@
|
|
|
1
|
-
# TTS Server API Integration Guide
|
|
2
|
-
|
|
3
|
-
This guide shows how to integrate the server-side TTS with speech marks into your SvelteKit application.
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
The integration has three parts:
|
|
8
|
-
|
|
9
|
-
1. **Server-side packages** - Handle AWS Polly API calls
|
|
10
|
-
2. **SvelteKit API routes** - Expose TTS endpoints
|
|
11
|
-
3. **Client-side provider** - Call API from browser
|
|
12
|
-
|
|
13
|
-
## Architecture
|
|
14
|
-
|
|
15
|
-
```
|
|
16
|
-
Browser (Client)
|
|
17
|
-
↓
|
|
18
|
-
ServerTTSProvider (@pie-players/tts-client-server)
|
|
19
|
-
↓ HTTP POST
|
|
20
|
-
SvelteKit API Route (/api/tts/synthesize/+server.ts)
|
|
21
|
-
↓
|
|
22
|
-
PollyServerProvider (@pie-players/tts-server-polly)
|
|
23
|
-
↓
|
|
24
|
-
AWS Polly API (audio + speech marks)
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
## Step 1: Install Packages
|
|
28
|
-
|
|
29
|
-
```bash
|
|
30
|
-
cd your-sveltekit-app
|
|
31
|
-
|
|
32
|
-
# Install server-side packages
|
|
33
|
-
bun add @pie-players/tts-server-core
|
|
34
|
-
bun add @pie-players/tts-server-polly
|
|
35
|
-
|
|
36
|
-
# Install client-side provider
|
|
37
|
-
bun add @pie-players/tts-client-server
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
## Step 2: Configure Environment Variables
|
|
41
|
-
|
|
42
|
-
Create or update `.env`:
|
|
43
|
-
|
|
44
|
-
```bash
|
|
45
|
-
# AWS Polly credentials
|
|
46
|
-
AWS_REGION=us-east-1
|
|
47
|
-
AWS_ACCESS_KEY_ID=your_access_key_id
|
|
48
|
-
AWS_SECRET_ACCESS_KEY=your_secret_access_key
|
|
49
|
-
|
|
50
|
-
# Optional: Redis for caching
|
|
51
|
-
REDIS_URL=redis://localhost:6379
|
|
52
|
-
```
|
|
53
|
-
|
|
54
|
-
**Important:** Never commit `.env` to git. Add to `.gitignore`:
|
|
55
|
-
|
|
56
|
-
```
|
|
57
|
-
.env
|
|
58
|
-
.env.local
|
|
59
|
-
```
|
|
60
|
-
|
|
61
|
-
## Step 3: Create SvelteKit API Routes
|
|
62
|
-
|
|
63
|
-
### Create Directory Structure
|
|
64
|
-
|
|
65
|
-
```bash
|
|
66
|
-
mkdir -p src/routes/api/tts/synthesize
|
|
67
|
-
mkdir -p src/routes/api/tts/voices
|
|
68
|
-
```
|
|
69
|
-
|
|
70
|
-
### Synthesize Endpoint
|
|
71
|
-
|
|
72
|
-
Copy the example to: **`src/routes/api/tts/synthesize/+server.ts`**
|
|
73
|
-
|
|
74
|
-
```typescript
|
|
75
|
-
import { json, error } from '@sveltejs/kit';
|
|
76
|
-
import type { RequestHandler } from './$types';
|
|
77
|
-
import { PollyServerProvider } from '@pie-players/tts-server-polly';
|
|
78
|
-
|
|
79
|
-
// Singleton provider instance
|
|
80
|
-
let pollyProvider: PollyServerProvider | null = null;
|
|
81
|
-
|
|
82
|
-
async function getPollyProvider(): Promise<PollyServerProvider> {
|
|
83
|
-
if (!pollyProvider) {
|
|
84
|
-
pollyProvider = new PollyServerProvider();
|
|
85
|
-
await pollyProvider.initialize({
|
|
86
|
-
region: process.env.AWS_REGION || 'us-east-1',
|
|
87
|
-
credentials: {
|
|
88
|
-
accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
|
|
89
|
-
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
|
|
90
|
-
},
|
|
91
|
-
engine: 'neural',
|
|
92
|
-
defaultVoice: 'Joanna',
|
|
93
|
-
});
|
|
94
|
-
}
|
|
95
|
-
return pollyProvider;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
export const POST: RequestHandler = async ({ request }) => {
|
|
99
|
-
try {
|
|
100
|
-
const body = await request.json();
|
|
101
|
-
const { text, voice, language, rate, includeSpeechMarks = true } = body;
|
|
102
|
-
|
|
103
|
-
if (!text || typeof text !== 'string') {
|
|
104
|
-
throw error(400, { message: 'Text is required' });
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
if (text.length > 3000) {
|
|
108
|
-
throw error(400, { message: 'Text too long (max 3000 characters)' });
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
const polly = await getPollyProvider();
|
|
112
|
-
const result = await polly.synthesize({
|
|
113
|
-
text,
|
|
114
|
-
voice: voice || 'Joanna',
|
|
115
|
-
language: language || 'en-US',
|
|
116
|
-
rate,
|
|
117
|
-
includeSpeechMarks,
|
|
118
|
-
});
|
|
119
|
-
|
|
120
|
-
return json({
|
|
121
|
-
audio: result.audio instanceof Buffer ? result.audio.toString('base64') : result.audio,
|
|
122
|
-
contentType: result.contentType,
|
|
123
|
-
speechMarks: result.speechMarks,
|
|
124
|
-
metadata: result.metadata,
|
|
125
|
-
});
|
|
126
|
-
} catch (err) {
|
|
127
|
-
console.error('[TTS API] Error:', err);
|
|
128
|
-
throw error(500, { message: err instanceof Error ? err.message : 'Synthesis failed' });
|
|
129
|
-
}
|
|
130
|
-
};
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
### Voices Endpoint
|
|
134
|
-
|
|
135
|
-
Copy the example to: **`src/routes/api/tts/voices/+server.ts`**
|
|
136
|
-
|
|
137
|
-
```typescript
|
|
138
|
-
import { json, error } from '@sveltejs/kit';
|
|
139
|
-
import type { RequestHandler } from './$types';
|
|
140
|
-
import { PollyServerProvider } from '@pie-players/tts-server-polly';
|
|
141
|
-
|
|
142
|
-
// Use same singleton as synthesize route
|
|
143
|
-
let pollyProvider: PollyServerProvider | null = null;
|
|
144
|
-
|
|
145
|
-
async function getPollyProvider(): Promise<PollyServerProvider> {
|
|
146
|
-
if (!pollyProvider) {
|
|
147
|
-
pollyProvider = new PollyServerProvider();
|
|
148
|
-
await pollyProvider.initialize({
|
|
149
|
-
region: process.env.AWS_REGION || 'us-east-1',
|
|
150
|
-
credentials: {
|
|
151
|
-
accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
|
|
152
|
-
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
|
|
153
|
-
},
|
|
154
|
-
engine: 'neural',
|
|
155
|
-
});
|
|
156
|
-
}
|
|
157
|
-
return pollyProvider;
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
export const GET: RequestHandler = async ({ url }) => {
|
|
161
|
-
try {
|
|
162
|
-
const language = url.searchParams.get('language') || undefined;
|
|
163
|
-
const gender = url.searchParams.get('gender') as 'male' | 'female' | 'neutral' | undefined;
|
|
164
|
-
|
|
165
|
-
const polly = await getPollyProvider();
|
|
166
|
-
const voices = await polly.getVoices({ language, gender });
|
|
167
|
-
|
|
168
|
-
return json({ voices });
|
|
169
|
-
} catch (err) {
|
|
170
|
-
console.error('[TTS API] Error:', err);
|
|
171
|
-
throw error(500, { message: err instanceof Error ? err.message : 'Failed to get voices' });
|
|
172
|
-
}
|
|
173
|
-
};
|
|
174
|
-
```
|
|
175
|
-
|
|
176
|
-
## Step 4: Use in Client Code
|
|
177
|
-
|
|
178
|
-
### Basic Usage
|
|
179
|
-
|
|
180
|
-
```typescript
|
|
181
|
-
import { ServerTTSProvider } from '@pie-players/tts-client-server';
|
|
182
|
-
import { TTSService } from '@pie-players/pie-assessment-toolkit';
|
|
183
|
-
|
|
184
|
-
// Initialize TTS service with server provider
|
|
185
|
-
const provider = new ServerTTSProvider();
|
|
186
|
-
const ttsService = new TTSService();
|
|
187
|
-
|
|
188
|
-
await ttsService.initialize(provider, {
|
|
189
|
-
apiEndpoint: '/api/tts',
|
|
190
|
-
provider: 'polly',
|
|
191
|
-
voice: 'Joanna',
|
|
192
|
-
language: 'en-US',
|
|
193
|
-
rate: 1.0,
|
|
194
|
-
});
|
|
195
|
-
|
|
196
|
-
// Speak with word highlighting
|
|
197
|
-
await ttsService.speak('Hello world, this is a test.', {
|
|
198
|
-
contentElement: document.getElementById('content'),
|
|
199
|
-
});
|
|
200
|
-
```
|
|
201
|
-
|
|
202
|
-
### With Svelte Component
|
|
203
|
-
|
|
204
|
-
```svelte
|
|
205
|
-
<script lang="ts">
|
|
206
|
-
import { ServerTTSProvider } from '@pie-players/tts-client-server';
|
|
207
|
-
import { TTSService } from '@pie-players/pie-assessment-toolkit';
|
|
208
|
-
import { onMount } from 'svelte';
|
|
209
|
-
|
|
210
|
-
let ttsService: TTSService;
|
|
211
|
-
let contentElement: HTMLElement;
|
|
212
|
-
|
|
213
|
-
onMount(async () => {
|
|
214
|
-
const provider = new ServerTTSProvider();
|
|
215
|
-
ttsService = new TTSService();
|
|
216
|
-
|
|
217
|
-
await ttsService.initialize(provider, {
|
|
218
|
-
apiEndpoint: '/api/tts',
|
|
219
|
-
provider: 'polly',
|
|
220
|
-
voice: 'Joanna',
|
|
221
|
-
});
|
|
222
|
-
});
|
|
223
|
-
|
|
224
|
-
async function handleSpeak() {
|
|
225
|
-
await ttsService.speak('Hello world', {
|
|
226
|
-
contentElement,
|
|
227
|
-
});
|
|
228
|
-
}
|
|
229
|
-
</script>
|
|
230
|
-
|
|
231
|
-
<div bind:this={contentElement}>
|
|
232
|
-
<p>Hello world, this is a test of text to speech.</p>
|
|
233
|
-
</div>
|
|
234
|
-
|
|
235
|
-
<button on:click={handleSpeak}>Speak</button>
|
|
236
|
-
```
|
|
237
|
-
|
|
238
|
-
## Step 5: Add Redis Caching (Optional)
|
|
239
|
-
|
|
240
|
-
### Install Redis
|
|
241
|
-
|
|
242
|
-
```bash
|
|
243
|
-
bun add ioredis
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
### Update API Route with Caching
|
|
247
|
-
|
|
248
|
-
```typescript
|
|
249
|
-
import { json, error } from '@sveltejs/kit';
|
|
250
|
-
import type { RequestHandler } from './$types';
|
|
251
|
-
import { PollyServerProvider } from '@pie-players/tts-server-polly';
|
|
252
|
-
import { generateHashedCacheKey } from '@pie-players/tts-server-core';
|
|
253
|
-
import Redis from 'ioredis';
|
|
254
|
-
|
|
255
|
-
// Singleton instances
|
|
256
|
-
let pollyProvider: PollyServerProvider | null = null;
|
|
257
|
-
let redis: Redis | null = null;
|
|
258
|
-
|
|
259
|
-
async function getRedis(): Promise<Redis> {
|
|
260
|
-
if (!redis && process.env.REDIS_URL) {
|
|
261
|
-
redis = new Redis(process.env.REDIS_URL);
|
|
262
|
-
}
|
|
263
|
-
return redis!;
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
async function getPollyProvider(): Promise<PollyServerProvider> {
|
|
267
|
-
if (!pollyProvider) {
|
|
268
|
-
pollyProvider = new PollyServerProvider();
|
|
269
|
-
await pollyProvider.initialize({
|
|
270
|
-
region: process.env.AWS_REGION || 'us-east-1',
|
|
271
|
-
credentials: {
|
|
272
|
-
accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
|
|
273
|
-
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
|
|
274
|
-
},
|
|
275
|
-
engine: 'neural',
|
|
276
|
-
});
|
|
277
|
-
}
|
|
278
|
-
return pollyProvider;
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
export const POST: RequestHandler = async ({ request }) => {
|
|
282
|
-
try {
|
|
283
|
-
const body = await request.json();
|
|
284
|
-
const { text, voice = 'Joanna', language = 'en-US', rate = 1.0, includeSpeechMarks = true } = body;
|
|
285
|
-
|
|
286
|
-
if (!text || typeof text !== 'string') {
|
|
287
|
-
throw error(400, { message: 'Text is required' });
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
if (text.length > 3000) {
|
|
291
|
-
throw error(400, { message: 'Text too long (max 3000 characters)' });
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
// Generate cache key
|
|
295
|
-
const cacheKey = await generateHashedCacheKey({
|
|
296
|
-
providerId: 'aws-polly',
|
|
297
|
-
text,
|
|
298
|
-
voice,
|
|
299
|
-
language,
|
|
300
|
-
rate,
|
|
301
|
-
format: 'mp3',
|
|
302
|
-
});
|
|
303
|
-
|
|
304
|
-
// Check Redis cache
|
|
305
|
-
if (process.env.REDIS_URL) {
|
|
306
|
-
try {
|
|
307
|
-
const redisClient = await getRedis();
|
|
308
|
-
const cached = await redisClient.get(cacheKey);
|
|
309
|
-
|
|
310
|
-
if (cached) {
|
|
311
|
-
console.log('[TTS API] Cache hit:', cacheKey);
|
|
312
|
-
const result = JSON.parse(cached);
|
|
313
|
-
result.metadata.cached = true;
|
|
314
|
-
return json(result);
|
|
315
|
-
}
|
|
316
|
-
} catch (cacheError) {
|
|
317
|
-
console.warn('[TTS API] Cache read error:', cacheError);
|
|
318
|
-
// Continue without cache
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
// Synthesize with Polly
|
|
323
|
-
const polly = await getPollyProvider();
|
|
324
|
-
const result = await polly.synthesize({
|
|
325
|
-
text,
|
|
326
|
-
voice,
|
|
327
|
-
language,
|
|
328
|
-
rate,
|
|
329
|
-
includeSpeechMarks,
|
|
330
|
-
});
|
|
331
|
-
|
|
332
|
-
const response = {
|
|
333
|
-
audio: result.audio instanceof Buffer ? result.audio.toString('base64') : result.audio,
|
|
334
|
-
contentType: result.contentType,
|
|
335
|
-
speechMarks: result.speechMarks,
|
|
336
|
-
metadata: result.metadata,
|
|
337
|
-
};
|
|
338
|
-
|
|
339
|
-
// Cache result
|
|
340
|
-
if (process.env.REDIS_URL) {
|
|
341
|
-
try {
|
|
342
|
-
const redisClient = await getRedis();
|
|
343
|
-
await redisClient.setex(cacheKey, 24 * 60 * 60, JSON.stringify(response));
|
|
344
|
-
console.log('[TTS API] Cached result:', cacheKey);
|
|
345
|
-
} catch (cacheError) {
|
|
346
|
-
console.warn('[TTS API] Cache write error:', cacheError);
|
|
347
|
-
// Non-fatal, continue
|
|
348
|
-
}
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
return json(response);
|
|
352
|
-
} catch (err) {
|
|
353
|
-
console.error('[TTS API] Error:', err);
|
|
354
|
-
throw error(500, { message: err instanceof Error ? err.message : 'Synthesis failed' });
|
|
355
|
-
}
|
|
356
|
-
};
|
|
357
|
-
```
|
|
358
|
-
|
|
359
|
-
## Step 6: Test the Integration
|
|
360
|
-
|
|
361
|
-
### Test API Endpoints
|
|
362
|
-
|
|
363
|
-
```bash
|
|
364
|
-
# Test synthesize endpoint
|
|
365
|
-
curl -X POST http://localhost:5173/api/tts/synthesize \
|
|
366
|
-
-H "Content-Type: application/json" \
|
|
367
|
-
-d '{"text": "Hello world", "voice": "Joanna"}'
|
|
368
|
-
|
|
369
|
-
# Test voices endpoint
|
|
370
|
-
curl http://localhost:5173/api/tts/voices
|
|
371
|
-
```
|
|
372
|
-
|
|
373
|
-
### Test in Browser
|
|
374
|
-
|
|
375
|
-
```typescript
|
|
376
|
-
// In browser console
|
|
377
|
-
const response = await fetch('/api/tts/synthesize', {
|
|
378
|
-
method: 'POST',
|
|
379
|
-
headers: { 'Content-Type': 'application/json' },
|
|
380
|
-
body: JSON.stringify({
|
|
381
|
-
text: 'Hello world, this is a test.',
|
|
382
|
-
voice: 'Joanna',
|
|
383
|
-
}),
|
|
384
|
-
});
|
|
385
|
-
|
|
386
|
-
const data = await response.json();
|
|
387
|
-
console.log('Speech marks:', data.speechMarks);
|
|
388
|
-
console.log('Metadata:', data.metadata);
|
|
389
|
-
```
|
|
390
|
-
|
|
391
|
-
## Redis Caching Benefits
|
|
392
|
-
|
|
393
|
-
With Redis caching enabled:
|
|
394
|
-
|
|
395
|
-
- **First request:** Full Polly API call (~300-500ms)
|
|
396
|
-
- **Cached requests:** Redis retrieval (~10-20ms)
|
|
397
|
-
- **Cost savings:** 80-90% reduction in Polly API calls
|
|
398
|
-
- **TTL:** 24 hours (configurable)
|
|
399
|
-
|
|
400
|
-
### Cache Key Format
|
|
401
|
-
|
|
402
|
-
```
|
|
403
|
-
tts:aws-polly:Joanna:en-US:1.00:mp3:<sha256-hash-of-text>
|
|
404
|
-
```
|
|
405
|
-
|
|
406
|
-
## Security Considerations
|
|
407
|
-
|
|
408
|
-
### Credentials
|
|
409
|
-
|
|
410
|
-
- ✅ AWS credentials stay on server (never exposed to browser)
|
|
411
|
-
- ✅ Use IAM roles in production (no hardcoded credentials)
|
|
412
|
-
- ✅ Use environment variables for configuration
|
|
413
|
-
|
|
414
|
-
### Authentication (Optional)
|
|
415
|
-
|
|
416
|
-
Add authentication middleware to protect API:
|
|
417
|
-
|
|
418
|
-
```typescript
|
|
419
|
-
// src/hooks.server.ts
|
|
420
|
-
import type { Handle } from '@sveltejs/kit';
|
|
421
|
-
|
|
422
|
-
export const handle: Handle = async ({ event, resolve }) => {
|
|
423
|
-
// Check if request is to TTS API
|
|
424
|
-
if (event.url.pathname.startsWith('/api/tts')) {
|
|
425
|
-
// Verify JWT token or API key
|
|
426
|
-
const authHeader = event.request.headers.get('Authorization');
|
|
427
|
-
if (!authHeader || !isValidToken(authHeader)) {
|
|
428
|
-
return new Response(JSON.stringify({ error: 'Unauthorized' }), {
|
|
429
|
-
status: 401,
|
|
430
|
-
headers: { 'Content-Type': 'application/json' },
|
|
431
|
-
});
|
|
432
|
-
}
|
|
433
|
-
}
|
|
434
|
-
|
|
435
|
-
return resolve(event);
|
|
436
|
-
};
|
|
437
|
-
```
|
|
438
|
-
|
|
439
|
-
### Rate Limiting
|
|
440
|
-
|
|
441
|
-
Add rate limiting to prevent abuse:
|
|
442
|
-
|
|
443
|
-
```typescript
|
|
444
|
-
import { rateLimit } from '$lib/rate-limiter';
|
|
445
|
-
|
|
446
|
-
export const POST: RequestHandler = async ({ request, getClientAddress }) => {
|
|
447
|
-
// Check rate limit
|
|
448
|
-
const clientIP = getClientAddress();
|
|
449
|
-
const allowed = await rateLimit.check(clientIP, {
|
|
450
|
-
maxRequests: 60, // 60 requests
|
|
451
|
-
windowMs: 60000, // per minute
|
|
452
|
-
});
|
|
453
|
-
|
|
454
|
-
if (!allowed) {
|
|
455
|
-
throw error(429, { message: 'Rate limit exceeded' });
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
// ... rest of handler
|
|
459
|
-
};
|
|
460
|
-
```
|
|
461
|
-
|
|
462
|
-
## Cost Optimization
|
|
463
|
-
|
|
464
|
-
### AWS Polly Pricing
|
|
465
|
-
|
|
466
|
-
- **Neural voices:** $16 per 1M characters
|
|
467
|
-
- **Standard voices:** $4 per 1M characters
|
|
468
|
-
|
|
469
|
-
### Example Costs
|
|
470
|
-
|
|
471
|
-
**Scenario:** 1000 students taking an assessment
|
|
472
|
-
|
|
473
|
-
- Average assessment: 5 passages × 500 words × 5 chars = 12,500 chars per student
|
|
474
|
-
- Total: 12.5M characters
|
|
475
|
-
- Cost without caching: $200 (neural) or $50 (standard)
|
|
476
|
-
- Cost with 80% cache hit rate: $40 (neural) or $10 (standard)
|
|
477
|
-
|
|
478
|
-
### Optimization Tips
|
|
479
|
-
|
|
480
|
-
1. **Use Redis caching** - 24-hour TTL captures repeated content
|
|
481
|
-
2. **Standard voices for development** - Switch to neural for production
|
|
482
|
-
3. **Monitor usage** - Track API calls and cache hit rates
|
|
483
|
-
4. **Pre-generate common content** - Cache frequently used passages
|
|
484
|
-
|
|
485
|
-
## Troubleshooting
|
|
486
|
-
|
|
487
|
-
### Error: "AWS credentials not found"
|
|
488
|
-
|
|
489
|
-
Check environment variables are set:
|
|
490
|
-
```bash
|
|
491
|
-
echo $AWS_REGION
|
|
492
|
-
echo $AWS_ACCESS_KEY_ID
|
|
493
|
-
```
|
|
494
|
-
|
|
495
|
-
### Error: "Text too long"
|
|
496
|
-
|
|
497
|
-
AWS Polly limit is 3000 characters. Split longer text:
|
|
498
|
-
|
|
499
|
-
```typescript
|
|
500
|
-
function splitText(text: string, maxLength = 2500): string[] {
|
|
501
|
-
const sentences = text.split(/(?<=[.!?])\s+/);
|
|
502
|
-
const chunks: string[] = [];
|
|
503
|
-
let currentChunk = '';
|
|
504
|
-
|
|
505
|
-
for (const sentence of sentences) {
|
|
506
|
-
if (currentChunk.length + sentence.length > maxLength) {
|
|
507
|
-
chunks.push(currentChunk.trim());
|
|
508
|
-
currentChunk = sentence;
|
|
509
|
-
} else {
|
|
510
|
-
currentChunk += ' ' + sentence;
|
|
511
|
-
}
|
|
512
|
-
}
|
|
513
|
-
|
|
514
|
-
if (currentChunk.trim()) {
|
|
515
|
-
chunks.push(currentChunk.trim());
|
|
516
|
-
}
|
|
517
|
-
|
|
518
|
-
return chunks;
|
|
519
|
-
}
|
|
520
|
-
```
|
|
521
|
-
|
|
522
|
-
### Error: "Speech marks empty"
|
|
523
|
-
|
|
524
|
-
Check that:
|
|
525
|
-
1. Speech marks are requested in API call
|
|
526
|
-
2. Provider supports speech marks
|
|
527
|
-
3. Text is not empty
|
|
528
|
-
|
|
529
|
-
### Redis connection errors
|
|
530
|
-
|
|
531
|
-
If Redis is unavailable, the API will work without caching. Check Redis:
|
|
532
|
-
|
|
533
|
-
```bash
|
|
534
|
-
redis-cli ping
|
|
535
|
-
# Should return: PONG
|
|
536
|
-
```
|
|
537
|
-
|
|
538
|
-
## Production Deployment
|
|
539
|
-
|
|
540
|
-
### Environment Setup
|
|
541
|
-
|
|
542
|
-
```bash
|
|
543
|
-
# Production environment variables
|
|
544
|
-
export NODE_ENV=production
|
|
545
|
-
export AWS_REGION=us-east-1
|
|
546
|
-
export AWS_ACCESS_KEY_ID=xxx
|
|
547
|
-
export AWS_SECRET_ACCESS_KEY=yyy
|
|
548
|
-
export REDIS_URL=redis://your-redis-host:6379
|
|
549
|
-
```
|
|
550
|
-
|
|
551
|
-
### Docker Deployment
|
|
552
|
-
|
|
553
|
-
```dockerfile
|
|
554
|
-
FROM node:20-alpine
|
|
555
|
-
WORKDIR /app
|
|
556
|
-
COPY . .
|
|
557
|
-
RUN npm ci --production
|
|
558
|
-
RUN npm run build
|
|
559
|
-
EXPOSE 3000
|
|
560
|
-
CMD ["node", "build"]
|
|
561
|
-
```
|
|
562
|
-
|
|
563
|
-
### Health Check
|
|
564
|
-
|
|
565
|
-
Add a health endpoint:
|
|
566
|
-
|
|
567
|
-
```typescript
|
|
568
|
-
// src/routes/api/health/+server.ts
|
|
569
|
-
import { json } from '@sveltejs/kit';
|
|
570
|
-
import type { RequestHandler } from './$types';
|
|
571
|
-
|
|
572
|
-
export const GET: RequestHandler = async () => {
|
|
573
|
-
const health = {
|
|
574
|
-
status: 'ok',
|
|
575
|
-
timestamp: new Date().toISOString(),
|
|
576
|
-
services: {
|
|
577
|
-
polly: await checkPolly(),
|
|
578
|
-
redis: await checkRedis(),
|
|
579
|
-
},
|
|
580
|
-
};
|
|
581
|
-
|
|
582
|
-
return json(health);
|
|
583
|
-
};
|
|
584
|
-
```
|
|
585
|
-
|
|
586
|
-
## Next Steps
|
|
587
|
-
|
|
588
|
-
1. **Test in your app** - Create a demo page
|
|
589
|
-
2. **Monitor usage** - Track API calls and costs
|
|
590
|
-
3. **Add more providers** - Google Cloud TTS, ElevenLabs
|
|
591
|
-
4. **Optimize caching** - Fine-tune TTL and eviction
|
|
592
|
-
|
|
593
|
-
## Complete Example
|
|
594
|
-
|
|
595
|
-
See the section-player demo for a complete working example:
|
|
596
|
-
- `packages/section-player/demo.html` - Client-side usage
|
|
597
|
-
- API routes would be added to a SvelteKit app
|
|
598
|
-
|
|
599
|
-
## Support
|
|
600
|
-
|
|
601
|
-
For issues or questions:
|
|
602
|
-
- Check the [TTS Server API Architecture](../../../docs/tts-server-api-architecture.md)
|
|
603
|
-
- See [TTS Highlighting Implementation Plan](../../../docs/tts-highlighting-implementation-plan.md)
|
package/examples/sveltekit/synthesize-server.ts
DELETED
|
@@ -1,110 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* SvelteKit API route for TTS synthesis
|
|
3
|
-
*
|
|
4
|
-
* Copy this file to your SvelteKit app:
|
|
5
|
-
* src/routes/api/tts/synthesize/+server.ts
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import { generateHashedCacheKey } from "@pie-players/tts-server-core";
|
|
9
|
-
import { PollyServerProvider } from "@pie-players/tts-server-polly";
|
|
10
|
-
import { error, json } from "@sveltejs/kit";
|
|
11
|
-
import type { RequestHandler } from "./$types";
|
|
12
|
-
|
|
13
|
-
// Initialize Polly provider (singleton)
|
|
14
|
-
let pollyProvider: PollyServerProvider | null = null;
|
|
15
|
-
|
|
16
|
-
async function getPollyProvider(): Promise<PollyServerProvider> {
|
|
17
|
-
if (!pollyProvider) {
|
|
18
|
-
pollyProvider = new PollyServerProvider();
|
|
19
|
-
await pollyProvider.initialize({
|
|
20
|
-
region: process.env.AWS_REGION || "us-east-1",
|
|
21
|
-
credentials: {
|
|
22
|
-
accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
|
|
23
|
-
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
|
|
24
|
-
},
|
|
25
|
-
engine: "neural",
|
|
26
|
-
defaultVoice: "Joanna",
|
|
27
|
-
});
|
|
28
|
-
}
|
|
29
|
-
return pollyProvider;
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
// Optional: Redis caching
|
|
33
|
-
// import { createClient } from 'redis';
|
|
34
|
-
// const redis = createClient({ url: process.env.REDIS_URL });
|
|
35
|
-
// await redis.connect();
|
|
36
|
-
|
|
37
|
-
export const POST: RequestHandler = async ({ request }) => {
|
|
38
|
-
try {
|
|
39
|
-
const body = await request.json();
|
|
40
|
-
const {
|
|
41
|
-
text,
|
|
42
|
-
provider = "polly",
|
|
43
|
-
voice,
|
|
44
|
-
language,
|
|
45
|
-
rate,
|
|
46
|
-
includeSpeechMarks = true,
|
|
47
|
-
} = body;
|
|
48
|
-
|
|
49
|
-
// Validate request
|
|
50
|
-
if (!text || typeof text !== "string") {
|
|
51
|
-
throw error(400, { message: "Text is required and must be a string" });
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
if (text.length > 3000) {
|
|
55
|
-
throw error(400, { message: "Text too long (max 3000 characters)" });
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
// Optional: Check Redis cache
|
|
59
|
-
// const cacheKey = await generateHashedCacheKey({
|
|
60
|
-
// providerId: 'aws-polly',
|
|
61
|
-
// text,
|
|
62
|
-
// voice: voice || 'Joanna',
|
|
63
|
-
// language: language || 'en-US',
|
|
64
|
-
// rate: rate || 1.0,
|
|
65
|
-
// format: 'mp3',
|
|
66
|
-
// });
|
|
67
|
-
//
|
|
68
|
-
// const cached = await redis.get(cacheKey);
|
|
69
|
-
// if (cached) {
|
|
70
|
-
// console.log('[TTS API] Cache hit:', cacheKey);
|
|
71
|
-
// return json(JSON.parse(cached));
|
|
72
|
-
// }
|
|
73
|
-
|
|
74
|
-
// Get Polly provider
|
|
75
|
-
const polly = await getPollyProvider();
|
|
76
|
-
|
|
77
|
-
// Synthesize speech
|
|
78
|
-
const result = await polly.synthesize({
|
|
79
|
-
text,
|
|
80
|
-
voice: voice || "Joanna",
|
|
81
|
-
language: language || "en-US",
|
|
82
|
-
rate,
|
|
83
|
-
includeSpeechMarks,
|
|
84
|
-
});
|
|
85
|
-
|
|
86
|
-
// Convert Buffer to base64 for JSON response
|
|
87
|
-
const response = {
|
|
88
|
-
audio:
|
|
89
|
-
result.audio instanceof Buffer
|
|
90
|
-
? result.audio.toString("base64")
|
|
91
|
-
: result.audio,
|
|
92
|
-
contentType: result.contentType,
|
|
93
|
-
speechMarks: result.speechMarks,
|
|
94
|
-
metadata: result.metadata,
|
|
95
|
-
};
|
|
96
|
-
|
|
97
|
-
// Optional: Cache result
|
|
98
|
-
// await redis.setex(cacheKey, 24 * 60 * 60, JSON.stringify(response));
|
|
99
|
-
|
|
100
|
-
return json(response);
|
|
101
|
-
} catch (err) {
|
|
102
|
-
console.error("[TTS API] Synthesis error:", err);
|
|
103
|
-
|
|
104
|
-
if (err instanceof Error) {
|
|
105
|
-
throw error(500, { message: err.message });
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
throw error(500, { message: "Internal server error" });
|
|
109
|
-
}
|
|
110
|
-
};
|
package/examples/sveltekit/voices-server.ts
DELETED
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* SvelteKit API route for listing TTS voices
|
|
3
|
-
*
|
|
4
|
-
* Copy this file to your SvelteKit app:
|
|
5
|
-
* src/routes/api/tts/voices/+server.ts
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import { PollyServerProvider } from "@pie-players/tts-server-polly";
|
|
9
|
-
import { error, json } from "@sveltejs/kit";
|
|
10
|
-
import type { RequestHandler } from "./$types";
|
|
11
|
-
|
|
12
|
-
// Initialize Polly provider (singleton)
|
|
13
|
-
let pollyProvider: PollyServerProvider | null = null;
|
|
14
|
-
|
|
15
|
-
async function getPollyProvider(): Promise<PollyServerProvider> {
|
|
16
|
-
if (!pollyProvider) {
|
|
17
|
-
pollyProvider = new PollyServerProvider();
|
|
18
|
-
await pollyProvider.initialize({
|
|
19
|
-
region: process.env.AWS_REGION || "us-east-1",
|
|
20
|
-
credentials: {
|
|
21
|
-
accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
|
|
22
|
-
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
|
|
23
|
-
},
|
|
24
|
-
engine: "neural",
|
|
25
|
-
defaultVoice: "Joanna",
|
|
26
|
-
});
|
|
27
|
-
}
|
|
28
|
-
return pollyProvider;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
export const GET: RequestHandler = async ({ url }) => {
|
|
32
|
-
try {
|
|
33
|
-
const language = url.searchParams.get("language") || undefined;
|
|
34
|
-
const gender = url.searchParams.get("gender") as
|
|
35
|
-
| "male"
|
|
36
|
-
| "female"
|
|
37
|
-
| "neutral"
|
|
38
|
-
| undefined;
|
|
39
|
-
const quality = url.searchParams.get("quality") as
|
|
40
|
-
| "standard"
|
|
41
|
-
| "neural"
|
|
42
|
-
| "premium"
|
|
43
|
-
| undefined;
|
|
44
|
-
|
|
45
|
-
// Get Polly provider
|
|
46
|
-
const polly = await getPollyProvider();
|
|
47
|
-
|
|
48
|
-
// Get voices with filters
|
|
49
|
-
const voices = await polly.getVoices({
|
|
50
|
-
language,
|
|
51
|
-
gender,
|
|
52
|
-
quality,
|
|
53
|
-
});
|
|
54
|
-
|
|
55
|
-
return json({ voices });
|
|
56
|
-
} catch (err) {
|
|
57
|
-
console.error("[TTS API] Get voices error:", err);
|
|
58
|
-
|
|
59
|
-
if (err instanceof Error) {
|
|
60
|
-
throw error(500, { message: err.message });
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
throw error(500, { message: "Internal server error" });
|
|
64
|
-
}
|
|
65
|
-
};
|
package/src/PollyServerProvider.ts
DELETED
|
@@ -1,426 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* AWS Polly server-side TTS provider
|
|
3
|
-
* @module @pie-players/tts-server-polly
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import {
|
|
7
|
-
DescribeVoicesCommand,
|
|
8
|
-
type DescribeVoicesCommandInput,
|
|
9
|
-
Engine,
|
|
10
|
-
OutputFormat,
|
|
11
|
-
PollyClient,
|
|
12
|
-
SpeechMarkType,
|
|
13
|
-
SynthesizeSpeechCommand,
|
|
14
|
-
VoiceId,
|
|
15
|
-
} from "@aws-sdk/client-polly";
|
|
16
|
-
|
|
17
|
-
import {
|
|
18
|
-
BaseTTSProvider,
|
|
19
|
-
type GetVoicesOptions,
|
|
20
|
-
type ServerProviderCapabilities,
|
|
21
|
-
type SpeechMark,
|
|
22
|
-
type SynthesizeRequest,
|
|
23
|
-
type SynthesizeResponse,
|
|
24
|
-
TTSError,
|
|
25
|
-
TTSErrorCode,
|
|
26
|
-
type TTSServerConfig,
|
|
27
|
-
type Voice,
|
|
28
|
-
} from "@pie-players/tts-server-core";
|
|
29
|
-
|
|
30
|
-
/**
|
|
31
|
-
* AWS Polly provider configuration.
|
|
32
|
-
*
|
|
33
|
-
* This extends the base TTSServerConfig with Polly-specific settings.
|
|
34
|
-
* All fields marked with @extension are AWS-specific and not portable.
|
|
35
|
-
*/
|
|
36
|
-
export interface PollyProviderConfig extends TTSServerConfig {
|
|
37
|
-
/**
|
|
38
|
-
* AWS region (e.g., 'us-east-1', 'us-west-2', 'eu-west-1')
|
|
39
|
-
*
|
|
40
|
-
* @extension AWS-specific (region concept)
|
|
41
|
-
* @required
|
|
42
|
-
*/
|
|
43
|
-
region: string;
|
|
44
|
-
|
|
45
|
-
/**
|
|
46
|
-
* AWS credentials for API authentication
|
|
47
|
-
*
|
|
48
|
-
* @extension AWS-specific
|
|
49
|
-
* @note In production, prefer IAM roles over hardcoded credentials
|
|
50
|
-
* @see https://docs.aws.amazon.com/sdk-for-javascript/v3/developer-guide/setting-credentials.html
|
|
51
|
-
*/
|
|
52
|
-
credentials?: {
|
|
53
|
-
accessKeyId: string;
|
|
54
|
-
secretAccessKey: string;
|
|
55
|
-
sessionToken?: string;
|
|
56
|
-
};
|
|
57
|
-
|
|
58
|
-
/**
|
|
59
|
-
* Polly engine type: 'neural' (higher quality) or 'standard' (lower cost)
|
|
60
|
-
*
|
|
61
|
-
* @extension AWS Polly-specific
|
|
62
|
-
* @default 'neural'
|
|
63
|
-
* @note Neural: $16/1M chars, Standard: $4/1M chars
|
|
64
|
-
*/
|
|
65
|
-
engine?: "neural" | "standard";
|
|
66
|
-
|
|
67
|
-
/**
|
|
68
|
-
* Default voice ID if not specified in synthesis requests
|
|
69
|
-
*
|
|
70
|
-
* @standard Voice selection is standard, but voice names are provider-specific
|
|
71
|
-
* @default 'Joanna'
|
|
72
|
-
* @see https://docs.aws.amazon.com/polly/latest/dg/voicelist.html
|
|
73
|
-
*/
|
|
74
|
-
defaultVoice?: string;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
/**
 * AWS Polly Server Provider
 *
 * Server-side text-to-speech backed by AWS Polly. Every synthesis call
 * produces MP3 audio and, unless the caller opts out, word-level speech
 * marks obtained through a second Polly request issued in parallel.
 *
 * Features:
 * - Native speech marks support (millisecond precision)
 * - Neural and standard voices
 * - 25+ languages
 * - Full SSML support
 * - Parallel audio + speech marks requests
 */
export class PollyServerProvider extends BaseTTSProvider {
  readonly providerId = "aws-polly";
  readonly providerName = "AWS Polly";
  readonly version = "1.0.0";

  /** Created by `initialize()`; undefined at runtime until then. */
  private client!: PollyClient;
  private engine: "neural" | "standard" = "neural";
  private defaultVoice = "Joanna";

  /**
   * Initialize the AWS Polly provider.
   *
   * Only validates the configuration and constructs the Polly client; it
   * does NOT fetch voices or make test API calls. Calling it again replaces
   * the client and releases the previous one.
   *
   * @param config - Polly configuration with region and credentials
   * @throws TTSError (`INITIALIZATION_ERROR`) when `region` is missing or
   *   the client cannot be constructed
   */
  async initialize(config: PollyProviderConfig): Promise<void> {
    if (!config.region) {
      throw new TTSError(
        TTSErrorCode.INITIALIZATION_ERROR,
        "AWS region is required",
        undefined,
        this.providerId,
      );
    }

    this.config = config;
    // `||` (not `??`) on purpose: an empty string falls back to the default.
    this.engine = config.engine || "neural";
    this.defaultVoice = config.defaultVoice || "Joanna";

    try {
      // Re-initialization: release the previous client instead of leaking it.
      if (this.client) {
        this.client.destroy();
      }

      // Create Polly client (no API calls are made here)
      this.client = new PollyClient({
        region: config.region,
        credentials: config.credentials,
      });

      this.initialized = true;
      // NOTE: We do NOT call getVoices() here - that's an explicit secondary operation
    } catch (error) {
      throw new TTSError(
        TTSErrorCode.INITIALIZATION_ERROR,
        `Failed to initialize AWS Polly: ${error instanceof Error ? error.message : String(error)}`,
        { error },
        this.providerId,
      );
    }
  }

  /**
   * Synthesize speech with AWS Polly.
   *
   * Audio and speech marks are requested concurrently; if either request
   * fails the whole call fails.
   *
   * @param request - Text (plain or SSML) plus optional voice, sample rate
   *   and `includeSpeechMarks` flag (speech marks are on unless it is
   *   explicitly `false`)
   * @returns Audio buffer, content type, speech marks and metadata. Note
   *   that `metadata.duration` is the wall-clock time of the Polly round
   *   trip in seconds, not the length of the generated audio.
   * @throws TTSError (`PROVIDER_ERROR`) when a Polly request fails
   */
  async synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse> {
    this.ensureInitialized();

    const capabilities = this.getCapabilities();
    this.validateRequest(request, capabilities);

    const voice = request.voice || this.defaultVoice;
    const startTime = Date.now();

    try {
      // Make parallel requests for audio and speech marks
      const [audioResponse, speechMarksResponse] = await Promise.all([
        this.synthesizeAudio(request, voice),
        request.includeSpeechMarks !== false
          ? this.synthesizeSpeechMarks(request, voice)
          : Promise.resolve<SpeechMark[]>([]),
      ]);

      const duration = (Date.now() - startTime) / 1000;

      return {
        audio: audioResponse.audio,
        contentType: audioResponse.contentType,
        speechMarks: speechMarksResponse,
        metadata: {
          providerId: this.providerId,
          voice,
          duration,
          charCount: request.text.length,
          cached: false,
          timestamp: new Date().toISOString(),
        },
      };
    } catch (error) {
      throw new TTSError(
        TTSErrorCode.PROVIDER_ERROR,
        `AWS Polly synthesis failed: ${error instanceof Error ? error.message : String(error)}`,
        { error, request },
        this.providerId,
      );
    }
  }

  /**
   * Decide which Polly `TextType` to send for the given input.
   *
   * The text is treated as SSML as soon as it contains the opening of any
   * of a fixed set of tags; this is a substring heuristic, not a parse.
   * Both the audio and the speech-marks request use this helper so the two
   * always agree on the text type.
   */
  private resolveTextType(text: string): "ssml" | "text" {
    const ssmlTagPrefixes = [
      "<speak",
      "<emphasis",
      "<break",
      "<prosody",
      "<phoneme",
      "<amazon:",
      "<aws-",
    ];
    return ssmlTagPrefixes.some((prefix) => text.includes(prefix))
      ? "ssml"
      : "text";
  }

  /**
   * Drain a Polly `AudioStream` into a single Buffer.
   *
   * Accepts either an async-iterable stream of byte chunks or an in-memory
   * `Uint8Array`.
   *
   * @throws Error when the payload is neither, rather than silently
   *   yielding an empty buffer
   */
  private async collectStream(stream: unknown): Promise<Buffer> {
    if (stream instanceof Uint8Array) {
      return Buffer.from(stream);
    }

    if (
      typeof stream === "object" &&
      stream !== null &&
      Symbol.asyncIterator in stream
    ) {
      const chunks: Uint8Array[] = [];
      for await (const chunk of stream as AsyncIterable<Uint8Array>) {
        chunks.push(chunk);
      }
      return Buffer.concat(chunks);
    }

    throw new Error("Unsupported AudioStream type received from AWS Polly");
  }

  /**
   * Request MP3 audio for the text.
   *
   * @param request - Synthesis request; `sampleRate` defaults to 24000
   * @param voice - Polly voice ID to use
   * @returns The audio bytes and the content type reported by Polly
   *   (falls back to `audio/mpeg`)
   * @throws Error when Polly returns no audio stream or one that cannot be
   *   read
   */
  private async synthesizeAudio(
    request: SynthesizeRequest,
    voice: string,
  ): Promise<{ audio: Buffer; contentType: string }> {
    const command = new SynthesizeSpeechCommand({
      Engine: this.engine === "neural" ? Engine.NEURAL : Engine.STANDARD,
      OutputFormat: OutputFormat.MP3,
      Text: request.text,
      TextType: this.resolveTextType(request.text),
      VoiceId: voice as VoiceId,
      SampleRate: String(request.sampleRate || 24000),
    });

    const response = await this.client.send(command);

    if (!response.AudioStream) {
      throw new Error("No audio stream received from AWS Polly");
    }

    const audio = await this.collectStream(response.AudioStream);

    return {
      audio,
      contentType: response.ContentType || "audio/mpeg",
    };
  }

  /**
   * Request word-level speech marks for the text.
   *
   * Polly answers with newline-delimited JSON, one object per word.
   * Per the Polly speech-marks documentation, `start`/`end` are byte
   * offsets into the submitted text (they are passed through unchanged).
   *
   * @param request - Synthesis request supplying the text
   * @param voice - Polly voice ID to use
   * @returns One mark per word; an empty array when Polly returns no stream
   * @throws Error when the stream cannot be read; SyntaxError when a line
   *   is not valid JSON
   */
  private async synthesizeSpeechMarks(
    request: SynthesizeRequest,
    voice: string,
  ): Promise<SpeechMark[]> {
    const command = new SynthesizeSpeechCommand({
      Engine: this.engine === "neural" ? Engine.NEURAL : Engine.STANDARD,
      OutputFormat: OutputFormat.JSON,
      Text: request.text,
      TextType: this.resolveTextType(request.text),
      VoiceId: voice as VoiceId,
      SpeechMarkTypes: [SpeechMarkType.WORD],
    });

    const response = await this.client.send(command);

    if (!response.AudioStream) {
      return [];
    }

    const payload = await this.collectStream(response.AudioStream);
    const marksText = payload.toString("utf-8");

    // Parse NDJSON (newline-delimited JSON): each non-blank line is one mark.
    return marksText
      .split(/\r?\n/)
      .filter((line) => line.trim().length > 0)
      .map((line) => {
        const mark: {
          time: number;
          start: number;
          end: number;
          value: string;
        } = JSON.parse(line);
        return {
          time: mark.time,
          type: "word" as const, // only WORD marks are requested above
          start: mark.start,
          end: mark.end,
          value: mark.value,
        };
      });
  }

  /**
   * Get available voices from AWS Polly.
   *
   * Voices are always restricted to the configured engine, so every
   * returned voice reports that engine as its `quality`.
   *
   * @param options - Optional filters: `language` is applied by Polly,
   *   `gender` and `quality` are applied locally to the result
   * @returns Matching voices, or an empty array when Polly returns none
   * @throws TTSError (`PROVIDER_ERROR`) when the Polly request fails
   */
  async getVoices(options?: GetVoicesOptions): Promise<Voice[]> {
    this.ensureInitialized();

    try {
      const input: DescribeVoicesCommandInput = {
        Engine: this.engine === "neural" ? Engine.NEURAL : Engine.STANDARD,
      };

      if (options?.language) {
        // Caller-supplied code; Polly rejects values it does not recognise.
        input.LanguageCode =
          options.language as DescribeVoicesCommandInput["LanguageCode"];
      }

      const response = await this.client.send(new DescribeVoicesCommand(input));

      if (!response.Voices) {
        return [];
      }

      return response.Voices.map((voice) => ({
        // NOTE(review): the SDK types these fields as optional; the
        // assertions assume DescribeVoices always populates them.
        id: voice.Id!,
        name: voice.Name!,
        language: voice.LanguageName!,
        languageCode: voice.LanguageCode!,
        gender: voice.Gender?.toLowerCase() as
          | "male"
          | "female"
          | "neutral"
          | undefined,
        quality: (this.engine === "neural" ? "neural" : "standard") as
          | "neural"
          | "standard"
          | "premium",
        supportedFeatures: {
          ssml: true,
          emotions: false,
          styles: false,
        },
        providerMetadata: {
          supportedEngines: voice.SupportedEngines,
          additionalLanguageCodes: voice.AdditionalLanguageCodes,
        },
      })).filter((voice) => {
        // Apply the filters Polly does not handle for us
        if (options?.gender && voice.gender !== options.gender) {
          return false;
        }
        if (options?.quality && voice.quality !== options.quality) {
          return false;
        }
        return true;
      });
    } catch (error) {
      throw new TTSError(
        TTSErrorCode.PROVIDER_ERROR,
        `Failed to get voices: ${error instanceof Error ? error.message : String(error)}`,
        { error },
        this.providerId,
      );
    }
  }

  /**
   * Get AWS Polly capabilities.
   *
   * Separates features that are W3C-standard from those that are
   * AWS-specific. The values are static; no API call is made.
   */
  getCapabilities(): ServerProviderCapabilities {
    return {
      // W3C Standard features
      standard: {
        supportsSSML: true, // Full SSML 1.1 + AWS extensions
        supportsPitch: true, // Via SSML <prosody pitch> (not direct API param)
        supportsRate: true, // Via SSML <prosody rate> (not direct API param)
        supportsVolume: false, // Not supported by Polly API (handle client-side)
        supportsMultipleVoices: true, // 60+ voices across 25+ languages
        maxTextLength: 3000, // AWS Polly limit per request
      },

      // Provider-specific extensions
      extensions: {
        supportsSpeechMarks: true, // Native WORD speech marks (millisecond precision)
        supportedFormats: ["mp3"], // Currently MP3 only (could add ogg, pcm)
        supportsSampleRate: true, // Configurable sample rate

        // AWS Polly-specific features
        providerSpecific: {
          engines: ["neural", "standard"], // Engine selection
          supportedSpeechMarkTypes: ["word"], // Currently only WORD (could add sentence, ssml, viseme)
          supportsLexicons: false, // Not yet implemented
          awsSSMLExtensions: true, // <aws-break>, <aws-emphasis>, <aws-w>, etc.
          neuralVoicesCount: 30, // ~30 neural voices available
          standardVoicesCount: 30, // ~30 standard voices available
          languagesCount: 25, // 25+ languages supported
        },
      },
    };
  }

  /**
   * Clean up the AWS Polly client, then run the base-class teardown.
   * Safe to call before `initialize()`.
   */
  async destroy(): Promise<void> {
    if (this.client) {
      this.client.destroy();
    }
    await super.destroy();
  }
}
|
package/src/index.ts
DELETED