@pie-players/tts-server-polly 0.1.0 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pie-players/tts-server-polly",
3
- "version": "0.1.0",
3
+ "version": "0.1.3",
4
4
  "description": "AWS Polly provider for server-side TTS with speech marks support",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -11,10 +11,13 @@
11
11
  "import": "./dist/index.js"
12
12
  }
13
13
  },
14
+ "files": [
15
+ "dist"
16
+ ],
14
17
  "scripts": {
15
18
  "build": "tsc",
16
19
  "dev": "tsc --watch",
17
- "test": "vitest",
20
+ "test": "vitest run --passWithNoTests",
18
21
  "test:coverage": "vitest --coverage"
19
22
  },
20
23
  "keywords": [
@@ -28,7 +31,7 @@
28
31
  "license": "MIT",
29
32
  "dependencies": {
30
33
  "@aws-sdk/client-polly": "^3.700.0",
31
- "@pie-players/tts-server-core": "workspace:*"
34
+ "@pie-players/tts-server-core": "0.1.3"
32
35
  },
33
36
  "devDependencies": {
34
37
  "typescript": "^5.3.3",
@@ -1 +0,0 @@
1
- $ tsc
@@ -1,603 +0,0 @@
1
- # TTS Server API Integration Guide
2
-
3
- This guide shows how to integrate the server-side TTS with speech marks into your SvelteKit application.
4
-
5
- ## Overview
6
-
7
- The integration has three parts:
8
-
9
- 1. **Server-side packages** - Handle AWS Polly API calls
10
- 2. **SvelteKit API routes** - Expose TTS endpoints
11
- 3. **Client-side provider** - Call API from browser
12
-
13
- ## Architecture
14
-
15
- ```
16
- Browser (Client)
17
-
18
- ServerTTSProvider (@pie-players/tts-client-server)
19
- ↓ HTTP POST
20
- SvelteKit API Route (/api/tts/synthesize/+server.ts)
21
-
22
- PollyServerProvider (@pie-players/tts-server-polly)
23
-
24
- AWS Polly API (audio + speech marks)
25
- ```
26
-
27
- ## Step 1: Install Packages
28
-
29
- ```bash
30
- cd your-sveltekit-app
31
-
32
- # Install server-side packages
33
- bun add @pie-players/tts-server-core
34
- bun add @pie-players/tts-server-polly
35
-
36
- # Install client-side provider
37
- bun add @pie-players/tts-client-server
38
- ```
39
-
40
- ## Step 2: Configure Environment Variables
41
-
42
- Create or update `.env`:
43
-
44
- ```bash
45
- # AWS Polly credentials
46
- AWS_REGION=us-east-1
47
- AWS_ACCESS_KEY_ID=your_access_key_id
48
- AWS_SECRET_ACCESS_KEY=your_secret_access_key
49
-
50
- # Optional: Redis for caching
51
- REDIS_URL=redis://localhost:6379
52
- ```
53
-
54
- **Important:** Never commit `.env` to git. Add it to `.gitignore`:
55
-
56
- ```
57
- .env
58
- .env.local
59
- ```
60
-
61
- ## Step 3: Create SvelteKit API Routes
62
-
63
- ### Create Directory Structure
64
-
65
- ```bash
66
- mkdir -p src/routes/api/tts/synthesize
67
- mkdir -p src/routes/api/tts/voices
68
- ```
69
-
70
- ### Synthesize Endpoint
71
-
72
- Copy the example to: **`src/routes/api/tts/synthesize/+server.ts`**
73
-
74
- ```typescript
75
- import { json, error } from '@sveltejs/kit';
76
- import type { RequestHandler } from './$types';
77
- import { PollyServerProvider } from '@pie-players/tts-server-polly';
78
-
79
- // Singleton provider instance
80
- let pollyProvider: PollyServerProvider | null = null;
81
-
82
- async function getPollyProvider(): Promise<PollyServerProvider> {
83
- if (!pollyProvider) {
84
- pollyProvider = new PollyServerProvider();
85
- await pollyProvider.initialize({
86
- region: process.env.AWS_REGION || 'us-east-1',
87
- credentials: {
88
- accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
89
- secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
90
- },
91
- engine: 'neural',
92
- defaultVoice: 'Joanna',
93
- });
94
- }
95
- return pollyProvider;
96
- }
97
-
98
- export const POST: RequestHandler = async ({ request }) => {
99
- try {
100
- const body = await request.json();
101
- const { text, voice, language, rate, includeSpeechMarks = true } = body;
102
-
103
- if (!text || typeof text !== 'string') {
104
- throw error(400, { message: 'Text is required' });
105
- }
106
-
107
- if (text.length > 3000) {
108
- throw error(400, { message: 'Text too long (max 3000 characters)' });
109
- }
110
-
111
- const polly = await getPollyProvider();
112
- const result = await polly.synthesize({
113
- text,
114
- voice: voice || 'Joanna',
115
- language: language || 'en-US',
116
- rate,
117
- includeSpeechMarks,
118
- });
119
-
120
- return json({
121
- audio: result.audio instanceof Buffer ? result.audio.toString('base64') : result.audio,
122
- contentType: result.contentType,
123
- speechMarks: result.speechMarks,
124
- metadata: result.metadata,
125
- });
126
- } catch (err) {
127
- console.error('[TTS API] Error:', err);
128
- throw error(500, { message: err instanceof Error ? err.message : 'Synthesis failed' });
129
- }
130
- };
131
- ```
132
-
133
- ### Voices Endpoint
134
-
135
- Copy the example to: **`src/routes/api/tts/voices/+server.ts`**
136
-
137
- ```typescript
138
- import { json, error } from '@sveltejs/kit';
139
- import type { RequestHandler } from './$types';
140
- import { PollyServerProvider } from '@pie-players/tts-server-polly';
141
-
142
- // Singleton provider instance for this route (module-local; not shared with the synthesize route)
143
- let pollyProvider: PollyServerProvider | null = null;
144
-
145
- async function getPollyProvider(): Promise<PollyServerProvider> {
146
- if (!pollyProvider) {
147
- pollyProvider = new PollyServerProvider();
148
- await pollyProvider.initialize({
149
- region: process.env.AWS_REGION || 'us-east-1',
150
- credentials: {
151
- accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
152
- secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
153
- },
154
- engine: 'neural',
155
- });
156
- }
157
- return pollyProvider;
158
- }
159
-
160
- export const GET: RequestHandler = async ({ url }) => {
161
- try {
162
- const language = url.searchParams.get('language') || undefined;
163
- const gender = url.searchParams.get('gender') as 'male' | 'female' | 'neutral' | undefined;
164
-
165
- const polly = await getPollyProvider();
166
- const voices = await polly.getVoices({ language, gender });
167
-
168
- return json({ voices });
169
- } catch (err) {
170
- console.error('[TTS API] Error:', err);
171
- throw error(500, { message: err instanceof Error ? err.message : 'Failed to get voices' });
172
- }
173
- };
174
- ```
175
-
176
- ## Step 4: Use in Client Code
177
-
178
- ### Basic Usage
179
-
180
- ```typescript
181
- import { ServerTTSProvider } from '@pie-players/tts-client-server';
182
- import { TTSService } from '@pie-players/pie-assessment-toolkit';
183
-
184
- // Initialize TTS service with server provider
185
- const provider = new ServerTTSProvider();
186
- const ttsService = new TTSService();
187
-
188
- await ttsService.initialize(provider, {
189
- apiEndpoint: '/api/tts',
190
- provider: 'polly',
191
- voice: 'Joanna',
192
- language: 'en-US',
193
- rate: 1.0,
194
- });
195
-
196
- // Speak with word highlighting
197
- await ttsService.speak('Hello world, this is a test.', {
198
- contentElement: document.getElementById('content'),
199
- });
200
- ```
201
-
202
- ### With Svelte Component
203
-
204
- ```svelte
205
- <script lang="ts">
206
- import { ServerTTSProvider } from '@pie-players/tts-client-server';
207
- import { TTSService } from '@pie-players/pie-assessment-toolkit';
208
- import { onMount } from 'svelte';
209
-
210
- let ttsService: TTSService;
211
- let contentElement: HTMLElement;
212
-
213
- onMount(async () => {
214
- const provider = new ServerTTSProvider();
215
- ttsService = new TTSService();
216
-
217
- await ttsService.initialize(provider, {
218
- apiEndpoint: '/api/tts',
219
- provider: 'polly',
220
- voice: 'Joanna',
221
- });
222
- });
223
-
224
- async function handleSpeak() {
225
- await ttsService.speak('Hello world', {
226
- contentElement,
227
- });
228
- }
229
- </script>
230
-
231
- <div bind:this={contentElement}>
232
- <p>Hello world, this is a test of text to speech.</p>
233
- </div>
234
-
235
- <button on:click={handleSpeak}>Speak</button>
236
- ```
237
-
238
- ## Step 5: Add Redis Caching (Optional)
239
-
240
- ### Install Redis
241
-
242
- ```bash
243
- bun add ioredis
244
- ```
245
-
246
- ### Update API Route with Caching
247
-
248
- ```typescript
249
- import { json, error } from '@sveltejs/kit';
250
- import type { RequestHandler } from './$types';
251
- import { PollyServerProvider } from '@pie-players/tts-server-polly';
252
- import { generateHashedCacheKey } from '@pie-players/tts-server-core';
253
- import Redis from 'ioredis';
254
-
255
- // Singleton instances
256
- let pollyProvider: PollyServerProvider | null = null;
257
- let redis: Redis | null = null;
258
-
259
- async function getRedis(): Promise<Redis> {
260
- if (!redis && process.env.REDIS_URL) {
261
- redis = new Redis(process.env.REDIS_URL);
262
- }
263
- return redis!;
264
- }
265
-
266
- async function getPollyProvider(): Promise<PollyServerProvider> {
267
- if (!pollyProvider) {
268
- pollyProvider = new PollyServerProvider();
269
- await pollyProvider.initialize({
270
- region: process.env.AWS_REGION || 'us-east-1',
271
- credentials: {
272
- accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
273
- secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
274
- },
275
- engine: 'neural',
276
- });
277
- }
278
- return pollyProvider;
279
- }
280
-
281
- export const POST: RequestHandler = async ({ request }) => {
282
- try {
283
- const body = await request.json();
284
- const { text, voice = 'Joanna', language = 'en-US', rate = 1.0, includeSpeechMarks = true } = body;
285
-
286
- if (!text || typeof text !== 'string') {
287
- throw error(400, { message: 'Text is required' });
288
- }
289
-
290
- if (text.length > 3000) {
291
- throw error(400, { message: 'Text too long (max 3000 characters)' });
292
- }
293
-
294
- // Generate cache key
295
- const cacheKey = await generateHashedCacheKey({
296
- providerId: 'aws-polly',
297
- text,
298
- voice,
299
- language,
300
- rate,
301
- format: 'mp3',
302
- });
303
-
304
- // Check Redis cache
305
- if (process.env.REDIS_URL) {
306
- try {
307
- const redisClient = await getRedis();
308
- const cached = await redisClient.get(cacheKey);
309
-
310
- if (cached) {
311
- console.log('[TTS API] Cache hit:', cacheKey);
312
- const result = JSON.parse(cached);
313
- result.metadata.cached = true;
314
- return json(result);
315
- }
316
- } catch (cacheError) {
317
- console.warn('[TTS API] Cache read error:', cacheError);
318
- // Continue without cache
319
- }
320
- }
321
-
322
- // Synthesize with Polly
323
- const polly = await getPollyProvider();
324
- const result = await polly.synthesize({
325
- text,
326
- voice,
327
- language,
328
- rate,
329
- includeSpeechMarks,
330
- });
331
-
332
- const response = {
333
- audio: result.audio instanceof Buffer ? result.audio.toString('base64') : result.audio,
334
- contentType: result.contentType,
335
- speechMarks: result.speechMarks,
336
- metadata: result.metadata,
337
- };
338
-
339
- // Cache result
340
- if (process.env.REDIS_URL) {
341
- try {
342
- const redisClient = await getRedis();
343
- await redisClient.setex(cacheKey, 24 * 60 * 60, JSON.stringify(response));
344
- console.log('[TTS API] Cached result:', cacheKey);
345
- } catch (cacheError) {
346
- console.warn('[TTS API] Cache write error:', cacheError);
347
- // Non-fatal, continue
348
- }
349
- }
350
-
351
- return json(response);
352
- } catch (err) {
353
- console.error('[TTS API] Error:', err);
354
- throw error(500, { message: err instanceof Error ? err.message : 'Synthesis failed' });
355
- }
356
- };
357
- ```
358
-
359
- ## Step 6: Test the Integration
360
-
361
- ### Test API Endpoints
362
-
363
- ```bash
364
- # Test synthesize endpoint
365
- curl -X POST http://localhost:5173/api/tts/synthesize \
366
- -H "Content-Type: application/json" \
367
- -d '{"text": "Hello world", "voice": "Joanna"}'
368
-
369
- # Test voices endpoint
370
- curl http://localhost:5173/api/tts/voices
371
- ```
372
-
373
- ### Test in Browser
374
-
375
- ```typescript
376
- // In browser console
377
- const response = await fetch('/api/tts/synthesize', {
378
- method: 'POST',
379
- headers: { 'Content-Type': 'application/json' },
380
- body: JSON.stringify({
381
- text: 'Hello world, this is a test.',
382
- voice: 'Joanna',
383
- }),
384
- });
385
-
386
- const data = await response.json();
387
- console.log('Speech marks:', data.speechMarks);
388
- console.log('Metadata:', data.metadata);
389
- ```
390
-
391
- ## Redis Caching Benefits
392
-
393
- With Redis caching enabled:
394
-
395
- - **First request:** Full Polly API call (~300-500ms)
396
- - **Cached requests:** Redis retrieval (~10-20ms)
397
- - **Cost savings:** Polly API calls drop in proportion to the cache hit rate (e.g. 80-90% fewer calls when 80-90% of requests repeat cached content)
398
- - **TTL:** 24 hours (configurable)
399
-
400
- ### Cache Key Format
401
-
402
- ```
403
- tts:aws-polly:Joanna:en-US:1.00:mp3:<sha256-hash-of-text>
404
- ```
405
-
406
- ## Security Considerations
407
-
408
- ### Credentials
409
-
410
- - ✅ AWS credentials stay on server (never exposed to browser)
411
- - ✅ Use IAM roles in production (no hardcoded credentials)
412
- - ✅ Use environment variables for configuration
413
-
414
- ### Authentication (Optional)
415
-
416
- Add authentication middleware to protect API:
417
-
418
- ```typescript
419
- // src/hooks.server.ts
420
- import type { Handle } from '@sveltejs/kit';
421
-
422
- export const handle: Handle = async ({ event, resolve }) => {
423
- // Check if request is to TTS API
424
- if (event.url.pathname.startsWith('/api/tts')) {
425
- // Verify JWT token or API key
426
- const authHeader = event.request.headers.get('Authorization');
427
- if (!authHeader || !isValidToken(authHeader)) {
428
- return new Response(JSON.stringify({ error: 'Unauthorized' }), {
429
- status: 401,
430
- headers: { 'Content-Type': 'application/json' },
431
- });
432
- }
433
- }
434
-
435
- return resolve(event);
436
- };
437
- ```
438
-
439
- ### Rate Limiting
440
-
441
- Add rate limiting to prevent abuse:
442
-
443
- ```typescript
444
- import { rateLimit } from '$lib/rate-limiter';
445
-
446
- export const POST: RequestHandler = async ({ request, getClientAddress }) => {
447
- // Check rate limit
448
- const clientIP = getClientAddress();
449
- const allowed = await rateLimit.check(clientIP, {
450
- maxRequests: 60, // 60 requests
451
- windowMs: 60000, // per minute
452
- });
453
-
454
- if (!allowed) {
455
- throw error(429, { message: 'Rate limit exceeded' });
456
- }
457
-
458
- // ... rest of handler
459
- };
460
- ```
461
-
462
- ## Cost Optimization
463
-
464
- ### AWS Polly Pricing
465
-
466
- - **Neural voices:** $16 per 1M characters
467
- - **Standard voices:** $4 per 1M characters
468
-
469
- ### Example Costs
470
-
471
- **Scenario:** 1000 students taking an assessment
472
-
473
- - Average assessment: 5 passages × 500 words × 5 chars = 12,500 chars per student
474
- - Total: 1000 students × 12,500 chars = 12.5M characters
475
- - Cost without caching: $200 (neural) or $50 (standard)
476
- - Cost with 80% cache hit rate: $40 (neural) or $10 (standard)
477
-
478
- ### Optimization Tips
479
-
480
- 1. **Use Redis caching** - 24-hour TTL captures repeated content
481
- 2. **Standard voices for development** - Switch to neural for production
482
- 3. **Monitor usage** - Track API calls and cache hit rates
483
- 4. **Pre-generate common content** - Cache frequently used passages
484
-
485
- ## Troubleshooting
486
-
487
- ### Error: "AWS credentials not found"
488
-
489
- Check environment variables are set:
490
- ```bash
491
- echo $AWS_REGION
492
- echo $AWS_ACCESS_KEY_ID
493
- ```
494
-
495
- ### Error: "Text too long"
496
-
497
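- The example route rejects text longer than 3000 characters, in line with AWS Polly's limit of 3000 billed characters per synthesis request. Split longer text into chunks: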
498
-
499
- ```typescript
500
- function splitText(text: string, maxLength = 2500): string[] {
501
- const sentences = text.split(/(?<=[.!?])\s+/);
502
- const chunks: string[] = [];
503
- let currentChunk = '';
504
-
505
- for (const sentence of sentences) {
506
- if (currentChunk.length + sentence.length > maxLength) {
507
- chunks.push(currentChunk.trim());
508
- currentChunk = sentence;
509
- } else {
510
- currentChunk += ' ' + sentence;
511
- }
512
- }
513
-
514
- if (currentChunk.trim()) {
515
- chunks.push(currentChunk.trim());
516
- }
517
-
518
- return chunks;
519
- }
520
- ```
521
-
522
- ### Error: "Speech marks empty"
523
-
524
- Check that:
525
- 1. Speech marks are requested in API call
526
- 2. Provider supports speech marks
527
- 3. Text is not empty
528
-
529
- ### Redis connection errors
530
-
531
- If Redis is unavailable, cache reads and writes fail with a logged warning and the API falls back to calling Polly directly. Check that Redis is reachable:
532
-
533
- ```bash
534
- redis-cli ping
535
- # Should return: PONG
536
- ```
537
-
538
- ## Production Deployment
539
-
540
- ### Environment Setup
541
-
542
- ```bash
543
- # Production environment variables
544
- export NODE_ENV=production
545
- export AWS_REGION=us-east-1
546
- export AWS_ACCESS_KEY_ID=xxx
547
- export AWS_SECRET_ACCESS_KEY=yyy
548
- export REDIS_URL=redis://your-redis-host:6379
549
- ```
550
-
551
- ### Docker Deployment
552
-
553
- ```dockerfile
554
- FROM node:20-alpine
555
- WORKDIR /app
556
- COPY . .
557
- RUN npm ci --production
558
- RUN npm run build
559
- EXPOSE 3000
560
- CMD ["node", "build"]
561
- ```
562
-
563
- ### Health Check
564
-
565
- Add a health endpoint:
566
-
567
- ```typescript
568
- // src/routes/api/health/+server.ts
569
- import { json } from '@sveltejs/kit';
570
- import type { RequestHandler } from './$types';
571
-
572
- export const GET: RequestHandler = async () => {
573
- const health = {
574
- status: 'ok',
575
- timestamp: new Date().toISOString(),
576
- services: {
577
- polly: await checkPolly(),
578
- redis: await checkRedis(),
579
- },
580
- };
581
-
582
- return json(health);
583
- };
584
- ```
585
-
586
- ## Next Steps
587
-
588
- 1. **Test in your app** - Create a demo page
589
- 2. **Monitor usage** - Track API calls and costs
590
- 3. **Add more providers** - Google Cloud TTS, ElevenLabs
591
- 4. **Optimize caching** - Fine-tune TTL and eviction
592
-
593
- ## Complete Example
594
-
595
- See the section-player demo for a complete working example:
596
- - `packages/section-player/demo.html` - Client-side usage
597
- - The API routes are not included in the demo; add them to your own SvelteKit app as described in Step 3
598
-
599
- ## Support
600
-
601
- For issues or questions:
602
- - Check the [TTS Server API Architecture](../../../docs/tts-server-api-architecture.md)
603
- - See [TTS Highlighting Implementation Plan](../../../docs/tts-highlighting-implementation-plan.md)
@@ -1,110 +0,0 @@
1
- /**
2
- * SvelteKit API route for TTS synthesis
3
- *
4
- * Copy this file to your SvelteKit app:
5
- * src/routes/api/tts/synthesize/+server.ts
6
- */
7
-
8
- import { generateHashedCacheKey } from "@pie-players/tts-server-core";
9
- import { PollyServerProvider } from "@pie-players/tts-server-polly";
10
- import { error, json } from "@sveltejs/kit";
11
- import type { RequestHandler } from "./$types";
12
-
13
- // Initialize Polly provider (singleton)
14
- let pollyProvider: PollyServerProvider | null = null;
15
-
16
- async function getPollyProvider(): Promise<PollyServerProvider> {
17
- if (!pollyProvider) {
18
- pollyProvider = new PollyServerProvider();
19
- await pollyProvider.initialize({
20
- region: process.env.AWS_REGION || "us-east-1",
21
- credentials: {
22
- accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
23
- secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
24
- },
25
- engine: "neural",
26
- defaultVoice: "Joanna",
27
- });
28
- }
29
- return pollyProvider;
30
- }
31
-
32
- // Optional: Redis caching
33
- // import { createClient } from 'redis';
34
- // const redis = createClient({ url: process.env.REDIS_URL });
35
- // await redis.connect();
36
-
37
- export const POST: RequestHandler = async ({ request }) => {
38
- try {
39
- const body = await request.json();
40
- const {
41
- text,
42
- provider = "polly",
43
- voice,
44
- language,
45
- rate,
46
- includeSpeechMarks = true,
47
- } = body;
48
-
49
- // Validate request
50
- if (!text || typeof text !== "string") {
51
- throw error(400, { message: "Text is required and must be a string" });
52
- }
53
-
54
- if (text.length > 3000) {
55
- throw error(400, { message: "Text too long (max 3000 characters)" });
56
- }
57
-
58
- // Optional: Check Redis cache
59
- // const cacheKey = await generateHashedCacheKey({
60
- // providerId: 'aws-polly',
61
- // text,
62
- // voice: voice || 'Joanna',
63
- // language: language || 'en-US',
64
- // rate: rate || 1.0,
65
- // format: 'mp3',
66
- // });
67
- //
68
- // const cached = await redis.get(cacheKey);
69
- // if (cached) {
70
- // console.log('[TTS API] Cache hit:', cacheKey);
71
- // return json(JSON.parse(cached));
72
- // }
73
-
74
- // Get Polly provider
75
- const polly = await getPollyProvider();
76
-
77
- // Synthesize speech
78
- const result = await polly.synthesize({
79
- text,
80
- voice: voice || "Joanna",
81
- language: language || "en-US",
82
- rate,
83
- includeSpeechMarks,
84
- });
85
-
86
- // Convert Buffer to base64 for JSON response
87
- const response = {
88
- audio:
89
- result.audio instanceof Buffer
90
- ? result.audio.toString("base64")
91
- : result.audio,
92
- contentType: result.contentType,
93
- speechMarks: result.speechMarks,
94
- metadata: result.metadata,
95
- };
96
-
97
- // Optional: Cache result
98
- // await redis.setex(cacheKey, 24 * 60 * 60, JSON.stringify(response));
99
-
100
- return json(response);
101
- } catch (err) {
102
- console.error("[TTS API] Synthesis error:", err);
103
-
104
- if (err instanceof Error) {
105
- throw error(500, { message: err.message });
106
- }
107
-
108
- throw error(500, { message: "Internal server error" });
109
- }
110
- };
@@ -1,65 +0,0 @@
1
- /**
2
- * SvelteKit API route for listing TTS voices
3
- *
4
- * Copy this file to your SvelteKit app:
5
- * src/routes/api/tts/voices/+server.ts
6
- */
7
-
8
- import { PollyServerProvider } from "@pie-players/tts-server-polly";
9
- import { error, json } from "@sveltejs/kit";
10
- import type { RequestHandler } from "./$types";
11
-
12
- // Initialize Polly provider (singleton)
13
- let pollyProvider: PollyServerProvider | null = null;
14
-
15
- async function getPollyProvider(): Promise<PollyServerProvider> {
16
- if (!pollyProvider) {
17
- pollyProvider = new PollyServerProvider();
18
- await pollyProvider.initialize({
19
- region: process.env.AWS_REGION || "us-east-1",
20
- credentials: {
21
- accessKeyId: process.env.AWS_ACCESS_KEY_ID!,
22
- secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY!,
23
- },
24
- engine: "neural",
25
- defaultVoice: "Joanna",
26
- });
27
- }
28
- return pollyProvider;
29
- }
30
-
31
- export const GET: RequestHandler = async ({ url }) => {
32
- try {
33
- const language = url.searchParams.get("language") || undefined;
34
- const gender = url.searchParams.get("gender") as
35
- | "male"
36
- | "female"
37
- | "neutral"
38
- | undefined;
39
- const quality = url.searchParams.get("quality") as
40
- | "standard"
41
- | "neural"
42
- | "premium"
43
- | undefined;
44
-
45
- // Get Polly provider
46
- const polly = await getPollyProvider();
47
-
48
- // Get voices with filters
49
- const voices = await polly.getVoices({
50
- language,
51
- gender,
52
- quality,
53
- });
54
-
55
- return json({ voices });
56
- } catch (err) {
57
- console.error("[TTS API] Get voices error:", err);
58
-
59
- if (err instanceof Error) {
60
- throw error(500, { message: err.message });
61
- }
62
-
63
- throw error(500, { message: "Internal server error" });
64
- }
65
- };
@@ -1,426 +0,0 @@
1
- /**
2
- * AWS Polly server-side TTS provider
3
- * @module @pie-players/tts-server-polly
4
- */
5
-
6
- import {
7
- DescribeVoicesCommand,
8
- type DescribeVoicesCommandInput,
9
- Engine,
10
- OutputFormat,
11
- PollyClient,
12
- SpeechMarkType,
13
- SynthesizeSpeechCommand,
14
- VoiceId,
15
- } from "@aws-sdk/client-polly";
16
-
17
- import {
18
- BaseTTSProvider,
19
- type GetVoicesOptions,
20
- type ServerProviderCapabilities,
21
- type SpeechMark,
22
- type SynthesizeRequest,
23
- type SynthesizeResponse,
24
- TTSError,
25
- TTSErrorCode,
26
- type TTSServerConfig,
27
- type Voice,
28
- } from "@pie-players/tts-server-core";
29
-
30
- /**
31
- * AWS Polly provider configuration.
32
- *
33
- * This extends the base TTSServerConfig with Polly-specific settings.
34
- * All fields marked with @extension are AWS-specific and not portable.
35
- */
36
- export interface PollyProviderConfig extends TTSServerConfig {
37
- /**
38
- * AWS region (e.g., 'us-east-1', 'us-west-2', 'eu-west-1')
39
- *
40
- * @extension AWS-specific (region concept)
41
- * @required
42
- */
43
- region: string;
44
-
45
- /**
46
- * AWS credentials for API authentication
47
- *
48
- * @extension AWS-specific
49
- * @note In production, prefer IAM roles over hardcoded credentials
50
- * @see https://docs.aws.amazon.com/sdk-for-javascript/v3/developer-guide/setting-credentials.html
51
- */
52
- credentials?: {
53
- accessKeyId: string;
54
- secretAccessKey: string;
55
- sessionToken?: string;
56
- };
57
-
58
- /**
59
- * Polly engine type: 'neural' (higher quality) or 'standard' (lower cost)
60
- *
61
- * @extension AWS Polly-specific
62
- * @default 'neural'
63
- * @note Neural: $16/1M chars, Standard: $4/1M chars
64
- */
65
- engine?: "neural" | "standard";
66
-
67
- /**
68
- * Default voice ID if not specified in synthesis requests
69
- *
70
- * @standard Voice selection is standard, but voice names are provider-specific
71
- * @default 'Joanna'
72
- * @see https://docs.aws.amazon.com/polly/latest/dg/voicelist.html
73
- */
74
- defaultVoice?: string;
75
- }
76
-
77
- /**
78
- * AWS Polly Server Provider
79
- *
80
- * Provides high-quality neural text-to-speech with precise word-level timing
81
- * through AWS Polly speech marks.
82
- *
83
- * Features:
84
- * - Native speech marks support (millisecond precision)
85
- * - Neural and standard voices
86
- * - 25+ languages
87
- * - Full SSML support
88
- * - Parallel audio + speech marks requests
89
- */
90
- export class PollyServerProvider extends BaseTTSProvider {
91
- readonly providerId = "aws-polly";
92
- readonly providerName = "AWS Polly";
93
- readonly version = "1.0.0";
94
-
95
- private client!: PollyClient;
96
- private engine: "neural" | "standard" = "neural";
97
- private defaultVoice = "Joanna";
98
-
99
- /**
100
- * Initialize the AWS Polly provider.
101
- *
102
- * This is FAST and lightweight - only validates config and creates the Polly client.
103
- * Does NOT fetch voices or make test API calls.
104
- *
105
- * @param config - Polly configuration with region and credentials
106
- * @performance Completes in ~10-50ms
107
- */
108
- async initialize(config: PollyProviderConfig): Promise<void> {
109
- if (!config.region) {
110
- throw new TTSError(
111
- TTSErrorCode.INITIALIZATION_ERROR,
112
- "AWS region is required",
113
- undefined,
114
- this.providerId,
115
- );
116
- }
117
-
118
- this.config = config;
119
- this.engine = config.engine || "neural";
120
- this.defaultVoice = config.defaultVoice || "Joanna";
121
-
122
- try {
123
- // Create Polly client (fast - no API calls)
124
- this.client = new PollyClient({
125
- region: config.region,
126
- credentials: config.credentials,
127
- });
128
-
129
- this.initialized = true;
130
- // NOTE: We do NOT call getVoices() here - that's an explicit secondary operation
131
- } catch (error) {
132
- throw new TTSError(
133
- TTSErrorCode.INITIALIZATION_ERROR,
134
- `Failed to initialize AWS Polly: ${error instanceof Error ? error.message : String(error)}`,
135
- { error },
136
- this.providerId,
137
- );
138
- }
139
- }
140
-
141
- /**
142
- * Synthesize speech with AWS Polly
143
- */
144
- async synthesize(request: SynthesizeRequest): Promise<SynthesizeResponse> {
145
- this.ensureInitialized();
146
-
147
- const capabilities = this.getCapabilities();
148
- this.validateRequest(request, capabilities);
149
-
150
- const voice = request.voice || this.defaultVoice;
151
- const startTime = Date.now();
152
-
153
- try {
154
- // Make parallel requests for audio and speech marks
155
- const [audioResponse, speechMarksResponse] = await Promise.all([
156
- this.synthesizeAudio(request, voice),
157
- request.includeSpeechMarks !== false
158
- ? this.synthesizeSpeechMarks(request, voice)
159
- : Promise.resolve([]),
160
- ]);
161
-
162
- const duration = (Date.now() - startTime) / 1000;
163
-
164
- return {
165
- audio: audioResponse.audio,
166
- contentType: audioResponse.contentType,
167
- speechMarks: speechMarksResponse,
168
- metadata: {
169
- providerId: this.providerId,
170
- voice,
171
- duration,
172
- charCount: request.text.length,
173
- cached: false,
174
- timestamp: new Date().toISOString(),
175
- },
176
- };
177
- } catch (error) {
178
- throw new TTSError(
179
- TTSErrorCode.PROVIDER_ERROR,
180
- `AWS Polly synthesis failed: ${error instanceof Error ? error.message : String(error)}`,
181
- { error, request },
182
- this.providerId,
183
- );
184
- }
185
- }
186
-
187
- /**
188
- * Synthesize audio stream
189
- */
190
- private async synthesizeAudio(
191
- request: SynthesizeRequest,
192
- voice: string,
193
- ): Promise<{ audio: Buffer; contentType: string }> {
194
- // Detect if text contains SSML tags
195
- const isSsml =
196
- request.text.includes("<speak") ||
197
- request.text.includes("<emphasis") ||
198
- request.text.includes("<break") ||
199
- request.text.includes("<prosody") ||
200
- request.text.includes("<phoneme") ||
201
- request.text.includes("<amazon:") ||
202
- request.text.includes("<aws-");
203
-
204
- const textType = isSsml ? "ssml" : "text";
205
-
206
- if (isSsml) {
207
- console.log(
208
- "[PollyServerProvider] Detected SSML content, using TextType: ssml",
209
- );
210
- }
211
-
212
- const command = new SynthesizeSpeechCommand({
213
- Engine: this.engine === "neural" ? Engine.NEURAL : Engine.STANDARD,
214
- OutputFormat: OutputFormat.MP3,
215
- Text: request.text,
216
- TextType: textType,
217
- VoiceId: voice as VoiceId,
218
- SampleRate: String(request.sampleRate || 24000),
219
- });
220
-
221
- const response = await this.client.send(command);
222
-
223
- if (!response.AudioStream) {
224
- throw new Error("No audio stream received from AWS Polly");
225
- }
226
-
227
- // Convert stream to buffer
228
- const chunks: Uint8Array[] = [];
229
- const stream = response.AudioStream;
230
-
231
- if (Symbol.asyncIterator in stream) {
232
- for await (const chunk of stream as AsyncIterable<Uint8Array>) {
233
- chunks.push(chunk);
234
- }
235
- } else if (stream instanceof Uint8Array) {
236
- chunks.push(stream);
237
- }
238
-
239
- const audioBuffer = Buffer.concat(chunks);
240
-
241
- return {
242
- audio: audioBuffer,
243
- contentType: response.ContentType || "audio/mpeg",
244
- };
245
- }
246
-
247
- /**
248
- * Synthesize speech marks
249
- */
250
- private async synthesizeSpeechMarks(
251
- request: SynthesizeRequest,
252
- voice: string,
253
- ): Promise<SpeechMark[]> {
254
- // Detect if text contains SSML tags (same logic as audio synthesis)
255
- const isSsml =
256
- request.text.includes("<speak") ||
257
- request.text.includes("<emphasis") ||
258
- request.text.includes("<break") ||
259
- request.text.includes("<prosody") ||
260
- request.text.includes("<phoneme") ||
261
- request.text.includes("<amazon:") ||
262
- request.text.includes("<aws-");
263
-
264
- const textType = isSsml ? "ssml" : "text";
265
-
266
- const command = new SynthesizeSpeechCommand({
267
- Engine: this.engine === "neural" ? Engine.NEURAL : Engine.STANDARD,
268
- OutputFormat: OutputFormat.JSON,
269
- Text: request.text,
270
- TextType: textType,
271
- VoiceId: voice as VoiceId,
272
- SpeechMarkTypes: [SpeechMarkType.WORD],
273
- });
274
-
275
- const response = await this.client.send(command);
276
-
277
- if (!response.AudioStream) {
278
- return [];
279
- }
280
-
281
- // Convert stream to text
282
- const chunks: Uint8Array[] = [];
283
- const stream = response.AudioStream;
284
-
285
- if (Symbol.asyncIterator in stream) {
286
- for await (const chunk of stream as AsyncIterable<Uint8Array>) {
287
- chunks.push(chunk);
288
- }
289
- } else if (stream instanceof Uint8Array) {
290
- chunks.push(stream);
291
- }
292
-
293
- const marksText = Buffer.concat(chunks).toString("utf-8");
294
-
295
- // Parse NDJSON (newline-delimited JSON)
296
- // Each line is a separate JSON object
297
- const speechMarks = marksText
298
- .trim()
299
- .split("\n")
300
- .filter((line) => line.trim())
301
- .map((line) => {
302
- const mark = JSON.parse(line);
303
- return {
304
- time: mark.time,
305
- type: "word" as const,
306
- start: mark.start,
307
- end: mark.end,
308
- value: mark.value,
309
- };
310
- });
311
-
312
- return speechMarks;
313
- }
314
-
315
- /**
316
- * Get available voices from AWS Polly
317
- */
318
- async getVoices(options?: GetVoicesOptions): Promise<Voice[]> {
319
- this.ensureInitialized();
320
-
321
- try {
322
- const input: DescribeVoicesCommandInput = {
323
- Engine: this.engine === "neural" ? Engine.NEURAL : Engine.STANDARD,
324
- };
325
-
326
- if (options?.language) {
327
- input.LanguageCode = options.language as any;
328
- }
329
-
330
- const command = new DescribeVoicesCommand(input);
331
- const response = await this.client.send(command);
332
-
333
- if (!response.Voices) {
334
- return [];
335
- }
336
-
337
- return response.Voices.map((voice) => ({
338
- id: voice.Id!,
339
- name: voice.Name!,
340
- language: voice.LanguageName!,
341
- languageCode: voice.LanguageCode!,
342
- gender: voice.Gender?.toLowerCase() as
343
- | "male"
344
- | "female"
345
- | "neutral"
346
- | undefined,
347
- quality: (this.engine === "neural" ? "neural" : "standard") as
348
- | "neural"
349
- | "standard"
350
- | "premium",
351
- supportedFeatures: {
352
- ssml: true,
353
- emotions: false,
354
- styles: false,
355
- },
356
- providerMetadata: {
357
- supportedEngines: voice.SupportedEngines,
358
- additionalLanguageCodes: voice.AdditionalLanguageCodes,
359
- },
360
- })).filter((voice) => {
361
- // Apply filters
362
- if (options?.gender && voice.gender !== options.gender) {
363
- return false;
364
- }
365
- if (options?.quality && voice.quality !== options.quality) {
366
- return false;
367
- }
368
- return true;
369
- });
370
- } catch (error) {
371
- throw new TTSError(
372
- TTSErrorCode.PROVIDER_ERROR,
373
- `Failed to get voices: ${error instanceof Error ? error.message : String(error)}`,
374
- { error },
375
- this.providerId,
376
- );
377
- }
378
- }
379
-
380
- /**
381
- * Get AWS Polly capabilities.
382
- *
383
- * Clearly documents what features are W3C-standard vs AWS-specific.
384
- */
385
- getCapabilities(): ServerProviderCapabilities {
386
- return {
387
- // W3C Standard features
388
- standard: {
389
- supportsSSML: true, // ✅ Full SSML 1.1 + AWS extensions
390
- supportsPitch: true, // ✅ Via SSML <prosody pitch> (not direct API param)
391
- supportsRate: true, // ✅ Via SSML <prosody rate> (not direct API param)
392
- supportsVolume: false, // ❌ Not supported by Polly API (handle client-side)
393
- supportsMultipleVoices: true, // ✅ 60+ voices across 25+ languages
394
- maxTextLength: 3000, // AWS Polly limit per request
395
- },
396
-
397
- // Provider-specific extensions
398
- extensions: {
399
- supportsSpeechMarks: true, // ✅ Native WORD speech marks (millisecond precision)
400
- supportedFormats: ["mp3"], // Currently MP3 only (could add ogg, pcm)
401
- supportsSampleRate: true, // ✅ Configurable sample rate
402
-
403
- // AWS Polly-specific features
404
- providerSpecific: {
405
- engines: ["neural", "standard"], // Engine selection
406
- supportedSpeechMarkTypes: ["word"], // Currently only WORD (could add sentence, ssml, viseme)
407
- supportsLexicons: false, // Not yet implemented
408
- awsSSMLExtensions: true, // <aws-break>, <aws-emphasis>, <aws-w>, etc.
409
- neuralVoicesCount: 30, // ~30 neural voices available
410
- standardVoicesCount: 30, // ~30 standard voices available
411
- languagesCount: 25, // 25+ languages supported
412
- },
413
- },
414
- };
415
- }
416
-
417
- /**
418
- * Clean up AWS Polly client
419
- */
420
- async destroy(): Promise<void> {
421
- if (this.client) {
422
- this.client.destroy();
423
- }
424
- await super.destroy();
425
- }
426
- }
package/src/index.ts DELETED
@@ -1,7 +0,0 @@
1
- /**
2
- * AWS Polly server-side TTS provider
3
- * @module @pie-players/tts-server-polly
4
- */
5
-
6
- export type { PollyProviderConfig } from "./PollyServerProvider.js";
7
- export { PollyServerProvider } from "./PollyServerProvider.js";
package/tsconfig.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "extends": "../tts-server-core/tsconfig.json",
3
- "compilerOptions": {
4
- "outDir": "./dist",
5
- "rootDir": "./src"
6
- },
7
- "include": ["src/**/*"],
8
- "exclude": ["node_modules", "dist", "**/*.test.ts"]
9
- }