@happyvertical/smrt-voice 0.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md ADDED
@@ -0,0 +1,18 @@
1
+ # @happyvertical/smrt-voice
2
+
3
+ TTS voice profiles with two creation modes: AI design or audio cloning. Word-level timing output for lip-sync.
4
+
5
+ ## Models
6
+
7
+ - **VoiceProfile**: two mutually exclusive modes — `designPrompt` (AI-generated from description) XOR `sampleAssetId` (cloned from audio). Status: `pending → processing → ready/failed`. `voiceData` is opaque provider-specific storage. `defaultSpeed` (0.5-2.0), `defaultPitch` (-20 to 20 semitones).
8
+ - **VoiceSample**: audio training data. `duration`, `transcription`, `quality` (low/medium/high), `sampleRate`, `format`. Validation: `meetsMinDuration` (≥3 sec), `isSuitableForCloning` (≥3 sec AND quality ≠ low).
9
+ - **VoiceOutput** (extends Content): generated TTS audio. `sourceText`, `audioAssetId`, `wordTimings` array `[{word, start, end}]` in seconds for lip-sync. `audioMetadata` (sampleRate, format, channels, bitDepth, provider, model). Computed: `wordCount`, `wordsPerSecond`, `hasWordTimings`, `getWordAtTime(seconds)`.
10
+
11
+ ## Gotchas
12
+
13
+ - **Default provider hardcoded**: 'qwen3-tts' — no provider abstraction layer
14
+ - **Sample minimum not enforced in constructor**: 3-sec minimum documented but not validated on create
15
+ - **WordTiming from external provider**: framework doesn't generate timings — populated by TTS service
16
+ - **Status transitions not enforced**: can manually set status without triggering generation workflow
17
+ - **voiceData is opaque**: `{ [key: string]: any }` — provider-specific, no schema
18
+ - **Optional tenancy**: tenantId=null for global/default voices
package/CLAUDE.md ADDED
@@ -0,0 +1 @@
1
+ @AGENTS.md
package/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright 2025 Happy Vertical Corporation
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,106 @@
1
+ # @happyvertical/smrt-voice
2
+
3
+ Voice profile management for AI-powered voice synthesis and cloning in the SMRT ecosystem. Manages voice profiles, cloning samples, and TTS output with word-level timing for lip-sync.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pnpm add @happyvertical/smrt-voice
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```typescript
14
+ import { VoiceProfile, VoiceSample, VoiceOutput } from '@happyvertical/smrt-voice';
15
+
16
+ // Two mutually exclusive creation modes:
17
+
18
+ // 1. Voice design -- AI generates voice from a natural language prompt
19
+ const designed = new VoiceProfile({
20
+ name: 'News Anchor',
21
+ language: 'en-US',
22
+ gender: 'male',
23
+ designPrompt: 'Warm, authoritative male voice with clear enunciation',
24
+ defaultSpeed: 1.0, // 0.5 - 2.0
25
+ defaultPitch: 0, // -20 to 20 semitones
26
+ });
27
+ await designed.save();
28
+
29
+ // 2. Voice cloning -- replicate voice from audio sample(s)
30
+ const cloned = new VoiceProfile({
31
+ name: 'Custom Voice',
32
+ language: 'en-US',
33
+ sampleAssetId: 'asset-123',
34
+ });
35
+ await cloned.save();
36
+
37
+ // Add training samples for cloning (minimum 3 seconds, quality != low)
38
+ const sample = new VoiceSample({
39
+ voiceProfileId: cloned.id,
40
+ assetId: 'asset-456',
41
+ duration: 5.2,
42
+ transcription: 'Hello, this is a test recording for voice cloning.',
43
+ quality: 'high',
44
+ sampleRate: 48000,
45
+ format: 'wav',
46
+ isPrimary: true,
47
+ });
48
+ await sample.save();
49
+
50
+ // TTS output with word-level timing for lip-sync
51
+ const output = new VoiceOutput({
52
+ voiceProfileId: designed.id,
53
+ sourceText: 'Welcome to the evening news.',
54
+ audioAssetId: 'asset-789',
55
+ duration: 2.8,
56
+ wordTimings: [
57
+ { word: 'Welcome', start: 0.0, end: 0.4 },
58
+ { word: 'to', start: 0.4, end: 0.5 },
59
+ { word: 'the', start: 0.5, end: 0.6 },
60
+ { word: 'evening', start: 0.6, end: 1.0 },
61
+ { word: 'news', start: 1.0, end: 1.3 },
62
+ ],
63
+ });
64
+ // Look up which word is being spoken at a given timestamp
65
+ output.getWordAtTime(0.7); // { word: 'evening', start: 0.6, end: 1.0 }
66
+ ```
67
+
68
+ ## API
69
+
70
+ ### Models
71
+
72
+ | Export | Description |
73
+ |--------|------------|
74
+ | `VoiceProfile` | Voice identity with two modes: `designPrompt` (AI-generated) or `sampleAssetId` (cloned) |
75
+ | `VoiceSample` | Audio training data for voice cloning with quality rating |
76
+ | `VoiceOutput` | Generated TTS audio (extends Content) with word-level timing for lip-sync |
77
+
78
+ ### Types
79
+
80
+ | Export | Description |
81
+ |--------|------------|
82
+ | `VoiceProfileStatus` | Lifecycle status: `pending`, `processing`, `ready`, `failed` |
83
+ | `VoiceGender` | Gender classification: `male`, `female`, `neutral` |
84
+ | `SampleQuality` | Audio quality rating: `low`, `medium`, `high` |
85
+ | `WordTiming` | Per-word timing entry: `{ word, start, end }` (seconds) |
86
+ | `VoiceOutputMetadata` | Audio metadata: sampleRate, format, channels, bitDepth, provider, model |
87
+ | `VoiceProfileOptions` | Profile creation options |
88
+ | `VoiceSampleOptions` | Sample creation options |
89
+ | `VoiceOutputOptions` | Output creation options |
90
+
91
+ ### Key Computed Properties
92
+
93
+ - `VoiceProfile.isCloned` / `isDesigned` -- which creation mode is active
94
+ - `VoiceProfile.isReady` -- status equals `ready`
95
+ - `VoiceSample.meetsMinDuration` -- duration >= 3 seconds
96
+ - `VoiceSample.isSuitableForCloning` -- meets min duration AND quality != low
97
+ - `VoiceOutput.wordCount` / `wordsPerSecond` -- computed from sourceText and duration
98
+ - `VoiceOutput.getWordAtTime(seconds)` -- look up word being spoken at a timestamp
99
+
100
+ ## Dependencies
101
+
102
+ - `@happyvertical/smrt-core` -- ORM and code generation
103
+ - `@happyvertical/smrt-assets` -- base asset management
104
+ - `@happyvertical/smrt-config` -- configuration loading
105
+ - `@happyvertical/smrt-content` -- content models (VoiceOutput extends Content)
106
+ - `@happyvertical/smrt-tenancy` -- multi-tenant scoping
@@ -0,0 +1,504 @@
1
+ import { Content } from '@happyvertical/smrt-content';
2
+ import { ContentOptions } from '@happyvertical/smrt-content';
3
+ import { SmrtObject } from '@happyvertical/smrt-core';
4
+ import { SmrtObjectOptions } from '@happyvertical/smrt-core';
5
+
6
+ /**
7
+ * Audio sample quality rating for a VoiceSample: 'low' (noisy or short), 'medium' (acceptable), 'high' (clear audio, good length)
8
+ */
9
+ export declare type SampleQuality = 'low' | 'medium' | 'high';
10
+
11
+ /**
12
+ * Voice gender classification for a VoiceProfile (VoiceProfileOptions documents 'neutral' as the default)
13
+ */
14
+ export declare type VoiceGender = 'male' | 'female' | 'neutral';
15
+
16
+ /**
17
+ * Generated audio output from text-to-speech synthesis
18
+ *
19
+ * VoiceOutput extends Content to represent audio generated from
20
+ * text using a VoiceProfile. It includes word-level timing information
21
+ * for lip-sync alignment in video production.
22
+ *
23
+ * @example
24
+ * ```typescript
25
+ * import { VoiceOutput } from '@happyvertical/smrt-voice';
26
+ *
27
+ * const output = new VoiceOutput({
28
+ * voiceProfileId: 'voice-123',
29
+ * sourceText: 'Welcome to the evening news broadcast.',
30
+ * audioAssetId: 'asset-789',
31
+ * duration: 3.5,
32
+ * wordTimings: [
33
+ * { word: 'Welcome', start: 0.0, end: 0.4 },
34
+ * { word: 'to', start: 0.4, end: 0.5 },
35
+ * { word: 'the', start: 0.5, end: 0.6 },
36
+ * { word: 'evening', start: 0.6, end: 1.0 },
37
+ * { word: 'news', start: 1.0, end: 1.3 },
38
+ * { word: 'broadcast', start: 1.3, end: 1.9 },
39
+ * ],
40
+ * audioMetadata: {
41
+ * sampleRate: 48000,
42
+ * format: 'wav',
43
+ * channels: 1,
44
+ * provider: 'qwen3-tts',
45
+ * },
46
+ * });
47
+ * ```
48
+ */
49
+ export declare class VoiceOutput extends Content {
50
+ /**
51
+ * ID of the VoiceProfile used for synthesis (nullable)
52
+ */
53
+ voiceProfileId: string | null;
54
+ /**
55
+ * Original text that was synthesized into audio
56
+ */
57
+ sourceText: string;
58
+ /**
59
+ * Asset ID of the generated audio file (nullable)
60
+ */
61
+ audioAssetId: string | null;
62
+ /**
63
+ * Duration of the generated audio, in seconds
64
+ */
65
+ duration: number;
66
+ /**
67
+ * Word-level timing entries ({ word, start, end } in seconds) for lip-sync alignment; supplied by the TTS service rather than generated by this package, and may be null
68
+ */
69
+ wordTimings: WordTiming[] | null;
70
+ /**
71
+ * Audio metadata (sample rate, format, channels, bit depth, provider, model, etc.)
72
+ */
73
+ audioMetadata: VoiceOutputMetadata;
74
+ constructor(options?: VoiceOutputOptions);
75
+ /**
76
+ * Number of words in the source text
77
+ */
78
+ get wordCount(): number;
79
+ /**
80
+ * Average speaking rate in words per second (computed from sourceText and duration)
81
+ */
82
+ get wordsPerSecond(): number;
83
+ /**
84
+ * Whether word timing data is available for lip-sync
85
+ */
86
+ get hasWordTimings(): boolean;
87
+ /**
88
+ * Look up the timing entry for the word being spoken at the given timestamp (in seconds). The return type allows null; presumably returned when no word covers the timestamp (confirm against the implementation).
89
+ */
90
+ getWordAtTime(seconds: number): WordTiming | null;
91
+ }
92
+
93
+ /**
94
+ * Voice output metadata
95
+ */
96
+ export declare interface VoiceOutputMetadata {
97
+ /**
98
+ * Sample rate in Hz (e.g., 44100, 48000)
99
+ */
100
+ sampleRate?: number;
101
+ /**
102
+ * Audio format (e.g., 'wav', 'mp3', 'ogg')
103
+ */
104
+ format?: string;
105
+ /**
106
+ * Number of audio channels (1 = mono, 2 = stereo)
107
+ */
108
+ channels?: number;
109
+ /**
110
+ * Bit depth in bits per sample (e.g., 16, 24, 32)
111
+ */
112
+ bitDepth?: number;
113
+ /**
114
+ * Size of the audio file in bytes
115
+ */
116
+ fileSize?: number;
117
+ /**
118
+ * TTS provider used for synthesis (e.g., 'qwen3-tts')
119
+ */
120
+ provider?: string;
121
+ /**
122
+ * Provider model used for synthesis
123
+ */
124
+ model?: string;
125
+ /**
126
+ * Speech speed multiplier used for synthesis (1.0 = normal)
127
+ */
128
+ speed?: number;
129
+ /**
130
+ * Pitch adjustment used for synthesis, in semitones
131
+ */
132
+ pitch?: number;
133
+ }
134
+
135
+ /**
136
+ * Voice output creation options
137
+ */
138
+ export declare interface VoiceOutputOptions extends ContentOptions {
139
+ /**
140
+ * ID of the VoiceProfile used for synthesis
141
+ */
142
+ voiceProfileId?: string | null;
143
+ /**
144
+ * Original text that was synthesized into audio
145
+ */
146
+ sourceText?: string;
147
+ /**
148
+ * Asset ID of the generated audio file
149
+ */
150
+ audioAssetId?: string | null;
151
+ /**
152
+ * Duration of the generated audio, in seconds
153
+ */
154
+ duration?: number;
155
+ /**
156
+ * Word-level timing entries ({ word, start, end } in seconds) for lip-sync alignment
157
+ */
158
+ wordTimings?: WordTiming[] | null;
159
+ /**
160
+ * Audio metadata (sample rate, format, etc.)
161
+ */
162
+ audioMetadata?: VoiceOutputMetadata;
163
+ }
164
+
165
+ /**
166
+ * Voice profile for AI-powered speech synthesis
167
+ *
168
+ * VoiceProfile represents a configured voice identity that can be used
169
+ * for text-to-speech synthesis. Voices can be created through:
170
+ * - Voice design: Natural language description of desired voice characteristics
171
+ * - Voice cloning: 3+ second audio sample for voice replication
172
+ *
173
+ * @example
174
+ * ```typescript
175
+ * import { VoiceProfile } from '@happyvertical/smrt-voice';
176
+ *
177
+ * // Create a designed voice
178
+ * const anchorVoice = new VoiceProfile({
179
+ * name: 'News Anchor',
180
+ * description: 'Professional news anchor voice with clear enunciation',
181
+ * language: 'en-US',
182
+ * gender: 'male',
183
+ * designPrompt: 'Warm, authoritative male voice with slight gravitas, suitable for news broadcasts',
184
+ * provider: 'qwen3-tts',
185
+ * });
186
+ *
187
+ * // Create a cloned voice
188
+ * const clonedVoice = new VoiceProfile({
189
+ * name: 'Custom Voice',
190
+ * description: 'Cloned from user sample',
191
+ * language: 'en-US',
192
+ * sampleAssetId: 'asset-123',
193
+ * provider: 'qwen3-tts',
194
+ * });
195
+ * ```
196
+ */
197
+ export declare class VoiceProfile extends SmrtObject {
198
+ /**
199
+ * Tenant ID for multi-tenant isolation
200
+ * Nullable to support global/default voices
201
+ */
202
+ tenantId: string | null;
203
+ /**
204
+ * Human-readable name for the voice profile
205
+ */
206
+ name: string;
207
+ /**
208
+ * Description of the voice characteristics
209
+ */
210
+ description: string | null;
211
+ /**
212
+ * BCP 47 language tag (e.g., 'en-US', 'zh-CN')
213
+ */
214
+ language: string;
215
+ /**
216
+ * Voice gender classification
217
+ */
218
+ gender: VoiceGender;
219
+ /**
220
+ * Natural language description for voice design
221
+ * Used when creating a voice from scratch via AI; mutually exclusive with sampleAssetId
222
+ */
223
+ designPrompt: string | null;
224
+ /**
225
+ * Asset ID of the audio sample for voice cloning
226
+ * Should be at least 3 seconds of clear speech; mutually exclusive with designPrompt
227
+ */
228
+ sampleAssetId: string | null;
229
+ /**
230
+ * Opaque provider-specific voice data (ID, embedding, etc.) with no fixed schema
231
+ * Stored after voice creation or cloning is complete
232
+ */
233
+ voiceData: Record<string, any> | null;
234
+ /**
235
+ * Default speech speed multiplier (0.5 - 2.0)
236
+ * 1.0 = normal speed
237
+ */
238
+ defaultSpeed: number;
239
+ /**
240
+ * Default pitch adjustment in semitones (-20 to 20)
241
+ * 0 = no adjustment
242
+ */
243
+ defaultPitch: number;
244
+ /**
245
+ * Voice profile lifecycle status
246
+ * - pending: Profile created but voice not yet generated
247
+ * - processing: Voice generation or cloning in progress
248
+ * - ready: Voice is ready for use
249
+ * - failed: Voice generation failed (see errorMessage)
250
+ */
251
+ status: VoiceProfileStatus;
252
+ /**
253
+ * TTS provider that created or manages this voice (documented default: 'qwen3-tts')
254
+ */
255
+ provider: string;
256
+ /**
257
+ * Error message if status is 'failed'
258
+ */
259
+ errorMessage: string | null;
260
+ constructor(options?: VoiceProfileOptions);
261
+ /**
262
+ * Whether this voice profile uses voice cloning (presumably sampleAssetId is set; confirm against the implementation)
263
+ */
264
+ get isCloned(): boolean;
265
+ /**
266
+ * Whether this voice profile uses voice design (presumably designPrompt is set; confirm against the implementation)
267
+ */
268
+ get isDesigned(): boolean;
269
+ /**
270
+ * Whether the voice is ready for use (status equals 'ready')
271
+ */
272
+ get isReady(): boolean;
273
+ /**
274
+ * Whether this is a global (default) voice (presumably tenantId is null; confirm against the implementation)
275
+ */
276
+ get isGlobal(): boolean;
277
+ }
278
+
279
+ /**
280
+ * Voice profile creation options
281
+ */
282
+ export declare interface VoiceProfileOptions extends SmrtObjectOptions {
283
+ /**
284
+ * Human-readable name for the voice profile
285
+ */
286
+ name?: string;
287
+ /**
288
+ * Description of the voice characteristics
289
+ */
290
+ description?: string | null;
291
+ /**
292
+ * BCP 47 language tag (e.g., 'en-US', 'zh-CN')
293
+ */
294
+ language?: string;
295
+ /**
296
+ * Voice gender classification
297
+ * @default 'neutral'
298
+ */
299
+ gender?: VoiceGender;
300
+ /**
301
+ * Natural language description for voice design
302
+ * Used when creating a voice from scratch; mutually exclusive with sampleAssetId
303
+ */
304
+ designPrompt?: string | null;
305
+ /**
306
+ * Asset ID of the audio sample for voice cloning
307
+ * Should be at least 3 seconds of clear speech; mutually exclusive with designPrompt
308
+ */
309
+ sampleAssetId?: string | null;
310
+ /**
311
+ * Opaque provider-specific voice data (ID, embedding, etc.)
312
+ * Stored after voice creation or cloning
313
+ */
314
+ voiceData?: Record<string, any> | null;
315
+ /**
316
+ * Default speech speed multiplier (0.5 - 2.0; 1.0 = normal speed)
317
+ * @default 1.0
318
+ */
319
+ defaultSpeed?: number;
320
+ /**
321
+ * Default pitch adjustment in semitones (-20 to 20; 0 = no adjustment)
322
+ * @default 0
323
+ */
324
+ defaultPitch?: number;
325
+ /**
326
+ * Voice profile lifecycle status
327
+ * @default 'pending'
328
+ */
329
+ status?: VoiceProfileStatus;
330
+ /**
331
+ * TTS provider that created this voice
332
+ * @default 'qwen3-tts'
333
+ */
334
+ provider?: string;
335
+ /**
336
+ * Error message if status is 'failed'
337
+ */
338
+ errorMessage?: string | null;
339
+ /**
340
+ * Tenant ID for multi-tenant isolation
341
+ * Null for global/default voices
342
+ */
343
+ tenantId?: string | null;
344
+ }
345
+
346
+ /**
347
+ * Voice profile lifecycle status: 'pending' (created, voice not yet generated), 'processing' (generation or cloning in progress), 'ready' (usable), 'failed' (generation failed)
348
+ */
349
+ export declare type VoiceProfileStatus = 'pending' | 'processing' | 'ready' | 'failed';
350
+
351
+ /**
352
+ * Audio sample for voice cloning
353
+ *
354
+ * VoiceSample represents an audio recording used as source material
355
+ * for voice cloning. For best results, samples should be:
356
+ * - At least 3 seconds long
357
+ * - Clear speech without background noise
358
+ * - Single speaker only
359
+ * - High quality (44.1kHz or higher)
360
+ *
361
+ * Multiple samples can be associated with a single VoiceProfile
362
+ * to improve voice cloning quality.
363
+ *
364
+ * @example
365
+ * ```typescript
366
+ * import { VoiceSample } from '@happyvertical/smrt-voice';
367
+ *
368
+ * const sample = new VoiceSample({
369
+ * voiceProfileId: 'voice-123',
370
+ * assetId: 'asset-456',
371
+ * duration: 5.2,
372
+ * transcription: 'Hello, this is a test recording for voice cloning.',
373
+ * quality: 'high',
374
+ * sampleRate: 48000,
375
+ * channels: 1,
376
+ * format: 'wav',
377
+ * isPrimary: true,
378
+ * });
379
+ * ```
380
+ */
381
+ export declare class VoiceSample extends SmrtObject {
382
+ /**
383
+ * Tenant ID for multi-tenant isolation
384
+ */
385
+ tenantId: string | null;
386
+ /**
387
+ * ID of the voice profile this sample belongs to
388
+ */
389
+ voiceProfileId: string | null;
390
+ /**
391
+ * Asset ID of the audio file
392
+ * References an Asset in smrt-assets
393
+ */
394
+ assetId: string | null;
395
+ /**
396
+ * Sample duration in seconds (the documented 3-second minimum is not validated on construction)
397
+ */
398
+ duration: number;
399
+ /**
400
+ * Transcription of what was said in the sample
401
+ * Used for alignment and quality verification
402
+ */
403
+ transcription: string | null;
404
+ /**
405
+ * Quality rating based on audio analysis
406
+ * - low: Noisy or short samples
407
+ * - medium: Acceptable quality
408
+ * - high: Clear audio, good length
409
+ */
410
+ quality: SampleQuality;
411
+ /**
412
+ * Sample rate in Hz (e.g., 44100, 48000)
413
+ */
414
+ sampleRate: number | null;
415
+ /**
416
+ * Number of audio channels (1 = mono, 2 = stereo)
417
+ */
418
+ channels: number | null;
419
+ /**
420
+ * Audio format (e.g., 'wav', 'mp3', 'ogg')
421
+ */
422
+ format: string | null;
423
+ /**
424
+ * Whether this is the primary sample for the voice profile
425
+ */
426
+ isPrimary: boolean;
427
+ constructor(options?: VoiceSampleOptions);
428
+ /**
429
+ * Whether the sample meets the minimum duration requirement (duration >= 3 seconds)
430
+ */
431
+ get meetsMinDuration(): boolean;
432
+ /**
433
+ * Whether the sample is suitable for cloning: meets the minimum duration AND quality is not 'low' (so 'medium' also qualifies)
434
+ */
435
+ get isSuitableForCloning(): boolean;
436
+ }
437
+
438
+ /**
439
+ * Voice sample creation options
440
+ */
441
+ export declare interface VoiceSampleOptions extends SmrtObjectOptions {
442
+ /**
443
+ * ID of the voice profile this sample belongs to
444
+ */
445
+ voiceProfileId?: string | null;
446
+ /**
447
+ * Asset ID of the audio file (references an Asset in smrt-assets)
448
+ */
449
+ assetId?: string | null;
450
+ /**
451
+ * Sample duration in seconds (at least 3 seconds is needed for cloning)
452
+ */
453
+ duration?: number;
454
+ /**
455
+ * Transcription of what was said in the sample (used for alignment and quality verification)
456
+ */
457
+ transcription?: string | null;
458
+ /**
459
+ * Quality rating based on audio analysis ('low', 'medium', or 'high')
460
+ * @default 'medium'
461
+ */
462
+ quality?: SampleQuality;
463
+ /**
464
+ * Sample rate in Hz (e.g., 44100, 48000)
465
+ */
466
+ sampleRate?: number | null;
467
+ /**
468
+ * Number of audio channels (1 = mono, 2 = stereo)
469
+ */
470
+ channels?: number | null;
471
+ /**
472
+ * Audio format (e.g., 'wav', 'mp3', 'ogg')
473
+ */
474
+ format?: string | null;
475
+ /**
476
+ * Whether this is the primary sample for the voice profile
477
+ * @default false
478
+ */
479
+ isPrimary?: boolean;
480
+ /**
481
+ * Tenant ID for multi-tenant isolation
482
+ */
483
+ tenantId?: string | null;
484
+ }
485
+
486
+ /**
487
+ * Word timing information for lip-sync alignment
488
+ */
489
+ export declare interface WordTiming {
490
+ /**
491
+ * The spoken word this entry describes
492
+ */
493
+ word: string;
494
+ /**
495
+ * Time at which the word starts, in seconds
496
+ */
497
+ start: number;
498
+ /**
499
+ * Time at which the word ends, in seconds
500
+ */
501
+ end: number;
502
+ }
503
+
504
+ export { }