@happyvertical/smrt-voice 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +18 -0
- package/CLAUDE.md +1 -0
- package/LICENSE +7 -0
- package/README.md +106 -0
- package/dist/index.d.ts +504 -0
- package/dist/index.js +330 -0
- package/dist/index.js.map +1 -0
- package/dist/manifest.json +3047 -0
- package/dist/smrt-knowledge.json +1158 -0
- package/package.json +72 -0
package/AGENTS.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# @happyvertical/smrt-voice
|
|
2
|
+
|
|
3
|
+
TTS voice profiles with two creation modes: AI design or audio cloning. Word-level timing output for lip-sync.
|
|
4
|
+
|
|
5
|
+
## Models
|
|
6
|
+
|
|
7
|
+
- **VoiceProfile**: two mutually exclusive modes — `designPrompt` (AI-generated from description) XOR `sampleAssetId` (cloned from audio). Status: `pending → processing → ready/failed`. `voiceData` is opaque provider-specific storage. `defaultSpeed` (0.5-2.0), `defaultPitch` (-20 to 20 semitones).
|
|
8
|
+
- **VoiceSample**: audio training data. `duration`, `transcription`, `quality` (low/medium/high), `sampleRate`, `format`. Validation: `meetsMinDuration` (≥3 sec), `isSuitableForCloning` (≥3 sec AND quality ≠ low).
|
|
9
|
+
- **VoiceOutput** (extends Content): generated TTS audio. `sourceText`, `audioAssetId`, `duration`, `wordTimings` array `[{word, start, end}]` in seconds for lip-sync. `audioMetadata` (sampleRate, format, channels, bitDepth, fileSize, provider, model, speed, pitch — all optional). Computed: `wordCount`, `wordsPerSecond`, `hasWordTimings`, `getWordAtTime(seconds)`.
|
|
10
|
+
|
|
11
|
+
## Gotchas
|
|
12
|
+
|
|
13
|
+
- **Default provider hardcoded**: 'qwen3-tts' — no provider abstraction layer
|
|
14
|
+
- **Sample minimum not enforced in constructor**: 3-sec minimum documented but not validated on create
|
|
15
|
+
- **WordTiming from external provider**: framework doesn't generate timings — populated by TTS service
|
|
16
|
+
- **Status transitions not enforced**: can manually set status without triggering generation workflow
|
|
17
|
+
- **voiceData is opaque**: `{ [key: string]: any }` — provider-specific, no schema
|
|
18
|
+
- **Optional tenancy**: tenantId=null for global/default voices
|
package/CLAUDE.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
@AGENTS.md
|
package/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2025 Happy Vertical Corporation
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# @happyvertical/smrt-voice
|
|
2
|
+
|
|
3
|
+
Voice profile management for AI-powered voice synthesis and cloning in the SMRT ecosystem. Manages voice profiles, cloning samples, and TTS output with word-level timing for lip-sync.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pnpm add @happyvertical/smrt-voice
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```typescript
|
|
14
|
+
import { VoiceProfile, VoiceSample, VoiceOutput } from '@happyvertical/smrt-voice';
|
|
15
|
+
|
|
16
|
+
// Two mutually exclusive creation modes:
|
|
17
|
+
|
|
18
|
+
// 1. Voice design -- AI generates voice from a natural language prompt
|
|
19
|
+
const designed = new VoiceProfile({
|
|
20
|
+
name: 'News Anchor',
|
|
21
|
+
language: 'en-US',
|
|
22
|
+
gender: 'male',
|
|
23
|
+
designPrompt: 'Warm, authoritative male voice with clear enunciation',
|
|
24
|
+
defaultSpeed: 1.0, // 0.5 - 2.0
|
|
25
|
+
defaultPitch: 0, // -20 to 20 semitones
|
|
26
|
+
});
|
|
27
|
+
await designed.save();
|
|
28
|
+
|
|
29
|
+
// 2. Voice cloning -- replicate voice from audio sample(s)
|
|
30
|
+
const cloned = new VoiceProfile({
|
|
31
|
+
name: 'Custom Voice',
|
|
32
|
+
language: 'en-US',
|
|
33
|
+
sampleAssetId: 'asset-123',
|
|
34
|
+
});
|
|
35
|
+
await cloned.save();
|
|
36
|
+
|
|
37
|
+
// Add training samples for cloning (minimum 3 seconds, quality != low)
|
|
38
|
+
const sample = new VoiceSample({
|
|
39
|
+
voiceProfileId: cloned.id,
|
|
40
|
+
assetId: 'asset-456',
|
|
41
|
+
duration: 5.2,
|
|
42
|
+
transcription: 'Hello, this is a test recording for voice cloning.',
|
|
43
|
+
quality: 'high',
|
|
44
|
+
sampleRate: 48000,
|
|
45
|
+
format: 'wav',
|
|
46
|
+
isPrimary: true,
|
|
47
|
+
});
|
|
48
|
+
await sample.save();
|
|
49
|
+
|
|
50
|
+
// TTS output with word-level timing for lip-sync
|
|
51
|
+
const output = new VoiceOutput({
|
|
52
|
+
voiceProfileId: designed.id,
|
|
53
|
+
sourceText: 'Welcome to the evening news.',
|
|
54
|
+
audioAssetId: 'asset-789',
|
|
55
|
+
duration: 2.8,
|
|
56
|
+
wordTimings: [
|
|
57
|
+
{ word: 'Welcome', start: 0.0, end: 0.4 },
|
|
58
|
+
{ word: 'to', start: 0.4, end: 0.5 },
|
|
59
|
+
{ word: 'the', start: 0.5, end: 0.6 },
|
|
60
|
+
{ word: 'evening', start: 0.6, end: 1.0 },
|
|
61
|
+
{ word: 'news', start: 1.0, end: 1.3 },
|
|
62
|
+
],
|
|
63
|
+
});
|
|
64
|
+
// Look up which word is being spoken at a given timestamp
|
|
65
|
+
output.getWordAtTime(0.7); // { word: 'evening', start: 0.6, end: 1.0 }
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## API
|
|
69
|
+
|
|
70
|
+
### Models
|
|
71
|
+
|
|
72
|
+
| Export | Description |
|
|
73
|
+
|--------|------------|
|
|
74
|
+
| `VoiceProfile` | Voice identity with two modes: `designPrompt` (AI-generated) or `sampleAssetId` (cloned) |
|
|
75
|
+
| `VoiceSample` | Audio training data for voice cloning with quality rating |
|
|
76
|
+
| `VoiceOutput` | Generated TTS audio (extends Content) with word-level timing for lip-sync |
|
|
77
|
+
|
|
78
|
+
### Types
|
|
79
|
+
|
|
80
|
+
| Export | Description |
|
|
81
|
+
|--------|------------|
|
|
82
|
+
| `VoiceProfileStatus` | Lifecycle status: `pending`, `processing`, `ready`, `failed` |
|
|
83
|
+
| `VoiceGender` | Gender classification: `male`, `female`, `neutral` |
|
|
84
|
+
| `SampleQuality` | Audio quality rating: `low`, `medium`, `high` |
|
|
85
|
+
| `WordTiming` | Per-word timing entry: `{ word, start, end }` (seconds) |
|
|
86
|
+
| `VoiceOutputMetadata` | Audio metadata (all fields optional): sampleRate, format, channels, bitDepth, fileSize, provider, model, speed, pitch |
|
|
87
|
+
| `VoiceProfileOptions` | Profile creation options |
|
|
88
|
+
| `VoiceSampleOptions` | Sample creation options |
|
|
89
|
+
| `VoiceOutputOptions` | Output creation options |
|
|
90
|
+
|
|
91
|
+
### Key Computed Properties
|
|
92
|
+
|
|
93
|
+
- `VoiceProfile.isCloned` / `isDesigned` -- which creation mode is active
|
|
94
|
+
- `VoiceProfile.isReady` -- status equals `ready`
|
|
95
|
+
- `VoiceSample.meetsMinDuration` -- duration >= 3 seconds
|
|
96
|
+
- `VoiceSample.isSuitableForCloning` -- meets min duration AND quality != low
|
|
97
|
+
- `VoiceOutput.wordCount` / `wordsPerSecond` -- computed from sourceText and duration
|
|
98
|
+
- `VoiceOutput.getWordAtTime(seconds)` -- look up word being spoken at a timestamp
|
|
99
|
+
|
|
100
|
+
## Dependencies
|
|
101
|
+
|
|
102
|
+
- `@happyvertical/smrt-core` -- ORM and code generation
|
|
103
|
+
- `@happyvertical/smrt-assets` -- base asset management
|
|
104
|
+
- `@happyvertical/smrt-config` -- configuration loading
|
|
105
|
+
- `@happyvertical/smrt-content` -- content models (VoiceOutput extends Content)
|
|
106
|
+
- `@happyvertical/smrt-tenancy` -- multi-tenant scoping
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
import { Content } from '@happyvertical/smrt-content';
|
|
2
|
+
import { ContentOptions } from '@happyvertical/smrt-content';
|
|
3
|
+
import { SmrtObject } from '@happyvertical/smrt-core';
|
|
4
|
+
import { SmrtObjectOptions } from '@happyvertical/smrt-core';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Audio sample quality rating
|
|
8
|
+
*/
|
|
9
|
+
export declare type SampleQuality = 'low' | 'medium' | 'high';
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Voice gender classification
|
|
13
|
+
*/
|
|
14
|
+
export declare type VoiceGender = 'male' | 'female' | 'neutral';
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Generated audio output from text-to-speech synthesis
|
|
18
|
+
*
|
|
19
|
+
* VoiceOutput extends Content to represent audio generated from
|
|
20
|
+
* text using a VoiceProfile. It includes word-level timing information
|
|
21
|
+
* for lip-sync alignment in video production.
|
|
22
|
+
*
|
|
23
|
+
* @example
|
|
24
|
+
* ```typescript
|
|
25
|
+
* import { VoiceOutput } from '@happyvertical/smrt-voice';
|
|
26
|
+
*
|
|
27
|
+
* const output = new VoiceOutput({
|
|
28
|
+
* voiceProfileId: 'voice-123',
|
|
29
|
+
* sourceText: 'Welcome to the evening news broadcast.',
|
|
30
|
+
* audioAssetId: 'asset-789',
|
|
31
|
+
* duration: 3.5,
|
|
32
|
+
* wordTimings: [
|
|
33
|
+
* { word: 'Welcome', start: 0.0, end: 0.4 },
|
|
34
|
+
* { word: 'to', start: 0.4, end: 0.5 },
|
|
35
|
+
* { word: 'the', start: 0.5, end: 0.6 },
|
|
36
|
+
* { word: 'evening', start: 0.6, end: 1.0 },
|
|
37
|
+
* { word: 'news', start: 1.0, end: 1.3 },
|
|
38
|
+
* { word: 'broadcast', start: 1.3, end: 1.9 },
|
|
39
|
+
* ],
|
|
40
|
+
* audioMetadata: {
|
|
41
|
+
* sampleRate: 48000,
|
|
42
|
+
* format: 'wav',
|
|
43
|
+
* channels: 1,
|
|
44
|
+
* provider: 'qwen3-tts',
|
|
45
|
+
* },
|
|
46
|
+
* });
|
|
47
|
+
* ```
|
|
48
|
+
*/
|
|
49
|
+
export declare class VoiceOutput extends Content {
|
|
50
|
+
/**
|
|
51
|
+
* Voice profile used for synthesis
|
|
52
|
+
*/
|
|
53
|
+
voiceProfileId: string | null;
|
|
54
|
+
/**
|
|
55
|
+
* Original text that was synthesized
|
|
56
|
+
*/
|
|
57
|
+
sourceText: string;
|
|
58
|
+
/**
|
|
59
|
+
* Asset ID of the generated audio file
|
|
60
|
+
*/
|
|
61
|
+
audioAssetId: string | null;
|
|
62
|
+
/**
|
|
63
|
+
* Duration of the generated audio in seconds
|
|
64
|
+
*/
|
|
65
|
+
duration: number;
|
|
66
|
+
/**
|
|
67
|
+
* Word-level timing information for lip-sync alignment
|
|
68
|
+
*/
|
|
69
|
+
wordTimings: WordTiming[] | null;
|
|
70
|
+
/**
|
|
71
|
+
* Audio metadata (sample rate, format, etc.)
|
|
72
|
+
*/
|
|
73
|
+
audioMetadata: VoiceOutputMetadata;
|
|
74
|
+
constructor(options?: VoiceOutputOptions);
|
|
75
|
+
/**
|
|
76
|
+
* Get the word count of the source text
|
|
77
|
+
*/
|
|
78
|
+
get wordCount(): number;
|
|
79
|
+
/**
|
|
80
|
+
* Get the average speaking rate in words per second
|
|
81
|
+
*/
|
|
82
|
+
get wordsPerSecond(): number;
|
|
83
|
+
/**
|
|
84
|
+
* Check if word timing data is available for lip-sync
|
|
85
|
+
*/
|
|
86
|
+
get hasWordTimings(): boolean;
|
|
87
|
+
/**
|
|
88
|
+
* Get the word at a specific timestamp
|
|
89
|
+
*/
|
|
90
|
+
getWordAtTime(seconds: number): WordTiming | null;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* Voice output metadata
|
|
95
|
+
*/
|
|
96
|
+
export declare interface VoiceOutputMetadata {
|
|
97
|
+
/**
|
|
98
|
+
* Sample rate in Hz
|
|
99
|
+
*/
|
|
100
|
+
sampleRate?: number;
|
|
101
|
+
/**
|
|
102
|
+
* Audio format (e.g., 'wav', 'mp3', 'ogg')
|
|
103
|
+
*/
|
|
104
|
+
format?: string;
|
|
105
|
+
/**
|
|
106
|
+
* Number of audio channels
|
|
107
|
+
*/
|
|
108
|
+
channels?: number;
|
|
109
|
+
/**
|
|
110
|
+
* Bit depth (e.g., 16, 24, 32)
|
|
111
|
+
*/
|
|
112
|
+
bitDepth?: number;
|
|
113
|
+
/**
|
|
114
|
+
* File size in bytes
|
|
115
|
+
*/
|
|
116
|
+
fileSize?: number;
|
|
117
|
+
/**
|
|
118
|
+
* TTS provider used
|
|
119
|
+
*/
|
|
120
|
+
provider?: string;
|
|
121
|
+
/**
|
|
122
|
+
* Model used for synthesis
|
|
123
|
+
*/
|
|
124
|
+
model?: string;
|
|
125
|
+
/**
|
|
126
|
+
* Speech speed used (1.0 = normal)
|
|
127
|
+
*/
|
|
128
|
+
speed?: number;
|
|
129
|
+
/**
|
|
130
|
+
* Pitch adjustment used (semitones)
|
|
131
|
+
*/
|
|
132
|
+
pitch?: number;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Voice output creation options
|
|
137
|
+
*/
|
|
138
|
+
export declare interface VoiceOutputOptions extends ContentOptions {
|
|
139
|
+
/**
|
|
140
|
+
* Voice profile used for synthesis
|
|
141
|
+
*/
|
|
142
|
+
voiceProfileId?: string | null;
|
|
143
|
+
/**
|
|
144
|
+
* Original text that was synthesized
|
|
145
|
+
*/
|
|
146
|
+
sourceText?: string;
|
|
147
|
+
/**
|
|
148
|
+
* Asset ID of the generated audio file
|
|
149
|
+
*/
|
|
150
|
+
audioAssetId?: string | null;
|
|
151
|
+
/**
|
|
152
|
+
* Duration of the generated audio in seconds
|
|
153
|
+
*/
|
|
154
|
+
duration?: number;
|
|
155
|
+
/**
|
|
156
|
+
* Word-level timing information for lip-sync
|
|
157
|
+
*/
|
|
158
|
+
wordTimings?: WordTiming[] | null;
|
|
159
|
+
/**
|
|
160
|
+
* Audio metadata
|
|
161
|
+
*/
|
|
162
|
+
audioMetadata?: VoiceOutputMetadata;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* Voice profile for AI-powered speech synthesis
|
|
167
|
+
*
|
|
168
|
+
* VoiceProfile represents a configured voice identity that can be used
|
|
169
|
+
* for text-to-speech synthesis. Voices can be created through:
|
|
170
|
+
* - Voice design: Natural language description of desired voice characteristics
|
|
171
|
+
* - Voice cloning: 3+ second audio sample for voice replication
|
|
172
|
+
*
|
|
173
|
+
* @example
|
|
174
|
+
* ```typescript
|
|
175
|
+
* import { VoiceProfile } from '@happyvertical/smrt-voice';
|
|
176
|
+
*
|
|
177
|
+
* // Create a designed voice
|
|
178
|
+
* const anchorVoice = new VoiceProfile({
|
|
179
|
+
* name: 'News Anchor',
|
|
180
|
+
* description: 'Professional news anchor voice with clear enunciation',
|
|
181
|
+
* language: 'en-US',
|
|
182
|
+
* gender: 'male',
|
|
183
|
+
* designPrompt: 'Warm, authoritative male voice with slight gravitas, suitable for news broadcasts',
|
|
184
|
+
* provider: 'qwen3-tts',
|
|
185
|
+
* });
|
|
186
|
+
*
|
|
187
|
+
* // Create a cloned voice
|
|
188
|
+
* const clonedVoice = new VoiceProfile({
|
|
189
|
+
* name: 'Custom Voice',
|
|
190
|
+
* description: 'Cloned from user sample',
|
|
191
|
+
* language: 'en-US',
|
|
192
|
+
* sampleAssetId: 'asset-123',
|
|
193
|
+
* provider: 'qwen3-tts',
|
|
194
|
+
* });
|
|
195
|
+
* ```
|
|
196
|
+
*/
|
|
197
|
+
export declare class VoiceProfile extends SmrtObject {
|
|
198
|
+
/**
|
|
199
|
+
* Tenant ID for multi-tenant isolation
|
|
200
|
+
* Nullable to support global/default voices
|
|
201
|
+
*/
|
|
202
|
+
tenantId: string | null;
|
|
203
|
+
/**
|
|
204
|
+
* Human-readable name for the voice profile
|
|
205
|
+
*/
|
|
206
|
+
name: string;
|
|
207
|
+
/**
|
|
208
|
+
* Description of the voice characteristics
|
|
209
|
+
*/
|
|
210
|
+
description: string | null;
|
|
211
|
+
/**
|
|
212
|
+
* BCP 47 language tag (e.g., 'en-US', 'zh-CN')
|
|
213
|
+
*/
|
|
214
|
+
language: string;
|
|
215
|
+
/**
|
|
216
|
+
* Voice gender classification
|
|
217
|
+
*/
|
|
218
|
+
gender: VoiceGender;
|
|
219
|
+
/**
|
|
220
|
+
* Natural language description for voice design
|
|
221
|
+
* Used when creating a voice from scratch via AI
|
|
222
|
+
*/
|
|
223
|
+
designPrompt: string | null;
|
|
224
|
+
/**
|
|
225
|
+
* Asset ID of the audio sample for voice cloning
|
|
226
|
+
* Should be at least 3 seconds of clear speech
|
|
227
|
+
*/
|
|
228
|
+
sampleAssetId: string | null;
|
|
229
|
+
/**
|
|
230
|
+
* Provider-specific voice data (ID, embedding, etc.)
|
|
231
|
+
* Stored after voice creation/cloning is complete
|
|
232
|
+
*/
|
|
233
|
+
voiceData: Record<string, any> | null;
|
|
234
|
+
/**
|
|
235
|
+
* Default speech speed multiplier (0.5 - 2.0)
|
|
236
|
+
* 1.0 = normal speed
|
|
237
|
+
*/
|
|
238
|
+
defaultSpeed: number;
|
|
239
|
+
/**
|
|
240
|
+
* Default pitch adjustment in semitones (-20 to 20)
|
|
241
|
+
* 0 = no adjustment
|
|
242
|
+
*/
|
|
243
|
+
defaultPitch: number;
|
|
244
|
+
/**
|
|
245
|
+
* Voice profile status
|
|
246
|
+
* - pending: Profile created but voice not yet generated
|
|
247
|
+
* - processing: Voice generation/cloning in progress
|
|
248
|
+
* - ready: Voice is ready for use
|
|
249
|
+
* - failed: Voice generation failed
|
|
250
|
+
*/
|
|
251
|
+
status: VoiceProfileStatus;
|
|
252
|
+
/**
|
|
253
|
+
* TTS provider that created/manages this voice
|
|
254
|
+
*/
|
|
255
|
+
provider: string;
|
|
256
|
+
/**
|
|
257
|
+
* Error message if status is 'failed'
|
|
258
|
+
*/
|
|
259
|
+
errorMessage: string | null;
|
|
260
|
+
constructor(options?: VoiceProfileOptions);
|
|
261
|
+
/**
|
|
262
|
+
* Check if this voice profile uses voice cloning
|
|
263
|
+
*/
|
|
264
|
+
get isCloned(): boolean;
|
|
265
|
+
/**
|
|
266
|
+
* Check if this voice profile uses voice design
|
|
267
|
+
*/
|
|
268
|
+
get isDesigned(): boolean;
|
|
269
|
+
/**
|
|
270
|
+
* Check if the voice is ready for use
|
|
271
|
+
*/
|
|
272
|
+
get isReady(): boolean;
|
|
273
|
+
/**
|
|
274
|
+
* Check if this is a global (default) voice
|
|
275
|
+
*/
|
|
276
|
+
get isGlobal(): boolean;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
/**
|
|
280
|
+
* Voice profile creation options
|
|
281
|
+
*/
|
|
282
|
+
export declare interface VoiceProfileOptions extends SmrtObjectOptions {
|
|
283
|
+
/**
|
|
284
|
+
* Human-readable name for the voice profile
|
|
285
|
+
*/
|
|
286
|
+
name?: string;
|
|
287
|
+
/**
|
|
288
|
+
* Description of the voice characteristics
|
|
289
|
+
*/
|
|
290
|
+
description?: string | null;
|
|
291
|
+
/**
|
|
292
|
+
* BCP 47 language tag (e.g., 'en-US', 'zh-CN')
|
|
293
|
+
*/
|
|
294
|
+
language?: string;
|
|
295
|
+
/**
|
|
296
|
+
* Voice gender classification
|
|
297
|
+
* @default 'neutral'
|
|
298
|
+
*/
|
|
299
|
+
gender?: VoiceGender;
|
|
300
|
+
/**
|
|
301
|
+
* Natural language description for voice design
|
|
302
|
+
* Used when creating a voice from scratch
|
|
303
|
+
*/
|
|
304
|
+
designPrompt?: string | null;
|
|
305
|
+
/**
|
|
306
|
+
* Asset ID of the audio sample for voice cloning
|
|
307
|
+
* Should be at least 3 seconds of clear speech
|
|
308
|
+
*/
|
|
309
|
+
sampleAssetId?: string | null;
|
|
310
|
+
/**
|
|
311
|
+
* Provider-specific voice data (ID, embedding, etc.)
|
|
312
|
+
* Stored after voice creation/cloning
|
|
313
|
+
*/
|
|
314
|
+
voiceData?: Record<string, any> | null;
|
|
315
|
+
/**
|
|
316
|
+
* Default speech speed multiplier
|
|
317
|
+
* @default 1.0
|
|
318
|
+
*/
|
|
319
|
+
defaultSpeed?: number;
|
|
320
|
+
/**
|
|
321
|
+
* Default pitch adjustment in semitones
|
|
322
|
+
* @default 0
|
|
323
|
+
*/
|
|
324
|
+
defaultPitch?: number;
|
|
325
|
+
/**
|
|
326
|
+
* Voice profile status
|
|
327
|
+
* @default 'pending'
|
|
328
|
+
*/
|
|
329
|
+
status?: VoiceProfileStatus;
|
|
330
|
+
/**
|
|
331
|
+
* TTS provider that created this voice
|
|
332
|
+
* @default 'qwen3-tts'
|
|
333
|
+
*/
|
|
334
|
+
provider?: string;
|
|
335
|
+
/**
|
|
336
|
+
* Error message if status is 'failed'
|
|
337
|
+
*/
|
|
338
|
+
errorMessage?: string | null;
|
|
339
|
+
/**
|
|
340
|
+
* Tenant ID for multi-tenant isolation
|
|
341
|
+
* Null for global/default voices
|
|
342
|
+
*/
|
|
343
|
+
tenantId?: string | null;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
/**
|
|
347
|
+
* Voice profile status
|
|
348
|
+
*/
|
|
349
|
+
export declare type VoiceProfileStatus = 'pending' | 'processing' | 'ready' | 'failed';
|
|
350
|
+
|
|
351
|
+
/**
|
|
352
|
+
* Audio sample for voice cloning
|
|
353
|
+
*
|
|
354
|
+
* VoiceSample represents an audio recording used as source material
|
|
355
|
+
* for voice cloning. For best results, samples should be:
|
|
356
|
+
* - At least 3 seconds long
|
|
357
|
+
* - Clear speech without background noise
|
|
358
|
+
* - Single speaker only
|
|
359
|
+
* - High quality (sample rate of 44.1 kHz or higher)
|
|
360
|
+
*
|
|
361
|
+
* Multiple samples can be associated with a single VoiceProfile
|
|
362
|
+
* to improve voice cloning quality.
|
|
363
|
+
*
|
|
364
|
+
* @example
|
|
365
|
+
* ```typescript
|
|
366
|
+
* import { VoiceSample } from '@happyvertical/smrt-voice';
|
|
367
|
+
*
|
|
368
|
+
* const sample = new VoiceSample({
|
|
369
|
+
* voiceProfileId: 'voice-123',
|
|
370
|
+
* assetId: 'asset-456',
|
|
371
|
+
* duration: 5.2,
|
|
372
|
+
* transcription: 'Hello, this is a test recording for voice cloning.',
|
|
373
|
+
* quality: 'high',
|
|
374
|
+
* sampleRate: 48000,
|
|
375
|
+
* channels: 1,
|
|
376
|
+
* format: 'wav',
|
|
377
|
+
* isPrimary: true,
|
|
378
|
+
* });
|
|
379
|
+
* ```
|
|
380
|
+
*/
|
|
381
|
+
export declare class VoiceSample extends SmrtObject {
|
|
382
|
+
/**
|
|
383
|
+
* Tenant ID for multi-tenant isolation
|
|
384
|
+
*/
|
|
385
|
+
tenantId: string | null;
|
|
386
|
+
/**
|
|
387
|
+
* Voice profile this sample belongs to
|
|
388
|
+
*/
|
|
389
|
+
voiceProfileId: string | null;
|
|
390
|
+
/**
|
|
391
|
+
* Asset ID of the audio file
|
|
392
|
+
* References an Asset in smrt-assets
|
|
393
|
+
*/
|
|
394
|
+
assetId: string | null;
|
|
395
|
+
/**
|
|
396
|
+
* Sample duration in seconds
|
|
397
|
+
*/
|
|
398
|
+
duration: number;
|
|
399
|
+
/**
|
|
400
|
+
* Transcription of what was said in the sample
|
|
401
|
+
* Used for alignment and quality verification
|
|
402
|
+
*/
|
|
403
|
+
transcription: string | null;
|
|
404
|
+
/**
|
|
405
|
+
* Quality rating based on audio analysis
|
|
406
|
+
* - low: Noisy or short samples
|
|
407
|
+
* - medium: Acceptable quality
|
|
408
|
+
* - high: Clear audio, good length
|
|
409
|
+
*/
|
|
410
|
+
quality: SampleQuality;
|
|
411
|
+
/**
|
|
412
|
+
* Sample rate in Hz
|
|
413
|
+
*/
|
|
414
|
+
sampleRate: number | null;
|
|
415
|
+
/**
|
|
416
|
+
* Number of audio channels
|
|
417
|
+
*/
|
|
418
|
+
channels: number | null;
|
|
419
|
+
/**
|
|
420
|
+
* Audio format
|
|
421
|
+
*/
|
|
422
|
+
format: string | null;
|
|
423
|
+
/**
|
|
424
|
+
* Whether this is the primary sample for the voice profile
|
|
425
|
+
*/
|
|
426
|
+
isPrimary: boolean;
|
|
427
|
+
constructor(options?: VoiceSampleOptions);
|
|
428
|
+
/**
|
|
429
|
+
* Check if sample meets minimum duration requirement (3 seconds)
|
|
430
|
+
*/
|
|
431
|
+
get meetsMinDuration(): boolean;
|
|
432
|
+
/**
|
|
433
|
+
* Check if sample is suitable for cloning (meets the minimum duration and quality is not 'low')
|
|
434
|
+
*/
|
|
435
|
+
get isSuitableForCloning(): boolean;
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
/**
|
|
439
|
+
* Voice sample creation options
|
|
440
|
+
*/
|
|
441
|
+
export declare interface VoiceSampleOptions extends SmrtObjectOptions {
|
|
442
|
+
/**
|
|
443
|
+
* Voice profile this sample belongs to
|
|
444
|
+
*/
|
|
445
|
+
voiceProfileId?: string | null;
|
|
446
|
+
/**
|
|
447
|
+
* Asset ID of the audio file
|
|
448
|
+
*/
|
|
449
|
+
assetId?: string | null;
|
|
450
|
+
/**
|
|
451
|
+
* Sample duration in seconds
|
|
452
|
+
*/
|
|
453
|
+
duration?: number;
|
|
454
|
+
/**
|
|
455
|
+
* Transcription of what was said in the sample
|
|
456
|
+
*/
|
|
457
|
+
transcription?: string | null;
|
|
458
|
+
/**
|
|
459
|
+
* Quality rating based on audio analysis
|
|
460
|
+
* @default 'medium'
|
|
461
|
+
*/
|
|
462
|
+
quality?: SampleQuality;
|
|
463
|
+
/**
|
|
464
|
+
* Sample rate in Hz (e.g., 44100, 48000)
|
|
465
|
+
*/
|
|
466
|
+
sampleRate?: number | null;
|
|
467
|
+
/**
|
|
468
|
+
* Number of audio channels (1 = mono, 2 = stereo)
|
|
469
|
+
*/
|
|
470
|
+
channels?: number | null;
|
|
471
|
+
/**
|
|
472
|
+
* Audio format (e.g., 'wav', 'mp3', 'ogg')
|
|
473
|
+
*/
|
|
474
|
+
format?: string | null;
|
|
475
|
+
/**
|
|
476
|
+
* Whether this is the primary sample for the voice profile
|
|
477
|
+
* @default false
|
|
478
|
+
*/
|
|
479
|
+
isPrimary?: boolean;
|
|
480
|
+
/**
|
|
481
|
+
* Tenant ID for multi-tenant isolation
|
|
482
|
+
*/
|
|
483
|
+
tenantId?: string | null;
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
/**
|
|
487
|
+
* Word timing information for lip-sync alignment
|
|
488
|
+
*/
|
|
489
|
+
export declare interface WordTiming {
|
|
490
|
+
/**
|
|
491
|
+
* The word
|
|
492
|
+
*/
|
|
493
|
+
word: string;
|
|
494
|
+
/**
|
|
495
|
+
* Start time in seconds
|
|
496
|
+
*/
|
|
497
|
+
start: number;
|
|
498
|
+
/**
|
|
499
|
+
* End time in seconds
|
|
500
|
+
*/
|
|
501
|
+
end: number;
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
export { }
|