@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { VoiceSynthesizer } from './voice-synthesizer';
|
|
2
|
+
export { PaceAnalyzer } from './pace-analyzer';
|
|
3
|
+
export { EmphasisPlanner } from './emphasis-planner';
|
|
4
|
+
export { SegmentSynthesizer } from './segment-synthesizer';
|
|
5
|
+
export { AudioAssembler } from './audio-assembler';
|
|
6
|
+
export type { TTSBrief, TTSVoiceConfig, PacingConfig, TTSProject, TTSScript, TTSScriptSegment, SynthesizedSegment, TTSOptions, VideoTTSBrief, } from './types';
|
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/audio/audio-concatenator.ts
|
|
3
|
+
class AudioConcatenator {
|
|
4
|
+
concatenate(segments) {
|
|
5
|
+
if (segments.length === 0) {
|
|
6
|
+
return {
|
|
7
|
+
data: new Uint8Array(0),
|
|
8
|
+
format: "wav",
|
|
9
|
+
sampleRateHz: 44100,
|
|
10
|
+
durationMs: 0,
|
|
11
|
+
channels: 1
|
|
12
|
+
};
|
|
13
|
+
}
|
|
14
|
+
const [firstSegment] = segments;
|
|
15
|
+
if (!firstSegment) {
|
|
16
|
+
return {
|
|
17
|
+
data: new Uint8Array(0),
|
|
18
|
+
format: "wav",
|
|
19
|
+
sampleRateHz: 44100,
|
|
20
|
+
durationMs: 0,
|
|
21
|
+
channels: 1
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
if (segments.length === 1) {
|
|
25
|
+
return { ...firstSegment };
|
|
26
|
+
}
|
|
27
|
+
const referenceFormat = firstSegment.format;
|
|
28
|
+
const referenceSampleRate = firstSegment.sampleRateHz;
|
|
29
|
+
const referenceChannels = firstSegment.channels ?? 1;
|
|
30
|
+
for (const seg of segments) {
|
|
31
|
+
if (seg.format !== referenceFormat) {
|
|
32
|
+
throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
|
|
33
|
+
}
|
|
34
|
+
if (seg.sampleRateHz !== referenceSampleRate) {
|
|
35
|
+
throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
const totalBytes = segments.reduce((sum, s) => sum + s.data.length, 0);
|
|
39
|
+
const combined = new Uint8Array(totalBytes);
|
|
40
|
+
let offset = 0;
|
|
41
|
+
for (const seg of segments) {
|
|
42
|
+
combined.set(seg.data, offset);
|
|
43
|
+
offset += seg.data.length;
|
|
44
|
+
}
|
|
45
|
+
const totalDurationMs = segments.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
|
|
46
|
+
return {
|
|
47
|
+
data: combined,
|
|
48
|
+
format: referenceFormat,
|
|
49
|
+
sampleRateHz: referenceSampleRate,
|
|
50
|
+
durationMs: totalDurationMs,
|
|
51
|
+
channels: referenceChannels
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// src/audio/duration-estimator.ts
|
|
57
|
+
/**
 * Converts between narration text length and speaking time using a
 * words-per-minute rate.
 */
class DurationEstimator {
  /** Speaking rate used when the caller gives none (or an unusable one). */
  static DEFAULT_WPM = 150;

  /**
   * Pick the speaking rate to use for an estimate.
   *
   * A rate that is missing, non-finite, zero or negative would turn the
   * estimates into Infinity, NaN or negative numbers, so those fall back to
   * `DEFAULT_WPM`.
   *
   * @param wordsPerMinute - Caller-supplied rate, possibly null/undefined.
   * @returns A finite, strictly positive words-per-minute value.
   */
  static resolveWordsPerMinute(wordsPerMinute) {
    const candidate = Number(wordsPerMinute ?? DurationEstimator.DEFAULT_WPM);
    return Number.isFinite(candidate) && candidate > 0 ? candidate : DurationEstimator.DEFAULT_WPM;
  }

  /**
   * Estimate how long `text` takes to speak, in whole seconds (rounded up).
   *
   * @param text - Narration text; words are whitespace-separated runs.
   * @param wordsPerMinute - Optional speaking rate; invalid values use the default.
   * @returns Estimated duration in seconds (0 for text without words).
   */
  estimateSeconds(text, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWordsPerMinute(wordsPerMinute);
    const wordCount = text.split(/\s+/).filter(Boolean).length;
    return Math.ceil(wordCount / wpm * 60);
  }

  /**
   * Estimate how long `text` takes to speak, in whole milliseconds (rounded up).
   *
   * @param text - Narration text; words are whitespace-separated runs.
   * @param wordsPerMinute - Optional speaking rate; invalid values use the default.
   * @returns Estimated duration in milliseconds.
   */
  estimateMs(text, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWordsPerMinute(wordsPerMinute);
    const wordCount = text.split(/\s+/).filter(Boolean).length;
    return Math.ceil(wordCount / wpm * 60 * 1000);
  }

  /**
   * Estimate how many words fit into a given speaking time.
   *
   * @param durationSeconds - Available time in seconds.
   * @param wordsPerMinute - Optional speaking rate; invalid values use the default.
   * @returns Word count rounded to the nearest integer.
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWordsPerMinute(wordsPerMinute);
    return Math.round(durationSeconds / 60 * wpm);
  }
}
|
|
74
|
+
|
|
75
|
+
// src/audio/silence-generator.ts
|
|
76
|
+
class SilenceGenerator {
|
|
77
|
+
generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1) {
|
|
78
|
+
const totalSamples = Math.ceil(sampleRateHz * durationMs / 1000);
|
|
79
|
+
const bytesPerSample = 2;
|
|
80
|
+
const dataSize = totalSamples * bytesPerSample * channels;
|
|
81
|
+
const data = new Uint8Array(dataSize);
|
|
82
|
+
return {
|
|
83
|
+
data,
|
|
84
|
+
format,
|
|
85
|
+
sampleRateHz,
|
|
86
|
+
durationMs,
|
|
87
|
+
channels
|
|
88
|
+
};
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// src/tts/pace-analyzer.ts
|
|
93
|
+
var CONTENT_TYPE_PACING = {
|
|
94
|
+
intro: {
|
|
95
|
+
rate: 0.95,
|
|
96
|
+
emphasis: "normal",
|
|
97
|
+
tone: "authoritative",
|
|
98
|
+
leadingSilenceMs: 0,
|
|
99
|
+
trailingSilenceMs: 500
|
|
100
|
+
},
|
|
101
|
+
problem: {
|
|
102
|
+
rate: 0.9,
|
|
103
|
+
emphasis: "strong",
|
|
104
|
+
tone: "urgent",
|
|
105
|
+
leadingSilenceMs: 300,
|
|
106
|
+
trailingSilenceMs: 500
|
|
107
|
+
},
|
|
108
|
+
solution: {
|
|
109
|
+
rate: 1,
|
|
110
|
+
emphasis: "normal",
|
|
111
|
+
tone: "calm",
|
|
112
|
+
leadingSilenceMs: 300,
|
|
113
|
+
trailingSilenceMs: 500
|
|
114
|
+
},
|
|
115
|
+
metric: {
|
|
116
|
+
rate: 0.85,
|
|
117
|
+
emphasis: "strong",
|
|
118
|
+
tone: "excited",
|
|
119
|
+
leadingSilenceMs: 300,
|
|
120
|
+
trailingSilenceMs: 600
|
|
121
|
+
},
|
|
122
|
+
cta: {
|
|
123
|
+
rate: 0.9,
|
|
124
|
+
emphasis: "strong",
|
|
125
|
+
tone: "authoritative",
|
|
126
|
+
leadingSilenceMs: 400,
|
|
127
|
+
trailingSilenceMs: 0
|
|
128
|
+
},
|
|
129
|
+
transition: {
|
|
130
|
+
rate: 1.1,
|
|
131
|
+
emphasis: "reduced",
|
|
132
|
+
tone: "neutral",
|
|
133
|
+
leadingSilenceMs: 200,
|
|
134
|
+
trailingSilenceMs: 300
|
|
135
|
+
}
|
|
136
|
+
};
|
|
137
|
+
|
|
138
|
+
class PaceAnalyzer {
|
|
139
|
+
analyze(segments, baseRate = 1) {
|
|
140
|
+
return segments.map((segment) => {
|
|
141
|
+
const defaults = CONTENT_TYPE_PACING[segment.contentType];
|
|
142
|
+
return {
|
|
143
|
+
sceneId: segment.sceneId,
|
|
144
|
+
rate: defaults.rate * baseRate,
|
|
145
|
+
emphasis: defaults.emphasis,
|
|
146
|
+
tone: defaults.tone,
|
|
147
|
+
leadingSilenceMs: defaults.leadingSilenceMs,
|
|
148
|
+
trailingSilenceMs: defaults.trailingSilenceMs
|
|
149
|
+
};
|
|
150
|
+
});
|
|
151
|
+
}
|
|
152
|
+
getDefaults(contentType) {
|
|
153
|
+
return { ...CONTENT_TYPE_PACING[contentType] };
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// src/tts/emphasis-planner.ts
|
|
158
|
+
/**
 * Plans per-segment pacing directives, optionally with help from an LLM.
 *
 * Without an LLM (or when the LLM call fails) the deterministic
 * `PaceAnalyzer` presets are used.
 */
class EmphasisPlanner {
  llm;
  model;
  paceAnalyzer;

  /**
   * @param options - Optional `{ llm, model }`; `llm` must expose
   *   `chat(messages, options)` when provided.
   */
  constructor(options) {
    this.llm = options?.llm;
    this.model = options?.model;
    this.paceAnalyzer = new PaceAnalyzer;
  }

  /**
   * Produce pacing directives for `segments`.
   *
   * @param segments - Script segments (`sceneId`, `text`, `contentType`).
   * @param baseRate - Multiplier applied to every rate (default 1).
   * @returns One directive per segment.
   */
  async plan(segments, baseRate = 1) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    try {
      return await this.planWithLlm(segments, baseRate);
    } catch {
      // Best effort: any LLM or parsing failure degrades to the presets.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
  }

  /**
   * Ask the LLM for directives and sanitize what comes back.
   *
   * The model's answer is untrusted text, so each field is validated:
   * `rate` must be a finite number and is clamped to the 0.7-1.3 range the
   * prompt asks for; `emphasis` and `tone` must be one of the prompted
   * values; silences must be finite and non-negative. Anything missing or
   * invalid, including a scene the model skipped, is filled from the
   * content-type preset. Directives for unknown scene ids are dropped.
   *
   * @param segments - Script segments to plan for.
   * @param baseRate - Multiplier applied to every rate.
   * @returns One directive per segment, in segment order.
   * @throws When the LLM call rejects or its text is not valid JSON
   *   (`plan` catches this and falls back to the presets).
   */
  async planWithLlm(segments, baseRate) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const systemPrompt = [
      "You are a voice director planning emphasis and pacing for TTS narration.",
      "For each segment, return a JSON array of directives.",
      "Each directive has: sceneId, rate (0.7-1.3), emphasis (reduced|normal|strong),",
      "tone (neutral|urgent|excited|calm|authoritative), leadingSilenceMs, trailingSilenceMs.",
      "Return ONLY a JSON array, no other text."
    ].join("\n");
    const userPayload = JSON.stringify(segments.map((s) => ({
      sceneId: s.sceneId,
      text: s.text,
      contentType: s.contentType
    })));
    const response = await this.llm.chat([
      { role: "system", content: [{ type: "text", text: systemPrompt }] },
      { role: "user", content: [{ type: "text", text: userPayload }] }
    ], { model: this.model, temperature: 0.3, responseFormat: "json" });
    const textPart = response.message.content.find((p) => p.type === "text");
    if (!textPart || typeof textPart.text !== "string") {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const parsed = JSON.parse(textPart.text);
    if (!Array.isArray(parsed)) {
      // The prompt demands an array; anything else is unusable.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }

    const MIN_RATE = 0.7;
    const MAX_RATE = 1.3;
    const EMPHASES = ["reduced", "normal", "strong"];
    const TONES = ["neutral", "urgent", "excited", "calm", "authoritative"];
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    const isSilence = (value) => isFiniteNumber(value) && value >= 0;

    // Index the suggestions by scene; a later duplicate overrides an earlier one.
    const suggestions = new Map();
    for (const entry of parsed) {
      if (entry !== null && typeof entry === "object" && typeof entry.sceneId === "string") {
        suggestions.set(entry.sceneId, entry);
      }
    }

    return segments.map((segment) => {
      const preset = this.paceAnalyzer.getDefaults(segment.contentType);
      const suggestion = suggestions.get(segment.sceneId) ?? {};
      const rate = isFiniteNumber(suggestion.rate)
        ? Math.min(MAX_RATE, Math.max(MIN_RATE, suggestion.rate))
        : (preset.rate ?? 1);
      return {
        sceneId: segment.sceneId,
        rate: rate * baseRate,
        emphasis: EMPHASES.includes(suggestion.emphasis) ? suggestion.emphasis : preset.emphasis,
        tone: TONES.includes(suggestion.tone) ? suggestion.tone : preset.tone,
        leadingSilenceMs: isSilence(suggestion.leadingSilenceMs) ? suggestion.leadingSilenceMs : preset.leadingSilenceMs,
        trailingSilenceMs: isSilence(suggestion.trailingSilenceMs) ? suggestion.trailingSilenceMs : preset.trailingSilenceMs
      };
    });
  }
}
|
|
223
|
+
|
|
224
|
+
// src/tts/segment-synthesizer.ts
|
|
225
|
+
class SegmentSynthesizer {
|
|
226
|
+
tts;
|
|
227
|
+
constructor(tts) {
|
|
228
|
+
this.tts = tts;
|
|
229
|
+
}
|
|
230
|
+
async synthesizeAll(segments, voice, directives) {
|
|
231
|
+
const directiveMap = new Map(directives.map((d) => [d.sceneId, d]));
|
|
232
|
+
const results = await Promise.all(segments.map((segment) => this.synthesizeOne(segment, voice, directiveMap.get(segment.sceneId))));
|
|
233
|
+
return results;
|
|
234
|
+
}
|
|
235
|
+
async synthesizeOne(segment, voice, directive) {
|
|
236
|
+
const result = await this.tts.synthesize({
|
|
237
|
+
text: segment.text,
|
|
238
|
+
voiceId: voice.voiceId,
|
|
239
|
+
language: voice.language,
|
|
240
|
+
style: voice.style,
|
|
241
|
+
stability: voice.stability,
|
|
242
|
+
rate: directive?.rate,
|
|
243
|
+
emphasis: directive?.emphasis
|
|
244
|
+
});
|
|
245
|
+
return {
|
|
246
|
+
sceneId: segment.sceneId,
|
|
247
|
+
audio: result.audio,
|
|
248
|
+
durationMs: result.audio.durationMs ?? 0,
|
|
249
|
+
wordTimings: result.wordTimings?.map((wt) => ({
|
|
250
|
+
word: wt.word,
|
|
251
|
+
startMs: wt.startMs,
|
|
252
|
+
endMs: wt.endMs
|
|
253
|
+
}))
|
|
254
|
+
};
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
// src/tts/audio-assembler.ts
|
|
259
|
+
/**
 * Stitches synthesized segments into one audio track, inserting silence
 * around them as the pacing directives dictate.
 */
class AudioAssembler {
  concatenator = new AudioConcatenator;
  silenceGenerator = new SilenceGenerator;

  /**
   * Build the final track.
   *
   * For each segment, in order: an optional leading silence, the segment's
   * audio, then a trailing silence. The trailing length comes from the
   * segment's directive; without one, `defaultPauseMs` is used between
   * segments and nothing after the last. Silences copy the format, sample
   * rate and channel count (1 when unset) of the first segment's audio.
   *
   * @param segments - Synthesized segments (`sceneId`, `audio`).
   * @param directives - Pacing directives, matched by `sceneId`.
   * @param defaultPauseMs - Pause between segments lacking a directive value (default 500).
   * @returns The concatenated audio; an empty mono 44100 Hz "wav"
   *   placeholder when there are no segments.
   */
  assemble(segments, directives, defaultPauseMs = 500) {
    const head = segments.length === 0 ? undefined : segments[0];
    if (!head) {
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }
    const bySceneId = new Map(directives.map((directive) => [directive.sceneId, directive]));
    const reference = head.audio;
    // Reads the reference lazily, only when a pause is actually needed.
    const makeSilence = (ms) => this.silenceGenerator.generate(ms, reference.format, reference.sampleRateHz, reference.channels ?? 1);
    const lastIndex = segments.length - 1;
    const pieces = [];
    segments.forEach((segment, index) => {
      if (!segment) {
        return;
      }
      const directive = bySceneId.get(segment.sceneId);
      const leadMs = directive?.leadingSilenceMs ?? 0;
      if (leadMs > 0) {
        pieces.push(makeSilence(leadMs));
      }
      pieces.push(segment.audio);
      const fallbackTrailMs = index < lastIndex ? defaultPauseMs : 0;
      const trailMs = directive?.trailingSilenceMs ?? fallbackTrailMs;
      if (trailMs > 0) {
        pieces.push(makeSilence(trailMs));
      }
    });
    return this.concatenator.concatenate(pieces);
  }
}
|
|
304
|
+
|
|
305
|
+
// src/tts/voice-synthesizer.ts
|
|
306
|
+
/**
 * Orchestrates the text-to-speech pipeline: builds a narration script from a
 * brief, plans pacing, synthesizes every segment, assembles the audio and
 * derives a frame-based timing map.
 */
class VoiceSynthesizer {
  segmentSynthesizer;
  emphasisPlanner;
  audioAssembler = new AudioAssembler;
  durationEstimator = new DurationEstimator;
  // Not used by the methods below; retained so instances expose the same fields as before.
  paceAnalyzer = new PaceAnalyzer;
  options;

  /**
   * @param options - Must provide `tts`. `llm` / `model` are forwarded to
   *   the emphasis planner. `fps`, `defaultPacing` and `defaultVoiceId` are
   *   fallbacks for values a brief leaves out.
   */
  constructor(options) {
    this.options = options;
    this.segmentSynthesizer = new SegmentSynthesizer(options.tts);
    this.emphasisPlanner = new EmphasisPlanner({
      llm: options.llm,
      model: options.model
    });
  }

  /**
   * Synthesize narration for a standalone content brief.
   *
   * @param brief - Provides `content`, `voice` and optional `pacing`.
   * @returns The project: script, directives, segments, assembled audio, timing map.
   */
  async synthesize(brief) {
    const script = this.buildScript(brief);
    return this.executePipeline(script, brief.voice, brief.pacing);
  }

  /**
   * Synthesize narration for the scenes of a video brief.
   *
   * @param brief - Provides `scenePlan`, `voice`, optional `pacing`, and `fps`.
   * @returns The project: script, directives, segments, assembled audio, timing map.
   */
  async synthesizeForVideo(brief) {
    const script = this.buildScriptFromScenePlan(brief);
    return this.executePipeline(script, brief.voice, brief.pacing, brief.fps);
  }

  /**
   * Run pacing, synthesis, assembly and timing for a prepared script.
   *
   * Each pacing value resolves as: the brief's value, then
   * `options.defaultPacing`, then the built-in default (rate 1, pause
   * 500 ms, breathing room 1.15). The frame rate resolves the same way
   * through `options.fps` with 30 as the final default.
   *
   * @param script - Script with `segments`.
   * @param voice - Voice settings from the brief.
   * @param pacing - Optional pacing settings from the brief.
   * @param fps - Optional frames per second from the brief.
   */
  async executePipeline(script, voice, pacing, fps) {
    const projectId = generateProjectId();
    const pacingDefaults = this.options.defaultPacing;
    const baseRate = pacing?.baseRate ?? pacingDefaults?.baseRate ?? 1;
    const effectiveVoice = this.resolveVoice(voice);
    const pacingDirectives = await this.emphasisPlanner.plan(script.segments, baseRate);
    const synthesized = await this.segmentSynthesizer.synthesizeAll(script.segments, effectiveVoice, pacingDirectives);
    const pauseMs = pacing?.segmentPauseMs ?? pacingDefaults?.segmentPauseMs ?? 500;
    const assembledAudio = this.audioAssembler.assemble(synthesized, pacingDirectives, pauseMs);
    const effectiveFps = fps ?? this.options.fps ?? 30;
    const breathingRoomFactor = pacing?.breathingRoomFactor ?? pacingDefaults?.breathingRoomFactor ?? 1.15;
    const timingMap = this.buildTimingMap(synthesized, effectiveFps, breathingRoomFactor);
    return {
      id: projectId,
      script,
      pacingDirectives,
      segments: synthesized,
      assembledAudio,
      timingMap
    };
  }

  /**
   * Apply `options.defaultVoiceId` when the brief's voice has no id.
   *
   * @param voice - Voice settings from the brief.
   * @returns `voice` untouched when it already names a voice or no default
   *   is configured; otherwise a copy carrying the default id.
   */
  resolveVoice(voice) {
    const fallbackVoiceId = this.options.defaultVoiceId;
    if (voice?.voiceId || fallbackVoiceId === undefined) {
      return voice;
    }
    return { ...voice, voiceId: fallbackVoiceId };
  }

  /**
   * Build a script from a content brief: an intro (title + summary), then
   * problems, solutions, metrics and call-to-action when present. List
   * sections are joined with ". " into one segment each.
   *
   * @param brief - Brief whose `content` supplies the text.
   * @returns `{ fullText, segments, estimatedDurationSeconds }`.
   */
  buildScript(brief) {
    const content = brief.content;
    const segments = [];
    const addSegment = (sceneId, text, contentType) => {
      segments.push({
        sceneId,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType
      });
    };
    addSegment("intro", `${content.title}. ${content.summary}`, "intro");
    if (content.problems.length > 0) {
      addSegment("problems", content.problems.join(". "), "problem");
    }
    if (content.solutions.length > 0) {
      addSegment("solutions", content.solutions.join(". "), "solution");
    }
    if (content.metrics && content.metrics.length > 0) {
      addSegment("metrics", content.metrics.join(". "), "metric");
    }
    if (content.callToAction) {
      addSegment("cta", content.callToAction, "cta");
    }
    return this.toScript(segments);
  }

  /**
   * Build a script with one segment per scene that has narration text.
   *
   * @param brief - Video brief whose `scenePlan.scenes` supply the text.
   * @returns `{ fullText, segments, estimatedDurationSeconds }`.
   */
  buildScriptFromScenePlan(brief) {
    const segments = brief.scenePlan.scenes.filter((scene) => scene.narrationText).map((scene) => {
      const text = scene.narrationText ?? "";
      return {
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        // NOTE(review): every scene is tagged "intro", so all scenes get the
        // intro pacing preset — confirm this is intended.
        contentType: "intro"
      };
    });
    return this.toScript(segments);
  }

  /**
   * Wrap segments into a script: texts joined by a space, durations summed.
   *
   * @param segments - Script segments already carrying their estimates.
   * @returns `{ fullText, segments, estimatedDurationSeconds }`.
   */
  toScript(segments) {
    const fullText = segments.map((s) => s.text).join(" ");
    const estimatedDurationSeconds = segments.reduce((sum, s) => sum + s.estimatedDurationSeconds, 0);
    return { fullText, segments, estimatedDurationSeconds };
  }

  /**
   * Convert synthesized segment durations into frame counts.
   *
   * @param segments - Synthesized segments (`sceneId`, `durationMs`, `wordTimings`).
   * @param fps - Frames per second.
   * @param breathingRoomFactor - Multiplier for the recommended scene length.
   * @returns `{ totalDurationMs, segments, fps }`. The total sums segment
   *   durations only; silences inserted during assembly are not included.
   */
  buildTimingMap(segments, fps, breathingRoomFactor) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = Math.ceil(seg.durationMs / 1000 * fps);
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        recommendedSceneDurationInFrames: Math.ceil(durationInFrames * breathingRoomFactor),
        wordTimings: seg.wordTimings?.map((wt) => ({
          word: wt.word,
          startMs: wt.startMs,
          endMs: wt.endMs
        }))
      };
    });
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return { totalDurationMs, segments: timingSegments, fps };
  }
}
|
|
429
|
+
/**
 * Create an identifier of the form `tts_<timestamp>_<random>`.
 *
 * Both parts are base-36. The random part is always six characters:
 * `Math.random().toString(36)` can yield fewer than six digits after the
 * "0." prefix (or none at all for 0), so the slice is right-padded with "0".
 * The id is not cryptographically random.
 *
 * @returns The generated id.
 */
function generateProjectId() {
  const timestamp = Date.now().toString(36);
  const random = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `tts_${timestamp}_${random}`;
}
|
|
434
|
+
export {
|
|
435
|
+
VoiceSynthesizer,
|
|
436
|
+
SegmentSynthesizer,
|
|
437
|
+
PaceAnalyzer,
|
|
438
|
+
EmphasisPlanner,
|
|
439
|
+
AudioAssembler
|
|
440
|
+
};
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { VoicePacingDirective } from '../types';
|
|
2
|
+
import type { TTSScriptSegment } from './types';
|
|
3
|
+
/** Content category of a script segment, as declared on `TTSScriptSegment`. */
type ContentType = TTSScriptSegment['contentType'];
/** Tone value carried by a pacing directive. */
type Tone = VoicePacingDirective['tone'];
/** Emphasis value carried by a pacing directive. */
type Emphasis = VoicePacingDirective['emphasis'];
/** Pacing preset associated with a single content type. */
interface PacingDefaults {
    /** Speaking-rate multiplier. */
    rate: number;
    /** Emphasis level to request. */
    emphasis: Emphasis;
    /** Tone to request. */
    tone: Tone;
    /** Silence before the segment, in milliseconds. */
    leadingSilenceMs: number;
    /** Silence after the segment, in milliseconds. */
    trailingSilenceMs: number;
}
/**
 * Derives pacing directives from script segments.
 *
 * The mapping is deterministic: each content type has a fixed preset.
 * The API declared here accepts no LLM; callers use this class as the
 * deterministic fallback.
 */
export declare class PaceAnalyzer {
    /**
     * Produce a pacing directive for every script segment.
     *
     * @param segments - The TTS script segments to analyze
     * @param baseRate - Multiplier applied to the rate of every directive
     * @returns Pacing directives for the given segments
     */
    analyze(segments: Array<TTSScriptSegment>, baseRate?: number): Array<VoicePacingDirective>;
    /**
     * Look up the pacing preset for a content type.
     *
     * @param contentType - Content type to look up
     * @returns The preset for that content type
     */
    getDefaults(contentType: ContentType): PacingDefaults;
}
|
|
30
|
+
export {};
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/tts/pace-analyzer.ts
|
|
3
|
+
var CONTENT_TYPE_PACING = {
|
|
4
|
+
intro: {
|
|
5
|
+
rate: 0.95,
|
|
6
|
+
emphasis: "normal",
|
|
7
|
+
tone: "authoritative",
|
|
8
|
+
leadingSilenceMs: 0,
|
|
9
|
+
trailingSilenceMs: 500
|
|
10
|
+
},
|
|
11
|
+
problem: {
|
|
12
|
+
rate: 0.9,
|
|
13
|
+
emphasis: "strong",
|
|
14
|
+
tone: "urgent",
|
|
15
|
+
leadingSilenceMs: 300,
|
|
16
|
+
trailingSilenceMs: 500
|
|
17
|
+
},
|
|
18
|
+
solution: {
|
|
19
|
+
rate: 1,
|
|
20
|
+
emphasis: "normal",
|
|
21
|
+
tone: "calm",
|
|
22
|
+
leadingSilenceMs: 300,
|
|
23
|
+
trailingSilenceMs: 500
|
|
24
|
+
},
|
|
25
|
+
metric: {
|
|
26
|
+
rate: 0.85,
|
|
27
|
+
emphasis: "strong",
|
|
28
|
+
tone: "excited",
|
|
29
|
+
leadingSilenceMs: 300,
|
|
30
|
+
trailingSilenceMs: 600
|
|
31
|
+
},
|
|
32
|
+
cta: {
|
|
33
|
+
rate: 0.9,
|
|
34
|
+
emphasis: "strong",
|
|
35
|
+
tone: "authoritative",
|
|
36
|
+
leadingSilenceMs: 400,
|
|
37
|
+
trailingSilenceMs: 0
|
|
38
|
+
},
|
|
39
|
+
transition: {
|
|
40
|
+
rate: 1.1,
|
|
41
|
+
emphasis: "reduced",
|
|
42
|
+
tone: "neutral",
|
|
43
|
+
leadingSilenceMs: 200,
|
|
44
|
+
trailingSilenceMs: 300
|
|
45
|
+
}
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
class PaceAnalyzer {
|
|
49
|
+
analyze(segments, baseRate = 1) {
|
|
50
|
+
return segments.map((segment) => {
|
|
51
|
+
const defaults = CONTENT_TYPE_PACING[segment.contentType];
|
|
52
|
+
return {
|
|
53
|
+
sceneId: segment.sceneId,
|
|
54
|
+
rate: defaults.rate * baseRate,
|
|
55
|
+
emphasis: defaults.emphasis,
|
|
56
|
+
tone: defaults.tone,
|
|
57
|
+
leadingSilenceMs: defaults.leadingSilenceMs,
|
|
58
|
+
trailingSilenceMs: defaults.trailingSilenceMs
|
|
59
|
+
};
|
|
60
|
+
});
|
|
61
|
+
}
|
|
62
|
+
getDefaults(contentType) {
|
|
63
|
+
return { ...CONTENT_TYPE_PACING[contentType] };
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
export {
|
|
67
|
+
PaceAnalyzer
|
|
68
|
+
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { TTSProvider, VoicePacingDirective } from '../types';
|
|
2
|
+
import type { SynthesizedSegment, TTSScriptSegment, TTSVoiceConfig } from './types';
|
|
3
|
+
/**
 * Turns individual script segments into audio through a `TTSProvider`.
 *
 * Each synthesis call carries the segment's pacing directive (rate,
 * emphasis). All segments are synthesized in parallel via Promise.all().
 */
export declare class SegmentSynthesizer {
    private readonly tts;
    /**
     * @param tts - Provider used for every synthesis call
     */
    constructor(tts: TTSProvider);
    /**
     * Synthesize every segment in parallel.
     *
     * @param segments - Script segments to synthesize
     * @param voice - Voice configuration shared by all segments
     * @param directives - Pacing directives, matched to segments by sceneId
     * @returns A promise of the synthesized segments
     */
    synthesizeAll(segments: Array<TTSScriptSegment>, voice: TTSVoiceConfig, directives: Array<VoicePacingDirective>): Promise<Array<SynthesizedSegment>>;
    private synthesizeOne;
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/tts/segment-synthesizer.ts
|
|
3
|
+
class SegmentSynthesizer {
|
|
4
|
+
tts;
|
|
5
|
+
constructor(tts) {
|
|
6
|
+
this.tts = tts;
|
|
7
|
+
}
|
|
8
|
+
async synthesizeAll(segments, voice, directives) {
|
|
9
|
+
const directiveMap = new Map(directives.map((d) => [d.sceneId, d]));
|
|
10
|
+
const results = await Promise.all(segments.map((segment) => this.synthesizeOne(segment, voice, directiveMap.get(segment.sceneId))));
|
|
11
|
+
return results;
|
|
12
|
+
}
|
|
13
|
+
async synthesizeOne(segment, voice, directive) {
|
|
14
|
+
const result = await this.tts.synthesize({
|
|
15
|
+
text: segment.text,
|
|
16
|
+
voiceId: voice.voiceId,
|
|
17
|
+
language: voice.language,
|
|
18
|
+
style: voice.style,
|
|
19
|
+
stability: voice.stability,
|
|
20
|
+
rate: directive?.rate,
|
|
21
|
+
emphasis: directive?.emphasis
|
|
22
|
+
});
|
|
23
|
+
return {
|
|
24
|
+
sceneId: segment.sceneId,
|
|
25
|
+
audio: result.audio,
|
|
26
|
+
durationMs: result.audio.durationMs ?? 0,
|
|
27
|
+
wordTimings: result.wordTimings?.map((wt) => ({
|
|
28
|
+
word: wt.word,
|
|
29
|
+
startMs: wt.startMs,
|
|
30
|
+
endMs: wt.endMs
|
|
31
|
+
}))
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
export {
|
|
36
|
+
SegmentSynthesizer
|
|
37
|
+
};
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import type { ContentBrief } from '@contractspec/lib.content-gen/types';
|
|
2
|
+
import type { TTSProvider, AudioData, VoiceTimingMap, VoicePacingDirective, VoiceOptions } from '../types';
|
|
3
|
+
/** Input for synthesizing narration from standalone content. */
export interface TTSBrief {
    /** Source material the narration script is built from. */
    content: ContentBrief;
    /** Voice to synthesize with. */
    voice: TTSVoiceConfig;
    /** Optional pacing settings for this brief. */
    pacing?: PacingConfig;
    /**
     * Desired narration length in seconds.
     * NOTE(review): how this target is enforced is not visible from this declaration — confirm.
     */
    targetDurationSeconds?: number;
    /** Locale of the brief; presumably a locale tag such as "en" — confirm. */
    locale?: string;
}
|
|
10
|
+
/** Voice settings forwarded to the TTS provider. */
export interface TTSVoiceConfig {
    /** Identifier of the provider voice to use. */
    voiceId: string;
    /** Language to synthesize in, when the provider supports selecting one. */
    language?: string;
    /** Provider style setting; the valid range is provider-defined — confirm. */
    style?: number;
    /** Provider stability setting; the valid range is provider-defined — confirm. */
    stability?: number;
}
|
|
16
|
+
/** Pacing settings for a narration. */
export interface PacingConfig {
    /** Multiplier applied to the speaking rate. Defaults to 1.0. */
    baseRate?: number;
    /** Pacing strategy to apply. */
    strategy:
        | 'uniform'
        | 'dynamic'
        | 'scene-matched';
    /** Pause inserted between segments, in milliseconds. Defaults to 500. */
    segmentPauseMs?: number;
    /** Factor by which scene durations are stretched beyond the narration. Defaults to 1.15. */
    breathingRoomFactor?: number;
}
|
|
25
|
+
/** Result of a TTS run: the script plus whatever artifacts were produced. */
export interface TTSProject {
    /** Identifier of this project. */
    id: string;
    /** Narration script that was synthesized. */
    script: TTSScript;
    /** Pacing directives planned for the script's segments. */
    pacingDirectives: Array<VoicePacingDirective>;
    /** Per-segment synthesis results, when available. */
    segments?: Array<SynthesizedSegment>;
    /** All segments joined into one track, when available. */
    assembledAudio?: AudioData;
    /** Timing information for the narration, when available. */
    timingMap?: VoiceTimingMap;
}
|
|
33
|
+
/** A narration script split into segments. */
export interface TTSScript {
    /** The complete narration text. */
    fullText: string;
    /** The script's segments, in narration order. */
    segments: Array<TTSScriptSegment>;
    /** Estimated speaking time of the whole script, in seconds. */
    estimatedDurationSeconds: number;
}
|
|
38
|
+
/** One narrated unit of a script. */
export interface TTSScriptSegment {
    /** Identifier used to match this segment with directives and results. */
    sceneId: string;
    /** Text to be spoken. */
    text: string;
    /** Estimated speaking time of `text`, in seconds. */
    estimatedDurationSeconds: number;
    /** Category of the content. */
    contentType:
        | 'intro'
        | 'problem'
        | 'solution'
        | 'metric'
        | 'cta'
        | 'transition';
}
|
|
44
|
+
/** Audio produced for one script segment. */
export interface SynthesizedSegment {
    /** Identifier of the script segment this audio belongs to. */
    sceneId: string;
    /** The synthesized audio. */
    audio: AudioData;
    /** Length of the audio in milliseconds. */
    durationMs: number;
    /** Per-word timing within the segment, when available. */
    wordTimings?: Array<{
        /** The spoken word. */
        word: string;
        /** Start of the word, in milliseconds. */
        startMs: number;
        /** End of the word, in milliseconds. */
        endMs: number;
    }>;
}
|
|
54
|
+
/** Construction options for the TTS pipeline, on top of the shared `VoiceOptions`. */
export interface TTSOptions extends VoiceOptions {
    /** Provider that performs the actual speech synthesis. */
    tts: TTSProvider;
    /** Voice to fall back on when none is specified (intent inferred from the name — confirm). */
    defaultVoiceId?: string;
    /** Frames per second used for frame-based timing when a brief supplies none — confirm. */
    fps?: number;
    /** Pacing to fall back on when a brief carries none (intent inferred from the name — confirm). */
    defaultPacing?: PacingConfig;
}
|
|
60
|
+
/** Brief for video narration: the text comes from a scene plan rather than standalone content. */
export interface VideoTTSBrief {
    /** Content the video is about. */
    content: ContentBrief;
    /** Scenes of the video and their narration. */
    scenePlan: {
        /** The scenes, in playback order — confirm ordering with the producer of the plan. */
        scenes: Array<{
            /** Scene identifier. */
            id: string;
            /** Identifier of the composition rendering the scene. */
            compositionId: string;
            /** Planned scene length in frames. */
            durationInFrames: number;
            /** Text to narrate over the scene, if any. */
            narrationText?: string;
        }>;
        /** Estimated length of the whole plan, in seconds. */
        estimatedDurationSeconds: number;
    };
    /** Voice to synthesize with. */
    voice: TTSVoiceConfig;
    /** Optional pacing settings for this brief. */
    pacing?: PacingConfig;
    /** Frames per second of the target video. */
    fps: number;
    /** Locale of the brief; presumably a locale tag such as "en" — confirm. */
    locale?: string;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// @bun
|