@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
// src/audio/audio-concatenator.ts
|
|
2
|
+
/**
 * Joins audio segments that share one format, sample rate, and channel count
 * into a single audio object by appending their byte payloads back to back.
 *
 * The payloads are copied verbatim: nothing in this class parses, strips, or
 * rewrites any bytes inside `data`.
 */
class AudioConcatenator {
  /**
   * Concatenate the given segments in order.
   *
   * @param {Array<{data: Uint8Array, format: string, sampleRateHz: number, durationMs?: number, channels?: number}>} segments
   *   Segments to join. A missing `channels` is treated as 1 and a missing
   *   `durationMs` as 0.
   * @returns {{data: Uint8Array, format: string, sampleRateHz: number, durationMs: number, channels: number}}
   *   An empty mono 44.1 kHz "wav" placeholder when there is nothing to join,
   *   a shallow copy of the segment when there is exactly one, otherwise the
   *   combined audio.
   * @throws {Error} When a segment's format, sample rate, or channel count
   *   differs from the first segment's.
   */
  concatenate(segments) {
    const [firstSegment] = segments;
    // Covers both an empty list and a list whose first slot is empty.
    if (!firstSegment) {
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }
    if (segments.length === 1) {
      return { ...firstSegment };
    }

    const referenceFormat = firstSegment.format;
    const referenceSampleRate = firstSegment.sampleRateHz;
    const referenceChannels = firstSegment.channels ?? 1;

    for (const seg of segments) {
      if (seg.format !== referenceFormat) {
        throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
      }
      if (seg.sampleRateHz !== referenceSampleRate) {
        throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
      }
      // The result reports the first segment's channel count, so every payload
      // must agree with it; otherwise the combined bytes would be mislabelled.
      const segChannels = seg.channels ?? 1;
      if (segChannels !== referenceChannels) {
        throw new Error(`Channel count mismatch: expected ${referenceChannels}, got ${segChannels}`);
      }
    }

    const totalBytes = segments.reduce((sum, s) => sum + s.data.length, 0);
    const combined = new Uint8Array(totalBytes);
    let offset = 0;
    for (const seg of segments) {
      combined.set(seg.data, offset);
      offset += seg.data.length;
    }

    const totalDurationMs = segments.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
    return {
      data: combined,
      format: referenceFormat,
      sampleRateHz: referenceSampleRate,
      durationMs: totalDurationMs,
      channels: referenceChannels
    };
  }
}
|
|
54
|
+
|
|
55
|
+
// src/audio/duration-estimator.ts
|
|
56
|
+
/**
 * Estimates narration length from a word count and a speaking rate, and the
 * reverse (how many words fit into a duration).
 */
class DurationEstimator {
  /** Speaking rate used when the caller does not supply a usable one. */
  static DEFAULT_WPM = 150;

  /**
   * Estimate how long `text` takes to speak, rounded up to whole seconds.
   *
   * @param {string} text - Text whose whitespace-separated words are counted;
   *   null/undefined is treated as empty.
   * @param {number} [wordsPerMinute] - Speaking rate. Missing, non-finite, or
   *   non-positive values fall back to `DEFAULT_WPM` so the result is never
   *   `Infinity` or `NaN`.
   * @returns {number} Whole seconds (0 for empty text).
   */
  estimateSeconds(text, wordsPerMinute) {
    const wpm = Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
      ? wordsPerMinute
      : DurationEstimator.DEFAULT_WPM;
    const wordCount = (text ?? "").split(/\s+/).filter(Boolean).length;
    return Math.ceil(wordCount / wpm * 60);
  }

  /**
   * Same estimate as `estimateSeconds`, rounded up to whole milliseconds.
   *
   * @param {string} text - Text to measure; null/undefined is treated as empty.
   * @param {number} [wordsPerMinute] - Speaking rate; unusable values fall
   *   back to `DEFAULT_WPM`.
   * @returns {number} Whole milliseconds (0 for empty text).
   */
  estimateMs(text, wordsPerMinute) {
    const wpm = Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
      ? wordsPerMinute
      : DurationEstimator.DEFAULT_WPM;
    const wordCount = (text ?? "").split(/\s+/).filter(Boolean).length;
    return Math.ceil(wordCount / wpm * 60 * 1000);
  }

  /**
   * Estimate how many words can be spoken in `durationSeconds`.
   *
   * @param {number} durationSeconds - Available time in seconds.
   * @param {number} [wordsPerMinute] - Speaking rate; unusable values fall
   *   back to `DEFAULT_WPM`.
   * @returns {number} Word count rounded to the nearest integer.
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const wpm = Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
      ? wordsPerMinute
      : DurationEstimator.DEFAULT_WPM;
    return Math.round(durationSeconds / 60 * wpm);
  }
}
|
|
73
|
+
|
|
74
|
+
// src/audio/silence-generator.ts
|
|
75
|
+
/**
 * Produces zero-filled audio buffers to be used as pauses between segments.
 */
class SilenceGenerator {
  /**
   * Build a zero-filled buffer sized for the requested duration.
   *
   * The buffer holds 2 bytes per sample per channel. `format` is only copied
   * onto the result as a label; no bytes other than zeros are written.
   *
   * @param {number} durationMs - Desired length in milliseconds. Negative or
   *   non-finite values are treated as 0 so the typed-array allocation cannot
   *   throw and the reported duration stays a usable number.
   * @param {string} [format="wav"] - Format label for the result.
   * @param {number} [sampleRateHz=44100] - Samples per second.
   * @param {number} [channels=1] - Channel count.
   * @returns {{data: Uint8Array, format: string, sampleRateHz: number, durationMs: number, channels: number}}
   */
  generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1) {
    const safeDurationMs = Number.isFinite(durationMs) && durationMs > 0 ? durationMs : 0;
    const totalSamples = Math.ceil(sampleRateHz * safeDurationMs / 1000);
    const bytesPerSample = 2;
    const dataSize = totalSamples * bytesPerSample * channels;
    // Typed arrays are zero-initialised, which is the silence payload.
    const data = new Uint8Array(dataSize);
    return {
      data,
      format,
      sampleRateHz,
      durationMs: safeDurationMs,
      channels
    };
  }
}
|
|
90
|
+
|
|
91
|
+
// src/tts/pace-analyzer.ts
|
|
92
|
+
/**
 * Pacing presets keyed by segment content type. Frozen because the objects are
 * shared by every analysis; callers only ever receive copies.
 */
const CONTENT_TYPE_PACING = Object.freeze({
  intro: Object.freeze({
    rate: 0.95,
    emphasis: "normal",
    tone: "authoritative",
    leadingSilenceMs: 0,
    trailingSilenceMs: 500
  }),
  problem: Object.freeze({
    rate: 0.9,
    emphasis: "strong",
    tone: "urgent",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  }),
  solution: Object.freeze({
    rate: 1,
    emphasis: "normal",
    tone: "calm",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  }),
  metric: Object.freeze({
    rate: 0.85,
    emphasis: "strong",
    tone: "excited",
    leadingSilenceMs: 300,
    trailingSilenceMs: 600
  }),
  cta: Object.freeze({
    rate: 0.9,
    emphasis: "strong",
    tone: "authoritative",
    leadingSilenceMs: 400,
    trailingSilenceMs: 0
  }),
  transition: Object.freeze({
    rate: 1.1,
    emphasis: "reduced",
    tone: "neutral",
    leadingSilenceMs: 200,
    trailingSilenceMs: 300
  })
});

/** Neutral pacing applied to content types that have no preset. */
const UNKNOWN_CONTENT_TYPE_PACING = Object.freeze({
  rate: 1,
  emphasis: "normal",
  tone: "neutral",
  leadingSilenceMs: 0,
  trailingSilenceMs: 500
});

/**
 * Look up the preset for a content type.
 * Only own keys count, so names inherited from Object.prototype (e.g.
 * "constructor") cannot be mistaken for presets.
 */
function pacingFor(contentType) {
  return Object.hasOwn(CONTENT_TYPE_PACING, contentType)
    ? CONTENT_TYPE_PACING[contentType]
    : UNKNOWN_CONTENT_TYPE_PACING;
}

/**
 * Derives pacing directives for script segments from their content type.
 */
class PaceAnalyzer {
  /**
   * Produce one pacing directive per segment.
   *
   * @param {Array<{sceneId: string, contentType: string}>} segments - Script segments.
   * @param {number} [baseRate=1] - Multiplier applied to each preset's rate.
   * @returns {Array<{sceneId: string, rate: number, emphasis: string, tone: string, leadingSilenceMs: number, trailingSilenceMs: number}>}
   *   Directives in segment order. Segments with an unrecognised content type
   *   get neutral pacing instead of causing a TypeError.
   */
  analyze(segments, baseRate = 1) {
    return segments.map((segment) => {
      const defaults = pacingFor(segment.contentType);
      return {
        sceneId: segment.sceneId,
        rate: defaults.rate * baseRate,
        emphasis: defaults.emphasis,
        tone: defaults.tone,
        leadingSilenceMs: defaults.leadingSilenceMs,
        trailingSilenceMs: defaults.trailingSilenceMs
      };
    });
  }

  /**
   * Return a fresh copy of the pacing preset for a content type.
   *
   * @param {string} contentType - Content type to look up.
   * @returns {{rate: number, emphasis: string, tone: string, leadingSilenceMs: number, trailingSilenceMs: number}}
   *   A mutable copy; the neutral preset when the type is unrecognised.
   */
  getDefaults(contentType) {
    return { ...pacingFor(contentType) };
  }
}
|
|
155
|
+
|
|
156
|
+
// src/tts/emphasis-planner.ts
|
|
157
|
+
/**
 * Plans per-segment pacing directives, optionally asking an LLM and otherwise
 * (or on any failure) using the heuristic PaceAnalyzer.
 */
class EmphasisPlanner {
  llm;
  model;
  paceAnalyzer;

  /**
   * @param {{llm?: object, model?: string}} [options] - `llm` must expose
   *   `chat(messages, options)`; when absent only heuristic pacing is used.
   */
  constructor(options) {
    this.llm = options?.llm;
    this.model = options?.model;
    this.paceAnalyzer = new PaceAnalyzer;
  }

  /**
   * Produce pacing directives for the segments.
   *
   * @param {Array<{sceneId: string, text: string, contentType: string}>} segments
   * @param {number} [baseRate=1] - Multiplier applied to every directive's rate.
   * @returns {Promise<Array<object>>} Pacing directives.
   */
  async plan(segments, baseRate = 1) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    try {
      return await this.planWithLlm(segments, baseRate);
    } catch {
      // Deliberate best effort: any failure of the LLM path (call rejected,
      // malformed JSON, unexpected shape) degrades to heuristic pacing.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
  }

  /**
   * Ask the LLM for directives and reconcile its answer with the segments.
   *
   * The reply is untrusted: the result always contains exactly one directive
   * per segment, in segment order. A segment whose LLM directive is missing or
   * has a non-numeric rate gets the heuristic directive instead, and rates are
   * clamped to the 0.7-1.3 range the prompt asks for before `baseRate` is
   * applied. Directives for scene ids that are not in `segments` are dropped.
   *
   * @param {Array<{sceneId: string, text: string, contentType: string}>} segments
   * @param {number} [baseRate=1] - Multiplier applied to every directive's rate.
   * @returns {Promise<Array<object>>} Pacing directives.
   * @throws {SyntaxError} When the reply text is not valid JSON.
   * @throws {TypeError} When the reply JSON is not an array.
   */
  async planWithLlm(segments, baseRate = 1) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const response = await this.llm.chat([
      {
        role: "system",
        content: [
          {
            type: "text",
            text: [
              "You are a voice director planning emphasis and pacing for TTS narration.",
              "For each segment, return a JSON array of directives.",
              "Each directive has: sceneId, rate (0.7-1.3), emphasis (reduced|normal|strong),",
              "tone (neutral|urgent|excited|calm|authoritative), leadingSilenceMs, trailingSilenceMs.",
              "Return ONLY a JSON array, no other text."
            ].join("\n")
          }
        ]
      },
      {
        role: "user",
        content: [
          {
            type: "text",
            text: JSON.stringify(segments.map((s) => ({
              sceneId: s.sceneId,
              text: s.text,
              contentType: s.contentType
            })))
          }
        ]
      }
    ], { model: this.model, temperature: 0.3, responseFormat: "json" });

    const text = response.message.content.find((p) => p.type === "text");
    if (!text || text.type !== "text") {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }

    const parsed = JSON.parse(text.text);
    if (!Array.isArray(parsed)) {
      throw new TypeError("Expected the LLM to return a JSON array of directives");
    }

    // Later entries win on duplicate scene ids, matching how the directive
    // consumers in this file build their lookup maps.
    const bySceneId = new Map();
    for (const directive of parsed) {
      if (directive !== null && typeof directive === "object") {
        bySceneId.set(directive.sceneId, directive);
      }
    }

    return segments.map((segment) => {
      const directive = bySceneId.get(segment.sceneId);
      if (!directive || typeof directive.rate !== "number" || !Number.isFinite(directive.rate)) {
        // Computed per segment so a usable LLM answer never depends on the
        // heuristic analyzer accepting every other segment.
        const [fallback] = this.paceAnalyzer.analyze([segment], baseRate);
        return fallback;
      }
      const clampedRate = Math.min(1.3, Math.max(0.7, directive.rate));
      return {
        ...directive,
        rate: clampedRate * baseRate
      };
    });
  }
}
|
|
222
|
+
|
|
223
|
+
// src/tts/segment-synthesizer.ts
|
|
224
|
+
/**
 * Turns script segments into audio by calling a TTS provider once per segment.
 */
class SegmentSynthesizer {
  tts;

  /**
   * @param {{synthesize: Function}} tts - Provider whose `synthesize(request)`
   *   resolves to `{ audio, wordTimings? }`.
   */
  constructor(tts) {
    this.tts = tts;
  }

  /**
   * Synthesize every segment; all provider calls are started together and
   * awaited with `Promise.all`, so one rejection rejects the whole batch.
   *
   * @param {Array<{sceneId: string, text: string}>} segments - Segments to voice.
   * @param {{voiceId: string, language?: string, style?: number, stability?: number}} voice
   * @param {Array<{sceneId: string, rate?: number, emphasis?: string}>} [directives]
   *   Optional pacing directives matched to segments by `sceneId`; may be
   *   omitted or null, in which case no rate/emphasis is sent.
   * @returns {Promise<Array<object>>} Results in segment order.
   */
  async synthesizeAll(segments, voice, directives) {
    // Tolerate a missing list: each directive is optional downstream anyway.
    const directiveMap = new Map((directives ?? []).map((d) => [d.sceneId, d]));
    return Promise.all(
      segments.map((segment) => this.synthesizeOne(segment, voice, directiveMap.get(segment.sceneId)))
    );
  }

  /**
   * Synthesize a single segment.
   *
   * @param {{sceneId: string, text: string}} segment
   * @param {{voiceId: string, language?: string, style?: number, stability?: number}} voice
   * @param {{rate?: number, emphasis?: string}} [directive] - Optional pacing.
   * @returns {Promise<{sceneId: string, audio: object, durationMs: number, wordTimings?: Array<{word: string, startMs: number, endMs: number}>}>}
   *   `durationMs` is 0 when the provider reports none; `wordTimings` is
   *   undefined when the provider returns none.
   */
  async synthesizeOne(segment, voice, directive) {
    const result = await this.tts.synthesize({
      text: segment.text,
      voiceId: voice.voiceId,
      language: voice.language,
      style: voice.style,
      stability: voice.stability,
      rate: directive?.rate,
      emphasis: directive?.emphasis
    });
    return {
      sceneId: segment.sceneId,
      audio: result.audio,
      durationMs: result.audio.durationMs ?? 0,
      // Copy only the three known fields so provider-specific extras do not leak.
      wordTimings: result.wordTimings?.map((wt) => ({
        word: wt.word,
        startMs: wt.startMs,
        endMs: wt.endMs
      }))
    };
  }
}
|
|
256
|
+
|
|
257
|
+
// src/tts/audio-assembler.ts
|
|
258
|
+
/**
 * Builds one continuous audio track from synthesized segments, inserting
 * generated silence before and after segments as the pacing directives ask.
 */
class AudioAssembler {
  concatenator = new AudioConcatenator;
  silenceGenerator = new SilenceGenerator;

  /**
   * Assemble the segments, in order, into a single audio object.
   *
   * Silence is generated with the first segment's format, sample rate, and
   * channel count (1 when unspecified). For each segment: leading silence is
   * added when its directive asks for more than 0 ms; trailing silence comes
   * from the directive when it specifies one, otherwise `defaultPauseMs`
   * between segments and nothing after the last one.
   *
   * @param {Array<{sceneId: string, audio: object}>} segments - Synthesized segments.
   * @param {Array<{sceneId: string, leadingSilenceMs?: number, trailingSilenceMs?: number}>} [directives]
   *   Pacing directives matched by `sceneId`; may be omitted or null.
   * @param {number} [defaultPauseMs=500] - Pause between segments that have no
   *   directive-specified trailing silence.
   * @returns {object} The concatenated audio, or an empty mono 44.1 kHz "wav"
   *   placeholder when there is no first segment.
   */
  assemble(segments, directives, defaultPauseMs = 500) {
    const [firstSegment] = segments;
    // Covers both an empty list and a list whose first slot is empty.
    if (!firstSegment) {
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }

    // Tolerate a missing list: every directive lookup below is optional.
    const directiveMap = new Map((directives ?? []).map((d) => [d.sceneId, d]));
    const reference = firstSegment.audio;
    const makeSilence = (ms) =>
      this.silenceGenerator.generate(ms, reference.format, reference.sampleRateHz, reference.channels ?? 1);

    const lastIndex = segments.length - 1;
    const parts = [];
    for (const [i, segment] of segments.entries()) {
      if (!segment) {
        continue;
      }
      const directive = directiveMap.get(segment.sceneId);

      const leadingSilenceMs = directive?.leadingSilenceMs ?? 0;
      if (leadingSilenceMs > 0) {
        parts.push(makeSilence(leadingSilenceMs));
      }

      parts.push(segment.audio);

      // An explicit directive value (including 0) wins over the default pause.
      const trailingSilenceMs = directive?.trailingSilenceMs ?? (i < lastIndex ? defaultPauseMs : 0);
      if (trailingSilenceMs > 0) {
        parts.push(makeSilence(trailingSilenceMs));
      }
    }
    return this.concatenator.concatenate(parts);
  }
}
|
|
303
|
+
|
|
304
|
+
// src/tts/voice-synthesizer.ts
|
|
305
|
+
/**
 * End-to-end narration pipeline: builds a script from a brief, plans pacing,
 * synthesizes each segment, assembles one audio track, and derives a
 * frame-based timing map.
 */
class VoiceSynthesizer {
  segmentSynthesizer;
  emphasisPlanner;
  audioAssembler = new AudioAssembler;
  durationEstimator = new DurationEstimator;
  // Not referenced by any method of this class; kept because it is an
  // instance field that code outside this class may read.
  paceAnalyzer = new PaceAnalyzer;
  options;

  /**
   * @param {{tts: object, llm?: object, model?: string, fps?: number}} options
   *   `tts` is handed to the segment synthesizer; `llm`/`model` to the
   *   emphasis planner; `fps` is the default frame rate for timing maps.
   */
  constructor(options) {
    this.options = options;
    this.segmentSynthesizer = new SegmentSynthesizer(options.tts);
    this.emphasisPlanner = new EmphasisPlanner({
      llm: options.llm,
      model: options.model
    });
  }

  /**
   * Synthesize narration for a content brief.
   *
   * @param {{content: object, voice: object, pacing?: object}} brief
   * @returns {Promise<object>} The synthesis project (see `executePipeline`).
   */
  async synthesize(brief) {
    const script = this.buildScript(brief);
    return this.executePipeline(script, brief.voice, brief.pacing);
  }

  /**
   * Synthesize narration for a video scene plan.
   *
   * @param {{scenePlan: {scenes: Array<object>}, voice: object, pacing?: object, fps?: number}} brief
   * @returns {Promise<object>} The synthesis project (see `executePipeline`).
   */
  async synthesizeForVideo(brief) {
    const script = this.buildScriptFromScenePlan(brief);
    return this.executePipeline(script, brief.voice, brief.pacing, brief.fps);
  }

  /**
   * Run pacing planning, synthesis, assembly, and timing for a script.
   *
   * @param {{fullText: string, segments: Array<object>, estimatedDurationSeconds: number}} script
   * @param {object} voice - Voice settings passed to the segment synthesizer.
   * @param {{baseRate?: number, segmentPauseMs?: number, breathingRoomFactor?: number}} [pacing]
   *   Defaults: baseRate 1, segmentPauseMs 500, breathingRoomFactor 1.15.
   * @param {number} [fps] - Frame rate; falls back to `options.fps`, then 30.
   * @returns {Promise<{id: string, script: object, pacingDirectives: Array<object>, segments: Array<object>, assembledAudio: object, timingMap: object}>}
   */
  async executePipeline(script, voice, pacing, fps) {
    const projectId = generateProjectId();
    const baseRate = pacing?.baseRate ?? 1;
    const pacingDirectives = await this.emphasisPlanner.plan(script.segments, baseRate);
    const synthesized = await this.segmentSynthesizer.synthesizeAll(script.segments, voice, pacingDirectives);
    const pauseMs = pacing?.segmentPauseMs ?? 500;
    const assembledAudio = this.audioAssembler.assemble(synthesized, pacingDirectives, pauseMs);
    const effectiveFps = fps ?? this.options.fps ?? 30;
    const breathingRoomFactor = pacing?.breathingRoomFactor ?? 1.15;
    const timingMap = this.buildTimingMap(synthesized, effectiveFps, breathingRoomFactor);
    return {
      id: projectId,
      script,
      pacingDirectives,
      segments: synthesized,
      assembledAudio,
      timingMap
    };
  }

  /**
   * Build a script from a content brief.
   *
   * Always emits an "intro" segment (`title. summary`), then one segment each
   * for problems, solutions, and metrics when the list is present and
   * non-empty (items joined with ". "), then a "cta" segment when
   * `callToAction` is truthy.
   *
   * @param {{content: {title: string, summary: string, problems?: string[], solutions?: string[], metrics?: string[], callToAction?: string}}} brief
   * @returns {{fullText: string, segments: Array<{sceneId: string, text: string, estimatedDurationSeconds: number, contentType: string}>, estimatedDurationSeconds: number}}
   */
  buildScript(brief) {
    const { content } = brief;
    const segments = [];
    const addSegment = (sceneId, text, contentType) => {
      segments.push({
        sceneId,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType
      });
    };
    // Every list is optional: a missing one is skipped the same as an empty one.
    const addListSegment = (sceneId, items, contentType) => {
      if (items && items.length > 0) {
        addSegment(sceneId, items.join(". "), contentType);
      }
    };

    addSegment("intro", `${content.title}. ${content.summary}`, "intro");
    addListSegment("problems", content.problems, "problem");
    addListSegment("solutions", content.solutions, "solution");
    addListSegment("metrics", content.metrics, "metric");
    if (content.callToAction) {
      addSegment("cta", content.callToAction, "cta");
    }

    const fullText = segments.map((s) => s.text).join(" ");
    const estimatedDurationSeconds = segments.reduce((sum, s) => sum + s.estimatedDurationSeconds, 0);
    return { fullText, segments, estimatedDurationSeconds };
  }

  /**
   * Build a script from a scene plan, one segment per scene that has
   * narration text.
   *
   * @param {{scenePlan: {scenes: Array<{id: string, narrationText?: string}>}}} brief
   * @returns {{fullText: string, segments: Array<object>, estimatedDurationSeconds: number}}
   */
  buildScriptFromScenePlan(brief) {
    const segments = brief.scenePlan.scenes.filter((scene) => scene.narrationText).map((scene) => {
      const text = scene.narrationText ?? "";
      return {
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        // NOTE(review): every scene is tagged "intro", so heuristic pacing is
        // identical for all scenes — confirm whether the scene plan carries a
        // type that should be mapped here.
        contentType: "intro"
      };
    });
    const fullText = segments.map((s) => s.text).join(" ");
    const estimatedDurationSeconds = segments.reduce((sum, s) => sum + s.estimatedDurationSeconds, 0);
    return { fullText, segments, estimatedDurationSeconds };
  }

  /**
   * Convert synthesized segment durations into frame counts.
   *
   * @param {Array<{sceneId: string, durationMs: number, wordTimings?: Array<object>}>} segments
   * @param {number} fps - Frames per second.
   * @param {number} breathingRoomFactor - Multiplier applied to each segment's
   *   frame count to get the recommended scene length.
   * @returns {{totalDurationMs: number, segments: Array<object>, fps: number}}
   */
  buildTimingMap(segments, fps, breathingRoomFactor) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = Math.ceil(seg.durationMs / 1000 * fps);
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        recommendedSceneDurationInFrames: Math.ceil(durationInFrames * breathingRoomFactor),
        wordTimings: seg.wordTimings?.map((wt) => ({
          word: wt.word,
          startMs: wt.startMs,
          endMs: wt.endMs
        }))
      };
    });
    // NOTE(review): sums segment durations only; silence inserted during
    // assembly is not included, so this can be shorter than the assembled
    // audio's duration.
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return { totalDurationMs, segments: timingSegments, fps };
  }
}
|
|
428
|
+
/**
 * Create an identifier of the form `tts_<timestamp>_<random>`.
 *
 * Both parts are base-36: the timestamp is `Date.now()`, the suffix is six
 * characters derived from `Math.random()`. The suffix is padded with "0"
 * because short fractions (e.g. 0.5 -> "0.i") would otherwise yield fewer than
 * six characters, or none at all. Not suitable where unpredictability matters.
 *
 * @returns {string} The generated identifier.
 */
function generateProjectId() {
  const timestamp = Date.now().toString(36);
  const random = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `tts_${timestamp}_${random}`;
}
|
|
433
|
+
export {
|
|
434
|
+
VoiceSynthesizer,
|
|
435
|
+
SegmentSynthesizer,
|
|
436
|
+
PaceAnalyzer,
|
|
437
|
+
EmphasisPlanner,
|
|
438
|
+
AudioAssembler
|
|
439
|
+
};
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
// src/tts/pace-analyzer.ts
|
|
2
|
+
/**
 * Pacing presets keyed by segment content type. Frozen because the objects are
 * shared by every analysis; callers only ever receive copies.
 */
const CONTENT_TYPE_PACING = Object.freeze({
  intro: Object.freeze({
    rate: 0.95,
    emphasis: "normal",
    tone: "authoritative",
    leadingSilenceMs: 0,
    trailingSilenceMs: 500
  }),
  problem: Object.freeze({
    rate: 0.9,
    emphasis: "strong",
    tone: "urgent",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  }),
  solution: Object.freeze({
    rate: 1,
    emphasis: "normal",
    tone: "calm",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  }),
  metric: Object.freeze({
    rate: 0.85,
    emphasis: "strong",
    tone: "excited",
    leadingSilenceMs: 300,
    trailingSilenceMs: 600
  }),
  cta: Object.freeze({
    rate: 0.9,
    emphasis: "strong",
    tone: "authoritative",
    leadingSilenceMs: 400,
    trailingSilenceMs: 0
  }),
  transition: Object.freeze({
    rate: 1.1,
    emphasis: "reduced",
    tone: "neutral",
    leadingSilenceMs: 200,
    trailingSilenceMs: 300
  })
});

/** Neutral pacing applied to content types that have no preset. */
const UNKNOWN_CONTENT_TYPE_PACING = Object.freeze({
  rate: 1,
  emphasis: "normal",
  tone: "neutral",
  leadingSilenceMs: 0,
  trailingSilenceMs: 500
});

/**
 * Look up the preset for a content type.
 * Only own keys count, so names inherited from Object.prototype (e.g.
 * "constructor") cannot be mistaken for presets.
 */
function pacingFor(contentType) {
  return Object.hasOwn(CONTENT_TYPE_PACING, contentType)
    ? CONTENT_TYPE_PACING[contentType]
    : UNKNOWN_CONTENT_TYPE_PACING;
}

/**
 * Derives pacing directives for script segments from their content type.
 */
class PaceAnalyzer {
  /**
   * Produce one pacing directive per segment.
   *
   * @param {Array<{sceneId: string, contentType: string}>} segments - Script segments.
   * @param {number} [baseRate=1] - Multiplier applied to each preset's rate.
   * @returns {Array<{sceneId: string, rate: number, emphasis: string, tone: string, leadingSilenceMs: number, trailingSilenceMs: number}>}
   *   Directives in segment order. Segments with an unrecognised content type
   *   get neutral pacing instead of causing a TypeError.
   */
  analyze(segments, baseRate = 1) {
    return segments.map((segment) => {
      const defaults = pacingFor(segment.contentType);
      return {
        sceneId: segment.sceneId,
        rate: defaults.rate * baseRate,
        emphasis: defaults.emphasis,
        tone: defaults.tone,
        leadingSilenceMs: defaults.leadingSilenceMs,
        trailingSilenceMs: defaults.trailingSilenceMs
      };
    });
  }

  /**
   * Return a fresh copy of the pacing preset for a content type.
   *
   * @param {string} contentType - Content type to look up.
   * @returns {{rate: number, emphasis: string, tone: string, leadingSilenceMs: number, trailingSilenceMs: number}}
   *   A mutable copy; the neutral preset when the type is unrecognised.
   */
  getDefaults(contentType) {
    return { ...pacingFor(contentType) };
  }
}
|
|
65
|
+
export {
|
|
66
|
+
PaceAnalyzer
|
|
67
|
+
};
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// src/tts/segment-synthesizer.ts
|
|
2
|
+
/**
 * Turns script segments into audio by calling a TTS provider once per segment.
 */
class SegmentSynthesizer {
  tts;

  /**
   * @param {{synthesize: Function}} tts - Provider whose `synthesize(request)`
   *   resolves to `{ audio, wordTimings? }`.
   */
  constructor(tts) {
    this.tts = tts;
  }

  /**
   * Synthesize every segment; all provider calls are started together and
   * awaited with `Promise.all`, so one rejection rejects the whole batch.
   *
   * @param {Array<{sceneId: string, text: string}>} segments - Segments to voice.
   * @param {{voiceId: string, language?: string, style?: number, stability?: number}} voice
   * @param {Array<{sceneId: string, rate?: number, emphasis?: string}>} [directives]
   *   Optional pacing directives matched to segments by `sceneId`; may be
   *   omitted or null, in which case no rate/emphasis is sent.
   * @returns {Promise<Array<object>>} Results in segment order.
   */
  async synthesizeAll(segments, voice, directives) {
    // Tolerate a missing list: each directive is optional downstream anyway.
    const directiveMap = new Map((directives ?? []).map((d) => [d.sceneId, d]));
    return Promise.all(
      segments.map((segment) => this.synthesizeOne(segment, voice, directiveMap.get(segment.sceneId)))
    );
  }

  /**
   * Synthesize a single segment.
   *
   * @param {{sceneId: string, text: string}} segment
   * @param {{voiceId: string, language?: string, style?: number, stability?: number}} voice
   * @param {{rate?: number, emphasis?: string}} [directive] - Optional pacing.
   * @returns {Promise<{sceneId: string, audio: object, durationMs: number, wordTimings?: Array<{word: string, startMs: number, endMs: number}>}>}
   *   `durationMs` is 0 when the provider reports none; `wordTimings` is
   *   undefined when the provider returns none.
   */
  async synthesizeOne(segment, voice, directive) {
    const result = await this.tts.synthesize({
      text: segment.text,
      voiceId: voice.voiceId,
      language: voice.language,
      style: voice.style,
      stability: voice.stability,
      rate: directive?.rate,
      emphasis: directive?.emphasis
    });
    return {
      sceneId: segment.sceneId,
      audio: result.audio,
      durationMs: result.audio.durationMs ?? 0,
      // Copy only the three known fields so provider-specific extras do not leak.
      wordTimings: result.wordTimings?.map((wt) => ({
        word: wt.word,
        startMs: wt.startMs,
        endMs: wt.endMs
      }))
    };
  }
}
|
|
34
|
+
export {
|
|
35
|
+
SegmentSynthesizer
|
|
36
|
+
};
|
|
File without changes
|