@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/sync/duration-negotiator.ts
|
|
3
|
+
/**
 * Compares the narration length of each timing segment with the planned
 * length of its scene and records one adjustment per segment.
 */
class DurationNegotiator {
  static UPPER_THRESHOLD = 1.1;
  static LOWER_THRESHOLD = 0.7;
  static MAX_RATE = 1.3;
  static MIN_RATE = 0.8;

  /**
   * @param timingMap - Object with a `segments` array; each segment exposes
   *   `sceneId`, `durationInFrames` and `recommendedSceneDurationInFrames`.
   * @param sceneDurations - Lookup exposing `get(sceneId)`; returns the planned
   *   scene duration in frames, or `undefined` when the scene is unknown.
   * @returns `{ timingMap, adjustments }`. `timingMap` is a shallow copy whose
   *   segments are the original objects, except "pad_silence" segments, which
   *   are copied with the planned duration as their recommended duration.
   */
  negotiate(timingMap, sceneDurations) {
    const adjustments = [];

    // Appends one adjustment record; `suggestedRate` is only present when given.
    const record = (seg, plannedFrames, action, finalFrames, suggestedRate) => {
      const entry = {
        sceneId: seg.sceneId,
        originalSceneDurationInFrames: plannedFrames,
        voiceDurationInFrames: seg.durationInFrames,
        action
      };
      if (suggestedRate !== undefined) {
        entry.suggestedRate = suggestedRate;
      }
      entry.finalSceneDurationInFrames = finalFrames;
      adjustments.push(entry);
    };

    const segments = timingMap.segments.map((seg) => {
      const plannedFrames = sceneDurations.get(seg.sceneId);
      const recommendedFrames = seg.recommendedSceneDurationInFrames;

      // No planned duration: keep the voice-driven recommendation untouched.
      if (plannedFrames === undefined) {
        record(seg, recommendedFrames, "no_change", recommendedFrames);
        return seg;
      }

      const ratio = seg.durationInFrames / plannedFrames;

      // Voice is noticeably longer than the scene.
      if (ratio > DurationNegotiator.UPPER_THRESHOLD) {
        const cappedRate = Math.min(ratio, DurationNegotiator.MAX_RATE);
        const action = ratio > DurationNegotiator.MAX_RATE ? "extend_scene" : "suggest_rate_change";
        record(seg, plannedFrames, action, recommendedFrames, cappedRate);
        return seg;
      }

      // Voice is noticeably shorter than the scene: keep the planned length.
      if (ratio < DurationNegotiator.LOWER_THRESHOLD) {
        const flooredRate = Math.max(ratio, DurationNegotiator.MIN_RATE);
        record(seg, plannedFrames, "pad_silence", plannedFrames, flooredRate);
        return { ...seg, recommendedSceneDurationInFrames: plannedFrames };
      }

      record(seg, plannedFrames, "no_change", recommendedFrames);
      return seg;
    });

    return {
      timingMap: { ...timingMap, segments },
      adjustments
    };
  }
}
|
|
68
|
+
export {
|
|
69
|
+
DurationNegotiator
|
|
70
|
+
};
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/audio/duration-estimator.ts
|
|
3
|
+
/**
 * Estimates narration duration from word counts at a given speaking rate.
 */
class DurationEstimator {
  static DEFAULT_WPM = 150;

  /**
   * Resolve the speaking rate to use.
   * Falls back to DEFAULT_WPM when the value is missing, non-numeric, zero or
   * negative, so estimates are never Infinity, NaN or negative.
   */
  static #resolveWpm(wordsPerMinute) {
    const wpm = Number(wordsPerMinute ?? DurationEstimator.DEFAULT_WPM);
    return Number.isFinite(wpm) && wpm > 0 ? wpm : DurationEstimator.DEFAULT_WPM;
  }

  /** Count whitespace-separated words in `text`. */
  static #countWords(text) {
    return text.split(/\s+/).filter(Boolean).length;
  }

  /**
   * Estimate how long `text` takes to speak, rounded up to whole seconds.
   *
   * @param text - Text to be spoken.
   * @param wordsPerMinute - Speaking rate; defaults to DEFAULT_WPM.
   * @returns Duration in seconds (integer, rounded up).
   */
  estimateSeconds(text, wordsPerMinute) {
    const wpm = DurationEstimator.#resolveWpm(wordsPerMinute);
    // Multiply before dividing: 155 / 150 * 60 evaluates to slightly more than
    // 62 in floating point, which Math.ceil would turn into 63.
    return Math.ceil(DurationEstimator.#countWords(text) * 60 / wpm);
  }

  /**
   * Estimate how long `text` takes to speak, rounded up to whole milliseconds.
   *
   * @param text - Text to be spoken.
   * @param wordsPerMinute - Speaking rate; defaults to DEFAULT_WPM.
   * @returns Duration in milliseconds (integer, rounded up).
   */
  estimateMs(text, wordsPerMinute) {
    const wpm = DurationEstimator.#resolveWpm(wordsPerMinute);
    return Math.ceil(DurationEstimator.#countWords(text) * 60000 / wpm);
  }

  /**
   * Estimate how many words fit into `durationSeconds`.
   *
   * @param durationSeconds - Available time in seconds.
   * @param wordsPerMinute - Speaking rate; defaults to DEFAULT_WPM.
   * @returns Word count rounded to the nearest integer.
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const wpm = DurationEstimator.#resolveWpm(wordsPerMinute);
    return Math.round(durationSeconds * wpm / 60);
  }
}
|
|
20
|
+
|
|
21
|
+
// src/sync/timing-calculator.ts
|
|
22
|
+
/**
 * Converts synthesized voice segments into frame-based timing data.
 */
class TimingCalculator {
  /**
   * Round a fractional frame count up to whole frames, ignoring float noise.
   * Values within 1e-9 of an integer snap to that integer, so a result such as
   * 249.00000000000003 stays 249 instead of becoming 250.
   */
  static #ceilFrames(frames) {
    const nearest = Math.round(frames);
    return Math.abs(frames - nearest) < 1e-9 ? nearest : Math.ceil(frames);
  }

  /** Convert milliseconds to whole frames, multiplying first to limit rounding error. */
  static #msToFrames(durationMs, fps) {
    return TimingCalculator.#ceilFrames(durationMs * fps / 1000);
  }

  /**
   * Build a timing map from synthesized segments.
   *
   * @param segments - Items exposing `sceneId`, `durationMs` and optionally `wordTimings`.
   * @param fps - Frames per second used for frame counts.
   * @param breathingRoomFactor - Multiplier applied to the voice duration in
   *   frames to get the recommended scene duration (default 1.15).
   * @returns `{ totalDurationMs, segments, fps }`.
   */
  calculate(segments, fps, breathingRoomFactor = 1.15) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, fps);
      const recommendedSceneDurationInFrames = TimingCalculator.#ceilFrames(durationInFrames * breathingRoomFactor);
      // Copy only the known word-timing fields; stays undefined when absent.
      const wordTimings = seg.wordTimings?.map((wt) => ({
        word: wt.word,
        startMs: wt.startMs,
        endMs: wt.endMs
      }));
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        recommendedSceneDurationInFrames,
        wordTimings
      };
    });
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return {
      totalDurationMs,
      segments: timingSegments,
      fps
    };
  }

  /**
   * Recalculate a timing map for a different frame rate.
   *
   * Each segment keeps its recommended-to-voice duration ratio. Recalculating
   * at an unchanged frame rate returns the same frame counts.
   *
   * @param timingMap - Timing map as returned by `calculate`.
   * @param newFps - Target frames per second.
   * @returns A new timing map; the input is not modified.
   */
  recalculateForFps(timingMap, newFps) {
    const segments = timingMap.segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, newFps);
      // Guard against a zero-frame segment when deriving the ratio.
      const ratio = seg.recommendedSceneDurationInFrames / Math.max(seg.durationInFrames, 1);
      return {
        ...seg,
        durationInFrames,
        recommendedSceneDurationInFrames: TimingCalculator.#ceilFrames(durationInFrames * ratio)
      };
    });
    return {
      ...timingMap,
      segments,
      fps: newFps
    };
  }
}
|
|
64
|
+
|
|
65
|
+
// src/sync/scene-adapter.ts
|
|
66
|
+
/**
 * Turns a scene plan into a narration script, keeping scene IDs on every
 * segment.
 */
class SceneAdapter {
  durationEstimator = new DurationEstimator;

  /**
   * Build a script from the scenes that carry non-blank narration text.
   *
   * @param scenePlan - Object with a `scenes` array; scenes expose `id` and
   *   optionally `narrationText`.
   * @returns `{ fullText, segments, estimatedDurationSeconds }`, where
   *   `fullText` joins the segment texts with single spaces and the estimate
   *   is the sum of the per-segment estimates.
   */
  adapt(scenePlan) {
    const narrated = scenePlan.scenes.filter((scene) => scene.narrationText && scene.narrationText.trim().length > 0);
    const segments = [];
    let estimatedDurationSeconds = 0;
    for (const [position, scene] of narrated.entries()) {
      const text = scene.narrationText ?? "";
      const seconds = this.durationEstimator.estimateSeconds(text);
      segments.push({
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: seconds,
        contentType: this.inferContentType(position, narrated.length)
      });
      estimatedDurationSeconds += seconds;
    }
    const fullText = segments.map(({ text }) => text).join(" ");
    return { fullText, segments, estimatedDurationSeconds };
  }

  /**
   * Pick a content type from a segment's position among the narrated scenes.
   *
   * First is "intro" and last is "cta". With more than three scenes, the
   * second is "problem" and the second-to-last is "metric". Everything else
   * is "solution".
   */
  inferContentType(index, total) {
    if (index === 0) {
      return "intro";
    }
    if (index === total - 1) {
      return "cta";
    }
    if (total > 3) {
      if (index === 1) {
        return "problem";
      }
      if (index === total - 2) {
        return "metric";
      }
    }
    return "solution";
  }
}
|
|
95
|
+
|
|
96
|
+
// src/sync/duration-negotiator.ts
|
|
97
|
+
/**
 * Compares the narration length of each timing segment with the planned
 * length of its scene and records one adjustment per segment.
 */
class DurationNegotiator {
  static UPPER_THRESHOLD = 1.1;
  static LOWER_THRESHOLD = 0.7;
  static MAX_RATE = 1.3;
  static MIN_RATE = 0.8;

  /**
   * @param timingMap - Object with a `segments` array; each segment exposes
   *   `sceneId`, `durationInFrames` and `recommendedSceneDurationInFrames`.
   * @param sceneDurations - Lookup exposing `get(sceneId)`; returns the planned
   *   scene duration in frames, or `undefined` when the scene is unknown.
   * @returns `{ timingMap, adjustments }`. `timingMap` is a shallow copy whose
   *   segments are the original objects, except "pad_silence" segments, which
   *   are copied with the planned duration as their recommended duration.
   */
  negotiate(timingMap, sceneDurations) {
    const adjustments = [];

    // Appends one adjustment record; `suggestedRate` is only present when given.
    const record = (seg, plannedFrames, action, finalFrames, suggestedRate) => {
      const entry = {
        sceneId: seg.sceneId,
        originalSceneDurationInFrames: plannedFrames,
        voiceDurationInFrames: seg.durationInFrames,
        action
      };
      if (suggestedRate !== undefined) {
        entry.suggestedRate = suggestedRate;
      }
      entry.finalSceneDurationInFrames = finalFrames;
      adjustments.push(entry);
    };

    const segments = timingMap.segments.map((seg) => {
      const plannedFrames = sceneDurations.get(seg.sceneId);
      const recommendedFrames = seg.recommendedSceneDurationInFrames;

      // No planned duration: keep the voice-driven recommendation untouched.
      if (plannedFrames === undefined) {
        record(seg, recommendedFrames, "no_change", recommendedFrames);
        return seg;
      }

      const ratio = seg.durationInFrames / plannedFrames;

      // Voice is noticeably longer than the scene.
      if (ratio > DurationNegotiator.UPPER_THRESHOLD) {
        const cappedRate = Math.min(ratio, DurationNegotiator.MAX_RATE);
        const action = ratio > DurationNegotiator.MAX_RATE ? "extend_scene" : "suggest_rate_change";
        record(seg, plannedFrames, action, recommendedFrames, cappedRate);
        return seg;
      }

      // Voice is noticeably shorter than the scene: keep the planned length.
      if (ratio < DurationNegotiator.LOWER_THRESHOLD) {
        const flooredRate = Math.max(ratio, DurationNegotiator.MIN_RATE);
        record(seg, plannedFrames, "pad_silence", plannedFrames, flooredRate);
        return { ...seg, recommendedSceneDurationInFrames: plannedFrames };
      }

      record(seg, plannedFrames, "no_change", recommendedFrames);
      return seg;
    });

    return {
      timingMap: { ...timingMap, segments },
      adjustments
    };
  }
}
|
|
162
|
+
export {
|
|
163
|
+
TimingCalculator,
|
|
164
|
+
SceneAdapter,
|
|
165
|
+
DurationNegotiator
|
|
166
|
+
};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { TTSScript } from '../tts/types';
|
|
2
|
+
interface ScenePlanScene {
|
|
3
|
+
id: string;
|
|
4
|
+
compositionId: string;
|
|
5
|
+
durationInFrames: number;
|
|
6
|
+
narrationText?: string;
|
|
7
|
+
}
|
|
8
|
+
interface ScenePlan {
|
|
9
|
+
scenes: ScenePlanScene[];
|
|
10
|
+
estimatedDurationSeconds: number;
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Adapt a video-gen ScenePlan into a TTSScript.
|
|
14
|
+
*
|
|
15
|
+
* Bridges the video-gen domain to the voice TTS domain,
|
|
16
|
+
* preserving scene IDs for timing map correlation.
|
|
17
|
+
*/
|
|
18
|
+
export declare class SceneAdapter {
|
|
19
|
+
private readonly durationEstimator;
|
|
20
|
+
/**
|
|
21
|
+
* Convert a ScenePlan into a TTSScript.
|
|
22
|
+
*
|
|
23
|
+
* Filters out scenes without narration text.
|
|
24
|
+
* Assigns contentType by position among the narrated scenes: first = intro, last = cta; with more than three scenes, second = problem and second-to-last = metric; all others = solution.
|
|
25
|
+
*/
|
|
26
|
+
adapt(scenePlan: ScenePlan): TTSScript;
|
|
27
|
+
private inferContentType;
|
|
28
|
+
}
|
|
29
|
+
export {};
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/audio/duration-estimator.ts
|
|
3
|
+
/**
 * Estimates narration duration from word counts at a given speaking rate.
 */
class DurationEstimator {
  static DEFAULT_WPM = 150;

  /**
   * Resolve the speaking rate to use.
   * Falls back to DEFAULT_WPM when the value is missing, non-numeric, zero or
   * negative, so estimates are never Infinity, NaN or negative.
   */
  static #resolveWpm(wordsPerMinute) {
    const wpm = Number(wordsPerMinute ?? DurationEstimator.DEFAULT_WPM);
    return Number.isFinite(wpm) && wpm > 0 ? wpm : DurationEstimator.DEFAULT_WPM;
  }

  /** Count whitespace-separated words in `text`. */
  static #countWords(text) {
    return text.split(/\s+/).filter(Boolean).length;
  }

  /**
   * Estimate how long `text` takes to speak, rounded up to whole seconds.
   *
   * @param text - Text to be spoken.
   * @param wordsPerMinute - Speaking rate; defaults to DEFAULT_WPM.
   * @returns Duration in seconds (integer, rounded up).
   */
  estimateSeconds(text, wordsPerMinute) {
    const wpm = DurationEstimator.#resolveWpm(wordsPerMinute);
    // Multiply before dividing: 155 / 150 * 60 evaluates to slightly more than
    // 62 in floating point, which Math.ceil would turn into 63.
    return Math.ceil(DurationEstimator.#countWords(text) * 60 / wpm);
  }

  /**
   * Estimate how long `text` takes to speak, rounded up to whole milliseconds.
   *
   * @param text - Text to be spoken.
   * @param wordsPerMinute - Speaking rate; defaults to DEFAULT_WPM.
   * @returns Duration in milliseconds (integer, rounded up).
   */
  estimateMs(text, wordsPerMinute) {
    const wpm = DurationEstimator.#resolveWpm(wordsPerMinute);
    return Math.ceil(DurationEstimator.#countWords(text) * 60000 / wpm);
  }

  /**
   * Estimate how many words fit into `durationSeconds`.
   *
   * @param durationSeconds - Available time in seconds.
   * @param wordsPerMinute - Speaking rate; defaults to DEFAULT_WPM.
   * @returns Word count rounded to the nearest integer.
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const wpm = DurationEstimator.#resolveWpm(wordsPerMinute);
    return Math.round(durationSeconds * wpm / 60);
  }
}
|
|
20
|
+
|
|
21
|
+
// src/sync/scene-adapter.ts
|
|
22
|
+
/**
 * Turns a scene plan into a narration script, keeping scene IDs on every
 * segment.
 */
class SceneAdapter {
  durationEstimator = new DurationEstimator;

  /**
   * Build a script from the scenes that carry non-blank narration text.
   *
   * @param scenePlan - Object with a `scenes` array; scenes expose `id` and
   *   optionally `narrationText`.
   * @returns `{ fullText, segments, estimatedDurationSeconds }`, where
   *   `fullText` joins the segment texts with single spaces and the estimate
   *   is the sum of the per-segment estimates.
   */
  adapt(scenePlan) {
    const narrated = scenePlan.scenes.filter((scene) => scene.narrationText && scene.narrationText.trim().length > 0);
    const segments = [];
    let estimatedDurationSeconds = 0;
    for (const [position, scene] of narrated.entries()) {
      const text = scene.narrationText ?? "";
      const seconds = this.durationEstimator.estimateSeconds(text);
      segments.push({
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: seconds,
        contentType: this.inferContentType(position, narrated.length)
      });
      estimatedDurationSeconds += seconds;
    }
    const fullText = segments.map(({ text }) => text).join(" ");
    return { fullText, segments, estimatedDurationSeconds };
  }

  /**
   * Pick a content type from a segment's position among the narrated scenes.
   *
   * First is "intro" and last is "cta". With more than three scenes, the
   * second is "problem" and the second-to-last is "metric". Everything else
   * is "solution".
   */
  inferContentType(index, total) {
    if (index === 0) {
      return "intro";
    }
    if (index === total - 1) {
      return "cta";
    }
    if (total > 3) {
      if (index === 1) {
        return "problem";
      }
      if (index === total - 2) {
        return "metric";
      }
    }
    return "solution";
  }
}
|
|
51
|
+
export {
|
|
52
|
+
SceneAdapter
|
|
53
|
+
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { VoiceTimingMap } from '../types';
|
|
2
|
+
import type { SynthesizedSegment } from '../tts/types';
|
|
3
|
+
/**
|
|
4
|
+
* Calculate timing maps from synthesized segments.
|
|
5
|
+
*
|
|
6
|
+
* Converts SynthesizedSegment[] into a VoiceTimingMap with frame calculations.
|
|
7
|
+
*/
|
|
8
|
+
export declare class TimingCalculator {
|
|
9
|
+
/**
|
|
10
|
+
* Build a timing map from synthesized segments.
|
|
11
|
+
*
|
|
12
|
+
* @param segments - Synthesized audio segments with duration info
|
|
13
|
+
* @param fps - Frames per second for frame calculations
|
|
14
|
+
* @param breathingRoomFactor - Multiplier applied to each segment's voice duration in frames to get its recommended scene duration (default 1.15)
|
|
15
|
+
*/
|
|
16
|
+
calculate(segments: SynthesizedSegment[], fps: number, breathingRoomFactor?: number): VoiceTimingMap;
|
|
17
|
+
/**
|
|
18
|
+
* Recalculate timing map for a different FPS.
|
|
19
|
+
*/
|
|
20
|
+
recalculateForFps(timingMap: VoiceTimingMap, newFps: number): VoiceTimingMap;
|
|
21
|
+
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/sync/timing-calculator.ts
|
|
3
|
+
/**
 * Converts synthesized voice segments into frame-based timing data.
 */
class TimingCalculator {
  /**
   * Round a fractional frame count up to whole frames, ignoring float noise.
   * Values within 1e-9 of an integer snap to that integer, so a result such as
   * 249.00000000000003 stays 249 instead of becoming 250.
   */
  static #ceilFrames(frames) {
    const nearest = Math.round(frames);
    return Math.abs(frames - nearest) < 1e-9 ? nearest : Math.ceil(frames);
  }

  /** Convert milliseconds to whole frames, multiplying first to limit rounding error. */
  static #msToFrames(durationMs, fps) {
    return TimingCalculator.#ceilFrames(durationMs * fps / 1000);
  }

  /**
   * Build a timing map from synthesized segments.
   *
   * @param segments - Items exposing `sceneId`, `durationMs` and optionally `wordTimings`.
   * @param fps - Frames per second used for frame counts.
   * @param breathingRoomFactor - Multiplier applied to the voice duration in
   *   frames to get the recommended scene duration (default 1.15).
   * @returns `{ totalDurationMs, segments, fps }`.
   */
  calculate(segments, fps, breathingRoomFactor = 1.15) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, fps);
      const recommendedSceneDurationInFrames = TimingCalculator.#ceilFrames(durationInFrames * breathingRoomFactor);
      // Copy only the known word-timing fields; stays undefined when absent.
      const wordTimings = seg.wordTimings?.map((wt) => ({
        word: wt.word,
        startMs: wt.startMs,
        endMs: wt.endMs
      }));
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        recommendedSceneDurationInFrames,
        wordTimings
      };
    });
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return {
      totalDurationMs,
      segments: timingSegments,
      fps
    };
  }

  /**
   * Recalculate a timing map for a different frame rate.
   *
   * Each segment keeps its recommended-to-voice duration ratio. Recalculating
   * at an unchanged frame rate returns the same frame counts.
   *
   * @param timingMap - Timing map as returned by `calculate`.
   * @param newFps - Target frames per second.
   * @returns A new timing map; the input is not modified.
   */
  recalculateForFps(timingMap, newFps) {
    const segments = timingMap.segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, newFps);
      // Guard against a zero-frame segment when deriving the ratio.
      const ratio = seg.recommendedSceneDurationInFrames / Math.max(seg.durationInFrames, 1);
      return {
        ...seg,
        durationInFrames,
        recommendedSceneDurationInFrames: TimingCalculator.#ceilFrames(durationInFrames * ratio)
      };
    });
    return {
      ...timingMap,
      segments,
      fps: newFps
    };
  }
}
|
|
45
|
+
export {
|
|
46
|
+
TimingCalculator
|
|
47
|
+
};
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { AudioData, VoicePacingDirective } from '../types';
|
|
2
|
+
import type { SynthesizedSegment } from './types';
|
|
3
|
+
/**
|
|
4
|
+
* Assemble synthesized segments into a single audio track.
|
|
5
|
+
*
|
|
6
|
+
* Inserts silence between segments based on pacing directives.
|
|
7
|
+
*/
|
|
8
|
+
export declare class AudioAssembler {
|
|
9
|
+
private readonly concatenator;
|
|
10
|
+
private readonly silenceGenerator;
|
|
11
|
+
/**
|
|
12
|
+
* Assemble segments with silence padding.
|
|
13
|
+
*
|
|
14
|
+
* @param segments - Synthesized audio segments
|
|
15
|
+
* @param directives - Pacing directives for silence timing
|
|
16
|
+
* @param defaultPauseMs - Pause inserted after every segment except the last when its directive sets no trailingSilenceMs (default 500ms)
|
|
17
|
+
*/
|
|
18
|
+
assemble(segments: SynthesizedSegment[], directives: VoicePacingDirective[], defaultPauseMs?: number): AudioData;
|
|
19
|
+
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/audio/audio-concatenator.ts
|
|
3
|
+
/**
 * Joins audio segments into a single buffer by appending their bytes.
 */
class AudioConcatenator {
  /**
   * Concatenate segments in order.
   *
   * @param segments - Items exposing `data` (Uint8Array), `format`,
   *   `sampleRateHz`, and optionally `durationMs` and `channels`.
   * @returns Combined audio. An empty input yields an empty mono 44.1 kHz
   *   "wav" result; a single segment is returned as a shallow copy.
   * @throws Error when segments differ in format, sample rate or channel
   *   count (a missing channel count is treated as 1).
   */
  concatenate(segments) {
    const [firstSegment] = segments;
    if (segments.length === 0 || !firstSegment) {
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }
    if (segments.length === 1) {
      return { ...firstSegment };
    }
    const referenceFormat = firstSegment.format;
    const referenceSampleRate = firstSegment.sampleRateHz;
    const referenceChannels = firstSegment.channels ?? 1;
    for (const seg of segments) {
      if (seg.format !== referenceFormat) {
        throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
      }
      if (seg.sampleRateHz !== referenceSampleRate) {
        throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
      }
      // Appending buffers with different channel layouts would produce audio
      // that no longer matches the channel count reported in the result.
      const segChannels = seg.channels ?? 1;
      if (segChannels !== referenceChannels) {
        throw new Error(`Channel count mismatch: expected ${referenceChannels}, got ${segChannels}`);
      }
    }
    const totalBytes = segments.reduce((sum, s) => sum + s.data.length, 0);
    const combined = new Uint8Array(totalBytes);
    let offset = 0;
    for (const seg of segments) {
      combined.set(seg.data, offset);
      offset += seg.data.length;
    }
    const totalDurationMs = segments.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
    return {
      data: combined,
      format: referenceFormat,
      sampleRateHz: referenceSampleRate,
      durationMs: totalDurationMs,
      channels: referenceChannels
    };
  }
}
|
|
55
|
+
|
|
56
|
+
// src/audio/silence-generator.ts
|
|
57
|
+
/**
 * Produces zero-filled audio buffers to use as silence.
 */
class SilenceGenerator {
  /**
   * Generate a silent buffer sized for 2 bytes per sample per channel.
   *
   * @param durationMs - Requested length; negative, NaN or infinite values
   *   are treated as 0 and produce an empty buffer.
   * @param format - Format label copied to the result (default "wav").
   * @param sampleRateHz - Samples per second (default 44100).
   * @param channels - Channel count (default 1).
   * @returns `{ data, format, sampleRateHz, durationMs, channels }`, where
   *   `durationMs` is the duration actually used.
   */
  generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1) {
    // A negative length would make the typed-array constructor throw a
    // RangeError, and NaN would leak into the reported duration.
    const effectiveDurationMs = Number.isFinite(durationMs) && durationMs > 0 ? durationMs : 0;
    const totalSamples = Math.ceil(sampleRateHz * effectiveDurationMs / 1000);
    const bytesPerSample = 2;
    const dataSize = totalSamples * bytesPerSample * channels;
    const data = new Uint8Array(dataSize);
    return {
      data,
      format,
      sampleRateHz,
      durationMs: effectiveDurationMs,
      channels
    };
  }
}
|
|
72
|
+
|
|
73
|
+
// src/tts/audio-assembler.ts
|
|
74
|
+
/**
 * Builds one audio track from synthesized segments, inserting silence around
 * them according to per-scene pacing directives.
 */
class AudioAssembler {
  concatenator = new AudioConcatenator;
  silenceGenerator = new SilenceGenerator;

  /**
   * @param segments - Items exposing `sceneId` and `audio`.
   * @param directives - Items exposing `sceneId` and optionally
   *   `leadingSilenceMs` / `trailingSilenceMs`.
   * @param defaultPauseMs - Pause after every segment except the last when its
   *   directive sets no trailing silence (default 500).
   * @returns The concatenator's result for the ordered parts, or an empty
   *   mono 44.1 kHz "wav" result when there is no first segment.
   */
  assemble(segments, directives, defaultPauseMs = 500) {
    const head = segments[0];
    if (segments.length === 0 || !head) {
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }

    const directiveByScene = new Map();
    for (const directive of directives) {
      directiveByScene.set(directive.sceneId, directive);
    }

    // Silence always mirrors the audio parameters of the first segment.
    const reference = head.audio;
    const silence = (ms) => this.silenceGenerator.generate(ms, reference.format, reference.sampleRateHz, reference.channels ?? 1);

    const lastIndex = segments.length - 1;
    const parts = [];
    segments.forEach((segment, i) => {
      if (!segment) {
        return;
      }
      const directive = directiveByScene.get(segment.sceneId);

      const leadMs = directive?.leadingSilenceMs ?? 0;
      if (leadMs > 0) {
        parts.push(silence(leadMs));
      }

      parts.push(segment.audio);

      const fallbackPauseMs = i < lastIndex ? defaultPauseMs : 0;
      const trailMs = directive?.trailingSilenceMs ?? fallbackPauseMs;
      if (trailMs > 0) {
        parts.push(silence(trailMs));
      }
    });
    return this.concatenator.concatenate(parts);
  }
}
|
|
119
|
+
export {
|
|
120
|
+
AudioAssembler
|
|
121
|
+
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { LLMProvider, VoicePacingDirective } from '../types';
|
|
2
|
+
import type { TTSScriptSegment } from './types';
|
|
3
|
+
/**
|
|
4
|
+
* Plan emphasis and tone per segment.
|
|
5
|
+
*
|
|
6
|
+
* With LLM: requests fine-grained tone/emphasis analysis.
|
|
7
|
+
* Without LLM: falls back to PaceAnalyzer's content-type mapping.
|
|
8
|
+
*/
|
|
9
|
+
export declare class EmphasisPlanner {
|
|
10
|
+
private readonly llm?;
|
|
11
|
+
private readonly model?;
|
|
12
|
+
private readonly paceAnalyzer;
|
|
13
|
+
constructor(options?: {
|
|
14
|
+
llm?: LLMProvider;
|
|
15
|
+
model?: string;
|
|
16
|
+
});
|
|
17
|
+
/**
|
|
18
|
+
* Plan emphasis and tone directives for segments.
|
|
19
|
+
*
|
|
20
|
+
* Falls back to deterministic mapping if LLM is unavailable.
|
|
21
|
+
*/
|
|
22
|
+
plan(segments: TTSScriptSegment[], baseRate?: number): Promise<VoicePacingDirective[]>;
|
|
23
|
+
private planWithLlm;
|
|
24
|
+
}
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/tts/pace-analyzer.ts
|
|
3
|
+
// Pacing profile per content type: speaking-rate multiplier, emphasis, tone,
// and the silence (in ms) placed before and after the segment.
var CONTENT_TYPE_PACING = {
  intro: {
    rate: 0.95,
    emphasis: "normal",
    tone: "authoritative",
    leadingSilenceMs: 0,
    trailingSilenceMs: 500
  },
  problem: {
    rate: 0.9,
    emphasis: "strong",
    tone: "urgent",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  },
  solution: {
    rate: 1,
    emphasis: "normal",
    tone: "calm",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  },
  metric: {
    rate: 0.85,
    emphasis: "strong",
    tone: "excited",
    leadingSilenceMs: 300,
    trailingSilenceMs: 600
  },
  cta: {
    rate: 0.9,
    emphasis: "strong",
    tone: "authoritative",
    leadingSilenceMs: 400,
    trailingSilenceMs: 0
  },
  transition: {
    rate: 1.1,
    emphasis: "reduced",
    tone: "neutral",
    leadingSilenceMs: 200,
    trailingSilenceMs: 300
  }
};

/**
 * Maps script segments to pacing directives using CONTENT_TYPE_PACING.
 */
class PaceAnalyzer {
  // Profile used when a content type has no entry in CONTENT_TYPE_PACING.
  static #NEUTRAL_PACING = {
    rate: 1,
    emphasis: "normal",
    tone: "neutral",
    leadingSilenceMs: 0,
    trailingSilenceMs: 500
  };

  /**
   * Look up the profile for a content type.
   * Only own keys of the table count, so values such as "constructor" or
   * `undefined` resolve to the neutral profile instead of a non-profile.
   */
  static #profileFor(contentType) {
    return Object.hasOwn(CONTENT_TYPE_PACING, contentType) ? CONTENT_TYPE_PACING[contentType] : PaceAnalyzer.#NEUTRAL_PACING;
  }

  /**
   * Build one pacing directive per segment.
   *
   * @param segments - Items exposing `sceneId` and `contentType`.
   * @param baseRate - Multiplier applied to every profile rate (default 1).
   * @returns Directives in segment order; segments with an unknown content
   *   type get the neutral profile rather than causing a TypeError.
   */
  analyze(segments, baseRate = 1) {
    return segments.map((segment) => {
      const profile = PaceAnalyzer.#profileFor(segment.contentType);
      return {
        sceneId: segment.sceneId,
        rate: profile.rate * baseRate,
        emphasis: profile.emphasis,
        tone: profile.tone,
        leadingSilenceMs: profile.leadingSilenceMs,
        trailingSilenceMs: profile.trailingSilenceMs
      };
    });
  }

  /**
   * Return a copy of the profile for `contentType`.
   *
   * @param contentType - Key of CONTENT_TYPE_PACING.
   * @returns A fresh object, so callers cannot mutate the shared table; the
   *   neutral profile is returned for unknown content types.
   */
  getDefaults(contentType) {
    return { ...PaceAnalyzer.#profileFor(contentType) };
  }
}
|
|
66
|
+
|
|
67
|
+
// src/tts/emphasis-planner.ts
|
|
68
|
+
/**
 * Plans pacing directives (rate, emphasis, tone, silences) per script segment.
 *
 * Uses the configured LLM when available and the PaceAnalyzer mapping
 * otherwise, or whenever the LLM call or its output cannot be used.
 */
class EmphasisPlanner {
  llm;
  model;
  paceAnalyzer;

  /**
   * @param options - Optional `{ llm, model }`; `llm` must expose `chat()`.
   */
  constructor(options) {
    this.llm = options?.llm;
    this.model = options?.model;
    this.paceAnalyzer = new PaceAnalyzer;
  }

  /**
   * Plan directives for `segments`.
   *
   * @param segments - Items exposing `sceneId`, `text` and `contentType`.
   * @param baseRate - Multiplier applied to every rate (default 1).
   * @returns Promise of the directives.
   */
  async plan(segments, baseRate = 1) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    try {
      return await this.planWithLlm(segments, baseRate);
    } catch {
      // Deliberate best-effort: any LLM, transport or JSON failure degrades to
      // the deterministic mapping instead of failing the caller.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
  }

  /**
   * Ask the LLM for directives and reconcile its answer with the segments.
   *
   * The result has exactly one directive per input segment, in segment order.
   * LLM entries are matched by `sceneId`; entries for unknown scenes are
   * dropped, and segments the LLM skipped get the PaceAnalyzer directive.
   *
   * @throws Whatever `llm.chat` or `JSON.parse` throws; `plan` handles that.
   */
  async planWithLlm(segments, baseRate) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const response = await this.llm.chat([
      {
        role: "system",
        content: [
          {
            type: "text",
            text: [
              "You are a voice director planning emphasis and pacing for TTS narration.",
              "For each segment, return a JSON array of directives.",
              "Each directive has: sceneId, rate (0.7-1.3), emphasis (reduced|normal|strong),",
              "tone (neutral|urgent|excited|calm|authoritative), leadingSilenceMs, trailingSilenceMs.",
              "Return ONLY a JSON array, no other text."
            ].join("\n")
          }
        ]
      },
      {
        role: "user",
        content: [
          {
            type: "text",
            text: JSON.stringify(segments.map((s) => ({
              sceneId: s.sceneId,
              text: s.text,
              contentType: s.contentType
            })))
          }
        ]
      }
    ], { model: this.model, temperature: 0.3, responseFormat: "json" });
    const textPart = response.message.content.find((p) => p.type === "text");
    if (!textPart) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const parsed = JSON.parse(textPart.text);
    if (!Array.isArray(parsed)) {
      // The model ignored the "JSON array" instruction.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const suggestions = new Map();
    for (const entry of parsed) {
      if (entry !== null && typeof entry === "object") {
        suggestions.set(entry.sceneId, entry);
      }
    }
    return segments.map((segment) => this.mergeDirective(segment, suggestions.get(segment.sceneId), baseRate));
  }

  /**
   * Combine one LLM suggestion with the deterministic directive for a segment.
   *
   * Each field of the suggestion is used only when valid: `rate` must be a
   * finite number (clamped to the 0.7-1.3 range requested in the prompt, then
   * scaled by `baseRate`), `emphasis` and `tone` must be one of the prompted
   * values, and silences must be finite non-negative numbers. Invalid or
   * missing fields fall back to the PaceAnalyzer value.
   */
  mergeDirective(segment, suggestion, baseRate) {
    let cachedDefaults;
    // Computed lazily, so a fully valid suggestion never needs the fallback.
    const defaults = () => (cachedDefaults ??= this.paceAnalyzer.analyze([segment], baseRate)[0]);
    if (suggestion === undefined) {
      return defaults();
    }
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    const isSilence = (value) => isFiniteNumber(value) && value >= 0;
    const emphasisValues = ["reduced", "normal", "strong"];
    const toneValues = ["neutral", "urgent", "excited", "calm", "authoritative"];
    const { rate, emphasis, tone, leadingSilenceMs, trailingSilenceMs } = suggestion;
    return {
      sceneId: segment.sceneId,
      rate: isFiniteNumber(rate) ? Math.min(1.3, Math.max(0.7, rate)) * baseRate : defaults().rate,
      emphasis: emphasisValues.includes(emphasis) ? emphasis : defaults().emphasis,
      tone: toneValues.includes(tone) ? tone : defaults().tone,
      leadingSilenceMs: isSilence(leadingSilenceMs) ? leadingSilenceMs : defaults().leadingSilenceMs,
      trailingSilenceMs: isSilence(trailingSilenceMs) ? trailingSilenceMs : defaults().trailingSilenceMs
    };
  }
}
|
|
133
|
+
export {
|
|
134
|
+
EmphasisPlanner
|
|
135
|
+
};
|