@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
// src/audio/duration-estimator.ts
|
|
2
|
+
/**
 * Converts between narration text length and speaking time using a
 * words-per-minute rate.
 */
class DurationEstimator {
  /** Speaking rate used when the caller supplies none, or an unusable one. */
  static DEFAULT_WPM = 150;

  /**
   * Estimates how long it takes to speak `text`, in whole seconds.
   *
   * @param {string} text - Narration text; null/undefined counts as empty.
   * @param {number} [wordsPerMinute] - Speaking rate; falls back to
   *   DEFAULT_WPM when omitted, non-finite, zero or negative.
   * @returns {number} Duration in seconds, rounded up.
   */
  estimateSeconds(text, wordsPerMinute) {
    // Multiply before dividing: an inexact intermediate quotient (words / wpm)
    // can land a hair above a whole number and make Math.ceil add a second.
    return Math.ceil(this.#countWords(text) * 60 / this.#resolveWpm(wordsPerMinute));
  }

  /**
   * Estimates how long it takes to speak `text`, in whole milliseconds.
   *
   * @param {string} text - Narration text; null/undefined counts as empty.
   * @param {number} [wordsPerMinute] - Speaking rate; falls back to
   *   DEFAULT_WPM when omitted, non-finite, zero or negative.
   * @returns {number} Duration in milliseconds, rounded up.
   */
  estimateMs(text, wordsPerMinute) {
    return Math.ceil(this.#countWords(text) * 60000 / this.#resolveWpm(wordsPerMinute));
  }

  /**
   * Estimates how many words fit into a spoken duration.
   *
   * @param {number} durationSeconds - Available speaking time in seconds.
   * @param {number} [wordsPerMinute] - Speaking rate; falls back to
   *   DEFAULT_WPM when omitted, non-finite, zero or negative.
   * @returns {number} Rounded word count; 0 for negative or non-finite durations.
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const words = Math.round(durationSeconds * this.#resolveWpm(wordsPerMinute) / 60);
    return Number.isFinite(words) && words > 0 ? words : 0;
  }

  /** Counts whitespace-separated words; missing text counts as zero words. */
  #countWords(text) {
    if (text == null) {
      return 0;
    }
    return String(text).split(/\s+/).filter(Boolean).length;
  }

  /** Returns a usable positive rate, so a 0/negative/NaN rate never divides into Infinity or NaN. */
  #resolveWpm(wordsPerMinute) {
    return Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
      ? wordsPerMinute
      : DurationEstimator.DEFAULT_WPM;
  }
}
|
|
19
|
+
|
|
20
|
+
// src/sync/timing-calculator.ts
|
|
21
|
+
/**
 * Converts per-scene voice durations (milliseconds) into frame counts for a
 * given frame rate.
 */
class TimingCalculator {
  /**
   * Builds a timing map from synthesized segments.
   *
   * @param {Array<{sceneId: *, durationMs: number, wordTimings?: Array<{word: *, startMs: number, endMs: number}>}>} segments
   *   Segments to time.
   * @param {number} fps - Target frame rate.
   * @param {number} [breathingRoomFactor=1.15] - Multiplier applied to the
   *   voice frame count to get the recommended scene length.
   * @returns {{totalDurationMs: number, segments: Array<object>, fps: number}}
   *   Timing map; each segment carries `durationInFrames` and
   *   `recommendedSceneDurationInFrames`, both rounded up.
   */
  calculate(segments, fps, breathingRoomFactor = 1.15) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, fps);
      const recommendedSceneDurationInFrames = Math.ceil(durationInFrames * breathingRoomFactor);
      // Copy only the known word-timing fields; stays undefined when absent.
      const wordTimings = seg.wordTimings?.map((wt) => ({
        word: wt.word,
        startMs: wt.startMs,
        endMs: wt.endMs
      }));
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        recommendedSceneDurationInFrames,
        wordTimings
      };
    });
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return {
      totalDurationMs,
      segments: timingSegments,
      fps
    };
  }

  /**
   * Re-derives frame counts of an existing timing map for a new frame rate,
   * keeping each segment's recommended/voice proportion.
   *
   * @param {{segments: Array<object>, fps: number}} timingMap - Map produced by `calculate`.
   * @param {number} newFps - Frame rate to convert to.
   * @returns {object} New timing map; the input is not mutated.
   */
  recalculateForFps(timingMap, newFps) {
    const segments = timingMap.segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, newFps);
      // Scale by the old recommended/voice proportion with the multiplication
      // first, so whole-number results (e.g. 14 * 9 / 7) stay exact. The
      // Math.max guard avoids dividing by a zero-length voice segment.
      const previousFrames = Math.max(seg.durationInFrames, 1);
      return {
        ...seg,
        durationInFrames,
        recommendedSceneDurationInFrames: Math.ceil(
          durationInFrames * seg.recommendedSceneDurationInFrames / previousFrames
        )
      };
    });
    return {
      ...timingMap,
      segments,
      fps: newFps
    };
  }

  /**
   * Milliseconds to frames, rounded up. Multiplies before dividing because
   * `ms / 1000 * fps` goes through an inexact quotient (280 ms at 25 fps
   * yields 7.000000000000001, which Math.ceil would turn into 8 frames).
   */
  static #msToFrames(durationMs, fps) {
    return Math.ceil(durationMs * fps / 1000);
  }
}
|
|
63
|
+
|
|
64
|
+
// src/sync/scene-adapter.ts
|
|
65
|
+
/**
 * Turns a scene plan into a narration script: one segment per scene that has
 * non-blank narration text, plus the joined text and summed estimate.
 */
class SceneAdapter {
  durationEstimator = new DurationEstimator();

  /**
   * @param {{scenes: Array<{id: *, narrationText?: string}>}} scenePlan
   * @returns {{fullText: string, segments: Array<object>, estimatedDurationSeconds: number}}
   *   Segments in scene order; `fullText` is their text joined with single spaces.
   */
  adapt(scenePlan) {
    // Keep only scenes whose narration is present and not just whitespace.
    const narrated = scenePlan.scenes.filter((candidate) => {
      if (!candidate.narrationText) {
        return false;
      }
      return candidate.narrationText.trim().length > 0;
    });
    const sceneCount = narrated.length;

    const segments = [];
    const spokenParts = [];
    let estimatedDurationSeconds = 0;
    narrated.forEach((scene, position) => {
      const text = scene.narrationText ?? "";
      const segment = {
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType: this.inferContentType(position, sceneCount)
      };
      segments.push(segment);
      spokenParts.push(segment.text);
      estimatedDurationSeconds += segment.estimatedDurationSeconds;
    });

    return { fullText: spokenParts.join(" "), segments, estimatedDurationSeconds };
  }

  /**
   * Guesses a content type purely from a segment's position.
   *
   * @param {number} index - Zero-based position among narrated scenes.
   * @param {number} total - Number of narrated scenes.
   * @returns {"intro"|"cta"|"problem"|"metric"|"solution"}
   */
  inferContentType(index, total) {
    const lastIndex = total - 1;
    if (index === 0) {
      return "intro";
    }
    if (index === lastIndex) {
      return "cta";
    }
    // "problem" and "metric" slots only exist in plans with more than three segments.
    if (total > 3) {
      if (index === 1) {
        return "problem";
      }
      if (index === lastIndex - 1) {
        return "metric";
      }
    }
    return "solution";
  }
}
|
|
94
|
+
|
|
95
|
+
// src/sync/duration-negotiator.ts
|
|
96
|
+
/**
 * Reconciles voice-over length with planned scene length, segment by segment,
 * and reports the adjustment chosen for each scene.
 */
class DurationNegotiator {
  /** Voice/scene ratio above which the voice counts as too long for the scene. */
  static UPPER_THRESHOLD = 1.1;
  /** Voice/scene ratio below which the voice counts as too short for the scene. */
  static LOWER_THRESHOLD = 0.7;
  /** Largest rate ever suggested; beyond it the scene is extended instead. */
  static MAX_RATE = 1.3;
  /** Smallest rate ever suggested. */
  static MIN_RATE = 0.8;

  /**
   * @param {{segments: Array<{sceneId: *, durationInFrames: number, recommendedSceneDurationInFrames: number}>}} timingMap
   *   Voice timing per scene.
   * @param {Map<*, number>} sceneDurations - Planned scene length in frames,
   *   keyed by sceneId. Missing, non-numeric, non-finite or negative entries
   *   are treated as "no planned duration" and leave the segment unchanged.
   * @returns {{timingMap: object, adjustments: Array<object>}} A copy of the
   *   timing map with updated segments, plus one adjustment record per
   *   segment in segment order. The input is not mutated.
   */
  negotiate(timingMap, sceneDurations) {
    const adjustments = [];
    const updatedSegments = timingMap.segments.map((seg) => {
      const outcome = this.#negotiateSegment(seg, sceneDurations.get(seg.sceneId));
      adjustments.push(outcome.adjustment);
      return outcome.segment;
    });
    return {
      timingMap: {
        ...timingMap,
        segments: updatedSegments
      },
      adjustments
    };
  }

  /** Decides the adjustment for one segment; returns the (possibly updated) segment and its record. */
  #negotiateSegment(seg, originalSceneDuration) {
    const recommended = seg.recommendedSceneDurationInFrames;
    const record = (original, action, finalFrames, suggestedRate) => ({
      sceneId: seg.sceneId,
      originalSceneDurationInFrames: original,
      voiceDurationInFrames: seg.durationInFrames,
      action,
      ...(suggestedRate === undefined ? {} : { suggestedRate }),
      finalSceneDurationInFrames: finalFrames
    });

    // A negative/NaN/non-numeric duration would yield a meaningless ratio and
    // could write a negative recommended length; handle it like a missing one.
    const usable = typeof originalSceneDuration === "number"
      && Number.isFinite(originalSceneDuration)
      && originalSceneDuration >= 0;
    if (!usable) {
      return { segment: seg, adjustment: record(recommended, "no_change", recommended) };
    }

    const ratio = seg.durationInFrames / originalSceneDuration;

    if (ratio > DurationNegotiator.UPPER_THRESHOLD) {
      // Voice overruns the scene: suggest speeding up while the needed rate is
      // within MAX_RATE, otherwise recommend extending the scene.
      const action = ratio > DurationNegotiator.MAX_RATE ? "extend_scene" : "suggest_rate_change";
      const suggestedRate = Math.min(ratio, DurationNegotiator.MAX_RATE);
      return {
        segment: seg,
        adjustment: record(originalSceneDuration, action, recommended, suggestedRate)
      };
    }

    if (ratio < DurationNegotiator.LOWER_THRESHOLD) {
      // Voice is much shorter than the scene: keep the planned scene length
      // and fill the gap with silence.
      const suggestedRate = Math.max(ratio, DurationNegotiator.MIN_RATE);
      return {
        segment: { ...seg, recommendedSceneDurationInFrames: originalSceneDuration },
        adjustment: record(originalSceneDuration, "pad_silence", originalSceneDuration, suggestedRate)
      };
    }

    return { segment: seg, adjustment: record(originalSceneDuration, "no_change", recommended) };
  }
}
|
|
161
|
+
export {
|
|
162
|
+
TimingCalculator,
|
|
163
|
+
SceneAdapter,
|
|
164
|
+
DurationNegotiator
|
|
165
|
+
};
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// src/audio/duration-estimator.ts
|
|
2
|
+
/**
 * Converts between narration text length and speaking time using a
 * words-per-minute rate.
 */
class DurationEstimator {
  /** Speaking rate used when the caller supplies none, or an unusable one. */
  static DEFAULT_WPM = 150;

  /**
   * Estimates how long it takes to speak `text`, in whole seconds.
   *
   * @param {string} text - Narration text; null/undefined counts as empty.
   * @param {number} [wordsPerMinute] - Speaking rate; falls back to
   *   DEFAULT_WPM when omitted, non-finite, zero or negative.
   * @returns {number} Duration in seconds, rounded up.
   */
  estimateSeconds(text, wordsPerMinute) {
    // Multiply before dividing: an inexact intermediate quotient (words / wpm)
    // can land a hair above a whole number and make Math.ceil add a second.
    return Math.ceil(this.#countWords(text) * 60 / this.#resolveWpm(wordsPerMinute));
  }

  /**
   * Estimates how long it takes to speak `text`, in whole milliseconds.
   *
   * @param {string} text - Narration text; null/undefined counts as empty.
   * @param {number} [wordsPerMinute] - Speaking rate; falls back to
   *   DEFAULT_WPM when omitted, non-finite, zero or negative.
   * @returns {number} Duration in milliseconds, rounded up.
   */
  estimateMs(text, wordsPerMinute) {
    return Math.ceil(this.#countWords(text) * 60000 / this.#resolveWpm(wordsPerMinute));
  }

  /**
   * Estimates how many words fit into a spoken duration.
   *
   * @param {number} durationSeconds - Available speaking time in seconds.
   * @param {number} [wordsPerMinute] - Speaking rate; falls back to
   *   DEFAULT_WPM when omitted, non-finite, zero or negative.
   * @returns {number} Rounded word count; 0 for negative or non-finite durations.
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const words = Math.round(durationSeconds * this.#resolveWpm(wordsPerMinute) / 60);
    return Number.isFinite(words) && words > 0 ? words : 0;
  }

  /** Counts whitespace-separated words; missing text counts as zero words. */
  #countWords(text) {
    if (text == null) {
      return 0;
    }
    return String(text).split(/\s+/).filter(Boolean).length;
  }

  /** Returns a usable positive rate, so a 0/negative/NaN rate never divides into Infinity or NaN. */
  #resolveWpm(wordsPerMinute) {
    return Number.isFinite(wordsPerMinute) && wordsPerMinute > 0
      ? wordsPerMinute
      : DurationEstimator.DEFAULT_WPM;
  }
}
|
|
19
|
+
|
|
20
|
+
// src/sync/scene-adapter.ts
|
|
21
|
+
/**
 * Turns a scene plan into a narration script: one segment per scene that has
 * non-blank narration text, plus the joined text and summed estimate.
 */
class SceneAdapter {
  durationEstimator = new DurationEstimator();

  /**
   * @param {{scenes: Array<{id: *, narrationText?: string}>}} scenePlan
   * @returns {{fullText: string, segments: Array<object>, estimatedDurationSeconds: number}}
   *   Segments in scene order; `fullText` is their text joined with single spaces.
   */
  adapt(scenePlan) {
    // Keep only scenes whose narration is present and not just whitespace.
    const narrated = scenePlan.scenes.filter((candidate) => {
      if (!candidate.narrationText) {
        return false;
      }
      return candidate.narrationText.trim().length > 0;
    });
    const sceneCount = narrated.length;

    const segments = [];
    const spokenParts = [];
    let estimatedDurationSeconds = 0;
    narrated.forEach((scene, position) => {
      const text = scene.narrationText ?? "";
      const segment = {
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType: this.inferContentType(position, sceneCount)
      };
      segments.push(segment);
      spokenParts.push(segment.text);
      estimatedDurationSeconds += segment.estimatedDurationSeconds;
    });

    return { fullText: spokenParts.join(" "), segments, estimatedDurationSeconds };
  }

  /**
   * Guesses a content type purely from a segment's position.
   *
   * @param {number} index - Zero-based position among narrated scenes.
   * @param {number} total - Number of narrated scenes.
   * @returns {"intro"|"cta"|"problem"|"metric"|"solution"}
   */
  inferContentType(index, total) {
    const lastIndex = total - 1;
    if (index === 0) {
      return "intro";
    }
    if (index === lastIndex) {
      return "cta";
    }
    // "problem" and "metric" slots only exist in plans with more than three segments.
    if (total > 3) {
      if (index === 1) {
        return "problem";
      }
      if (index === lastIndex - 1) {
        return "metric";
      }
    }
    return "solution";
  }
}
|
|
50
|
+
export {
|
|
51
|
+
SceneAdapter
|
|
52
|
+
};
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// src/sync/timing-calculator.ts
|
|
2
|
+
/**
 * Converts per-scene voice durations (milliseconds) into frame counts for a
 * given frame rate.
 */
class TimingCalculator {
  /**
   * Builds a timing map from synthesized segments.
   *
   * @param {Array<{sceneId: *, durationMs: number, wordTimings?: Array<{word: *, startMs: number, endMs: number}>}>} segments
   *   Segments to time.
   * @param {number} fps - Target frame rate.
   * @param {number} [breathingRoomFactor=1.15] - Multiplier applied to the
   *   voice frame count to get the recommended scene length.
   * @returns {{totalDurationMs: number, segments: Array<object>, fps: number}}
   *   Timing map; each segment carries `durationInFrames` and
   *   `recommendedSceneDurationInFrames`, both rounded up.
   */
  calculate(segments, fps, breathingRoomFactor = 1.15) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, fps);
      const recommendedSceneDurationInFrames = Math.ceil(durationInFrames * breathingRoomFactor);
      // Copy only the known word-timing fields; stays undefined when absent.
      const wordTimings = seg.wordTimings?.map((wt) => ({
        word: wt.word,
        startMs: wt.startMs,
        endMs: wt.endMs
      }));
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        recommendedSceneDurationInFrames,
        wordTimings
      };
    });
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return {
      totalDurationMs,
      segments: timingSegments,
      fps
    };
  }

  /**
   * Re-derives frame counts of an existing timing map for a new frame rate,
   * keeping each segment's recommended/voice proportion.
   *
   * @param {{segments: Array<object>, fps: number}} timingMap - Map produced by `calculate`.
   * @param {number} newFps - Frame rate to convert to.
   * @returns {object} New timing map; the input is not mutated.
   */
  recalculateForFps(timingMap, newFps) {
    const segments = timingMap.segments.map((seg) => {
      const durationInFrames = TimingCalculator.#msToFrames(seg.durationMs, newFps);
      // Scale by the old recommended/voice proportion with the multiplication
      // first, so whole-number results (e.g. 14 * 9 / 7) stay exact. The
      // Math.max guard avoids dividing by a zero-length voice segment.
      const previousFrames = Math.max(seg.durationInFrames, 1);
      return {
        ...seg,
        durationInFrames,
        recommendedSceneDurationInFrames: Math.ceil(
          durationInFrames * seg.recommendedSceneDurationInFrames / previousFrames
        )
      };
    });
    return {
      ...timingMap,
      segments,
      fps: newFps
    };
  }

  /**
   * Milliseconds to frames, rounded up. Multiplies before dividing because
   * `ms / 1000 * fps` goes through an inexact quotient (280 ms at 25 fps
   * yields 7.000000000000001, which Math.ceil would turn into 8 frames).
   */
  static #msToFrames(durationMs, fps) {
    return Math.ceil(durationMs * fps / 1000);
  }
}
|
|
44
|
+
export {
|
|
45
|
+
TimingCalculator
|
|
46
|
+
};
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
// src/audio/audio-concatenator.ts
|
|
2
|
+
/**
 * Joins audio segments end to end by concatenating their byte buffers.
 */
class AudioConcatenator {
  /**
   * @param {Array<{data: Uint8Array, format: string, sampleRateHz: number, durationMs?: number, channels?: number}>} segments
   *   Segments in playback order.
   * @returns {{data: Uint8Array, format: string, sampleRateHz: number, durationMs: number, channels: number}}
   *   An empty mono 44.1 kHz "wav" result when there is nothing to join, a
   *   shallow copy for a single segment, otherwise the combined audio.
   * @throws {Error} When segments differ in format, sample rate or channel
   *   count (a missing channel count is read as 1).
   */
  concatenate(segments) {
    const [firstSegment] = segments;
    // Covers both an empty list and a list whose first slot is empty.
    if (!firstSegment) {
      return AudioConcatenator.#emptyAudio();
    }
    if (segments.length === 1) {
      return { ...firstSegment };
    }

    const referenceFormat = firstSegment.format;
    const referenceSampleRate = firstSegment.sampleRateHz;
    const referenceChannels = firstSegment.channels ?? 1;

    let totalBytes = 0;
    let totalDurationMs = 0;
    for (const seg of segments) {
      if (seg.format !== referenceFormat) {
        throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
      }
      if (seg.sampleRateHz !== referenceSampleRate) {
        throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
      }
      // Byte-wise joining of buffers with different channel layouts would
      // produce garbled audio, so reject it like the other mismatches.
      const segChannels = seg.channels ?? 1;
      if (segChannels !== referenceChannels) {
        throw new Error(`Channel count mismatch: expected ${referenceChannels}, got ${segChannels}`);
      }
      totalBytes += seg.data.length;
      totalDurationMs += seg.durationMs ?? 0;
    }

    const combined = new Uint8Array(totalBytes);
    let offset = 0;
    for (const seg of segments) {
      combined.set(seg.data, offset);
      offset += seg.data.length;
    }

    return {
      data: combined,
      format: referenceFormat,
      sampleRateHz: referenceSampleRate,
      durationMs: totalDurationMs,
      channels: referenceChannels
    };
  }

  /** Result returned when there is no audio to join. */
  static #emptyAudio() {
    return {
      data: new Uint8Array(0),
      format: "wav",
      sampleRateHz: 44100,
      durationMs: 0,
      channels: 1
    };
  }
}
|
|
54
|
+
|
|
55
|
+
// src/audio/silence-generator.ts
|
|
56
|
+
/**
 * Produces zero-filled audio buffers to use as pauses.
 */
class SilenceGenerator {
  /**
   * @param {number} durationMs - Pause length; negative or non-finite values
   *   are treated as 0 instead of failing on a negative buffer size.
   * @param {string} [format="wav"] - Format label copied onto the result.
   * @param {number} [sampleRateHz=44100] - Samples per second per channel.
   * @param {number} [channels=1] - Channel count.
   * @param {number} [bytesPerSample=2] - Sample width in bytes.
   * @returns {{data: Uint8Array, format: string, sampleRateHz: number, durationMs: number, channels: number}}
   *   Zero-filled buffer sized for the requested duration.
   */
  generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1, bytesPerSample = 2) {
    const safeDurationMs = Number.isFinite(durationMs) && durationMs > 0 ? durationMs : 0;
    // Round up so a fractional sample still gets a slot in the buffer.
    const totalSamples = Math.ceil(sampleRateHz * safeDurationMs / 1000);
    // NOTE(review): all-zero bytes are silence for signed linear PCM; confirm
    // this holds for every format this is called with.
    const data = new Uint8Array(totalSamples * bytesPerSample * channels);
    return {
      data,
      format,
      sampleRateHz,
      durationMs: safeDurationMs,
      channels
    };
  }
}
|
|
71
|
+
|
|
72
|
+
// src/tts/audio-assembler.ts
|
|
73
|
+
/**
 * Assembles synthesized scene audio into one track, inserting silence before
 * and after segments as directed.
 */
class AudioAssembler {
  concatenator = new AudioConcatenator();
  silenceGenerator = new SilenceGenerator();

  /**
   * @param {Array<{sceneId: *, audio: {format: string, sampleRateHz: number, channels?: number}}>} segments
   *   Synthesized segments in playback order; empty slots are skipped.
   * @param {Array<{sceneId: *, leadingSilenceMs?: number, trailingSilenceMs?: number}>} [directives=[]]
   *   Optional per-scene silence settings, matched by sceneId. May be omitted
   *   or null, in which case only the default pause is used.
   * @param {number} [defaultPauseMs=500] - Pause after every segment except
   *   the last when its directive gives no trailing silence.
   * @returns {object} The concatenator's result, or an empty mono 44.1 kHz
   *   "wav" result when there are no segments.
   */
  assemble(segments, directives = [], defaultPauseMs = 500) {
    const [firstSegment] = segments;
    // Covers both an empty list and a list whose first slot is empty.
    if (!firstSegment) {
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }

    const directiveMap = new Map((directives ?? []).map((d) => [d.sceneId, d]));
    // Every generated pause copies the first segment's audio parameters so the
    // parts can be joined.
    const reference = firstSegment.audio;
    const lastIndex = segments.length - 1;
    const parts = [];

    segments.forEach((segment, i) => {
      if (!segment) {
        return;
      }
      const directive = directiveMap.get(segment.sceneId);

      const leadingSilenceMs = directive?.leadingSilenceMs ?? 0;
      if (leadingSilenceMs > 0) {
        parts.push(this.#silence(leadingSilenceMs, reference));
      }

      parts.push(segment.audio);

      const trailingSilenceMs = directive?.trailingSilenceMs ?? (i < lastIndex ? defaultPauseMs : 0);
      if (trailingSilenceMs > 0) {
        parts.push(this.#silence(trailingSilenceMs, reference));
      }
    });

    return this.concatenator.concatenate(parts);
  }

  /** Builds a pause matching the reference audio's format, sample rate and channels. */
  #silence(durationMs, reference) {
    return this.silenceGenerator.generate(
      durationMs,
      reference.format,
      reference.sampleRateHz,
      reference.channels ?? 1
    );
  }
}
|
|
118
|
+
export {
|
|
119
|
+
AudioAssembler
|
|
120
|
+
};
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
// src/tts/pace-analyzer.ts
|
|
2
|
+
/**
 * Default pacing per content type: a rate multiplier, emphasis level, tone,
 * and the silence (in milliseconds) to place before and after the segment.
 * Frozen at both levels because it is a shared defaults table; callers that
 * need a mutable directive must copy an entry.
 */
const CONTENT_TYPE_PACING = Object.freeze({
  intro: Object.freeze({
    rate: 0.95,
    emphasis: "normal",
    tone: "authoritative",
    leadingSilenceMs: 0,
    trailingSilenceMs: 500
  }),
  problem: Object.freeze({
    rate: 0.9,
    emphasis: "strong",
    tone: "urgent",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  }),
  solution: Object.freeze({
    rate: 1,
    emphasis: "normal",
    tone: "calm",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  }),
  metric: Object.freeze({
    rate: 0.85,
    emphasis: "strong",
    tone: "excited",
    leadingSilenceMs: 300,
    trailingSilenceMs: 600
  }),
  cta: Object.freeze({
    rate: 0.9,
    emphasis: "strong",
    tone: "authoritative",
    leadingSilenceMs: 400,
    trailingSilenceMs: 0
  }),
  transition: Object.freeze({
    rate: 1.1,
    emphasis: "reduced",
    tone: "neutral",
    leadingSilenceMs: 200,
    trailingSilenceMs: 300
  })
});
|
|
46
|
+
|
|
47
|
+
/**
 * Rule-based pacing: maps each segment's content type to the defaults in
 * CONTENT_TYPE_PACING.
 */
class PaceAnalyzer {
  /**
   * @param {Array<{sceneId: *, contentType: string}>} segments - Segments to pace.
   * @param {number} [baseRate=1] - Multiplier applied to each default rate.
   * @returns {Array<{sceneId: *, rate: number, emphasis: string, tone: string, leadingSilenceMs: number, trailingSilenceMs: number}>}
   *   One directive per segment, in input order.
   */
  analyze(segments, baseRate = 1) {
    return segments.map((segment) => {
      const { rate, emphasis, tone, leadingSilenceMs, trailingSilenceMs } =
        PaceAnalyzer.#lookup(segment.contentType);
      return {
        sceneId: segment.sceneId,
        rate: rate * baseRate,
        emphasis,
        tone,
        leadingSilenceMs,
        trailingSilenceMs
      };
    });
  }

  /**
   * @param {string} contentType - Content type to look up.
   * @returns {object} A fresh copy of that type's pacing defaults; unknown
   *   types get the "solution" defaults.
   */
  getDefaults(contentType) {
    return { ...PaceAnalyzer.#lookup(contentType) };
  }

  /**
   * Resolves the pacing entry for a content type. Unknown types, and inherited
   * keys such as "constructor", fall back to the "solution" entry instead of
   * failing on an undefined entry or producing a NaN rate.
   */
  static #lookup(contentType) {
    const known = Object.prototype.hasOwnProperty.call(CONTENT_TYPE_PACING, contentType);
    return known ? CONTENT_TYPE_PACING[contentType] : CONTENT_TYPE_PACING.solution;
  }
}
|
|
65
|
+
|
|
66
|
+
// src/tts/emphasis-planner.ts
|
|
67
|
+
/**
 * Plans per-scene pacing directives, asking an LLM when one is configured and
 * falling back to the rule-based PaceAnalyzer otherwise.
 */
class EmphasisPlanner {
  /** Bounds advertised to the model in the system prompt; replies are clamped to them. */
  static MIN_RATE = 0.7;
  static MAX_RATE = 1.3;
  /** Values advertised to the model in the system prompt; anything else is rejected. */
  static EMPHASIS_LEVELS = ["reduced", "normal", "strong"];
  static TONES = ["neutral", "urgent", "excited", "calm", "authoritative"];

  llm;
  model;
  paceAnalyzer;

  /**
   * @param {{llm?: {chat: Function}, model?: string}} [options] - Optional
   *   chat client and model name; without `llm` only rule-based planning is used.
   */
  constructor(options) {
    this.llm = options?.llm;
    this.model = options?.model;
    this.paceAnalyzer = new PaceAnalyzer();
  }

  /**
   * @param {Array<{sceneId: *, text: string, contentType: string}>} segments - Segments to plan.
   * @param {number} [baseRate=1] - Multiplier applied to every directive's rate.
   * @returns {Promise<Array<object>>} One directive per segment.
   */
  async plan(segments, baseRate = 1) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    try {
      return await this.planWithLlm(segments, baseRate);
    } catch {
      // Deliberate best-effort: any LLM, transport or parse failure degrades
      // to the rule-based plan instead of failing the caller.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
  }

  /**
   * Asks the LLM for directives and merges its reply with the rule-based
   * defaults: each segment gets exactly one directive, in segment order, and
   * any missing or invalid field falls back to the default for that segment.
   *
   * @param {Array<{sceneId: *, text: string, contentType: string}>} segments - Segments to plan.
   * @param {number} baseRate - Multiplier applied to every directive's rate.
   * @returns {Promise<Array<object>>} One directive per segment.
   * @throws Propagates chat failures and JSON syntax errors (`plan` catches them).
   */
  async planWithLlm(segments, baseRate) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const systemPrompt = [
      "You are a voice director planning emphasis and pacing for TTS narration.",
      "For each segment, return a JSON array of directives.",
      "Each directive has: sceneId, rate (0.7-1.3), emphasis (reduced|normal|strong),",
      "tone (neutral|urgent|excited|calm|authoritative), leadingSilenceMs, trailingSilenceMs.",
      "Return ONLY a JSON array, no other text."
    ].join("\n");
    const userPayload = JSON.stringify(segments.map((s) => ({
      sceneId: s.sceneId,
      text: s.text,
      contentType: s.contentType
    })));

    const response = await this.llm.chat([
      { role: "system", content: [{ type: "text", text: systemPrompt }] },
      { role: "user", content: [{ type: "text", text: userPayload }] }
    ], { model: this.model, temperature: 0.3, responseFormat: "json" });

    const text = response.message.content.find((p) => p.type === "text");
    if (!text || text.type !== "text") {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const parsed = JSON.parse(text.text);
    if (!Array.isArray(parsed)) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }

    const bySceneId = new Map();
    for (const candidate of parsed) {
      if (candidate && typeof candidate === "object") {
        bySceneId.set(candidate.sceneId, candidate);
      }
    }
    // Unscaled defaults (baseRate 1) so baseRate is applied exactly once below.
    return this.paceAnalyzer
      .analyze(segments, 1)
      .map((fallback) => EmphasisPlanner.#merge(fallback, bySceneId.get(fallback.sceneId), baseRate));
  }

  /** Overlays the valid fields of an LLM directive on the rule-based default. */
  static #merge(fallback, candidate, baseRate) {
    if (!candidate) {
      return { ...fallback, rate: fallback.rate * baseRate };
    }
    const rate = Number.isFinite(candidate.rate)
      ? Math.min(Math.max(candidate.rate, EmphasisPlanner.MIN_RATE), EmphasisPlanner.MAX_RATE)
      : fallback.rate;
    const silence = (value, defaultMs) => (Number.isFinite(value) && value >= 0 ? value : defaultMs);
    return {
      sceneId: fallback.sceneId,
      rate: rate * baseRate,
      emphasis: EmphasisPlanner.EMPHASIS_LEVELS.includes(candidate.emphasis) ? candidate.emphasis : fallback.emphasis,
      tone: EmphasisPlanner.TONES.includes(candidate.tone) ? candidate.tone : fallback.tone,
      leadingSilenceMs: silence(candidate.leadingSilenceMs, fallback.leadingSilenceMs),
      trailingSilenceMs: silence(candidate.trailingSilenceMs, fallback.trailingSilenceMs)
    };
  }
}
|
|
132
|
+
export {
|
|
133
|
+
EmphasisPlanner
|
|
134
|
+
};
|