@contractspec/lib.voice 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/dist/audio/audio-concatenator.d.ts +15 -0
  2. package/dist/audio/audio-concatenator.js +57 -0
  3. package/dist/audio/duration-estimator.d.ts +31 -0
  4. package/dist/audio/duration-estimator.js +22 -0
  5. package/dist/audio/format-converter.d.ts +17 -0
  6. package/dist/audio/format-converter.js +28 -0
  7. package/dist/audio/index.d.ts +4 -0
  8. package/dist/audio/index.js +121 -0
  9. package/dist/audio/silence-generator.d.ts +16 -0
  10. package/dist/audio/silence-generator.js +20 -0
  11. package/dist/browser/audio/audio-concatenator.js +56 -0
  12. package/dist/browser/audio/duration-estimator.js +21 -0
  13. package/dist/browser/audio/format-converter.js +27 -0
  14. package/dist/browser/audio/index.js +120 -0
  15. package/dist/browser/audio/silence-generator.js +19 -0
  16. package/dist/browser/conversational/index.js +241 -0
  17. package/dist/browser/conversational/response-orchestrator.js +62 -0
  18. package/dist/browser/conversational/transcript-builder.js +63 -0
  19. package/dist/browser/conversational/turn-detector.js +43 -0
  20. package/dist/browser/conversational/types.js +0 -0
  21. package/dist/browser/conversational/voice-session-manager.js +137 -0
  22. package/dist/browser/docs/conversational.docblock.js +5 -0
  23. package/dist/browser/docs/stt.docblock.js +5 -0
  24. package/dist/browser/docs/sync.docblock.js +5 -0
  25. package/dist/browser/docs/tts.docblock.js +5 -0
  26. package/dist/browser/docs/voice.docblock.js +5 -0
  27. package/dist/browser/i18n/catalogs/en.js +91 -0
  28. package/dist/browser/i18n/catalogs/es.js +91 -0
  29. package/dist/browser/i18n/catalogs/fr.js +91 -0
  30. package/dist/browser/i18n/catalogs/index.js +271 -0
  31. package/dist/browser/i18n/index.js +335 -0
  32. package/dist/browser/i18n/keys.js +38 -0
  33. package/dist/browser/i18n/locale.js +13 -0
  34. package/dist/browser/i18n/messages.js +283 -0
  35. package/dist/browser/index.js +1070 -0
  36. package/dist/browser/stt/diarization-mapper.js +42 -0
  37. package/dist/browser/stt/index.js +222 -0
  38. package/dist/browser/stt/segment-splitter.js +36 -0
  39. package/dist/browser/stt/subtitle-formatter.js +51 -0
  40. package/dist/browser/stt/transcriber.js +219 -0
  41. package/dist/browser/stt/types.js +0 -0
  42. package/dist/browser/sync/duration-negotiator.js +69 -0
  43. package/dist/browser/sync/index.js +165 -0
  44. package/dist/browser/sync/scene-adapter.js +52 -0
  45. package/dist/browser/sync/timing-calculator.js +46 -0
  46. package/dist/browser/tts/audio-assembler.js +120 -0
  47. package/dist/browser/tts/emphasis-planner.js +134 -0
  48. package/dist/browser/tts/index.js +439 -0
  49. package/dist/browser/tts/pace-analyzer.js +67 -0
  50. package/dist/browser/tts/segment-synthesizer.js +36 -0
  51. package/dist/browser/tts/types.js +0 -0
  52. package/dist/browser/tts/voice-synthesizer.js +435 -0
  53. package/dist/browser/types.js +0 -0
  54. package/dist/conversational/index.d.ts +5 -0
  55. package/dist/conversational/index.js +242 -0
  56. package/dist/conversational/response-orchestrator.d.ts +26 -0
  57. package/dist/conversational/response-orchestrator.js +63 -0
  58. package/dist/conversational/transcript-builder.d.ts +25 -0
  59. package/dist/conversational/transcript-builder.js +64 -0
  60. package/dist/conversational/turn-detector.d.ts +31 -0
  61. package/dist/conversational/turn-detector.js +44 -0
  62. package/dist/conversational/types.d.ts +55 -0
  63. package/dist/conversational/types.js +1 -0
  64. package/dist/conversational/voice-session-manager.d.ts +17 -0
  65. package/dist/conversational/voice-session-manager.js +138 -0
  66. package/dist/docs/conversational.docblock.d.ts +14 -0
  67. package/dist/docs/conversational.docblock.js +6 -0
  68. package/dist/docs/stt.docblock.d.ts +12 -0
  69. package/dist/docs/stt.docblock.js +6 -0
  70. package/dist/docs/sync.docblock.d.ts +12 -0
  71. package/dist/docs/sync.docblock.js +6 -0
  72. package/dist/docs/tts.docblock.d.ts +12 -0
  73. package/dist/docs/tts.docblock.js +6 -0
  74. package/dist/docs/voice.docblock.d.ts +22 -0
  75. package/dist/docs/voice.docblock.js +6 -0
  76. package/dist/i18n/catalogs/en.d.ts +6 -0
  77. package/dist/i18n/catalogs/en.js +92 -0
  78. package/dist/i18n/catalogs/es.d.ts +4 -0
  79. package/dist/i18n/catalogs/es.js +92 -0
  80. package/dist/i18n/catalogs/fr.d.ts +4 -0
  81. package/dist/i18n/catalogs/fr.js +92 -0
  82. package/dist/i18n/catalogs/index.d.ts +3 -0
  83. package/dist/i18n/catalogs/index.js +272 -0
  84. package/dist/i18n/index.d.ts +20 -0
  85. package/dist/i18n/index.js +336 -0
  86. package/dist/i18n/keys.d.ts +50 -0
  87. package/dist/i18n/keys.js +39 -0
  88. package/dist/i18n/locale.d.ts +6 -0
  89. package/dist/i18n/locale.js +14 -0
  90. package/dist/i18n/messages.d.ts +13 -0
  91. package/dist/i18n/messages.js +284 -0
  92. package/dist/index.d.ts +6 -0
  93. package/dist/index.js +1071 -0
  94. package/dist/node/audio/audio-concatenator.js +56 -0
  95. package/dist/node/audio/duration-estimator.js +21 -0
  96. package/dist/node/audio/format-converter.js +27 -0
  97. package/dist/node/audio/index.js +120 -0
  98. package/dist/node/audio/silence-generator.js +19 -0
  99. package/dist/node/conversational/index.js +241 -0
  100. package/dist/node/conversational/response-orchestrator.js +62 -0
  101. package/dist/node/conversational/transcript-builder.js +63 -0
  102. package/dist/node/conversational/turn-detector.js +43 -0
  103. package/dist/node/conversational/types.js +0 -0
  104. package/dist/node/conversational/voice-session-manager.js +137 -0
  105. package/dist/node/docs/conversational.docblock.js +5 -0
  106. package/dist/node/docs/stt.docblock.js +5 -0
  107. package/dist/node/docs/sync.docblock.js +5 -0
  108. package/dist/node/docs/tts.docblock.js +5 -0
  109. package/dist/node/docs/voice.docblock.js +5 -0
  110. package/dist/node/i18n/catalogs/en.js +91 -0
  111. package/dist/node/i18n/catalogs/es.js +91 -0
  112. package/dist/node/i18n/catalogs/fr.js +91 -0
  113. package/dist/node/i18n/catalogs/index.js +271 -0
  114. package/dist/node/i18n/index.js +335 -0
  115. package/dist/node/i18n/keys.js +38 -0
  116. package/dist/node/i18n/locale.js +13 -0
  117. package/dist/node/i18n/messages.js +283 -0
  118. package/dist/node/index.js +1070 -0
  119. package/dist/node/stt/diarization-mapper.js +42 -0
  120. package/dist/node/stt/index.js +222 -0
  121. package/dist/node/stt/segment-splitter.js +36 -0
  122. package/dist/node/stt/subtitle-formatter.js +51 -0
  123. package/dist/node/stt/transcriber.js +219 -0
  124. package/dist/node/stt/types.js +0 -0
  125. package/dist/node/sync/duration-negotiator.js +69 -0
  126. package/dist/node/sync/index.js +165 -0
  127. package/dist/node/sync/scene-adapter.js +52 -0
  128. package/dist/node/sync/timing-calculator.js +46 -0
  129. package/dist/node/tts/audio-assembler.js +120 -0
  130. package/dist/node/tts/emphasis-planner.js +134 -0
  131. package/dist/node/tts/index.js +439 -0
  132. package/dist/node/tts/pace-analyzer.js +67 -0
  133. package/dist/node/tts/segment-synthesizer.js +36 -0
  134. package/dist/node/tts/types.js +0 -0
  135. package/dist/node/tts/voice-synthesizer.js +435 -0
  136. package/dist/node/types.js +0 -0
  137. package/dist/stt/diarization-mapper.d.ts +19 -0
  138. package/dist/stt/diarization-mapper.js +43 -0
  139. package/dist/stt/index.d.ts +5 -0
  140. package/dist/stt/index.js +223 -0
  141. package/dist/stt/segment-splitter.d.ts +19 -0
  142. package/dist/stt/segment-splitter.js +37 -0
  143. package/dist/stt/subtitle-formatter.d.ts +19 -0
  144. package/dist/stt/subtitle-formatter.js +52 -0
  145. package/dist/stt/transcriber.d.ts +21 -0
  146. package/dist/stt/transcriber.js +220 -0
  147. package/dist/stt/types.d.ts +44 -0
  148. package/dist/stt/types.js +1 -0
  149. package/dist/sync/duration-negotiator.d.ts +37 -0
  150. package/dist/sync/duration-negotiator.js +70 -0
  151. package/dist/sync/index.d.ts +3 -0
  152. package/dist/sync/index.js +166 -0
  153. package/dist/sync/scene-adapter.d.ts +29 -0
  154. package/dist/sync/scene-adapter.js +53 -0
  155. package/dist/sync/timing-calculator.d.ts +21 -0
  156. package/dist/sync/timing-calculator.js +47 -0
  157. package/dist/tts/audio-assembler.d.ts +19 -0
  158. package/dist/tts/audio-assembler.js +121 -0
  159. package/dist/tts/emphasis-planner.d.ts +24 -0
  160. package/dist/tts/emphasis-planner.js +135 -0
  161. package/dist/tts/index.d.ts +6 -0
  162. package/dist/tts/index.js +440 -0
  163. package/dist/tts/pace-analyzer.d.ts +30 -0
  164. package/dist/tts/pace-analyzer.js +68 -0
  165. package/dist/tts/segment-synthesizer.d.ts +21 -0
  166. package/dist/tts/segment-synthesizer.js +37 -0
  167. package/dist/tts/types.d.ts +76 -0
  168. package/dist/tts/types.js +1 -0
  169. package/dist/tts/voice-synthesizer.d.ts +28 -0
  170. package/dist/tts/voice-synthesizer.js +436 -0
  171. package/dist/types.d.ts +12 -0
  172. package/dist/types.js +1 -0
  173. package/package.json +760 -0
package/dist/index.js ADDED
@@ -0,0 +1,1071 @@
1
// @bun
// src/audio/audio-concatenator.ts

/** Joins multiple audio clips (same format/sample rate) into one contiguous clip. */
class AudioConcatenator {
  /**
   * Concatenate clips back-to-back.
   * @param segments clips shaped { data: Uint8Array, format, sampleRateHz, durationMs?, channels? }.
   * @returns a single clip; an empty 44.1 kHz mono WAV descriptor when no segments are given.
   * @throws {Error} when a segment's format or sample rate differs from the first segment's.
   */
  concatenate(segments) {
    const [firstSegment] = segments;
    // Zero-length input and a sparse first slot both yield the same empty clip.
    if (segments.length === 0 || !firstSegment) {
      return AudioConcatenator.#emptyClip();
    }
    if (segments.length === 1) {
      return { ...firstSegment };
    }
    const referenceFormat = firstSegment.format;
    const referenceSampleRate = firstSegment.sampleRateHz;
    const referenceChannels = firstSegment.channels ?? 1;
    // NOTE(review): only format and sample rate are validated here; a
    // channel-count mismatch is silently accepted — confirm that is intended.
    for (const seg of segments) {
      if (seg.format !== referenceFormat) {
        throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
      }
      if (seg.sampleRateHz !== referenceSampleRate) {
        throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
      }
    }
    // Raw byte concatenation of the payloads.
    const totalBytes = segments.reduce((sum, s) => sum + s.data.length, 0);
    const combined = new Uint8Array(totalBytes);
    let offset = 0;
    for (const seg of segments) {
      combined.set(seg.data, offset);
      offset += seg.data.length;
    }
    const totalDurationMs = segments.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
    return {
      data: combined,
      format: referenceFormat,
      sampleRateHz: referenceSampleRate,
      durationMs: totalDurationMs,
      channels: referenceChannels
    };
  }

  /** Shared empty-result descriptor (the original duplicated this literal twice). */
  static #emptyClip() {
    return {
      data: new Uint8Array(0),
      format: "wav",
      sampleRateHz: 44100,
      durationMs: 0,
      channels: 1
    };
  }
}
55
+
56
// src/audio/duration-estimator.ts

/** Rough speech-duration estimates driven by a words-per-minute reading rate. */
class DurationEstimator {
  static DEFAULT_WPM = 150;

  /** Whole seconds (rounded up) needed to narrate `text` at the given rate. */
  estimateSeconds(text, wordsPerMinute) {
    const wpm = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Math.ceil(DurationEstimator.#countWords(text) / wpm * 60);
  }

  /** Whole milliseconds (rounded up) needed to narrate `text` at the given rate. */
  estimateMs(text, wordsPerMinute) {
    const wpm = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Math.ceil(DurationEstimator.#countWords(text) / wpm * 60 * 1000);
  }

  /** Approximate number of words that fit in `durationSeconds` of narration. */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const wpm = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Math.round(durationSeconds / 60 * wpm);
  }

  /** Whitespace-delimited word count, ignoring empty tokens. */
  static #countWords(text) {
    return text.split(/\s+/).filter(Boolean).length;
  }
}
74
+
75
// src/audio/format-converter.ts

/** Declares audio format conversion paths; `convert` only relabels metadata. */
class FormatConverter {
  /**
   * Return `audio` tagged with `targetFormat` (same object when already matching).
   * NOTE(review): only the `format` field is rewritten — the byte payload is
   * not transcoded here. Confirm real conversion happens elsewhere.
   */
  convert(audio, targetFormat) {
    return audio.format === targetFormat ? audio : { ...audio, format: targetFormat };
  }

  /** True when a conversion path from `from` to `to` is declared (identity always is). */
  isSupported(from, to) {
    if (from === to) {
      return true;
    }
    const supportedPaths = {
      wav: ["mp3", "ogg", "pcm", "opus"],
      mp3: ["wav"],
      ogg: ["wav"],
      pcm: ["wav"],
      opus: ["wav"]
    };
    return supportedPaths[from]?.includes(to) ?? false;
  }
}
99
+
100
// src/audio/silence-generator.ts

/** Produces zero-filled (silent) audio clips, assuming 16-bit samples. */
class SilenceGenerator {
  /**
   * Build a silent clip of `durationMs` milliseconds.
   * The payload size assumes 2 bytes per sample regardless of `format`.
   */
  generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1) {
    const bytesPerSample = 2;
    const totalSamples = Math.ceil(sampleRateHz * durationMs / 1000);
    return {
      // Uint8Array is zero-initialized, which is exactly silence for PCM.
      data: new Uint8Array(totalSamples * bytesPerSample * channels),
      format,
      sampleRateHz,
      durationMs,
      channels
    };
  }
}
116
// src/conversational/transcript-builder.ts

/** Accumulates conversation turns from a stream of voice-session events. */
class TranscriptBuilder {
  turns = [];
  currentTurn = null;
  sessionStartMs = Date.now();

  /** Snapshot copy of all completed turns. */
  getTranscript() {
    return this.turns.slice();
  }

  /** Plain-text rendering: one `[role] text` line per completed turn. */
  toText() {
    return this.turns.map((turn) => `[${turn.role}] ${turn.text}`).join("\n");
  }

  /** Number of completed turns so far. */
  getTurnCount() {
    return this.turns.length;
  }

  /** Fold one session event into the transcript state. */
  processEvent(event) {
    const elapsed = () => Date.now() - this.sessionStartMs;
    if (event.type === "session_started") {
      this.sessionStartMs = Date.now();
    } else if (event.type === "user_speech_started") {
      this.currentTurn = { role: "user", startMs: elapsed() };
    } else if (event.type === "user_speech_ended") {
      if (this.currentTurn?.role === "user") {
        this.currentTurn.text = event.transcript;
        this.currentTurn.endMs = elapsed();
        this.turns.push(this.currentTurn);
        this.currentTurn = null;
      }
    } else if (event.type === "agent_speech_started") {
      this.currentTurn = { role: "agent", text: event.text, startMs: elapsed() };
    } else if (event.type === "agent_speech_ended") {
      if (this.currentTurn?.role === "agent") {
        this.currentTurn.endMs = elapsed();
        this.turns.push(this.currentTurn);
        this.currentTurn = null;
      }
    }
    // "transcript" events and unknown types are intentionally ignored.
  }

  /** Clear all turns and restart the session clock. */
  reset() {
    this.turns.length = 0;
    this.currentTurn = null;
    this.sessionStartMs = Date.now();
  }
}
176
+
177
// src/conversational/voice-session-manager.ts

/** Wraps a conversational provider session with tracked state and a live transcript. */
class VoiceSessionManager {
  provider;

  constructor(options) {
    this.provider = options.conversational;
  }

  /**
   * Open a provider session and return it together with a mutable `state`
   * object that is kept in sync as events are consumed from `events`.
   */
  async startSession(config) {
    const transcriptBuilder = new TranscriptBuilder;
    const session = await this.provider.startSession({
      voiceId: config.voiceId,
      language: config.language,
      systemPrompt: config.systemPrompt,
      llmModel: config.llmModel,
      inputFormat: config.inputFormat,
      outputFormat: config.outputFormat,
      turnDetection: config.turnDetection,
      silenceThresholdMs: config.silenceThresholdMs,
      maxDurationSeconds: config.maxDurationSeconds
    });
    const state = {
      sessionId: "",
      status: "connecting",
      currentTurn: "idle",
      turnCount: 0,
      durationMs: 0,
      transcript: []
    };
    return {
      state,
      sendAudio: (chunk) => session.sendAudio(chunk),
      sendText: (text) => session.sendText(text),
      interrupt: () => session.interrupt(),
      close: async () => {
        const summary = await session.close();
        state.status = "ended";
        return summary;
      },
      events: this.wrapEvents(session.events, state, transcriptBuilder)
    };
  }

  /** Pass provider events through while updating `state` and the transcript. */
  async* wrapEvents(events, state, transcriptBuilder) {
    for await (const event of events) {
      transcriptBuilder.processEvent(event);
      if (event.type === "session_started") {
        state.sessionId = event.sessionId;
        state.status = "active";
      } else if (event.type === "user_speech_started") {
        state.currentTurn = "user";
      } else if (event.type === "user_speech_ended" || event.type === "agent_speech_ended") {
        // Either party finishing counts as one completed turn.
        state.currentTurn = "idle";
        state.turnCount += 1;
      } else if (event.type === "agent_speech_started") {
        state.currentTurn = "agent";
      } else if (event.type === "session_ended") {
        state.status = "ended";
        state.durationMs = event.durationMs;
      }
      state.transcript = transcriptBuilder.getTranscript();
      yield event;
    }
  }
}
250
+
251
// src/conversational/turn-detector.ts

/** Energy-based end-of-turn detector over 16-bit little-endian PCM chunks. */
class TurnDetector {
  silenceThresholdMs;
  energyThreshold;
  silenceStartMs = null;

  constructor(silenceThresholdMs = 800, energyThreshold = 0.01) {
    this.silenceThresholdMs = silenceThresholdMs;
    this.energyThreshold = energyThreshold;
  }

  /**
   * Feed one audio chunk with its timestamp; returns true once continuous
   * silence has lasted at least `silenceThresholdMs`.
   */
  processChunk(chunk, timestampMs) {
    if (this.calculateEnergy(chunk) > this.energyThreshold) {
      // Speech resets the silence window.
      this.silenceStartMs = null;
      return false;
    }
    // Start timing silence from the first quiet chunk.
    this.silenceStartMs ??= timestampMs;
    return timestampMs - this.silenceStartMs >= this.silenceThresholdMs;
  }

  /** Forget any in-progress silence window. */
  reset() {
    this.silenceStartMs = null;
  }

  /** RMS energy of the chunk, normalized to [0, 1] for 16-bit samples. */
  calculateEnergy(chunk) {
    if (chunk.length < 2) {
      return 0;
    }
    const sampleCount = Math.floor(chunk.length / 2);
    let sumOfSquares = 0;
    for (let byteIndex = 0; byteIndex < chunk.length - 1; byteIndex += 2) {
      const low = chunk[byteIndex] ?? 0;
      const high = chunk[byteIndex + 1] ?? 0;
      // Reassemble the signed 16-bit little-endian sample via sign extension.
      const sample = (low | high << 8) << 16 >> 16;
      const normalized = sample / 32768;
      sumOfSquares += normalized * normalized;
    }
    return Math.sqrt(sumOfSquares / sampleCount);
  }
}
291
+
292
// src/conversational/response-orchestrator.ts

/** Runs one STT → LLM → TTS round trip per user turn, emitting session events. */
class ResponseOrchestrator {
  stt;
  llm;
  tts;
  conversationHistory = [];

  constructor(stt, llm, tts) {
    this.stt = stt;
    this.llm = llm;
    this.tts = tts;
  }

  /**
   * Transcribe the user's audio, ask the LLM for a reply over the running
   * history, synthesize the reply, and yield conversation events in order:
   * user_speech_ended, transcript, agent_speech_started, agent_audio,
   * agent_speech_ended, transcript.
   */
  async* processUserTurn(userAudio, config) {
    // 1. Speech-to-text on the user's audio.
    const transcription = await this.stt.transcribe({
      audio: userAudio,
      language: config.language,
      wordTimestamps: false
    });
    const userText = transcription.text;
    yield { type: "user_speech_ended", transcript: userText };
    yield {
      type: "transcript",
      role: "user",
      text: userText,
      timestamp: Date.now()
    };
    this.conversationHistory.push({ role: "user", content: userText });

    // 2. LLM reply: system prompt first, then the full running history.
    const messages = [
      {
        role: "system",
        content: [{ type: "text", text: config.systemPrompt }]
      }
    ];
    for (const msg of this.conversationHistory) {
      messages.push({
        role: msg.role,
        content: [{ type: "text", text: msg.content }]
      });
    }
    const llmResponse = await this.llm.chat(messages, { model: config.llmModel });
    const responseText = llmResponse.message.content.find((p) => p.type === "text");
    // Fall back to a canned apology when the LLM returned no text part.
    const agentText = responseText && responseText.type === "text" ? responseText.text : "I apologize, I could not generate a response.";
    this.conversationHistory.push({ role: "assistant", content: agentText });
    yield { type: "agent_speech_started", text: agentText };

    // 3. Text-to-speech for the agent reply.
    const synthesis = await this.tts.synthesize({
      text: agentText,
      voiceId: config.voiceId,
      language: config.language,
      format: config.outputFormat
    });
    yield { type: "agent_audio", audio: synthesis.audio.data };
    yield { type: "agent_speech_ended" };
    yield {
      type: "transcript",
      role: "agent",
      text: agentText,
      timestamp: Date.now()
    };
  }

  /** Drop all accumulated conversation history. */
  reset() {
    this.conversationHistory.length = 0;
  }
}
351
// src/tts/pace-analyzer.ts

// Default pacing profile per narrative content type.
var CONTENT_TYPE_PACING = {
  intro: { rate: 0.95, emphasis: "normal", tone: "authoritative", leadingSilenceMs: 0, trailingSilenceMs: 500 },
  problem: { rate: 0.9, emphasis: "strong", tone: "urgent", leadingSilenceMs: 300, trailingSilenceMs: 500 },
  solution: { rate: 1, emphasis: "normal", tone: "calm", leadingSilenceMs: 300, trailingSilenceMs: 500 },
  metric: { rate: 0.85, emphasis: "strong", tone: "excited", leadingSilenceMs: 300, trailingSilenceMs: 600 },
  cta: { rate: 0.9, emphasis: "strong", tone: "authoritative", leadingSilenceMs: 400, trailingSilenceMs: 0 },
  transition: { rate: 1.1, emphasis: "reduced", tone: "neutral", leadingSilenceMs: 200, trailingSilenceMs: 300 }
};

/** Maps script segments to pacing directives using per-content-type defaults. */
class PaceAnalyzer {
  /**
   * Produce one directive per segment; each directive's rate is the
   * content-type default scaled by `baseRate`.
   */
  analyze(segments, baseRate = 1) {
    return segments.map(({ sceneId, contentType }) => {
      const defaults = CONTENT_TYPE_PACING[contentType];
      return {
        sceneId,
        rate: defaults.rate * baseRate,
        emphasis: defaults.emphasis,
        tone: defaults.tone,
        leadingSilenceMs: defaults.leadingSilenceMs,
        trailingSilenceMs: defaults.trailingSilenceMs
      };
    });
  }

  /** Defensive copy of the default pacing profile for `contentType`. */
  getDefaults(contentType) {
    return { ...CONTENT_TYPE_PACING[contentType] };
  }
}
415
+
416
// src/tts/emphasis-planner.ts

/** Plans pacing directives, preferring an LLM "voice director" with a heuristic fallback. */
class EmphasisPlanner {
  llm;
  model;
  paceAnalyzer;

  constructor(options) {
    this.llm = options?.llm;
    this.model = options?.model;
    this.paceAnalyzer = new PaceAnalyzer;
  }

  /**
   * Return one pacing directive per segment. Uses the LLM when configured;
   * any LLM failure (network error, malformed JSON) silently falls back to
   * the heuristic defaults.
   */
  async plan(segments, baseRate = 1) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    try {
      return await this.planWithLlm(segments, baseRate);
    } catch {
      // Best-effort: LLM output is advisory, never fatal.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
  }

  /** Ask the LLM for directives; throws on malformed output (caught by `plan`). */
  async planWithLlm(segments, baseRate) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const directorPrompt = [
      "You are a voice director planning emphasis and pacing for TTS narration.",
      "For each segment, return a JSON array of directives.",
      "Each directive has: sceneId, rate (0.7-1.3), emphasis (reduced|normal|strong),",
      "tone (neutral|urgent|excited|calm|authoritative), leadingSilenceMs, trailingSilenceMs.",
      "Return ONLY a JSON array, no other text."
    ].join("\n");
    const segmentPayload = JSON.stringify(segments.map((s) => ({
      sceneId: s.sceneId,
      text: s.text,
      contentType: s.contentType
    })));
    const response = await this.llm.chat([
      { role: "system", content: [{ type: "text", text: directorPrompt }] },
      { role: "user", content: [{ type: "text", text: segmentPayload }] }
    ], { model: this.model, temperature: 0.3, responseFormat: "json" });
    const textPart = response.message.content.find((p) => p.type === "text");
    if (!textPart || textPart.type !== "text") {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    // Scale the LLM's rates by the caller's base rate.
    return JSON.parse(textPart.text).map((directive) => ({
      ...directive,
      rate: directive.rate * baseRate
    }));
  }
}
482
+
483
// src/tts/segment-synthesizer.ts

/** Synthesizes audio for each script segment via the configured TTS provider. */
class SegmentSynthesizer {
  tts;

  constructor(tts) {
    this.tts = tts;
  }

  /** Synthesize every segment in parallel, applying its pacing directive when present. */
  async synthesizeAll(segments, voice, directives) {
    const directiveBySceneId = new Map(directives.map((d) => [d.sceneId, d]));
    return Promise.all(
      segments.map((segment) => this.synthesizeOne(segment, voice, directiveBySceneId.get(segment.sceneId)))
    );
  }

  /** Synthesize one segment; returns its scene id, audio, duration, and word timings. */
  async synthesizeOne(segment, voice, directive) {
    const result = await this.tts.synthesize({
      text: segment.text,
      voiceId: voice.voiceId,
      language: voice.language,
      style: voice.style,
      stability: voice.stability,
      rate: directive?.rate,
      emphasis: directive?.emphasis
    });
    const wordTimings = result.wordTimings?.map(({ word, startMs, endMs }) => ({
      word,
      startMs,
      endMs
    }));
    return {
      sceneId: segment.sceneId,
      audio: result.audio,
      durationMs: result.audio.durationMs ?? 0,
      wordTimings
    };
  }
}
516
+
517
// src/tts/audio-assembler.ts

/** Stitches synthesized segments and inter-segment silence into one clip. */
class AudioAssembler {
  concatenator = new AudioConcatenator;
  silenceGenerator = new SilenceGenerator;

  /**
   * Interleave each segment's audio with its directive's leading/trailing
   * silence — falling back to `defaultPauseMs` between segments and no pause
   * after the last one — then concatenate everything.
   */
  assemble(segments, directives, defaultPauseMs = 500) {
    const [firstSegment] = segments;
    if (segments.length === 0 || !firstSegment) {
      // Empty 44.1 kHz mono WAV descriptor when there is nothing to assemble.
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }
    const directiveBySceneId = new Map(directives.map((d) => [d.sceneId, d]));
    // Silence clips inherit the first segment's audio parameters.
    const reference = firstSegment.audio;
    const makeSilence = (ms) => this.silenceGenerator.generate(ms, reference.format, reference.sampleRateHz, reference.channels ?? 1);
    const parts = [];
    segments.forEach((segment, index) => {
      if (!segment) {
        return;
      }
      const directive = directiveBySceneId.get(segment.sceneId);
      const leadingSilenceMs = directive?.leadingSilenceMs ?? 0;
      if (leadingSilenceMs > 0) {
        parts.push(makeSilence(leadingSilenceMs));
      }
      parts.push(segment.audio);
      const isLast = index === segments.length - 1;
      const trailingSilenceMs = directive?.trailingSilenceMs ?? (isLast ? 0 : defaultPauseMs);
      if (trailingSilenceMs > 0) {
        parts.push(makeSilence(trailingSilenceMs));
      }
    });
    return this.concatenator.concatenate(parts);
  }
}
563
+
564
// src/tts/voice-synthesizer.ts

/** End-to-end narration pipeline: script → pacing → synthesis → assembly → timing map. */
class VoiceSynthesizer {
  segmentSynthesizer;
  emphasisPlanner;
  audioAssembler = new AudioAssembler;
  durationEstimator = new DurationEstimator;
  paceAnalyzer = new PaceAnalyzer;
  options;

  constructor(options) {
    this.options = options;
    this.segmentSynthesizer = new SegmentSynthesizer(options.tts);
    this.emphasisPlanner = new EmphasisPlanner({
      llm: options.llm,
      model: options.model
    });
  }

  /** Synthesize narration from a structured content brief. */
  async synthesize(brief) {
    return this.executePipeline(this.buildScript(brief), brief.voice, brief.pacing);
  }

  /** Synthesize narration driven by an existing scene plan (video use case). */
  async synthesizeForVideo(brief) {
    return this.executePipeline(this.buildScriptFromScenePlan(brief), brief.voice, brief.pacing, brief.fps);
  }

  /** Shared pipeline: plan pacing, synthesize segments, assemble audio, build timings. */
  async executePipeline(script, voice, pacing, fps) {
    const projectId = generateProjectId();
    const baseRate = pacing?.baseRate ?? 1;
    const pacingDirectives = await this.emphasisPlanner.plan(script.segments, baseRate);
    const synthesized = await this.segmentSynthesizer.synthesizeAll(script.segments, voice, pacingDirectives);
    const assembledAudio = this.audioAssembler.assemble(synthesized, pacingDirectives, pacing?.segmentPauseMs ?? 500);
    const timingMap = this.buildTimingMap(
      synthesized,
      fps ?? this.options.fps ?? 30,
      pacing?.breathingRoomFactor ?? 1.15
    );
    return {
      id: projectId,
      script,
      pacingDirectives,
      segments: synthesized,
      assembledAudio,
      timingMap
    };
  }

  /** Build a narration script from the brief's content sections (intro → cta). */
  buildScript(brief) {
    const segments = [];
    const addSegment = (sceneId, text, contentType) => {
      segments.push({
        sceneId,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType
      });
    };
    addSegment("intro", `${brief.content.title}. ${brief.content.summary}`, "intro");
    if (brief.content.problems.length > 0) {
      addSegment("problems", brief.content.problems.join(". "), "problem");
    }
    if (brief.content.solutions.length > 0) {
      addSegment("solutions", brief.content.solutions.join(". "), "solution");
    }
    if (brief.content.metrics && brief.content.metrics.length > 0) {
      addSegment("metrics", brief.content.metrics.join(". "), "metric");
    }
    if (brief.content.callToAction) {
      addSegment("cta", brief.content.callToAction, "cta");
    }
    return this.#finalizeScript(segments);
  }

  /** Build a script from a scene plan: one segment per scene that has narration. */
  buildScriptFromScenePlan(brief) {
    const segments = brief.scenePlan.scenes
      .filter((scene) => scene.narrationText)
      .map((scene) => {
        const text = scene.narrationText ?? "";
        return {
          sceneId: scene.id,
          text,
          estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
          // NOTE(review): every scene is typed "intro" here — confirm intended.
          contentType: "intro"
        };
      });
    return this.#finalizeScript(segments);
  }

  /** Derive the full text and total estimated duration for a segment list. */
  #finalizeScript(segments) {
    const fullText = segments.map((s) => s.text).join(" ");
    const estimatedDurationSeconds = segments.reduce((sum, s) => sum + s.estimatedDurationSeconds, 0);
    return { fullText, segments, estimatedDurationSeconds };
  }

  /** Convert per-segment durations into frame-accurate timing entries. */
  buildTimingMap(segments, fps, breathingRoomFactor) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = Math.ceil(seg.durationMs / 1000 * fps);
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        // Scenes get a little slack beyond the raw narration length.
        recommendedSceneDurationInFrames: Math.ceil(durationInFrames * breathingRoomFactor),
        wordTimings: seg.wordTimings?.map(({ word, startMs, endMs }) => ({
          word,
          startMs,
          endMs
        }))
      };
    });
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return { totalDurationMs, segments: timingSegments, fps };
  }
}
688
/** Generate a unique-enough project id of the form `tts_<base36 time>_<base36 random>`. */
function generateProjectId() {
  const timestamp = Date.now().toString(36);
  const randomSuffix = Math.random().toString(36).slice(2, 8);
  return `tts_${timestamp}_${randomSuffix}`;
}
693
// src/stt/segment-splitter.ts

/** Splits long audio into transcription-sized chunks by pro-rated byte ranges. */
class SegmentSplitter {
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Split `audio` into chunks no longer than `maxChunkMs`; audio at or under
   * the limit is returned as a single-element array untouched. Byte
   * boundaries are pro-rated from the duration, which assumes a roughly
   * constant bitrate payload.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    if (totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    // Guard against division by zero for degenerate durations.
    const bytesPerMs = audio.data.length / Math.max(totalDurationMs, 1);
    const chunks = [];
    for (let offsetMs = 0; offsetMs < totalDurationMs; ) {
      const chunkDurationMs = Math.min(maxChunkMs, totalDurationMs - offsetMs);
      const startByte = Math.floor(offsetMs * bytesPerMs);
      const endByte = Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels
      });
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }

  /** Fallback duration estimate assuming 16-bit samples at `sampleRateHz`. */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
726
+
727
+ // src/stt/diarization-mapper.ts
728
class DiarizationMapper {
  /**
   * Assigns human-readable labels ("Speaker 1", "Speaker 2", …) to diarized
   * segments, numbering speakers in order of first appearance, and collects
   * per-speaker segment counts and total speaking time.
   * @param segments - items with optional speakerId plus startMs/endMs.
   * @param labelPrefix - label stem, defaults to "Speaker".
   * @returns {{ segments, speakers }} labeled copies (segments without a
   *   speakerId pass through unchanged) and one summary entry per speaker.
   */
  map(segments, labelPrefix = "Speaker") {
    const order = [];
    const stats = new Map();
    // First pass: register each speaker the first time it is seen.
    for (const segment of segments) {
      const id = segment.speakerId;
      if (id && !stats.has(id)) {
        order.push(id);
        stats.set(id, { segmentCount: 0, totalSpeakingMs: 0 });
      }
    }
    // Second pass: attach labels and accumulate speaking-time stats.
    const labeled = segments.map((segment) => {
      if (!segment.speakerId) {
        return segment;
      }
      const speakerLabel = `${labelPrefix} ${order.indexOf(segment.speakerId) + 1}`;
      const tally = stats.get(segment.speakerId);
      if (tally) {
        tally.segmentCount += 1;
        tally.totalSpeakingMs += segment.endMs - segment.startMs;
      }
      return { ...segment, speakerLabel };
    });
    const speakers = order.map((id, position) => ({
      id,
      label: `${labelPrefix} ${position + 1}`,
      segmentCount: stats.get(id)?.segmentCount ?? 0,
      totalSpeakingMs: stats.get(id)?.totalSpeakingMs ?? 0
    }));
    return { segments: labeled, speakers };
  }
}
766
+
767
+ // src/stt/subtitle-formatter.ts
768
class SubtitleFormatter {
  /**
   * Renders segments as SubRip (.srt): numbered cues, comma millisecond
   * separator, optional "[Speaker N] " prefix, blank-line-separated cues.
   * @returns {string} the full .srt document body.
   */
  toSRT(segments) {
    const cues = segments.map((segment, index) => {
      const speaker = segment.speakerLabel ? `[${segment.speakerLabel}] ` : "";
      return [
        String(index + 1),
        `${this.formatTimeSRT(segment.startMs)} --> ${this.formatTimeSRT(segment.endMs)}`,
        `${speaker}${segment.text}`
      ].join("\n");
    });
    return cues.join("\n\n");
  }
  /**
   * Renders segments as WebVTT: "WEBVTT" header, dot millisecond separator,
   * optional voice-span ("<v Speaker N>") prefix on the cue text.
   * @returns {string} the full .vtt document.
   */
  toVTT(segments) {
    const cues = segments.map((segment, index) => {
      const speaker = segment.speakerLabel ? `<v ${segment.speakerLabel}>` : "";
      return [
        String(index + 1),
        `${this.formatTimeVTT(segment.startMs)} --> ${this.formatTimeVTT(segment.endMs)}`,
        `${speaker}${segment.text}`
      ].join("\n");
    });
    return "WEBVTT\n\n" + cues.join("\n\n");
  }
  /** Formats milliseconds as SRT time: HH:MM:SS,mmm (comma separator). */
  formatTimeSRT(ms) {
    const hours = this.pad(Math.floor(ms / 3600000), 2);
    const minutes = this.pad(Math.floor((ms % 3600000) / 60000), 2);
    const seconds = this.pad(Math.floor((ms % 60000) / 1000), 2);
    return `${[hours, minutes, seconds].join(":")},${this.pad(ms % 1000, 3)}`;
  }
  /** Formats milliseconds as VTT time: HH:MM:SS.mmm (dot separator). */
  formatTimeVTT(ms) {
    const hours = this.pad(Math.floor(ms / 3600000), 2);
    const minutes = this.pad(Math.floor((ms % 3600000) / 60000), 2);
    const seconds = this.pad(Math.floor((ms % 60000) / 1000), 2);
    return `${[hours, minutes, seconds].join(":")}.${this.pad(ms % 1000, 3)}`;
  }
  /** Left-pads a number with zeros to the requested width. */
  pad(value, length) {
    return String(value).padStart(length, "0");
  }
}
815
+
816
+ // src/stt/transcriber.ts
817
class Transcriber {
  stt;
  segmentSplitter = new SegmentSplitter;
  diarizationMapper = new DiarizationMapper;
  subtitleFormatter = new SubtitleFormatter;
  /**
   * @param options - must expose `stt`, the speech-to-text provider whose
   *   `transcribe` (and optionally `transcribeStream`) methods are used.
   */
  constructor(options) {
    this.stt = options.stt;
  }
  /**
   * Transcribes an audio brief: splits long audio into chunks, transcribes
   * each chunk sequentially (offsetting timestamps by the chunk start),
   * optionally maps diarized speakers to labels, and optionally renders
   * subtitles in the requested format ("srt" | "vtt" | "none").
   * @returns {{ id, transcript, subtitles, speakers }} subtitles/speakers
   *   are undefined when not requested.
   */
  async transcribe(brief) {
    const id = generateProjectId2();
    const chunks = this.segmentSplitter.split(brief.audio);
    const collected = [];
    let fullText = "";
    let totalDurationMs = 0;
    let baseMs = 0;
    // Chunks must be transcribed in order so timestamp offsets accumulate.
    for (const chunk of chunks) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints
      });
      for (const seg of result.segments) {
        collected.push({
          text: seg.text,
          startMs: seg.startMs + baseMs,
          endMs: seg.endMs + baseMs,
          speakerId: seg.speakerId,
          speakerName: seg.speakerName,
          confidence: seg.confidence
        });
      }
      fullText += (fullText ? " " : "") + result.text;
      totalDurationMs += result.durationMs;
      baseMs += chunk.durationMs ?? 0;
    }
    let finalSegments = collected;
    let speakers;
    if (brief.diarize) {
      const mapping = this.diarizationMapper.map(collected);
      finalSegments = mapping.segments;
      speakers = mapping.speakers;
    }
    const transcript = {
      text: fullText,
      segments: finalSegments,
      language: brief.language ?? "en",
      durationMs: totalDurationMs
    };
    const subtitleFormat = brief.subtitleFormat ?? "none";
    let subtitles;
    if (subtitleFormat === "srt") {
      subtitles = this.subtitleFormatter.toSRT(finalSegments);
    } else if (subtitleFormat === "vtt") {
      subtitles = this.subtitleFormatter.toVTT(finalSegments);
    }
    return { id, transcript, subtitles, speakers };
  }
  /**
   * Streams incremental transcription segments from the provider, renaming
   * the provider's `speakerName` to `speakerLabel` on each yielded item.
   * @throws {Error} when the provider has no `transcribeStream` method.
   */
  async* transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const live = this.stt.transcribeStream(audio, {
      language: options?.language,
      diarize: options?.diarize,
      speakerCount: options?.speakerCount,
      wordTimestamps: true,
      vocabularyHints: options?.vocabularyHints
    });
    for await (const part of live) {
      yield {
        text: part.text,
        startMs: part.startMs,
        endMs: part.endMs,
        speakerId: part.speakerId,
        speakerLabel: part.speakerName,
        confidence: part.confidence
      };
    }
  }
}
904
/**
 * Creates a loosely unique STT project id of the form `stt_<time36>_<rand36>`.
 * Same scheme as the TTS id generator but with the "stt" prefix; not
 * cryptographically secure.
 * @returns {string} e.g. "stt_lxp3k2_4f9a1b".
 */
function generateProjectId2() {
  const stamp = Date.now().toString(36);
  const suffix = Math.random().toString(36).slice(2, 8);
  return ["stt", stamp, suffix].join("_");
}
909
+ // src/sync/timing-calculator.ts
910
class TimingCalculator {
  /**
   * Converts per-scene narration durations into frame counts at `fps`.
   * Frame counts are rounded up; the recommended scene duration pads the
   * voice length by `breathingRoomFactor` (default +15%).
   * @returns {{ totalDurationMs, segments, fps }} timing map.
   */
  calculate(segments, fps, breathingRoomFactor = 1.15) {
    let totalDurationMs = 0;
    const timed = [];
    for (const segment of segments) {
      totalDurationMs += segment.durationMs;
      const frames = Math.ceil(segment.durationMs / 1000 * fps);
      timed.push({
        sceneId: segment.sceneId,
        durationMs: segment.durationMs,
        durationInFrames: frames,
        recommendedSceneDurationInFrames: Math.ceil(frames * breathingRoomFactor),
        // Copy only the timing fields; drop any provider-specific extras.
        wordTimings: segment.wordTimings?.map(({ word, startMs, endMs }) => ({ word, startMs, endMs }))
      });
    }
    return { totalDurationMs, segments: timed, fps };
  }
  /**
   * Rescales an existing timing map to a new frame rate, preserving each
   * segment's breathing-room ratio (recommended / actual frames).
   * @returns a new map; the input is not mutated.
   */
  recalculateForFps(timingMap, newFps) {
    const rescaled = timingMap.segments.map((segment) => {
      const frames = Math.ceil(segment.durationMs / 1000 * newFps);
      // Guard against division by zero for zero-frame segments.
      const padRatio = segment.recommendedSceneDurationInFrames / Math.max(segment.durationInFrames, 1);
      return {
        ...segment,
        durationInFrames: frames,
        recommendedSceneDurationInFrames: Math.ceil(frames * padRatio)
      };
    });
    return { ...timingMap, segments: rescaled, fps: newFps };
  }
}
952
+
953
+ // src/sync/scene-adapter.ts
954
class SceneAdapter {
  durationEstimator = new DurationEstimator;
  /**
   * Converts a scene plan into narration segments: only scenes with
   * non-blank `narrationText` are kept, each gets an estimated spoken
   * duration and a positional content type.
   * @returns {{ fullText, segments, estimatedDurationSeconds }}.
   */
  adapt(scenePlan) {
    const narrated = scenePlan.scenes.filter(
      (scene) => scene.narrationText && scene.narrationText.trim().length > 0
    );
    const segments = narrated.map((scene, position) => {
      const text = scene.narrationText ?? "";
      return {
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType: this.inferContentType(position, narrated.length)
      };
    });
    let estimatedDurationSeconds = 0;
    for (const segment of segments) {
      estimatedDurationSeconds += segment.estimatedDurationSeconds;
    }
    const fullText = segments.map((segment) => segment.text).join(" ");
    return { fullText, segments, estimatedDurationSeconds };
  }
  /**
   * Positional content-type heuristic: first scene is "intro", last is
   * "cta"; with more than three scenes the second is "problem" and the
   * second-to-last is "metric"; everything else is "solution".
   */
  inferContentType(index, total) {
    if (index === 0) return "intro";
    if (index === total - 1) return "cta";
    if (index === 1 && total > 3) return "problem";
    if (index === total - 2 && total > 3) return "metric";
    return "solution";
  }
}
983
+
984
+ // src/sync/duration-negotiator.ts
985
class DurationNegotiator {
  /** Voice may exceed the planned scene by up to 10% before action is taken. */
  static UPPER_THRESHOLD = 1.1;
  /** Voice shorter than 70% of the planned scene triggers silence padding. */
  static LOWER_THRESHOLD = 0.7;
  /** Fastest speech-rate multiplier the negotiator will suggest. */
  static MAX_RATE = 1.3;
  /** Slowest speech-rate multiplier the negotiator will suggest. */
  static MIN_RATE = 0.8;
  /**
   * Reconciles voice-over length with planned scene durations, producing
   * one adjustment record per segment and an updated timing map (only the
   * "pad_silence" case rewrites a segment's recommended duration).
   * @param timingMap - timing map with per-segment frame counts.
   * @param sceneDurations - Map of sceneId -> planned duration in frames.
   * @returns {{ timingMap, adjustments }}.
   */
  negotiate(timingMap, sceneDurations) {
    const adjustments = [];
    const segments = timingMap.segments.map((segment) => {
      const planned = sceneDurations.get(segment.sceneId);
      if (planned === undefined) {
        // No planned duration supplied for this scene: keep the recommendation.
        adjustments.push({
          sceneId: segment.sceneId,
          originalSceneDurationInFrames: segment.recommendedSceneDurationInFrames,
          voiceDurationInFrames: segment.durationInFrames,
          action: "no_change",
          finalSceneDurationInFrames: segment.recommendedSceneDurationInFrames
        });
        return segment;
      }
      const ratio = segment.durationInFrames / planned;
      if (ratio > DurationNegotiator.UPPER_THRESHOLD) {
        // Voice runs long: suggest faster speech, or extend the scene when
        // even the maximum rate would not fit.
        adjustments.push({
          sceneId: segment.sceneId,
          originalSceneDurationInFrames: planned,
          voiceDurationInFrames: segment.durationInFrames,
          action: ratio > DurationNegotiator.MAX_RATE ? "extend_scene" : "suggest_rate_change",
          suggestedRate: Math.min(ratio, DurationNegotiator.MAX_RATE),
          finalSceneDurationInFrames: segment.recommendedSceneDurationInFrames
        });
        return segment;
      }
      if (ratio < DurationNegotiator.LOWER_THRESHOLD) {
        // Voice runs short: keep the planned scene length and pad with silence.
        adjustments.push({
          sceneId: segment.sceneId,
          originalSceneDurationInFrames: planned,
          voiceDurationInFrames: segment.durationInFrames,
          action: "pad_silence",
          suggestedRate: Math.max(ratio, DurationNegotiator.MIN_RATE),
          finalSceneDurationInFrames: planned
        });
        return { ...segment, recommendedSceneDurationInFrames: planned };
      }
      // Within tolerance: record it but change nothing.
      adjustments.push({
        sceneId: segment.sceneId,
        originalSceneDurationInFrames: planned,
        voiceDurationInFrames: segment.durationInFrames,
        action: "no_change",
        finalSceneDurationInFrames: segment.recommendedSceneDurationInFrames
      });
      return segment;
    });
    return {
      timingMap: { ...timingMap, segments },
      adjustments
    };
  }
}
1050
// Public API of the voice bundle: synthesis, transcription/subtitles,
// conversational session management, and audio/timing-sync helpers.
export {
  VoiceSynthesizer,
  VoiceSessionManager,
  TurnDetector,
  TranscriptBuilder,
  Transcriber,
  TimingCalculator,
  SubtitleFormatter,
  SilenceGenerator,
  SegmentSynthesizer,
  SegmentSplitter,
  SceneAdapter,
  ResponseOrchestrator,
  PaceAnalyzer,
  FormatConverter,
  EmphasisPlanner,
  DurationNegotiator,
  DurationEstimator,
  DiarizationMapper,
  AudioConcatenator,
  AudioAssembler
};