@contractspec/lib.voice 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/dist/audio/audio-concatenator.d.ts +15 -0
  2. package/dist/audio/audio-concatenator.js +57 -0
  3. package/dist/audio/duration-estimator.d.ts +31 -0
  4. package/dist/audio/duration-estimator.js +22 -0
  5. package/dist/audio/format-converter.d.ts +17 -0
  6. package/dist/audio/format-converter.js +28 -0
  7. package/dist/audio/index.d.ts +4 -0
  8. package/dist/audio/index.js +121 -0
  9. package/dist/audio/silence-generator.d.ts +16 -0
  10. package/dist/audio/silence-generator.js +20 -0
  11. package/dist/browser/audio/audio-concatenator.js +56 -0
  12. package/dist/browser/audio/duration-estimator.js +21 -0
  13. package/dist/browser/audio/format-converter.js +27 -0
  14. package/dist/browser/audio/index.js +120 -0
  15. package/dist/browser/audio/silence-generator.js +19 -0
  16. package/dist/browser/conversational/index.js +241 -0
  17. package/dist/browser/conversational/response-orchestrator.js +62 -0
  18. package/dist/browser/conversational/transcript-builder.js +63 -0
  19. package/dist/browser/conversational/turn-detector.js +43 -0
  20. package/dist/browser/conversational/types.js +0 -0
  21. package/dist/browser/conversational/voice-session-manager.js +137 -0
  22. package/dist/browser/docs/conversational.docblock.js +5 -0
  23. package/dist/browser/docs/stt.docblock.js +5 -0
  24. package/dist/browser/docs/sync.docblock.js +5 -0
  25. package/dist/browser/docs/tts.docblock.js +5 -0
  26. package/dist/browser/docs/voice.docblock.js +5 -0
  27. package/dist/browser/i18n/catalogs/en.js +91 -0
  28. package/dist/browser/i18n/catalogs/es.js +91 -0
  29. package/dist/browser/i18n/catalogs/fr.js +91 -0
  30. package/dist/browser/i18n/catalogs/index.js +271 -0
  31. package/dist/browser/i18n/index.js +335 -0
  32. package/dist/browser/i18n/keys.js +38 -0
  33. package/dist/browser/i18n/locale.js +13 -0
  34. package/dist/browser/i18n/messages.js +283 -0
  35. package/dist/browser/index.js +1070 -0
  36. package/dist/browser/stt/diarization-mapper.js +42 -0
  37. package/dist/browser/stt/index.js +222 -0
  38. package/dist/browser/stt/segment-splitter.js +36 -0
  39. package/dist/browser/stt/subtitle-formatter.js +51 -0
  40. package/dist/browser/stt/transcriber.js +219 -0
  41. package/dist/browser/stt/types.js +0 -0
  42. package/dist/browser/sync/duration-negotiator.js +69 -0
  43. package/dist/browser/sync/index.js +165 -0
  44. package/dist/browser/sync/scene-adapter.js +52 -0
  45. package/dist/browser/sync/timing-calculator.js +46 -0
  46. package/dist/browser/tts/audio-assembler.js +120 -0
  47. package/dist/browser/tts/emphasis-planner.js +134 -0
  48. package/dist/browser/tts/index.js +439 -0
  49. package/dist/browser/tts/pace-analyzer.js +67 -0
  50. package/dist/browser/tts/segment-synthesizer.js +36 -0
  51. package/dist/browser/tts/types.js +0 -0
  52. package/dist/browser/tts/voice-synthesizer.js +435 -0
  53. package/dist/browser/types.js +0 -0
  54. package/dist/conversational/index.d.ts +5 -0
  55. package/dist/conversational/index.js +242 -0
  56. package/dist/conversational/response-orchestrator.d.ts +26 -0
  57. package/dist/conversational/response-orchestrator.js +63 -0
  58. package/dist/conversational/transcript-builder.d.ts +25 -0
  59. package/dist/conversational/transcript-builder.js +64 -0
  60. package/dist/conversational/turn-detector.d.ts +31 -0
  61. package/dist/conversational/turn-detector.js +44 -0
  62. package/dist/conversational/types.d.ts +55 -0
  63. package/dist/conversational/types.js +1 -0
  64. package/dist/conversational/voice-session-manager.d.ts +17 -0
  65. package/dist/conversational/voice-session-manager.js +138 -0
  66. package/dist/docs/conversational.docblock.d.ts +14 -0
  67. package/dist/docs/conversational.docblock.js +6 -0
  68. package/dist/docs/stt.docblock.d.ts +12 -0
  69. package/dist/docs/stt.docblock.js +6 -0
  70. package/dist/docs/sync.docblock.d.ts +12 -0
  71. package/dist/docs/sync.docblock.js +6 -0
  72. package/dist/docs/tts.docblock.d.ts +12 -0
  73. package/dist/docs/tts.docblock.js +6 -0
  74. package/dist/docs/voice.docblock.d.ts +22 -0
  75. package/dist/docs/voice.docblock.js +6 -0
  76. package/dist/i18n/catalogs/en.d.ts +6 -0
  77. package/dist/i18n/catalogs/en.js +92 -0
  78. package/dist/i18n/catalogs/es.d.ts +4 -0
  79. package/dist/i18n/catalogs/es.js +92 -0
  80. package/dist/i18n/catalogs/fr.d.ts +4 -0
  81. package/dist/i18n/catalogs/fr.js +92 -0
  82. package/dist/i18n/catalogs/index.d.ts +3 -0
  83. package/dist/i18n/catalogs/index.js +272 -0
  84. package/dist/i18n/index.d.ts +20 -0
  85. package/dist/i18n/index.js +336 -0
  86. package/dist/i18n/keys.d.ts +50 -0
  87. package/dist/i18n/keys.js +39 -0
  88. package/dist/i18n/locale.d.ts +6 -0
  89. package/dist/i18n/locale.js +14 -0
  90. package/dist/i18n/messages.d.ts +13 -0
  91. package/dist/i18n/messages.js +284 -0
  92. package/dist/index.d.ts +6 -0
  93. package/dist/index.js +1071 -0
  94. package/dist/node/audio/audio-concatenator.js +56 -0
  95. package/dist/node/audio/duration-estimator.js +21 -0
  96. package/dist/node/audio/format-converter.js +27 -0
  97. package/dist/node/audio/index.js +120 -0
  98. package/dist/node/audio/silence-generator.js +19 -0
  99. package/dist/node/conversational/index.js +241 -0
  100. package/dist/node/conversational/response-orchestrator.js +62 -0
  101. package/dist/node/conversational/transcript-builder.js +63 -0
  102. package/dist/node/conversational/turn-detector.js +43 -0
  103. package/dist/node/conversational/types.js +0 -0
  104. package/dist/node/conversational/voice-session-manager.js +137 -0
  105. package/dist/node/docs/conversational.docblock.js +5 -0
  106. package/dist/node/docs/stt.docblock.js +5 -0
  107. package/dist/node/docs/sync.docblock.js +5 -0
  108. package/dist/node/docs/tts.docblock.js +5 -0
  109. package/dist/node/docs/voice.docblock.js +5 -0
  110. package/dist/node/i18n/catalogs/en.js +91 -0
  111. package/dist/node/i18n/catalogs/es.js +91 -0
  112. package/dist/node/i18n/catalogs/fr.js +91 -0
  113. package/dist/node/i18n/catalogs/index.js +271 -0
  114. package/dist/node/i18n/index.js +335 -0
  115. package/dist/node/i18n/keys.js +38 -0
  116. package/dist/node/i18n/locale.js +13 -0
  117. package/dist/node/i18n/messages.js +283 -0
  118. package/dist/node/index.js +1070 -0
  119. package/dist/node/stt/diarization-mapper.js +42 -0
  120. package/dist/node/stt/index.js +222 -0
  121. package/dist/node/stt/segment-splitter.js +36 -0
  122. package/dist/node/stt/subtitle-formatter.js +51 -0
  123. package/dist/node/stt/transcriber.js +219 -0
  124. package/dist/node/stt/types.js +0 -0
  125. package/dist/node/sync/duration-negotiator.js +69 -0
  126. package/dist/node/sync/index.js +165 -0
  127. package/dist/node/sync/scene-adapter.js +52 -0
  128. package/dist/node/sync/timing-calculator.js +46 -0
  129. package/dist/node/tts/audio-assembler.js +120 -0
  130. package/dist/node/tts/emphasis-planner.js +134 -0
  131. package/dist/node/tts/index.js +439 -0
  132. package/dist/node/tts/pace-analyzer.js +67 -0
  133. package/dist/node/tts/segment-synthesizer.js +36 -0
  134. package/dist/node/tts/types.js +0 -0
  135. package/dist/node/tts/voice-synthesizer.js +435 -0
  136. package/dist/node/types.js +0 -0
  137. package/dist/stt/diarization-mapper.d.ts +19 -0
  138. package/dist/stt/diarization-mapper.js +43 -0
  139. package/dist/stt/index.d.ts +5 -0
  140. package/dist/stt/index.js +223 -0
  141. package/dist/stt/segment-splitter.d.ts +19 -0
  142. package/dist/stt/segment-splitter.js +37 -0
  143. package/dist/stt/subtitle-formatter.d.ts +19 -0
  144. package/dist/stt/subtitle-formatter.js +52 -0
  145. package/dist/stt/transcriber.d.ts +21 -0
  146. package/dist/stt/transcriber.js +220 -0
  147. package/dist/stt/types.d.ts +44 -0
  148. package/dist/stt/types.js +1 -0
  149. package/dist/sync/duration-negotiator.d.ts +37 -0
  150. package/dist/sync/duration-negotiator.js +70 -0
  151. package/dist/sync/index.d.ts +3 -0
  152. package/dist/sync/index.js +166 -0
  153. package/dist/sync/scene-adapter.d.ts +29 -0
  154. package/dist/sync/scene-adapter.js +53 -0
  155. package/dist/sync/timing-calculator.d.ts +21 -0
  156. package/dist/sync/timing-calculator.js +47 -0
  157. package/dist/tts/audio-assembler.d.ts +19 -0
  158. package/dist/tts/audio-assembler.js +121 -0
  159. package/dist/tts/emphasis-planner.d.ts +24 -0
  160. package/dist/tts/emphasis-planner.js +135 -0
  161. package/dist/tts/index.d.ts +6 -0
  162. package/dist/tts/index.js +440 -0
  163. package/dist/tts/pace-analyzer.d.ts +30 -0
  164. package/dist/tts/pace-analyzer.js +68 -0
  165. package/dist/tts/segment-synthesizer.d.ts +21 -0
  166. package/dist/tts/segment-synthesizer.js +37 -0
  167. package/dist/tts/types.d.ts +76 -0
  168. package/dist/tts/types.js +1 -0
  169. package/dist/tts/voice-synthesizer.d.ts +28 -0
  170. package/dist/tts/voice-synthesizer.js +436 -0
  171. package/dist/types.d.ts +12 -0
  172. package/dist/types.js +1 -0
  173. package/package.json +760 -0
@@ -0,0 +1,435 @@
1
+ // src/audio/audio-concatenator.ts
2
+ class AudioConcatenator {
3
+ concatenate(segments) {
4
+ if (segments.length === 0) {
5
+ return {
6
+ data: new Uint8Array(0),
7
+ format: "wav",
8
+ sampleRateHz: 44100,
9
+ durationMs: 0,
10
+ channels: 1
11
+ };
12
+ }
13
+ const [firstSegment] = segments;
14
+ if (!firstSegment) {
15
+ return {
16
+ data: new Uint8Array(0),
17
+ format: "wav",
18
+ sampleRateHz: 44100,
19
+ durationMs: 0,
20
+ channels: 1
21
+ };
22
+ }
23
+ if (segments.length === 1) {
24
+ return { ...firstSegment };
25
+ }
26
+ const referenceFormat = firstSegment.format;
27
+ const referenceSampleRate = firstSegment.sampleRateHz;
28
+ const referenceChannels = firstSegment.channels ?? 1;
29
+ for (const seg of segments) {
30
+ if (seg.format !== referenceFormat) {
31
+ throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
32
+ }
33
+ if (seg.sampleRateHz !== referenceSampleRate) {
34
+ throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
35
+ }
36
+ }
37
+ const totalBytes = segments.reduce((sum, s) => sum + s.data.length, 0);
38
+ const combined = new Uint8Array(totalBytes);
39
+ let offset = 0;
40
+ for (const seg of segments) {
41
+ combined.set(seg.data, offset);
42
+ offset += seg.data.length;
43
+ }
44
+ const totalDurationMs = segments.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
45
+ return {
46
+ data: combined,
47
+ format: referenceFormat,
48
+ sampleRateHz: referenceSampleRate,
49
+ durationMs: totalDurationMs,
50
+ channels: referenceChannels
51
+ };
52
+ }
53
+ }
54
+
55
// src/audio/duration-estimator.ts
/**
 * Estimates narration duration from word counts at a configurable
 * speaking pace (words per minute).
 */
class DurationEstimator {
  static DEFAULT_WPM = 150;
  /** Whole seconds (rounded up) needed to speak `text`. */
  estimateSeconds(text, wordsPerMinute) {
    const pace = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Math.ceil(this.countWords(text) / pace * 60);
  }
  /** Whole milliseconds (rounded up) needed to speak `text`. */
  estimateMs(text, wordsPerMinute) {
    const pace = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Math.ceil(this.countWords(text) / pace * 60 * 1000);
  }
  /** How many words fit into `durationSeconds` at the given pace. */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const pace = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Math.round(durationSeconds / 60 * pace);
  }
  // Whitespace-delimited word count; empty/blank text counts as zero words.
  countWords(text) {
    return text.split(/\s+/).filter(Boolean).length;
  }
}
73
+
74
+ // src/audio/silence-generator.ts
75
+ class SilenceGenerator {
76
+ generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1) {
77
+ const totalSamples = Math.ceil(sampleRateHz * durationMs / 1000);
78
+ const bytesPerSample = 2;
79
+ const dataSize = totalSamples * bytesPerSample * channels;
80
+ const data = new Uint8Array(dataSize);
81
+ return {
82
+ data,
83
+ format,
84
+ sampleRateHz,
85
+ durationMs,
86
+ channels
87
+ };
88
+ }
89
+ }
90
+
91
// src/tts/pace-analyzer.ts
// Default pacing per content type. `rate` multiplies the base speaking rate
// (1 = provider default), `emphasis` is forwarded to the TTS request, and the
// silence fields pad the assembled track around the segment (milliseconds).
// `tone` rides along on the directive for planners/consumers; the visible
// SegmentSynthesizer does not forward it to the TTS provider.
var CONTENT_TYPE_PACING = {
  // Opening hook: slightly slower, confident delivery; no leading pause.
  intro: {
    rate: 0.95,
    emphasis: "normal",
    tone: "authoritative",
    leadingSilenceMs: 0,
    trailingSilenceMs: 500
  },
  // Pain points: slower and strong so the problem lands.
  problem: {
    rate: 0.9,
    emphasis: "strong",
    tone: "urgent",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  },
  // Resolution: normal speed, calm tone.
  solution: {
    rate: 1,
    emphasis: "normal",
    tone: "calm",
    leadingSilenceMs: 300,
    trailingSilenceMs: 500
  },
  // Numbers/results: slowest delivery with extra trailing room.
  metric: {
    rate: 0.85,
    emphasis: "strong",
    tone: "excited",
    leadingSilenceMs: 300,
    trailingSilenceMs: 600
  },
  // Call to action: deliberate, authoritative close; no trailing pause.
  cta: {
    rate: 0.9,
    emphasis: "strong",
    tone: "authoritative",
    leadingSilenceMs: 400,
    trailingSilenceMs: 0
  },
  // Bridging lines: faster and de-emphasized.
  transition: {
    rate: 1.1,
    emphasis: "reduced",
    tone: "neutral",
    leadingSilenceMs: 200,
    trailingSilenceMs: 300
  }
};
136
+
137
/**
 * Heuristic pacing planner: maps script segments to pacing directives using
 * the per-content-type defaults in CONTENT_TYPE_PACING.
 */
class PaceAnalyzer {
  /**
   * Build one directive per segment.
   *
   * @param segments - segments carrying sceneId and contentType
   * @param baseRate - global speed multiplier applied on top of defaults
   * @returns pacing directives in segment order
   * @throws Error when a segment's contentType has no default entry
   */
  analyze(segments, baseRate = 1) {
    return segments.map((segment) => {
      const defaults = CONTENT_TYPE_PACING[segment.contentType];
      if (!defaults) {
        // Fix: an unknown contentType previously dereferenced `undefined`,
        // throwing an opaque "cannot read properties of undefined" TypeError.
        throw new Error(`Unknown content type: ${segment.contentType}`);
      }
      return {
        sceneId: segment.sceneId,
        rate: defaults.rate * baseRate,
        emphasis: defaults.emphasis,
        tone: defaults.tone,
        leadingSilenceMs: defaults.leadingSilenceMs,
        trailingSilenceMs: defaults.trailingSilenceMs
      };
    });
  }
  /** Defensive copy of the default directive values for a content type. */
  getDefaults(contentType) {
    return { ...CONTENT_TYPE_PACING[contentType] };
  }
}
155
+
156
// src/tts/emphasis-planner.ts
/**
 * Plans per-segment pacing directives. Uses an LLM "voice director" when one
 * is configured; otherwise — or on any LLM/parse failure — falls back to
 * PaceAnalyzer's content-type defaults.
 */
class EmphasisPlanner {
  llm;          // optional LLM provider; absence forces the heuristic path
  model;        // optional model id forwarded to llm.chat
  paceAnalyzer; // heuristic fallback planner
  constructor(options) {
    this.llm = options?.llm;
    this.model = options?.model;
    this.paceAnalyzer = new PaceAnalyzer;
  }
  /**
   * Produce one directive per segment, scaling each rate by baseRate.
   * Never throws: any failure in the LLM path degrades to heuristics.
   */
  async plan(segments, baseRate = 1) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    try {
      return await this.planWithLlm(segments, baseRate);
    } catch {
      // Chat errors and JSON.parse failures both land here.
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
  }
  // Ask the LLM for a JSON array of directives; may throw (plan() catches).
  async planWithLlm(segments, baseRate) {
    if (!this.llm) {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    const response = await this.llm.chat([
      {
        role: "system",
        content: [
          {
            type: "text",
            text: [
              "You are a voice director planning emphasis and pacing for TTS narration.",
              "For each segment, return a JSON array of directives.",
              "Each directive has: sceneId, rate (0.7-1.3), emphasis (reduced|normal|strong),",
              "tone (neutral|urgent|excited|calm|authoritative), leadingSilenceMs, trailingSilenceMs.",
              "Return ONLY a JSON array, no other text."
            ].join(`
`)
          }
        ]
      },
      {
        role: "user",
        content: [
          {
            type: "text",
            text: JSON.stringify(segments.map((s) => ({
              sceneId: s.sceneId,
              text: s.text,
              contentType: s.contentType
            })))
          }
        ]
      }
    ], { model: this.model, temperature: 0.3, responseFormat: "json" });
    // Use the first text part of the reply; anything else means fall back.
    const text = response.message.content.find((p) => p.type === "text");
    if (!text || text.type !== "text") {
      return this.paceAnalyzer.analyze(segments, baseRate);
    }
    // NOTE(review): parsed directives are not schema-validated; malformed
    // output only surfaces via the catch in plan() — confirm acceptable.
    const parsed = JSON.parse(text.text);
    return parsed.map((d) => ({
      ...d,
      rate: d.rate * baseRate
    }));
  }
}
222
+
223
+ // src/tts/segment-synthesizer.ts
224
+ class SegmentSynthesizer {
225
+ tts;
226
+ constructor(tts) {
227
+ this.tts = tts;
228
+ }
229
+ async synthesizeAll(segments, voice, directives) {
230
+ const directiveMap = new Map(directives.map((d) => [d.sceneId, d]));
231
+ const results = await Promise.all(segments.map((segment) => this.synthesizeOne(segment, voice, directiveMap.get(segment.sceneId))));
232
+ return results;
233
+ }
234
+ async synthesizeOne(segment, voice, directive) {
235
+ const result = await this.tts.synthesize({
236
+ text: segment.text,
237
+ voiceId: voice.voiceId,
238
+ language: voice.language,
239
+ style: voice.style,
240
+ stability: voice.stability,
241
+ rate: directive?.rate,
242
+ emphasis: directive?.emphasis
243
+ });
244
+ return {
245
+ sceneId: segment.sceneId,
246
+ audio: result.audio,
247
+ durationMs: result.audio.durationMs ?? 0,
248
+ wordTimings: result.wordTimings?.map((wt) => ({
249
+ word: wt.word,
250
+ startMs: wt.startMs,
251
+ endMs: wt.endMs
252
+ }))
253
+ };
254
+ }
255
+ }
256
+
257
// src/tts/audio-assembler.ts
/**
 * Stitches synthesized segments into one continuous track, inserting
 * silence around segments as dictated by their pacing directives.
 */
class AudioAssembler {
  concatenator = new AudioConcatenator;
  silenceGenerator = new SilenceGenerator;
  /**
   * Assemble the final audio.
   *
   * @param segments - synthesized segments ({ sceneId, audio, ... })
   * @param directives - pacing directives matched to segments by sceneId
   * @param defaultPauseMs - pause inserted between segments when a directive
   *   does not specify trailingSilenceMs (never applied after the last one)
   * @returns a single audio object in the format of the first segment
   */
  assemble(segments, directives, defaultPauseMs = 500) {
    if (segments.length === 0) {
      // Empty input: zero-length mono WAV placeholder.
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }
    const [firstSegment] = segments;
    if (!firstSegment) {
      // Defensive duplicate of the empty case (guards an undefined element).
      return {
        data: new Uint8Array(0),
        format: "wav",
        sampleRateHz: 44100,
        durationMs: 0,
        channels: 1
      };
    }
    const directiveMap = new Map(directives.map((d) => [d.sceneId, d]));
    // The first segment's audio defines format/sample-rate/channels for
    // every generated silence chunk.
    const reference = firstSegment.audio;
    const parts = [];
    for (let i = 0; i < segments.length; i++) {
      const segment = segments[i];
      if (!segment) {
        continue;
      }
      const directive = directiveMap.get(segment.sceneId);
      const leadingSilenceMs = directive?.leadingSilenceMs ?? 0;
      if (leadingSilenceMs > 0) {
        parts.push(this.silenceGenerator.generate(leadingSilenceMs, reference.format, reference.sampleRateHz, reference.channels ?? 1));
      }
      parts.push(segment.audio);
      // Explicit trailing silence wins; otherwise use the default pause
      // between segments only (no pause after the final segment).
      const trailingSilenceMs = directive?.trailingSilenceMs ?? (i < segments.length - 1 ? defaultPauseMs : 0);
      if (trailingSilenceMs > 0) {
        parts.push(this.silenceGenerator.generate(trailingSilenceMs, reference.format, reference.sampleRateHz, reference.channels ?? 1));
      }
    }
    return this.concatenator.concatenate(parts);
  }
}
303
+
304
// src/tts/voice-synthesizer.ts
/**
 * High-level TTS pipeline: builds a narration script from a content brief,
 * plans pacing, synthesizes each segment, assembles the final audio, and
 * derives a frame-based timing map for video alignment.
 */
class VoiceSynthesizer {
  segmentSynthesizer;
  emphasisPlanner;
  audioAssembler = new AudioAssembler;
  durationEstimator = new DurationEstimator;
  // Not referenced inside this class; presumably exposed for consumers — TODO confirm.
  paceAnalyzer = new PaceAnalyzer;
  options;
  /**
   * @param options - { tts, llm?, model?, fps? }; llm/model enable
   *   LLM-driven pacing inside EmphasisPlanner.
   */
  constructor(options) {
    this.options = options;
    this.segmentSynthesizer = new SegmentSynthesizer(options.tts);
    this.emphasisPlanner = new EmphasisPlanner({
      llm: options.llm,
      model: options.model
    });
  }
  /** Synthesize narration from a structured content brief. */
  async synthesize(brief) {
    const script = this.buildScript(brief);
    return this.executePipeline(script, brief.voice, brief.pacing);
  }
  /** Synthesize narration from a scene plan, honoring the brief's fps. */
  async synthesizeForVideo(brief) {
    const script = this.buildScriptFromScenePlan(brief);
    return this.executePipeline(script, brief.voice, brief.pacing, brief.fps);
  }
  // Shared pipeline: plan pacing -> synthesize -> assemble -> timing map.
  async executePipeline(script, voice, pacing, fps) {
    const projectId = generateProjectId();
    const baseRate = pacing?.baseRate ?? 1;
    const pacingDirectives = await this.emphasisPlanner.plan(script.segments, baseRate);
    const synthesized = await this.segmentSynthesizer.synthesizeAll(script.segments, voice, pacingDirectives);
    const pauseMs = pacing?.segmentPauseMs ?? 500;
    const assembledAudio = this.audioAssembler.assemble(synthesized, pacingDirectives, pauseMs);
    const effectiveFps = fps ?? this.options.fps ?? 30;
    // breathingRoomFactor pads recommended scene durations beyond the raw audio length.
    const breathingRoomFactor = pacing?.breathingRoomFactor ?? 1.15;
    const timingMap = this.buildTimingMap(synthesized, effectiveFps, breathingRoomFactor);
    return {
      id: projectId,
      script,
      pacingDirectives,
      segments: synthesized,
      assembledAudio,
      timingMap
    };
  }
  // Build an intro/problems/solutions/metrics/cta script from the brief content.
  // Sections with no content are omitted entirely.
  buildScript(brief) {
    const segments = [];
    const introText = `${brief.content.title}. ${brief.content.summary}`;
    segments.push({
      sceneId: "intro",
      text: introText,
      estimatedDurationSeconds: this.durationEstimator.estimateSeconds(introText),
      contentType: "intro"
    });
    if (brief.content.problems.length > 0) {
      const text = brief.content.problems.join(". ");
      segments.push({
        sceneId: "problems",
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType: "problem"
      });
    }
    if (brief.content.solutions.length > 0) {
      const text = brief.content.solutions.join(". ");
      segments.push({
        sceneId: "solutions",
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType: "solution"
      });
    }
    if (brief.content.metrics && brief.content.metrics.length > 0) {
      const text = brief.content.metrics.join(". ");
      segments.push({
        sceneId: "metrics",
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType: "metric"
      });
    }
    if (brief.content.callToAction) {
      segments.push({
        sceneId: "cta",
        text: brief.content.callToAction,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(brief.content.callToAction),
        contentType: "cta"
      });
    }
    const fullText = segments.map((s) => s.text).join(" ");
    const estimatedDurationSeconds = segments.reduce((sum, s) => sum + s.estimatedDurationSeconds, 0);
    return { fullText, segments, estimatedDurationSeconds };
  }
  // One segment per scene that has narrationText. contentType is hard-coded
  // to "intro" for every scene (the plan carries no per-scene type here) —
  // NOTE(review): confirm this is intended rather than a placeholder.
  buildScriptFromScenePlan(brief) {
    const segments = brief.scenePlan.scenes.filter((scene) => scene.narrationText).map((scene) => {
      const text = scene.narrationText ?? "";
      return {
        sceneId: scene.id,
        text,
        estimatedDurationSeconds: this.durationEstimator.estimateSeconds(text),
        contentType: "intro"
      };
    });
    const fullText = segments.map((s) => s.text).join(" ");
    const estimatedDurationSeconds = segments.reduce((sum, s) => sum + s.estimatedDurationSeconds, 0);
    return { fullText, segments, estimatedDurationSeconds };
  }
  // Convert measured segment durations into frame counts at the given fps,
  // rounding up so audio never outlasts its scene.
  buildTimingMap(segments, fps, breathingRoomFactor) {
    const timingSegments = segments.map((seg) => {
      const durationInFrames = Math.ceil(seg.durationMs / 1000 * fps);
      return {
        sceneId: seg.sceneId,
        durationMs: seg.durationMs,
        durationInFrames,
        recommendedSceneDurationInFrames: Math.ceil(durationInFrames * breathingRoomFactor),
        wordTimings: seg.wordTimings?.map((wt) => ({
          word: wt.word,
          startMs: wt.startMs,
          endMs: wt.endMs
        }))
      };
    });
    const totalDurationMs = segments.reduce((sum, s) => sum + s.durationMs, 0);
    return { totalDurationMs, segments: timingSegments, fps };
  }
}
428
/**
 * Create an id of the form `tts_<time36>_<rand36>`.
 * Not cryptographically secure — Math.random suffices for correlation ids.
 */
function generateProjectId() {
  const time36 = Date.now().toString(36);
  const rand36 = Math.random().toString(36).slice(2, 8);
  return `tts_${time36}_${rand36}`;
}
433
+ export {
434
+ VoiceSynthesizer
435
+ };
File without changes
@@ -0,0 +1,5 @@
1
+ export { VoiceSessionManager } from './voice-session-manager';
2
+ export { TurnDetector } from './turn-detector';
3
+ export { ResponseOrchestrator } from './response-orchestrator';
4
+ export { TranscriptBuilder } from './transcript-builder';
5
+ export type { ConversationConfig, ConversationalTool, ConversationState, ConversationTurn, ConversationalOptions, ManagedSession, ConversationalEvent, ConversationalSessionSummary, } from './types';
@@ -0,0 +1,242 @@
1
+ // @bun
2
// src/conversational/transcript-builder.ts
/**
 * Accumulates conversation turns from the conversational event stream.
 * Timestamps are recorded relative to session start, in milliseconds.
 */
class TranscriptBuilder {
  turns = [];
  // Turn currently in progress (opened by a *_speech_started event), if any.
  currentTurn = null;
  sessionStartMs = Date.now();
  /** Snapshot copy of completed turns. */
  getTranscript() {
    return [...this.turns];
  }
  /** Plain-text transcript, one "[role] text" line per turn. */
  toText() {
    return this.turns.map((t) => `[${t.role}] ${t.text}`).join("\n");
  }
  /** Number of completed turns. */
  getTurnCount() {
    return this.turns.length;
  }
  /**
   * Fold one conversational event into the transcript.
   * Unknown event types are ignored.
   */
  processEvent(event) {
    switch (event.type) {
      case "session_started":
        this.sessionStartMs = Date.now();
        break;
      case "user_speech_started":
        this.currentTurn = {
          role: "user",
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "user_speech_ended": {
        const endMs = Date.now() - this.sessionStartMs;
        if (this.currentTurn && this.currentTurn.role === "user") {
          this.currentTurn.text = event.transcript;
          this.currentTurn.endMs = endMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        } else {
          // Fix: ResponseOrchestrator emits user_speech_ended without a
          // preceding user_speech_started; previously those user turns were
          // silently dropped from the transcript. Record them with a
          // zero-length duration instead.
          this.turns.push({
            role: "user",
            text: event.transcript,
            startMs: endMs,
            endMs
          });
        }
        break;
      }
      case "agent_speech_started":
        this.currentTurn = {
          role: "agent",
          text: event.text,
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "agent_speech_ended":
        if (this.currentTurn && this.currentTurn.role === "agent") {
          this.currentTurn.endMs = Date.now() - this.sessionStartMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        }
        break;
      case "transcript":
        // Transcript events duplicate information captured by the cases above.
        break;
      default:
        break;
    }
  }
  /** Clear all state and restart the session clock. */
  reset() {
    this.turns.length = 0;
    this.currentTurn = null;
    this.sessionStartMs = Date.now();
  }
}
62
+
63
// src/conversational/voice-session-manager.ts
/**
 * Wraps a conversational provider session with in-place state tracking and
 * transcript accumulation.
 */
class VoiceSessionManager {
  provider;
  constructor(options) {
    this.provider = options.conversational;
  }
  /**
   * Open a provider session and return a managed handle.
   *
   * The returned `state` object is mutated in place as events flow through
   * the wrapped `events` iterator — it only updates while that iterator is
   * being consumed. `sessionId` starts empty and is filled in by the
   * provider's session_started event.
   */
  async startSession(config) {
    const transcriptBuilder = new TranscriptBuilder;
    const session = await this.provider.startSession({
      voiceId: config.voiceId,
      language: config.language,
      systemPrompt: config.systemPrompt,
      llmModel: config.llmModel,
      inputFormat: config.inputFormat,
      outputFormat: config.outputFormat,
      turnDetection: config.turnDetection,
      silenceThresholdMs: config.silenceThresholdMs,
      maxDurationSeconds: config.maxDurationSeconds
    });
    const state = {
      sessionId: "",
      status: "connecting",
      currentTurn: "idle",
      turnCount: 0,
      durationMs: 0,
      transcript: []
    };
    const wrappedEvents = this.wrapEvents(session.events, state, transcriptBuilder);
    return {
      state,
      sendAudio: (chunk) => session.sendAudio(chunk),
      sendText: (text) => session.sendText(text),
      interrupt: () => session.interrupt(),
      close: async () => {
        const summary = await session.close();
        state.status = "ended";
        return summary;
      },
      events: wrappedEvents
    };
  }
  // Pass provider events through unchanged while updating the shared state
  // object and the transcript builder.
  async* wrapEvents(events, state, transcriptBuilder) {
    for await (const event of events) {
      transcriptBuilder.processEvent(event);
      switch (event.type) {
        case "session_started":
          state.sessionId = event.sessionId;
          state.status = "active";
          break;
        case "user_speech_started":
          state.currentTurn = "user";
          break;
        case "user_speech_ended":
          state.currentTurn = "idle";
          state.turnCount += 1;
          break;
        case "agent_speech_started":
          state.currentTurn = "agent";
          break;
        case "agent_speech_ended":
          state.currentTurn = "idle";
          state.turnCount += 1;
          break;
        case "session_ended":
          state.status = "ended";
          state.durationMs = event.durationMs;
          break;
      }
      // Refresh the transcript snapshot after every event.
      state.transcript = transcriptBuilder.getTranscript();
      yield event;
    }
  }
}
136
+
137
// src/conversational/turn-detector.ts
/**
 * Energy-based end-of-turn detector for little-endian 16-bit PCM audio.
 * Reports end-of-turn once silence has persisted past a configurable
 * threshold.
 */
class TurnDetector {
  silenceThresholdMs;
  energyThreshold;
  silenceStartMs = null;
  constructor(silenceThresholdMs = 800, energyThreshold = 0.01) {
    this.silenceThresholdMs = silenceThresholdMs;
    this.energyThreshold = energyThreshold;
  }
  /**
   * Feed one audio chunk with its timestamp (ms).
   * @returns true when accumulated silence has reached the threshold.
   */
  processChunk(chunk, timestampMs) {
    if (this.calculateEnergy(chunk) > this.energyThreshold) {
      // Speech detected — restart the silence clock.
      this.silenceStartMs = null;
      return false;
    }
    // Start timing silence on the first quiet chunk.
    this.silenceStartMs ??= timestampMs;
    return timestampMs - this.silenceStartMs >= this.silenceThresholdMs;
  }
  /** Forget any in-progress silence window. */
  reset() {
    this.silenceStartMs = null;
  }
  /** RMS energy of little-endian signed 16-bit PCM, normalized to [0, 1]. */
  calculateEnergy(chunk) {
    const sampleCount = Math.floor(chunk.length / 2);
    if (sampleCount === 0) {
      return 0;
    }
    let sumSquares = 0;
    for (let sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) {
      const byteIndex = sampleIndex * 2;
      const low = chunk[byteIndex] ?? 0;
      const high = chunk[byteIndex + 1] ?? 0;
      // Assemble and sign-extend a 16-bit sample, then normalize.
      const sample = (low | high << 8) << 16 >> 16;
      const normalized = sample / 32768;
      sumSquares += normalized * normalized;
    }
    return Math.sqrt(sumSquares / sampleCount);
  }
}
177
+
178
// src/conversational/response-orchestrator.ts
/**
 * Orchestrates one conversation turn as STT -> LLM -> TTS.
 * Fallback path for providers without native bidirectional conversation.
 */
class ResponseOrchestrator {
  stt;
  llm;
  tts;
  // { role, content } pairs; the entire history is replayed to the LLM each turn.
  conversationHistory = [];
  constructor(stt, llm, tts) {
    this.stt = stt;
    this.llm = llm;
    this.tts = tts;
  }
  /**
   * Transcribe the user's audio, generate an agent reply, synthesize it, and
   * emit the corresponding conversational events.
   *
   * NOTE(review): no user_speech_started event is emitted before
   * user_speech_ended — consumers that track open turns must tolerate
   * this; confirm intended.
   */
  async* processUserTurn(userAudio, config) {
    const transcription = await this.stt.transcribe({
      audio: userAudio,
      language: config.language,
      wordTimestamps: false
    });
    const userText = transcription.text;
    yield { type: "user_speech_ended", transcript: userText };
    yield {
      type: "transcript",
      role: "user",
      text: userText,
      timestamp: Date.now()
    };
    this.conversationHistory.push({ role: "user", content: userText });
    const llmResponse = await this.llm.chat([
      {
        role: "system",
        content: [{ type: "text", text: config.systemPrompt }]
      },
      ...this.conversationHistory.map((msg) => ({
        role: msg.role,
        content: [{ type: "text", text: msg.content }]
      }))
    ], { model: config.llmModel });
    // Use the first text part of the reply; fall back to a canned apology.
    const responseText = llmResponse.message.content.find((p) => p.type === "text");
    const agentText = responseText && responseText.type === "text" ? responseText.text : "I apologize, I could not generate a response.";
    this.conversationHistory.push({ role: "assistant", content: agentText });
    yield { type: "agent_speech_started", text: agentText };
    const synthesis = await this.tts.synthesize({
      text: agentText,
      voiceId: config.voiceId,
      language: config.language,
      format: config.outputFormat
    });
    yield { type: "agent_audio", audio: synthesis.audio.data };
    yield { type: "agent_speech_ended" };
    yield {
      type: "transcript",
      role: "agent",
      text: agentText,
      timestamp: Date.now()
    };
  }
  /** Drop accumulated conversation history. */
  reset() {
    this.conversationHistory.length = 0;
  }
}
237
+ export {
238
+ VoiceSessionManager,
239
+ TurnDetector,
240
+ TranscriptBuilder,
241
+ ResponseOrchestrator
242
+ };
@@ -0,0 +1,26 @@
1
import type { STTProvider, TTSProvider, AudioData, ConversationalEvent } from '../types';
import type { LLMProvider } from '@contractspec/lib.contracts-integrations/integrations/providers/llm';
import type { ConversationConfig } from './types';
/**
 * Orchestrate STT -> LLM -> TTS per conversation turn.
 *
 * Used as a fallback when the ConversationalProvider doesn't support
 * native bidirectional conversation.
 */
export declare class ResponseOrchestrator {
    private readonly stt;
    private readonly llm;
    private readonly tts;
    /** Accumulated user/assistant messages replayed to the LLM on each turn. */
    private readonly conversationHistory;
    constructor(stt: STTProvider, llm: LLMProvider, tts: TTSProvider);
    /**
     * Process a user's audio turn and generate an agent response.
     *
     * @param userAudio - Audio from the user's turn
     * @param config - Session configuration
     * @returns Stream of conversational events
     */
    processUserTurn(userAudio: AudioData, config: ConversationConfig): AsyncGenerator<ConversationalEvent>;
    /** Reset conversation history */
    reset(): void;
}