@contractspec/lib.voice 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173):
  1. package/dist/audio/audio-concatenator.d.ts +15 -0
  2. package/dist/audio/audio-concatenator.js +57 -0
  3. package/dist/audio/duration-estimator.d.ts +31 -0
  4. package/dist/audio/duration-estimator.js +22 -0
  5. package/dist/audio/format-converter.d.ts +17 -0
  6. package/dist/audio/format-converter.js +28 -0
  7. package/dist/audio/index.d.ts +4 -0
  8. package/dist/audio/index.js +121 -0
  9. package/dist/audio/silence-generator.d.ts +16 -0
  10. package/dist/audio/silence-generator.js +20 -0
  11. package/dist/browser/audio/audio-concatenator.js +56 -0
  12. package/dist/browser/audio/duration-estimator.js +21 -0
  13. package/dist/browser/audio/format-converter.js +27 -0
  14. package/dist/browser/audio/index.js +120 -0
  15. package/dist/browser/audio/silence-generator.js +19 -0
  16. package/dist/browser/conversational/index.js +241 -0
  17. package/dist/browser/conversational/response-orchestrator.js +62 -0
  18. package/dist/browser/conversational/transcript-builder.js +63 -0
  19. package/dist/browser/conversational/turn-detector.js +43 -0
  20. package/dist/browser/conversational/types.js +0 -0
  21. package/dist/browser/conversational/voice-session-manager.js +137 -0
  22. package/dist/browser/docs/conversational.docblock.js +5 -0
  23. package/dist/browser/docs/stt.docblock.js +5 -0
  24. package/dist/browser/docs/sync.docblock.js +5 -0
  25. package/dist/browser/docs/tts.docblock.js +5 -0
  26. package/dist/browser/docs/voice.docblock.js +5 -0
  27. package/dist/browser/i18n/catalogs/en.js +91 -0
  28. package/dist/browser/i18n/catalogs/es.js +91 -0
  29. package/dist/browser/i18n/catalogs/fr.js +91 -0
  30. package/dist/browser/i18n/catalogs/index.js +271 -0
  31. package/dist/browser/i18n/index.js +335 -0
  32. package/dist/browser/i18n/keys.js +38 -0
  33. package/dist/browser/i18n/locale.js +13 -0
  34. package/dist/browser/i18n/messages.js +283 -0
  35. package/dist/browser/index.js +1070 -0
  36. package/dist/browser/stt/diarization-mapper.js +42 -0
  37. package/dist/browser/stt/index.js +222 -0
  38. package/dist/browser/stt/segment-splitter.js +36 -0
  39. package/dist/browser/stt/subtitle-formatter.js +51 -0
  40. package/dist/browser/stt/transcriber.js +219 -0
  41. package/dist/browser/stt/types.js +0 -0
  42. package/dist/browser/sync/duration-negotiator.js +69 -0
  43. package/dist/browser/sync/index.js +165 -0
  44. package/dist/browser/sync/scene-adapter.js +52 -0
  45. package/dist/browser/sync/timing-calculator.js +46 -0
  46. package/dist/browser/tts/audio-assembler.js +120 -0
  47. package/dist/browser/tts/emphasis-planner.js +134 -0
  48. package/dist/browser/tts/index.js +439 -0
  49. package/dist/browser/tts/pace-analyzer.js +67 -0
  50. package/dist/browser/tts/segment-synthesizer.js +36 -0
  51. package/dist/browser/tts/types.js +0 -0
  52. package/dist/browser/tts/voice-synthesizer.js +435 -0
  53. package/dist/browser/types.js +0 -0
  54. package/dist/conversational/index.d.ts +5 -0
  55. package/dist/conversational/index.js +242 -0
  56. package/dist/conversational/response-orchestrator.d.ts +26 -0
  57. package/dist/conversational/response-orchestrator.js +63 -0
  58. package/dist/conversational/transcript-builder.d.ts +25 -0
  59. package/dist/conversational/transcript-builder.js +64 -0
  60. package/dist/conversational/turn-detector.d.ts +31 -0
  61. package/dist/conversational/turn-detector.js +44 -0
  62. package/dist/conversational/types.d.ts +55 -0
  63. package/dist/conversational/types.js +1 -0
  64. package/dist/conversational/voice-session-manager.d.ts +17 -0
  65. package/dist/conversational/voice-session-manager.js +138 -0
  66. package/dist/docs/conversational.docblock.d.ts +14 -0
  67. package/dist/docs/conversational.docblock.js +6 -0
  68. package/dist/docs/stt.docblock.d.ts +12 -0
  69. package/dist/docs/stt.docblock.js +6 -0
  70. package/dist/docs/sync.docblock.d.ts +12 -0
  71. package/dist/docs/sync.docblock.js +6 -0
  72. package/dist/docs/tts.docblock.d.ts +12 -0
  73. package/dist/docs/tts.docblock.js +6 -0
  74. package/dist/docs/voice.docblock.d.ts +22 -0
  75. package/dist/docs/voice.docblock.js +6 -0
  76. package/dist/i18n/catalogs/en.d.ts +6 -0
  77. package/dist/i18n/catalogs/en.js +92 -0
  78. package/dist/i18n/catalogs/es.d.ts +4 -0
  79. package/dist/i18n/catalogs/es.js +92 -0
  80. package/dist/i18n/catalogs/fr.d.ts +4 -0
  81. package/dist/i18n/catalogs/fr.js +92 -0
  82. package/dist/i18n/catalogs/index.d.ts +3 -0
  83. package/dist/i18n/catalogs/index.js +272 -0
  84. package/dist/i18n/index.d.ts +20 -0
  85. package/dist/i18n/index.js +336 -0
  86. package/dist/i18n/keys.d.ts +50 -0
  87. package/dist/i18n/keys.js +39 -0
  88. package/dist/i18n/locale.d.ts +6 -0
  89. package/dist/i18n/locale.js +14 -0
  90. package/dist/i18n/messages.d.ts +13 -0
  91. package/dist/i18n/messages.js +284 -0
  92. package/dist/index.d.ts +6 -0
  93. package/dist/index.js +1071 -0
  94. package/dist/node/audio/audio-concatenator.js +56 -0
  95. package/dist/node/audio/duration-estimator.js +21 -0
  96. package/dist/node/audio/format-converter.js +27 -0
  97. package/dist/node/audio/index.js +120 -0
  98. package/dist/node/audio/silence-generator.js +19 -0
  99. package/dist/node/conversational/index.js +241 -0
  100. package/dist/node/conversational/response-orchestrator.js +62 -0
  101. package/dist/node/conversational/transcript-builder.js +63 -0
  102. package/dist/node/conversational/turn-detector.js +43 -0
  103. package/dist/node/conversational/types.js +0 -0
  104. package/dist/node/conversational/voice-session-manager.js +137 -0
  105. package/dist/node/docs/conversational.docblock.js +5 -0
  106. package/dist/node/docs/stt.docblock.js +5 -0
  107. package/dist/node/docs/sync.docblock.js +5 -0
  108. package/dist/node/docs/tts.docblock.js +5 -0
  109. package/dist/node/docs/voice.docblock.js +5 -0
  110. package/dist/node/i18n/catalogs/en.js +91 -0
  111. package/dist/node/i18n/catalogs/es.js +91 -0
  112. package/dist/node/i18n/catalogs/fr.js +91 -0
  113. package/dist/node/i18n/catalogs/index.js +271 -0
  114. package/dist/node/i18n/index.js +335 -0
  115. package/dist/node/i18n/keys.js +38 -0
  116. package/dist/node/i18n/locale.js +13 -0
  117. package/dist/node/i18n/messages.js +283 -0
  118. package/dist/node/index.js +1070 -0
  119. package/dist/node/stt/diarization-mapper.js +42 -0
  120. package/dist/node/stt/index.js +222 -0
  121. package/dist/node/stt/segment-splitter.js +36 -0
  122. package/dist/node/stt/subtitle-formatter.js +51 -0
  123. package/dist/node/stt/transcriber.js +219 -0
  124. package/dist/node/stt/types.js +0 -0
  125. package/dist/node/sync/duration-negotiator.js +69 -0
  126. package/dist/node/sync/index.js +165 -0
  127. package/dist/node/sync/scene-adapter.js +52 -0
  128. package/dist/node/sync/timing-calculator.js +46 -0
  129. package/dist/node/tts/audio-assembler.js +120 -0
  130. package/dist/node/tts/emphasis-planner.js +134 -0
  131. package/dist/node/tts/index.js +439 -0
  132. package/dist/node/tts/pace-analyzer.js +67 -0
  133. package/dist/node/tts/segment-synthesizer.js +36 -0
  134. package/dist/node/tts/types.js +0 -0
  135. package/dist/node/tts/voice-synthesizer.js +435 -0
  136. package/dist/node/types.js +0 -0
  137. package/dist/stt/diarization-mapper.d.ts +19 -0
  138. package/dist/stt/diarization-mapper.js +43 -0
  139. package/dist/stt/index.d.ts +5 -0
  140. package/dist/stt/index.js +223 -0
  141. package/dist/stt/segment-splitter.d.ts +19 -0
  142. package/dist/stt/segment-splitter.js +37 -0
  143. package/dist/stt/subtitle-formatter.d.ts +19 -0
  144. package/dist/stt/subtitle-formatter.js +52 -0
  145. package/dist/stt/transcriber.d.ts +21 -0
  146. package/dist/stt/transcriber.js +220 -0
  147. package/dist/stt/types.d.ts +44 -0
  148. package/dist/stt/types.js +1 -0
  149. package/dist/sync/duration-negotiator.d.ts +37 -0
  150. package/dist/sync/duration-negotiator.js +70 -0
  151. package/dist/sync/index.d.ts +3 -0
  152. package/dist/sync/index.js +166 -0
  153. package/dist/sync/scene-adapter.d.ts +29 -0
  154. package/dist/sync/scene-adapter.js +53 -0
  155. package/dist/sync/timing-calculator.d.ts +21 -0
  156. package/dist/sync/timing-calculator.js +47 -0
  157. package/dist/tts/audio-assembler.d.ts +19 -0
  158. package/dist/tts/audio-assembler.js +121 -0
  159. package/dist/tts/emphasis-planner.d.ts +24 -0
  160. package/dist/tts/emphasis-planner.js +135 -0
  161. package/dist/tts/index.d.ts +6 -0
  162. package/dist/tts/index.js +440 -0
  163. package/dist/tts/pace-analyzer.d.ts +30 -0
  164. package/dist/tts/pace-analyzer.js +68 -0
  165. package/dist/tts/segment-synthesizer.d.ts +21 -0
  166. package/dist/tts/segment-synthesizer.js +37 -0
  167. package/dist/tts/types.d.ts +76 -0
  168. package/dist/tts/types.js +1 -0
  169. package/dist/tts/voice-synthesizer.d.ts +28 -0
  170. package/dist/tts/voice-synthesizer.js +436 -0
  171. package/dist/types.d.ts +12 -0
  172. package/dist/types.js +1 -0
  173. package/package.json +760 -0
@@ -0,0 +1,42 @@
1
+ // src/stt/diarization-mapper.ts
2
/**
 * Assigns display labels ("Speaker 1", "Speaker 2", ...) to diarized
 * transcript segments and aggregates per-speaker statistics.
 */
class DiarizationMapper {
  /**
   * Labels every segment that carries a truthy `speakerId` and summarises
   * each speaker. Speakers are numbered in order of first appearance.
   *
   * @param {Array<object>} segments - Segments exposing `speakerId`,
   *   `startMs` and `endMs`. Segments without a truthy `speakerId` are
   *   passed through unchanged (same object reference).
   * @param {string} [labelPrefix="Speaker"] - Text placed before the
   *   1-based speaker number.
   * @returns {{segments: Array<object>, speakers: Array<object>}} Labeled
   *   copies of the segments (input objects are not mutated) plus one
   *   `{ id, label, segmentCount, totalSpeakingMs }` entry per speaker.
   */
  map(segments, labelPrefix = "Speaker") {
    // A Map keeps insertion order, so it serves both as the O(1) lookup
    // and as the "order of first appearance" list.
    const speakersById = new Map();
    const labeledSegments = [];

    for (const seg of segments) {
      if (!seg.speakerId) {
        labeledSegments.push(seg);
        continue;
      }

      let speaker = speakersById.get(seg.speakerId);
      if (speaker === undefined) {
        speaker = {
          id: seg.speakerId,
          label: `${labelPrefix} ${speakersById.size + 1}`,
          segmentCount: 0,
          totalSpeakingMs: 0,
        };
        speakersById.set(seg.speakerId, speaker);
      }

      speaker.segmentCount += 1;
      speaker.totalSpeakingMs += seg.endMs - seg.startMs;
      labeledSegments.push({ ...seg, speakerLabel: speaker.label });
    }

    return { segments: labeledSegments, speakers: [...speakersById.values()] };
  }
}
40
+ export {
41
+ DiarizationMapper
42
+ };
@@ -0,0 +1,222 @@
1
+ // src/stt/segment-splitter.ts
2
/**
 * Splits an audio payload into consecutive chunks of bounded duration by
 * slicing its byte buffer proportionally to time.
 */
class SegmentSplitter {
  /** Default upper bound for a single chunk: five minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Splits `audio` into chunks no longer than `maxChunkMs`.
   *
   * @param {object} audio - Audio descriptor exposing `data` (a buffer with
   *   `length` and `slice`), `format`, `sampleRateHz`, and optionally
   *   `durationMs` and `channels`.
   * @param {number} [maxChunkMs=SegmentSplitter.DEFAULT_MAX_CHUNK_MS] -
   *   Maximum chunk duration in milliseconds.
   * @returns {Array<object>} `[audio]` itself when no split is needed or the
   *   duration cannot be determined; otherwise new chunk descriptors in order.
   * @throws {RangeError} When a split is required but `maxChunkMs` is not a
   *   positive number (the loop below could never advance).
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);

    // An unknown duration (NaN/Infinity, e.g. missing or zero sample rate)
    // cannot be split proportionally; hand the audio back untouched instead
    // of dropping it or looping forever.
    if (!Number.isFinite(totalDurationMs) || totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    if (!(maxChunkMs > 0)) {
      throw new RangeError(`maxChunkMs must be a positive number, received ${maxChunkMs}`);
    }

    const chunks = [];
    const bytesPerMs = audio.data.length / Math.max(totalDurationMs, 1);
    let offsetMs = 0;
    while (offsetMs < totalDurationMs) {
      const chunkDurationMs = Math.min(maxChunkMs, totalDurationMs - offsetMs);
      const startByte = Math.floor(offsetMs * bytesPerMs);
      const endByte = Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels,
      });
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }

  /**
   * Estimates the duration from the buffer size, assuming 2 bytes per sample.
   * NOTE(review): the 2-byte assumption only holds for 16-bit PCM-like data;
   * confirm callers supply `durationMs` for other formats.
   *
   * @param {object} audio - Audio descriptor (`data`, `sampleRateHz`, optional `channels`).
   * @returns {number} Duration in milliseconds, rounded up; not finite when
   *   `sampleRateHz` is missing or zero.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil((totalSamples / audio.sampleRateHz) * 1000);
  }
}
34
+
35
+ // src/stt/diarization-mapper.ts
36
/**
 * Assigns display labels ("Speaker 1", "Speaker 2", ...) to diarized
 * transcript segments and aggregates per-speaker statistics.
 */
class DiarizationMapper {
  /**
   * Labels every segment that carries a truthy `speakerId` and summarises
   * each speaker. Speakers are numbered in order of first appearance.
   *
   * @param {Array<object>} segments - Segments exposing `speakerId`,
   *   `startMs` and `endMs`. Segments without a truthy `speakerId` are
   *   passed through unchanged (same object reference).
   * @param {string} [labelPrefix="Speaker"] - Text placed before the
   *   1-based speaker number.
   * @returns {{segments: Array<object>, speakers: Array<object>}} Labeled
   *   copies of the segments (input objects are not mutated) plus one
   *   `{ id, label, segmentCount, totalSpeakingMs }` entry per speaker.
   */
  map(segments, labelPrefix = "Speaker") {
    // A Map keeps insertion order, so it serves both as the O(1) lookup
    // and as the "order of first appearance" list.
    const speakersById = new Map();
    const labeledSegments = [];

    for (const seg of segments) {
      if (!seg.speakerId) {
        labeledSegments.push(seg);
        continue;
      }

      let speaker = speakersById.get(seg.speakerId);
      if (speaker === undefined) {
        speaker = {
          id: seg.speakerId,
          label: `${labelPrefix} ${speakersById.size + 1}`,
          segmentCount: 0,
          totalSpeakingMs: 0,
        };
        speakersById.set(seg.speakerId, speaker);
      }

      speaker.segmentCount += 1;
      speaker.totalSpeakingMs += seg.endMs - seg.startMs;
      labeledSegments.push({ ...seg, speakerLabel: speaker.label });
    }

    return { segments: labeledSegments, speakers: [...speakersById.values()] };
  }
}
74
+
75
+ // src/stt/subtitle-formatter.ts
76
/**
 * Renders transcript segments as SubRip (SRT) or WebVTT subtitle text.
 */
class SubtitleFormatter {
  /**
   * Builds an SRT document: numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments exposing `startMs`, `endMs`,
   *   `text` and optionally `speakerLabel` (rendered as a `[label] ` prefix).
   * @returns {string} SRT text; an empty string when there are no segments.
   */
  toSRT(segments) {
    return segments
      .map((seg, i) => {
        const start = this.formatTimeSRT(seg.startMs);
        const end = this.formatTimeSRT(seg.endMs);
        const label = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
        return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
      })
      .join("\n\n");
  }

  /**
   * Builds a WebVTT document: the `WEBVTT` header, a blank line, then
   * numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments exposing `startMs`, `endMs`,
   *   `text` and optionally `speakerLabel` (rendered as a `<v label>` tag).
   * @returns {string} WebVTT text; just the header when there are no segments.
   */
  toVTT(segments) {
    const header = "WEBVTT\n\n";
    const cues = segments
      .map((seg, i) => {
        const start = this.formatTimeVTT(seg.startMs);
        const end = this.formatTimeVTT(seg.endMs);
        const label = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
        return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
      })
      .join("\n\n");
    return header + cues;
  }

  /**
   * Formats milliseconds as an SRT timestamp, `HH:MM:SS,mmm`.
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }

  /**
   * Formats milliseconds as a WebVTT timestamp, `HH:MM:SS.mmm`.
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }

  /**
   * Shared timestamp formatter. The input is rounded to whole milliseconds
   * and clamped at zero so fractional or negative offsets cannot produce a
   * malformed field such as `234.5` or `-05`.
   *
   * @param {number} ms - Offset in milliseconds.
   * @param {string} separator - Character between seconds and milliseconds.
   * @returns {string} `HH:MM:SS<separator>mmm`.
   */
  formatTimestamp(ms, separator) {
    const totalMs = Math.max(0, Math.round(ms));
    const hours = Math.floor(totalMs / 3600000);
    const minutes = Math.floor((totalMs % 3600000) / 60000);
    const seconds = Math.floor((totalMs % 60000) / 1000);
    const millis = totalMs % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${separator}${this.pad(millis, 3)}`;
  }

  /**
   * Left-pads a number with zeros.
   * @param {number} value - Value to render.
   * @param {number} length - Minimum width of the result.
   * @returns {string} The zero-padded decimal string.
   */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
123
+
124
+ // src/stt/transcriber.ts
125
/**
 * Orchestrates speech-to-text: splits the audio into chunks, transcribes each
 * chunk through the injected provider, stitches the results onto a single
 * timeline, and optionally applies speaker labels and subtitle rendering.
 */
class Transcriber {
  stt;
  segmentSplitter = new SegmentSplitter();
  diarizationMapper = new DiarizationMapper();
  subtitleFormatter = new SubtitleFormatter();

  /**
   * @param {object} options - Construction options.
   * @param {object} options.stt - Provider exposing `transcribe(request)` and,
   *   optionally, `transcribeStream(audio, options)`.
   */
  constructor(options) {
    this.stt = options.stt;
  }

  /**
   * Transcribes `brief.audio` chunk by chunk and merges the results.
   *
   * @param {object} brief - Request exposing `audio` and optionally
   *   `language`, `diarize`, `speakerCount`, `vocabularyHints` and
   *   `subtitleFormat` (`"srt"`, `"vtt"`; anything else yields no subtitles).
   * @returns {Promise<object>} `{ id, transcript, subtitles, speakers }`;
   *   `subtitles` and `speakers` are `undefined` when not requested.
   */
  async transcribe(brief) {
    const projectId = generateProjectId();
    const chunks = this.segmentSplitter.split(brief.audio);
    const allSegments = [];
    const textParts = [];
    let totalDurationMs = 0;
    let offsetMs = 0;

    // Chunks are transcribed one after another so each chunk's segments can
    // be shifted by the accumulated duration of the chunks before it.
    for (const chunk of chunks) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints,
      });

      // Push one by one: spreading a very large array into push() can exceed
      // the engine's argument limit.
      for (const seg of result.segments) {
        allSegments.push({
          text: seg.text,
          startMs: seg.startMs + offsetMs,
          endMs: seg.endMs + offsetMs,
          speakerId: seg.speakerId,
          speakerName: seg.speakerName,
          confidence: seg.confidence,
        });
      }

      // Skip empty chunk texts so the joined transcript has no stray spaces.
      if (result.text) {
        textParts.push(result.text);
      }

      // Fall back to the chunk's own duration when the provider omits one;
      // adding `undefined` would turn the total into NaN.
      totalDurationMs += result.durationMs ?? chunk.durationMs ?? 0;
      offsetMs += chunk.durationMs ?? 0;
    }

    let mappedSegments = allSegments;
    let speakers;
    if (brief.diarize) {
      const mapping = this.diarizationMapper.map(allSegments);
      mappedSegments = mapping.segments;
      speakers = mapping.speakers;
    }

    const transcript = {
      text: textParts.join(" "),
      segments: mappedSegments,
      language: brief.language ?? "en",
      durationMs: totalDurationMs,
    };

    let subtitles;
    const format = brief.subtitleFormat ?? "none";
    if (format === "srt") {
      subtitles = this.subtitleFormatter.toSRT(mappedSegments);
    } else if (format === "vtt") {
      subtitles = this.subtitleFormatter.toVTT(mappedSegments);
    }

    return {
      id: projectId,
      transcript,
      subtitles,
      speakers,
    };
  }

  /**
   * Streams segments from the provider as they become available.
   *
   * @param {*} audio - Audio source forwarded unchanged to the provider.
   * @param {object} [options] - Optional `language`, `diarize`,
   *   `speakerCount` and `vocabularyHints`.
   * @yields {object} `{ text, startMs, endMs, speakerId, speakerLabel, confidence }`.
   * @throws {Error} On first iteration when the provider has no
   *   `transcribeStream` method.
   */
  async *transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const stream = this.stt.transcribeStream(audio, {
      language: options?.language,
      diarize: options?.diarize,
      speakerCount: options?.speakerCount,
      wordTimestamps: true,
      vocabularyHints: options?.vocabularyHints,
    });
    for await (const segment of stream) {
      yield {
        text: segment.text,
        startMs: segment.startMs,
        endMs: segment.endMs,
        speakerId: segment.speakerId,
        // NOTE(review): the streaming path exposes the provider's speakerName
        // as speakerLabel, while transcribe() keeps it as speakerName; confirm
        // this difference is intended.
        speakerLabel: segment.speakerName,
        confidence: segment.confidence,
      };
    }
  }
}
212
/**
 * Builds a transcription project id of the form `stt_<timestamp>_<random>`.
 * Both parts are base-36; the random part is always exactly six characters.
 * Not cryptographically secure - it relies on Math.random().
 *
 * @returns {string} The generated id.
 */
function generateProjectId() {
  const timestamp = Date.now().toString(36);
  // Math.random() can serialise to fewer than six fractional digits
  // (0.5 becomes "0.i"), so pad to keep the id length stable.
  const random = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `stt_${timestamp}_${random}`;
}
217
+ export {
218
+ Transcriber,
219
+ SubtitleFormatter,
220
+ SegmentSplitter,
221
+ DiarizationMapper
222
+ };
@@ -0,0 +1,36 @@
1
+ // src/stt/segment-splitter.ts
2
/**
 * Splits an audio payload into consecutive chunks of bounded duration by
 * slicing its byte buffer proportionally to time.
 */
class SegmentSplitter {
  /** Default upper bound for a single chunk: five minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Splits `audio` into chunks no longer than `maxChunkMs`.
   *
   * @param {object} audio - Audio descriptor exposing `data` (a buffer with
   *   `length` and `slice`), `format`, `sampleRateHz`, and optionally
   *   `durationMs` and `channels`.
   * @param {number} [maxChunkMs=SegmentSplitter.DEFAULT_MAX_CHUNK_MS] -
   *   Maximum chunk duration in milliseconds.
   * @returns {Array<object>} `[audio]` itself when no split is needed or the
   *   duration cannot be determined; otherwise new chunk descriptors in order.
   * @throws {RangeError} When a split is required but `maxChunkMs` is not a
   *   positive number (the loop below could never advance).
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);

    // An unknown duration (NaN/Infinity, e.g. missing or zero sample rate)
    // cannot be split proportionally; hand the audio back untouched instead
    // of dropping it or looping forever.
    if (!Number.isFinite(totalDurationMs) || totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    if (!(maxChunkMs > 0)) {
      throw new RangeError(`maxChunkMs must be a positive number, received ${maxChunkMs}`);
    }

    const chunks = [];
    const bytesPerMs = audio.data.length / Math.max(totalDurationMs, 1);
    let offsetMs = 0;
    while (offsetMs < totalDurationMs) {
      const chunkDurationMs = Math.min(maxChunkMs, totalDurationMs - offsetMs);
      const startByte = Math.floor(offsetMs * bytesPerMs);
      const endByte = Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels,
      });
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }

  /**
   * Estimates the duration from the buffer size, assuming 2 bytes per sample.
   * NOTE(review): the 2-byte assumption only holds for 16-bit PCM-like data;
   * confirm callers supply `durationMs` for other formats.
   *
   * @param {object} audio - Audio descriptor (`data`, `sampleRateHz`, optional `channels`).
   * @returns {number} Duration in milliseconds, rounded up; not finite when
   *   `sampleRateHz` is missing or zero.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil((totalSamples / audio.sampleRateHz) * 1000);
  }
}
34
+ export {
35
+ SegmentSplitter
36
+ };
@@ -0,0 +1,51 @@
1
+ // src/stt/subtitle-formatter.ts
2
/**
 * Renders transcript segments as SubRip (SRT) or WebVTT subtitle text.
 */
class SubtitleFormatter {
  /**
   * Builds an SRT document: numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments exposing `startMs`, `endMs`,
   *   `text` and optionally `speakerLabel` (rendered as a `[label] ` prefix).
   * @returns {string} SRT text; an empty string when there are no segments.
   */
  toSRT(segments) {
    return segments
      .map((seg, i) => {
        const start = this.formatTimeSRT(seg.startMs);
        const end = this.formatTimeSRT(seg.endMs);
        const label = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
        return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
      })
      .join("\n\n");
  }

  /**
   * Builds a WebVTT document: the `WEBVTT` header, a blank line, then
   * numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments exposing `startMs`, `endMs`,
   *   `text` and optionally `speakerLabel` (rendered as a `<v label>` tag).
   * @returns {string} WebVTT text; just the header when there are no segments.
   */
  toVTT(segments) {
    const header = "WEBVTT\n\n";
    const cues = segments
      .map((seg, i) => {
        const start = this.formatTimeVTT(seg.startMs);
        const end = this.formatTimeVTT(seg.endMs);
        const label = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
        return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
      })
      .join("\n\n");
    return header + cues;
  }

  /**
   * Formats milliseconds as an SRT timestamp, `HH:MM:SS,mmm`.
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }

  /**
   * Formats milliseconds as a WebVTT timestamp, `HH:MM:SS.mmm`.
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }

  /**
   * Shared timestamp formatter. The input is rounded to whole milliseconds
   * and clamped at zero so fractional or negative offsets cannot produce a
   * malformed field such as `234.5` or `-05`.
   *
   * @param {number} ms - Offset in milliseconds.
   * @param {string} separator - Character between seconds and milliseconds.
   * @returns {string} `HH:MM:SS<separator>mmm`.
   */
  formatTimestamp(ms, separator) {
    const totalMs = Math.max(0, Math.round(ms));
    const hours = Math.floor(totalMs / 3600000);
    const minutes = Math.floor((totalMs % 3600000) / 60000);
    const seconds = Math.floor((totalMs % 60000) / 1000);
    const millis = totalMs % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${separator}${this.pad(millis, 3)}`;
  }

  /**
   * Left-pads a number with zeros.
   * @param {number} value - Value to render.
   * @param {number} length - Minimum width of the result.
   * @returns {string} The zero-padded decimal string.
   */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
49
+ export {
50
+ SubtitleFormatter
51
+ };
@@ -0,0 +1,219 @@
1
+ // src/stt/segment-splitter.ts
2
/**
 * Splits an audio payload into consecutive chunks of bounded duration by
 * slicing its byte buffer proportionally to time.
 */
class SegmentSplitter {
  /** Default upper bound for a single chunk: five minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Splits `audio` into chunks no longer than `maxChunkMs`.
   *
   * @param {object} audio - Audio descriptor exposing `data` (a buffer with
   *   `length` and `slice`), `format`, `sampleRateHz`, and optionally
   *   `durationMs` and `channels`.
   * @param {number} [maxChunkMs=SegmentSplitter.DEFAULT_MAX_CHUNK_MS] -
   *   Maximum chunk duration in milliseconds.
   * @returns {Array<object>} `[audio]` itself when no split is needed or the
   *   duration cannot be determined; otherwise new chunk descriptors in order.
   * @throws {RangeError} When a split is required but `maxChunkMs` is not a
   *   positive number (the loop below could never advance).
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);

    // An unknown duration (NaN/Infinity, e.g. missing or zero sample rate)
    // cannot be split proportionally; hand the audio back untouched instead
    // of dropping it or looping forever.
    if (!Number.isFinite(totalDurationMs) || totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    if (!(maxChunkMs > 0)) {
      throw new RangeError(`maxChunkMs must be a positive number, received ${maxChunkMs}`);
    }

    const chunks = [];
    const bytesPerMs = audio.data.length / Math.max(totalDurationMs, 1);
    let offsetMs = 0;
    while (offsetMs < totalDurationMs) {
      const chunkDurationMs = Math.min(maxChunkMs, totalDurationMs - offsetMs);
      const startByte = Math.floor(offsetMs * bytesPerMs);
      const endByte = Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels,
      });
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }

  /**
   * Estimates the duration from the buffer size, assuming 2 bytes per sample.
   * NOTE(review): the 2-byte assumption only holds for 16-bit PCM-like data;
   * confirm callers supply `durationMs` for other formats.
   *
   * @param {object} audio - Audio descriptor (`data`, `sampleRateHz`, optional `channels`).
   * @returns {number} Duration in milliseconds, rounded up; not finite when
   *   `sampleRateHz` is missing or zero.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil((totalSamples / audio.sampleRateHz) * 1000);
  }
}
34
+
35
+ // src/stt/diarization-mapper.ts
36
/**
 * Assigns display labels ("Speaker 1", "Speaker 2", ...) to diarized
 * transcript segments and aggregates per-speaker statistics.
 */
class DiarizationMapper {
  /**
   * Labels every segment that carries a truthy `speakerId` and summarises
   * each speaker. Speakers are numbered in order of first appearance.
   *
   * @param {Array<object>} segments - Segments exposing `speakerId`,
   *   `startMs` and `endMs`. Segments without a truthy `speakerId` are
   *   passed through unchanged (same object reference).
   * @param {string} [labelPrefix="Speaker"] - Text placed before the
   *   1-based speaker number.
   * @returns {{segments: Array<object>, speakers: Array<object>}} Labeled
   *   copies of the segments (input objects are not mutated) plus one
   *   `{ id, label, segmentCount, totalSpeakingMs }` entry per speaker.
   */
  map(segments, labelPrefix = "Speaker") {
    // A Map keeps insertion order, so it serves both as the O(1) lookup
    // and as the "order of first appearance" list.
    const speakersById = new Map();
    const labeledSegments = [];

    for (const seg of segments) {
      if (!seg.speakerId) {
        labeledSegments.push(seg);
        continue;
      }

      let speaker = speakersById.get(seg.speakerId);
      if (speaker === undefined) {
        speaker = {
          id: seg.speakerId,
          label: `${labelPrefix} ${speakersById.size + 1}`,
          segmentCount: 0,
          totalSpeakingMs: 0,
        };
        speakersById.set(seg.speakerId, speaker);
      }

      speaker.segmentCount += 1;
      speaker.totalSpeakingMs += seg.endMs - seg.startMs;
      labeledSegments.push({ ...seg, speakerLabel: speaker.label });
    }

    return { segments: labeledSegments, speakers: [...speakersById.values()] };
  }
}
74
+
75
+ // src/stt/subtitle-formatter.ts
76
/**
 * Renders transcript segments as SubRip (SRT) or WebVTT subtitle text.
 */
class SubtitleFormatter {
  /**
   * Builds an SRT document: numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments exposing `startMs`, `endMs`,
   *   `text` and optionally `speakerLabel` (rendered as a `[label] ` prefix).
   * @returns {string} SRT text; an empty string when there are no segments.
   */
  toSRT(segments) {
    return segments
      .map((seg, i) => {
        const start = this.formatTimeSRT(seg.startMs);
        const end = this.formatTimeSRT(seg.endMs);
        const label = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
        return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
      })
      .join("\n\n");
  }

  /**
   * Builds a WebVTT document: the `WEBVTT` header, a blank line, then
   * numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments exposing `startMs`, `endMs`,
   *   `text` and optionally `speakerLabel` (rendered as a `<v label>` tag).
   * @returns {string} WebVTT text; just the header when there are no segments.
   */
  toVTT(segments) {
    const header = "WEBVTT\n\n";
    const cues = segments
      .map((seg, i) => {
        const start = this.formatTimeVTT(seg.startMs);
        const end = this.formatTimeVTT(seg.endMs);
        const label = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
        return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
      })
      .join("\n\n");
    return header + cues;
  }

  /**
   * Formats milliseconds as an SRT timestamp, `HH:MM:SS,mmm`.
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }

  /**
   * Formats milliseconds as a WebVTT timestamp, `HH:MM:SS.mmm`.
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }

  /**
   * Shared timestamp formatter. The input is rounded to whole milliseconds
   * and clamped at zero so fractional or negative offsets cannot produce a
   * malformed field such as `234.5` or `-05`.
   *
   * @param {number} ms - Offset in milliseconds.
   * @param {string} separator - Character between seconds and milliseconds.
   * @returns {string} `HH:MM:SS<separator>mmm`.
   */
  formatTimestamp(ms, separator) {
    const totalMs = Math.max(0, Math.round(ms));
    const hours = Math.floor(totalMs / 3600000);
    const minutes = Math.floor((totalMs % 3600000) / 60000);
    const seconds = Math.floor((totalMs % 60000) / 1000);
    const millis = totalMs % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${separator}${this.pad(millis, 3)}`;
  }

  /**
   * Left-pads a number with zeros.
   * @param {number} value - Value to render.
   * @param {number} length - Minimum width of the result.
   * @returns {string} The zero-padded decimal string.
   */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
123
+
124
+ // src/stt/transcriber.ts
125
/**
 * Orchestrates speech-to-text: splits the audio into chunks, transcribes each
 * chunk through the injected provider, stitches the results onto a single
 * timeline, and optionally applies speaker labels and subtitle rendering.
 */
class Transcriber {
  stt;
  segmentSplitter = new SegmentSplitter();
  diarizationMapper = new DiarizationMapper();
  subtitleFormatter = new SubtitleFormatter();

  /**
   * @param {object} options - Construction options.
   * @param {object} options.stt - Provider exposing `transcribe(request)` and,
   *   optionally, `transcribeStream(audio, options)`.
   */
  constructor(options) {
    this.stt = options.stt;
  }

  /**
   * Transcribes `brief.audio` chunk by chunk and merges the results.
   *
   * @param {object} brief - Request exposing `audio` and optionally
   *   `language`, `diarize`, `speakerCount`, `vocabularyHints` and
   *   `subtitleFormat` (`"srt"`, `"vtt"`; anything else yields no subtitles).
   * @returns {Promise<object>} `{ id, transcript, subtitles, speakers }`;
   *   `subtitles` and `speakers` are `undefined` when not requested.
   */
  async transcribe(brief) {
    const projectId = generateProjectId();
    const chunks = this.segmentSplitter.split(brief.audio);
    const allSegments = [];
    const textParts = [];
    let totalDurationMs = 0;
    let offsetMs = 0;

    // Chunks are transcribed one after another so each chunk's segments can
    // be shifted by the accumulated duration of the chunks before it.
    for (const chunk of chunks) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints,
      });

      // Push one by one: spreading a very large array into push() can exceed
      // the engine's argument limit.
      for (const seg of result.segments) {
        allSegments.push({
          text: seg.text,
          startMs: seg.startMs + offsetMs,
          endMs: seg.endMs + offsetMs,
          speakerId: seg.speakerId,
          speakerName: seg.speakerName,
          confidence: seg.confidence,
        });
      }

      // Skip empty chunk texts so the joined transcript has no stray spaces.
      if (result.text) {
        textParts.push(result.text);
      }

      // Fall back to the chunk's own duration when the provider omits one;
      // adding `undefined` would turn the total into NaN.
      totalDurationMs += result.durationMs ?? chunk.durationMs ?? 0;
      offsetMs += chunk.durationMs ?? 0;
    }

    let mappedSegments = allSegments;
    let speakers;
    if (brief.diarize) {
      const mapping = this.diarizationMapper.map(allSegments);
      mappedSegments = mapping.segments;
      speakers = mapping.speakers;
    }

    const transcript = {
      text: textParts.join(" "),
      segments: mappedSegments,
      language: brief.language ?? "en",
      durationMs: totalDurationMs,
    };

    let subtitles;
    const format = brief.subtitleFormat ?? "none";
    if (format === "srt") {
      subtitles = this.subtitleFormatter.toSRT(mappedSegments);
    } else if (format === "vtt") {
      subtitles = this.subtitleFormatter.toVTT(mappedSegments);
    }

    return {
      id: projectId,
      transcript,
      subtitles,
      speakers,
    };
  }

  /**
   * Streams segments from the provider as they become available.
   *
   * @param {*} audio - Audio source forwarded unchanged to the provider.
   * @param {object} [options] - Optional `language`, `diarize`,
   *   `speakerCount` and `vocabularyHints`.
   * @yields {object} `{ text, startMs, endMs, speakerId, speakerLabel, confidence }`.
   * @throws {Error} On first iteration when the provider has no
   *   `transcribeStream` method.
   */
  async *transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const stream = this.stt.transcribeStream(audio, {
      language: options?.language,
      diarize: options?.diarize,
      speakerCount: options?.speakerCount,
      wordTimestamps: true,
      vocabularyHints: options?.vocabularyHints,
    });
    for await (const segment of stream) {
      yield {
        text: segment.text,
        startMs: segment.startMs,
        endMs: segment.endMs,
        speakerId: segment.speakerId,
        // NOTE(review): the streaming path exposes the provider's speakerName
        // as speakerLabel, while transcribe() keeps it as speakerName; confirm
        // this difference is intended.
        speakerLabel: segment.speakerName,
        confidence: segment.confidence,
      };
    }
  }
}
212
/**
 * Builds a transcription project id of the form `stt_<timestamp>_<random>`.
 * Both parts are base-36; the random part is always exactly six characters.
 * Not cryptographically secure - it relies on Math.random().
 *
 * @returns {string} The generated id.
 */
function generateProjectId() {
  const timestamp = Date.now().toString(36);
  // Math.random() can serialise to fewer than six fractional digits
  // (0.5 becomes "0.i"), so pad to keep the id length stable.
  const random = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `stt_${timestamp}_${random}`;
}
217
+ export {
218
+ Transcriber
219
+ };
File without changes
@@ -0,0 +1,69 @@
1
+ // src/sync/duration-negotiator.ts
2
/**
 * Compares each voice segment's length with the duration originally planned
 * for its scene and records what should happen to reconcile the two.
 */
class DurationNegotiator {
  // Voice/scene ratio above which the voice is considered too long.
  static UPPER_THRESHOLD = 1.1;
  // Voice/scene ratio below which the voice is considered too short.
  static LOWER_THRESHOLD = 0.7;
  // Bounds applied to the suggested playback rate.
  static MAX_RATE = 1.3;
  static MIN_RATE = 0.8;

  /**
   * Produces one adjustment per timing-map segment.
   *
   * @param {object} timingMap - Object whose `segments` expose `sceneId`,
   *   `durationInFrames` and `recommendedSceneDurationInFrames`.
   * @param {Map} sceneDurations - Planned scene duration (frames) by scene id.
   * @returns {{timingMap: object, adjustments: Array<object>}} A copy of the
   *   timing map with updated segments, plus the list of adjustments.
   */
  negotiate(timingMap, sceneDurations) {
    const adjustments = [];
    const negotiated = [];

    for (const segment of timingMap.segments) {
      const planned = sceneDurations.get(segment.sceneId);
      const known = planned !== undefined;

      // Single place where adjustment records are assembled; `suggestedRate`
      // is only present for the branches that compute one.
      const record = (action, finalFrames, rate) => {
        const entry = {
          sceneId: segment.sceneId,
          originalSceneDurationInFrames: known ? planned : segment.recommendedSceneDurationInFrames,
          voiceDurationInFrames: segment.durationInFrames,
          action,
        };
        if (rate !== undefined) {
          entry.suggestedRate = rate;
        }
        entry.finalSceneDurationInFrames = finalFrames;
        adjustments.push(entry);
      };

      // No planned duration for this scene: keep the recommendation as is.
      if (!known) {
        record("no_change", segment.recommendedSceneDurationInFrames);
        negotiated.push(segment);
        continue;
      }

      const ratio = segment.durationInFrames / planned;

      // Voice runs long: suggest speeding up, or extending the scene when
      // even the maximum rate would not be enough.
      if (ratio > DurationNegotiator.UPPER_THRESHOLD) {
        const tooLongForRate = ratio > DurationNegotiator.MAX_RATE;
        record(
          tooLongForRate ? "extend_scene" : "suggest_rate_change",
          segment.recommendedSceneDurationInFrames,
          Math.min(ratio, DurationNegotiator.MAX_RATE)
        );
        negotiated.push(segment);
        continue;
      }

      // Voice runs short: keep the planned scene length and pad with silence.
      if (ratio < DurationNegotiator.LOWER_THRESHOLD) {
        record("pad_silence", planned, Math.max(ratio, DurationNegotiator.MIN_RATE));
        negotiated.push({ ...segment, recommendedSceneDurationInFrames: planned });
        continue;
      }

      // Within tolerance.
      record("no_change", segment.recommendedSceneDurationInFrames);
      negotiated.push(segment);
    }

    return {
      timingMap: { ...timingMap, segments: negotiated },
      adjustments,
    };
  }
}
67
+ export {
68
+ DurationNegotiator
69
+ };