@contractspec/lib.voice 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/dist/audio/audio-concatenator.d.ts +15 -0
  2. package/dist/audio/audio-concatenator.js +57 -0
  3. package/dist/audio/duration-estimator.d.ts +31 -0
  4. package/dist/audio/duration-estimator.js +22 -0
  5. package/dist/audio/format-converter.d.ts +17 -0
  6. package/dist/audio/format-converter.js +28 -0
  7. package/dist/audio/index.d.ts +4 -0
  8. package/dist/audio/index.js +121 -0
  9. package/dist/audio/silence-generator.d.ts +16 -0
  10. package/dist/audio/silence-generator.js +20 -0
  11. package/dist/browser/audio/audio-concatenator.js +56 -0
  12. package/dist/browser/audio/duration-estimator.js +21 -0
  13. package/dist/browser/audio/format-converter.js +27 -0
  14. package/dist/browser/audio/index.js +120 -0
  15. package/dist/browser/audio/silence-generator.js +19 -0
  16. package/dist/browser/conversational/index.js +241 -0
  17. package/dist/browser/conversational/response-orchestrator.js +62 -0
  18. package/dist/browser/conversational/transcript-builder.js +63 -0
  19. package/dist/browser/conversational/turn-detector.js +43 -0
  20. package/dist/browser/conversational/types.js +0 -0
  21. package/dist/browser/conversational/voice-session-manager.js +137 -0
  22. package/dist/browser/docs/conversational.docblock.js +5 -0
  23. package/dist/browser/docs/stt.docblock.js +5 -0
  24. package/dist/browser/docs/sync.docblock.js +5 -0
  25. package/dist/browser/docs/tts.docblock.js +5 -0
  26. package/dist/browser/docs/voice.docblock.js +5 -0
  27. package/dist/browser/i18n/catalogs/en.js +91 -0
  28. package/dist/browser/i18n/catalogs/es.js +91 -0
  29. package/dist/browser/i18n/catalogs/fr.js +91 -0
  30. package/dist/browser/i18n/catalogs/index.js +271 -0
  31. package/dist/browser/i18n/index.js +335 -0
  32. package/dist/browser/i18n/keys.js +38 -0
  33. package/dist/browser/i18n/locale.js +13 -0
  34. package/dist/browser/i18n/messages.js +283 -0
  35. package/dist/browser/index.js +1070 -0
  36. package/dist/browser/stt/diarization-mapper.js +42 -0
  37. package/dist/browser/stt/index.js +222 -0
  38. package/dist/browser/stt/segment-splitter.js +36 -0
  39. package/dist/browser/stt/subtitle-formatter.js +51 -0
  40. package/dist/browser/stt/transcriber.js +219 -0
  41. package/dist/browser/stt/types.js +0 -0
  42. package/dist/browser/sync/duration-negotiator.js +69 -0
  43. package/dist/browser/sync/index.js +165 -0
  44. package/dist/browser/sync/scene-adapter.js +52 -0
  45. package/dist/browser/sync/timing-calculator.js +46 -0
  46. package/dist/browser/tts/audio-assembler.js +120 -0
  47. package/dist/browser/tts/emphasis-planner.js +134 -0
  48. package/dist/browser/tts/index.js +439 -0
  49. package/dist/browser/tts/pace-analyzer.js +67 -0
  50. package/dist/browser/tts/segment-synthesizer.js +36 -0
  51. package/dist/browser/tts/types.js +0 -0
  52. package/dist/browser/tts/voice-synthesizer.js +435 -0
  53. package/dist/browser/types.js +0 -0
  54. package/dist/conversational/index.d.ts +5 -0
  55. package/dist/conversational/index.js +242 -0
  56. package/dist/conversational/response-orchestrator.d.ts +26 -0
  57. package/dist/conversational/response-orchestrator.js +63 -0
  58. package/dist/conversational/transcript-builder.d.ts +25 -0
  59. package/dist/conversational/transcript-builder.js +64 -0
  60. package/dist/conversational/turn-detector.d.ts +31 -0
  61. package/dist/conversational/turn-detector.js +44 -0
  62. package/dist/conversational/types.d.ts +55 -0
  63. package/dist/conversational/types.js +1 -0
  64. package/dist/conversational/voice-session-manager.d.ts +17 -0
  65. package/dist/conversational/voice-session-manager.js +138 -0
  66. package/dist/docs/conversational.docblock.d.ts +14 -0
  67. package/dist/docs/conversational.docblock.js +6 -0
  68. package/dist/docs/stt.docblock.d.ts +12 -0
  69. package/dist/docs/stt.docblock.js +6 -0
  70. package/dist/docs/sync.docblock.d.ts +12 -0
  71. package/dist/docs/sync.docblock.js +6 -0
  72. package/dist/docs/tts.docblock.d.ts +12 -0
  73. package/dist/docs/tts.docblock.js +6 -0
  74. package/dist/docs/voice.docblock.d.ts +22 -0
  75. package/dist/docs/voice.docblock.js +6 -0
  76. package/dist/i18n/catalogs/en.d.ts +6 -0
  77. package/dist/i18n/catalogs/en.js +92 -0
  78. package/dist/i18n/catalogs/es.d.ts +4 -0
  79. package/dist/i18n/catalogs/es.js +92 -0
  80. package/dist/i18n/catalogs/fr.d.ts +4 -0
  81. package/dist/i18n/catalogs/fr.js +92 -0
  82. package/dist/i18n/catalogs/index.d.ts +3 -0
  83. package/dist/i18n/catalogs/index.js +272 -0
  84. package/dist/i18n/index.d.ts +20 -0
  85. package/dist/i18n/index.js +336 -0
  86. package/dist/i18n/keys.d.ts +50 -0
  87. package/dist/i18n/keys.js +39 -0
  88. package/dist/i18n/locale.d.ts +6 -0
  89. package/dist/i18n/locale.js +14 -0
  90. package/dist/i18n/messages.d.ts +13 -0
  91. package/dist/i18n/messages.js +284 -0
  92. package/dist/index.d.ts +6 -0
  93. package/dist/index.js +1071 -0
  94. package/dist/node/audio/audio-concatenator.js +56 -0
  95. package/dist/node/audio/duration-estimator.js +21 -0
  96. package/dist/node/audio/format-converter.js +27 -0
  97. package/dist/node/audio/index.js +120 -0
  98. package/dist/node/audio/silence-generator.js +19 -0
  99. package/dist/node/conversational/index.js +241 -0
  100. package/dist/node/conversational/response-orchestrator.js +62 -0
  101. package/dist/node/conversational/transcript-builder.js +63 -0
  102. package/dist/node/conversational/turn-detector.js +43 -0
  103. package/dist/node/conversational/types.js +0 -0
  104. package/dist/node/conversational/voice-session-manager.js +137 -0
  105. package/dist/node/docs/conversational.docblock.js +5 -0
  106. package/dist/node/docs/stt.docblock.js +5 -0
  107. package/dist/node/docs/sync.docblock.js +5 -0
  108. package/dist/node/docs/tts.docblock.js +5 -0
  109. package/dist/node/docs/voice.docblock.js +5 -0
  110. package/dist/node/i18n/catalogs/en.js +91 -0
  111. package/dist/node/i18n/catalogs/es.js +91 -0
  112. package/dist/node/i18n/catalogs/fr.js +91 -0
  113. package/dist/node/i18n/catalogs/index.js +271 -0
  114. package/dist/node/i18n/index.js +335 -0
  115. package/dist/node/i18n/keys.js +38 -0
  116. package/dist/node/i18n/locale.js +13 -0
  117. package/dist/node/i18n/messages.js +283 -0
  118. package/dist/node/index.js +1070 -0
  119. package/dist/node/stt/diarization-mapper.js +42 -0
  120. package/dist/node/stt/index.js +222 -0
  121. package/dist/node/stt/segment-splitter.js +36 -0
  122. package/dist/node/stt/subtitle-formatter.js +51 -0
  123. package/dist/node/stt/transcriber.js +219 -0
  124. package/dist/node/stt/types.js +0 -0
  125. package/dist/node/sync/duration-negotiator.js +69 -0
  126. package/dist/node/sync/index.js +165 -0
  127. package/dist/node/sync/scene-adapter.js +52 -0
  128. package/dist/node/sync/timing-calculator.js +46 -0
  129. package/dist/node/tts/audio-assembler.js +120 -0
  130. package/dist/node/tts/emphasis-planner.js +134 -0
  131. package/dist/node/tts/index.js +439 -0
  132. package/dist/node/tts/pace-analyzer.js +67 -0
  133. package/dist/node/tts/segment-synthesizer.js +36 -0
  134. package/dist/node/tts/types.js +0 -0
  135. package/dist/node/tts/voice-synthesizer.js +435 -0
  136. package/dist/node/types.js +0 -0
  137. package/dist/stt/diarization-mapper.d.ts +19 -0
  138. package/dist/stt/diarization-mapper.js +43 -0
  139. package/dist/stt/index.d.ts +5 -0
  140. package/dist/stt/index.js +223 -0
  141. package/dist/stt/segment-splitter.d.ts +19 -0
  142. package/dist/stt/segment-splitter.js +37 -0
  143. package/dist/stt/subtitle-formatter.d.ts +19 -0
  144. package/dist/stt/subtitle-formatter.js +52 -0
  145. package/dist/stt/transcriber.d.ts +21 -0
  146. package/dist/stt/transcriber.js +220 -0
  147. package/dist/stt/types.d.ts +44 -0
  148. package/dist/stt/types.js +1 -0
  149. package/dist/sync/duration-negotiator.d.ts +37 -0
  150. package/dist/sync/duration-negotiator.js +70 -0
  151. package/dist/sync/index.d.ts +3 -0
  152. package/dist/sync/index.js +166 -0
  153. package/dist/sync/scene-adapter.d.ts +29 -0
  154. package/dist/sync/scene-adapter.js +53 -0
  155. package/dist/sync/timing-calculator.d.ts +21 -0
  156. package/dist/sync/timing-calculator.js +47 -0
  157. package/dist/tts/audio-assembler.d.ts +19 -0
  158. package/dist/tts/audio-assembler.js +121 -0
  159. package/dist/tts/emphasis-planner.d.ts +24 -0
  160. package/dist/tts/emphasis-planner.js +135 -0
  161. package/dist/tts/index.d.ts +6 -0
  162. package/dist/tts/index.js +440 -0
  163. package/dist/tts/pace-analyzer.d.ts +30 -0
  164. package/dist/tts/pace-analyzer.js +68 -0
  165. package/dist/tts/segment-synthesizer.d.ts +21 -0
  166. package/dist/tts/segment-synthesizer.js +37 -0
  167. package/dist/tts/types.d.ts +76 -0
  168. package/dist/tts/types.js +1 -0
  169. package/dist/tts/voice-synthesizer.d.ts +28 -0
  170. package/dist/tts/voice-synthesizer.js +436 -0
  171. package/dist/types.d.ts +12 -0
  172. package/dist/types.js +1 -0
  173. package/package.json +760 -0
@@ -0,0 +1,223 @@
1
// @bun
// src/stt/segment-splitter.ts
class SegmentSplitter {
  /** Default maximum chunk duration: 5 minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Split audio into chunks of at most `maxChunkMs` duration.
   *
   * Boundaries are proportional byte offsets (duration-weighted); silence is
   * not detected. Audio that already fits is returned as a single-element
   * array containing the original object.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const durationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    if (durationMs <= maxChunkMs) {
      return [audio];
    }
    // Guard against a zero duration so the ratio stays finite.
    const bytesPerMs = audio.data.length / Math.max(durationMs, 1);
    const pieces = [];
    for (let cursorMs = 0; cursorMs < durationMs; ) {
      const spanMs = Math.min(maxChunkMs, durationMs - cursorMs);
      const from = Math.floor(cursorMs * bytesPerMs);
      const to = Math.floor((cursorMs + spanMs) * bytesPerMs);
      pieces.push({
        data: audio.data.slice(from, to),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: spanMs,
        channels: audio.channels
      });
      cursorMs += spanMs;
    }
    return pieces;
  }

  // Fallback duration estimate when audio.durationMs is absent.
  // Assumes 2 bytes per sample (16-bit PCM) — TODO confirm for other formats.
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
35
+
36
// src/stt/diarization-mapper.ts
class DiarizationMapper {
  /**
   * Assign stable labels ("Speaker 1", "Speaker 2", …) to diarized segments
   * in order of each speaker's first appearance, and aggregate per-speaker
   * segment counts and total speaking time. Segments without a speakerId
   * pass through unchanged (no speakerLabel added).
   */
  map(segments, labelPrefix = "Speaker") {
    const order = [];
    const stats = new Map();
    // First pass: record speaker IDs in order of first appearance.
    for (const segment of segments) {
      const id = segment.speakerId;
      if (id && !order.includes(id)) {
        order.push(id);
        stats.set(id, { segmentCount: 0, totalSpeakingMs: 0 });
      }
    }
    // Second pass: attach labels and accumulate statistics.
    const labeled = segments.map((segment) => {
      if (!segment.speakerId) {
        return segment;
      }
      const label = `${labelPrefix} ${order.indexOf(segment.speakerId) + 1}`;
      const entry = stats.get(segment.speakerId);
      if (entry) {
        entry.segmentCount += 1;
        entry.totalSpeakingMs += segment.endMs - segment.startMs;
      }
      return { ...segment, speakerLabel: label };
    });
    const speakers = order.map((id, position) => ({
      id,
      label: `${labelPrefix} ${position + 1}`,
      segmentCount: stats.get(id)?.segmentCount ?? 0,
      totalSpeakingMs: stats.get(id)?.totalSpeakingMs ?? 0
    }));
    return { segments: labeled, speakers };
  }
}
75
+
76
// src/stt/subtitle-formatter.ts
class SubtitleFormatter {
  /**
   * Render segments as SubRip (SRT): 1-based numbered cues separated by a
   * blank line, with a "[Speaker] " prefix when speakerLabel is present.
   */
  toSRT(segments) {
    const cues = segments.map((seg, index) => {
      const voice = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
      const window = `${this.formatTimeSRT(seg.startMs)} --> ${this.formatTimeSRT(seg.endMs)}`;
      return `${index + 1}\n${window}\n${voice}${seg.text}`;
    });
    return cues.join("\n\n");
  }
  /**
   * Render segments as WebVTT: "WEBVTT" header, then numbered cues using a
   * <v Speaker> voice span when speakerLabel is present.
   */
  toVTT(segments) {
    const cues = segments.map((seg, index) => {
      const voice = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
      const window = `${this.formatTimeVTT(seg.startMs)} --> ${this.formatTimeVTT(seg.endMs)}`;
      return `${index + 1}\n${window}\n${voice}${seg.text}`;
    });
    return `WEBVTT\n\n${cues.join("\n\n")}`;
  }
  // HH:MM:SS,mmm — SRT separates milliseconds with a comma.
  formatTimeSRT(ms) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor((ms % 3600000) / 60000);
    const seconds = Math.floor((ms % 60000) / 1000);
    return [this.pad(hours, 2), this.pad(minutes, 2), this.pad(seconds, 2)].join(":") + "," + this.pad(ms % 1000, 3);
  }
  // HH:MM:SS.mmm — WebVTT separates milliseconds with a dot.
  formatTimeVTT(ms) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor((ms % 3600000) / 60000);
    const seconds = Math.floor((ms % 60000) / 1000);
    return [this.pad(hours, 2), this.pad(minutes, 2), this.pad(seconds, 2)].join(":") + "." + this.pad(ms % 1000, 3);
  }
  // Zero-pad to a fixed width. NOTE(review): assumes integer ms input —
  // fractional milliseconds would produce malformed timestamps; confirm upstream.
  pad(value, length) {
    return String(value).padStart(length, "0");
  }
}
124
+
125
// src/stt/transcriber.ts
class Transcriber {
  stt;
  segmentSplitter = new SegmentSplitter();
  diarizationMapper = new DiarizationMapper();
  subtitleFormatter = new SubtitleFormatter();

  constructor(options) {
    this.stt = options.stt;
  }

  /**
   * Batch transcription pipeline:
   *  1. chunk long audio via SegmentSplitter,
   *  2. transcribe each chunk through the provider,
   *  3. shift segment timestamps by the running chunk offset,
   *  4. optionally map speakers to labels and render subtitles.
   */
  async transcribe(brief) {
    const id = generateProjectId();
    const collected = [];
    let text = "";
    let durationMs = 0;
    let baseMs = 0;
    for (const chunk of this.segmentSplitter.split(brief.audio)) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints
      });
      for (const seg of result.segments) {
        collected.push({
          text: seg.text,
          startMs: seg.startMs + baseMs,
          endMs: seg.endMs + baseMs,
          speakerId: seg.speakerId,
          speakerName: seg.speakerName,
          confidence: seg.confidence
        });
      }
      text += (text ? " " : "") + result.text;
      durationMs += result.durationMs;
      // Offset relies on the chunk's own duration; 0 when unknown.
      baseMs += chunk.durationMs ?? 0;
    }
    let segments = collected;
    let speakers;
    if (brief.diarize) {
      ({ segments, speakers } = this.diarizationMapper.map(collected));
    }
    const transcript = {
      text,
      segments,
      language: brief.language ?? "en",
      durationMs
    };
    let subtitles;
    switch (brief.subtitleFormat ?? "none") {
      case "srt":
        subtitles = this.subtitleFormatter.toSRT(segments);
        break;
      case "vtt":
        subtitles = this.subtitleFormatter.toVTT(segments);
        break;
    }
    return { id, transcript, subtitles, speakers };
  }

  /**
   * Real-time streaming transcription. Yields provider segments mapped onto
   * the local segment shape (provider speakerName -> speakerLabel).
   * @throws Error when the provider has no transcribeStream implementation.
   */
  async* transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const upstream = this.stt.transcribeStream(audio, {
      language: options?.language,
      diarize: options?.diarize,
      speakerCount: options?.speakerCount,
      wordTimestamps: true,
      vocabularyHints: options?.vocabularyHints
    });
    for await (const seg of upstream) {
      yield {
        text: seg.text,
        startMs: seg.startMs,
        endMs: seg.endMs,
        speakerId: seg.speakerId,
        speakerLabel: seg.speakerName,
        confidence: seg.confidence
      };
    }
  }
}
213
// Compact, collision-unlikely id: base-36 timestamp plus 6 random base-36 chars.
function generateProjectId() {
  const now = Date.now().toString(36);
  const suffix = Math.random().toString(36).slice(2, 8);
  return `stt_${now}_${suffix}`;
}
218
+ export {
219
+ Transcriber,
220
+ SubtitleFormatter,
221
+ SegmentSplitter,
222
+ DiarizationMapper
223
+ };
@@ -0,0 +1,19 @@
1
+ import type { AudioData } from '../types';
2
+ /**
3
+ * Split long audio into processable chunks.
4
+ *
5
+ * Useful for providers with maximum audio length limits.
6
+  * Splits at proportional byte-position boundaries; silence is not detected.
7
+ */
8
+ export declare class SegmentSplitter {
9
+ /** Default maximum chunk duration in ms (5 minutes) */
10
+ private static readonly DEFAULT_MAX_CHUNK_MS;
11
+ /**
12
+ * Split audio into chunks of at most maxChunkMs duration.
13
+ *
14
+ * @param audio - Input audio data
15
+ * @param maxChunkMs - Maximum chunk duration in milliseconds
16
+ */
17
+ split(audio: AudioData, maxChunkMs?: number): AudioData[];
18
+ private estimateDurationMs;
19
+ }
@@ -0,0 +1,37 @@
1
// @bun
// src/stt/segment-splitter.ts
class SegmentSplitter {
  /** Default maximum chunk duration: 5 minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Split audio into chunks of at most `maxChunkMs` duration using
   * proportional byte offsets. Audio that already fits is returned as-is
   * inside a single-element array.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const durationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    if (durationMs <= maxChunkMs) {
      return [audio];
    }
    const bytesPerMs = audio.data.length / Math.max(durationMs, 1);
    const pieces = [];
    for (let cursorMs = 0; cursorMs < durationMs; ) {
      const spanMs = Math.min(maxChunkMs, durationMs - cursorMs);
      pieces.push({
        data: audio.data.slice(Math.floor(cursorMs * bytesPerMs), Math.floor((cursorMs + spanMs) * bytesPerMs)),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: spanMs,
        channels: audio.channels
      });
      cursorMs += spanMs;
    }
    return pieces;
  }

  // Fallback when audio.durationMs is missing.
  // Assumes 2 bytes per sample (16-bit PCM) — TODO confirm for other formats.
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
export {
  SegmentSplitter
};
@@ -0,0 +1,19 @@
1
+ import type { TranscriptionSegment } from './types';
2
+ /**
3
+ * Format transcription segments as SRT or VTT subtitles.
4
+ */
5
+ export declare class SubtitleFormatter {
6
+ /**
7
+ * Convert segments to SRT format.
8
+ */
9
+ toSRT(segments: TranscriptionSegment[]): string;
10
+ /**
11
+ * Convert segments to WebVTT format.
12
+ */
13
+ toVTT(segments: TranscriptionSegment[]): string;
14
+ /** Format ms as SRT timestamp: HH:MM:SS,mmm */
15
+ private formatTimeSRT;
16
+ /** Format ms as VTT timestamp: HH:MM:SS.mmm */
17
+ private formatTimeVTT;
18
+ private pad;
19
+ }
@@ -0,0 +1,52 @@
1
// @bun
// src/stt/subtitle-formatter.ts
class SubtitleFormatter {
  /** SubRip output: numbered cues separated by blank lines, "[Speaker] " prefix. */
  toSRT(segments) {
    return segments
      .map((seg, index) => {
        const voice = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
        return `${index + 1}\n${this.formatTimeSRT(seg.startMs)} --> ${this.formatTimeSRT(seg.endMs)}\n${voice}${seg.text}`;
      })
      .join("\n\n");
  }
  /** WebVTT output: "WEBVTT" header plus numbered cues with <v Speaker> spans. */
  toVTT(segments) {
    const body = segments
      .map((seg, index) => {
        const voice = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
        return `${index + 1}\n${this.formatTimeVTT(seg.startMs)} --> ${this.formatTimeVTT(seg.endMs)}\n${voice}${seg.text}`;
      })
      .join("\n\n");
    return `WEBVTT\n\n${body}`;
  }
  // HH:MM:SS,mmm (comma — SRT convention).
  formatTimeSRT(ms) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor((ms % 3600000) / 60000);
    const seconds = Math.floor((ms % 60000) / 1000);
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)},${this.pad(ms % 1000, 3)}`;
  }
  // HH:MM:SS.mmm (dot — WebVTT convention).
  formatTimeVTT(ms) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor((ms % 3600000) / 60000);
    const seconds = Math.floor((ms % 60000) / 1000);
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}.${this.pad(ms % 1000, 3)}`;
  }
  // Zero-pad to a fixed width; assumes integer input — TODO confirm upstream.
  pad(value, length) {
    return String(value).padStart(length, "0");
  }
}
export {
  SubtitleFormatter
};
@@ -0,0 +1,21 @@
1
+ import type { STTBrief, STTOptions, TranscriptionProject, TranscriptionSegment } from './types';
2
+ /**
3
+ * Main STT orchestrator.
4
+ *
5
+ * Pipeline:
6
+ * 1. Split audio into processable chunks (if needed)
7
+ * 2. Transcribe via STTProvider
8
+ * 3. Map speaker IDs to labels (if diarization enabled)
9
+ * 4. Format subtitles (if requested)
10
+ */
11
+ export declare class Transcriber {
12
+ private readonly stt;
13
+ private readonly segmentSplitter;
14
+ private readonly diarizationMapper;
15
+ private readonly subtitleFormatter;
16
+ constructor(options: STTOptions);
17
+ /** Transcribe audio to text */
18
+ transcribe(brief: STTBrief): Promise<TranscriptionProject>;
19
+ /** Stream transcription (real-time, if provider supports it) */
20
+ transcribeStream(audio: AsyncIterable<Uint8Array>, options?: Partial<STTBrief>): AsyncIterable<TranscriptionSegment>;
21
+ }
@@ -0,0 +1,220 @@
1
// @bun
// src/stt/segment-splitter.ts
class SegmentSplitter {
  /** Default maximum chunk duration: 5 minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Split audio into chunks of at most `maxChunkMs` duration using
   * proportional byte offsets (silence is not detected). Audio that already
   * fits is returned as-is inside a single-element array.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const durationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    if (durationMs <= maxChunkMs) {
      return [audio];
    }
    const bytesPerMs = audio.data.length / Math.max(durationMs, 1);
    const pieces = [];
    for (let cursorMs = 0; cursorMs < durationMs; ) {
      const spanMs = Math.min(maxChunkMs, durationMs - cursorMs);
      pieces.push({
        data: audio.data.slice(Math.floor(cursorMs * bytesPerMs), Math.floor((cursorMs + spanMs) * bytesPerMs)),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: spanMs,
        channels: audio.channels
      });
      cursorMs += spanMs;
    }
    return pieces;
  }

  // Fallback when audio.durationMs is missing.
  // Assumes 2 bytes per sample (16-bit PCM) — TODO confirm for other formats.
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}

// src/stt/diarization-mapper.ts
class DiarizationMapper {
  /**
   * Label diarized segments "Speaker N" in first-appearance order and
   * aggregate per-speaker segment counts and speaking time. Segments
   * without a speakerId pass through unchanged.
   */
  map(segments, labelPrefix = "Speaker") {
    const order = [];
    const stats = new Map();
    for (const segment of segments) {
      const id = segment.speakerId;
      if (id && !order.includes(id)) {
        order.push(id);
        stats.set(id, { segmentCount: 0, totalSpeakingMs: 0 });
      }
    }
    const labeled = segments.map((segment) => {
      if (!segment.speakerId) {
        return segment;
      }
      const label = `${labelPrefix} ${order.indexOf(segment.speakerId) + 1}`;
      const entry = stats.get(segment.speakerId);
      if (entry) {
        entry.segmentCount += 1;
        entry.totalSpeakingMs += segment.endMs - segment.startMs;
      }
      return { ...segment, speakerLabel: label };
    });
    const speakers = order.map((id, position) => ({
      id,
      label: `${labelPrefix} ${position + 1}`,
      segmentCount: stats.get(id)?.segmentCount ?? 0,
      totalSpeakingMs: stats.get(id)?.totalSpeakingMs ?? 0
    }));
    return { segments: labeled, speakers };
  }
}

// src/stt/subtitle-formatter.ts
class SubtitleFormatter {
  /** SubRip output: numbered cues separated by blank lines, "[Speaker] " prefix. */
  toSRT(segments) {
    return segments
      .map((seg, index) => {
        const voice = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
        return `${index + 1}\n${this.formatTimeSRT(seg.startMs)} --> ${this.formatTimeSRT(seg.endMs)}\n${voice}${seg.text}`;
      })
      .join("\n\n");
  }
  /** WebVTT output: "WEBVTT" header plus numbered cues with <v Speaker> spans. */
  toVTT(segments) {
    const body = segments
      .map((seg, index) => {
        const voice = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
        return `${index + 1}\n${this.formatTimeVTT(seg.startMs)} --> ${this.formatTimeVTT(seg.endMs)}\n${voice}${seg.text}`;
      })
      .join("\n\n");
    return `WEBVTT\n\n${body}`;
  }
  // HH:MM:SS,mmm (comma — SRT convention).
  formatTimeSRT(ms) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor((ms % 3600000) / 60000);
    const seconds = Math.floor((ms % 60000) / 1000);
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)},${this.pad(ms % 1000, 3)}`;
  }
  // HH:MM:SS.mmm (dot — WebVTT convention).
  formatTimeVTT(ms) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor((ms % 3600000) / 60000);
    const seconds = Math.floor((ms % 60000) / 1000);
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}.${this.pad(ms % 1000, 3)}`;
  }
  // Zero-pad to a fixed width; assumes integer input — TODO confirm upstream.
  pad(value, length) {
    return String(value).padStart(length, "0");
  }
}

// src/stt/transcriber.ts
class Transcriber {
  stt;
  segmentSplitter = new SegmentSplitter();
  diarizationMapper = new DiarizationMapper();
  subtitleFormatter = new SubtitleFormatter();

  constructor(options) {
    this.stt = options.stt;
  }

  /**
   * Batch pipeline: chunk audio, transcribe each chunk via the provider,
   * shift segment timestamps by the running chunk offset, then optionally
   * diarize and render subtitles.
   */
  async transcribe(brief) {
    const id = generateProjectId();
    const collected = [];
    let text = "";
    let durationMs = 0;
    let baseMs = 0;
    for (const chunk of this.segmentSplitter.split(brief.audio)) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints
      });
      for (const seg of result.segments) {
        collected.push({
          text: seg.text,
          startMs: seg.startMs + baseMs,
          endMs: seg.endMs + baseMs,
          speakerId: seg.speakerId,
          speakerName: seg.speakerName,
          confidence: seg.confidence
        });
      }
      text += (text ? " " : "") + result.text;
      durationMs += result.durationMs;
      baseMs += chunk.durationMs ?? 0;
    }
    let segments = collected;
    let speakers;
    if (brief.diarize) {
      ({ segments, speakers } = this.diarizationMapper.map(collected));
    }
    const transcript = {
      text,
      segments,
      language: brief.language ?? "en",
      durationMs
    };
    let subtitles;
    switch (brief.subtitleFormat ?? "none") {
      case "srt":
        subtitles = this.subtitleFormatter.toSRT(segments);
        break;
      case "vtt":
        subtitles = this.subtitleFormatter.toVTT(segments);
        break;
    }
    return { id, transcript, subtitles, speakers };
  }

  /**
   * Real-time streaming transcription (provider speakerName -> speakerLabel).
   * @throws Error when the provider has no transcribeStream implementation.
   */
  async* transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const upstream = this.stt.transcribeStream(audio, {
      language: options?.language,
      diarize: options?.diarize,
      speakerCount: options?.speakerCount,
      wordTimestamps: true,
      vocabularyHints: options?.vocabularyHints
    });
    for await (const seg of upstream) {
      yield {
        text: seg.text,
        startMs: seg.startMs,
        endMs: seg.endMs,
        speakerId: seg.speakerId,
        speakerLabel: seg.speakerName,
        confidence: seg.confidence
      };
    }
  }
}

// Compact, collision-unlikely id: base-36 timestamp plus 6 random base-36 chars.
function generateProjectId() {
  const now = Date.now().toString(36);
  const suffix = Math.random().toString(36).slice(2, 8);
  return `stt_${now}_${suffix}`;
}
export {
  Transcriber
};
@@ -0,0 +1,44 @@
1
+ import type { STTProvider, AudioData, VoiceOptions } from '../types';
2
+ export interface STTBrief {
3
+ audio: AudioData;
4
+ language?: string;
5
+ diarize?: boolean;
6
+ speakerCount?: number;
7
+ vocabularyHints?: string[];
8
+ /** Output subtitle format */
9
+ subtitleFormat?: 'srt' | 'vtt' | 'none';
10
+ }
11
+ export interface TranscriptionProject {
12
+ id: string;
13
+ transcript: TranscriptionResult;
14
+ subtitles?: string;
15
+ speakers?: SpeakerMap[];
16
+ }
17
+ export interface TranscriptionResult {
18
+ text: string;
19
+ segments: TranscriptionSegment[];
20
+ language: string;
21
+ durationMs: number;
22
+ wordTimings?: {
23
+ word: string;
24
+ startMs: number;
25
+ endMs: number;
26
+ }[];
27
+ }
28
+ export interface TranscriptionSegment {
29
+ text: string;
30
+ startMs: number;
31
+ endMs: number;
32
+ speakerId?: string;
33
+ speakerLabel?: string;
34
+ confidence?: number;
35
+ }
36
+ export interface SpeakerMap {
37
+ id: string;
38
+ label: string;
39
+ segmentCount: number;
40
+ totalSpeakingMs: number;
41
+ }
42
+ export interface STTOptions extends VoiceOptions {
43
+ stt: STTProvider;
44
+ }
@@ -0,0 +1 @@
1
+ // @bun
@@ -0,0 +1,37 @@
1
+ import type { VoiceTimingMap } from '../types';
2
+ interface NegotiationResult {
3
+ /** Updated timing map with negotiated durations */
4
+ timingMap: VoiceTimingMap;
5
+ /** Per-scene negotiation details */
6
+ adjustments: SceneAdjustment[];
7
+ }
8
+ interface SceneAdjustment {
9
+ sceneId: string;
10
+ originalSceneDurationInFrames: number;
11
+ voiceDurationInFrames: number;
12
+ action: 'no_change' | 'extend_scene' | 'pad_silence' | 'suggest_rate_change';
13
+ suggestedRate?: number;
14
+ finalSceneDurationInFrames: number;
15
+ }
16
+ /**
17
+ * Negotiate duration between voice audio and scene durations.
18
+ *
19
+ * One-pass duration balancing:
20
+ * - Voice fits scene -> no change
21
+ * - Voice > 110% of scene -> suggest rate increase (cap 1.3x), extend scene
22
+ * - Voice < 70% of scene -> suggest rate decrease (floor 0.8x), pad silence
23
+ */
24
+ export declare class DurationNegotiator {
25
+ private static readonly UPPER_THRESHOLD;
26
+ private static readonly LOWER_THRESHOLD;
27
+ private static readonly MAX_RATE;
28
+ private static readonly MIN_RATE;
29
+ /**
30
+ * Negotiate voice-vs-scene durations.
31
+ *
32
+ * @param timingMap - Voice timing map with per-segment durations
33
+ * @param sceneDurations - Map of sceneId -> original scene duration in frames
34
+ */
35
+ negotiate(timingMap: VoiceTimingMap, sceneDurations: Map<string, number>): NegotiationResult;
36
+ }
37
+ export {};