@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/stt/segment-splitter.ts
|
|
3
|
+
/**
 * Split long audio into chunks of at most `maxChunkMs` duration so providers
 * with maximum-audio-length limits can process each piece independently.
 *
 * Chunk boundaries are duration-proportional byte positions, aligned to whole
 * sample frames so a 16-bit sample is never cut in half across two chunks.
 * NOTE(review): frame size assumes 2 bytes/sample, matching
 * estimateDurationMs below — confirm if other raw formats are passed in.
 */
class SegmentSplitter {
  /** Default maximum chunk duration: 5 minutes. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;
  /**
   * Split `audio` into chunks no longer than `maxChunkMs`.
   * Returns the input untouched (single-element array) when it already fits.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    if (totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    const chunks = [];
    // Math.max guards against division by zero for a zero reported duration.
    const bytesPerMs = audio.data.length / Math.max(totalDurationMs, 1);
    // Align interior cut points to whole sample frames (2 bytes per sample
    // assumed) so samples are not split across chunk boundaries.
    const frameBytes = 2 * (audio.channels ?? 1);
    let offsetMs = 0;
    let startByte = 0;
    while (offsetMs < totalDurationMs) {
      const chunkDurationMs = Math.min(maxChunkMs, totalDurationMs - offsetMs);
      const isLastChunk = offsetMs + chunkDurationMs >= totalDurationMs;
      const rawEndByte = Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      // The final chunk always runs to the end of the buffer so no trailing
      // bytes are lost to rounding.
      const endByte = isLastChunk
        ? audio.data.length
        : Math.floor(rawEndByte / frameBytes) * frameBytes;
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels
      });
      // Next chunk starts exactly where this one ended: no gaps, no overlap.
      startByte = endByte;
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }
  /**
   * Estimate duration from byte length, assuming 16-bit (2-byte) samples.
   * Used only when the caller did not supply `durationMs`.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
|
|
35
|
+
|
|
36
|
+
// src/stt/diarization-mapper.ts
|
|
37
|
+
/**
 * Map raw provider speaker ids to human-readable labels ("Speaker 1", ...)
 * assigned in order of first appearance, and aggregate per-speaker stats
 * (segment count and total speaking time).
 */
class DiarizationMapper {
  map(segments, labelPrefix = "Speaker") {
    // First pass: register each speaker id in order of first appearance.
    const indexById = new Map();
    const statsById = new Map();
    for (const seg of segments) {
      const id = seg.speakerId;
      if (id && !indexById.has(id)) {
        indexById.set(id, indexById.size);
        statsById.set(id, { segmentCount: 0, totalSpeakingMs: 0 });
      }
    }
    // Second pass: attach labels and accumulate stats. Segments without a
    // speaker id pass through untouched.
    const labeledSegments = segments.map((seg) => {
      if (!seg.speakerId) {
        return seg;
      }
      const speakerIndex = indexById.get(seg.speakerId);
      const stats = statsById.get(seg.speakerId);
      if (stats) {
        stats.segmentCount += 1;
        stats.totalSpeakingMs += seg.endMs - seg.startMs;
      }
      return { ...seg, speakerLabel: `${labelPrefix} ${speakerIndex + 1}` };
    });
    // Summaries in first-appearance order, mirroring the label numbering.
    const speakers = [...indexById.keys()].map((id, position) => ({
      id,
      label: `${labelPrefix} ${position + 1}`,
      segmentCount: statsById.get(id)?.segmentCount ?? 0,
      totalSpeakingMs: statsById.get(id)?.totalSpeakingMs ?? 0
    }));
    return { segments: labeledSegments, speakers };
  }
}
|
|
75
|
+
|
|
76
|
+
// src/stt/subtitle-formatter.ts
|
|
77
|
+
/**
 * Render transcription segments as SRT or WebVTT subtitle documents.
 * Cues are numbered from 1; a speaker label is emitted as "[Label] " in SRT
 * and as a "<v Label>" voice tag in WebVTT.
 */
class SubtitleFormatter {
  /** Build a complete SRT document from ordered segments. */
  toSRT(segments) {
    const blocks = segments.map((seg, cueIndex) => {
      const speakerPrefix = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
      const timing = `${this.formatTimeSRT(seg.startMs)} --> ${this.formatTimeSRT(seg.endMs)}`;
      return [String(cueIndex + 1), timing, `${speakerPrefix}${seg.text}`].join("\n");
    });
    return blocks.join("\n\n");
  }
  /** Build a complete WebVTT document ("WEBVTT" header plus cues). */
  toVTT(segments) {
    const cues = segments.map((seg, cueIndex) => {
      const voiceTag = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
      const timing = `${this.formatTimeVTT(seg.startMs)} --> ${this.formatTimeVTT(seg.endMs)}`;
      return [String(cueIndex + 1), timing, `${voiceTag}${seg.text}`].join("\n");
    });
    return "WEBVTT\n\n" + cues.join("\n\n");
  }
  /** HH:MM:SS,mmm — SRT uses a comma before milliseconds. */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }
  /** HH:MM:SS.mmm — WebVTT uses a period before milliseconds. */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }
  /** Shared timestamp renderer; `separator` joins seconds and milliseconds. */
  formatTimestamp(ms, separator) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor(ms % 3600000 / 60000);
    const seconds = Math.floor(ms % 60000 / 1000);
    const millis = ms % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${separator}${this.pad(millis, 3)}`;
  }
  /** Left-pad a number with zeros to `length` digits. */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
|
|
124
|
+
|
|
125
|
+
// src/stt/transcriber.ts
|
|
126
|
+
/**
 * Main STT orchestrator: split -> transcribe per chunk -> map speakers ->
 * render subtitles. The provider is injected via `options.stt`.
 */
class Transcriber {
  // Injected speech-to-text provider: must implement transcribe(), and
  // optionally transcribeStream() for real-time use.
  stt;
  segmentSplitter = new SegmentSplitter;
  diarizationMapper = new DiarizationMapper;
  subtitleFormatter = new SubtitleFormatter;
  constructor(options) {
    this.stt = options.stt;
  }
  /**
   * Transcribe a full audio brief into a TranscriptionProject.
   * Chunks are transcribed sequentially; `offsetMs` shifts each chunk's
   * segment timestamps back into whole-recording time.
   */
  async transcribe(brief) {
    const projectId = generateProjectId();
    const chunks = this.segmentSplitter.split(brief.audio);
    const allSegments = [];
    let fullText = "";
    let totalDurationMs = 0;
    let offsetMs = 0;
    for (const chunk of chunks) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints
      });
      // Re-base chunk-relative timestamps onto the whole recording.
      const offsetSegments = result.segments.map((seg) => ({
        text: seg.text,
        startMs: seg.startMs + offsetMs,
        endMs: seg.endMs + offsetMs,
        speakerId: seg.speakerId,
        speakerName: seg.speakerName,
        confidence: seg.confidence
      }));
      allSegments.push(...offsetSegments);
      fullText += (fullText ? " " : "") + result.text;
      totalDurationMs += result.durationMs;
      // NOTE(review): offset advances by the chunk's *planned* duration and
      // falls back to 0 when unknown — verify chunks always carry durationMs.
      offsetMs += chunk.durationMs ?? 0;
    }
    let mappedSegments = allSegments;
    let speakers;
    // Speaker labels and per-speaker stats only exist when diarization was on.
    if (brief.diarize) {
      const mapping = this.diarizationMapper.map(allSegments);
      mappedSegments = mapping.segments;
      speakers = mapping.speakers;
    }
    const transcript = {
      text: fullText,
      segments: mappedSegments,
      language: brief.language ?? "en",
      durationMs: totalDurationMs
    };
    let subtitles;
    // Subtitles are rendered only when explicitly requested ("none" default).
    const format = brief.subtitleFormat ?? "none";
    if (format === "srt") {
      subtitles = this.subtitleFormatter.toSRT(mappedSegments);
    } else if (format === "vtt") {
      subtitles = this.subtitleFormatter.toVTT(mappedSegments);
    }
    return {
      id: projectId,
      transcript,
      subtitles,
      speakers
    };
  }
  /**
   * Real-time streaming transcription. Throws when the provider does not
   * implement transcribeStream. Provider segments are re-shaped so that
   * `speakerName` becomes the public `speakerLabel` field.
   */
  async* transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const stream = this.stt.transcribeStream(audio, {
      language: options?.language,
      diarize: options?.diarize,
      speakerCount: options?.speakerCount,
      wordTimestamps: true,
      vocabularyHints: options?.vocabularyHints
    });
    for await (const segment of stream) {
      yield {
        text: segment.text,
        startMs: segment.startMs,
        endMs: segment.endMs,
        speakerId: segment.speakerId,
        speakerLabel: segment.speakerName,
        confidence: segment.confidence
      };
    }
  }
}
|
|
213
|
+
/** Build a unique project id of the form `stt_<timestamp base36>_<random base36>`. */
function generateProjectId() {
  const parts = ["stt", Date.now().toString(36), Math.random().toString(36).slice(2, 8)];
  return parts.join("_");
}
|
|
218
|
+
export {
|
|
219
|
+
Transcriber,
|
|
220
|
+
SubtitleFormatter,
|
|
221
|
+
SegmentSplitter,
|
|
222
|
+
DiarizationMapper
|
|
223
|
+
};
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { AudioData } from '../types';
|
|
2
|
+
/**
|
|
3
|
+
* Split long audio into processable chunks.
|
|
4
|
+
*
|
|
5
|
+
* Useful for providers with maximum audio length limits.
|
|
6
|
+
* Splits at silence boundaries when possible (approximated by byte position).
|
|
7
|
+
*/
|
|
8
|
+
export declare class SegmentSplitter {
    /** Default maximum chunk duration in ms (5 minutes). */
    private static readonly DEFAULT_MAX_CHUNK_MS;
    /**
     * Split audio into chunks of at most maxChunkMs duration.
     *
     * Returns the input as a single-element array when it already fits.
     *
     * @param audio - Input audio data
     * @param maxChunkMs - Maximum chunk duration in milliseconds; defaults to DEFAULT_MAX_CHUNK_MS
     */
    split(audio: AudioData, maxChunkMs?: number): AudioData[];
    /** Estimate duration from byte length, assuming 16-bit samples. */
    private estimateDurationMs;
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/stt/segment-splitter.ts
|
|
3
|
+
/**
 * Split long audio into chunks of at most `maxChunkMs` duration so providers
 * with maximum-audio-length limits can process each piece independently.
 *
 * Chunk boundaries are duration-proportional byte positions, aligned to whole
 * sample frames so a 16-bit sample is never cut in half across two chunks.
 * NOTE(review): frame size assumes 2 bytes/sample, matching
 * estimateDurationMs below — confirm if other raw formats are passed in.
 */
class SegmentSplitter {
  /** Default maximum chunk duration: 5 minutes. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;
  /**
   * Split `audio` into chunks no longer than `maxChunkMs`.
   * Returns the input untouched (single-element array) when it already fits.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    if (totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    const chunks = [];
    // Math.max guards against division by zero for a zero reported duration.
    const bytesPerMs = audio.data.length / Math.max(totalDurationMs, 1);
    // Align interior cut points to whole sample frames (2 bytes per sample
    // assumed) so samples are not split across chunk boundaries.
    const frameBytes = 2 * (audio.channels ?? 1);
    let offsetMs = 0;
    let startByte = 0;
    while (offsetMs < totalDurationMs) {
      const chunkDurationMs = Math.min(maxChunkMs, totalDurationMs - offsetMs);
      const isLastChunk = offsetMs + chunkDurationMs >= totalDurationMs;
      const rawEndByte = Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      // The final chunk always runs to the end of the buffer so no trailing
      // bytes are lost to rounding.
      const endByte = isLastChunk
        ? audio.data.length
        : Math.floor(rawEndByte / frameBytes) * frameBytes;
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels
      });
      // Next chunk starts exactly where this one ended: no gaps, no overlap.
      startByte = endByte;
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }
  /**
   * Estimate duration from byte length, assuming 16-bit (2-byte) samples.
   * Used only when the caller did not supply `durationMs`.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
|
|
35
|
+
export {
|
|
36
|
+
SegmentSplitter
|
|
37
|
+
};
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { TranscriptionSegment } from './types';
|
|
2
|
+
/**
|
|
3
|
+
* Format transcription segments as SRT or VTT subtitles.
|
|
4
|
+
*/
|
|
5
|
+
export declare class SubtitleFormatter {
    /**
     * Convert segments to SRT format. Cues are numbered from 1; a speaker
     * label, when present, is prefixed to the text as "[Label] ".
     */
    toSRT(segments: TranscriptionSegment[]): string;
    /**
     * Convert segments to WebVTT format ("WEBVTT" header plus cues); a
     * speaker label, when present, is emitted as a "<v Label>" voice tag.
     */
    toVTT(segments: TranscriptionSegment[]): string;
    /** Format ms as SRT timestamp: HH:MM:SS,mmm */
    private formatTimeSRT;
    /** Format ms as VTT timestamp: HH:MM:SS.mmm */
    private formatTimeVTT;
    /** Zero-pad a number to a fixed digit width. */
    private pad;
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/stt/subtitle-formatter.ts
|
|
3
|
+
/**
 * Render transcription segments as SRT or WebVTT subtitle documents.
 * Cues are numbered from 1; a speaker label is emitted as "[Label] " in SRT
 * and as a "<v Label>" voice tag in WebVTT.
 */
class SubtitleFormatter {
  /** Build a complete SRT document from ordered segments. */
  toSRT(segments) {
    const blocks = segments.map((seg, cueIndex) => {
      const speakerPrefix = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
      const timing = `${this.formatTimeSRT(seg.startMs)} --> ${this.formatTimeSRT(seg.endMs)}`;
      return [String(cueIndex + 1), timing, `${speakerPrefix}${seg.text}`].join("\n");
    });
    return blocks.join("\n\n");
  }
  /** Build a complete WebVTT document ("WEBVTT" header plus cues). */
  toVTT(segments) {
    const cues = segments.map((seg, cueIndex) => {
      const voiceTag = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
      const timing = `${this.formatTimeVTT(seg.startMs)} --> ${this.formatTimeVTT(seg.endMs)}`;
      return [String(cueIndex + 1), timing, `${voiceTag}${seg.text}`].join("\n");
    });
    return "WEBVTT\n\n" + cues.join("\n\n");
  }
  /** HH:MM:SS,mmm — SRT uses a comma before milliseconds. */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }
  /** HH:MM:SS.mmm — WebVTT uses a period before milliseconds. */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }
  /** Shared timestamp renderer; `separator` joins seconds and milliseconds. */
  formatTimestamp(ms, separator) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor(ms % 3600000 / 60000);
    const seconds = Math.floor(ms % 60000 / 1000);
    const millis = ms % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${separator}${this.pad(millis, 3)}`;
  }
  /** Left-pad a number with zeros to `length` digits. */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
|
|
50
|
+
export {
|
|
51
|
+
SubtitleFormatter
|
|
52
|
+
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { STTBrief, STTOptions, TranscriptionProject, TranscriptionSegment } from './types';
|
|
2
|
+
/**
|
|
3
|
+
* Main STT orchestrator.
|
|
4
|
+
*
|
|
5
|
+
* Pipeline:
|
|
6
|
+
* 1. Split audio into processable chunks (if needed)
|
|
7
|
+
* 2. Transcribe via STTProvider
|
|
8
|
+
* 3. Map speaker IDs to labels (if diarization enabled)
|
|
9
|
+
* 4. Format subtitles (if requested)
|
|
10
|
+
*/
|
|
11
|
+
export declare class Transcriber {
    /** Injected speech-to-text provider. */
    private readonly stt;
    private readonly segmentSplitter;
    private readonly diarizationMapper;
    private readonly subtitleFormatter;
    constructor(options: STTOptions);
    /**
     * Transcribe audio to text.
     *
     * Splits the audio into chunks, transcribes each sequentially (shifting
     * segment timestamps into whole-recording time), optionally maps speaker
     * ids to labels, and renders subtitles per `brief.subtitleFormat`.
     */
    transcribe(brief: STTBrief): Promise<TranscriptionProject>;
    /**
     * Stream transcription (real-time, if provider supports it).
     * Throws when the provider does not implement streaming.
     */
    transcribeStream(audio: AsyncIterable<Uint8Array>, options?: Partial<STTBrief>): AsyncIterable<TranscriptionSegment>;
}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
// @bun
|
|
2
|
+
// src/stt/segment-splitter.ts
|
|
3
|
+
/**
 * Split long audio into chunks of at most `maxChunkMs` duration so providers
 * with maximum-audio-length limits can process each piece independently.
 *
 * Chunk boundaries are duration-proportional byte positions, aligned to whole
 * sample frames so a 16-bit sample is never cut in half across two chunks.
 * NOTE(review): frame size assumes 2 bytes/sample, matching
 * estimateDurationMs below — confirm if other raw formats are passed in.
 */
class SegmentSplitter {
  /** Default maximum chunk duration: 5 minutes. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;
  /**
   * Split `audio` into chunks no longer than `maxChunkMs`.
   * Returns the input untouched (single-element array) when it already fits.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    if (totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    const chunks = [];
    // Math.max guards against division by zero for a zero reported duration.
    const bytesPerMs = audio.data.length / Math.max(totalDurationMs, 1);
    // Align interior cut points to whole sample frames (2 bytes per sample
    // assumed) so samples are not split across chunk boundaries.
    const frameBytes = 2 * (audio.channels ?? 1);
    let offsetMs = 0;
    let startByte = 0;
    while (offsetMs < totalDurationMs) {
      const chunkDurationMs = Math.min(maxChunkMs, totalDurationMs - offsetMs);
      const isLastChunk = offsetMs + chunkDurationMs >= totalDurationMs;
      const rawEndByte = Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      // The final chunk always runs to the end of the buffer so no trailing
      // bytes are lost to rounding.
      const endByte = isLastChunk
        ? audio.data.length
        : Math.floor(rawEndByte / frameBytes) * frameBytes;
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels
      });
      // Next chunk starts exactly where this one ended: no gaps, no overlap.
      startByte = endByte;
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }
  /**
   * Estimate duration from byte length, assuming 16-bit (2-byte) samples.
   * Used only when the caller did not supply `durationMs`.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
|
|
35
|
+
|
|
36
|
+
// src/stt/diarization-mapper.ts
|
|
37
|
+
/**
 * Map raw provider speaker ids to human-readable labels ("Speaker 1", ...)
 * assigned in order of first appearance, and aggregate per-speaker stats
 * (segment count and total speaking time).
 */
class DiarizationMapper {
  map(segments, labelPrefix = "Speaker") {
    // First pass: register each speaker id in order of first appearance.
    const indexById = new Map();
    const statsById = new Map();
    for (const seg of segments) {
      const id = seg.speakerId;
      if (id && !indexById.has(id)) {
        indexById.set(id, indexById.size);
        statsById.set(id, { segmentCount: 0, totalSpeakingMs: 0 });
      }
    }
    // Second pass: attach labels and accumulate stats. Segments without a
    // speaker id pass through untouched.
    const labeledSegments = segments.map((seg) => {
      if (!seg.speakerId) {
        return seg;
      }
      const speakerIndex = indexById.get(seg.speakerId);
      const stats = statsById.get(seg.speakerId);
      if (stats) {
        stats.segmentCount += 1;
        stats.totalSpeakingMs += seg.endMs - seg.startMs;
      }
      return { ...seg, speakerLabel: `${labelPrefix} ${speakerIndex + 1}` };
    });
    // Summaries in first-appearance order, mirroring the label numbering.
    const speakers = [...indexById.keys()].map((id, position) => ({
      id,
      label: `${labelPrefix} ${position + 1}`,
      segmentCount: statsById.get(id)?.segmentCount ?? 0,
      totalSpeakingMs: statsById.get(id)?.totalSpeakingMs ?? 0
    }));
    return { segments: labeledSegments, speakers };
  }
}
|
|
75
|
+
|
|
76
|
+
// src/stt/subtitle-formatter.ts
|
|
77
|
+
/**
 * Render transcription segments as SRT or WebVTT subtitle documents.
 * Cues are numbered from 1; a speaker label is emitted as "[Label] " in SRT
 * and as a "<v Label>" voice tag in WebVTT.
 */
class SubtitleFormatter {
  /** Build a complete SRT document from ordered segments. */
  toSRT(segments) {
    const blocks = segments.map((seg, cueIndex) => {
      const speakerPrefix = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
      const timing = `${this.formatTimeSRT(seg.startMs)} --> ${this.formatTimeSRT(seg.endMs)}`;
      return [String(cueIndex + 1), timing, `${speakerPrefix}${seg.text}`].join("\n");
    });
    return blocks.join("\n\n");
  }
  /** Build a complete WebVTT document ("WEBVTT" header plus cues). */
  toVTT(segments) {
    const cues = segments.map((seg, cueIndex) => {
      const voiceTag = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
      const timing = `${this.formatTimeVTT(seg.startMs)} --> ${this.formatTimeVTT(seg.endMs)}`;
      return [String(cueIndex + 1), timing, `${voiceTag}${seg.text}`].join("\n");
    });
    return "WEBVTT\n\n" + cues.join("\n\n");
  }
  /** HH:MM:SS,mmm — SRT uses a comma before milliseconds. */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }
  /** HH:MM:SS.mmm — WebVTT uses a period before milliseconds. */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }
  /** Shared timestamp renderer; `separator` joins seconds and milliseconds. */
  formatTimestamp(ms, separator) {
    const hours = Math.floor(ms / 3600000);
    const minutes = Math.floor(ms % 3600000 / 60000);
    const seconds = Math.floor(ms % 60000 / 1000);
    const millis = ms % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${separator}${this.pad(millis, 3)}`;
  }
  /** Left-pad a number with zeros to `length` digits. */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
|
|
124
|
+
|
|
125
|
+
// src/stt/transcriber.ts
|
|
126
|
+
/**
 * Main STT orchestrator: split -> transcribe per chunk -> map speakers ->
 * render subtitles. The provider is injected via `options.stt`.
 */
class Transcriber {
  // Injected speech-to-text provider: must implement transcribe(), and
  // optionally transcribeStream() for real-time use.
  stt;
  segmentSplitter = new SegmentSplitter;
  diarizationMapper = new DiarizationMapper;
  subtitleFormatter = new SubtitleFormatter;
  constructor(options) {
    this.stt = options.stt;
  }
  /**
   * Transcribe a full audio brief into a TranscriptionProject.
   * Chunks are transcribed sequentially; `offsetMs` shifts each chunk's
   * segment timestamps back into whole-recording time.
   */
  async transcribe(brief) {
    const projectId = generateProjectId();
    const chunks = this.segmentSplitter.split(brief.audio);
    const allSegments = [];
    let fullText = "";
    let totalDurationMs = 0;
    let offsetMs = 0;
    for (const chunk of chunks) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints
      });
      // Re-base chunk-relative timestamps onto the whole recording.
      const offsetSegments = result.segments.map((seg) => ({
        text: seg.text,
        startMs: seg.startMs + offsetMs,
        endMs: seg.endMs + offsetMs,
        speakerId: seg.speakerId,
        speakerName: seg.speakerName,
        confidence: seg.confidence
      }));
      allSegments.push(...offsetSegments);
      fullText += (fullText ? " " : "") + result.text;
      totalDurationMs += result.durationMs;
      // NOTE(review): offset advances by the chunk's *planned* duration and
      // falls back to 0 when unknown — verify chunks always carry durationMs.
      offsetMs += chunk.durationMs ?? 0;
    }
    let mappedSegments = allSegments;
    let speakers;
    // Speaker labels and per-speaker stats only exist when diarization was on.
    if (brief.diarize) {
      const mapping = this.diarizationMapper.map(allSegments);
      mappedSegments = mapping.segments;
      speakers = mapping.speakers;
    }
    const transcript = {
      text: fullText,
      segments: mappedSegments,
      language: brief.language ?? "en",
      durationMs: totalDurationMs
    };
    let subtitles;
    // Subtitles are rendered only when explicitly requested ("none" default).
    const format = brief.subtitleFormat ?? "none";
    if (format === "srt") {
      subtitles = this.subtitleFormatter.toSRT(mappedSegments);
    } else if (format === "vtt") {
      subtitles = this.subtitleFormatter.toVTT(mappedSegments);
    }
    return {
      id: projectId,
      transcript,
      subtitles,
      speakers
    };
  }
  /**
   * Real-time streaming transcription. Throws when the provider does not
   * implement transcribeStream. Provider segments are re-shaped so that
   * `speakerName` becomes the public `speakerLabel` field.
   */
  async* transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const stream = this.stt.transcribeStream(audio, {
      language: options?.language,
      diarize: options?.diarize,
      speakerCount: options?.speakerCount,
      wordTimestamps: true,
      vocabularyHints: options?.vocabularyHints
    });
    for await (const segment of stream) {
      yield {
        text: segment.text,
        startMs: segment.startMs,
        endMs: segment.endMs,
        speakerId: segment.speakerId,
        speakerLabel: segment.speakerName,
        confidence: segment.confidence
      };
    }
  }
}
|
|
213
|
+
/** Build a unique project id of the form `stt_<timestamp base36>_<random base36>`. */
function generateProjectId() {
  const parts = ["stt", Date.now().toString(36), Math.random().toString(36).slice(2, 8)];
  return parts.join("_");
}
|
|
218
|
+
export {
|
|
219
|
+
Transcriber
|
|
220
|
+
};
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import type { STTProvider, AudioData, VoiceOptions } from '../types';
|
|
2
|
+
export interface STTBrief {
    /** Audio to transcribe. */
    audio: AudioData;
    /** Language hint; transcript language defaults to "en" when omitted. */
    language?: string;
    /** Enable speaker diarization (adds speaker labels and a speaker map). */
    diarize?: boolean;
    /** Expected speaker count, forwarded to the provider as a hint. */
    speakerCount?: number;
    /** Domain-specific vocabulary hints forwarded to the provider. */
    vocabularyHints?: string[];
    /** Output subtitle format */
    subtitleFormat?: 'srt' | 'vtt' | 'none';
}
/** Result of a full transcription run. */
export interface TranscriptionProject {
    /** Unique id of the form `stt_<timestamp>_<random>`. */
    id: string;
    transcript: TranscriptionResult;
    /** Rendered SRT/VTT document; present only when requested. */
    subtitles?: string;
    /** Per-speaker summaries; present only when diarization was enabled. */
    speakers?: SpeakerMap[];
}
export interface TranscriptionResult {
    /** Full transcript text (per-chunk texts joined with spaces). */
    text: string;
    segments: TranscriptionSegment[];
    language: string;
    durationMs: number;
    /** Optional per-word timings. */
    wordTimings?: {
        word: string;
        startMs: number;
        endMs: number;
    }[];
}
export interface TranscriptionSegment {
    text: string;
    /** Segment start, in whole-recording milliseconds. */
    startMs: number;
    /** Segment end, in whole-recording milliseconds. */
    endMs: number;
    /** Raw provider speaker id. */
    speakerId?: string;
    /** Human-readable label, e.g. "Speaker 1". */
    speakerLabel?: string;
    confidence?: number;
}
/** Aggregated per-speaker stats produced by diarization mapping. */
export interface SpeakerMap {
    id: string;
    label: string;
    segmentCount: number;
    totalSpeakingMs: number;
}
export interface STTOptions extends VoiceOptions {
    /** Speech-to-text provider implementation. */
    stt: STTProvider;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// @bun
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { VoiceTimingMap } from '../types';
|
|
2
|
+
interface NegotiationResult {
    /** Updated timing map with negotiated durations */
    timingMap: VoiceTimingMap;
    /** Per-scene negotiation details */
    adjustments: SceneAdjustment[];
}
/** Outcome of negotiating one scene against its voice track. */
interface SceneAdjustment {
    sceneId: string;
    originalSceneDurationInFrames: number;
    voiceDurationInFrames: number;
    /** Action taken to reconcile voice and scene durations. */
    action: 'no_change' | 'extend_scene' | 'pad_silence' | 'suggest_rate_change';
    /** Suggested speech rate; only set when a rate change is proposed. */
    suggestedRate?: number;
    finalSceneDurationInFrames: number;
}
/**
 * Negotiate duration between voice audio and scene durations.
 *
 * One-pass duration balancing:
 * - Voice fits scene -> no change
 * - Voice > 110% of scene -> suggest rate increase (cap 1.3x), extend scene
 * - Voice < 70% of scene -> suggest rate decrease (floor 0.8x), pad silence
 */
export declare class DurationNegotiator {
    private static readonly UPPER_THRESHOLD;
    private static readonly LOWER_THRESHOLD;
    private static readonly MAX_RATE;
    private static readonly MIN_RATE;
    /**
     * Negotiate voice-vs-scene durations.
     *
     * @param timingMap - Voice timing map with per-segment durations
     * @param sceneDurations - Map of sceneId -> original scene duration in frames
     */
    negotiate(timingMap: VoiceTimingMap, sceneDurations: Map<string, number>): NegotiationResult;
}
export {};
|