@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
// src/stt/diarization-mapper.ts
|
|
2
|
+
/**
 * Assigns sequential, human-readable labels to diarized speakers and
 * aggregates per-speaker statistics.
 */
class DiarizationMapper {
  /**
   * Labels every segment that carries a `speakerId` and summarizes each speaker.
   *
   * Speakers are numbered from 1 in order of first appearance. Segments whose
   * `speakerId` is falsy are returned as-is (same object) and are not counted.
   *
   * @param {Array<object>} segments - Segments exposing `speakerId`, `startMs` and `endMs`.
   * @param {string} [labelPrefix="Speaker"] - Text placed before the speaker number.
   * @returns {{segments: Array<object>, speakers: Array<object>}} Shallow copies of the
   *   labeled segments with `speakerLabel` added, plus one
   *   `{ id, label, segmentCount, totalSpeakingMs }` entry per speaker.
   */
  map(segments, labelPrefix = "Speaker") {
    // A Map keeps insertion order, so it serves both as the first-appearance
    // ordering and as an O(1) lookup (instead of scanning an array per segment).
    const speakersById = new Map();
    const labeledSegments = [];
    for (const seg of segments) {
      if (!seg.speakerId) {
        labeledSegments.push(seg);
        continue;
      }
      let speaker = speakersById.get(seg.speakerId);
      if (speaker === undefined) {
        speaker = {
          id: seg.speakerId,
          label: `${labelPrefix} ${speakersById.size + 1}`,
          segmentCount: 0,
          totalSpeakingMs: 0,
        };
        speakersById.set(seg.speakerId, speaker);
      }
      speaker.segmentCount += 1;
      speaker.totalSpeakingMs += seg.endMs - seg.startMs;
      labeledSegments.push({ ...seg, speakerLabel: speaker.label });
    }
    return { segments: labeledSegments, speakers: [...speakersById.values()] };
  }
}
|
|
40
|
+
export {
|
|
41
|
+
DiarizationMapper
|
|
42
|
+
};
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
// src/stt/segment-splitter.ts
|
|
2
|
+
/**
 * Cuts an audio payload into consecutive chunks of bounded duration.
 */
class SegmentSplitter {
  /** Default upper bound for a single chunk: five minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Splits `audio` into chunks no longer than `maxChunkMs`.
   *
   * Byte offsets are derived proportionally from the duration.
   * NOTE(review): this presumably assumes a constant byte rate (e.g. raw PCM);
   * confirm behavior for compressed formats.
   *
   * @param {object} audio - Payload with `data` (exposing `length` and `slice`),
   *   `format`, `sampleRateHz`, and optional `durationMs` / `channels`.
   * @param {number} [maxChunkMs=SegmentSplitter.DEFAULT_MAX_CHUNK_MS] - Maximum chunk duration.
   * @returns {Array<object>} `[audio]` itself when no split is needed or the
   *   duration cannot be determined; otherwise new chunk objects.
   * @throws {RangeError} When a split is required but `maxChunkMs` is not a positive number.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    // A NaN/Infinity duration (e.g. missing or zero sampleRateHz) cannot drive
    // the offset math below, so the audio is handed back whole instead of
    // yielding zero chunks or looping forever.
    if (!Number.isFinite(totalDurationMs) || totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    // A zero, negative or NaN limit would never advance the offset.
    if (!(maxChunkMs > 0)) {
      throw new RangeError(`maxChunkMs must be a positive number, received ${maxChunkMs}`);
    }
    const totalBytes = audio.data.length;
    const bytesPerMs = totalBytes / Math.max(totalDurationMs, 1);
    const chunks = [];
    let offsetMs = 0;
    while (offsetMs < totalDurationMs) {
      const remainingMs = totalDurationMs - offsetMs;
      const isLastChunk = remainingMs <= maxChunkMs;
      const chunkDurationMs = isLastChunk ? remainingMs : maxChunkMs;
      const startByte = Math.floor(offsetMs * bytesPerMs);
      // The final chunk always runs to the end of the buffer so floating-point
      // rounding can never drop trailing bytes.
      const endByte = isLastChunk ? totalBytes : Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels,
      });
      if (isLastChunk) {
        break;
      }
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }

  /**
   * Estimates the duration from the byte length, using 2 bytes per sample and
   * `audio.channels` (default 1) channels at `audio.sampleRateHz`.
   *
   * @param {object} audio - Payload with `data.length`, `sampleRateHz` and optional `channels`.
   * @returns {number} Estimated duration in milliseconds, rounded up. NaN or
   *   Infinity when `sampleRateHz` is missing or zero.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
|
|
34
|
+
|
|
35
|
+
// src/stt/diarization-mapper.ts
|
|
36
|
+
/**
 * Assigns sequential, human-readable labels to diarized speakers and
 * aggregates per-speaker statistics.
 */
class DiarizationMapper {
  /**
   * Labels every segment that carries a `speakerId` and summarizes each speaker.
   *
   * Speakers are numbered from 1 in order of first appearance. Segments whose
   * `speakerId` is falsy are returned as-is (same object) and are not counted.
   *
   * @param {Array<object>} segments - Segments exposing `speakerId`, `startMs` and `endMs`.
   * @param {string} [labelPrefix="Speaker"] - Text placed before the speaker number.
   * @returns {{segments: Array<object>, speakers: Array<object>}} Shallow copies of the
   *   labeled segments with `speakerLabel` added, plus one
   *   `{ id, label, segmentCount, totalSpeakingMs }` entry per speaker.
   */
  map(segments, labelPrefix = "Speaker") {
    // A Map keeps insertion order, so it serves both as the first-appearance
    // ordering and as an O(1) lookup (instead of scanning an array per segment).
    const speakersById = new Map();
    const labeledSegments = [];
    for (const seg of segments) {
      if (!seg.speakerId) {
        labeledSegments.push(seg);
        continue;
      }
      let speaker = speakersById.get(seg.speakerId);
      if (speaker === undefined) {
        speaker = {
          id: seg.speakerId,
          label: `${labelPrefix} ${speakersById.size + 1}`,
          segmentCount: 0,
          totalSpeakingMs: 0,
        };
        speakersById.set(seg.speakerId, speaker);
      }
      speaker.segmentCount += 1;
      speaker.totalSpeakingMs += seg.endMs - seg.startMs;
      labeledSegments.push({ ...seg, speakerLabel: speaker.label });
    }
    return { segments: labeledSegments, speakers: [...speakersById.values()] };
  }
}
|
|
74
|
+
|
|
75
|
+
// src/stt/subtitle-formatter.ts
|
|
76
|
+
/**
 * Renders transcript segments as SRT or WebVTT subtitle text.
 */
class SubtitleFormatter {
  /**
   * Builds an SRT document: numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments with `startMs`, `endMs`, `text`
   *   and optional `speakerLabel` (rendered as a `[label] ` prefix).
   * @returns {string} SRT text without a trailing newline.
   */
  toSRT(segments) {
    return segments.map((seg, i) => {
      const start = this.formatTimeSRT(seg.startMs);
      const end = this.formatTimeSRT(seg.endMs);
      const label = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
      return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
    }).join("\n\n");
  }

  /**
   * Builds a WebVTT document: the `WEBVTT` header followed by numbered cues.
   *
   * @param {Array<object>} segments - Segments with `startMs`, `endMs`, `text`
   *   and optional `speakerLabel` (rendered as a `<v label>` voice tag).
   * @returns {string} WebVTT text without a trailing newline.
   */
  toVTT(segments) {
    const header = "WEBVTT\n\n";
    const cues = segments.map((seg, i) => {
      const start = this.formatTimeVTT(seg.startMs);
      const end = this.formatTimeVTT(seg.endMs);
      const label = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
      return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
    }).join("\n\n");
    return header + cues;
  }

  /**
   * Formats milliseconds as an SRT timestamp (`HH:MM:SS,mmm`).
   *
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }

  /**
   * Formats milliseconds as a WebVTT timestamp (`HH:MM:SS.mmm`).
   *
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }

  /**
   * Shared timestamp renderer used by both subtitle formats.
   *
   * The input is rounded to whole milliseconds and clamped to zero so that
   * fractional, negative or non-finite offsets cannot produce malformed
   * fields such as `500.5` or `-1`.
   *
   * @param {number} ms - Offset in milliseconds.
   * @param {string} fractionSeparator - Character placed before the millisecond field.
   * @returns {string} `HH:MM:SS<sep>mmm`.
   */
  formatTimestamp(ms, fractionSeparator) {
    const numericMs = Number(ms);
    const totalMs = Number.isFinite(numericMs) ? Math.max(0, Math.round(numericMs)) : 0;
    const hours = Math.floor(totalMs / 3600000);
    const minutes = Math.floor(totalMs % 3600000 / 60000);
    const seconds = Math.floor(totalMs % 60000 / 1000);
    const millis = totalMs % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${fractionSeparator}${this.pad(millis, 3)}`;
  }

  /**
   * Left-pads a value with zeros.
   *
   * @param {number} value - Value to render.
   * @param {number} length - Minimum width of the result.
   * @returns {string} The zero-padded string.
   */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
|
|
123
|
+
|
|
124
|
+
// src/stt/transcriber.ts
|
|
125
|
+
/**
 * Orchestrates speech-to-text: chunking, provider calls, optional speaker
 * labeling and optional subtitle rendering.
 */
class Transcriber {
  stt;
  segmentSplitter = new SegmentSplitter();
  diarizationMapper = new DiarizationMapper();
  subtitleFormatter = new SubtitleFormatter();

  /**
   * @param {object} options - Must carry the STT provider as `options.stt`.
   */
  constructor(options) {
    const { stt } = options;
    this.stt = stt;
  }

  /**
   * Transcribes `brief.audio` chunk by chunk (sequentially) and merges the results.
   *
   * @param {object} brief - Reads `audio`, `language`, `diarize`, `speakerCount`,
   *   `vocabularyHints` and `subtitleFormat`.
   * @returns {Promise<object>} `{ id, transcript, subtitles, speakers }`;
   *   `subtitles` is set only for the `"srt"` / `"vtt"` formats and `speakers`
   *   only when `brief.diarize` is truthy.
   */
  async transcribe(brief) {
    const id = generateProjectId();
    const collected = [];
    let transcriptText = "";
    let summedDurationMs = 0;
    // Start time of the current chunk within the full recording.
    let chunkStartMs = 0;

    for (const chunk of this.segmentSplitter.split(brief.audio)) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints,
      });
      // Provider timestamps are chunk-relative; shift them onto the global timeline.
      for (const seg of result.segments) {
        collected.push({
          text: seg.text,
          startMs: seg.startMs + chunkStartMs,
          endMs: seg.endMs + chunkStartMs,
          speakerId: seg.speakerId,
          speakerName: seg.speakerName,
          confidence: seg.confidence,
        });
      }
      transcriptText += (transcriptText === "" ? "" : " ") + result.text;
      summedDurationMs += result.durationMs;
      chunkStartMs += chunk.durationMs ?? 0;
    }

    let finalSegments = collected;
    let speakers;
    if (brief.diarize) {
      ({ segments: finalSegments, speakers } = this.diarizationMapper.map(collected));
    }

    let subtitles;
    switch (brief.subtitleFormat ?? "none") {
      case "srt":
        subtitles = this.subtitleFormatter.toSRT(finalSegments);
        break;
      case "vtt":
        subtitles = this.subtitleFormatter.toVTT(finalSegments);
        break;
      default:
        break;
    }

    return {
      id,
      transcript: {
        text: transcriptText,
        segments: finalSegments,
        language: brief.language ?? "en",
        durationMs: summedDurationMs,
      },
      subtitles,
      speakers,
    };
  }

  /**
   * Streams segments from the provider, mapping `speakerName` to `speakerLabel`.
   *
   * @param {*} audio - Passed through to the provider's `transcribeStream`.
   * @param {object} [options] - Optional `language`, `diarize`, `speakerCount`, `vocabularyHints`.
   * @yields {object} `{ text, startMs, endMs, speakerId, speakerLabel, confidence }`.
   * @throws {Error} When the provider has no `transcribeStream` method.
   */
  async* transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const { language, diarize, speakerCount, vocabularyHints } = options ?? {};
    const providerStream = this.stt.transcribeStream(audio, {
      language,
      diarize,
      speakerCount,
      wordTimestamps: true,
      vocabularyHints,
    });
    for await (const { text, startMs, endMs, speakerId, speakerName, confidence } of providerStream) {
      yield { text, startMs, endMs, speakerId, speakerLabel: speakerName, confidence };
    }
  }
}
|
|
212
|
+
/**
 * Creates an identifier of the form `stt_<timestamp>_<random>`.
 *
 * The timestamp is `Date.now()` in base 36; the random part is always exactly
 * six base-36 characters. `Math.random()` is not cryptographically secure, so
 * the result is an identifier, not a secret.
 *
 * @returns {string} The generated identifier.
 */
function generateProjectId() {
  const timestamp = Date.now().toString(36);
  // toString(36) can yield fewer than six fractional digits (0.5 -> "0.i"),
  // so pad to keep the suffix length fixed and never empty.
  const random = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `stt_${timestamp}_${random}`;
}
|
|
217
|
+
export {
|
|
218
|
+
Transcriber,
|
|
219
|
+
SubtitleFormatter,
|
|
220
|
+
SegmentSplitter,
|
|
221
|
+
DiarizationMapper
|
|
222
|
+
};
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// src/stt/segment-splitter.ts
|
|
2
|
+
/**
 * Cuts an audio payload into consecutive chunks of bounded duration.
 */
class SegmentSplitter {
  /** Default upper bound for a single chunk: five minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Splits `audio` into chunks no longer than `maxChunkMs`.
   *
   * Byte offsets are derived proportionally from the duration.
   * NOTE(review): this presumably assumes a constant byte rate (e.g. raw PCM);
   * confirm behavior for compressed formats.
   *
   * @param {object} audio - Payload with `data` (exposing `length` and `slice`),
   *   `format`, `sampleRateHz`, and optional `durationMs` / `channels`.
   * @param {number} [maxChunkMs=SegmentSplitter.DEFAULT_MAX_CHUNK_MS] - Maximum chunk duration.
   * @returns {Array<object>} `[audio]` itself when no split is needed or the
   *   duration cannot be determined; otherwise new chunk objects.
   * @throws {RangeError} When a split is required but `maxChunkMs` is not a positive number.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    // A NaN/Infinity duration (e.g. missing or zero sampleRateHz) cannot drive
    // the offset math below, so the audio is handed back whole instead of
    // yielding zero chunks or looping forever.
    if (!Number.isFinite(totalDurationMs) || totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    // A zero, negative or NaN limit would never advance the offset.
    if (!(maxChunkMs > 0)) {
      throw new RangeError(`maxChunkMs must be a positive number, received ${maxChunkMs}`);
    }
    const totalBytes = audio.data.length;
    const bytesPerMs = totalBytes / Math.max(totalDurationMs, 1);
    const chunks = [];
    let offsetMs = 0;
    while (offsetMs < totalDurationMs) {
      const remainingMs = totalDurationMs - offsetMs;
      const isLastChunk = remainingMs <= maxChunkMs;
      const chunkDurationMs = isLastChunk ? remainingMs : maxChunkMs;
      const startByte = Math.floor(offsetMs * bytesPerMs);
      // The final chunk always runs to the end of the buffer so floating-point
      // rounding can never drop trailing bytes.
      const endByte = isLastChunk ? totalBytes : Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels,
      });
      if (isLastChunk) {
        break;
      }
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }

  /**
   * Estimates the duration from the byte length, using 2 bytes per sample and
   * `audio.channels` (default 1) channels at `audio.sampleRateHz`.
   *
   * @param {object} audio - Payload with `data.length`, `sampleRateHz` and optional `channels`.
   * @returns {number} Estimated duration in milliseconds, rounded up. NaN or
   *   Infinity when `sampleRateHz` is missing or zero.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
|
|
34
|
+
export {
|
|
35
|
+
SegmentSplitter
|
|
36
|
+
};
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
// src/stt/subtitle-formatter.ts
|
|
2
|
+
/**
 * Renders transcript segments as SRT or WebVTT subtitle text.
 */
class SubtitleFormatter {
  /**
   * Builds an SRT document: numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments with `startMs`, `endMs`, `text`
   *   and optional `speakerLabel` (rendered as a `[label] ` prefix).
   * @returns {string} SRT text without a trailing newline.
   */
  toSRT(segments) {
    return segments.map((seg, i) => {
      const start = this.formatTimeSRT(seg.startMs);
      const end = this.formatTimeSRT(seg.endMs);
      const label = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
      return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
    }).join("\n\n");
  }

  /**
   * Builds a WebVTT document: the `WEBVTT` header followed by numbered cues.
   *
   * @param {Array<object>} segments - Segments with `startMs`, `endMs`, `text`
   *   and optional `speakerLabel` (rendered as a `<v label>` voice tag).
   * @returns {string} WebVTT text without a trailing newline.
   */
  toVTT(segments) {
    const header = "WEBVTT\n\n";
    const cues = segments.map((seg, i) => {
      const start = this.formatTimeVTT(seg.startMs);
      const end = this.formatTimeVTT(seg.endMs);
      const label = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
      return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
    }).join("\n\n");
    return header + cues;
  }

  /**
   * Formats milliseconds as an SRT timestamp (`HH:MM:SS,mmm`).
   *
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }

  /**
   * Formats milliseconds as a WebVTT timestamp (`HH:MM:SS.mmm`).
   *
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }

  /**
   * Shared timestamp renderer used by both subtitle formats.
   *
   * The input is rounded to whole milliseconds and clamped to zero so that
   * fractional, negative or non-finite offsets cannot produce malformed
   * fields such as `500.5` or `-1`.
   *
   * @param {number} ms - Offset in milliseconds.
   * @param {string} fractionSeparator - Character placed before the millisecond field.
   * @returns {string} `HH:MM:SS<sep>mmm`.
   */
  formatTimestamp(ms, fractionSeparator) {
    const numericMs = Number(ms);
    const totalMs = Number.isFinite(numericMs) ? Math.max(0, Math.round(numericMs)) : 0;
    const hours = Math.floor(totalMs / 3600000);
    const minutes = Math.floor(totalMs % 3600000 / 60000);
    const seconds = Math.floor(totalMs % 60000 / 1000);
    const millis = totalMs % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${fractionSeparator}${this.pad(millis, 3)}`;
  }

  /**
   * Left-pads a value with zeros.
   *
   * @param {number} value - Value to render.
   * @param {number} length - Minimum width of the result.
   * @returns {string} The zero-padded string.
   */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
|
|
49
|
+
export {
|
|
50
|
+
SubtitleFormatter
|
|
51
|
+
};
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
// src/stt/segment-splitter.ts
|
|
2
|
+
/**
 * Cuts an audio payload into consecutive chunks of bounded duration.
 */
class SegmentSplitter {
  /** Default upper bound for a single chunk: five minutes, in milliseconds. */
  static DEFAULT_MAX_CHUNK_MS = 5 * 60 * 1000;

  /**
   * Splits `audio` into chunks no longer than `maxChunkMs`.
   *
   * Byte offsets are derived proportionally from the duration.
   * NOTE(review): this presumably assumes a constant byte rate (e.g. raw PCM);
   * confirm behavior for compressed formats.
   *
   * @param {object} audio - Payload with `data` (exposing `length` and `slice`),
   *   `format`, `sampleRateHz`, and optional `durationMs` / `channels`.
   * @param {number} [maxChunkMs=SegmentSplitter.DEFAULT_MAX_CHUNK_MS] - Maximum chunk duration.
   * @returns {Array<object>} `[audio]` itself when no split is needed or the
   *   duration cannot be determined; otherwise new chunk objects.
   * @throws {RangeError} When a split is required but `maxChunkMs` is not a positive number.
   */
  split(audio, maxChunkMs = SegmentSplitter.DEFAULT_MAX_CHUNK_MS) {
    const totalDurationMs = audio.durationMs ?? this.estimateDurationMs(audio);
    // A NaN/Infinity duration (e.g. missing or zero sampleRateHz) cannot drive
    // the offset math below, so the audio is handed back whole instead of
    // yielding zero chunks or looping forever.
    if (!Number.isFinite(totalDurationMs) || totalDurationMs <= maxChunkMs) {
      return [audio];
    }
    // A zero, negative or NaN limit would never advance the offset.
    if (!(maxChunkMs > 0)) {
      throw new RangeError(`maxChunkMs must be a positive number, received ${maxChunkMs}`);
    }
    const totalBytes = audio.data.length;
    const bytesPerMs = totalBytes / Math.max(totalDurationMs, 1);
    const chunks = [];
    let offsetMs = 0;
    while (offsetMs < totalDurationMs) {
      const remainingMs = totalDurationMs - offsetMs;
      const isLastChunk = remainingMs <= maxChunkMs;
      const chunkDurationMs = isLastChunk ? remainingMs : maxChunkMs;
      const startByte = Math.floor(offsetMs * bytesPerMs);
      // The final chunk always runs to the end of the buffer so floating-point
      // rounding can never drop trailing bytes.
      const endByte = isLastChunk ? totalBytes : Math.floor((offsetMs + chunkDurationMs) * bytesPerMs);
      chunks.push({
        data: audio.data.slice(startByte, endByte),
        format: audio.format,
        sampleRateHz: audio.sampleRateHz,
        durationMs: chunkDurationMs,
        channels: audio.channels,
      });
      if (isLastChunk) {
        break;
      }
      offsetMs += chunkDurationMs;
    }
    return chunks;
  }

  /**
   * Estimates the duration from the byte length, using 2 bytes per sample and
   * `audio.channels` (default 1) channels at `audio.sampleRateHz`.
   *
   * @param {object} audio - Payload with `data.length`, `sampleRateHz` and optional `channels`.
   * @returns {number} Estimated duration in milliseconds, rounded up. NaN or
   *   Infinity when `sampleRateHz` is missing or zero.
   */
  estimateDurationMs(audio) {
    const bytesPerSample = 2;
    const channels = audio.channels ?? 1;
    const totalSamples = audio.data.length / (bytesPerSample * channels);
    return Math.ceil(totalSamples / audio.sampleRateHz * 1000);
  }
}
|
|
34
|
+
|
|
35
|
+
// src/stt/diarization-mapper.ts
|
|
36
|
+
/**
 * Assigns sequential, human-readable labels to diarized speakers and
 * aggregates per-speaker statistics.
 */
class DiarizationMapper {
  /**
   * Labels every segment that carries a `speakerId` and summarizes each speaker.
   *
   * Speakers are numbered from 1 in order of first appearance. Segments whose
   * `speakerId` is falsy are returned as-is (same object) and are not counted.
   *
   * @param {Array<object>} segments - Segments exposing `speakerId`, `startMs` and `endMs`.
   * @param {string} [labelPrefix="Speaker"] - Text placed before the speaker number.
   * @returns {{segments: Array<object>, speakers: Array<object>}} Shallow copies of the
   *   labeled segments with `speakerLabel` added, plus one
   *   `{ id, label, segmentCount, totalSpeakingMs }` entry per speaker.
   */
  map(segments, labelPrefix = "Speaker") {
    // A Map keeps insertion order, so it serves both as the first-appearance
    // ordering and as an O(1) lookup (instead of scanning an array per segment).
    const speakersById = new Map();
    const labeledSegments = [];
    for (const seg of segments) {
      if (!seg.speakerId) {
        labeledSegments.push(seg);
        continue;
      }
      let speaker = speakersById.get(seg.speakerId);
      if (speaker === undefined) {
        speaker = {
          id: seg.speakerId,
          label: `${labelPrefix} ${speakersById.size + 1}`,
          segmentCount: 0,
          totalSpeakingMs: 0,
        };
        speakersById.set(seg.speakerId, speaker);
      }
      speaker.segmentCount += 1;
      speaker.totalSpeakingMs += seg.endMs - seg.startMs;
      labeledSegments.push({ ...seg, speakerLabel: speaker.label });
    }
    return { segments: labeledSegments, speakers: [...speakersById.values()] };
  }
}
|
|
74
|
+
|
|
75
|
+
// src/stt/subtitle-formatter.ts
|
|
76
|
+
/**
 * Renders transcript segments as SRT or WebVTT subtitle text.
 */
class SubtitleFormatter {
  /**
   * Builds an SRT document: numbered cues separated by a blank line.
   *
   * @param {Array<object>} segments - Segments with `startMs`, `endMs`, `text`
   *   and optional `speakerLabel` (rendered as a `[label] ` prefix).
   * @returns {string} SRT text without a trailing newline.
   */
  toSRT(segments) {
    return segments.map((seg, i) => {
      const start = this.formatTimeSRT(seg.startMs);
      const end = this.formatTimeSRT(seg.endMs);
      const label = seg.speakerLabel ? `[${seg.speakerLabel}] ` : "";
      return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
    }).join("\n\n");
  }

  /**
   * Builds a WebVTT document: the `WEBVTT` header followed by numbered cues.
   *
   * @param {Array<object>} segments - Segments with `startMs`, `endMs`, `text`
   *   and optional `speakerLabel` (rendered as a `<v label>` voice tag).
   * @returns {string} WebVTT text without a trailing newline.
   */
  toVTT(segments) {
    const header = "WEBVTT\n\n";
    const cues = segments.map((seg, i) => {
      const start = this.formatTimeVTT(seg.startMs);
      const end = this.formatTimeVTT(seg.endMs);
      const label = seg.speakerLabel ? `<v ${seg.speakerLabel}>` : "";
      return `${i + 1}\n${start} --> ${end}\n${label}${seg.text}`;
    }).join("\n\n");
    return header + cues;
  }

  /**
   * Formats milliseconds as an SRT timestamp (`HH:MM:SS,mmm`).
   *
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeSRT(ms) {
    return this.formatTimestamp(ms, ",");
  }

  /**
   * Formats milliseconds as a WebVTT timestamp (`HH:MM:SS.mmm`).
   *
   * @param {number} ms - Offset in milliseconds.
   * @returns {string} The formatted timestamp.
   */
  formatTimeVTT(ms) {
    return this.formatTimestamp(ms, ".");
  }

  /**
   * Shared timestamp renderer used by both subtitle formats.
   *
   * The input is rounded to whole milliseconds and clamped to zero so that
   * fractional, negative or non-finite offsets cannot produce malformed
   * fields such as `500.5` or `-1`.
   *
   * @param {number} ms - Offset in milliseconds.
   * @param {string} fractionSeparator - Character placed before the millisecond field.
   * @returns {string} `HH:MM:SS<sep>mmm`.
   */
  formatTimestamp(ms, fractionSeparator) {
    const numericMs = Number(ms);
    const totalMs = Number.isFinite(numericMs) ? Math.max(0, Math.round(numericMs)) : 0;
    const hours = Math.floor(totalMs / 3600000);
    const minutes = Math.floor(totalMs % 3600000 / 60000);
    const seconds = Math.floor(totalMs % 60000 / 1000);
    const millis = totalMs % 1000;
    return `${this.pad(hours, 2)}:${this.pad(minutes, 2)}:${this.pad(seconds, 2)}${fractionSeparator}${this.pad(millis, 3)}`;
  }

  /**
   * Left-pads a value with zeros.
   *
   * @param {number} value - Value to render.
   * @param {number} length - Minimum width of the result.
   * @returns {string} The zero-padded string.
   */
  pad(value, length) {
    return value.toString().padStart(length, "0");
  }
}
|
|
123
|
+
|
|
124
|
+
// src/stt/transcriber.ts
|
|
125
|
+
/**
 * Orchestrates speech-to-text: chunking, provider calls, optional speaker
 * labeling and optional subtitle rendering.
 */
class Transcriber {
  stt;
  segmentSplitter = new SegmentSplitter();
  diarizationMapper = new DiarizationMapper();
  subtitleFormatter = new SubtitleFormatter();

  /**
   * @param {object} options - Must carry the STT provider as `options.stt`.
   */
  constructor(options) {
    const { stt } = options;
    this.stt = stt;
  }

  /**
   * Transcribes `brief.audio` chunk by chunk (sequentially) and merges the results.
   *
   * @param {object} brief - Reads `audio`, `language`, `diarize`, `speakerCount`,
   *   `vocabularyHints` and `subtitleFormat`.
   * @returns {Promise<object>} `{ id, transcript, subtitles, speakers }`;
   *   `subtitles` is set only for the `"srt"` / `"vtt"` formats and `speakers`
   *   only when `brief.diarize` is truthy.
   */
  async transcribe(brief) {
    const id = generateProjectId();
    const collected = [];
    let transcriptText = "";
    let summedDurationMs = 0;
    // Start time of the current chunk within the full recording.
    let chunkStartMs = 0;

    for (const chunk of this.segmentSplitter.split(brief.audio)) {
      const result = await this.stt.transcribe({
        audio: chunk,
        language: brief.language,
        diarize: brief.diarize,
        speakerCount: brief.speakerCount,
        wordTimestamps: true,
        vocabularyHints: brief.vocabularyHints,
      });
      // Provider timestamps are chunk-relative; shift them onto the global timeline.
      for (const seg of result.segments) {
        collected.push({
          text: seg.text,
          startMs: seg.startMs + chunkStartMs,
          endMs: seg.endMs + chunkStartMs,
          speakerId: seg.speakerId,
          speakerName: seg.speakerName,
          confidence: seg.confidence,
        });
      }
      transcriptText += (transcriptText === "" ? "" : " ") + result.text;
      summedDurationMs += result.durationMs;
      chunkStartMs += chunk.durationMs ?? 0;
    }

    let finalSegments = collected;
    let speakers;
    if (brief.diarize) {
      ({ segments: finalSegments, speakers } = this.diarizationMapper.map(collected));
    }

    let subtitles;
    switch (brief.subtitleFormat ?? "none") {
      case "srt":
        subtitles = this.subtitleFormatter.toSRT(finalSegments);
        break;
      case "vtt":
        subtitles = this.subtitleFormatter.toVTT(finalSegments);
        break;
      default:
        break;
    }

    return {
      id,
      transcript: {
        text: transcriptText,
        segments: finalSegments,
        language: brief.language ?? "en",
        durationMs: summedDurationMs,
      },
      subtitles,
      speakers,
    };
  }

  /**
   * Streams segments from the provider, mapping `speakerName` to `speakerLabel`.
   *
   * @param {*} audio - Passed through to the provider's `transcribeStream`.
   * @param {object} [options] - Optional `language`, `diarize`, `speakerCount`, `vocabularyHints`.
   * @yields {object} `{ text, startMs, endMs, speakerId, speakerLabel, confidence }`.
   * @throws {Error} When the provider has no `transcribeStream` method.
   */
  async* transcribeStream(audio, options) {
    if (!this.stt.transcribeStream) {
      throw new Error("Streaming transcription not supported by the current STT provider");
    }
    const { language, diarize, speakerCount, vocabularyHints } = options ?? {};
    const providerStream = this.stt.transcribeStream(audio, {
      language,
      diarize,
      speakerCount,
      wordTimestamps: true,
      vocabularyHints,
    });
    for await (const { text, startMs, endMs, speakerId, speakerName, confidence } of providerStream) {
      yield { text, startMs, endMs, speakerId, speakerLabel: speakerName, confidence };
    }
  }
}
|
|
212
|
+
/**
 * Creates an identifier of the form `stt_<timestamp>_<random>`.
 *
 * The timestamp is `Date.now()` in base 36; the random part is always exactly
 * six base-36 characters. `Math.random()` is not cryptographically secure, so
 * the result is an identifier, not a secret.
 *
 * @returns {string} The generated identifier.
 */
function generateProjectId() {
  const timestamp = Date.now().toString(36);
  // toString(36) can yield fewer than six fractional digits (0.5 -> "0.i"),
  // so pad to keep the suffix length fixed and never empty.
  const random = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
  return `stt_${timestamp}_${random}`;
}
|
|
217
|
+
export {
|
|
218
|
+
Transcriber
|
|
219
|
+
};
|
|
File without changes
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
// src/sync/duration-negotiator.ts
|
|
2
|
+
/**
 * Reconciles voice segment durations with the scene durations supplied by the caller.
 */
class DurationNegotiator {
  /** Voice/scene ratio above which the voice is treated as too long for the scene. */
  static UPPER_THRESHOLD = 1.1;
  /** Voice/scene ratio below which the voice is treated as too short for the scene. */
  static LOWER_THRESHOLD = 0.7;
  /** Cap on the rate suggested when the voice is too long. */
  static MAX_RATE = 1.3;
  /** Floor on the rate suggested when the voice is too short. */
  static MIN_RATE = 0.8;

  /**
   * Compares each segment's `durationInFrames` with the duration registered
   * for its scene and records one adjustment per segment.
   *
   * Only the "too short" case rewrites a segment (its
   * `recommendedSceneDurationInFrames` becomes the registered scene duration);
   * every other segment is passed through as the same object.
   *
   * @param {object} timingMap - Object whose `segments` array holds entries with
   *   `sceneId`, `durationInFrames` and `recommendedSceneDurationInFrames`.
   * @param {Map<*, number>} sceneDurations - Scene duration in frames, keyed by scene id.
   * @returns {{timingMap: object, adjustments: Array<object>}} A shallow copy of
   *   `timingMap` with the updated segments, and the adjustment log.
   */
  negotiate(timingMap, sceneDurations) {
    const { UPPER_THRESHOLD, LOWER_THRESHOLD, MAX_RATE, MIN_RATE } = DurationNegotiator;
    const adjustments = [];
    const segments = [];

    for (const segment of timingMap.segments) {
      const {
        sceneId,
        durationInFrames: voiceFrames,
        recommendedSceneDurationInFrames: recommendedFrames,
      } = segment;
      const plannedFrames = sceneDurations.get(sceneId);

      // Unknown scene: nothing to compare against, so report the recommendation as-is.
      if (plannedFrames === undefined) {
        adjustments.push({
          sceneId,
          originalSceneDurationInFrames: recommendedFrames,
          voiceDurationInFrames: voiceFrames,
          action: "no_change",
          finalSceneDurationInFrames: recommendedFrames,
        });
        segments.push(segment);
        continue;
      }

      const voiceToSceneRatio = voiceFrames / plannedFrames;
      const adjustment = {
        sceneId,
        originalSceneDurationInFrames: plannedFrames,
        voiceDurationInFrames: voiceFrames,
      };
      let negotiatedSegment = segment;

      if (voiceToSceneRatio > UPPER_THRESHOLD) {
        // Beyond MAX_RATE a rate change alone is not suggested; the scene is extended.
        adjustment.action = voiceToSceneRatio > MAX_RATE ? "extend_scene" : "suggest_rate_change";
        adjustment.suggestedRate = Math.min(voiceToSceneRatio, MAX_RATE);
        adjustment.finalSceneDurationInFrames = recommendedFrames;
      } else if (voiceToSceneRatio < LOWER_THRESHOLD) {
        adjustment.action = "pad_silence";
        adjustment.suggestedRate = Math.max(voiceToSceneRatio, MIN_RATE);
        adjustment.finalSceneDurationInFrames = plannedFrames;
        negotiatedSegment = { ...segment, recommendedSceneDurationInFrames: plannedFrames };
      } else {
        adjustment.action = "no_change";
        adjustment.finalSceneDurationInFrames = recommendedFrames;
      }

      adjustments.push(adjustment);
      segments.push(negotiatedSegment);
    }

    return {
      timingMap: { ...timingMap, segments },
      adjustments,
    };
  }
}
|
|
67
|
+
export {
|
|
68
|
+
DurationNegotiator
|
|
69
|
+
};
|