@storyteller-platform/ghost-story 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +611 -0
- package/README.md +18 -0
- package/dist/api/APIOptions.cjs +16 -0
- package/dist/api/APIOptions.d.cts +18 -0
- package/dist/api/APIOptions.d.ts +18 -0
- package/dist/api/APIOptions.js +0 -0
- package/dist/api/Recognition.cjs +263 -0
- package/dist/api/Recognition.d.cts +77 -0
- package/dist/api/Recognition.d.ts +77 -0
- package/dist/api/Recognition.js +233 -0
- package/dist/api/VoiceActivityDetection.cjs +77 -0
- package/dist/api/VoiceActivityDetection.d.cts +24 -0
- package/dist/api/VoiceActivityDetection.d.ts +24 -0
- package/dist/api/VoiceActivityDetection.js +43 -0
- package/dist/audio/AudioConverter.cjs +331 -0
- package/dist/audio/AudioConverter.d.cts +53 -0
- package/dist/audio/AudioConverter.d.ts +53 -0
- package/dist/audio/AudioConverter.js +310 -0
- package/dist/audio/AudioFormat.cjs +151 -0
- package/dist/audio/AudioFormat.d.cts +25 -0
- package/dist/audio/AudioFormat.d.ts +25 -0
- package/dist/audio/AudioFormat.js +123 -0
- package/dist/audio/AudioSource.cjs +119 -0
- package/dist/audio/AudioSource.d.cts +33 -0
- package/dist/audio/AudioSource.d.ts +33 -0
- package/dist/audio/AudioSource.js +88 -0
- package/dist/audio/index.cjs +74 -0
- package/dist/audio/index.d.cts +6 -0
- package/dist/audio/index.d.ts +6 -0
- package/dist/audio/index.js +54 -0
- package/dist/cli/bin.cjs +277 -0
- package/dist/cli/bin.d.cts +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +275 -0
- package/dist/cli/config.cjs +347 -0
- package/dist/cli/config.d.cts +33 -0
- package/dist/cli/config.d.ts +33 -0
- package/dist/cli/config.js +285 -0
- package/dist/cli/install.cjs +334 -0
- package/dist/cli/install.d.cts +62 -0
- package/dist/cli/install.d.ts +62 -0
- package/dist/cli/install.js +316 -0
- package/dist/cli/whisper-server.cjs +172 -0
- package/dist/cli/whisper-server.d.cts +24 -0
- package/dist/cli/whisper-server.d.ts +24 -0
- package/dist/cli/whisper-server.js +152 -0
- package/dist/config.cjs +60 -0
- package/dist/config.d.cts +12 -0
- package/dist/config.d.ts +12 -0
- package/dist/config.js +32 -0
- package/dist/convert.cjs +88 -0
- package/dist/convert.d.cts +12 -0
- package/dist/convert.d.ts +12 -0
- package/dist/convert.js +63 -0
- package/dist/encodings/Ascii.cjs +75 -0
- package/dist/encodings/Ascii.d.cts +13 -0
- package/dist/encodings/Ascii.d.ts +13 -0
- package/dist/encodings/Ascii.js +48 -0
- package/dist/encodings/Base64.cjs +155 -0
- package/dist/encodings/Base64.d.cts +5 -0
- package/dist/encodings/Base64.d.ts +5 -0
- package/dist/encodings/Base64.js +129 -0
- package/dist/encodings/TextEncodingsCommon.cjs +16 -0
- package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
- package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
- package/dist/encodings/TextEncodingsCommon.js +0 -0
- package/dist/index.cjs +153 -0
- package/dist/index.d.cts +15 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +140 -0
- package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
- package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.js +160 -0
- package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
- package/dist/recognition/DeepgramSTT.cjs +172 -0
- package/dist/recognition/DeepgramSTT.d.cts +23 -0
- package/dist/recognition/DeepgramSTT.d.ts +23 -0
- package/dist/recognition/DeepgramSTT.js +153 -0
- package/dist/recognition/GoogleCloudSTT.cjs +125 -0
- package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
- package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
- package/dist/recognition/GoogleCloudSTT.js +107 -0
- package/dist/recognition/OpenAICloudSTT.cjs +180 -0
- package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
- package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
- package/dist/recognition/OpenAICloudSTT.js +150 -0
- package/dist/recognition/WhisperCppSTT.cjs +296 -0
- package/dist/recognition/WhisperCppSTT.d.cts +40 -0
- package/dist/recognition/WhisperCppSTT.d.ts +40 -0
- package/dist/recognition/WhisperCppSTT.js +275 -0
- package/dist/recognition/WhisperServerSTT.cjs +119 -0
- package/dist/recognition/WhisperServerSTT.d.cts +24 -0
- package/dist/recognition/WhisperServerSTT.d.ts +24 -0
- package/dist/recognition/WhisperServerSTT.js +105 -0
- package/dist/utilities/FileSystem.cjs +54 -0
- package/dist/utilities/FileSystem.d.cts +3 -0
- package/dist/utilities/FileSystem.d.ts +3 -0
- package/dist/utilities/FileSystem.js +20 -0
- package/dist/utilities/Locale.cjs +46 -0
- package/dist/utilities/Locale.d.cts +9 -0
- package/dist/utilities/Locale.d.ts +9 -0
- package/dist/utilities/Locale.js +20 -0
- package/dist/utilities/ObjectUtilities.cjs +41 -0
- package/dist/utilities/ObjectUtilities.d.cts +3 -0
- package/dist/utilities/ObjectUtilities.d.ts +3 -0
- package/dist/utilities/ObjectUtilities.js +7 -0
- package/dist/utilities/Timeline.cjs +120 -0
- package/dist/utilities/Timeline.d.cts +23 -0
- package/dist/utilities/Timeline.d.ts +23 -0
- package/dist/utilities/Timeline.js +94 -0
- package/dist/utilities/Timing.cjs +287 -0
- package/dist/utilities/Timing.d.cts +64 -0
- package/dist/utilities/Timing.d.ts +64 -0
- package/dist/utilities/Timing.js +256 -0
- package/dist/utilities/WhisperTimeline.cjs +344 -0
- package/dist/utilities/WhisperTimeline.d.cts +86 -0
- package/dist/utilities/WhisperTimeline.d.ts +86 -0
- package/dist/utilities/WhisperTimeline.js +313 -0
- package/dist/vad/ActiveGate.cjs +357 -0
- package/dist/vad/ActiveGate.d.cts +53 -0
- package/dist/vad/ActiveGate.d.ts +53 -0
- package/dist/vad/ActiveGate.js +329 -0
- package/dist/vad/ActiveGateOg.cjs +1366 -0
- package/dist/vad/ActiveGateOg.d.cts +33 -0
- package/dist/vad/ActiveGateOg.d.ts +33 -0
- package/dist/vad/ActiveGateOg.js +1341 -0
- package/dist/vad/Silero.cjs +174 -0
- package/dist/vad/Silero.d.cts +25 -0
- package/dist/vad/Silero.d.ts +25 -0
- package/dist/vad/Silero.js +153 -0
- package/package.json +125 -0
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
// Default sample rate used when converting a duration into a sample count.
const WHISPER_SAMPLE_RATE = 16e3;

/**
 * Computes the timestamps (in seconds) at which audio of the given duration
 * is divided when it is split evenly across `numProcessors` chunks.
 *
 * The split is computed on whole samples: the total sample count and the
 * per-chunk sample count are both floored before being converted back to
 * seconds, so the boundaries always land on a sample.
 *
 * @param {number} durationSeconds - Total audio duration in seconds.
 * @param {number} numProcessors - Number of chunks; values <= 1 yield no splits.
 * @param {number} [sampleRate=WHISPER_SAMPLE_RATE] - Samples per second.
 * @returns {number[]} The `numProcessors - 1` interior boundaries, ascending.
 */
function calculateWhisperSplits(durationSeconds, numProcessors, sampleRate = WHISPER_SAMPLE_RATE) {
  const boundaries = [];
  if (numProcessors <= 1) {
    return boundaries;
  }
  const sampleTotal = Math.floor(durationSeconds * sampleRate);
  const chunkSamples = Math.floor(sampleTotal / numProcessors);
  let chunkIndex = 1;
  while (chunkIndex < numProcessors) {
    boundaries.push((chunkIndex * chunkSamples) / sampleRate);
    chunkIndex += 1;
  }
  return boundaries;
}
|
|
14
|
+
// Matches bracketed control tokens such as "[_BEG_]" / "[_TT_50]" and
// angle-bracket tokens such as "<|endoftext|>". The bracket body is matched
// with a negated class (not a greedy ".+") so that text sitting between two
// control tokens in the same string is preserved instead of being swallowed.
const specialTokenPattern = /\[_[^\]]+\]|<\|[a-z_]+\|>/g;

/**
 * Converts whisper.cpp JSON segments into the module's normalized segment
 * shape, with times expressed in seconds.
 *
 * For every token, control-token markers are stripped from the text; tokens
 * that are empty after stripping are skipped. Token offsets (milliseconds) are
 * divided by 1000. A token without `offsets` is given a zero-length span at
 * the end of the last token that did carry offsets.
 *
 * @param {Array<object>} transcription - Segments with `text`,
 *   `offsets: { from, to }` (ms) and, optionally, `tokens` (each with `text`,
 *   `p` and optionally `offsets`). A segment without `tokens` yields no words.
 * @returns {Array<{text: string, segmentStart: number, segmentEnd: number, words: Array<{text: string, start: number, end: number, confidence: number}>}>}
 */
function parseWhisperCppOutput(transcription) {
  return transcription.map((segment) => {
    const words = [];
    let lastTokenEndMs = 0;
    // Tolerate output produced without per-token detail, mirroring how the
    // server parser tolerates a missing `words` array.
    for (const token of segment.tokens ?? []) {
      const cleanedText = token.text.replace(specialTokenPattern, "");
      if (cleanedText.trim().length === 0) continue;
      const offsetFrom = token.offsets?.from ?? lastTokenEndMs;
      const offsetTo = token.offsets?.to ?? lastTokenEndMs;
      if (token.offsets) {
        lastTokenEndMs = token.offsets.to;
      }
      words.push({
        text: cleanedText,
        start: offsetFrom / 1e3,
        end: offsetTo / 1e3,
        confidence: token.p
      });
    }
    return {
      text: segment.text,
      segmentStart: segment.offsets.from / 1e3,
      segmentEnd: segment.offsets.to / 1e3,
      words
    };
  });
}
|
|
43
|
+
/**
 * Converts whisper server segments into the module's normalized segment
 * shape. Field values are copied as-is (no unit conversion is applied).
 *
 * @param {Array<object>} segments - Segments with `text`, `start`, `end` and
 *   an optional `words` array whose items carry `word`, `start`, `end` and an
 *   optional `probability`.
 * @returns {Array<{text: string, segmentStart: number, segmentEnd: number, words: Array<{text: string, start: number, end: number, confidence: number}>}>}
 *   One entry per input segment; a missing `words` array becomes `[]` and a
 *   missing `probability` becomes a confidence of 0.
 */
function parseWhisperServerOutput(segments) {
  const normalizeWord = ({ word, start, end, probability }) => ({
    text: word,
    start,
    end,
    confidence: probability ?? 0
  });
  return segments.map(({ text, start, end, words }) => ({
    text,
    segmentStart: start,
    segmentEnd: end,
    words: (words ?? []).map((item) => normalizeWord(item))
  }));
}
|
|
59
|
+
// Per-character duration used to estimate how long a word should last.
// NOTE(review): despite the MS_ prefix, the estimate built from this value is
// added directly to word start times elsewhere in this file, and those times
// are in seconds - so this looks like seconds per character. Confirm before
// renaming.
const MS_PER_CHAR = 0.15;
// Word durations above this value are treated as implausible when the
// confidence is also low.
const MAX_REASONABLE_WORD_DURATION = 5;
// Confidence values below this are treated as low.
const LOW_CONFIDENCE_THRESHOLD = 0.3;

/**
 * Estimates a plausible duration for a word from its trimmed character count.
 *
 * @param {string} text - Word text; surrounding whitespace is ignored.
 * @returns {number} `length * MS_PER_CHAR`, but never less than 0.1.
 */
function estimateReasonableDuration(text) {
  const estimate = text.trim().length * MS_PER_CHAR;
  return Math.max(0.1, estimate);
}
|
|
66
|
+
/**
 * Decides whether a segment looks like the first segment produced by a new
 * processor, i.e. its word timestamps restarted near zero.
 *
 * The segment must have a first word starting before 1. Given that, it is a
 * boundary when any of the following holds:
 *  - the segment start is more than 1 past `state.lastSegmentEnd`;
 *  - the segment end is before the segment start;
 *  - `state.lastWordEnd` is past 30 and the first word starts before 0.5.
 *
 * @param {{words: Array<{start: number}>, segmentStart: number, segmentEnd: number}} segment
 * @param {{lastSegmentEnd: number, lastWordEnd: number}} state - Running state.
 * @returns {boolean} True when the segment is judged to start a new chunk.
 */
function detectProcessorBoundary(segment, state) {
  const leadingWord = segment.words[0];
  if (!leadingWord) {
    return false;
  }
  if (!(leadingWord.start < 1)) {
    return false;
  }
  const jumpsForward = segment.segmentStart > state.lastSegmentEnd + 1;
  const runsBackwards = segment.segmentEnd < segment.segmentStart;
  return (
    jumpsForward ||
    runsBackwards ||
    (state.lastWordEnd > 30 && leadingWord.start < 0.5)
  );
}
|
|
80
|
+
/**
 * Counts how many segments are classified as processor boundaries by
 * `detectProcessorBoundary`, walking the segments in order while tracking the
 * end of the previous segment and the furthest word end seen so far.
 *
 * @param {Array<{words: Array<{start: number, end: number}>, segmentStart: number, segmentEnd: number}>} segments
 * @returns {number} Number of detected boundaries (0 for an empty list).
 */
function countProcessorBoundaries(segments) {
  if (segments.length === 0) {
    return 0;
  }
  const tracker = { cumulativeOffset: 0, lastSegmentEnd: 0, lastWordEnd: 0 };
  let detected = 0;
  for (const segment of segments) {
    detected += detectProcessorBoundary(segment, tracker) ? 1 : 0;
    // A segment whose end precedes its start is treated as zero-length.
    tracker.lastSegmentEnd =
      segment.segmentStart > segment.segmentEnd ? segment.segmentStart : segment.segmentEnd;
    const finalWord = segment.words[segment.words.length - 1];
    if (finalWord) {
      tracker.lastWordEnd = Math.max(tracker.lastWordEnd, finalWord.end);
    }
  }
  return detected;
}
|
|
101
|
+
// Each processor must be given at least this many seconds of audio.
const MIN_SECONDS_PER_PROCESSOR = 30;

/**
 * Limits the requested processor count so that every processor receives at
 * least `MIN_SECONDS_PER_PROCESSOR` seconds of audio.
 *
 * The result is always a whole number >= 1: a requested count that is zero,
 * negative, fractional or not a number is normalized instead of being
 * returned as-is, and a duration that is not a number is treated as allowing
 * a single processor.
 *
 * @param {number} durationSeconds - Audio duration in seconds.
 * @param {number} requestedProcessors - Desired processor count.
 * @returns {number} Processor count to use, between 1 and the requested count.
 */
function calculateEffectiveProcessors(durationSeconds, requestedProcessors) {
  const byDuration = Math.floor(durationSeconds / MIN_SECONDS_PER_PROCESSOR);
  const maxProcessors = Number.isNaN(byDuration) ? 1 : Math.max(1, byDuration);
  const wholeRequest = Math.floor(requestedProcessors);
  const requested = Number.isNaN(wholeRequest) ? 1 : Math.max(1, wholeRequest);
  return Math.min(requested, maxProcessors);
}
|
|
109
|
+
/**
 * Produces corrected start/end times for one word.
 *
 * Steps, in order:
 *  1. shift both times by `state.cumulativeOffset`;
 *  2. if the word is longer than `MAX_REASONABLE_WORD_DURATION` and its
 *     confidence is below `LOW_CONFIDENCE_THRESHOLD`, replace its end with
 *     start + an estimate based on the text length;
 *  3. if it starts before `state.lastWordEnd`, slide it forward so it starts
 *     there (the end moves by the same amount);
 *  4. never let the end precede the start;
 *  5. when the segment has positive length, cap the end at the (offset)
 *     segment end, but not below the start.
 *
 * @param {{start: number, end: number, text: string, confidence: number}} word
 * @param {{cumulativeOffset: number, lastWordEnd: number}} state
 * @param {{start: number, end: number}} segmentBounds - Un-offset segment bounds.
 * @returns {{startTime: number, endTime: number}}
 */
function correctWordTimestamps(word, state, segmentBounds) {
  const { cumulativeOffset, lastWordEnd } = state;
  let startTime = word.start + cumulativeOffset;
  let endTime = word.end + cumulativeOffset;

  const tooLong = endTime - startTime > MAX_REASONABLE_WORD_DURATION;
  if (tooLong && word.confidence < LOW_CONFIDENCE_THRESHOLD) {
    endTime = startTime + estimateReasonableDuration(word.text);
  }

  if (startTime < lastWordEnd) {
    endTime += lastWordEnd - startTime;
    startTime = lastWordEnd;
  }

  if (endTime < startTime) {
    endTime = startTime;
  }

  const hasLength = segmentBounds.end - segmentBounds.start > 0;
  const limit = segmentBounds.end + cumulativeOffset;
  if (hasLength && endTime > limit) {
    endTime = Math.max(startTime, limit);
  }

  return { startTime, endTime };
}
|
|
131
|
+
/**
 * Flattens parsed segments into a single word timeline with corrected,
 * non-overlapping timestamps.
 *
 * For each segment that `detectProcessorBoundary` flags, the running offset is
 * re-anchored so the segment's first word lands on an anchor time:
 *  - the first not-yet-used entry of `options.splitBoundaries` that lies
 *    within 30 of either the expected time or the segment start, if any;
 *  - otherwise the expected time itself,
 * where the expected time is `lastWordEnd` when it is positive and the segment
 * start otherwise.
 *
 * Words that are blank or contain "BLANK_AUDIO" are dropped. A word whose raw
 * text does not begin with a space is appended to the previous timeline entry
 * (text concatenated, confidence reduced to the minimum, end time extended or
 * re-estimated when the merged entry is implausibly long and low-confidence);
 * any other word starts a new entry. The result is finally passed through
 * `ensureMonotonicTimeline`.
 *
 * @param {Array<{segmentStart: number, segmentEnd: number, words: Array<{text: string, start: number, end: number, confidence: number}>}>} segments
 * @param {{splitBoundaries?: number[]}} [options]
 * @returns {Array<{type: string, text: string, startTime: number, endTime: number, confidence: number}>}
 */
function extractCorrectedTimeline(segments, options = {}) {
  if (segments.length === 0) return [];
  const { splitBoundaries = [] } = options;
  const consumedSplits = /* @__PURE__ */ new Set();
  const entries = [];
  const tracker = {
    cumulativeOffset: 0,
    lastSegmentEnd: 0,
    lastWordEnd: 0
  };

  for (const segment of segments) {
    const boundsStart = segment.segmentStart;
    // A segment whose end precedes its start is treated as zero-length.
    const boundsEnd = segment.segmentEnd < boundsStart ? boundsStart : segment.segmentEnd;

    if (detectProcessorBoundary(segment, tracker)) {
      const localStart = segment.words[0]?.start ?? 0;
      const expectedTime = tracker.lastWordEnd > 0 ? tracker.lastWordEnd : boundsStart;
      // First unused split close enough to where this chunk should begin.
      const matchedSplit = splitBoundaries.find(
        (candidate) =>
          !consumedSplits.has(candidate) &&
          (Math.abs(candidate - expectedTime) < 30 || Math.abs(candidate - boundsStart) < 30)
      );
      if (matchedSplit !== undefined) {
        tracker.cumulativeOffset = matchedSplit - localStart;
        consumedSplits.add(matchedSplit);
      } else {
        tracker.cumulativeOffset = expectedTime - localStart;
      }
    }

    for (const word of segment.words) {
      const label = word.text.trim();
      if (label.length === 0 || label.includes("BLANK_AUDIO")) continue;

      const { startTime, endTime } = correctWordTimestamps(word, tracker, {
        start: boundsStart,
        end: boundsEnd
      });

      const previous = entries[entries.length - 1];
      const continuesPrevious = Boolean(previous) && !word.text.startsWith(" ");
      if (!continuesPrevious) {
        entries.push({
          type: "word",
          text: label,
          startTime,
          endTime,
          confidence: word.confidence
        });
      } else {
        previous.text += label;
        if (previous.confidence !== undefined) {
          previous.confidence = Math.min(previous.confidence, word.confidence);
        }
        const mergedSpan = endTime - previous.startTime;
        const mergedTrust = previous.confidence ?? 1;
        const implausible =
          mergedSpan > MAX_REASONABLE_WORD_DURATION && mergedTrust < LOW_CONFIDENCE_THRESHOLD;
        previous.endTime = implausible
          ? previous.startTime + estimateReasonableDuration(previous.text)
          : endTime;
      }

      tracker.lastWordEnd = entries[entries.length - 1]?.endTime ?? endTime;
    }

    tracker.lastSegmentEnd = boundsEnd;
  }

  return ensureMonotonicTimeline(entries);
}
|
|
210
|
+
/**
 * Returns a copy of the timeline, ordered by start time, in which no entry
 * starts before the previous one ends and no entry ends before it starts.
 *
 * An entry that would overlap its predecessor is slid forward to begin at the
 * predecessor's end (its end moves by the same amount). The input array and
 * its entries are not modified.
 *
 * @param {Array<{startTime: number, endTime: number}>} timeline
 * @returns {Array<object>} New entries carrying every original property plus
 *   the adjusted `startTime` / `endTime`.
 */
function ensureMonotonicTimeline(timeline) {
  if (timeline.length === 0) return [];
  let cursor = 0;
  return [...timeline]
    .sort((left, right) => left.startTime - right.startTime)
    .map((entry) => {
      let { startTime, endTime } = entry;
      if (startTime < cursor) {
        endTime = endTime + (cursor - startTime);
        startTime = cursor;
      }
      if (endTime < startTime) {
        endTime = startTime;
      }
      cursor = endTime;
      return { ...entry, startTime, endTime };
    });
}
|
|
235
|
+
/**
 * Computes summary statistics for a word timeline.
 *
 * An entry is reported as suspicious when it lasts longer than 3 and its
 * confidence (treated as 1 when absent) is below `LOW_CONFIDENCE_THRESHOLD`.
 * The median is the element at index `floor(n / 2)` of the sorted durations
 * (for an even count this is the upper of the two middle values).
 *
 * The maximum is computed with a fold rather than `Math.max(...durations)`:
 * spreading a very long timeline into a call's arguments can exceed the
 * engine's argument limit and throw a RangeError.
 *
 * @param {Array<{text: string, startTime: number, endTime: number, confidence?: number}>} timeline
 * @returns {{totalWords: number, maxWordDuration: number, averageWordDuration: number, medianWordDuration: number, suspiciousTokenCount: number, suspiciousTokens: Array<{text: string, duration: number, confidence: number, startTime: number}>, totalDuration: number}}
 *   `totalDuration` is the end time of the last entry; all fields are 0 / empty
 *   for an empty timeline.
 */
function scoreTimeline(timeline) {
  if (timeline.length === 0) {
    return {
      totalWords: 0,
      maxWordDuration: 0,
      averageWordDuration: 0,
      medianWordDuration: 0,
      suspiciousTokenCount: 0,
      suspiciousTokens: [],
      totalDuration: 0
    };
  }
  const durations = timeline.map((entry) => entry.endTime - entry.startTime);
  const sortedDurations = [...durations].sort((a, b) => a - b);
  const suspiciousTokens = [];
  for (const entry of timeline) {
    const duration = entry.endTime - entry.startTime;
    const confidence = entry.confidence ?? 1;
    if (duration > 3 && confidence < LOW_CONFIDENCE_THRESHOLD) {
      suspiciousTokens.push({
        text: entry.text,
        duration,
        confidence,
        startTime: entry.startTime
      });
    }
  }
  const lastEntry = timeline[timeline.length - 1];
  const totalDuration = lastEntry ? lastEntry.endTime : 0;
  const sum = durations.reduce((acc, d) => acc + d, 0);
  // Same result as Math.max(...durations), without the argument-count limit.
  const maxWordDuration = durations.reduce((peak, d) => Math.max(peak, d), -Infinity);
  const medianIndex = Math.floor(sortedDurations.length / 2);
  return {
    totalWords: timeline.length,
    maxWordDuration,
    averageWordDuration: sum / durations.length,
    medianWordDuration: sortedDurations[medianIndex] ?? 0,
    suspiciousTokenCount: suspiciousTokens.length,
    suspiciousTokens,
    totalDuration
  };
}
|
|
276
|
+
/**
 * Compares a test timeline against a baseline timeline.
 *
 * Text similarity is the Jaccard index of the two lower-cased word sets
 * (entry texts joined with spaces, then split on whitespace). The remaining
 * figures are absolute differences between the `scoreTimeline` metrics of the
 * two timelines.
 *
 * @param {Array<{text: string, startTime: number, endTime: number, confidence?: number}>} baseline
 * @param {Array<{text: string, startTime: number, endTime: number, confidence?: number}>} test
 * @returns {{textSimilarity: number, durationDifference: number, wordCountDifference: number, maxDurationDifference: number, isAcceptable: boolean}}
 *   `isAcceptable` requires similarity above 0.8, no suspicious tokens in the
 *   test timeline and a max-word-duration difference below 2.
 */
function compareTimelines(baseline, test) {
  const vocabularyOf = (entries) =>
    new Set(
      entries
        .map((e) => e.text)
        .join(" ")
        .toLowerCase()
        .split(/\s+/)
    );
  const baselineWords = vocabularyOf(baseline);
  const testWords = vocabularyOf(test);

  let sharedCount = 0;
  for (const token of baselineWords) {
    if (testWords.has(token)) sharedCount += 1;
  }
  const combined = /* @__PURE__ */ new Set([...baselineWords, ...testWords]);
  const textSimilarity = sharedCount / combined.size;

  const baselineMetrics = scoreTimeline(baseline);
  const testMetrics = scoreTimeline(test);
  const gap = (pick) => Math.abs(pick(baselineMetrics) - pick(testMetrics));
  const durationDifference = gap((m) => m.totalDuration);
  const wordCountDifference = gap((m) => m.totalWords);
  const maxDurationDifference = gap((m) => m.maxWordDuration);

  return {
    textSimilarity,
    durationDifference,
    wordCountDifference,
    maxDurationDifference,
    isAcceptable:
      textSimilarity > 0.8 &&
      testMetrics.suspiciousTokenCount === 0 &&
      maxDurationDifference < 2
  };
}
|
|
304
|
+
export {
|
|
305
|
+
calculateEffectiveProcessors,
|
|
306
|
+
calculateWhisperSplits,
|
|
307
|
+
compareTimelines,
|
|
308
|
+
countProcessorBoundaries,
|
|
309
|
+
extractCorrectedTimeline,
|
|
310
|
+
parseWhisperCppOutput,
|
|
311
|
+
parseWhisperServerOutput,
|
|
312
|
+
scoreTimeline
|
|
313
|
+
};
|
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var ActiveGate_exports = {};
|
|
20
|
+
__export(ActiveGate_exports, {
|
|
21
|
+
StreamingVad: () => StreamingVad,
|
|
22
|
+
defaultVadOptions: () => defaultVadOptions,
|
|
23
|
+
detectVoiceActivity: () => detectVoiceActivity,
|
|
24
|
+
vadFromFile: () => vadFromFile,
|
|
25
|
+
vadFromStream: () => vadFromStream
|
|
26
|
+
});
|
|
27
|
+
module.exports = __toCommonJS(ActiveGate_exports);
|
|
28
|
+
var import_ffmpeg_stream = require("ffmpeg-stream");
|
|
29
|
+
// Default tuning for StreamingVad. How each value is consumed:
//  - lowCutoff / highCutoff: cutoff frequencies handed to the per-channel
//    highpass and lowpass filters respectively (presumably Hz - confirm);
//  - positiveAdaptationRate / negativeAdaptationRate: divided by the number of
//    processed samples per second and used as the rise / fall rates of the
//    smoothed loudness estimate;
//  - peakLoudnessDecay: divided the same way and used as the per-sample decay
//    of the tracked loudness peaks;
//  - backwardExtensionDuration: divided by the frame duration to get the
//    number of frames an active region is extended backwards;
//  - relativeThreshold: added to the reference loudness (dB) to get the level
//    a frame must reach to count as loud.
const defaultVadOptions = {
  lowCutoff: 100,
  highCutoff: 1e3,
  positiveAdaptationRate: 400,
  negativeAdaptationRate: 10,
  peakLoudnessDecay: 4,
  backwardExtensionDuration: 0.2,
  relativeThreshold: -15
};

/**
 * Runs voice activity detection over fully decoded audio.
 *
 * Samples are fed to a StreamingVad one frame at a time, channel by channel;
 * the length of the first channel decides how many frames are processed and a
 * channel that has no sample at some index contributes 0.
 *
 * @param {{audioChannels: ArrayLike<number>[], sampleRate: number}} rawAudio
 * @param {object} [options] - Overrides for `defaultVadOptions`.
 * @returns {Array<{startTime: number, endTime: number, isActive: boolean}>}
 *   Detected segments; `[]` when there are no channels.
 */
function detectVoiceActivity(rawAudio, options = {}) {
  const { audioChannels } = rawAudio;
  const channelCount = audioChannels.length;
  const reference = audioChannels[0];
  if (!reference || channelCount === 0) {
    return [];
  }
  const detector = new StreamingVad(rawAudio.sampleRate, channelCount, options);
  const frameTotal = reference.length;
  for (let frame = 0; frame < frameTotal; frame += 1) {
    for (let channel = 0; channel < channelCount; channel += 1) {
      detector.process(audioChannels[channel]?.[frame] ?? 0, channel);
    }
  }
  return detector.finalize();
}
|
|
52
|
+
// Stream format assumed when the caller does not specify one.
const defaultStreamOptions = {
  sampleRate: 16e3,
  channelCount: 1
};

/**
 * Runs voice activity detection on an audio file by decoding it with ffmpeg
 * to raw little-endian 32-bit float PCM and feeding that stream to
 * `vadFromStream`.
 *
 * The decode format follows `options.sampleRate` / `options.channelCount`
 * (defaulting to `defaultStreamOptions`), so the bytes ffmpeg emits always
 * match the layout the detector is told to expect.
 *
 * Error handling: the ffmpeg run promise is observed from the moment it is
 * created, so an early ffmpeg failure cannot surface as an unhandled
 * rejection while the stream is still being consumed. If reading the stream
 * fails, ffmpeg is killed and that original error is rethrown (it is not
 * replaced by the error of the killed process). If the stream is consumed
 * successfully but ffmpeg itself failed, the ffmpeg error is thrown.
 *
 * @param {string} path - Path of the audio file to analyse.
 * @param {object} [options] - VAD options, plus optional `sampleRate` and
 *   `channelCount`.
 * @returns {Promise<Array<{startTime: number, endTime: number, isActive: boolean}>>}
 * @throws Whatever the stream consumer or the ffmpeg run rejects with.
 */
async function vadFromFile(path, options = {}) {
  const sampleRate = options.sampleRate ?? defaultStreamOptions.sampleRate;
  const channelCount = options.channelCount ?? defaultStreamOptions.channelCount;
  const converter = new import_ffmpeg_stream.Converter();
  converter.createInputFromFile(path);
  const outputStream = converter.createOutputStream({
    f: "f32le",
    ar: String(sampleRate),
    ac: String(channelCount),
    acodec: "pcm_f32le"
  });
  const segments = [];
  // Capture the outcome instead of leaving the promise unobserved.
  const runOutcome = converter.run().then(
    () => ({ ok: true }),
    (error) => ({ ok: false, error })
  );
  try {
    for await (const seg of vadFromStream(outputStream, { ...options, sampleRate, channelCount })) {
      segments.push(seg);
    }
  } catch (error) {
    converter.kill();
    // Wait for ffmpeg to wind down, but report the error that caused the abort.
    await runOutcome;
    throw error;
  }
  const outcome = await runOutcome;
  if (!outcome.ok) {
    throw outcome.error;
  }
  return segments;
}
|
|
79
|
+
/**
 * Runs voice activity detection over a stream of raw PCM chunks, yielding
 * segments as soon as they are finalized.
 *
 * Each chunk is read as interleaved little-endian 32-bit floats. Bytes that
 * do not make up a whole frame (one sample per channel) are carried over to
 * the next chunk; any still left when the stream ends are not processed.
 *
 * @param {AsyncIterable<Buffer>} stream - Source of PCM chunks.
 * @param {object} [options] - VAD options, plus optional `sampleRate` and
 *   `channelCount` (defaulting to `defaultStreamOptions`).
 * @yields {{startTime: number, endTime: number, isActive: boolean}}
 */
async function* vadFromStream(stream, options = {}) {
  const sampleRate = options.sampleRate ?? defaultStreamOptions.sampleRate;
  const channelCount = options.channelCount ?? defaultStreamOptions.channelCount;
  const detector = new StreamingVad(sampleRate, channelCount, options);
  const sampleBytes = 4;
  const frameBytes = sampleBytes * channelCount;
  let carry = Buffer.alloc(0);

  for await (const chunk of stream) {
    const data = Buffer.concat([carry, chunk]);
    const wholeBytes = Math.floor(data.length / frameBytes) * frameBytes;
    let cursor = 0;
    while (cursor < wholeBytes) {
      for (let channel = 0; channel < channelCount; channel += 1) {
        detector.process(data.readFloatLE(cursor + channel * sampleBytes), channel);
      }
      cursor += frameBytes;
    }
    carry = data.subarray(wholeBytes);
    for (const ready of detector.flush()) {
      yield ready;
    }
  }

  // End of input: emit everything that is still buffered.
  for (const remaining of detector.flush(true)) {
    yield remaining;
  }
}
|
|
105
|
+
/**
 * Incremental voice activity detector.
 *
 * Samples are pushed one at a time through `process`. Each sample is
 * band-limited by its channel's filters and folded into a smoothed loudness
 * estimate that is shared by all channels; roughly every `frameDuration`
 * seconds a frame (time, loudness in dB, decaying peak loudness) is recorded.
 * `flush` turns recorded frames into alternating active / inactive segments.
 */
class StreamingVad {
  opts;
  channelFilters;
  loudness;
  minLoudness;
  maxLoudness;
  frameDuration = 0.01;
  ticksPerSecond;
  backwardFrameCount;
  frameBuffer = [];
  currentSampleIndex = 0;
  emittedUpToTime = 0;
  pendingSegment = null;

  /**
   * @param {number} sampleRate - Samples per second, per channel.
   * @param {number} channelCount - Number of interleaved channels.
   * @param {object} [options] - Overrides for `defaultVadOptions`.
   */
  constructor(sampleRate, channelCount, options = {}) {
    this.sampleRate = sampleRate;
    this.channelCount = channelCount;
    this.opts = { ...defaultVadOptions, ...options };
    // One tick per processed sample, counting every channel.
    this.ticksPerSecond = sampleRate * channelCount;
    this.backwardFrameCount = Math.ceil(
      this.opts.backwardExtensionDuration / this.frameDuration
    );
    this.channelFilters = Array.from({ length: channelCount }, () => {
      const highpass = createBiquadFilter("highpass", sampleRate, this.opts.lowCutoff);
      const lowpass = createBiquadFilter("lowpass", sampleRate, this.opts.highCutoff);
      return { highpass, lowpass };
    });
    const floorPower = dbToGain(-60) ** 2;
    const riseRate = this.opts.positiveAdaptationRate / this.ticksPerSecond;
    const fallRate = this.opts.negativeAdaptationRate / this.ticksPerSecond;
    this.loudness = new SmoothEstimator(riseRate, fallRate, floorPower);
    this.minLoudness = new DecayingPeak(
      "min",
      -60,
      this.opts.peakLoudnessDecay / this.ticksPerSecond
    );
    this.maxLoudness = new DecayingPeak(
      "max",
      -60,
      this.opts.peakLoudnessDecay / this.ticksPerSecond
    );
  }

  /**
   * Feeds one sample of one channel. Samples for an unknown channel index are
   * ignored. The sample clock advances after the last channel's sample.
   *
   * @param {number} sample - Sample value.
   * @param {number} channel - Zero-based channel index.
   */
  process(sample, channel) {
    const filters = this.channelFilters[channel];
    if (!filters) return;
    const filtered = filters.lowpass.process(filters.highpass.process(sample));
    this.loudness.update(filtered ** 2);
    // The power is floored at the -60 dB level before conversion to dB.
    const flooredPower = Math.max(this.loudness.value, dbToGain(-60) ** 2);
    const levelDb = gainToDb(Math.sqrt(flooredPower));
    this.minLoudness.update(levelDb);
    if (levelDb >= -60) {
      this.maxLoudness.update(levelDb);
    }
    const now = this.currentSampleIndex / this.sampleRate;
    const newest = this.frameBuffer[this.frameBuffer.length - 1];
    const frameElapsed = !newest || now > newest.time + this.frameDuration;
    if (frameElapsed) {
      this.frameBuffer.push({
        time: now,
        loudness: levelDb,
        maxLoudness: this.maxLoudness.value
      });
    }
    if (channel === this.channelCount - 1) {
      this.currentSampleIndex += 1;
    }
  }

  // flush finalized segments, keeping buffer for backward extension
  // call with final=true when done processing to get remaining segments
  flush(final = false) {
    const emitted = [];
    const buffered = this.frameBuffer.length;
    if (buffered === 0) return emitted;
    const readyCount = final ? buffered : Math.max(0, buffered - this.backwardFrameCount);
    if (readyCount === 0) return emitted;

    const activity = this.computeActivity(readyCount, final);
    for (let index = 0; index < readyCount; index += 1) {
      const frame = this.frameBuffer[index];
      if (!frame) continue;
      const isActive = activity[index] ?? false;
      const startTime = frame.time;
      const endTime = startTime + this.frameDuration;
      const open = this.pendingSegment;
      if (open && open.isActive === isActive) {
        // Same state as the open segment: just stretch it.
        open.endTime = endTime;
        continue;
      }
      if (open) {
        emitted.push(open);
      }
      this.pendingSegment = { startTime, endTime, isActive };
    }

    if (final && this.pendingSegment) {
      emitted.push(this.pendingSegment);
      this.pendingSegment = null;
    }
    this.frameBuffer.splice(0, readyCount);
    const lastEmitted = emitted[emitted.length - 1];
    this.emittedUpToTime = lastEmitted?.endTime ?? this.emittedUpToTime;
    return emitted;
  }

  /**
   * Classifies the first `count` buffered frames as active or not.
   *
   * Frames are scanned from newest to oldest. A frame is loud when its
   * loudness reaches its reference level (its recorded peak, but at least
   * -30) plus `relativeThreshold`. A frame is active when it is loud or lies
   * within `backwardFrameCount` frames before the nearest later loud frame.
   *
   * @param {number} count - Number of leading frames to classify.
   * @param {boolean} includeBuffer - Scan the whole buffer instead of only
   *   `backwardFrameCount` frames past `count`.
   * @returns {boolean[]} Activity flag per classified frame.
   */
  computeActivity(count, includeBuffer) {
    const flags = new Array(count).fill(false);
    const available = this.frameBuffer.length;
    const scanEnd = includeBuffer
      ? available
      : Math.min(count + this.backwardFrameCount, available);
    let activeFrom = count;
    for (let index = scanEnd - 1; index >= 0; index -= 1) {
      const frame = this.frameBuffer[index];
      if (!frame) continue;
      const reference = Math.max(frame.maxLoudness, -30);
      const loud = frame.loudness >= reference + this.opts.relativeThreshold;
      if (loud) {
        activeFrom = Math.max(index - this.backwardFrameCount, 0);
      }
      if (index < count && (index >= activeFrom || loud)) {
        flags[index] = true;
      }
    }
    return flags;
  }

  // Emits every remaining segment; equivalent to flush(true).
  finalize() {
    return this.flush(true);
  }

  // Same as finalize(): this also consumes the buffered frames.
  getSegments() {
    return this.flush(true);
  }

  // reset all state to process a new audio stream
  reset() {
    this.frameBuffer = [];
    this.currentSampleIndex = 0;
    this.emittedUpToTime = 0;
    this.pendingSegment = null;
    this.loudness.reset(dbToGain(-60) ** 2);
    this.minLoudness.reset(-60);
    this.maxLoudness.reset(-60);
    for (const { highpass, lowpass } of this.channelFilters) {
      highpass.reset();
      lowpass.reset();
    }
  }
}
|
|
248
|
+
/**
 * Tracks a value that moves towards each new target by a fraction of the
 * remaining distance, using one fraction when rising and another when
 * falling.
 */
class SmoothEstimator {
  value;

  /**
   * @param {number} upRate - Fraction of the gap closed when the target is
   *   at or above the current value.
   * @param {number} downRate - Fraction of the gap closed when it is below.
   * @param {number} initial - Starting value.
   */
  constructor(upRate, downRate, initial) {
    this.upRate = upRate;
    this.downRate = downRate;
    this.value = initial;
  }

  /** Moves the value towards `target`. */
  update(target) {
    const gap = target - this.value;
    if (gap >= 0) {
      this.value += gap * this.upRate;
    } else {
      this.value += gap * this.downRate;
    }
  }

  /** Overwrites the current value. */
  reset(value) {
    this.value = value;
  }
}
|
|
264
|
+
/**
 * Tracks a running maximum (kind "max") or minimum (any other kind) that
 * relaxes by a fixed step on every update, so old extremes fade over time.
 */
class DecayingPeak {
  value;

  /**
   * @param {string} kind - "max" to track peaks; anything else tracks troughs.
   * @param {number} initial - Starting value.
   * @param {number} decay - Amount the held value relaxes per update.
   */
  constructor(kind, initial, decay) {
    this.kind = kind;
    this.decay = decay;
    this.value = initial;
  }

  /** Relaxes the held value by one step, then takes the extreme with `v`. */
  update(v) {
    if (this.kind !== "max") {
      this.value += this.decay;
      this.value = Math.min(v, this.value);
      return;
    }
    this.value -= this.decay;
    this.value = Math.max(v, this.value);
  }

  /** Overwrites the held value. */
  reset(value) {
    this.value = value;
  }
}
|
|
284
|
+
/**
 * Second-order recursive filter that keeps the two most recent inputs and
 * outputs and evaluates
 * `y = b0*x + b1*x1 + b2*x2 - a1*y1 - a2*y2` for every sample.
 */
class BiquadFilter {
  x1 = 0;
  x2 = 0;
  y1 = 0;
  y2 = 0;

  /**
   * @param {{b0: number, b1: number, b2: number, a1: number, a2: number}} c
   *   Filter coefficients, used as given.
   */
  constructor(c) {
    this.c = c;
  }

  /**
   * Filters one sample and advances the internal history.
   *
   * @param {number} x - Input sample.
   * @returns {number} Filtered sample.
   */
  process(x) {
    const { b0, b1, b2, a1, a2 } = this.c;
    const output = b0 * x + b1 * this.x1 + b2 * this.x2 - a1 * this.y1 - a2 * this.y2;
    this.x2 = this.x1;
    this.y2 = this.y1;
    this.x1 = x;
    this.y1 = output;
    return output;
  }

  /** Clears the input/output history. */
  reset() {
    this.x1 = 0;
    this.x2 = 0;
    this.y1 = 0;
    this.y2 = 0;
  }
}
|
|
307
|
+
/**
 * Builds a lowpass or highpass BiquadFilter for the given cutoff.
 *
 * The cutoff is normalized to the Nyquist frequency and capped at 1. Special
 * cases: at a normalized cutoff of exactly 1 a lowpass passes the input
 * through and any other type outputs zero; a highpass at a cutoff of 0 passes
 * the input through. Any `type` other than "lowpass" is otherwise treated as
 * a highpass.
 *
 * NOTE(review): `q` is converted with 10^(q/20), i.e. it is interpreted as a
 * value in decibels, while the default 0.7071 looks like a linear Q - confirm
 * which was intended.
 *
 * @param {string} type - "lowpass" or "highpass".
 * @param {number} sampleRate - Samples per second.
 * @param {number} freq - Cutoff frequency, in the same unit as `sampleRate`.
 * @param {number} [q=0.7071] - Resonance parameter (see note above).
 * @returns {BiquadFilter} Filter with coefficients normalized by a0.
 */
function createBiquadFilter(type, sampleRate, freq, q = 0.7071) {
  const passThrough = { b0: 1, b1: 0, b2: 0, a1: 0, a2: 0 };
  const silence = { b0: 0, b1: 0, b2: 0, a1: 0, a2: 0 };
  const isLowpass = type === "lowpass";
  const normalized = Math.min(freq / (sampleRate / 2), 1);

  if (normalized === 1) {
    return new BiquadFilter(isLowpass ? passThrough : silence);
  }
  if (normalized === 0 && type === "highpass") {
    return new BiquadFilter(passThrough);
  }

  const theta = Math.PI * normalized;
  const cosTheta = Math.cos(theta);
  const alpha = Math.sin(theta) / (2 * Math.pow(10, q / 20));
  const beta = (isLowpass ? 1 - cosTheta : 1 + cosTheta) / 2;
  const middle = isLowpass ? 2 * beta : -2 * beta;
  const a0 = 1 + alpha;

  return new BiquadFilter({
    b0: beta / a0,
    b1: middle / a0,
    b2: beta / a0,
    a1: (-2 * cosTheta) / a0,
    a2: (1 - alpha) / a0
  });
}
|
|
344
|
+
/**
 * Converts a linear gain to decibels (20 * log10).
 *
 * @param {number} gain - Linear amplitude ratio.
 * @returns {number} Level in dB; gains of 1e-5 or less map to -100.
 */
function gainToDb(gain) {
  if (gain <= 1e-5) {
    return -100;
  }
  return 20 * Math.log10(gain);
}
|
|
347
|
+
/**
 * Converts a level in decibels to a linear gain (10^(db/20)).
 *
 * @param {number} db - Level in dB.
 * @returns {number} Linear amplitude ratio; levels of -100 dB or less map to 0.
 */
function dbToGain(db) {
  if (db <= -100) {
    return 0;
  }
  return Math.pow(10, db / 20);
}
|
|
350
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
351
|
+
0 && (module.exports = {
|
|
352
|
+
StreamingVad,
|
|
353
|
+
defaultVadOptions,
|
|
354
|
+
detectVoiceActivity,
|
|
355
|
+
vadFromFile,
|
|
356
|
+
vadFromStream
|
|
357
|
+
});
|