@storyteller-platform/ghost-story 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +611 -0
- package/README.md +18 -0
- package/dist/api/APIOptions.cjs +16 -0
- package/dist/api/APIOptions.d.cts +18 -0
- package/dist/api/APIOptions.d.ts +18 -0
- package/dist/api/APIOptions.js +0 -0
- package/dist/api/Recognition.cjs +263 -0
- package/dist/api/Recognition.d.cts +77 -0
- package/dist/api/Recognition.d.ts +77 -0
- package/dist/api/Recognition.js +233 -0
- package/dist/api/VoiceActivityDetection.cjs +77 -0
- package/dist/api/VoiceActivityDetection.d.cts +24 -0
- package/dist/api/VoiceActivityDetection.d.ts +24 -0
- package/dist/api/VoiceActivityDetection.js +43 -0
- package/dist/audio/AudioConverter.cjs +331 -0
- package/dist/audio/AudioConverter.d.cts +53 -0
- package/dist/audio/AudioConverter.d.ts +53 -0
- package/dist/audio/AudioConverter.js +310 -0
- package/dist/audio/AudioFormat.cjs +151 -0
- package/dist/audio/AudioFormat.d.cts +25 -0
- package/dist/audio/AudioFormat.d.ts +25 -0
- package/dist/audio/AudioFormat.js +123 -0
- package/dist/audio/AudioSource.cjs +119 -0
- package/dist/audio/AudioSource.d.cts +33 -0
- package/dist/audio/AudioSource.d.ts +33 -0
- package/dist/audio/AudioSource.js +88 -0
- package/dist/audio/index.cjs +74 -0
- package/dist/audio/index.d.cts +6 -0
- package/dist/audio/index.d.ts +6 -0
- package/dist/audio/index.js +54 -0
- package/dist/cli/bin.cjs +277 -0
- package/dist/cli/bin.d.cts +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +275 -0
- package/dist/cli/config.cjs +347 -0
- package/dist/cli/config.d.cts +33 -0
- package/dist/cli/config.d.ts +33 -0
- package/dist/cli/config.js +285 -0
- package/dist/cli/install.cjs +334 -0
- package/dist/cli/install.d.cts +62 -0
- package/dist/cli/install.d.ts +62 -0
- package/dist/cli/install.js +316 -0
- package/dist/cli/whisper-server.cjs +172 -0
- package/dist/cli/whisper-server.d.cts +24 -0
- package/dist/cli/whisper-server.d.ts +24 -0
- package/dist/cli/whisper-server.js +152 -0
- package/dist/config.cjs +60 -0
- package/dist/config.d.cts +12 -0
- package/dist/config.d.ts +12 -0
- package/dist/config.js +32 -0
- package/dist/convert.cjs +88 -0
- package/dist/convert.d.cts +12 -0
- package/dist/convert.d.ts +12 -0
- package/dist/convert.js +63 -0
- package/dist/encodings/Ascii.cjs +75 -0
- package/dist/encodings/Ascii.d.cts +13 -0
- package/dist/encodings/Ascii.d.ts +13 -0
- package/dist/encodings/Ascii.js +48 -0
- package/dist/encodings/Base64.cjs +155 -0
- package/dist/encodings/Base64.d.cts +5 -0
- package/dist/encodings/Base64.d.ts +5 -0
- package/dist/encodings/Base64.js +129 -0
- package/dist/encodings/TextEncodingsCommon.cjs +16 -0
- package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
- package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
- package/dist/encodings/TextEncodingsCommon.js +0 -0
- package/dist/index.cjs +153 -0
- package/dist/index.d.cts +15 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +140 -0
- package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
- package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.js +160 -0
- package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
- package/dist/recognition/DeepgramSTT.cjs +172 -0
- package/dist/recognition/DeepgramSTT.d.cts +23 -0
- package/dist/recognition/DeepgramSTT.d.ts +23 -0
- package/dist/recognition/DeepgramSTT.js +153 -0
- package/dist/recognition/GoogleCloudSTT.cjs +125 -0
- package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
- package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
- package/dist/recognition/GoogleCloudSTT.js +107 -0
- package/dist/recognition/OpenAICloudSTT.cjs +180 -0
- package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
- package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
- package/dist/recognition/OpenAICloudSTT.js +150 -0
- package/dist/recognition/WhisperCppSTT.cjs +296 -0
- package/dist/recognition/WhisperCppSTT.d.cts +40 -0
- package/dist/recognition/WhisperCppSTT.d.ts +40 -0
- package/dist/recognition/WhisperCppSTT.js +275 -0
- package/dist/recognition/WhisperServerSTT.cjs +119 -0
- package/dist/recognition/WhisperServerSTT.d.cts +24 -0
- package/dist/recognition/WhisperServerSTT.d.ts +24 -0
- package/dist/recognition/WhisperServerSTT.js +105 -0
- package/dist/utilities/FileSystem.cjs +54 -0
- package/dist/utilities/FileSystem.d.cts +3 -0
- package/dist/utilities/FileSystem.d.ts +3 -0
- package/dist/utilities/FileSystem.js +20 -0
- package/dist/utilities/Locale.cjs +46 -0
- package/dist/utilities/Locale.d.cts +9 -0
- package/dist/utilities/Locale.d.ts +9 -0
- package/dist/utilities/Locale.js +20 -0
- package/dist/utilities/ObjectUtilities.cjs +41 -0
- package/dist/utilities/ObjectUtilities.d.cts +3 -0
- package/dist/utilities/ObjectUtilities.d.ts +3 -0
- package/dist/utilities/ObjectUtilities.js +7 -0
- package/dist/utilities/Timeline.cjs +120 -0
- package/dist/utilities/Timeline.d.cts +23 -0
- package/dist/utilities/Timeline.d.ts +23 -0
- package/dist/utilities/Timeline.js +94 -0
- package/dist/utilities/Timing.cjs +287 -0
- package/dist/utilities/Timing.d.cts +64 -0
- package/dist/utilities/Timing.d.ts +64 -0
- package/dist/utilities/Timing.js +256 -0
- package/dist/utilities/WhisperTimeline.cjs +344 -0
- package/dist/utilities/WhisperTimeline.d.cts +86 -0
- package/dist/utilities/WhisperTimeline.d.ts +86 -0
- package/dist/utilities/WhisperTimeline.js +313 -0
- package/dist/vad/ActiveGate.cjs +357 -0
- package/dist/vad/ActiveGate.d.cts +53 -0
- package/dist/vad/ActiveGate.d.ts +53 -0
- package/dist/vad/ActiveGate.js +329 -0
- package/dist/vad/ActiveGateOg.cjs +1366 -0
- package/dist/vad/ActiveGateOg.d.cts +33 -0
- package/dist/vad/ActiveGateOg.d.ts +33 -0
- package/dist/vad/ActiveGateOg.js +1341 -0
- package/dist/vad/Silero.cjs +174 -0
- package/dist/vad/Silero.d.cts +25 -0
- package/dist/vad/Silero.d.ts +25 -0
- package/dist/vad/Silero.js +153 -0
- package/package.json +125 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.cjs';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
|
|
3
|
+
import { WhisperModel } from '../cli/config.cjs';
|
|
4
|
+
import { Timeline } from '../utilities/Timeline.cjs';
|
|
5
|
+
import { Timing } from '../utilities/Timing.cjs';
|
|
6
|
+
import 'node:fs';
|
|
7
|
+
import 'node:stream';
|
|
8
|
+
|
|
9
|
+
type InputPreference = "file";
|
|
10
|
+
declare const inputPreference: InputPreference;
|
|
11
|
+
type Language = "af" | "am" | "ar" | "as" | "az" | "ba" | "be" | "bg" | "bn" | "bo" | "br" | "bs" | "ca" | "cs" | "cy" | "da" | "de" | "el" | "en" | "es" | "et" | "eu" | "fa" | "fi" | "fo" | "fr" | "gl" | "gu" | "ha" | "haw" | "he" | "hi" | "hr" | "ht" | "hu" | "hy" | "id" | "is" | "it" | "ja" | "jw" | "ka" | "kk" | "km" | "kn" | "ko" | "la" | "lb" | "ln" | "lo" | "lt" | "lv" | "mg" | "mi" | "mk" | "ml" | "mn" | "mr" | "ms" | "mt" | "my" | "ne" | "nl" | "nn" | "no" | "oc" | "pa" | "pl" | "ps" | "pt" | "ro" | "ru" | "sa" | "sd" | "si" | "sk" | "sl" | "sn" | "so" | "sq" | "sr" | "su" | "sv" | "sw" | "ta" | "te" | "tg" | "th" | "tk" | "tl" | "tr" | "tt" | "uk" | "ur" | "uz" | "vi" | "yi" | "yo" | "zh";
|
|
12
|
+
|
|
13
|
+
/**
 * Options accepted by the whisper.cpp `recognize` function.
 *
 * Defaults quoted below are the ones the accompanying WhisperCppSTT.js
 * implementation merges beneath caller-supplied values.
 */
interface WhisperCppOptions {
    /** Model identifier; the implementation defaults to "tiny.en". */
    model: WhisperModel;
    /** Directory holding model files; falls back to `getModelDir()` when omitted. */
    modelDir?: string;
    /** Directory of the whisper.cpp install; falls back to `getInstallDir()` when omitted. */
    installDir?: string;
    /** Language code, lower-cased and passed as `--language` when set. */
    language?: Language;
    /** Requested processor count (default 1); adjusted against the audio duration before use. */
    processors?: number;
    /** Value passed as `--threads` (default 4). */
    threads?: number;
    /** Adds `--flash-attn` when true (default true). */
    flashAttention?: boolean;
    /** Adds `--suppress-nst` and `--no-prints` when true (default true). */
    suppressNonSpeechTokens?: boolean;
    /**
     * Default true. NOTE(review): the visible implementation forwards this
     * value but never turns it into a command-line flag.
     */
    tokenLevelTimestamps?: boolean;
    /** Mirror the child process stdout/stderr to this process (default false). */
    printOutput?: boolean;
    /** Run the binary/model installation step before transcribing (default true). */
    autoInstall?: boolean;
    /** Receives progress as a fraction (parsed percentage / 100), then 1 once output exists. */
    onProgress?: (progress: number) => void;
    /** Abort signal handed to the spawned whisper process. */
    signal?: AbortSignal;
    /** Format hint used when `input` is not already an AudioSource. */
    inputFormat?: AudioFormat;
    /** Optional collector timing the "installation", "conversion" and "transcription" steps. */
    timing?: Timing | undefined;
}
|
|
30
|
+
declare function recognize(input: RawAudioInput | AudioSource, options?: WhisperCppOptions): Promise<RecognitionResult>;
|
|
31
|
+
declare function ensureWhisperCppInstalled(): Promise<void>;
|
|
32
|
+
declare function ensureModelDownloaded(modelDir: string, modelName: WhisperModel, printOutput: boolean): Promise<void>;
|
|
33
|
+
/** Value resolved by the whisper.cpp `recognize` function. */
interface RecognitionResult {
    /** Text of all transcription segments joined together and trimmed. */
    transcript: string;
    /** Timeline built from the parsed whisper.cpp segments. */
    timeline: Timeline;
    /** Language taken from `result.language` in the whisper.cpp JSON output. */
    language?: string;
}
|
|
38
|
+
type WhisperCppModelId = "tiny" | "tiny-q5_1" | "tiny.en" | "tiny.en-q5_1" | "tiny.en-q8_0" | "base" | "base-q5_1" | "base.en" | "base.en-q5_1" | "small" | "small-q5_1" | "small.en" | "small.en-q5_1" | "medium" | "medium-q5_0" | "medium.en" | "medium.en-q5_0" | "large" | "large-v1" | "large-v2" | "large-v2-q5_0" | "large-v3" | "large-v3-q5_0" | "large-v3-turbo" | "large-v3-turbo-q5_0";
|
|
39
|
+
|
|
40
|
+
export { type InputPreference, type Language, type RecognitionResult, type WhisperCppModelId, type WhisperCppOptions, WhisperModel, ensureModelDownloaded, ensureWhisperCppInstalled, inputPreference, recognize };
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.js';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
|
|
3
|
+
import { WhisperModel } from '../cli/config.js';
|
|
4
|
+
import { Timeline } from '../utilities/Timeline.js';
|
|
5
|
+
import { Timing } from '../utilities/Timing.js';
|
|
6
|
+
import 'node:fs';
|
|
7
|
+
import 'node:stream';
|
|
8
|
+
|
|
9
|
+
type InputPreference = "file";
|
|
10
|
+
declare const inputPreference: InputPreference;
|
|
11
|
+
type Language = "af" | "am" | "ar" | "as" | "az" | "ba" | "be" | "bg" | "bn" | "bo" | "br" | "bs" | "ca" | "cs" | "cy" | "da" | "de" | "el" | "en" | "es" | "et" | "eu" | "fa" | "fi" | "fo" | "fr" | "gl" | "gu" | "ha" | "haw" | "he" | "hi" | "hr" | "ht" | "hu" | "hy" | "id" | "is" | "it" | "ja" | "jw" | "ka" | "kk" | "km" | "kn" | "ko" | "la" | "lb" | "ln" | "lo" | "lt" | "lv" | "mg" | "mi" | "mk" | "ml" | "mn" | "mr" | "ms" | "mt" | "my" | "ne" | "nl" | "nn" | "no" | "oc" | "pa" | "pl" | "ps" | "pt" | "ro" | "ru" | "sa" | "sd" | "si" | "sk" | "sl" | "sn" | "so" | "sq" | "sr" | "su" | "sv" | "sw" | "ta" | "te" | "tg" | "th" | "tk" | "tl" | "tr" | "tt" | "uk" | "ur" | "uz" | "vi" | "yi" | "yo" | "zh";
|
|
12
|
+
|
|
13
|
+
/**
 * Options accepted by the whisper.cpp `recognize` function.
 *
 * Defaults quoted below are the ones the accompanying WhisperCppSTT.js
 * implementation merges beneath caller-supplied values.
 */
interface WhisperCppOptions {
    /** Model identifier; the implementation defaults to "tiny.en". */
    model: WhisperModel;
    /** Directory holding model files; falls back to `getModelDir()` when omitted. */
    modelDir?: string;
    /** Directory of the whisper.cpp install; falls back to `getInstallDir()` when omitted. */
    installDir?: string;
    /** Language code, lower-cased and passed as `--language` when set. */
    language?: Language;
    /** Requested processor count (default 1); adjusted against the audio duration before use. */
    processors?: number;
    /** Value passed as `--threads` (default 4). */
    threads?: number;
    /** Adds `--flash-attn` when true (default true). */
    flashAttention?: boolean;
    /** Adds `--suppress-nst` and `--no-prints` when true (default true). */
    suppressNonSpeechTokens?: boolean;
    /**
     * Default true. NOTE(review): the visible implementation forwards this
     * value but never turns it into a command-line flag.
     */
    tokenLevelTimestamps?: boolean;
    /** Mirror the child process stdout/stderr to this process (default false). */
    printOutput?: boolean;
    /** Run the binary/model installation step before transcribing (default true). */
    autoInstall?: boolean;
    /** Receives progress as a fraction (parsed percentage / 100), then 1 once output exists. */
    onProgress?: (progress: number) => void;
    /** Abort signal handed to the spawned whisper process. */
    signal?: AbortSignal;
    /** Format hint used when `input` is not already an AudioSource. */
    inputFormat?: AudioFormat;
    /** Optional collector timing the "installation", "conversion" and "transcription" steps. */
    timing?: Timing | undefined;
}
|
|
30
|
+
declare function recognize(input: RawAudioInput | AudioSource, options?: WhisperCppOptions): Promise<RecognitionResult>;
|
|
31
|
+
declare function ensureWhisperCppInstalled(): Promise<void>;
|
|
32
|
+
declare function ensureModelDownloaded(modelDir: string, modelName: WhisperModel, printOutput: boolean): Promise<void>;
|
|
33
|
+
/** Value resolved by the whisper.cpp `recognize` function. */
interface RecognitionResult {
    /** Text of all transcription segments joined together and trimmed. */
    transcript: string;
    /** Timeline built from the parsed whisper.cpp segments. */
    timeline: Timeline;
    /** Language taken from `result.language` in the whisper.cpp JSON output. */
    language?: string;
}
|
|
38
|
+
type WhisperCppModelId = "tiny" | "tiny-q5_1" | "tiny.en" | "tiny.en-q5_1" | "tiny.en-q8_0" | "base" | "base-q5_1" | "base.en" | "base.en-q5_1" | "small" | "small-q5_1" | "small.en" | "small.en-q5_1" | "medium" | "medium-q5_0" | "medium.en" | "medium.en-q5_0" | "large" | "large-v1" | "large-v2" | "large-v2-q5_0" | "large-v3" | "large-v3-q5_0" | "large-v3-turbo" | "large-v3-turbo-q5_0";
|
|
39
|
+
|
|
40
|
+
export { type InputPreference, type Language, type RecognitionResult, type WhisperCppModelId, type WhisperCppOptions, WhisperModel, ensureModelDownloaded, ensureWhisperCppInstalled, inputPreference, recognize };
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import fs, { existsSync } from "node:fs";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { ensureDir } from "fs-extra";
|
|
6
|
+
import {
|
|
7
|
+
getAudioDuration,
|
|
8
|
+
isAudioSource,
|
|
9
|
+
normalizeToAudioSource,
|
|
10
|
+
prepareWavForService,
|
|
11
|
+
toFilePath
|
|
12
|
+
} from "../audio/index.js";
|
|
13
|
+
import {
|
|
14
|
+
getInstallDir,
|
|
15
|
+
getModelDir,
|
|
16
|
+
getModelPath as getModelPathFromConfig,
|
|
17
|
+
getWhisperExecutablePath
|
|
18
|
+
} from "../cli/config.js";
|
|
19
|
+
import { installBinary, installModel } from "../cli/install.js";
|
|
20
|
+
import {
|
|
21
|
+
calculateEffectiveProcessors,
|
|
22
|
+
calculateWhisperSplits,
|
|
23
|
+
extractCorrectedTimeline,
|
|
24
|
+
parseWhisperCppOutput
|
|
25
|
+
} from "../utilities/WhisperTimeline.js";
|
|
26
|
+
/** Exported constant advertising the input kind for this recognizer. */
const inputPreference = "file";

/** Baseline option values; `recognize` spreads caller options over these. */
const defaultOptions = {
  processors: 1,
  threads: 4,
  flashAttention: true,
  suppressNonSpeechTokens: true,
  tokenLevelTimestamps: true,
  printOutput: false,
  model: "tiny.en",
  autoInstall: true,
};

/** Source formats passed to whisper.cpp unchanged; anything else is converted to WAV first. */
const acceptedFormats = ["wav", "flac", "ogg", "mp3"];
|
|
38
|
+
/**
 * Transcribe audio by running the whisper.cpp executable on a file.
 *
 * In order: merges `options` over `defaultOptions`, normalises `input` to an
 * audio source, creates the model directory, optionally runs the install
 * step, converts the audio (16 kHz, 1 channel requested) unless its format is
 * in `acceptedFormats`, runs the transcription, and builds the timeline and
 * transcript. The prepared audio's `cleanup()` always runs, even on failure.
 *
 * @param input - Raw audio input or an existing audio source.
 * @param options - Overrides for `defaultOptions`, plus optional `modelDir`,
 *   `installDir`, `language`, `onProgress`, `signal`, `inputFormat`, `timing`.
 * @returns {Promise<object>} `{ transcript, timeline, language }`.
 * @throws {Error} When the prepared audio has no file path or the file is missing.
 */
async function recognize(input, options) {
  const opts = { ...defaultOptions, ...options };
  const { timing } = opts;
  const modelDir = opts.modelDir ?? getModelDir();
  const installDir = opts.installDir ?? getInstallDir();
  const source = isAudioSource(input)
    ? input
    : normalizeToAudioSource(input, opts.inputFormat);
  await ensureDir(modelDir);

  // Runs a step under a timing label when a timing collector was supplied.
  const timed = (label, step) => (timing ? timing.timeAsync(label, step) : step());

  if (opts.autoInstall) {
    await timed("installation", async () => {
      await ensureWhisperCppInstalled();
      await ensureModelDownloaded(modelDir, opts.model, opts.printOutput);
    });
  }

  const conversionNeeded = !acceptedFormats.includes(source.format);
  timing?.setMetadata("conversionRequired", conversionNeeded);
  timing?.setMetadata("targetFormat", conversionNeeded ? "wav" : source.format);

  const prepared = await timed("conversion", async () => {
    if (conversionNeeded) {
      return prepareWavForService(source, { sampleRate: 16000, channels: 1 });
    }
    // Nothing was created, so cleanup has nothing to remove.
    return { source, cleanup: async () => {} };
  });

  try {
    const inputPath = toFilePath(prepared.source);
    if (!inputPath) {
      throw new Error(
        "whisper.cpp requires a file path. The audio could not be prepared as a file."
      );
    }
    if (!existsSync(inputPath)) {
      throw new Error(`Input file does not exist: ${inputPath}`);
    }

    const audioDuration = await getAudioDuration(inputPath);
    const effectiveProcessors = calculateEffectiveProcessors(audioDuration, opts.processors);

    const transcription = await timed("transcription", () =>
      transcribe({
        inputPath,
        model: opts.model,
        installDir,
        modelFolder: modelDir,
        language: opts.language ?? null,
        tokenLevelTimestamps: opts.tokenLevelTimestamps,
        printOutput: opts.printOutput,
        flashAttention: opts.flashAttention,
        suppressNonSpeechTokens: opts.suppressNonSpeechTokens,
        processors: effectiveProcessors,
        threads: opts.threads,
        onProgress: opts.onProgress ?? null,
        signal: opts.signal ?? null,
      })
    );

    const rawSegments = parseWhisperCppOutput(transcription.transcription);
    // Split boundaries are only computed when more than one processor ran.
    const splitBoundaries =
      effectiveProcessors > 1
        ? calculateWhisperSplits(audioDuration, effectiveProcessors)
        : [];
    const timeline = extractCorrectedTimeline(rawSegments, {
      splitBoundaries: splitBoundaries.length > 0 ? splitBoundaries : undefined,
    });
    const transcript = transcription.transcription
      .map((segment) => segment.text)
      .join("")
      .trim();

    return { transcript, timeline, language: transcription.result.language };
  } finally {
    await prepared.cleanup();
  }
}
|
|
111
|
+
/**
 * Runs `installBinary` with output printing disabled.
 * Whether an existing install is skipped is decided inside `installBinary`.
 *
 * @returns {Promise<void>}
 */
async function ensureWhisperCppInstalled() {
  const quietInstall = { printOutput: false };
  await installBinary(quietInstall);
}
|
|
114
|
+
/**
 * Installs a model unless its file is already present on disk.
 *
 * @param modelDir - Directory the model file lives in.
 * @param modelName - Model identifier.
 * @param printOutput - Forwarded to `installModel`.
 * @returns {Promise<void>}
 */
async function ensureModelDownloaded(modelDir, modelName, printOutput) {
  const modelPath = getModelPathFromConfig(modelName, modelDir);
  if (!existsSync(modelPath)) {
    await installModel({ model: modelName, modelDir, printOutput });
  }
}
|
|
125
|
+
/**
 * Builds the path of a model file named `ggml-<model>.bin` inside `folder`.
 *
 * @param {string} folder - Directory containing model files.
 * @param {string} model - Model identifier.
 * @returns {string} Joined file path.
 */
function getModelPath(folder, model) {
  const fileName = `ggml-${model}.bin`;
  return path.join(folder, fileName);
}
|
|
128
|
+
/**
 * Runs whisper.cpp on one audio file and returns its parsed JSON output.
 *
 * The process writes `<tmp>/ghost-story-whisper/transcription-<unique>.json`;
 * the file is read, parsed and then removed whether or not the run succeeded.
 *
 * @param options - `inputPath`, `model`, `installDir`, `modelFolder`,
 *   `language`, `tokenLevelTimestamps`, `printOutput`, `flashAttention`,
 *   `suppressNonSpeechTokens`, `processors`, `threads`, `onProgress`, `signal`.
 * @returns {Promise<object>} Parsed contents of the JSON file written by whisper.cpp.
 * @throws {Error} When the executable or model file is missing, the process
 *   fails, or the output cannot be read or parsed.
 */
async function transcribe(options) {
  const {
    inputPath,
    model,
    installDir,
    modelFolder,
    language,
    tokenLevelTimestamps,
    printOutput,
    flashAttention,
    suppressNonSpeechTokens,
    processors,
    threads,
    onProgress,
    signal,
  } = options;

  const executable = getWhisperExecutablePath(installDir);
  const modelPath = getModelPath(modelFolder, model);
  if (!existsSync(executable)) {
    throw new Error(`Whisper executable not found at ${executable}`);
  }
  if (!existsSync(modelPath)) {
    throw new Error(`Model not found at ${modelPath}`);
  }

  const tmpDir = path.join(os.tmpdir(), "ghost-story-whisper");
  await ensureDir(tmpDir);

  // A timestamp alone collides when two transcriptions start in the same
  // millisecond (or in different processes), so add the pid and a random part.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tmpJsonPath = path.join(tmpDir, `transcription-${uniqueSuffix}`);
  // whisper.cpp appends ".json" to the --output-file value.
  const expectedOutputPath = `${tmpJsonPath}.json`;

  const args = buildTranscribeArgs({
    inputPath,
    modelPath,
    outputPath: tmpJsonPath,
    model,
    language,
    tokenLevelTimestamps,
    flashAttention,
    suppressNonSpeechTokens,
    processors,
    threads,
  });

  try {
    const outputPath = await runWhisperProcess({
      executable,
      args,
      cwd: installDir,
      printOutput,
      onProgress,
      signal,
      expectedOutputPath,
    });
    return JSON.parse(await fs.promises.readFile(outputPath, "utf8"));
  } finally {
    // Best-effort cleanup: the file may not exist if the process failed early.
    await fs.promises.unlink(expectedOutputPath).catch(() => {});
  }
}
|
|
189
|
+
/**
 * Builds the whisper.cpp command-line argument list.
 *
 * Always emits the file, output-file, JSON, model, progress, processors and
 * threads arguments; language, flash-attention and non-speech suppression are
 * added only when their option is truthy. `model` and `tokenLevelTimestamps`
 * may be present on `options` but are not read here.
 *
 * @param options - `inputPath`, `outputPath`, `modelPath`, `language`,
 *   `flashAttention`, `suppressNonSpeechTokens`, `processors`, `threads`.
 * @returns {string[]} Flat argument list with `null` entries removed.
 */
function buildTranscribeArgs(options) {
  const fixedArgs = [
    "--file",
    options.inputPath,
    "--output-file",
    options.outputPath,
    "--output-json-full",
    "--model",
    options.modelPath,
    "--print-progress",
  ];

  const optionalArgs = [];
  if (options.language) {
    optionalArgs.push("--language", options.language.toLowerCase());
  }
  if (options.flashAttention) {
    optionalArgs.push("--flash-attn");
  }
  if (options.suppressNonSpeechTokens) {
    optionalArgs.push("--suppress-nst", "--no-prints");
  }

  const tuningArgs = [
    "--processors",
    String(options.processors),
    "--threads",
    String(options.threads),
  ];

  return [...fixedArgs, ...optionalArgs, ...tuningArgs]
    .flat()
    .filter((arg) => arg !== null);
}
|
|
207
|
+
/**
 * Spawns the whisper executable and resolves with the output file path.
 *
 * Both stdout and stderr are captured (and mirrored when `printOutput` is
 * set). Every `progress = N` value found in the stream is reported to
 * `onProgress` as `N / 100`. Once the process has ended and its stdio has
 * closed, the promise resolves if `expectedOutputPath` exists; otherwise it
 * rejects with the captured output in the message.
 *
 * @param options - `executable`, `args`, `cwd`, `printOutput`, `onProgress`,
 *   `signal`, `expectedOutputPath`.
 * @returns {Promise<string>} `expectedOutputPath` on success.
 * @throws {Error} "Signal aborted" when the signal is already aborted; a
 *   "Failed to start whisper process" error (original error in `cause`) when
 *   the child emits `error`; otherwise a failure describing the exit.
 */
function runWhisperProcess(options) {
  const {
    executable,
    args,
    cwd,
    printOutput,
    onProgress,
    signal,
    expectedOutputPath,
  } = options;

  if (signal?.aborted) {
    return Promise.reject(new Error("Signal aborted"));
  }

  return new Promise((resolve, reject) => {
    const task = spawn(executable, args, { cwd, signal: signal ?? undefined });
    let output = "";
    // "error" and "close" can both fire; only the first outcome counts.
    let settled = false;
    const succeed = (value) => {
      if (settled) return;
      settled = true;
      onProgress?.(1);
      resolve(value);
    };
    const fail = (error) => {
      if (settled) return;
      settled = true;
      reject(error);
    };

    const handleData = (data) => {
      const text = data.toString("utf-8");
      output += text;
      // One chunk may carry several progress lines; report each in order.
      for (const match of text.matchAll(/progress\s*=\s*([\d.]+)/g)) {
        const percent = Number.parseFloat(match[1]);
        if (!Number.isNaN(percent)) {
          onProgress?.(percent / 100);
        }
      }
    };

    task.stdout.on("data", (data) => {
      handleData(data);
      if (printOutput) {
        process.stdout.write(data);
      }
    });
    task.stderr.on("data", (data) => {
      handleData(data);
      if (printOutput) {
        process.stderr.write(data);
      }
    });

    // "close" (not "exit") so all stdio data has been read before the
    // captured output is used in an error message.
    task.on("close", (code, exitSignal) => {
      if (existsSync(expectedOutputPath)) {
        succeed(expectedOutputPath);
        return;
      }
      if (exitSignal) {
        fail(new Error(`Process killed with signal ${exitSignal}: ${output}`));
        return;
      }
      if (output.includes("must be 16 kHz")) {
        fail(
          new Error(
            "Audio file must be 16 kHz. Convert your audio to 16-bit, 16KHz WAV format."
          )
        );
        return;
      }
      fail(new Error(`Transcription failed (exit code ${code}): ${output}`));
    });

    task.on("error", (err) => {
      // Keep the original error reachable (e.g. its ENOENT code or AbortError name).
      fail(
        new Error(`Failed to start whisper process: ${err.message}`, { cause: err })
      );
    });
  });
}
|
|
270
|
+
export {
|
|
271
|
+
ensureModelDownloaded,
|
|
272
|
+
ensureWhisperCppInstalled,
|
|
273
|
+
inputPreference,
|
|
274
|
+
recognize
|
|
275
|
+
};
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Bundler-generated CommonJS interop helpers.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Defines each entry of `all` on `target` as an enumerable getter.
var __export = (target, all) => {
  for (const name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

// Copies own properties of `from` onto `to` as getters, skipping keys `to`
// already owns and the single key named by `except`.
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable,
    });
  }
  return to;
};

// Wraps a module namespace in a fresh object flagged with `__esModule`.
var __toCommonJS = (mod) => {
  const wrapper = __defProp({}, "__esModule", { value: true });
  return __copyProps(wrapper, mod);
};
|
|
19
|
+
var WhisperServerSTT_exports = {};
|
|
20
|
+
__export(WhisperServerSTT_exports, {
|
|
21
|
+
inputPreference: () => inputPreference,
|
|
22
|
+
recognize: () => recognize
|
|
23
|
+
});
|
|
24
|
+
module.exports = __toCommonJS(WhisperServerSTT_exports);
|
|
25
|
+
var import_node_fs = require("node:fs");
|
|
26
|
+
var import_node_path = require("node:path");
|
|
27
|
+
var import_audio = require("../audio/index.cjs");
|
|
28
|
+
var import_WhisperTimeline = require("../utilities/WhisperTimeline.cjs");
|
|
29
|
+
/** Exported constant advertising the input kind for this recognizer. */
const inputPreference = "file";

/** Baseline option values; `recognize` spreads caller options over these. */
const defaultOptions = {
  baseURL: "http://localhost:8080",
  inferencePath: "/audio/transcriptions",
  temperature: 0,
};
|
|
35
|
+
/**
 * Transcribe audio by uploading it to a Whisper HTTP server.
 *
 * The audio is always passed through `prepareWavForService` (16 kHz, 1
 * channel requested), posted as multipart form data to
 * `baseURL + inferencePath`, and the JSON reply is turned into a timeline and
 * transcript. The prepared audio's `cleanup()` always runs, even on failure.
 *
 * @param input - Raw audio input or an existing audio source.
 * @param languageCode - Sent as the `language` form field when truthy.
 * @param options - Overrides for `defaultOptions`, plus optional `apiKey`,
 *   `inputFormat` and `timing`.
 * @returns {Promise<object>} `{ transcript, timeline }`.
 * @throws {Error} When no file path is available, the server replies with a
 *   non-OK status, or no timeline can be extracted from the reply.
 */
async function recognize(input, languageCode, options = {}) {
  const opts = { ...defaultOptions, ...options };
  const { timing } = opts;
  // Runs a step under a timing label when a timing collector was supplied.
  const timed = (label, step) => (timing ? timing.timeAsync(label, step) : step());

  const source = (0, import_audio.isAudioSource)(input)
    ? input
    : (0, import_audio.normalizeToAudioSource)(input, opts.inputFormat);

  // Recorded for timing metadata only; preparation below runs regardless.
  const conversionNeeded = source.format !== "wav";
  timing?.setMetadata("conversionRequired", conversionNeeded);
  timing?.setMetadata("targetFormat", "wav");

  const prepared = await timed("conversion", () =>
    (0, import_audio.prepareWavForService)(source, { sampleRate: 16000, channels: 1 })
  );

  try {
    const filePath = (0, import_audio.toFilePath)(prepared.source);
    if (!filePath) {
      throw new Error(
        "Whisper server requires a file path. The audio could not be prepared as a file."
      );
    }

    const form = new FormData();
    form.append(
      "file",
      await (0, import_node_fs.openAsBlob)(filePath),
      (0, import_node_path.basename)(filePath)
    );
    form.append("temperature", String(opts.temperature));
    form.append("response_format", "verbose_json");
    if (languageCode) {
      form.append("language", languageCode);
    }

    const url = `${opts.baseURL}${opts.inferencePath}`;
    const headers = opts.apiKey ? { Authorization: `Bearer ${opts.apiKey}` } : {};

    const response = await timed("upload", () =>
      fetch(url, { method: "POST", body: form, headers })
    );
    if (!response.ok) {
      const text = await response.text();
      throw new Error(`Whisper server error: ${response.status} ${text}`);
    }

    const data = await response.json();
    const { timeline, transcript } = await extractTimelineAndTranscript(data, filePath);
    if (!timeline) {
      throw new Error("Failed to extract timeline from Whisper server response");
    }
    return { transcript, timeline };
  } finally {
    await prepared.cleanup();
  }
}
|
|
84
|
+
/**
 * Converts a Whisper server JSON reply into a timeline and transcript.
 *
 * - No segments (or no `segments` array at all): `timeline` is `undefined`
 *   and the transcript is the trimmed `text` field, or "" when absent.
 * - First segment carries nested `words`: the timeline is built through the
 *   WhisperTimeline helpers and the transcript is the entry texts joined by
 *   a single space.
 * - Otherwise: one "segment" entry per server segment, and the transcript is
 *   the trimmed `text` field.
 *
 * @param response - Parsed JSON body returned by the server.
 * @param audioPath - Path of the uploaded audio, used to look up its duration.
 * @returns {Promise<object>} `{ timeline, transcript }`.
 */
async function extractTimelineAndTranscript(response, audioPath) {
  // A reply without a `segments` array is treated as "no segments" so the
  // caller gets its own descriptive error instead of a TypeError from here.
  const segments = Array.isArray(response?.segments) ? response.segments : [];
  const fullText = response?.text?.trim() ?? "";

  if (segments.length === 0) {
    return { timeline: undefined, transcript: fullText };
  }

  const hasNestedWords = (segments[0]?.words?.length ?? 0) > 0;
  if (hasNestedWords) {
    const rawSegments = (0, import_WhisperTimeline.parseWhisperServerOutput)(segments);
    const splitBoundaries = await detectSplitBoundaries(rawSegments, audioPath);
    const wordTimeline = (0, import_WhisperTimeline.extractCorrectedTimeline)(rawSegments, {
      splitBoundaries,
    });
    const transcript = wordTimeline.map((entry) => entry.text).join(" ");
    return { timeline: wordTimeline, transcript };
  }

  const timeline = segments.map((seg) => ({
    type: "segment",
    text: seg.text.trim(),
    startTime: seg.start,
    endTime: seg.end,
  }));
  return { timeline, transcript: fullText };
}
|
|
105
|
+
/**
 * Computes split boundaries for a server transcription.
 *
 * @param rawSegments - Segments parsed from the server reply.
 * @param audioPath - Path of the audio file whose duration is measured.
 * @returns {Promise<*>} Result of `calculateWhisperSplits` for
 *   `boundaryCount + 1` parts, or `undefined` when no boundaries were counted
 *   or the duration lookup / split calculation throws.
 */
async function detectSplitBoundaries(rawSegments, audioPath) {
  const boundaryCount = (0, import_WhisperTimeline.countProcessorBoundaries)(rawSegments);
  if (boundaryCount !== 0) {
    try {
      const audioDuration = await (0, import_audio.getAudioDuration)(audioPath);
      return (0, import_WhisperTimeline.calculateWhisperSplits)(
        audioDuration,
        boundaryCount + 1
      );
    } catch {
      // Deliberate best-effort: fall through and report no boundaries.
    }
  }
  return undefined;
}
|
|
115
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
116
|
+
0 && (module.exports = {
|
|
117
|
+
inputPreference,
|
|
118
|
+
recognize
|
|
119
|
+
});
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.cjs';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
|
|
3
|
+
import { Timeline } from '../utilities/Timeline.cjs';
|
|
4
|
+
import { Timing } from '../utilities/Timing.cjs';
|
|
5
|
+
import 'node:fs';
|
|
6
|
+
import 'node:stream';
|
|
7
|
+
|
|
8
|
+
type InputPreference = "file";
|
|
9
|
+
declare const inputPreference: InputPreference;
|
|
10
|
+
/**
 * Options accepted by the Whisper server `recognize` function.
 *
 * Defaults quoted below are the ones the accompanying implementation merges
 * beneath caller-supplied values.
 */
interface WhisperServerOptions {
    /** Server origin (default "http://localhost:8080"). */
    baseURL?: string;
    /** Path appended directly to `baseURL` (default "/audio/transcriptions"). */
    inferencePath?: string;
    /** Sent as the `temperature` form field (default 0). */
    temperature?: number;
    /** When set, sent as an `Authorization: Bearer <apiKey>` header. */
    apiKey?: string;
    /** Format hint used when `input` is not already an AudioSource. */
    inputFormat?: AudioFormat;
    /** Optional collector timing the "conversion" and "upload" steps. */
    timing?: Timing | undefined;
}
|
|
18
|
+
/** Value resolved by the Whisper server `recognize` function. */
interface RecognitionResult {
    /** Transcript text derived from the server reply. */
    transcript: string;
    /**
     * Timeline derived from the reply's segments. Declared optional, though
     * the visible implementation throws when no timeline can be extracted.
     */
    timeline?: Timeline;
}
|
|
22
|
+
declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options?: WhisperServerOptions): Promise<RecognitionResult>;
|
|
23
|
+
|
|
24
|
+
export { type InputPreference, type RecognitionResult, type WhisperServerOptions, inputPreference, recognize };
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.js';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
|
|
3
|
+
import { Timeline } from '../utilities/Timeline.js';
|
|
4
|
+
import { Timing } from '../utilities/Timing.js';
|
|
5
|
+
import 'node:fs';
|
|
6
|
+
import 'node:stream';
|
|
7
|
+
|
|
8
|
+
type InputPreference = "file";
|
|
9
|
+
declare const inputPreference: InputPreference;
|
|
10
|
+
/**
 * Options accepted by the Whisper server `recognize` function.
 *
 * Defaults quoted below are the ones the accompanying implementation merges
 * beneath caller-supplied values.
 */
interface WhisperServerOptions {
    /** Server origin (default "http://localhost:8080"). */
    baseURL?: string;
    /** Path appended directly to `baseURL` (default "/audio/transcriptions"). */
    inferencePath?: string;
    /** Sent as the `temperature` form field (default 0). */
    temperature?: number;
    /** When set, sent as an `Authorization: Bearer <apiKey>` header. */
    apiKey?: string;
    /** Format hint used when `input` is not already an AudioSource. */
    inputFormat?: AudioFormat;
    /** Optional collector timing the "conversion" and "upload" steps. */
    timing?: Timing | undefined;
}
|
|
18
|
+
/** Value resolved by the Whisper server `recognize` function. */
interface RecognitionResult {
    /** Transcript text derived from the server reply. */
    transcript: string;
    /**
     * Timeline derived from the reply's segments. Declared optional, though
     * the visible implementation throws when no timeline can be extracted.
     */
    timeline?: Timeline;
}
|
|
22
|
+
declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options?: WhisperServerOptions): Promise<RecognitionResult>;
|
|
23
|
+
|
|
24
|
+
export { type InputPreference, type RecognitionResult, type WhisperServerOptions, inputPreference, recognize };
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { openAsBlob } from "node:fs";
|
|
2
|
+
import { basename } from "node:path";
|
|
3
|
+
import {
|
|
4
|
+
getAudioDuration,
|
|
5
|
+
isAudioSource,
|
|
6
|
+
normalizeToAudioSource,
|
|
7
|
+
prepareWavForService,
|
|
8
|
+
toFilePath
|
|
9
|
+
} from "../audio/index.js";
|
|
10
|
+
import {
|
|
11
|
+
calculateWhisperSplits,
|
|
12
|
+
countProcessorBoundaries,
|
|
13
|
+
extractCorrectedTimeline,
|
|
14
|
+
parseWhisperServerOutput
|
|
15
|
+
} from "../utilities/WhisperTimeline.js";
|
|
16
|
+
/** Exported constant advertising the input kind for this recognizer. */
const inputPreference = "file";

/** Baseline option values; `recognize` spreads caller options over these. */
const defaultOptions = {
  baseURL: "http://localhost:8080",
  inferencePath: "/audio/transcriptions",
  temperature: 0,
};
|
|
22
|
+
async function recognize(input, languageCode, options = {}) {
|
|
23
|
+
const opts = { ...defaultOptions, ...options };
|
|
24
|
+
const timing = opts.timing;
|
|
25
|
+
const source = isAudioSource(input) ? input : normalizeToAudioSource(input, opts.inputFormat);
|
|
26
|
+
const conversionNeeded = source.format !== "wav";
|
|
27
|
+
timing == null ? void 0 : timing.setMetadata("conversionRequired", conversionNeeded);
|
|
28
|
+
timing == null ? void 0 : timing.setMetadata("targetFormat", "wav");
|
|
29
|
+
const doPrepare = () => prepareWavForService(source, { sampleRate: 16e3, channels: 1 });
|
|
30
|
+
const prepared = timing ? await timing.timeAsync("conversion", doPrepare) : await doPrepare();
|
|
31
|
+
try {
|
|
32
|
+
const filePath = toFilePath(prepared.source);
|
|
33
|
+
if (!filePath) {
|
|
34
|
+
throw new Error(
|
|
35
|
+
"Whisper server requires a file path. The audio could not be prepared as a file."
|
|
36
|
+
);
|
|
37
|
+
}
|
|
38
|
+
const filename = basename(filePath);
|
|
39
|
+
const blob = await openAsBlob(filePath);
|
|
40
|
+
const form = new FormData();
|
|
41
|
+
form.append("file", blob, filename);
|
|
42
|
+
form.append("temperature", String(opts.temperature));
|
|
43
|
+
form.append("response_format", "verbose_json");
|
|
44
|
+
if (languageCode) {
|
|
45
|
+
form.append("language", languageCode);
|
|
46
|
+
}
|
|
47
|
+
const url = `${opts.baseURL}${opts.inferencePath}`;
|
|
48
|
+
const headers = {};
|
|
49
|
+
if (opts.apiKey) {
|
|
50
|
+
headers["Authorization"] = `Bearer ${opts.apiKey}`;
|
|
51
|
+
}
|
|
52
|
+
const doUpload = () => fetch(url, { method: "POST", body: form, headers });
|
|
53
|
+
const response = timing ? await timing.timeAsync("upload", doUpload) : await doUpload();
|
|
54
|
+
if (!response.ok) {
|
|
55
|
+
const text = await response.text();
|
|
56
|
+
throw new Error(`Whisper server error: ${response.status} ${text}`);
|
|
57
|
+
}
|
|
58
|
+
const data = await response.json();
|
|
59
|
+
const { timeline, transcript } = await extractTimelineAndTranscript(
|
|
60
|
+
data,
|
|
61
|
+
filePath
|
|
62
|
+
);
|
|
63
|
+
if (!timeline) {
|
|
64
|
+
throw new Error("Failed to extract timeline from Whisper server response");
|
|
65
|
+
}
|
|
66
|
+
return { transcript, timeline };
|
|
67
|
+
} finally {
|
|
68
|
+
await prepared.cleanup();
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
/**
 * Converts a Whisper server JSON reply into a timeline and transcript.
 *
 * - No segments (or no `segments` array at all, e.g. a plain `{ text }`
 *   reply): returns an undefined timeline and the trimmed top-level text.
 * - First segment carries a non-empty `words` array: the segments are run
 *   through the word-level correction pipeline and the transcript is rebuilt
 *   from the corrected timeline entries.
 * - Otherwise: one "segment" timeline entry per server segment, with the
 *   trimmed top-level text as transcript.
 *
 * @param {object} response - Parsed JSON body returned by the server.
 * @param {string} audioPath - Path of the uploaded audio file, used to look
 *   up its duration when split boundaries have to be computed.
 * @returns {Promise<{ timeline: Array<object> | undefined, transcript: string }>}
 */
async function extractTimelineAndTranscript(response, audioPath) {
  // Guard against replies without a `segments` array instead of crashing on
  // `.length` of undefined.
  const segments = Array.isArray(response?.segments) ? response.segments : [];
  const responseText = response?.text?.trim() ?? "";

  if (segments.length === 0) {
    return { timeline: undefined, transcript: responseText };
  }

  // Only the first segment is inspected to decide whether word-level data is present.
  const hasNestedWords = (segments[0]?.words?.length ?? 0) > 0;
  if (hasNestedWords) {
    const rawSegments = parseWhisperServerOutput(segments);
    const splitBoundaries = await detectSplitBoundaries(rawSegments, audioPath);
    const correctedTimeline = extractCorrectedTimeline(rawSegments, { splitBoundaries });
    const transcript = correctedTimeline.map((entry) => entry.text).join(" ");
    return { timeline: correctedTimeline, transcript };
  }

  const timeline = segments.map((seg) => ({
    type: "segment",
    text: seg.text?.trim() ?? "",
    startTime: seg.start,
    endTime: seg.end
  }));
  return { timeline, transcript: responseText };
}
|
|
92
|
+
/**
 * Works out split boundaries for the given raw segments, if any are needed.
 *
 * The number of boundaries comes from `countProcessorBoundaries`. When it is
 * zero nothing is computed. Otherwise the audio duration is looked up and
 * `calculateWhisperSplits` is asked for `boundaryCount + 1` parts
 * (presumably the number of chunks the audio was processed in — confirm).
 *
 * This is best-effort: if the duration lookup or the split calculation
 * throws, the failure is swallowed and `undefined` is returned.
 *
 * @param {Array<object>} rawSegments - Parsed Whisper segments.
 * @param {string} audioPath - Path of the audio file whose duration is needed.
 * @returns {Promise<unknown | undefined>} Whatever `calculateWhisperSplits`
 *   returns, or `undefined` when there are no boundaries or the lookup fails.
 */
async function detectSplitBoundaries(rawSegments, audioPath) {
  const boundaries = countProcessorBoundaries(rawSegments);
  if (boundaries !== 0) {
    try {
      const duration = await getAudioDuration(audioPath);
      return calculateWhisperSplits(duration, boundaries + 1);
    } catch {
      // Deliberately ignored: fall through and report "no boundaries".
    }
  }
  return undefined;
}
|
|
102
|
+
export {
|
|
103
|
+
inputPreference,
|
|
104
|
+
recognize
|
|
105
|
+
};
|