@storyteller-platform/ghost-story 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/LICENSE.md +611 -0
  2. package/README.md +18 -0
  3. package/dist/api/APIOptions.cjs +16 -0
  4. package/dist/api/APIOptions.d.cts +18 -0
  5. package/dist/api/APIOptions.d.ts +18 -0
  6. package/dist/api/APIOptions.js +0 -0
  7. package/dist/api/Recognition.cjs +263 -0
  8. package/dist/api/Recognition.d.cts +77 -0
  9. package/dist/api/Recognition.d.ts +77 -0
  10. package/dist/api/Recognition.js +233 -0
  11. package/dist/api/VoiceActivityDetection.cjs +77 -0
  12. package/dist/api/VoiceActivityDetection.d.cts +24 -0
  13. package/dist/api/VoiceActivityDetection.d.ts +24 -0
  14. package/dist/api/VoiceActivityDetection.js +43 -0
  15. package/dist/audio/AudioConverter.cjs +331 -0
  16. package/dist/audio/AudioConverter.d.cts +53 -0
  17. package/dist/audio/AudioConverter.d.ts +53 -0
  18. package/dist/audio/AudioConverter.js +310 -0
  19. package/dist/audio/AudioFormat.cjs +151 -0
  20. package/dist/audio/AudioFormat.d.cts +25 -0
  21. package/dist/audio/AudioFormat.d.ts +25 -0
  22. package/dist/audio/AudioFormat.js +123 -0
  23. package/dist/audio/AudioSource.cjs +119 -0
  24. package/dist/audio/AudioSource.d.cts +33 -0
  25. package/dist/audio/AudioSource.d.ts +33 -0
  26. package/dist/audio/AudioSource.js +88 -0
  27. package/dist/audio/index.cjs +74 -0
  28. package/dist/audio/index.d.cts +6 -0
  29. package/dist/audio/index.d.ts +6 -0
  30. package/dist/audio/index.js +54 -0
  31. package/dist/cli/bin.cjs +277 -0
  32. package/dist/cli/bin.d.cts +1 -0
  33. package/dist/cli/bin.d.ts +1 -0
  34. package/dist/cli/bin.js +275 -0
  35. package/dist/cli/config.cjs +347 -0
  36. package/dist/cli/config.d.cts +33 -0
  37. package/dist/cli/config.d.ts +33 -0
  38. package/dist/cli/config.js +285 -0
  39. package/dist/cli/install.cjs +334 -0
  40. package/dist/cli/install.d.cts +62 -0
  41. package/dist/cli/install.d.ts +62 -0
  42. package/dist/cli/install.js +316 -0
  43. package/dist/cli/whisper-server.cjs +172 -0
  44. package/dist/cli/whisper-server.d.cts +24 -0
  45. package/dist/cli/whisper-server.d.ts +24 -0
  46. package/dist/cli/whisper-server.js +152 -0
  47. package/dist/config.cjs +60 -0
  48. package/dist/config.d.cts +12 -0
  49. package/dist/config.d.ts +12 -0
  50. package/dist/config.js +32 -0
  51. package/dist/convert.cjs +88 -0
  52. package/dist/convert.d.cts +12 -0
  53. package/dist/convert.d.ts +12 -0
  54. package/dist/convert.js +63 -0
  55. package/dist/encodings/Ascii.cjs +75 -0
  56. package/dist/encodings/Ascii.d.cts +13 -0
  57. package/dist/encodings/Ascii.d.ts +13 -0
  58. package/dist/encodings/Ascii.js +48 -0
  59. package/dist/encodings/Base64.cjs +155 -0
  60. package/dist/encodings/Base64.d.cts +5 -0
  61. package/dist/encodings/Base64.d.ts +5 -0
  62. package/dist/encodings/Base64.js +129 -0
  63. package/dist/encodings/TextEncodingsCommon.cjs +16 -0
  64. package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
  65. package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
  66. package/dist/encodings/TextEncodingsCommon.js +0 -0
  67. package/dist/index.cjs +153 -0
  68. package/dist/index.d.cts +15 -0
  69. package/dist/index.d.ts +15 -0
  70. package/dist/index.js +140 -0
  71. package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
  72. package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
  73. package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
  74. package/dist/recognition/AmazonTranscribeSTT.js +160 -0
  75. package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
  76. package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
  77. package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
  78. package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
  79. package/dist/recognition/DeepgramSTT.cjs +172 -0
  80. package/dist/recognition/DeepgramSTT.d.cts +23 -0
  81. package/dist/recognition/DeepgramSTT.d.ts +23 -0
  82. package/dist/recognition/DeepgramSTT.js +153 -0
  83. package/dist/recognition/GoogleCloudSTT.cjs +125 -0
  84. package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
  85. package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
  86. package/dist/recognition/GoogleCloudSTT.js +107 -0
  87. package/dist/recognition/OpenAICloudSTT.cjs +180 -0
  88. package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
  89. package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
  90. package/dist/recognition/OpenAICloudSTT.js +150 -0
  91. package/dist/recognition/WhisperCppSTT.cjs +296 -0
  92. package/dist/recognition/WhisperCppSTT.d.cts +40 -0
  93. package/dist/recognition/WhisperCppSTT.d.ts +40 -0
  94. package/dist/recognition/WhisperCppSTT.js +275 -0
  95. package/dist/recognition/WhisperServerSTT.cjs +119 -0
  96. package/dist/recognition/WhisperServerSTT.d.cts +24 -0
  97. package/dist/recognition/WhisperServerSTT.d.ts +24 -0
  98. package/dist/recognition/WhisperServerSTT.js +105 -0
  99. package/dist/utilities/FileSystem.cjs +54 -0
  100. package/dist/utilities/FileSystem.d.cts +3 -0
  101. package/dist/utilities/FileSystem.d.ts +3 -0
  102. package/dist/utilities/FileSystem.js +20 -0
  103. package/dist/utilities/Locale.cjs +46 -0
  104. package/dist/utilities/Locale.d.cts +9 -0
  105. package/dist/utilities/Locale.d.ts +9 -0
  106. package/dist/utilities/Locale.js +20 -0
  107. package/dist/utilities/ObjectUtilities.cjs +41 -0
  108. package/dist/utilities/ObjectUtilities.d.cts +3 -0
  109. package/dist/utilities/ObjectUtilities.d.ts +3 -0
  110. package/dist/utilities/ObjectUtilities.js +7 -0
  111. package/dist/utilities/Timeline.cjs +120 -0
  112. package/dist/utilities/Timeline.d.cts +23 -0
  113. package/dist/utilities/Timeline.d.ts +23 -0
  114. package/dist/utilities/Timeline.js +94 -0
  115. package/dist/utilities/Timing.cjs +287 -0
  116. package/dist/utilities/Timing.d.cts +64 -0
  117. package/dist/utilities/Timing.d.ts +64 -0
  118. package/dist/utilities/Timing.js +256 -0
  119. package/dist/utilities/WhisperTimeline.cjs +344 -0
  120. package/dist/utilities/WhisperTimeline.d.cts +86 -0
  121. package/dist/utilities/WhisperTimeline.d.ts +86 -0
  122. package/dist/utilities/WhisperTimeline.js +313 -0
  123. package/dist/vad/ActiveGate.cjs +357 -0
  124. package/dist/vad/ActiveGate.d.cts +53 -0
  125. package/dist/vad/ActiveGate.d.ts +53 -0
  126. package/dist/vad/ActiveGate.js +329 -0
  127. package/dist/vad/ActiveGateOg.cjs +1366 -0
  128. package/dist/vad/ActiveGateOg.d.cts +33 -0
  129. package/dist/vad/ActiveGateOg.d.ts +33 -0
  130. package/dist/vad/ActiveGateOg.js +1341 -0
  131. package/dist/vad/Silero.cjs +174 -0
  132. package/dist/vad/Silero.d.cts +25 -0
  133. package/dist/vad/Silero.d.ts +25 -0
  134. package/dist/vad/Silero.js +153 -0
  135. package/package.json +125 -0
@@ -0,0 +1,40 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.cjs';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
3
+ import { WhisperModel } from '../cli/config.cjs';
4
+ import { Timeline } from '../utilities/Timeline.cjs';
5
+ import { Timing } from '../utilities/Timing.cjs';
6
+ import 'node:fs';
7
+ import 'node:stream';
8
+
9
/** Literal type of the exported `inputPreference` constant. */
type InputPreference = "file";
declare const inputPreference: InputPreference;
/** Language codes accepted by `WhisperCppOptions.language`. */
type Language =
    | "af" | "am" | "ar" | "as" | "az" | "ba" | "be" | "bg" | "bn" | "bo"
    | "br" | "bs" | "ca" | "cs" | "cy" | "da" | "de" | "el" | "en" | "es"
    | "et" | "eu" | "fa" | "fi" | "fo" | "fr" | "gl" | "gu" | "ha" | "haw"
    | "he" | "hi" | "hr" | "ht" | "hu" | "hy" | "id" | "is" | "it" | "ja"
    | "jw" | "ka" | "kk" | "km" | "kn" | "ko" | "la" | "lb" | "ln" | "lo"
    | "lt" | "lv" | "mg" | "mi" | "mk" | "ml" | "mn" | "mr" | "ms" | "mt"
    | "my" | "ne" | "nl" | "nn" | "no" | "oc" | "pa" | "pl" | "ps" | "pt"
    | "ro" | "ru" | "sa" | "sd" | "si" | "sk" | "sl" | "sn" | "so" | "sq"
    | "sr" | "su" | "sv" | "sw" | "ta" | "te" | "tg" | "th" | "tk" | "tl"
    | "tr" | "tt" | "uk" | "ur" | "uz" | "vi" | "yi" | "yo" | "zh";
12
+
13
/**
 * Options accepted by `recognize`.
 *
 * Every field is optional: the implementation spreads the caller's options
 * over a defaults object before using them.
 */
interface WhisperCppOptions {
    /** Model to run. Optional because the runtime defaults supply "tiny.en". */
    model?: WhisperModel;
    /** Directory holding model files; the configured model directory is used when omitted. */
    modelDir?: string;
    /** Directory holding the whisper executable; the configured install directory is used when omitted. */
    installDir?: string;
    /** When set, lower-cased and passed to the executable as `--language`. */
    language?: Language;
    /** Requested processor count (runtime default 1); an effective value is derived from the audio duration. */
    processors?: number;
    /** Value for `--threads` (runtime default 4). */
    threads?: number;
    /** Adds `--flash-attn` when true (runtime default true). */
    flashAttention?: boolean;
    /** Adds `--suppress-nst` and `--no-prints` when true (runtime default true). */
    suppressNonSpeechTokens?: boolean;
    /** Runtime default true. */
    tokenLevelTimestamps?: boolean;
    /** Echo the child process's stdout/stderr to this process (runtime default false). */
    printOutput?: boolean;
    /** Run the binary/model installation step before transcribing (runtime default true). */
    autoInstall?: boolean;
    /** Receives progress as a fraction (reported percent / 100), and 1 once the output file exists. */
    onProgress?: (progress: number) => void;
    /** Abort signal handed to the spawned process. */
    signal?: AbortSignal;
    /** Forwarded when a raw (non-`AudioSource`) input is normalized. */
    inputFormat?: AudioFormat;
    /** Optional recorder for the "installation", "conversion" and "transcription" stages. */
    timing?: Timing | undefined;
}
30
/** Value resolved by `recognize`. */
interface RecognitionResult {
    transcript: string;
    timeline: Timeline;
    language?: string;
}
/** Transcribe `input` with a local whisper.cpp executable. */
declare function recognize(
    input: RawAudioInput | AudioSource,
    options?: WhisperCppOptions
): Promise<RecognitionResult>;
/** Run the whisper.cpp binary installation step. */
declare function ensureWhisperCppInstalled(): Promise<void>;
/** Install `modelName` into `modelDir` unless its model file already exists. */
declare function ensureModelDownloaded(
    modelDir: string,
    modelName: WhisperModel,
    printOutput: boolean
): Promise<void>;
/** String-literal union of model identifiers; not referenced by the other declarations in this file. */
type WhisperCppModelId =
    | "tiny" | "tiny-q5_1" | "tiny.en" | "tiny.en-q5_1" | "tiny.en-q8_0"
    | "base" | "base-q5_1" | "base.en" | "base.en-q5_1"
    | "small" | "small-q5_1" | "small.en" | "small.en-q5_1"
    | "medium" | "medium-q5_0" | "medium.en" | "medium.en-q5_0"
    | "large" | "large-v1" | "large-v2" | "large-v2-q5_0"
    | "large-v3" | "large-v3-q5_0" | "large-v3-turbo" | "large-v3-turbo-q5_0";
39
+
40
+ export { type InputPreference, type Language, type RecognitionResult, type WhisperCppModelId, type WhisperCppOptions, WhisperModel, ensureModelDownloaded, ensureWhisperCppInstalled, inputPreference, recognize };
@@ -0,0 +1,40 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.js';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
3
+ import { WhisperModel } from '../cli/config.js';
4
+ import { Timeline } from '../utilities/Timeline.js';
5
+ import { Timing } from '../utilities/Timing.js';
6
+ import 'node:fs';
7
+ import 'node:stream';
8
+
9
/** Literal type of the exported `inputPreference` constant. */
type InputPreference = "file";
declare const inputPreference: InputPreference;
/** Language codes accepted by `WhisperCppOptions.language`. */
type Language =
    | "af" | "am" | "ar" | "as" | "az" | "ba" | "be" | "bg" | "bn" | "bo"
    | "br" | "bs" | "ca" | "cs" | "cy" | "da" | "de" | "el" | "en" | "es"
    | "et" | "eu" | "fa" | "fi" | "fo" | "fr" | "gl" | "gu" | "ha" | "haw"
    | "he" | "hi" | "hr" | "ht" | "hu" | "hy" | "id" | "is" | "it" | "ja"
    | "jw" | "ka" | "kk" | "km" | "kn" | "ko" | "la" | "lb" | "ln" | "lo"
    | "lt" | "lv" | "mg" | "mi" | "mk" | "ml" | "mn" | "mr" | "ms" | "mt"
    | "my" | "ne" | "nl" | "nn" | "no" | "oc" | "pa" | "pl" | "ps" | "pt"
    | "ro" | "ru" | "sa" | "sd" | "si" | "sk" | "sl" | "sn" | "so" | "sq"
    | "sr" | "su" | "sv" | "sw" | "ta" | "te" | "tg" | "th" | "tk" | "tl"
    | "tr" | "tt" | "uk" | "ur" | "uz" | "vi" | "yi" | "yo" | "zh";
12
+
13
/**
 * Options accepted by `recognize`.
 *
 * Every field is optional: the implementation spreads the caller's options
 * over a defaults object before using them.
 */
interface WhisperCppOptions {
    /** Model to run. Optional because the runtime defaults supply "tiny.en". */
    model?: WhisperModel;
    /** Directory holding model files; the configured model directory is used when omitted. */
    modelDir?: string;
    /** Directory holding the whisper executable; the configured install directory is used when omitted. */
    installDir?: string;
    /** When set, lower-cased and passed to the executable as `--language`. */
    language?: Language;
    /** Requested processor count (runtime default 1); an effective value is derived from the audio duration. */
    processors?: number;
    /** Value for `--threads` (runtime default 4). */
    threads?: number;
    /** Adds `--flash-attn` when true (runtime default true). */
    flashAttention?: boolean;
    /** Adds `--suppress-nst` and `--no-prints` when true (runtime default true). */
    suppressNonSpeechTokens?: boolean;
    /** Runtime default true. */
    tokenLevelTimestamps?: boolean;
    /** Echo the child process's stdout/stderr to this process (runtime default false). */
    printOutput?: boolean;
    /** Run the binary/model installation step before transcribing (runtime default true). */
    autoInstall?: boolean;
    /** Receives progress as a fraction (reported percent / 100), and 1 once the output file exists. */
    onProgress?: (progress: number) => void;
    /** Abort signal handed to the spawned process. */
    signal?: AbortSignal;
    /** Forwarded when a raw (non-`AudioSource`) input is normalized. */
    inputFormat?: AudioFormat;
    /** Optional recorder for the "installation", "conversion" and "transcription" stages. */
    timing?: Timing | undefined;
}
30
/** Value resolved by `recognize`. */
interface RecognitionResult {
    transcript: string;
    timeline: Timeline;
    language?: string;
}
/** Transcribe `input` with a local whisper.cpp executable. */
declare function recognize(
    input: RawAudioInput | AudioSource,
    options?: WhisperCppOptions
): Promise<RecognitionResult>;
/** Run the whisper.cpp binary installation step. */
declare function ensureWhisperCppInstalled(): Promise<void>;
/** Install `modelName` into `modelDir` unless its model file already exists. */
declare function ensureModelDownloaded(
    modelDir: string,
    modelName: WhisperModel,
    printOutput: boolean
): Promise<void>;
/** String-literal union of model identifiers; not referenced by the other declarations in this file. */
type WhisperCppModelId =
    | "tiny" | "tiny-q5_1" | "tiny.en" | "tiny.en-q5_1" | "tiny.en-q8_0"
    | "base" | "base-q5_1" | "base.en" | "base.en-q5_1"
    | "small" | "small-q5_1" | "small.en" | "small.en-q5_1"
    | "medium" | "medium-q5_0" | "medium.en" | "medium.en-q5_0"
    | "large" | "large-v1" | "large-v2" | "large-v2-q5_0"
    | "large-v3" | "large-v3-q5_0" | "large-v3-turbo" | "large-v3-turbo-q5_0";
39
+
40
+ export { type InputPreference, type Language, type RecognitionResult, type WhisperCppModelId, type WhisperCppOptions, WhisperModel, ensureModelDownloaded, ensureWhisperCppInstalled, inputPreference, recognize };
@@ -0,0 +1,275 @@
1
+ import { spawn } from "node:child_process";
2
+ import fs, { existsSync } from "node:fs";
3
+ import os from "node:os";
4
+ import path from "node:path";
5
+ import { ensureDir } from "fs-extra";
6
+ import {
7
+ getAudioDuration,
8
+ isAudioSource,
9
+ normalizeToAudioSource,
10
+ prepareWavForService,
11
+ toFilePath
12
+ } from "../audio/index.js";
13
+ import {
14
+ getInstallDir,
15
+ getModelDir,
16
+ getModelPath as getModelPathFromConfig,
17
+ getWhisperExecutablePath
18
+ } from "../cli/config.js";
19
+ import { installBinary, installModel } from "../cli/install.js";
20
+ import {
21
+ calculateEffectiveProcessors,
22
+ calculateWhisperSplits,
23
+ extractCorrectedTimeline,
24
+ parseWhisperCppOutput
25
+ } from "../utilities/WhisperTimeline.js";
26
// This backend consumes audio as a file path.
const inputPreference = "file";

// Baseline option values; caller-supplied options are spread over these.
const defaultOptions = {
  processors: 1,
  threads: 4,
  flashAttention: true,
  suppressNonSpeechTokens: true,
  tokenLevelTimestamps: true,
  printOutput: false,
  model: "tiny.en",
  autoInstall: true,
};

// Source formats that are handed to the executable without a WAV conversion.
const acceptedFormats = [
  "wav",
  "flac",
  "ogg",
  "mp3",
];
38
/**
 * Transcribe audio by running a local whisper.cpp executable.
 *
 * Steps: merge options over the defaults, optionally run the install step,
 * convert the input to 16 kHz mono WAV when its format is not in
 * `acceptedFormats`, run the transcription, then build the timeline.
 *
 * @param input - An audio source, or raw input that is normalized into one.
 * @param options - Optional overrides for `defaultOptions`.
 * @returns `{ transcript, timeline, language }`.
 * @throws {Error} When no file path can be derived from the prepared audio,
 *   or the derived path does not exist.
 */
async function recognize(input, options) {
  const opts = { ...defaultOptions, ...options };
  const { timing } = opts;
  // Route a stage through the timing recorder when one was supplied.
  const timed = (label, work) => (timing ? timing.timeAsync(label, work) : work());

  const modelDir = opts.modelDir ?? getModelDir();
  const installDir = opts.installDir ?? getInstallDir();
  const source = isAudioSource(input)
    ? input
    : normalizeToAudioSource(input, opts.inputFormat);
  await ensureDir(modelDir);

  if (opts.autoInstall) {
    // NOTE(review): ensureWhisperCppInstalled() receives no installDir, so a
    // custom opts.installDir may not be where the binary lands — confirm.
    await timed("installation", async () => {
      await ensureWhisperCppInstalled();
      await ensureModelDownloaded(modelDir, opts.model, opts.printOutput);
    });
  }

  const conversionNeeded = !acceptedFormats.includes(source.format);
  timing?.setMetadata("conversionRequired", conversionNeeded);
  timing?.setMetadata("targetFormat", conversionNeeded ? "wav" : source.format);

  const prepared = await timed("conversion", async () => {
    if (conversionNeeded) {
      return prepareWavForService(source, { sampleRate: 16e3, channels: 1 });
    }
    // Nothing was created, so cleanup is a no-op.
    return { source, cleanup: async () => {} };
  });

  try {
    const inputPath = toFilePath(prepared.source);
    if (!inputPath) {
      throw new Error(
        "whisper.cpp requires a file path. The audio could not be prepared as a file."
      );
    }
    if (!existsSync(inputPath)) {
      throw new Error(`Input file does not exist: ${inputPath}`);
    }

    const audioDuration = await getAudioDuration(inputPath);
    const effectiveProcessors = calculateEffectiveProcessors(audioDuration, opts.processors);

    const transcription = await timed("transcription", () =>
      transcribe({
        inputPath,
        model: opts.model,
        installDir,
        modelFolder: modelDir,
        language: opts.language ?? null,
        tokenLevelTimestamps: opts.tokenLevelTimestamps,
        printOutput: opts.printOutput,
        flashAttention: opts.flashAttention,
        suppressNonSpeechTokens: opts.suppressNonSpeechTokens,
        processors: effectiveProcessors,
        threads: opts.threads,
        onProgress: opts.onProgress ?? null,
        signal: opts.signal ?? null,
      })
    );

    const rawSegments = parseWhisperCppOutput(transcription.transcription);
    // Split boundaries are only computed when more than one processor was used.
    const splitBoundaries =
      effectiveProcessors > 1
        ? calculateWhisperSplits(audioDuration, effectiveProcessors)
        : [];
    const timeline = extractCorrectedTimeline(rawSegments, {
      splitBoundaries: splitBoundaries.length > 0 ? splitBoundaries : undefined,
    });
    const transcript = transcription.transcription
      .map((segment) => segment.text)
      .join("")
      .trim();

    return { transcript, timeline, language: transcription.result.language };
  } finally {
    await prepared.cleanup();
  }
}
111
/** Run the binary installation step with its output suppressed. */
async function ensureWhisperCppInstalled() {
  const installOptions = { printOutput: false };
  await installBinary(installOptions);
}
114
/**
 * Install a model unless its file is already on disk.
 *
 * @param modelDir - Directory the model is looked up in and installed to.
 * @param modelName - Model to look for and install.
 * @param printOutput - Forwarded to the installer.
 */
async function ensureModelDownloaded(modelDir, modelName, printOutput) {
  const alreadyPresent = existsSync(getModelPathFromConfig(modelName, modelDir));
  if (!alreadyPresent) {
    await installModel({ model: modelName, modelDir, printOutput });
  }
}
125
/** Build the path `<folder>/ggml-<model>.bin`. */
function getModelPath(folder, model) {
  const fileName = `ggml-${model}.bin`;
  return path.join(folder, fileName);
}
128
/**
 * Run the whisper executable on one file and return its parsed JSON output.
 *
 * @param options - Input path, model and install locations, CLI tuning values,
 *   plus `printOutput`, `onProgress` and `signal` for the child process.
 * @returns The parsed contents of the JSON file written by the executable.
 * @throws {Error} When the executable or model file is missing, when the
 *   process fails, or when the output cannot be read or parsed.
 */
async function transcribe(options) {
  const {
    inputPath,
    model,
    installDir,
    modelFolder,
    language,
    tokenLevelTimestamps,
    printOutput,
    flashAttention,
    suppressNonSpeechTokens,
    processors,
    threads,
    onProgress,
    signal,
  } = options;

  const executable = getWhisperExecutablePath(installDir);
  const modelPath = getModelPath(modelFolder, model);
  if (!existsSync(executable)) {
    throw new Error(`Whisper executable not found at ${executable}`);
  }
  if (!existsSync(modelPath)) {
    throw new Error(`Model not found at ${modelPath}`);
  }

  const tmpDir = path.join(os.tmpdir(), "ghost-story-whisper");
  await ensureDir(tmpDir);

  // A timestamp alone collides when two transcriptions start in the same
  // millisecond (or in different processes), so add the pid and a random part.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  // The executable receives the path without extension; the result is read from "<path>.json".
  const tmpJsonPath = path.join(tmpDir, `transcription-${uniqueSuffix}`);
  const expectedOutputPath = `${tmpJsonPath}.json`;

  const args = buildTranscribeArgs({
    inputPath,
    modelPath,
    outputPath: tmpJsonPath,
    model,
    language,
    tokenLevelTimestamps,
    flashAttention,
    suppressNonSpeechTokens,
    processors,
    threads,
  });

  try {
    const outputPath = await runWhisperProcess({
      executable,
      args,
      cwd: installDir,
      printOutput,
      onProgress,
      signal,
      expectedOutputPath,
    });
    return JSON.parse(await fs.promises.readFile(outputPath, "utf8"));
  } finally {
    // Best-effort cleanup on success and failure alike; a missing file is not an error.
    await fs.promises.unlink(expectedOutputPath).catch(() => {});
  }
}
189
/**
 * Assemble the command-line arguments for the whisper executable.
 *
 * Optional groups are represented as `null` when disabled and dropped after
 * flattening.
 *
 * @param options - Paths plus `language`, `flashAttention`,
 *   `suppressNonSpeechTokens`, `processors` and `threads`.
 * @returns A flat argument array.
 */
function buildTranscribeArgs(options) {
  const languageArgs = options.language
    ? ["--language", options.language.toLowerCase()]
    : null;
  const flashArgs = options.flashAttention ? ["--flash-attn"] : null;
  const suppressArgs = options.suppressNonSpeechTokens
    ? ["--suppress-nst", "--no-prints"]
    : null;

  const parts = [
    "--file",
    options.inputPath,
    "--output-file",
    options.outputPath,
    "--output-json-full",
    "--model",
    options.modelPath,
    "--print-progress",
    languageArgs,
    flashArgs,
    suppressArgs,
    ["--processors", String(options.processors)],
    ["--threads", String(options.threads)],
  ];
  return parts.flat().filter((part) => part !== null);
}
207
/**
 * Spawn the whisper executable and resolve with the path of its output file.
 *
 * The promise resolves when the process has closed and `expectedOutputPath`
 * exists, regardless of the exit code. Otherwise it rejects with the collected
 * stdout/stderr text in the message.
 *
 * @param options - `executable`, `args`, `cwd`, `printOutput`, `onProgress`,
 *   `signal` and `expectedOutputPath`.
 * @returns A promise for `expectedOutputPath`.
 * @throws {Error} (as a rejection) "Signal aborted" when the signal is or
 *   becomes aborted; otherwise a start or transcription failure. Process-level
 *   errors keep the original error as `cause`.
 */
function runWhisperProcess(options) {
  const {
    executable,
    args,
    cwd,
    printOutput,
    onProgress,
    signal,
    expectedOutputPath,
  } = options;

  if (signal?.aborted) {
    return Promise.reject(new Error("Signal aborted"));
  }

  return new Promise((resolve, reject) => {
    const task = spawn(executable, args, { cwd, signal: signal ?? undefined });
    let output = "";
    // Both "error" and "close" can fire for one process; only the first outcome counts.
    let settled = false;
    const fail = (error) => {
      if (settled) return;
      settled = true;
      reject(error);
    };

    const handleData = (data) => {
      const str = data.toString("utf-8");
      output += str;
      // One chunk can carry several progress lines; report each of them.
      for (const match of str.matchAll(/progress\s*=\s*([\d.]+)/g)) {
        const percent = Number.parseFloat(match[1]);
        if (!Number.isNaN(percent)) {
          onProgress?.(percent / 100);
        }
      }
    };

    task.stdout.on("data", (data) => {
      handleData(data);
      if (printOutput) {
        process.stdout.write(data);
      }
    });
    task.stderr.on("data", (data) => {
      handleData(data);
      if (printOutput) {
        process.stderr.write(data);
      }
    });

    // "close" fires after the stdio streams have ended, so `output` is complete
    // here; "exit" may fire while the streams still hold unread data.
    task.on("close", (code, exitSignal) => {
      if (settled) return;
      if (existsSync(expectedOutputPath)) {
        settled = true;
        onProgress?.(1);
        resolve(expectedOutputPath);
        return;
      }
      if (exitSignal) {
        fail(new Error(`Process killed with signal ${exitSignal}: ${output}`));
        return;
      }
      if (output.includes("must be 16 kHz")) {
        fail(
          new Error(
            "Audio file must be 16 kHz. Convert your audio to 16-bit, 16KHz WAV format."
          )
        );
        return;
      }
      fail(new Error(`Transcription failed (exit code ${code}): ${output}`));
    });

    task.on("error", (err) => {
      // An abort is reported through "error" too; it is not a start failure.
      const message =
        err.name === "AbortError"
          ? "Signal aborted"
          : `Failed to start whisper process: ${err.message}`;
      fail(new Error(message, { cause: err }));
    });
  });
}
270
+ export {
271
+ ensureModelDownloaded,
272
+ ensureWhisperCppInstalled,
273
+ inputPreference,
274
+ recognize
275
+ };
@@ -0,0 +1,119 @@
1
+ "use strict";
2
// Cached Object reflection primitives used by the interop helpers below.
const __defProp = Object.defineProperty;
const __getOwnPropDesc = Object.getOwnPropertyDescriptor;
const __getOwnPropNames = Object.getOwnPropertyNames;
const __hasOwnProp = Object.prototype.hasOwnProperty;

/** Define one enumerable getter on `target` for every key of `all`. */
function __export(target, all) {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
}

/**
 * Mirror the own properties of `from` onto `to` as getters, skipping keys
 * `to` already owns and the key named by `except`. Returns `to`.
 */
function __copyProps(to, from, except, desc) {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable,
    });
  }
  return to;
}

/** Wrap `mod` in a fresh object flagged with a non-enumerable `__esModule: true`. */
function __toCommonJS(mod) {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
}
19
+ var WhisperServerSTT_exports = {};
20
+ __export(WhisperServerSTT_exports, {
21
+ inputPreference: () => inputPreference,
22
+ recognize: () => recognize
23
+ });
24
+ module.exports = __toCommonJS(WhisperServerSTT_exports);
25
+ var import_node_fs = require("node:fs");
26
+ var import_node_path = require("node:path");
27
+ var import_audio = require("../audio/index.cjs");
28
+ var import_WhisperTimeline = require("../utilities/WhisperTimeline.cjs");
29
// This backend consumes audio as a file path.
const inputPreference = "file";

// Baseline option values; caller-supplied options are spread over these.
const defaultOptions = {
  baseURL: "http://localhost:8080",
  inferencePath: "/audio/transcriptions",
  temperature: 0,
};
35
/**
 * Transcribe audio by uploading it to a Whisper HTTP server.
 *
 * The input is prepared as 16 kHz mono WAV, posted as multipart form data to
 * `baseURL + inferencePath`, and the JSON response is turned into a timeline.
 *
 * @param input - An audio source, or raw input that is normalized into one.
 * @param languageCode - Sent as the `language` form field when truthy.
 * @param options - Optional overrides for `defaultOptions`.
 * @returns `{ transcript, timeline }`.
 * @throws {Error} When no file path can be derived from the prepared audio,
 *   the server responds with a non-OK status, or no timeline can be extracted.
 */
async function recognize(input, languageCode, options = {}) {
  const opts = { ...defaultOptions, ...options };
  const { timing } = opts;
  // Route a stage through the timing recorder when one was supplied.
  const timed = (label, work) => (timing ? timing.timeAsync(label, work) : work());

  const source = (0, import_audio.isAudioSource)(input)
    ? input
    : (0, import_audio.normalizeToAudioSource)(input, opts.inputFormat);
  const conversionNeeded = source.format !== "wav";
  timing?.setMetadata("conversionRequired", conversionNeeded);
  timing?.setMetadata("targetFormat", "wav");

  const prepared = await timed("conversion", () =>
    (0, import_audio.prepareWavForService)(source, { sampleRate: 16e3, channels: 1 })
  );

  try {
    const filePath = (0, import_audio.toFilePath)(prepared.source);
    if (!filePath) {
      throw new Error(
        "Whisper server requires a file path. The audio could not be prepared as a file."
      );
    }

    const filename = (0, import_node_path.basename)(filePath);
    const blob = await (0, import_node_fs.openAsBlob)(filePath);

    const form = new FormData();
    form.append("file", blob, filename);
    form.append("temperature", String(opts.temperature));
    form.append("response_format", "verbose_json");
    if (languageCode) {
      form.append("language", languageCode);
    }

    const url = `${opts.baseURL}${opts.inferencePath}`;
    // The Authorization header is only sent when an API key was configured.
    const headers = {};
    if (opts.apiKey) {
      headers["Authorization"] = `Bearer ${opts.apiKey}`;
    }

    const response = await timed("upload", () =>
      fetch(url, { method: "POST", body: form, headers })
    );
    if (!response.ok) {
      const text = await response.text();
      throw new Error(`Whisper server error: ${response.status} ${text}`);
    }

    const data = await response.json();
    const extracted = await extractTimelineAndTranscript(data, filePath);
    if (!extracted.timeline) {
      throw new Error("Failed to extract timeline from Whisper server response");
    }
    return { transcript: extracted.transcript, timeline: extracted.timeline };
  } finally {
    await prepared.cleanup();
  }
}
84
/**
 * Turn a Whisper server JSON response into a timeline and transcript.
 *
 * - No segments (missing, not an array, or empty): `timeline` is `undefined`
 *   and the transcript is the trimmed `response.text`.
 * - First segment carries words: the segments go through the word-level
 *   timeline correction, and the transcript is the entry texts joined by " ".
 * - Otherwise: one "segment" entry per segment, transcript from `response.text`.
 *
 * @param response - Parsed JSON body returned by the server.
 * @param audioPath - Path of the uploaded audio, used to detect split boundaries.
 * @returns `{ timeline, transcript }`.
 */
async function extractTimelineAndTranscript(response, audioPath) {
  // A response without a `segments` array is treated as "no segments" so the
  // caller gets its descriptive error instead of a raw TypeError.
  const segments = Array.isArray(response?.segments) ? response.segments : [];
  const fullText = typeof response?.text === "string" ? response.text.trim() : "";

  if (segments.length === 0) {
    return { timeline: undefined, transcript: fullText };
  }

  const hasNestedWords = (segments[0]?.words?.length ?? 0) > 0;
  if (hasNestedWords) {
    const rawSegments = (0, import_WhisperTimeline.parseWhisperServerOutput)(segments);
    const splitBoundaries = await detectSplitBoundaries(rawSegments, audioPath);
    const correctedTimeline = (0, import_WhisperTimeline.extractCorrectedTimeline)(rawSegments, {
      splitBoundaries,
    });
    const transcript = correctedTimeline.map((entry) => entry.text).join(" ");
    return { timeline: correctedTimeline, transcript };
  }

  const timeline = segments.map((seg) => ({
    type: "segment",
    // A segment without text contributes an empty string rather than throwing.
    text: (seg.text ?? "").trim(),
    startTime: seg.start,
    endTime: seg.end,
  }));
  return { timeline, transcript: fullText };
}
105
/**
 * Compute split boundaries for the audio when the segments contain processor
 * boundaries.
 *
 * @param rawSegments - Segments whose processor boundaries are counted.
 * @param audioPath - Path used to read the audio duration.
 * @returns The calculated splits, or `undefined` when there are no boundaries
 *   or the calculation throws.
 */
async function detectSplitBoundaries(rawSegments, audioPath) {
  const boundaryCount = (0, import_WhisperTimeline.countProcessorBoundaries)(rawSegments);
  if (boundaryCount !== 0) {
    try {
      const duration = await (0, import_audio.getAudioDuration)(audioPath);
      // The split count passed on is one more than the boundary count.
      return (0, import_WhisperTimeline.calculateWhisperSplits)(duration, boundaryCount + 1);
    } catch {
      // Best effort: fall through and report no boundaries.
    }
  }
  return undefined;
}
115
+ // Annotate the CommonJS export names for ESM import in node:
116
+ 0 && (module.exports = {
117
+ inputPreference,
118
+ recognize
119
+ });
@@ -0,0 +1,24 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.cjs';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
3
+ import { Timeline } from '../utilities/Timeline.cjs';
4
+ import { Timing } from '../utilities/Timing.cjs';
5
+ import 'node:fs';
6
+ import 'node:stream';
7
+
8
/** Literal type of the exported `inputPreference` constant. */
type InputPreference = "file";
declare const inputPreference: InputPreference;

/** Options accepted by `recognize`; all fields are optional. */
interface WhisperServerOptions {
    /** Server origin; the request URL is `baseURL` followed by `inferencePath`. */
    baseURL?: string;
    /** Path appended to `baseURL`. */
    inferencePath?: string;
    /** Sent as the `temperature` form field. */
    temperature?: number;
    /** When set, sent as a `Bearer` Authorization header. */
    apiKey?: string;
    /** Forwarded when a raw (non-`AudioSource`) input is normalized. */
    inputFormat?: AudioFormat;
    /** Optional recorder for the "conversion" and "upload" stages. */
    timing?: Timing | undefined;
}

/** Value resolved by `recognize`. */
interface RecognitionResult {
    transcript: string;
    timeline?: Timeline;
}

/** Transcribe `input` by uploading it to a Whisper HTTP server. */
declare function recognize(
    input: RawAudioInput | AudioSource,
    languageCode: string,
    options?: WhisperServerOptions
): Promise<RecognitionResult>;
23
+
24
+ export { type InputPreference, type RecognitionResult, type WhisperServerOptions, inputPreference, recognize };
@@ -0,0 +1,24 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.js';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
3
+ import { Timeline } from '../utilities/Timeline.js';
4
+ import { Timing } from '../utilities/Timing.js';
5
+ import 'node:fs';
6
+ import 'node:stream';
7
+
8
/** Literal type of the exported `inputPreference` constant. */
type InputPreference = "file";
declare const inputPreference: InputPreference;

/** Options accepted by `recognize`; all fields are optional. */
interface WhisperServerOptions {
    /** Server origin; the request URL is `baseURL` followed by `inferencePath`. */
    baseURL?: string;
    /** Path appended to `baseURL`. */
    inferencePath?: string;
    /** Sent as the `temperature` form field. */
    temperature?: number;
    /** When set, sent as a `Bearer` Authorization header. */
    apiKey?: string;
    /** Forwarded when a raw (non-`AudioSource`) input is normalized. */
    inputFormat?: AudioFormat;
    /** Optional recorder for the "conversion" and "upload" stages. */
    timing?: Timing | undefined;
}

/** Value resolved by `recognize`. */
interface RecognitionResult {
    transcript: string;
    timeline?: Timeline;
}

/** Transcribe `input` by uploading it to a Whisper HTTP server. */
declare function recognize(
    input: RawAudioInput | AudioSource,
    languageCode: string,
    options?: WhisperServerOptions
): Promise<RecognitionResult>;
23
+
24
+ export { type InputPreference, type RecognitionResult, type WhisperServerOptions, inputPreference, recognize };
@@ -0,0 +1,105 @@
1
+ import { openAsBlob } from "node:fs";
2
+ import { basename } from "node:path";
3
+ import {
4
+ getAudioDuration,
5
+ isAudioSource,
6
+ normalizeToAudioSource,
7
+ prepareWavForService,
8
+ toFilePath
9
+ } from "../audio/index.js";
10
+ import {
11
+ calculateWhisperSplits,
12
+ countProcessorBoundaries,
13
+ extractCorrectedTimeline,
14
+ parseWhisperServerOutput
15
+ } from "../utilities/WhisperTimeline.js";
16
// This backend consumes audio as a file path.
const inputPreference = "file";

// Baseline option values; caller-supplied options are spread over these.
const defaultOptions = {
  baseURL: "http://localhost:8080",
  inferencePath: "/audio/transcriptions",
  temperature: 0,
};
22
/**
 * Transcribe audio by uploading it to a Whisper HTTP server.
 *
 * The input is prepared as 16 kHz mono WAV, posted as multipart form data to
 * `baseURL + inferencePath`, and the JSON response is turned into a timeline.
 *
 * @param input - An audio source, or raw input that is normalized into one.
 * @param languageCode - Sent as the `language` form field when truthy.
 * @param options - Optional overrides for `defaultOptions`.
 * @returns `{ transcript, timeline }`.
 * @throws {Error} When no file path can be derived from the prepared audio,
 *   the server responds with a non-OK status, or no timeline can be extracted.
 */
async function recognize(input, languageCode, options = {}) {
  const opts = { ...defaultOptions, ...options };
  const { timing } = opts;
  // Route a stage through the timing recorder when one was supplied.
  const timed = (label, work) => (timing ? timing.timeAsync(label, work) : work());

  const source = isAudioSource(input)
    ? input
    : normalizeToAudioSource(input, opts.inputFormat);
  const conversionNeeded = source.format !== "wav";
  timing?.setMetadata("conversionRequired", conversionNeeded);
  timing?.setMetadata("targetFormat", "wav");

  const prepared = await timed("conversion", () =>
    prepareWavForService(source, { sampleRate: 16e3, channels: 1 })
  );

  try {
    const filePath = toFilePath(prepared.source);
    if (!filePath) {
      throw new Error(
        "Whisper server requires a file path. The audio could not be prepared as a file."
      );
    }

    const filename = basename(filePath);
    const blob = await openAsBlob(filePath);

    const form = new FormData();
    form.append("file", blob, filename);
    form.append("temperature", String(opts.temperature));
    form.append("response_format", "verbose_json");
    if (languageCode) {
      form.append("language", languageCode);
    }

    const url = `${opts.baseURL}${opts.inferencePath}`;
    // The Authorization header is only sent when an API key was configured.
    const headers = {};
    if (opts.apiKey) {
      headers["Authorization"] = `Bearer ${opts.apiKey}`;
    }

    const response = await timed("upload", () =>
      fetch(url, { method: "POST", body: form, headers })
    );
    if (!response.ok) {
      const text = await response.text();
      throw new Error(`Whisper server error: ${response.status} ${text}`);
    }

    const data = await response.json();
    const extracted = await extractTimelineAndTranscript(data, filePath);
    if (!extracted.timeline) {
      throw new Error("Failed to extract timeline from Whisper server response");
    }
    return { transcript: extracted.transcript, timeline: extracted.timeline };
  } finally {
    await prepared.cleanup();
  }
}
71
/**
 * Turn a Whisper server JSON response into a timeline and transcript.
 *
 * - No segments (missing, not an array, or empty): `timeline` is `undefined`
 *   and the transcript is the trimmed `response.text`.
 * - First segment carries words: the segments go through the word-level
 *   timeline correction, and the transcript is the entry texts joined by " ".
 * - Otherwise: one "segment" entry per segment, transcript from `response.text`.
 *
 * @param response - Parsed JSON body returned by the server.
 * @param audioPath - Path of the uploaded audio, used to detect split boundaries.
 * @returns `{ timeline, transcript }`.
 */
async function extractTimelineAndTranscript(response, audioPath) {
  // A response without a `segments` array is treated as "no segments" so the
  // caller gets its descriptive error instead of a raw TypeError.
  const segments = Array.isArray(response?.segments) ? response.segments : [];
  const fullText = typeof response?.text === "string" ? response.text.trim() : "";

  if (segments.length === 0) {
    return { timeline: undefined, transcript: fullText };
  }

  const hasNestedWords = (segments[0]?.words?.length ?? 0) > 0;
  if (hasNestedWords) {
    const rawSegments = parseWhisperServerOutput(segments);
    const splitBoundaries = await detectSplitBoundaries(rawSegments, audioPath);
    const correctedTimeline = extractCorrectedTimeline(rawSegments, { splitBoundaries });
    const transcript = correctedTimeline.map((entry) => entry.text).join(" ");
    return { timeline: correctedTimeline, transcript };
  }

  const timeline = segments.map((seg) => ({
    type: "segment",
    // A segment without text contributes an empty string rather than throwing.
    text: (seg.text ?? "").trim(),
    startTime: seg.start,
    endTime: seg.end,
  }));
  return { timeline, transcript: fullText };
}
92
/**
 * Compute split boundaries for the audio when the segments contain processor
 * boundaries.
 *
 * @param rawSegments - Segments whose processor boundaries are counted.
 * @param audioPath - Path used to read the audio duration.
 * @returns The calculated splits, or `undefined` when there are no boundaries
 *   or the calculation throws.
 */
async function detectSplitBoundaries(rawSegments, audioPath) {
  const boundaryCount = countProcessorBoundaries(rawSegments);
  if (boundaryCount !== 0) {
    try {
      const duration = await getAudioDuration(audioPath);
      // The split count passed on is one more than the boundary count.
      return calculateWhisperSplits(duration, boundaryCount + 1);
    } catch {
      // Best effort: fall through and report no boundaries.
    }
  }
  return undefined;
}
102
+ export {
103
+ inputPreference,
104
+ recognize
105
+ };