@storyteller-platform/ghost-story 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +611 -0
- package/README.md +18 -0
- package/dist/api/APIOptions.cjs +16 -0
- package/dist/api/APIOptions.d.cts +18 -0
- package/dist/api/APIOptions.d.ts +18 -0
- package/dist/api/APIOptions.js +0 -0
- package/dist/api/Recognition.cjs +263 -0
- package/dist/api/Recognition.d.cts +77 -0
- package/dist/api/Recognition.d.ts +77 -0
- package/dist/api/Recognition.js +233 -0
- package/dist/api/VoiceActivityDetection.cjs +77 -0
- package/dist/api/VoiceActivityDetection.d.cts +24 -0
- package/dist/api/VoiceActivityDetection.d.ts +24 -0
- package/dist/api/VoiceActivityDetection.js +43 -0
- package/dist/audio/AudioConverter.cjs +331 -0
- package/dist/audio/AudioConverter.d.cts +53 -0
- package/dist/audio/AudioConverter.d.ts +53 -0
- package/dist/audio/AudioConverter.js +310 -0
- package/dist/audio/AudioFormat.cjs +151 -0
- package/dist/audio/AudioFormat.d.cts +25 -0
- package/dist/audio/AudioFormat.d.ts +25 -0
- package/dist/audio/AudioFormat.js +123 -0
- package/dist/audio/AudioSource.cjs +119 -0
- package/dist/audio/AudioSource.d.cts +33 -0
- package/dist/audio/AudioSource.d.ts +33 -0
- package/dist/audio/AudioSource.js +88 -0
- package/dist/audio/index.cjs +74 -0
- package/dist/audio/index.d.cts +6 -0
- package/dist/audio/index.d.ts +6 -0
- package/dist/audio/index.js +54 -0
- package/dist/cli/bin.cjs +277 -0
- package/dist/cli/bin.d.cts +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +275 -0
- package/dist/cli/config.cjs +347 -0
- package/dist/cli/config.d.cts +33 -0
- package/dist/cli/config.d.ts +33 -0
- package/dist/cli/config.js +285 -0
- package/dist/cli/install.cjs +334 -0
- package/dist/cli/install.d.cts +62 -0
- package/dist/cli/install.d.ts +62 -0
- package/dist/cli/install.js +316 -0
- package/dist/cli/whisper-server.cjs +172 -0
- package/dist/cli/whisper-server.d.cts +24 -0
- package/dist/cli/whisper-server.d.ts +24 -0
- package/dist/cli/whisper-server.js +152 -0
- package/dist/config.cjs +60 -0
- package/dist/config.d.cts +12 -0
- package/dist/config.d.ts +12 -0
- package/dist/config.js +32 -0
- package/dist/convert.cjs +88 -0
- package/dist/convert.d.cts +12 -0
- package/dist/convert.d.ts +12 -0
- package/dist/convert.js +63 -0
- package/dist/encodings/Ascii.cjs +75 -0
- package/dist/encodings/Ascii.d.cts +13 -0
- package/dist/encodings/Ascii.d.ts +13 -0
- package/dist/encodings/Ascii.js +48 -0
- package/dist/encodings/Base64.cjs +155 -0
- package/dist/encodings/Base64.d.cts +5 -0
- package/dist/encodings/Base64.d.ts +5 -0
- package/dist/encodings/Base64.js +129 -0
- package/dist/encodings/TextEncodingsCommon.cjs +16 -0
- package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
- package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
- package/dist/encodings/TextEncodingsCommon.js +0 -0
- package/dist/index.cjs +153 -0
- package/dist/index.d.cts +15 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +140 -0
- package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
- package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.js +160 -0
- package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
- package/dist/recognition/DeepgramSTT.cjs +172 -0
- package/dist/recognition/DeepgramSTT.d.cts +23 -0
- package/dist/recognition/DeepgramSTT.d.ts +23 -0
- package/dist/recognition/DeepgramSTT.js +153 -0
- package/dist/recognition/GoogleCloudSTT.cjs +125 -0
- package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
- package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
- package/dist/recognition/GoogleCloudSTT.js +107 -0
- package/dist/recognition/OpenAICloudSTT.cjs +180 -0
- package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
- package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
- package/dist/recognition/OpenAICloudSTT.js +150 -0
- package/dist/recognition/WhisperCppSTT.cjs +296 -0
- package/dist/recognition/WhisperCppSTT.d.cts +40 -0
- package/dist/recognition/WhisperCppSTT.d.ts +40 -0
- package/dist/recognition/WhisperCppSTT.js +275 -0
- package/dist/recognition/WhisperServerSTT.cjs +119 -0
- package/dist/recognition/WhisperServerSTT.d.cts +24 -0
- package/dist/recognition/WhisperServerSTT.d.ts +24 -0
- package/dist/recognition/WhisperServerSTT.js +105 -0
- package/dist/utilities/FileSystem.cjs +54 -0
- package/dist/utilities/FileSystem.d.cts +3 -0
- package/dist/utilities/FileSystem.d.ts +3 -0
- package/dist/utilities/FileSystem.js +20 -0
- package/dist/utilities/Locale.cjs +46 -0
- package/dist/utilities/Locale.d.cts +9 -0
- package/dist/utilities/Locale.d.ts +9 -0
- package/dist/utilities/Locale.js +20 -0
- package/dist/utilities/ObjectUtilities.cjs +41 -0
- package/dist/utilities/ObjectUtilities.d.cts +3 -0
- package/dist/utilities/ObjectUtilities.d.ts +3 -0
- package/dist/utilities/ObjectUtilities.js +7 -0
- package/dist/utilities/Timeline.cjs +120 -0
- package/dist/utilities/Timeline.d.cts +23 -0
- package/dist/utilities/Timeline.d.ts +23 -0
- package/dist/utilities/Timeline.js +94 -0
- package/dist/utilities/Timing.cjs +287 -0
- package/dist/utilities/Timing.d.cts +64 -0
- package/dist/utilities/Timing.d.ts +64 -0
- package/dist/utilities/Timing.js +256 -0
- package/dist/utilities/WhisperTimeline.cjs +344 -0
- package/dist/utilities/WhisperTimeline.d.cts +86 -0
- package/dist/utilities/WhisperTimeline.d.ts +86 -0
- package/dist/utilities/WhisperTimeline.js +313 -0
- package/dist/vad/ActiveGate.cjs +357 -0
- package/dist/vad/ActiveGate.d.cts +53 -0
- package/dist/vad/ActiveGate.d.ts +53 -0
- package/dist/vad/ActiveGate.js +329 -0
- package/dist/vad/ActiveGateOg.cjs +1366 -0
- package/dist/vad/ActiveGateOg.d.cts +33 -0
- package/dist/vad/ActiveGateOg.d.ts +33 -0
- package/dist/vad/ActiveGateOg.js +1341 -0
- package/dist/vad/Silero.cjs +174 -0
- package/dist/vad/Silero.d.cts +25 -0
- package/dist/vad/Silero.d.ts +25 -0
- package/dist/vad/Silero.js +153 -0
- package/package.json +125 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Bundler-generated CommonJS/ESM interop helpers. The short aliases below are
// referenced by the helper functions that follow.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable getter on `target` for every key of `all`, using the
 * function stored under that key as the getter.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Copies the own properties of `from` onto `to` as live getters, skipping keys
 * that `to` already owns and the single key named by `except`. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (key === except || __hasOwnProp.call(to, key)) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wraps a required module in an ESM-shaped namespace object.
 *
 * If the importer is in node compatibility mode or this is not an ESM file
 * that has been converted to a CommonJS file using a Babel-compatible
 * transform (i.e. "__esModule" has not been set), then "default" is set to
 * the CommonJS "module.exports" for node compatibility.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

/** Produces a CommonJS exports object flagged with a non-enumerable `__esModule`. */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
|
|
29
|
+
var OpenAICloudSTT_exports = {};
|
|
30
|
+
__export(OpenAICloudSTT_exports, {
|
|
31
|
+
inputPreference: () => inputPreference,
|
|
32
|
+
recognize: () => recognize
|
|
33
|
+
});
|
|
34
|
+
module.exports = __toCommonJS(OpenAICloudSTT_exports);
|
|
35
|
+
var import_node_fs = require("node:fs");
|
|
36
|
+
var import_audio = require("../audio/index.cjs");
|
|
37
|
+
var import_ObjectUtilities = require("../utilities/ObjectUtilities.cjs");
|
|
38
|
+
// Service identifier handed to the audio preparation layer.
const SERVICE_ID = "openai-cloud";

// Exported input preference for this recognizer.
const inputPreference = "stream";

// Baseline option values; caller-supplied options are merged over these in
// `recognize`. Entries left undefined are resolved there or passed through.
const defaultOptions = {
  apiKey: undefined,
  organization: undefined,
  baseURL: undefined,
  model: undefined,
  temperature: 0,
  prompt: undefined,
  timeout: undefined,
  maxRetries: 10,
  requestWordTimestamps: undefined
};
|
|
51
|
+
/**
 * Transcribes audio through an OpenAI-compatible transcription endpoint.
 *
 * @param input Raw audio input or an already-normalized audio source.
 * @param languageCode Sent to the API as the `language` parameter.
 * @param options Caller options merged over `defaultOptions`; may also carry
 *   `inputFormat` and an optional `timing` recorder.
 * @returns {Promise<{transcript: string, timeline: Array}>} Trimmed transcript
 *   text plus the timeline extracted from the verbose response.
 * @throws {Error} When `baseURL` is set without a `model`, when the prepared
 *   audio has no file path, or when no timeline can be extracted.
 */
async function recognize(input, languageCode, options) {
  const opts = (0, import_ObjectUtilities.extendDeep)(defaultOptions, options);
  const timing = opts.timing;
  // Unless the caller decided explicitly, word timestamps are requested only
  // when no custom base URL is configured.
  if (opts.requestWordTimestamps === void 0) {
    opts.requestWordTimestamps = opts.baseURL === void 0;
  }
  if (opts.model === void 0) {
    if (opts.baseURL === void 0) {
      opts.model = "whisper-1";
    } else {
      throw new Error(
        "A custom provider for the OpenAI Cloud API requires specifying a model name"
      );
    }
  }
  const source = (0, import_audio.isAudioSource)(input) ? input : (0, import_audio.normalizeToAudioSource)(input, opts.inputFormat);
  const doPrepare = () => (0, import_audio.prepareForService)(source, { service: SERVICE_ID, preferFile: true });
  const prepared = timing ? await timing.timeAsync("conversion", doPrepare) : await doPrepare();
  const conversionOccurred = source.format !== prepared.source.format;
  timing == null ? void 0 : timing.setMetadata("conversionRequired", conversionOccurred);
  timing == null ? void 0 : timing.setMetadata("targetFormat", prepared.source.format);
  // Declared outside the try block so the finally clause can release it.
  let file;
  try {
    const { default: OpenAI } = await import("openai");
    const openai = new OpenAI(opts);
    const filePath = (0, import_audio.toFilePath)(prepared.source);
    if (!filePath) {
      throw new Error(
        "OpenAI Cloud STT requires a file path. The audio could not be prepared as a file."
      );
    }
    file = (0, import_node_fs.createReadStream)(filePath);
    const timestamp_granularities = opts.requestWordTimestamps ? ["word", "segment"] : void 0;
    const doUpload = () => openai.audio.transcriptions.create({
      file,
      model: opts.model,
      language: languageCode,
      prompt: opts.prompt,
      response_format: "verbose_json",
      temperature: opts.temperature,
      timestamp_granularities,
      stream: false
    });
    const response = timing ? await timing.timeAsync("upload", doUpload) : await doUpload();
    const transcript = response.text.trim();
    const timeline = extractTimeline(response);
    if (!timeline) {
      throw new Error("Failed to extract timeline from OpenAI Cloud response");
    }
    return { transcript, timeline };
  } finally {
    // Close the read stream first so the file is not still held open when
    // cleanup runs (a failed upload may leave the stream unconsumed).
    // Destroying an already-finished stream is a no-op.
    if (file) {
      file.destroy();
    }
    await prepared.cleanup();
  }
}
|
|
105
|
+
/**
 * Builds a timeline from a verbose transcription response.
 *
 * Sources are tried in order: top-level `response.words`, then words nested
 * inside the segments (decided by looking at the first segment only), then
 * the segments themselves.
 *
 * @param response Verbose transcription response object.
 * @returns {Array|undefined} Timeline entries, or `undefined` when the
 *   response carries neither `words` nor a `segments` array, so the caller
 *   can report a clear error instead of hitting a TypeError here.
 */
function extractTimeline(response) {
  if (response.words) {
    return response.words.map((entry) => ({
      type: "word",
      text: entry.word,
      startTime: entry.start,
      endTime: entry.end
    }));
  }
  const segments = response.segments;
  if (!Array.isArray(segments)) {
    return void 0;
  }
  const firstSegment = segments.length > 0 ? segments[0] : void 0;
  const firstWords = firstSegment == null ? void 0 : firstSegment.words;
  if (firstWords && firstWords.length > 0) {
    return extractWordTimelineFromSegments(segments);
  }
  return segments.map((entry) => ({
    type: "segment",
    text: entry.text,
    startTime: entry.start,
    endTime: entry.end
  }));
}
|
|
126
|
+
/**
 * Flattens the words nested in `segments` into a single word timeline.
 *
 * Each word's start and end are shifted by the offset of the most recent
 * split point reported by `findSplitOffsets`. Words whose trimmed text is
 * empty or contains "BLANK_AUDIO" are dropped, and a missing `probability`
 * becomes a confidence of 0.
 *
 * @param segments Segment objects, each optionally carrying a `words` array.
 * @returns {Array} Word timeline entries.
 */
function extractWordTimelineFromSegments(segments) {
  if (segments.length === 0) {
    return [];
  }
  const offsets = findSplitOffsets(segments);
  const timeline = [];
  let cursor = 0;
  for (const [index, segment] of segments.entries()) {
    const words = segment == null ? void 0 : segment.words;
    if (!words || words.length === 0 || !words[0]) {
      continue;
    }
    // Move to the latest split point that begins at or before this segment.
    while (cursor < offsets.length - 1) {
      const upcoming = offsets[cursor + 1];
      const upcomingStart = (upcoming == null ? void 0 : upcoming.segmentIndex) ?? -1;
      if (index < upcomingStart) {
        break;
      }
      cursor++;
    }
    const active = cursor < offsets.length ? offsets[cursor] : void 0;
    const shift = (active == null ? void 0 : active.offset) ?? 0;
    for (const word of words) {
      const text = word.word.trim();
      const isBlank = text === "" || text.includes("BLANK_AUDIO");
      if (isBlank) {
        continue;
      }
      timeline.push({
        type: "word",
        text,
        startTime: word.start + shift,
        endTime: word.end + shift,
        confidence: word.probability ?? 0
      });
    }
  }
  return timeline;
}
|
|
159
|
+
/**
 * Finds the segments at which nested word timestamps start over.
 *
 * A split point is recorded for the first segment that has words, and for any
 * later segment whose first word starts more than 0.5 before the previous
 * segment's last word ended. Segments without words are ignored.
 *
 * @param segments Segment objects with `start` and an optional `words` array.
 * @returns {Array<{segmentIndex: number, offset: number}>} Split points, where
 *   `offset` is the `start` of the segment that opens the split.
 */
function findSplitOffsets(segments) {
  const offsets = [];
  let previousEnd = -1;
  for (const [index, segment] of segments.entries()) {
    const words = segment == null ? void 0 : segment.words;
    if (!words || words.length === 0 || !words[0]) {
      continue;
    }
    const isFirstWithWords = previousEnd === -1;
    const jumpsBackwards = words[0].start < previousEnd - 0.5;
    if (jumpsBackwards || isFirstWithWords) {
      offsets.push({ segmentIndex: index, offset: segment.start });
    }
    const lastWord = words[words.length - 1];
    previousEnd = (lastWord == null ? void 0 : lastWord.end) ?? -1;
  }
  return offsets;
}
|
|
176
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
177
|
+
0 && (module.exports = {
|
|
178
|
+
inputPreference,
|
|
179
|
+
recognize
|
|
180
|
+
});
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.cjs';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
|
|
3
|
+
import { Timeline } from '../utilities/Timeline.cjs';
|
|
4
|
+
import { Timing } from '../utilities/Timing.cjs';
|
|
5
|
+
import 'node:fs';
|
|
6
|
+
import 'node:stream';
|
|
7
|
+
|
|
8
|
+
/** Input preference advertised by this recognizer. */
type InputPreference = "stream";
declare const inputPreference: InputPreference;
/** Options accepted by the OpenAI Cloud speech-to-text recognizer. */
interface OpenAICloudSTTOptions {
    /**
     * Model name. Defaults to "whisper-1" when no `baseURL` is set; a custom
     * `baseURL` requires an explicit model name, so any string is accepted.
     */
    model?: "whisper-1" | (string & {}) | undefined;
    apiKey?: string | undefined;
    organization?: string | undefined;
    /** Custom endpoint; when set, `model` must be provided. */
    baseURL?: string | undefined;
    /** Defaults to 0. */
    temperature?: number | undefined;
    prompt?: string | undefined;
    timeout?: number | undefined;
    /** Defaults to 10. */
    maxRetries?: number | undefined;
    /** When left undefined, word timestamps are requested only if no `baseURL` is set. */
    requestWordTimestamps?: boolean | undefined;
    inputFormat?: AudioFormat;
    timing?: Timing | undefined;
}
/** Result of a recognition call. */
interface RecognitionResult {
    transcript: string;
    timeline?: Timeline;
}
declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options: OpenAICloudSTTOptions): Promise<RecognitionResult>;

export { type InputPreference, type OpenAICloudSTTOptions, type RecognitionResult, inputPreference, recognize };
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.js';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
|
|
3
|
+
import { Timeline } from '../utilities/Timeline.js';
|
|
4
|
+
import { Timing } from '../utilities/Timing.js';
|
|
5
|
+
import 'node:fs';
|
|
6
|
+
import 'node:stream';
|
|
7
|
+
|
|
8
|
+
/** Input preference advertised by this recognizer. */
type InputPreference = "stream";
declare const inputPreference: InputPreference;
/** Options accepted by the OpenAI Cloud speech-to-text recognizer. */
interface OpenAICloudSTTOptions {
    /**
     * Model name. Defaults to "whisper-1" when no `baseURL` is set; a custom
     * `baseURL` requires an explicit model name, so any string is accepted.
     */
    model?: "whisper-1" | (string & {}) | undefined;
    apiKey?: string | undefined;
    organization?: string | undefined;
    /** Custom endpoint; when set, `model` must be provided. */
    baseURL?: string | undefined;
    /** Defaults to 0. */
    temperature?: number | undefined;
    prompt?: string | undefined;
    timeout?: number | undefined;
    /** Defaults to 10. */
    maxRetries?: number | undefined;
    /** When left undefined, word timestamps are requested only if no `baseURL` is set. */
    requestWordTimestamps?: boolean | undefined;
    inputFormat?: AudioFormat;
    timing?: Timing | undefined;
}
/** Result of a recognition call. */
interface RecognitionResult {
    transcript: string;
    timeline?: Timeline;
}
declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options: OpenAICloudSTTOptions): Promise<RecognitionResult>;

export { type InputPreference, type OpenAICloudSTTOptions, type RecognitionResult, inputPreference, recognize };
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import { createReadStream } from "node:fs";
|
|
2
|
+
import {
|
|
3
|
+
isAudioSource,
|
|
4
|
+
normalizeToAudioSource,
|
|
5
|
+
prepareForService,
|
|
6
|
+
toFilePath
|
|
7
|
+
} from "../audio/index.js";
|
|
8
|
+
import { extendDeep } from "../utilities/ObjectUtilities.js";
|
|
9
|
+
// Service identifier handed to the audio preparation layer.
const SERVICE_ID = "openai-cloud";

// Exported input preference for this recognizer.
const inputPreference = "stream";

// Baseline option values; caller-supplied options are merged over these in
// `recognize`. Entries left undefined are resolved there or passed through.
const defaultOptions = {
  apiKey: undefined,
  organization: undefined,
  baseURL: undefined,
  model: undefined,
  temperature: 0,
  prompt: undefined,
  timeout: undefined,
  maxRetries: 10,
  requestWordTimestamps: undefined
};
|
|
22
|
+
/**
 * Transcribes audio through an OpenAI-compatible transcription endpoint.
 *
 * @param input Raw audio input or an already-normalized audio source.
 * @param languageCode Sent to the API as the `language` parameter.
 * @param options Caller options merged over `defaultOptions`; may also carry
 *   `inputFormat` and an optional `timing` recorder.
 * @returns {Promise<{transcript: string, timeline: Array}>} Trimmed transcript
 *   text plus the timeline extracted from the verbose response.
 * @throws {Error} When `baseURL` is set without a `model`, when the prepared
 *   audio has no file path, or when no timeline can be extracted.
 */
async function recognize(input, languageCode, options) {
  const opts = extendDeep(defaultOptions, options);
  const timing = opts.timing;
  // Unless the caller decided explicitly, word timestamps are requested only
  // when no custom base URL is configured.
  if (opts.requestWordTimestamps === void 0) {
    opts.requestWordTimestamps = opts.baseURL === void 0;
  }
  if (opts.model === void 0) {
    if (opts.baseURL === void 0) {
      opts.model = "whisper-1";
    } else {
      throw new Error(
        "A custom provider for the OpenAI Cloud API requires specifying a model name"
      );
    }
  }
  const source = isAudioSource(input) ? input : normalizeToAudioSource(input, opts.inputFormat);
  const doPrepare = () => prepareForService(source, { service: SERVICE_ID, preferFile: true });
  const prepared = timing ? await timing.timeAsync("conversion", doPrepare) : await doPrepare();
  const conversionOccurred = source.format !== prepared.source.format;
  timing == null ? void 0 : timing.setMetadata("conversionRequired", conversionOccurred);
  timing == null ? void 0 : timing.setMetadata("targetFormat", prepared.source.format);
  // Declared outside the try block so the finally clause can release it.
  let file;
  try {
    const { default: OpenAI } = await import("openai");
    const openai = new OpenAI(opts);
    const filePath = toFilePath(prepared.source);
    if (!filePath) {
      throw new Error(
        "OpenAI Cloud STT requires a file path. The audio could not be prepared as a file."
      );
    }
    file = createReadStream(filePath);
    const timestamp_granularities = opts.requestWordTimestamps ? ["word", "segment"] : void 0;
    const doUpload = () => openai.audio.transcriptions.create({
      file,
      model: opts.model,
      language: languageCode,
      prompt: opts.prompt,
      response_format: "verbose_json",
      temperature: opts.temperature,
      timestamp_granularities,
      stream: false
    });
    const response = timing ? await timing.timeAsync("upload", doUpload) : await doUpload();
    const transcript = response.text.trim();
    const timeline = extractTimeline(response);
    if (!timeline) {
      throw new Error("Failed to extract timeline from OpenAI Cloud response");
    }
    return { transcript, timeline };
  } finally {
    // Close the read stream first so the file is not still held open when
    // cleanup runs (a failed upload may leave the stream unconsumed).
    // Destroying an already-finished stream is a no-op.
    if (file) {
      file.destroy();
    }
    await prepared.cleanup();
  }
}
|
|
76
|
+
/**
 * Builds a timeline from a verbose transcription response.
 *
 * Sources are tried in order: top-level `response.words`, then words nested
 * inside the segments (decided by looking at the first segment only), then
 * the segments themselves.
 *
 * @param response Verbose transcription response object.
 * @returns {Array|undefined} Timeline entries, or `undefined` when the
 *   response carries neither `words` nor a `segments` array, so the caller
 *   can report a clear error instead of hitting a TypeError here.
 */
function extractTimeline(response) {
  if (response.words) {
    return response.words.map((entry) => ({
      type: "word",
      text: entry.word,
      startTime: entry.start,
      endTime: entry.end
    }));
  }
  const segments = response.segments;
  if (!Array.isArray(segments)) {
    return void 0;
  }
  const firstSegment = segments.length > 0 ? segments[0] : void 0;
  const firstWords = firstSegment == null ? void 0 : firstSegment.words;
  if (firstWords && firstWords.length > 0) {
    return extractWordTimelineFromSegments(segments);
  }
  return segments.map((entry) => ({
    type: "segment",
    text: entry.text,
    startTime: entry.start,
    endTime: entry.end
  }));
}
|
|
97
|
+
/**
 * Flattens the words nested in `segments` into a single word timeline.
 *
 * Each word's start and end are shifted by the offset of the most recent
 * split point reported by `findSplitOffsets`. Words whose trimmed text is
 * empty or contains "BLANK_AUDIO" are dropped, and a missing `probability`
 * becomes a confidence of 0.
 *
 * @param segments Segment objects, each optionally carrying a `words` array.
 * @returns {Array} Word timeline entries.
 */
function extractWordTimelineFromSegments(segments) {
  if (segments.length === 0) {
    return [];
  }
  const offsets = findSplitOffsets(segments);
  const timeline = [];
  let cursor = 0;
  for (const [index, segment] of segments.entries()) {
    const words = segment == null ? void 0 : segment.words;
    if (!words || words.length === 0 || !words[0]) {
      continue;
    }
    // Move to the latest split point that begins at or before this segment.
    while (cursor < offsets.length - 1) {
      const upcoming = offsets[cursor + 1];
      const upcomingStart = (upcoming == null ? void 0 : upcoming.segmentIndex) ?? -1;
      if (index < upcomingStart) {
        break;
      }
      cursor++;
    }
    const active = cursor < offsets.length ? offsets[cursor] : void 0;
    const shift = (active == null ? void 0 : active.offset) ?? 0;
    for (const word of words) {
      const text = word.word.trim();
      const isBlank = text === "" || text.includes("BLANK_AUDIO");
      if (isBlank) {
        continue;
      }
      timeline.push({
        type: "word",
        text,
        startTime: word.start + shift,
        endTime: word.end + shift,
        confidence: word.probability ?? 0
      });
    }
  }
  return timeline;
}
|
|
130
|
+
/**
 * Finds the segments at which nested word timestamps start over.
 *
 * A split point is recorded for the first segment that has words, and for any
 * later segment whose first word starts more than 0.5 before the previous
 * segment's last word ended. Segments without words are ignored.
 *
 * @param segments Segment objects with `start` and an optional `words` array.
 * @returns {Array<{segmentIndex: number, offset: number}>} Split points, where
 *   `offset` is the `start` of the segment that opens the split.
 */
function findSplitOffsets(segments) {
  const offsets = [];
  let previousEnd = -1;
  for (const [index, segment] of segments.entries()) {
    const words = segment == null ? void 0 : segment.words;
    if (!words || words.length === 0 || !words[0]) {
      continue;
    }
    const isFirstWithWords = previousEnd === -1;
    const jumpsBackwards = words[0].start < previousEnd - 0.5;
    if (jumpsBackwards || isFirstWithWords) {
      offsets.push({ segmentIndex: index, offset: segment.start });
    }
    const lastWord = words[words.length - 1];
    previousEnd = (lastWord == null ? void 0 : lastWord.end) ?? -1;
  }
  return offsets;
}
|
|
147
|
+
export {
|
|
148
|
+
inputPreference,
|
|
149
|
+
recognize
|
|
150
|
+
};
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Bundler-generated CommonJS/ESM interop helpers. The short aliases below are
// referenced by the helper functions that follow.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable getter on `target` for every key of `all`, using the
 * function stored under that key as the getter.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Copies the own properties of `from` onto `to` as live getters, skipping keys
 * that `to` already owns and the single key named by `except`. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (key === except || __hasOwnProp.call(to, key)) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wraps a required module in an ESM-shaped namespace object.
 *
 * If the importer is in node compatibility mode or this is not an ESM file
 * that has been converted to a CommonJS file using a Babel-compatible
 * transform (i.e. "__esModule" has not been set), then "default" is set to
 * the CommonJS "module.exports" for node compatibility.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

/** Produces a CommonJS exports object flagged with a non-enumerable `__esModule`. */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
|
|
29
|
+
var WhisperCppSTT_exports = {};
|
|
30
|
+
__export(WhisperCppSTT_exports, {
|
|
31
|
+
ensureModelDownloaded: () => ensureModelDownloaded,
|
|
32
|
+
ensureWhisperCppInstalled: () => ensureWhisperCppInstalled,
|
|
33
|
+
inputPreference: () => inputPreference,
|
|
34
|
+
recognize: () => recognize
|
|
35
|
+
});
|
|
36
|
+
module.exports = __toCommonJS(WhisperCppSTT_exports);
|
|
37
|
+
var import_node_child_process = require("node:child_process");
|
|
38
|
+
var import_node_fs = __toESM(require("node:fs"), 1);
|
|
39
|
+
var import_node_os = __toESM(require("node:os"), 1);
|
|
40
|
+
var import_node_path = __toESM(require("node:path"), 1);
|
|
41
|
+
var import_fs_extra = require("fs-extra");
|
|
42
|
+
var import_audio = require("../audio/index.cjs");
|
|
43
|
+
var import_config = require("../cli/config.cjs");
|
|
44
|
+
var import_install = require("../cli/install.cjs");
|
|
45
|
+
var import_WhisperTimeline = require("../utilities/WhisperTimeline.cjs");
|
|
46
|
+
// Exported input preference for this recognizer.
const inputPreference = "file";

// Baseline option values; caller-supplied options are spread over these in
// `recognize`.
const defaultOptions = {
  processors: 1,
  threads: 4,
  flashAttention: true,
  suppressNonSpeechTokens: true,
  tokenLevelTimestamps: true,
  printOutput: false,
  model: "tiny.en",
  autoInstall: true
};

// Source formats used as-is; any other format is converted to WAV first.
const acceptedFormats = [
  "wav",
  "flac",
  "ogg",
  "mp3"
];
|
|
58
|
+
/**
 * Transcribes audio with a local whisper.cpp executable.
 *
 * Steps: optionally install the binary and model, convert the audio to
 * 16 kHz mono WAV when its format is not in `acceptedFormats`, run the
 * transcription, then build a timeline from the parsed output.
 *
 * @param input Raw audio input or an already-normalized audio source.
 * @param options Caller options spread over `defaultOptions`; may also carry
 *   `modelDir`, `installDir`, `inputFormat`, `language`, `onProgress`,
 *   `signal` and an optional `timing` recorder.
 * @returns {Promise<{transcript: string, timeline: *, language: *}>}
 * @throws {Error} When the prepared audio has no file path or the file does
 *   not exist.
 */
async function recognize(input, options) {
  const opts = { ...defaultOptions, ...options };
  const { timing } = opts;
  // Runs `task` under the timing recorder when one was supplied.
  const timed = (label, task) => timing ? timing.timeAsync(label, task) : task();

  const modelDir = opts.modelDir ?? (0, import_config.getModelDir)();
  const installDir = opts.installDir ?? (0, import_config.getInstallDir)();
  const source = (0, import_audio.isAudioSource)(input) ? input : (0, import_audio.normalizeToAudioSource)(input, opts.inputFormat);
  await (0, import_fs_extra.ensureDir)(modelDir);

  if (opts.autoInstall) {
    await timed("installation", async () => {
      await ensureWhisperCppInstalled();
      await ensureModelDownloaded(modelDir, opts.model, opts.printOutput);
    });
  }

  const conversionNeeded = !acceptedFormats.includes(source.format);
  if (timing != null) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", conversionNeeded ? "wav" : source.format);
  }

  const prepared = await timed("conversion", async () => {
    if (conversionNeeded) {
      return (0, import_audio.prepareWavForService)(source, { sampleRate: 16e3, channels: 1 });
    }
    // Nothing was created, so cleanup has nothing to undo.
    return { source, cleanup: async () => {
    } };
  });

  try {
    const inputPath = (0, import_audio.toFilePath)(prepared.source);
    if (!inputPath) {
      throw new Error(
        "whisper.cpp requires a file path. The audio could not be prepared as a file."
      );
    }
    if (!(0, import_node_fs.existsSync)(inputPath)) {
      throw new Error(`Input file does not exist: ${inputPath}`);
    }
    const audioDuration = await (0, import_audio.getAudioDuration)(inputPath);
    const effectiveProcessors = (0, import_WhisperTimeline.calculateEffectiveProcessors)(
      audioDuration,
      opts.processors
    );
    const transcription = await timed("transcription", () => transcribe({
      inputPath,
      model: opts.model,
      installDir,
      modelFolder: modelDir,
      language: opts.language ?? null,
      tokenLevelTimestamps: opts.tokenLevelTimestamps,
      printOutput: opts.printOutput,
      flashAttention: opts.flashAttention,
      suppressNonSpeechTokens: opts.suppressNonSpeechTokens,
      processors: effectiveProcessors,
      threads: opts.threads,
      onProgress: opts.onProgress ?? null,
      signal: opts.signal ?? null
    }));
    const rawSegments = (0, import_WhisperTimeline.parseWhisperCppOutput)(transcription.transcription);
    // Split boundaries are only computed when more than one processor ran.
    const splitBoundaries = effectiveProcessors > 1 ? (0, import_WhisperTimeline.calculateWhisperSplits)(audioDuration, effectiveProcessors) : [];
    const timeline = (0, import_WhisperTimeline.extractCorrectedTimeline)(rawSegments, {
      splitBoundaries: splitBoundaries.length > 0 ? splitBoundaries : void 0
    });
    const pieces = transcription.transcription.map((s) => s.text);
    const transcript = pieces.join("").trim();
    return {
      transcript,
      timeline,
      language: transcription.result.language
    };
  } finally {
    await prepared.cleanup();
  }
}
|
|
131
|
+
/**
 * Runs the binary installer with output printing disabled.
 * NOTE(review): no install directory is passed here — confirm the installer's
 * default matches the `installDir` that `recognize` later looks in.
 */
async function ensureWhisperCppInstalled() {
  const installOptions = { printOutput: false };
  await (0, import_install.installBinary)(installOptions);
}
|
|
134
|
+
/**
 * Installs the named model into `modelDir` unless its file already exists.
 *
 * @param modelDir Directory that holds model files.
 * @param modelName Model identifier.
 * @param printOutput Forwarded to the model installer.
 */
async function ensureModelDownloaded(modelDir, modelName, printOutput) {
  const modelPath = (0, import_config.getModelPath)(modelName, modelDir);
  const alreadyPresent = (0, import_node_fs.existsSync)(modelPath);
  if (!alreadyPresent) {
    await (0, import_install.installModel)({
      model: modelName,
      modelDir,
      printOutput
    });
  }
}
|
|
145
|
+
/**
 * Builds the path of a model file inside `folder`, named `ggml-<model>.bin`.
 * NOTE(review): the argument order here is (folder, model), the reverse of
 * the `getModelPath` from the config module that is called as (model, dir).
 */
function getModelPath(folder, model) {
  const fileName = `ggml-${model}.bin`;
  return import_node_path.default.join(folder, fileName);
}
|
|
148
|
+
/**
 * Runs the whisper executable on one audio file and returns its parsed JSON
 * output.
 *
 * @param options Paths (`inputPath`, `installDir`, `modelFolder`), the model
 *   name, decoding flags, `processors`/`threads`, plus `onProgress` and
 *   `signal`, which are forwarded to the process runner.
 * @returns {Promise<object>} Parsed contents of the JSON file the process wrote.
 * @throws {Error} When the executable or the model file is missing; errors
 *   from the process, file read or JSON parse are rethrown after the temporary
 *   output file has been removed.
 */
async function transcribe(options) {
  const {
    inputPath,
    model,
    installDir,
    modelFolder,
    language,
    tokenLevelTimestamps,
    printOutput,
    flashAttention,
    suppressNonSpeechTokens,
    processors,
    threads,
    onProgress,
    signal
  } = options;
  const executable = (0, import_config.getWhisperExecutablePath)(installDir);
  const modelPath = getModelPath(modelFolder, model);
  if (!(0, import_node_fs.existsSync)(executable)) {
    throw new Error(`Whisper executable not found at ${executable}`);
  }
  if (!(0, import_node_fs.existsSync)(modelPath)) {
    throw new Error(`Model not found at ${modelPath}`);
  }
  const tmpDir = import_node_path.default.join(import_node_os.default.tmpdir(), "ghost-story-whisper");
  await (0, import_fs_extra.ensureDir)(tmpDir);
  // A timestamp alone collides when several transcriptions start in the same
  // millisecond (or in different processes), so the process id and a random
  // suffix are appended to keep each output file distinct.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tmpJsonPath = import_node_path.default.join(tmpDir, `transcription-${uniqueSuffix}`);
  const args = buildTranscribeArgs({
    inputPath,
    modelPath,
    outputPath: tmpJsonPath,
    model,
    language,
    tokenLevelTimestamps,
    flashAttention,
    suppressNonSpeechTokens,
    processors,
    threads
  });
  try {
    const outputPath = await runWhisperProcess({
      executable,
      args,
      cwd: installDir,
      printOutput,
      onProgress,
      signal,
      expectedOutputPath: `${tmpJsonPath}.json`
    });
    const json = JSON.parse(
      await import_node_fs.default.promises.readFile(outputPath, "utf8")
    );
    // Best-effort removal, awaited so no promise is left floating; a failed
    // unlink must not fail an otherwise successful transcription.
    await import_node_fs.default.promises.unlink(outputPath).catch(() => {
    });
    return json;
  } catch (error) {
    // Best-effort removal of whatever the process may have written.
    await import_node_fs.default.promises.unlink(`${tmpJsonPath}.json`).catch(() => {
    });
    throw error;
  }
}
|
|
209
|
+
/**
 * Builds the command-line argument list for the whisper executable.
 *
 * Always emitted: `--file`, `--output-file`, `--output-json-full`, `--model`
 * and `--print-progress`. Optional flags are appended only when the matching
 * option is set.
 *
 * @param {object} options
 * @param {string} options.inputPath - Value for `--file`.
 * @param {string} options.outputPath - Value for `--output-file`.
 * @param {string} options.modelPath - Value for `--model`.
 * @param {string} [options.language] - Lower-cased and passed as `--language` when truthy.
 * @param {boolean} [options.flashAttention] - Adds `--flash-attn` when truthy.
 * @param {boolean} [options.suppressNonSpeechTokens] - Adds `--suppress-nst` and
 *   `--no-prints` when truthy.
 * @param {number} [options.processors] - Passed as `--processors` unless null/undefined.
 * @param {number} [options.threads] - Passed as `--threads` unless null/undefined.
 * @returns {string[]} Flat argument array suitable for `child_process.spawn`.
 */
function buildTranscribeArgs(options) {
  const args = [
    "--file",
    options.inputPath,
    "--output-file",
    options.outputPath,
    "--output-json-full",
    "--model",
    options.modelPath,
    "--print-progress"
  ];
  if (options.language) {
    args.push("--language", options.language.toLowerCase());
  }
  if (options.flashAttention) {
    args.push("--flash-attn");
  }
  if (options.suppressNonSpeechTokens) {
    args.push("--suppress-nst", "--no-prints");
  }
  // String(undefined) would hand the literal "undefined" to the executable;
  // leave the flag out instead so the executable falls back to its own default.
  if (options.processors != null) {
    args.push("--processors", String(options.processors));
  }
  if (options.threads != null) {
    args.push("--threads", String(options.threads));
  }
  return args;
}
|
|
227
|
+
/**
 * Spawns the whisper executable and resolves with the path of its output file.
 *
 * Everything the child writes to stdout and stderr is accumulated (and
 * optionally echoed) so it can be scanned for progress lines and included in
 * error messages.
 *
 * @param {object} options
 * @param {string} options.executable - Program to spawn.
 * @param {string[]} options.args - Arguments for the program.
 * @param {string} options.cwd - Working directory of the child process.
 * @param {boolean} [options.printOutput] - Echo child stdout/stderr to this process.
 * @param {(fraction: number) => void} [options.onProgress] - Receives the value parsed
 *   from `progress = N` lines divided by 100, and `1` once the output file is found.
 * @param {AbortSignal} [options.signal] - Aborts the child process.
 * @param {string} options.expectedOutputPath - File whose existence marks success.
 * @returns {Promise<string>} Resolves with `expectedOutputPath` when that file
 *   exists after the child has finished.
 * @throws {Error} "Signal aborted" when the signal is aborted before or during
 *   the run; otherwise an error describing the kill signal, the 16 kHz
 *   requirement, the exit code, or the spawn failure.
 */
function runWhisperProcess(options) {
  const {
    executable,
    args,
    cwd,
    printOutput,
    onProgress,
    signal,
    expectedOutputPath
  } = options;
  if (signal?.aborted) {
    return Promise.reject(new Error("Signal aborted"));
  }
  return new Promise((resolve, reject) => {
    const task = (0, import_node_child_process.spawn)(executable, args, { cwd, signal: signal ?? void 0 });
    let output = "";
    // Collects child output and forwards any progress percentage found in it.
    const handleData = (data) => {
      const str = data.toString("utf-8");
      output += str;
      if (str.includes("progress =")) {
        const match = str.match(/progress\s*=\s*([\d.]+)/);
        if (match?.[1]) {
          onProgress?.(parseFloat(match[1]) / 100);
        }
      }
    };
    task.stdout.on("data", (data) => {
      handleData(data);
      if (printOutput) {
        process.stdout.write(data);
      }
    });
    task.stderr.on("data", (data) => {
      handleData(data);
      if (printOutput) {
        process.stderr.write(data);
      }
    });
    // "close" fires only after the stdio streams have ended, whereas "exit" can
    // fire while output is still buffered; waiting for "close" guarantees that
    // `output` is complete before it is inspected or put into an error message.
    task.on("close", (code, exitSignal) => {
      // The presence of the output file is treated as success regardless of
      // how the process ended.
      if ((0, import_node_fs.existsSync)(expectedOutputPath)) {
        onProgress?.(1);
        resolve(expectedOutputPath);
        return;
      }
      if (exitSignal) {
        reject(new Error(`Process killed with signal ${exitSignal}: ${output}`));
        return;
      }
      if (output.includes("must be 16 kHz")) {
        reject(
          new Error(
            "Audio file must be 16 kHz. Convert your audio to 16-bit, 16KHz WAV format."
          )
        );
        return;
      }
      reject(new Error(`Transcription failed (exit code ${code}): ${output}`));
    });
    task.on("error", (err) => {
      // Aborting via the signal surfaces here as an AbortError; report it the
      // same way as the pre-spawn check instead of as a start-up failure.
      if (err && err.name === "AbortError") {
        reject(new Error("Signal aborted", { cause: err }));
        return;
      }
      reject(new Error(`Failed to start whisper process: ${err.message}`));
    });
  });
}
|
|
290
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
291
|
+
0 && (module.exports = {
|
|
292
|
+
ensureModelDownloaded,
|
|
293
|
+
ensureWhisperCppInstalled,
|
|
294
|
+
inputPreference,
|
|
295
|
+
recognize
|
|
296
|
+
});
|