@storyteller-platform/ghost-story 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +611 -0
- package/README.md +18 -0
- package/dist/api/APIOptions.cjs +16 -0
- package/dist/api/APIOptions.d.cts +18 -0
- package/dist/api/APIOptions.d.ts +18 -0
- package/dist/api/APIOptions.js +0 -0
- package/dist/api/Recognition.cjs +263 -0
- package/dist/api/Recognition.d.cts +77 -0
- package/dist/api/Recognition.d.ts +77 -0
- package/dist/api/Recognition.js +233 -0
- package/dist/api/VoiceActivityDetection.cjs +77 -0
- package/dist/api/VoiceActivityDetection.d.cts +24 -0
- package/dist/api/VoiceActivityDetection.d.ts +24 -0
- package/dist/api/VoiceActivityDetection.js +43 -0
- package/dist/audio/AudioConverter.cjs +331 -0
- package/dist/audio/AudioConverter.d.cts +53 -0
- package/dist/audio/AudioConverter.d.ts +53 -0
- package/dist/audio/AudioConverter.js +310 -0
- package/dist/audio/AudioFormat.cjs +151 -0
- package/dist/audio/AudioFormat.d.cts +25 -0
- package/dist/audio/AudioFormat.d.ts +25 -0
- package/dist/audio/AudioFormat.js +123 -0
- package/dist/audio/AudioSource.cjs +119 -0
- package/dist/audio/AudioSource.d.cts +33 -0
- package/dist/audio/AudioSource.d.ts +33 -0
- package/dist/audio/AudioSource.js +88 -0
- package/dist/audio/index.cjs +74 -0
- package/dist/audio/index.d.cts +6 -0
- package/dist/audio/index.d.ts +6 -0
- package/dist/audio/index.js +54 -0
- package/dist/cli/bin.cjs +277 -0
- package/dist/cli/bin.d.cts +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +275 -0
- package/dist/cli/config.cjs +347 -0
- package/dist/cli/config.d.cts +33 -0
- package/dist/cli/config.d.ts +33 -0
- package/dist/cli/config.js +285 -0
- package/dist/cli/install.cjs +334 -0
- package/dist/cli/install.d.cts +62 -0
- package/dist/cli/install.d.ts +62 -0
- package/dist/cli/install.js +316 -0
- package/dist/cli/whisper-server.cjs +172 -0
- package/dist/cli/whisper-server.d.cts +24 -0
- package/dist/cli/whisper-server.d.ts +24 -0
- package/dist/cli/whisper-server.js +152 -0
- package/dist/config.cjs +60 -0
- package/dist/config.d.cts +12 -0
- package/dist/config.d.ts +12 -0
- package/dist/config.js +32 -0
- package/dist/convert.cjs +88 -0
- package/dist/convert.d.cts +12 -0
- package/dist/convert.d.ts +12 -0
- package/dist/convert.js +63 -0
- package/dist/encodings/Ascii.cjs +75 -0
- package/dist/encodings/Ascii.d.cts +13 -0
- package/dist/encodings/Ascii.d.ts +13 -0
- package/dist/encodings/Ascii.js +48 -0
- package/dist/encodings/Base64.cjs +155 -0
- package/dist/encodings/Base64.d.cts +5 -0
- package/dist/encodings/Base64.d.ts +5 -0
- package/dist/encodings/Base64.js +129 -0
- package/dist/encodings/TextEncodingsCommon.cjs +16 -0
- package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
- package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
- package/dist/encodings/TextEncodingsCommon.js +0 -0
- package/dist/index.cjs +153 -0
- package/dist/index.d.cts +15 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +140 -0
- package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
- package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.js +160 -0
- package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
- package/dist/recognition/DeepgramSTT.cjs +172 -0
- package/dist/recognition/DeepgramSTT.d.cts +23 -0
- package/dist/recognition/DeepgramSTT.d.ts +23 -0
- package/dist/recognition/DeepgramSTT.js +153 -0
- package/dist/recognition/GoogleCloudSTT.cjs +125 -0
- package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
- package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
- package/dist/recognition/GoogleCloudSTT.js +107 -0
- package/dist/recognition/OpenAICloudSTT.cjs +180 -0
- package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
- package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
- package/dist/recognition/OpenAICloudSTT.js +150 -0
- package/dist/recognition/WhisperCppSTT.cjs +296 -0
- package/dist/recognition/WhisperCppSTT.d.cts +40 -0
- package/dist/recognition/WhisperCppSTT.d.ts +40 -0
- package/dist/recognition/WhisperCppSTT.js +275 -0
- package/dist/recognition/WhisperServerSTT.cjs +119 -0
- package/dist/recognition/WhisperServerSTT.d.cts +24 -0
- package/dist/recognition/WhisperServerSTT.d.ts +24 -0
- package/dist/recognition/WhisperServerSTT.js +105 -0
- package/dist/utilities/FileSystem.cjs +54 -0
- package/dist/utilities/FileSystem.d.cts +3 -0
- package/dist/utilities/FileSystem.d.ts +3 -0
- package/dist/utilities/FileSystem.js +20 -0
- package/dist/utilities/Locale.cjs +46 -0
- package/dist/utilities/Locale.d.cts +9 -0
- package/dist/utilities/Locale.d.ts +9 -0
- package/dist/utilities/Locale.js +20 -0
- package/dist/utilities/ObjectUtilities.cjs +41 -0
- package/dist/utilities/ObjectUtilities.d.cts +3 -0
- package/dist/utilities/ObjectUtilities.d.ts +3 -0
- package/dist/utilities/ObjectUtilities.js +7 -0
- package/dist/utilities/Timeline.cjs +120 -0
- package/dist/utilities/Timeline.d.cts +23 -0
- package/dist/utilities/Timeline.d.ts +23 -0
- package/dist/utilities/Timeline.js +94 -0
- package/dist/utilities/Timing.cjs +287 -0
- package/dist/utilities/Timing.d.cts +64 -0
- package/dist/utilities/Timing.d.ts +64 -0
- package/dist/utilities/Timing.js +256 -0
- package/dist/utilities/WhisperTimeline.cjs +344 -0
- package/dist/utilities/WhisperTimeline.d.cts +86 -0
- package/dist/utilities/WhisperTimeline.d.ts +86 -0
- package/dist/utilities/WhisperTimeline.js +313 -0
- package/dist/vad/ActiveGate.cjs +357 -0
- package/dist/vad/ActiveGate.d.cts +53 -0
- package/dist/vad/ActiveGate.d.ts +53 -0
- package/dist/vad/ActiveGate.js +329 -0
- package/dist/vad/ActiveGateOg.cjs +1366 -0
- package/dist/vad/ActiveGateOg.d.cts +33 -0
- package/dist/vad/ActiveGateOg.d.ts +33 -0
- package/dist/vad/ActiveGateOg.js +1341 -0
- package/dist/vad/Silero.cjs +174 -0
- package/dist/vad/Silero.d.cts +25 -0
- package/dist/vad/Silero.d.ts +25 -0
- package/dist/vad/Silero.js +153 -0
- package/package.json +125 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { RecognitionOptions } from './Recognition.js';
// The following side-effect-only imports are bundler artifacts that preserve
// the module graph of the generated declaration output.
import 'node:fs';
import 'node:stream';
import '../audio/AudioFormat.js';
import '../audio/AudioSource.js';
import '../config.js';
import '../recognition/OpenAICloudSTT.js';
import '../utilities/Timeline.js';
import '../utilities/Timing.js';
import '../recognition/WhisperCppSTT.js';
import '../cli/config.js';
import '../recognition/WhisperServerSTT.js';

// Aggregates the option types exposed by the package's public API surface.
// Currently only the speech-recognition options are published here.
interface APIOptions {
    RecognitionOptions: RecognitionOptions;
}

export type { APIOptions };
|
|
File without changes
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"use strict";
// ---------------------------------------------------------------------------
// esbuild-generated CommonJS interop helpers. These wrap the module's ESM
// exports in getter-based properties so that `require()` consumers see live
// bindings. Do not hand-edit: Node's cjs-module-lexer relies on this shape
// to detect named exports.
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Defines each export of `all` on `target` as an enumerable lazy getter.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copies own properties of `from` onto `to` as getters, skipping `except`
// and anything `to` already has; preserves the source's enumerability.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wraps a required CommonJS module so ESM-style `import` sees a `default`.
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
// Marks the export object as an ES-module shim and copies the exports in.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// Public exports of this module: the engine registry and the recognize entry point.
var Recognition_exports = {};
__export(Recognition_exports, {
  recognitionEngines: () => recognitionEngines,
  recognize: () => recognize
});
module.exports = __toCommonJS(Recognition_exports);
|
|
35
|
+
var import_audio = require("../audio/index.cjs");
|
|
36
|
+
var import_config = require("../config.cjs");
|
|
37
|
+
var import_Locale = require("../utilities/Locale.cjs");
|
|
38
|
+
var import_ObjectUtilities = require("../utilities/ObjectUtilities.cjs");
|
|
39
|
+
var import_Timeline = require("../utilities/Timeline.cjs");
|
|
40
|
+
var import_Timing = require("../utilities/Timing.cjs");
|
|
41
|
+
/**
 * Runs speech-to-text over `input` with the engine selected in `options`.
 *
 * CommonJS build of the ESM implementation in dist/api/Recognition.js
 * (generated output — the `(0, fn)` call pattern strips `this` binding).
 *
 * @param input   Audio input: a path string, Readable/ReadStream, or an
 *                AudioSource (anything `resolveAudioSource` accepts).
 * @param options RecognitionOptions — must include `language`; merged deep
 *                over `defaultRecognitionOptions` (whisper.cpp / tiny.en).
 * @returns { transcript, timeline, language, timing } where `timing` is the
 *          summary of the metadata collected during recognition.
 * @throws  Error when `language` is missing, when an engine-specific
 *          credential is absent, when the engine is unknown, or when the
 *          engine returns no timeline.
 */
async function recognize(input, options) {
  // Deep-merge caller options over the defaults (defaults lose on conflict).
  const opts = (0, import_ObjectUtilities.extendDeep)(
    defaultRecognitionOptions,
    options
  );
  const timing = (0, import_Timing.createTiming)();
  timing.setMetadata("engine", opts.engine);
  timing.setMetadata(
    "conversionMode",
    opts.conversionMode ?? (0, import_config.getConversionMode)()
  );
  if (!opts.language) {
    throw new Error("Language must be specified");
  }
  const languageCode = opts.language;
  // e.g. "en-US" -> "en"; most engines below take the short form.
  const shortLanguageCode = (0, import_Locale.getShortLanguageCode)(languageCode);
  timing.setMetadata("language", languageCode);
  const source = resolveAudioSource(input, opts);
  timing.setMetadata("inputFormat", source.format);
  let transcript = "";
  let timeline;
  // Engine modules are imported lazily so only the selected backend loads.
  // NOTE: each engine's recognize() has its own parameter order/shape below.
  switch (opts.engine) {
    case "whisper.cpp": {
      const WhisperCppSTT = await import("../recognition/WhisperCppSTT.cjs");
      timing.setMetadata("model", opts.options.model);
      timing.setMetadata("processors", opts.options.processors ?? 1);
      timing.setMetadata("threads", opts.options.threads ?? 4);
      const result = await WhisperCppSTT.recognize(source, {
        ...opts.options,
        language: shortLanguageCode,
        timing
      });
      transcript = result.transcript;
      timeline = result.timeline;
      break;
    }
    case "openai-cloud": {
      const OpenAICloudSTT = await import("../recognition/OpenAICloudSTT.cjs");
      timing.setMetadata("model", opts.options.model ?? "whisper-1");
      const result = await OpenAICloudSTT.recognize(source, shortLanguageCode, {
        ...opts.options,
        timing
      });
      transcript = result.transcript;
      timeline = result.timeline;
      break;
    }
    case "whisper-server": {
      const WhisperServerSTT = await import("../recognition/WhisperServerSTT.cjs");
      const result = await WhisperServerSTT.recognize(
        source,
        shortLanguageCode,
        { ...opts.options, timing }
      );
      transcript = result.transcript;
      timeline = result.timeline;
      break;
    }
    case "google-cloud": {
      const GoogleCloudSTT = await import("../recognition/GoogleCloudSTT.cjs");
      if (!opts.options.apiKey) {
        throw new Error("Google Cloud API key is required");
      }
      const result = await GoogleCloudSTT.recognize(
        source,
        { ...opts.options, timing },
        shortLanguageCode
      );
      transcript = result.transcript;
      timeline = result.timeline;
      break;
    }
    case "microsoft-azure": {
      const AzureCognitiveServicesSTT = await import("../recognition/AzureCognitiveServicesSTT.cjs");
      if (!opts.options.subscriptionKey) {
        throw new Error("Azure subscription key is required");
      }
      if (!opts.options.serviceRegion) {
        throw new Error("Azure service region is required");
      }
      const result = await AzureCognitiveServicesSTT.recognize(
        source,
        {
          subscriptionKey: opts.options.subscriptionKey,
          serviceRegion: opts.options.serviceRegion,
          inputFormat: opts.inputFormat,
          timing
        },
        shortLanguageCode
      );
      transcript = result.transcript;
      timeline = result.timeline;
      break;
    }
    case "amazon-transcribe": {
      const AmazonTranscribeSTT = await import("../recognition/AmazonTranscribeSTT.cjs");
      // Amazon takes the full language code (e.g. "en-US"), not the short form.
      const result = await AmazonTranscribeSTT.recognize(source, languageCode, {
        region: opts.options.region,
        accessKeyId: opts.options.accessKeyId,
        secretAccessKey: opts.options.secretAccessKey,
        inputFormat: opts.inputFormat,
        timing
      });
      transcript = result.transcript;
      timeline = result.timeline;
      break;
    }
    case "deepgram": {
      const DeepgramSTT = await import("../recognition/DeepgramSTT.cjs");
      if (!opts.options.apiKey) {
        throw new Error("Deepgram API key is required");
      }
      timing.setMetadata("model", opts.options.model);
      // Deepgram is the only engine here that receives the abort signal.
      const result = await DeepgramSTT.recognize(
        source,
        shortLanguageCode,
        {
          apiKey: opts.options.apiKey,
          model: opts.options.model,
          punctuate: opts.options.punctuate,
          inputFormat: opts.inputFormat,
          timing,
          conversionMode: opts.conversionMode
        },
        opts.signal
      );
      transcript = result.transcript;
      timeline = result.timeline;
      break;
    }
    default: {
      // TypeScript exhaustiveness artifact: `opts` is `never` here in the source.
      const _engine = opts;
      throw new Error(
        `Unknown engine: ${_engine.engine}`
      );
    }
  }
  if (!timeline) {
    throw new Error(`No timeline returned from engine ${opts.engine}`);
  }
  // Annotates each word entry with its character offsets into `transcript`.
  (0, import_Timeline.addWordTextOffsetsToTimelineInPlace)(timeline, transcript);
  return {
    transcript,
    timeline,
    language: languageCode,
    timing: timing.summary()
  };
}
|
|
189
|
+
// Coerces any accepted input (path string, Readable/ReadStream, or an
// already-constructed AudioSource) into an AudioSource. An existing
// AudioSource passes through untouched; everything else is normalized
// using the caller-declared `opts.inputFormat` (may be undefined).
function resolveAudioSource(input, opts) {
  if ((0, import_audio.isAudioSource)(input)) {
    return input;
  }
  return (0, import_audio.normalizeToAudioSource)(input, opts.inputFormat);
}
|
|
195
|
+
// Defaults deep-merged under user options in `recognize`: the local
// whisper.cpp engine with the small English-only "tiny.en" model.
const defaultRecognitionOptions = {
  engine: "whisper.cpp",
  options: {
    model: "tiny.en"
  }
};
|
|
201
|
+
// Static registry of the supported engines. Per entry:
//   id              — the `engine` value accepted by `recognize`
//   type            — where it runs: "local", "server", or "cloud"
//   acceptsFormats  — container formats the engine can ingest directly
//   preferredFormat — the format to convert to when conversion is needed
const recognitionEngines = [
  {
    id: "whisper.cpp",
    name: "OpenAI Whisper (C++ port)",
    description: "Local whisper.cpp binary. Accepts wav, flac, ogg, mp3.",
    type: "local",
    acceptsFormats: ["wav", "flac", "ogg", "mp3"],
    preferredFormat: "wav"
  },
  {
    id: "whisper-server",
    name: "Whisper Server",
    description: "whisper.cpp server API. Prefers wav format.",
    type: "server",
    acceptsFormats: ["wav", "flac", "ogg", "mp3"],
    preferredFormat: "wav"
  },
  {
    id: "openai-cloud",
    name: "OpenAI Cloud",
    description: "OpenAI cloud API. Accepts wav, flac, mp3, m4a, ogg, webm.",
    type: "cloud",
    acceptsFormats: ["wav", "flac", "mp3", "m4a", "ogg", "webm"],
    preferredFormat: "mp3"
  },
  {
    id: "google-cloud",
    name: "Google Cloud",
    description: "Google Cloud Speech-to-Text. Prefers flac format.",
    type: "cloud",
    acceptsFormats: ["wav", "flac", "mp3", "ogg", "opus", "webm"],
    preferredFormat: "flac"
  },
  {
    id: "microsoft-azure",
    name: "Azure Cognitive Services",
    description: "Microsoft Azure Speech-to-Text. Requires wav format.",
    type: "cloud",
    acceptsFormats: ["wav"],
    preferredFormat: "wav"
  },
  {
    id: "amazon-transcribe",
    name: "Amazon Transcribe",
    description: "Amazon Transcribe streaming. Accepts flac, opus, ogg.",
    type: "cloud",
    acceptsFormats: ["flac", "opus", "ogg"],
    preferredFormat: "flac"
  },
  {
    id: "deepgram",
    name: "Deepgram",
    description: "Deepgram API. Accepts most common formats.",
    type: "cloud",
    acceptsFormats: ["wav", "flac", "mp3", "opus", "ogg", "webm", "m4a"],
    preferredFormat: "wav"
  }
];
// Annotate the CommonJS export names for ESM import in node:
// (dead-code pattern read statically by Node's cjs-module-lexer — keep as-is)
0 && (module.exports = {
  recognitionEngines,
  recognize
});
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { ReadStream } from 'node:fs';
import { Readable } from 'node:stream';
import { AudioFormat } from '../audio/AudioFormat.cjs';
import { AudioSource } from '../audio/AudioSource.cjs';
import { ConversionMode } from '../config.cjs';
import { OpenAICloudSTTOptions } from '../recognition/OpenAICloudSTT.cjs';
import { WhisperCppOptions } from '../recognition/WhisperCppSTT.cjs';
import { WhisperServerOptions } from '../recognition/WhisperServerSTT.cjs';
import { Timeline } from '../utilities/Timeline.cjs';
import { TimingSummary } from '../utilities/Timing.cjs';
import '../cli/config.cjs';

// Anything `recognize` accepts as audio: a stream, a path string, or an
// already-constructed AudioSource.
type AudioInput = Readable | ReadStream | string | AudioSource;
// Back-compat alias for AudioInput.
type Audio = AudioInput;
// Runs speech-to-text over `input` with the engine chosen in `options`.
declare function recognize(input: AudioInput, options: RecognitionOptions): Promise<RecognitionResult>;
// Result of a recognition run.
interface RecognitionResult {
    transcript: string;
    // Timed segments/words aligned to the transcript.
    timeline: Timeline;
    // The language code the recognition was performed with.
    language: string;
    // Summary of timing metadata collected during the run.
    timing: TimingSummary;
}
// Identifiers of the supported speech-to-text backends.
type RecognitionEngine = "whisper.cpp" | "whisper-server" | "google-cloud" | "microsoft-azure" | "amazon-transcribe" | "openai-cloud" | "deepgram";
// Options shared by every engine.
interface BaseRecognitionOptions {
    // BCP 47 language code, e.g. "en-US". Required.
    language: string;
    // Abort signal (currently forwarded to the Deepgram engine).
    signal?: AbortSignal | null | undefined;
    // Declared format of the input audio, when known.
    inputFormat?: AudioFormat;
    // Overrides the globally-configured audio conversion mode.
    conversionMode?: ConversionMode | undefined;
}
// Discriminated union on `engine`: each engine pairs with its own options shape.
type RecognitionOptions = (BaseRecognitionOptions & {
    engine: "whisper.cpp";
    options: WhisperCppOptions;
}) | (BaseRecognitionOptions & {
    engine: "whisper-server";
    options: WhisperServerOptions;
}) | (BaseRecognitionOptions & {
    engine: "openai-cloud";
    options: OpenAICloudSTTOptions;
}) | (BaseRecognitionOptions & {
    engine: "google-cloud";
    options: {
        apiKey: string;
        alternativeLanguageCodes?: string[];
        profanityFilter?: boolean;
        autoPunctuation?: boolean;
        useEnhancedModel?: boolean;
    };
}) | (BaseRecognitionOptions & {
    engine: "microsoft-azure";
    options: {
        subscriptionKey: string;
        serviceRegion: string;
    };
}) | (BaseRecognitionOptions & {
    engine: "amazon-transcribe";
    options: {
        region: string;
        accessKeyId: string;
        secretAccessKey: string;
    };
}) | (BaseRecognitionOptions & {
    engine: "deepgram";
    options: {
        apiKey: string;
        model: string;
        punctuate: boolean;
    };
});
// Static registry describing each supported engine and its format support.
declare const recognitionEngines: {
    id: RecognitionEngine;
    name: string;
    description: string;
    type: "local" | "cloud" | "server";
    acceptsFormats: AudioFormat[];
    preferredFormat: AudioFormat;
}[];

export { type Audio, type AudioInput, type RecognitionEngine, type RecognitionOptions, type RecognitionResult, recognitionEngines, recognize };
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { ReadStream } from 'node:fs';
import { Readable } from 'node:stream';
import { AudioFormat } from '../audio/AudioFormat.js';
import { AudioSource } from '../audio/AudioSource.js';
import { ConversionMode } from '../config.js';
import { OpenAICloudSTTOptions } from '../recognition/OpenAICloudSTT.js';
import { WhisperCppOptions } from '../recognition/WhisperCppSTT.js';
import { WhisperServerOptions } from '../recognition/WhisperServerSTT.js';
import { Timeline } from '../utilities/Timeline.js';
import { TimingSummary } from '../utilities/Timing.js';
import '../cli/config.js';

// Anything `recognize` accepts as audio: a stream, a path string, or an
// already-constructed AudioSource.
type AudioInput = Readable | ReadStream | string | AudioSource;
// Back-compat alias for AudioInput.
type Audio = AudioInput;
// Runs speech-to-text over `input` with the engine chosen in `options`.
declare function recognize(input: AudioInput, options: RecognitionOptions): Promise<RecognitionResult>;
// Result of a recognition run.
interface RecognitionResult {
    transcript: string;
    // Timed segments/words aligned to the transcript.
    timeline: Timeline;
    // The language code the recognition was performed with.
    language: string;
    // Summary of timing metadata collected during the run.
    timing: TimingSummary;
}
// Identifiers of the supported speech-to-text backends.
type RecognitionEngine = "whisper.cpp" | "whisper-server" | "google-cloud" | "microsoft-azure" | "amazon-transcribe" | "openai-cloud" | "deepgram";
// Options shared by every engine.
interface BaseRecognitionOptions {
    // BCP 47 language code, e.g. "en-US". Required.
    language: string;
    // Abort signal (currently forwarded to the Deepgram engine).
    signal?: AbortSignal | null | undefined;
    // Declared format of the input audio, when known.
    inputFormat?: AudioFormat;
    // Overrides the globally-configured audio conversion mode.
    conversionMode?: ConversionMode | undefined;
}
// Discriminated union on `engine`: each engine pairs with its own options shape.
type RecognitionOptions = (BaseRecognitionOptions & {
    engine: "whisper.cpp";
    options: WhisperCppOptions;
}) | (BaseRecognitionOptions & {
    engine: "whisper-server";
    options: WhisperServerOptions;
}) | (BaseRecognitionOptions & {
    engine: "openai-cloud";
    options: OpenAICloudSTTOptions;
}) | (BaseRecognitionOptions & {
    engine: "google-cloud";
    options: {
        apiKey: string;
        alternativeLanguageCodes?: string[];
        profanityFilter?: boolean;
        autoPunctuation?: boolean;
        useEnhancedModel?: boolean;
    };
}) | (BaseRecognitionOptions & {
    engine: "microsoft-azure";
    options: {
        subscriptionKey: string;
        serviceRegion: string;
    };
}) | (BaseRecognitionOptions & {
    engine: "amazon-transcribe";
    options: {
        region: string;
        accessKeyId: string;
        secretAccessKey: string;
    };
}) | (BaseRecognitionOptions & {
    engine: "deepgram";
    options: {
        apiKey: string;
        model: string;
        punctuate: boolean;
    };
});
// Static registry describing each supported engine and its format support.
declare const recognitionEngines: {
    id: RecognitionEngine;
    name: string;
    description: string;
    type: "local" | "cloud" | "server";
    acceptsFormats: AudioFormat[];
    preferredFormat: AudioFormat;
}[];

export { type Audio, type AudioInput, type RecognitionEngine, type RecognitionOptions, type RecognitionResult, recognitionEngines, recognize };
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import {
|
|
2
|
+
isAudioSource,
|
|
3
|
+
normalizeToAudioSource
|
|
4
|
+
} from "../audio/index.js";
|
|
5
|
+
import { getConversionMode } from "../config.js";
|
|
6
|
+
import { getShortLanguageCode } from "../utilities/Locale.js";
|
|
7
|
+
import { extendDeep } from "../utilities/ObjectUtilities.js";
|
|
8
|
+
import {
|
|
9
|
+
addWordTextOffsetsToTimelineInPlace
|
|
10
|
+
} from "../utilities/Timeline.js";
|
|
11
|
+
import { createTiming } from "../utilities/Timing.js";
|
|
12
|
+
/**
 * Runs speech-to-text over the given audio using the selected engine.
 *
 * @param input   Audio input: a path string, a Readable/ReadStream, or an
 *                already-constructed AudioSource.
 * @param options RecognitionOptions — must include `language`; merged deep
 *                over `defaultRecognitionOptions` (whisper.cpp / tiny.en).
 * @returns {Promise<{transcript, timeline, language, timing}>} the
 *          transcript, its aligned timeline, the language used, and a
 *          summary of the timing metadata collected along the way.
 * @throws  Error when `language` is missing, when a required engine
 *          credential is absent, when the engine is unknown, or when the
 *          engine yields no timeline.
 */
async function recognize(input, options) {
  // Caller options win over the defaults via a deep merge.
  const settings = extendDeep(defaultRecognitionOptions, options);

  const timer = createTiming();
  timer.setMetadata("engine", settings.engine);
  timer.setMetadata(
    "conversionMode",
    settings.conversionMode ?? getConversionMode()
  );

  if (!settings.language) {
    throw new Error("Language must be specified");
  }

  const languageCode = settings.language;
  // e.g. "en-US" -> "en"; most engines below take the short form.
  const shortLanguageCode = getShortLanguageCode(languageCode);
  timer.setMetadata("language", languageCode);

  const source = resolveAudioSource(input, settings);
  timer.setMetadata("inputFormat", source.format);

  let transcript = "";
  let timeline;

  // Every engine yields { transcript, timeline }; capture both in one place.
  const applyResult = (result) => {
    transcript = result.transcript;
    timeline = result.timeline;
  };

  // Engine modules load lazily so only the selected backend is imported.
  // NOTE: each backend's recognize() has its own parameter order/shape.
  switch (settings.engine) {
    case "whisper.cpp": {
      const WhisperCppSTT = await import("../recognition/WhisperCppSTT.js");

      timer.setMetadata("model", settings.options.model);
      timer.setMetadata("processors", settings.options.processors ?? 1);
      timer.setMetadata("threads", settings.options.threads ?? 4);

      applyResult(await WhisperCppSTT.recognize(source, {
        ...settings.options,
        language: shortLanguageCode,
        timing: timer
      }));

      break;
    }

    case "openai-cloud": {
      const OpenAICloudSTT = await import("../recognition/OpenAICloudSTT.js");

      timer.setMetadata("model", settings.options.model ?? "whisper-1");

      applyResult(await OpenAICloudSTT.recognize(source, shortLanguageCode, {
        ...settings.options,
        timing: timer
      }));

      break;
    }

    case "whisper-server": {
      const WhisperServerSTT = await import("../recognition/WhisperServerSTT.js");

      applyResult(await WhisperServerSTT.recognize(source, shortLanguageCode, {
        ...settings.options,
        timing: timer
      }));

      break;
    }

    case "google-cloud": {
      const GoogleCloudSTT = await import("../recognition/GoogleCloudSTT.js");

      if (!settings.options.apiKey) {
        throw new Error("Google Cloud API key is required");
      }

      applyResult(await GoogleCloudSTT.recognize(
        source,
        { ...settings.options, timing: timer },
        shortLanguageCode
      ));

      break;
    }

    case "microsoft-azure": {
      const AzureCognitiveServicesSTT = await import("../recognition/AzureCognitiveServicesSTT.js");

      if (!settings.options.subscriptionKey) {
        throw new Error("Azure subscription key is required");
      }

      if (!settings.options.serviceRegion) {
        throw new Error("Azure service region is required");
      }

      applyResult(await AzureCognitiveServicesSTT.recognize(
        source,
        {
          subscriptionKey: settings.options.subscriptionKey,
          serviceRegion: settings.options.serviceRegion,
          inputFormat: settings.inputFormat,
          timing: timer
        },
        shortLanguageCode
      ));

      break;
    }

    case "amazon-transcribe": {
      const AmazonTranscribeSTT = await import("../recognition/AmazonTranscribeSTT.js");

      // Amazon takes the full language code (e.g. "en-US"), not the short form.
      applyResult(await AmazonTranscribeSTT.recognize(source, languageCode, {
        region: settings.options.region,
        accessKeyId: settings.options.accessKeyId,
        secretAccessKey: settings.options.secretAccessKey,
        inputFormat: settings.inputFormat,
        timing: timer
      }));

      break;
    }

    case "deepgram": {
      const DeepgramSTT = await import("../recognition/DeepgramSTT.js");

      if (!settings.options.apiKey) {
        throw new Error("Deepgram API key is required");
      }

      timer.setMetadata("model", settings.options.model);

      // Deepgram is the only engine here that receives the abort signal.
      applyResult(await DeepgramSTT.recognize(
        source,
        shortLanguageCode,
        {
          apiKey: settings.options.apiKey,
          model: settings.options.model,
          punctuate: settings.options.punctuate,
          inputFormat: settings.inputFormat,
          timing: timer,
          conversionMode: settings.conversionMode
        },
        settings.signal
      ));

      break;
    }

    default: {
      // TypeScript exhaustiveness artifact in the original source.
      const _engine = settings;

      throw new Error(`Unknown engine: ${_engine.engine}`);
    }
  }

  if (!timeline) {
    throw new Error(`No timeline returned from engine ${settings.engine}`);
  }

  // Annotate each word entry with its character offsets into the transcript.
  addWordTextOffsetsToTimelineInPlace(timeline, transcript);

  return {
    transcript,
    timeline,
    language: languageCode,
    timing: timer.summary()
  };
}
|
|
160
|
+
/**
 * Coerces an accepted input into an AudioSource.
 *
 * An existing AudioSource passes through untouched; anything else (a path
 * string or a stream) is normalized using the caller-declared
 * `opts.inputFormat`, which may be undefined.
 */
function resolveAudioSource(input, opts) {
  return isAudioSource(input)
    ? input
    : normalizeToAudioSource(input, opts.inputFormat);
}
|
|
166
|
+
// Defaults deep-merged under user options in `recognize`: the local
// whisper.cpp engine with the small English-only "tiny.en" model.
const defaultRecognitionOptions = {
  engine: "whisper.cpp",
  options: {
    model: "tiny.en"
  }
};
|
|
172
|
+
// Static registry of the supported engines. Per entry:
//   id              — the `engine` value accepted by `recognize`
//   type            — where it runs: "local", "server", or "cloud"
//   acceptsFormats  — container formats the engine can ingest directly
//   preferredFormat — the format to convert to when conversion is needed
const recognitionEngines = [
  {
    id: "whisper.cpp",
    name: "OpenAI Whisper (C++ port)",
    description: "Local whisper.cpp binary. Accepts wav, flac, ogg, mp3.",
    type: "local",
    acceptsFormats: ["wav", "flac", "ogg", "mp3"],
    preferredFormat: "wav"
  },
  {
    id: "whisper-server",
    name: "Whisper Server",
    description: "whisper.cpp server API. Prefers wav format.",
    type: "server",
    acceptsFormats: ["wav", "flac", "ogg", "mp3"],
    preferredFormat: "wav"
  },
  {
    id: "openai-cloud",
    name: "OpenAI Cloud",
    description: "OpenAI cloud API. Accepts wav, flac, mp3, m4a, ogg, webm.",
    type: "cloud",
    acceptsFormats: ["wav", "flac", "mp3", "m4a", "ogg", "webm"],
    preferredFormat: "mp3"
  },
  {
    id: "google-cloud",
    name: "Google Cloud",
    description: "Google Cloud Speech-to-Text. Prefers flac format.",
    type: "cloud",
    acceptsFormats: ["wav", "flac", "mp3", "ogg", "opus", "webm"],
    preferredFormat: "flac"
  },
  {
    id: "microsoft-azure",
    name: "Azure Cognitive Services",
    description: "Microsoft Azure Speech-to-Text. Requires wav format.",
    type: "cloud",
    acceptsFormats: ["wav"],
    preferredFormat: "wav"
  },
  {
    id: "amazon-transcribe",
    name: "Amazon Transcribe",
    description: "Amazon Transcribe streaming. Accepts flac, opus, ogg.",
    type: "cloud",
    acceptsFormats: ["flac", "opus", "ogg"],
    preferredFormat: "flac"
  },
  {
    id: "deepgram",
    name: "Deepgram",
    description: "Deepgram API. Accepts most common formats.",
    type: "cloud",
    acceptsFormats: ["wav", "flac", "mp3", "opus", "ogg", "webm", "m4a"],
    preferredFormat: "wav"
  }
];
export {
  recognitionEngines,
  recognize
};
|