@storyteller-platform/ghost-story 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/LICENSE.md +611 -0
  2. package/README.md +18 -0
  3. package/dist/api/APIOptions.cjs +16 -0
  4. package/dist/api/APIOptions.d.cts +18 -0
  5. package/dist/api/APIOptions.d.ts +18 -0
  6. package/dist/api/APIOptions.js +0 -0
  7. package/dist/api/Recognition.cjs +263 -0
  8. package/dist/api/Recognition.d.cts +77 -0
  9. package/dist/api/Recognition.d.ts +77 -0
  10. package/dist/api/Recognition.js +233 -0
  11. package/dist/api/VoiceActivityDetection.cjs +77 -0
  12. package/dist/api/VoiceActivityDetection.d.cts +24 -0
  13. package/dist/api/VoiceActivityDetection.d.ts +24 -0
  14. package/dist/api/VoiceActivityDetection.js +43 -0
  15. package/dist/audio/AudioConverter.cjs +331 -0
  16. package/dist/audio/AudioConverter.d.cts +53 -0
  17. package/dist/audio/AudioConverter.d.ts +53 -0
  18. package/dist/audio/AudioConverter.js +310 -0
  19. package/dist/audio/AudioFormat.cjs +151 -0
  20. package/dist/audio/AudioFormat.d.cts +25 -0
  21. package/dist/audio/AudioFormat.d.ts +25 -0
  22. package/dist/audio/AudioFormat.js +123 -0
  23. package/dist/audio/AudioSource.cjs +119 -0
  24. package/dist/audio/AudioSource.d.cts +33 -0
  25. package/dist/audio/AudioSource.d.ts +33 -0
  26. package/dist/audio/AudioSource.js +88 -0
  27. package/dist/audio/index.cjs +74 -0
  28. package/dist/audio/index.d.cts +6 -0
  29. package/dist/audio/index.d.ts +6 -0
  30. package/dist/audio/index.js +54 -0
  31. package/dist/cli/bin.cjs +277 -0
  32. package/dist/cli/bin.d.cts +1 -0
  33. package/dist/cli/bin.d.ts +1 -0
  34. package/dist/cli/bin.js +275 -0
  35. package/dist/cli/config.cjs +347 -0
  36. package/dist/cli/config.d.cts +33 -0
  37. package/dist/cli/config.d.ts +33 -0
  38. package/dist/cli/config.js +285 -0
  39. package/dist/cli/install.cjs +334 -0
  40. package/dist/cli/install.d.cts +62 -0
  41. package/dist/cli/install.d.ts +62 -0
  42. package/dist/cli/install.js +316 -0
  43. package/dist/cli/whisper-server.cjs +172 -0
  44. package/dist/cli/whisper-server.d.cts +24 -0
  45. package/dist/cli/whisper-server.d.ts +24 -0
  46. package/dist/cli/whisper-server.js +152 -0
  47. package/dist/config.cjs +60 -0
  48. package/dist/config.d.cts +12 -0
  49. package/dist/config.d.ts +12 -0
  50. package/dist/config.js +32 -0
  51. package/dist/convert.cjs +88 -0
  52. package/dist/convert.d.cts +12 -0
  53. package/dist/convert.d.ts +12 -0
  54. package/dist/convert.js +63 -0
  55. package/dist/encodings/Ascii.cjs +75 -0
  56. package/dist/encodings/Ascii.d.cts +13 -0
  57. package/dist/encodings/Ascii.d.ts +13 -0
  58. package/dist/encodings/Ascii.js +48 -0
  59. package/dist/encodings/Base64.cjs +155 -0
  60. package/dist/encodings/Base64.d.cts +5 -0
  61. package/dist/encodings/Base64.d.ts +5 -0
  62. package/dist/encodings/Base64.js +129 -0
  63. package/dist/encodings/TextEncodingsCommon.cjs +16 -0
  64. package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
  65. package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
  66. package/dist/encodings/TextEncodingsCommon.js +0 -0
  67. package/dist/index.cjs +153 -0
  68. package/dist/index.d.cts +15 -0
  69. package/dist/index.d.ts +15 -0
  70. package/dist/index.js +140 -0
  71. package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
  72. package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
  73. package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
  74. package/dist/recognition/AmazonTranscribeSTT.js +160 -0
  75. package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
  76. package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
  77. package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
  78. package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
  79. package/dist/recognition/DeepgramSTT.cjs +172 -0
  80. package/dist/recognition/DeepgramSTT.d.cts +23 -0
  81. package/dist/recognition/DeepgramSTT.d.ts +23 -0
  82. package/dist/recognition/DeepgramSTT.js +153 -0
  83. package/dist/recognition/GoogleCloudSTT.cjs +125 -0
  84. package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
  85. package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
  86. package/dist/recognition/GoogleCloudSTT.js +107 -0
  87. package/dist/recognition/OpenAICloudSTT.cjs +180 -0
  88. package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
  89. package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
  90. package/dist/recognition/OpenAICloudSTT.js +150 -0
  91. package/dist/recognition/WhisperCppSTT.cjs +296 -0
  92. package/dist/recognition/WhisperCppSTT.d.cts +40 -0
  93. package/dist/recognition/WhisperCppSTT.d.ts +40 -0
  94. package/dist/recognition/WhisperCppSTT.js +275 -0
  95. package/dist/recognition/WhisperServerSTT.cjs +119 -0
  96. package/dist/recognition/WhisperServerSTT.d.cts +24 -0
  97. package/dist/recognition/WhisperServerSTT.d.ts +24 -0
  98. package/dist/recognition/WhisperServerSTT.js +105 -0
  99. package/dist/utilities/FileSystem.cjs +54 -0
  100. package/dist/utilities/FileSystem.d.cts +3 -0
  101. package/dist/utilities/FileSystem.d.ts +3 -0
  102. package/dist/utilities/FileSystem.js +20 -0
  103. package/dist/utilities/Locale.cjs +46 -0
  104. package/dist/utilities/Locale.d.cts +9 -0
  105. package/dist/utilities/Locale.d.ts +9 -0
  106. package/dist/utilities/Locale.js +20 -0
  107. package/dist/utilities/ObjectUtilities.cjs +41 -0
  108. package/dist/utilities/ObjectUtilities.d.cts +3 -0
  109. package/dist/utilities/ObjectUtilities.d.ts +3 -0
  110. package/dist/utilities/ObjectUtilities.js +7 -0
  111. package/dist/utilities/Timeline.cjs +120 -0
  112. package/dist/utilities/Timeline.d.cts +23 -0
  113. package/dist/utilities/Timeline.d.ts +23 -0
  114. package/dist/utilities/Timeline.js +94 -0
  115. package/dist/utilities/Timing.cjs +287 -0
  116. package/dist/utilities/Timing.d.cts +64 -0
  117. package/dist/utilities/Timing.d.ts +64 -0
  118. package/dist/utilities/Timing.js +256 -0
  119. package/dist/utilities/WhisperTimeline.cjs +344 -0
  120. package/dist/utilities/WhisperTimeline.d.cts +86 -0
  121. package/dist/utilities/WhisperTimeline.d.ts +86 -0
  122. package/dist/utilities/WhisperTimeline.js +313 -0
  123. package/dist/vad/ActiveGate.cjs +357 -0
  124. package/dist/vad/ActiveGate.d.cts +53 -0
  125. package/dist/vad/ActiveGate.d.ts +53 -0
  126. package/dist/vad/ActiveGate.js +329 -0
  127. package/dist/vad/ActiveGateOg.cjs +1366 -0
  128. package/dist/vad/ActiveGateOg.d.cts +33 -0
  129. package/dist/vad/ActiveGateOg.d.ts +33 -0
  130. package/dist/vad/ActiveGateOg.js +1341 -0
  131. package/dist/vad/Silero.cjs +174 -0
  132. package/dist/vad/Silero.d.cts +25 -0
  133. package/dist/vad/Silero.d.ts +25 -0
  134. package/dist/vad/Silero.js +153 -0
  135. package/package.json +125 -0
@@ -0,0 +1,18 @@
1
+ import { RecognitionOptions } from './Recognition.js';
2
+ import 'node:fs';
3
+ import 'node:stream';
4
+ import '../audio/AudioFormat.js';
5
+ import '../audio/AudioSource.js';
6
+ import '../config.js';
7
+ import '../recognition/OpenAICloudSTT.js';
8
+ import '../utilities/Timeline.js';
9
+ import '../utilities/Timing.js';
10
+ import '../recognition/WhisperCppSTT.js';
11
+ import '../cli/config.js';
12
+ import '../recognition/WhisperServerSTT.js';
13
+
14
/**
 * Groups the option objects accepted by the API layer.
 * Only the recognition options are declared at the moment.
 */
interface APIOptions {
    /** Speech recognition options (the `RecognitionOptions` type imported from `./Recognition.js`). */
    RecognitionOptions: RecognitionOptions;
}
17
+
18
+ export type { APIOptions };
File without changes
@@ -0,0 +1,263 @@
1
+ "use strict";
2
// Bundler-emitted CommonJS/ESM interop helpers. Short aliases for the Object
// reflection primitives used below.
const __create = Object.create;
const __defProp = Object.defineProperty;
const __getOwnPropDesc = Object.getOwnPropertyDescriptor;
const __getOwnPropNames = Object.getOwnPropertyNames;
const __getProtoOf = Object.getPrototypeOf;
const __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable getter on `target` per key of `all`; each value of
 * `all` is used directly as the getter function.
 */
const __export = (target, all) => {
  for (const exportName in all) {
    __defProp(target, exportName, { get: all[exportName], enumerable: true });
  }
};

/**
 * Copies the own properties of `from` onto `to` as live getters, skipping keys
 * that `to` already owns and the single key named by `except`.
 * Enumerability mirrors the source descriptor. Returns `to`.
 */
const __copyProps = (to, from, except, desc) => {
  const isCopyable =
    (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable,
    });
  }
  return to;
};

/**
 * Wraps a CommonJS module object so it can be consumed like an ES module
 * namespace. A `default` property pointing at `mod` is added when the importer
 * is in node compatibility mode, or when `mod` is missing or was not marked
 * with `__esModule` by a Babel-compatible transform.
 */
const __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

/**
 * Produces the object assigned to `module.exports`: a fresh object flagged with
 * a non-enumerable `__esModule: true` that re-exposes every property of `mod`.
 */
const __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
29
+ var Recognition_exports = {};
30
+ __export(Recognition_exports, {
31
+ recognitionEngines: () => recognitionEngines,
32
+ recognize: () => recognize
33
+ });
34
+ module.exports = __toCommonJS(Recognition_exports);
35
+ var import_audio = require("../audio/index.cjs");
36
+ var import_config = require("../config.cjs");
37
+ var import_Locale = require("../utilities/Locale.cjs");
38
+ var import_ObjectUtilities = require("../utilities/ObjectUtilities.cjs");
39
+ var import_Timeline = require("../utilities/Timeline.cjs");
40
+ var import_Timing = require("../utilities/Timing.cjs");
41
/**
 * Transcribes `input` with the speech-to-text engine named by `options.engine`.
 *
 * The engine module is loaded lazily with a dynamic `import()` so only the
 * selected backend is evaluated.
 *
 * @param input Audio to transcribe. Returned as-is by `resolveAudioSource` when
 *   it already is an audio source, otherwise normalized using `options.inputFormat`.
 * @param options Recognition options: `engine`, `language`, engine-specific
 *   `options`, and optionally `inputFormat`, `conversionMode` and `signal`.
 * @returns `{ transcript, timeline, language, timing }`, where `language` is the
 *   language code that was passed in and `timing` is `timing.summary()`.
 * @throws {Error} If no language is given, a required credential for the chosen
 *   engine is missing, the engine is unknown, or the engine returns no timeline.
 */
async function recognize(input, options) {
  // Seed the whisper.cpp defaults (notably model "tiny.en") only when whisper.cpp
  // is the engine in use. Merging them unconditionally leaked "tiny.en" into the
  // options of every other engine, so the OpenAI "whisper-1" fallback below could
  // never apply and a whisper.cpp model name was forwarded to remote services.
  const requestedEngine = options?.engine ?? defaultRecognitionOptions.engine;
  const baseOptions = requestedEngine === defaultRecognitionOptions.engine
    ? defaultRecognitionOptions
    : { engine: requestedEngine, options: {} };
  const opts = (0, import_ObjectUtilities.extendDeep)(baseOptions, options);

  const timing = (0, import_Timing.createTiming)();
  timing.setMetadata("engine", opts.engine);
  timing.setMetadata(
    "conversionMode",
    opts.conversionMode ?? (0, import_config.getConversionMode)()
  );

  if (!opts.language) {
    throw new Error("Language must be specified");
  }
  const languageCode = opts.language;
  const shortLanguageCode = (0, import_Locale.getShortLanguageCode)(languageCode);
  timing.setMetadata("language", languageCode);

  const source = resolveAudioSource(input, opts);
  timing.setMetadata("inputFormat", source.format);

  let result;
  switch (opts.engine) {
    case "whisper.cpp": {
      const WhisperCppSTT = await import("../recognition/WhisperCppSTT.cjs");
      timing.setMetadata("model", opts.options.model);
      timing.setMetadata("processors", opts.options.processors ?? 1);
      timing.setMetadata("threads", opts.options.threads ?? 4);
      result = await WhisperCppSTT.recognize(source, {
        ...opts.options,
        language: shortLanguageCode,
        timing
      });
      break;
    }
    case "openai-cloud": {
      const OpenAICloudSTT = await import("../recognition/OpenAICloudSTT.cjs");
      timing.setMetadata("model", opts.options.model ?? "whisper-1");
      result = await OpenAICloudSTT.recognize(source, shortLanguageCode, {
        ...opts.options,
        timing
      });
      break;
    }
    case "whisper-server": {
      const WhisperServerSTT = await import("../recognition/WhisperServerSTT.cjs");
      result = await WhisperServerSTT.recognize(
        source,
        shortLanguageCode,
        { ...opts.options, timing }
      );
      break;
    }
    case "google-cloud": {
      const GoogleCloudSTT = await import("../recognition/GoogleCloudSTT.cjs");
      if (!opts.options.apiKey) {
        throw new Error("Google Cloud API key is required");
      }
      result = await GoogleCloudSTT.recognize(
        source,
        { ...opts.options, timing },
        shortLanguageCode
      );
      break;
    }
    case "microsoft-azure": {
      const AzureCognitiveServicesSTT = await import("../recognition/AzureCognitiveServicesSTT.cjs");
      if (!opts.options.subscriptionKey) {
        throw new Error("Azure subscription key is required");
      }
      if (!opts.options.serviceRegion) {
        throw new Error("Azure service region is required");
      }
      result = await AzureCognitiveServicesSTT.recognize(
        source,
        {
          subscriptionKey: opts.options.subscriptionKey,
          serviceRegion: opts.options.serviceRegion,
          inputFormat: opts.inputFormat,
          timing
        },
        shortLanguageCode
      );
      break;
    }
    case "amazon-transcribe": {
      const AmazonTranscribeSTT = await import("../recognition/AmazonTranscribeSTT.cjs");
      // This engine is given the full language code, not the short one.
      result = await AmazonTranscribeSTT.recognize(source, languageCode, {
        region: opts.options.region,
        accessKeyId: opts.options.accessKeyId,
        secretAccessKey: opts.options.secretAccessKey,
        inputFormat: opts.inputFormat,
        timing
      });
      break;
    }
    case "deepgram": {
      const DeepgramSTT = await import("../recognition/DeepgramSTT.cjs");
      if (!opts.options.apiKey) {
        throw new Error("Deepgram API key is required");
      }
      timing.setMetadata("model", opts.options.model);
      result = await DeepgramSTT.recognize(
        source,
        shortLanguageCode,
        {
          apiKey: opts.options.apiKey,
          model: opts.options.model,
          punctuate: opts.options.punctuate,
          inputFormat: opts.inputFormat,
          timing,
          conversionMode: opts.conversionMode
        },
        opts.signal
      );
      break;
    }
    default: {
      throw new Error(
        `Unknown engine: ${opts.engine}`
      );
    }
  }

  const { transcript, timeline } = result;
  if (!timeline) {
    throw new Error(`No timeline returned from engine ${opts.engine}`);
  }
  // Annotates the timeline's words with their offsets in the transcript
  // (in place, going by the helper's name).
  (0, import_Timeline.addWordTextOffsetsToTimelineInPlace)(timeline, transcript);
  return {
    transcript,
    timeline,
    language: languageCode,
    timing: timing.summary()
  };
}
189
/**
 * Returns `input` unchanged when `isAudioSource` recognizes it; otherwise
 * converts it with `normalizeToAudioSource`, passing `opts.inputFormat` along.
 */
function resolveAudioSource(input, opts) {
  return (0, import_audio.isAudioSource)(input)
    ? input
    : (0, import_audio.normalizeToAudioSource)(input, opts.inputFormat);
}
195
// Baseline options that `recognize` uses as the base of its `extendDeep` merge
// with the caller's options: local whisper.cpp with the "tiny.en" model.
const defaultRecognitionOptions = {
  engine: "whisper.cpp",
  options: { model: "tiny.en" },
};
201
// Catalogue of the supported recognition engines. Each row is
// [id, name, description, type, acceptsFormats, preferredFormat] and is
// expanded into an object with exactly those keys, in that order.
const recognitionEngines = [
  ["whisper.cpp", "OpenAI Whisper (C++ port)", "Local whisper.cpp binary. Accepts wav, flac, ogg, mp3.", "local", ["wav", "flac", "ogg", "mp3"], "wav"],
  ["whisper-server", "Whisper Server", "whisper.cpp server API. Prefers wav format.", "server", ["wav", "flac", "ogg", "mp3"], "wav"],
  ["openai-cloud", "OpenAI Cloud", "OpenAI cloud API. Accepts wav, flac, mp3, m4a, ogg, webm.", "cloud", ["wav", "flac", "mp3", "m4a", "ogg", "webm"], "mp3"],
  ["google-cloud", "Google Cloud", "Google Cloud Speech-to-Text. Prefers flac format.", "cloud", ["wav", "flac", "mp3", "ogg", "opus", "webm"], "flac"],
  ["microsoft-azure", "Azure Cognitive Services", "Microsoft Azure Speech-to-Text. Requires wav format.", "cloud", ["wav"], "wav"],
  ["amazon-transcribe", "Amazon Transcribe", "Amazon Transcribe streaming. Accepts flac, opus, ogg.", "cloud", ["flac", "opus", "ogg"], "flac"],
  ["deepgram", "Deepgram", "Deepgram API. Accepts most common formats.", "cloud", ["wav", "flac", "mp3", "opus", "ogg", "webm", "m4a"], "wav"],
].map(([id, name, description, type, acceptsFormats, preferredFormat]) => ({
  id,
  name,
  description,
  type,
  acceptsFormats,
  preferredFormat,
}));
259
+ // Annotate the CommonJS export names for ESM import in node:
260
+ 0 && (module.exports = {
261
+ recognitionEngines,
262
+ recognize
263
+ });
@@ -0,0 +1,77 @@
1
+ import { ReadStream } from 'node:fs';
2
+ import { Readable } from 'node:stream';
3
+ import { AudioFormat } from '../audio/AudioFormat.cjs';
4
+ import { AudioSource } from '../audio/AudioSource.cjs';
5
+ import { ConversionMode } from '../config.cjs';
6
+ import { OpenAICloudSTTOptions } from '../recognition/OpenAICloudSTT.cjs';
7
+ import { WhisperCppOptions } from '../recognition/WhisperCppSTT.cjs';
8
+ import { WhisperServerOptions } from '../recognition/WhisperServerSTT.cjs';
9
+ import { Timeline } from '../utilities/Timeline.cjs';
10
+ import { TimingSummary } from '../utilities/Timing.cjs';
11
+ import '../cli/config.cjs';
12
+
13
/**
 * Audio accepted by `recognize`: an existing `AudioSource`, a string
 * (presumably a file path — confirm against `normalizeToAudioSource`),
 * or a Node readable / file read stream.
 */
type AudioInput = AudioSource | string | ReadStream | Readable;

/** Alias of `AudioInput`. */
type Audio = AudioInput;

/** Runs speech recognition on `input` using the engine selected in `options`. */
declare function recognize(
    input: AudioInput,
    options: RecognitionOptions
): Promise<RecognitionResult>;

/** Value resolved by `recognize`. */
interface RecognitionResult {
    /** Recognized text. */
    transcript: string;
    /** Timeline returned by the engine. */
    timeline: Timeline;
    /** Language code the recognition was run with. */
    language: string;
    /** Summary of the timing information collected during the call. */
    timing: TimingSummary;
}
22
/** Identifiers of the supported recognition engines. */
type RecognitionEngine =
    | "whisper.cpp"
    | "whisper-server"
    | "google-cloud"
    | "microsoft-azure"
    | "amazon-transcribe"
    | "openai-cloud"
    | "deepgram";

/** Options shared by every engine. */
interface BaseRecognitionOptions {
    /** Language code of the audio (required). */
    language: string;
    /** Optional abort signal. */
    signal?: AbortSignal | null | undefined;
    /** Format of the supplied audio, when known. */
    inputFormat?: AudioFormat;
    /** Optional conversion mode override. */
    conversionMode?: ConversionMode | undefined;
}

/** One arm of the `RecognitionOptions` union: shared options plus an engine tag and its options. */
type EngineVariant<Engine extends RecognitionEngine, Options> = BaseRecognitionOptions & {
    engine: Engine;
    options: Options;
};

/** Options accepted by `recognize`, discriminated by `engine`. */
type RecognitionOptions =
    | EngineVariant<"whisper.cpp", WhisperCppOptions>
    | EngineVariant<"whisper-server", WhisperServerOptions>
    | EngineVariant<"openai-cloud", OpenAICloudSTTOptions>
    | EngineVariant<"google-cloud", {
        apiKey: string;
        alternativeLanguageCodes?: string[];
        profanityFilter?: boolean;
        autoPunctuation?: boolean;
        useEnhancedModel?: boolean;
    }>
    | EngineVariant<"microsoft-azure", {
        subscriptionKey: string;
        serviceRegion: string;
    }>
    | EngineVariant<"amazon-transcribe", {
        region: string;
        accessKeyId: string;
        secretAccessKey: string;
    }>
    | EngineVariant<"deepgram", {
        apiKey: string;
        model: string;
        punctuate: boolean;
    }>;
68
/** Descriptor of one supported recognition engine. */
interface RecognitionEngineInfo {
    id: RecognitionEngine;
    name: string;
    description: string;
    /** Where the engine runs. */
    type: "local" | "cloud" | "server";
    /** Audio formats the engine accepts. */
    acceptsFormats: AudioFormat[];
    /** Format the engine prefers. */
    preferredFormat: AudioFormat;
}

/** Descriptors for all supported engines. */
declare const recognitionEngines: RecognitionEngineInfo[];
76
+
77
+ export { type Audio, type AudioInput, type RecognitionEngine, type RecognitionOptions, type RecognitionResult, recognitionEngines, recognize };
@@ -0,0 +1,77 @@
1
+ import { ReadStream } from 'node:fs';
2
+ import { Readable } from 'node:stream';
3
+ import { AudioFormat } from '../audio/AudioFormat.js';
4
+ import { AudioSource } from '../audio/AudioSource.js';
5
+ import { ConversionMode } from '../config.js';
6
+ import { OpenAICloudSTTOptions } from '../recognition/OpenAICloudSTT.js';
7
+ import { WhisperCppOptions } from '../recognition/WhisperCppSTT.js';
8
+ import { WhisperServerOptions } from '../recognition/WhisperServerSTT.js';
9
+ import { Timeline } from '../utilities/Timeline.js';
10
+ import { TimingSummary } from '../utilities/Timing.js';
11
+ import '../cli/config.js';
12
+
13
/**
 * Audio accepted by `recognize`: an existing `AudioSource`, a string
 * (presumably a file path — confirm against `normalizeToAudioSource`),
 * or a Node readable / file read stream.
 */
type AudioInput = AudioSource | string | ReadStream | Readable;

/** Alias of `AudioInput`. */
type Audio = AudioInput;

/** Runs speech recognition on `input` using the engine selected in `options`. */
declare function recognize(
    input: AudioInput,
    options: RecognitionOptions
): Promise<RecognitionResult>;

/** Value resolved by `recognize`. */
interface RecognitionResult {
    /** Recognized text. */
    transcript: string;
    /** Timeline returned by the engine. */
    timeline: Timeline;
    /** Language code the recognition was run with. */
    language: string;
    /** Summary of the timing information collected during the call. */
    timing: TimingSummary;
}
22
/** Identifiers of the supported recognition engines. */
type RecognitionEngine =
    | "whisper.cpp"
    | "whisper-server"
    | "google-cloud"
    | "microsoft-azure"
    | "amazon-transcribe"
    | "openai-cloud"
    | "deepgram";

/** Options shared by every engine. */
interface BaseRecognitionOptions {
    /** Language code of the audio (required). */
    language: string;
    /** Optional abort signal. */
    signal?: AbortSignal | null | undefined;
    /** Format of the supplied audio, when known. */
    inputFormat?: AudioFormat;
    /** Optional conversion mode override. */
    conversionMode?: ConversionMode | undefined;
}

/** One arm of the `RecognitionOptions` union: shared options plus an engine tag and its options. */
type EngineVariant<Engine extends RecognitionEngine, Options> = BaseRecognitionOptions & {
    engine: Engine;
    options: Options;
};

/** Options accepted by `recognize`, discriminated by `engine`. */
type RecognitionOptions =
    | EngineVariant<"whisper.cpp", WhisperCppOptions>
    | EngineVariant<"whisper-server", WhisperServerOptions>
    | EngineVariant<"openai-cloud", OpenAICloudSTTOptions>
    | EngineVariant<"google-cloud", {
        apiKey: string;
        alternativeLanguageCodes?: string[];
        profanityFilter?: boolean;
        autoPunctuation?: boolean;
        useEnhancedModel?: boolean;
    }>
    | EngineVariant<"microsoft-azure", {
        subscriptionKey: string;
        serviceRegion: string;
    }>
    | EngineVariant<"amazon-transcribe", {
        region: string;
        accessKeyId: string;
        secretAccessKey: string;
    }>
    | EngineVariant<"deepgram", {
        apiKey: string;
        model: string;
        punctuate: boolean;
    }>;
68
/** Descriptor of one supported recognition engine. */
interface RecognitionEngineInfo {
    id: RecognitionEngine;
    name: string;
    description: string;
    /** Where the engine runs. */
    type: "local" | "cloud" | "server";
    /** Audio formats the engine accepts. */
    acceptsFormats: AudioFormat[];
    /** Format the engine prefers. */
    preferredFormat: AudioFormat;
}

/** Descriptors for all supported engines. */
declare const recognitionEngines: RecognitionEngineInfo[];
76
+
77
+ export { type Audio, type AudioInput, type RecognitionEngine, type RecognitionOptions, type RecognitionResult, recognitionEngines, recognize };
@@ -0,0 +1,233 @@
1
+ import {
2
+ isAudioSource,
3
+ normalizeToAudioSource
4
+ } from "../audio/index.js";
5
+ import { getConversionMode } from "../config.js";
6
+ import { getShortLanguageCode } from "../utilities/Locale.js";
7
+ import { extendDeep } from "../utilities/ObjectUtilities.js";
8
+ import {
9
+ addWordTextOffsetsToTimelineInPlace
10
+ } from "../utilities/Timeline.js";
11
+ import { createTiming } from "../utilities/Timing.js";
12
/**
 * Transcribes `input` with the speech-to-text engine named by `options.engine`.
 *
 * The engine module is loaded lazily with a dynamic `import()` so only the
 * selected backend is evaluated.
 *
 * @param input Audio to transcribe. Returned as-is by `resolveAudioSource` when
 *   it already is an audio source, otherwise normalized using `options.inputFormat`.
 * @param options Recognition options: `engine`, `language`, engine-specific
 *   `options`, and optionally `inputFormat`, `conversionMode` and `signal`.
 * @returns `{ transcript, timeline, language, timing }`, where `language` is the
 *   language code that was passed in and `timing` is `timing.summary()`.
 * @throws {Error} If no language is given, a required credential for the chosen
 *   engine is missing, the engine is unknown, or the engine returns no timeline.
 */
async function recognize(input, options) {
  // Seed the whisper.cpp defaults (notably model "tiny.en") only when whisper.cpp
  // is the engine in use. Merging them unconditionally leaked "tiny.en" into the
  // options of every other engine, so the OpenAI "whisper-1" fallback below could
  // never apply and a whisper.cpp model name was forwarded to remote services.
  const requestedEngine = options?.engine ?? defaultRecognitionOptions.engine;
  const baseOptions = requestedEngine === defaultRecognitionOptions.engine
    ? defaultRecognitionOptions
    : { engine: requestedEngine, options: {} };
  const opts = extendDeep(baseOptions, options);

  const timing = createTiming();
  timing.setMetadata("engine", opts.engine);
  timing.setMetadata(
    "conversionMode",
    opts.conversionMode ?? getConversionMode()
  );

  if (!opts.language) {
    throw new Error("Language must be specified");
  }
  const languageCode = opts.language;
  const shortLanguageCode = getShortLanguageCode(languageCode);
  timing.setMetadata("language", languageCode);

  const source = resolveAudioSource(input, opts);
  timing.setMetadata("inputFormat", source.format);

  let result;
  switch (opts.engine) {
    case "whisper.cpp": {
      const WhisperCppSTT = await import("../recognition/WhisperCppSTT.js");
      timing.setMetadata("model", opts.options.model);
      timing.setMetadata("processors", opts.options.processors ?? 1);
      timing.setMetadata("threads", opts.options.threads ?? 4);
      result = await WhisperCppSTT.recognize(source, {
        ...opts.options,
        language: shortLanguageCode,
        timing
      });
      break;
    }
    case "openai-cloud": {
      const OpenAICloudSTT = await import("../recognition/OpenAICloudSTT.js");
      timing.setMetadata("model", opts.options.model ?? "whisper-1");
      result = await OpenAICloudSTT.recognize(source, shortLanguageCode, {
        ...opts.options,
        timing
      });
      break;
    }
    case "whisper-server": {
      const WhisperServerSTT = await import("../recognition/WhisperServerSTT.js");
      result = await WhisperServerSTT.recognize(
        source,
        shortLanguageCode,
        { ...opts.options, timing }
      );
      break;
    }
    case "google-cloud": {
      const GoogleCloudSTT = await import("../recognition/GoogleCloudSTT.js");
      if (!opts.options.apiKey) {
        throw new Error("Google Cloud API key is required");
      }
      result = await GoogleCloudSTT.recognize(
        source,
        { ...opts.options, timing },
        shortLanguageCode
      );
      break;
    }
    case "microsoft-azure": {
      const AzureCognitiveServicesSTT = await import("../recognition/AzureCognitiveServicesSTT.js");
      if (!opts.options.subscriptionKey) {
        throw new Error("Azure subscription key is required");
      }
      if (!opts.options.serviceRegion) {
        throw new Error("Azure service region is required");
      }
      result = await AzureCognitiveServicesSTT.recognize(
        source,
        {
          subscriptionKey: opts.options.subscriptionKey,
          serviceRegion: opts.options.serviceRegion,
          inputFormat: opts.inputFormat,
          timing
        },
        shortLanguageCode
      );
      break;
    }
    case "amazon-transcribe": {
      const AmazonTranscribeSTT = await import("../recognition/AmazonTranscribeSTT.js");
      // This engine is given the full language code, not the short one.
      result = await AmazonTranscribeSTT.recognize(source, languageCode, {
        region: opts.options.region,
        accessKeyId: opts.options.accessKeyId,
        secretAccessKey: opts.options.secretAccessKey,
        inputFormat: opts.inputFormat,
        timing
      });
      break;
    }
    case "deepgram": {
      const DeepgramSTT = await import("../recognition/DeepgramSTT.js");
      if (!opts.options.apiKey) {
        throw new Error("Deepgram API key is required");
      }
      timing.setMetadata("model", opts.options.model);
      result = await DeepgramSTT.recognize(
        source,
        shortLanguageCode,
        {
          apiKey: opts.options.apiKey,
          model: opts.options.model,
          punctuate: opts.options.punctuate,
          inputFormat: opts.inputFormat,
          timing,
          conversionMode: opts.conversionMode
        },
        opts.signal
      );
      break;
    }
    default: {
      throw new Error(
        `Unknown engine: ${opts.engine}`
      );
    }
  }

  const { transcript, timeline } = result;
  if (!timeline) {
    throw new Error(`No timeline returned from engine ${opts.engine}`);
  }
  // Annotates the timeline's words with their offsets in the transcript
  // (in place, going by the helper's name).
  addWordTextOffsetsToTimelineInPlace(timeline, transcript);
  return {
    transcript,
    timeline,
    language: languageCode,
    timing: timing.summary()
  };
}
160
/**
 * Returns `input` unchanged when `isAudioSource` recognizes it; otherwise
 * converts it with `normalizeToAudioSource`, passing `opts.inputFormat` along.
 */
function resolveAudioSource(input, opts) {
  return isAudioSource(input)
    ? input
    : normalizeToAudioSource(input, opts.inputFormat);
}
166
// Baseline options that `recognize` uses as the base of its `extendDeep` merge
// with the caller's options: local whisper.cpp with the "tiny.en" model.
const defaultRecognitionOptions = {
  engine: "whisper.cpp",
  options: { model: "tiny.en" },
};
172
// Catalogue of the supported recognition engines. Each row is
// [id, name, description, type, acceptsFormats, preferredFormat] and is
// expanded into an object with exactly those keys, in that order.
const recognitionEngines = [
  ["whisper.cpp", "OpenAI Whisper (C++ port)", "Local whisper.cpp binary. Accepts wav, flac, ogg, mp3.", "local", ["wav", "flac", "ogg", "mp3"], "wav"],
  ["whisper-server", "Whisper Server", "whisper.cpp server API. Prefers wav format.", "server", ["wav", "flac", "ogg", "mp3"], "wav"],
  ["openai-cloud", "OpenAI Cloud", "OpenAI cloud API. Accepts wav, flac, mp3, m4a, ogg, webm.", "cloud", ["wav", "flac", "mp3", "m4a", "ogg", "webm"], "mp3"],
  ["google-cloud", "Google Cloud", "Google Cloud Speech-to-Text. Prefers flac format.", "cloud", ["wav", "flac", "mp3", "ogg", "opus", "webm"], "flac"],
  ["microsoft-azure", "Azure Cognitive Services", "Microsoft Azure Speech-to-Text. Requires wav format.", "cloud", ["wav"], "wav"],
  ["amazon-transcribe", "Amazon Transcribe", "Amazon Transcribe streaming. Accepts flac, opus, ogg.", "cloud", ["flac", "opus", "ogg"], "flac"],
  ["deepgram", "Deepgram", "Deepgram API. Accepts most common formats.", "cloud", ["wav", "flac", "mp3", "opus", "ogg", "webm", "m4a"], "wav"],
].map(([id, name, description, type, acceptsFormats, preferredFormat]) => ({
  id,
  name,
  description,
  type,
  acceptsFormats,
  preferredFormat,
}));
230
+ export {
231
+ recognitionEngines,
232
+ recognize
233
+ };