@storyteller-platform/ghost-story 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +611 -0
- package/README.md +18 -0
- package/dist/api/APIOptions.cjs +16 -0
- package/dist/api/APIOptions.d.cts +18 -0
- package/dist/api/APIOptions.d.ts +18 -0
- package/dist/api/APIOptions.js +0 -0
- package/dist/api/Recognition.cjs +263 -0
- package/dist/api/Recognition.d.cts +77 -0
- package/dist/api/Recognition.d.ts +77 -0
- package/dist/api/Recognition.js +233 -0
- package/dist/api/VoiceActivityDetection.cjs +77 -0
- package/dist/api/VoiceActivityDetection.d.cts +24 -0
- package/dist/api/VoiceActivityDetection.d.ts +24 -0
- package/dist/api/VoiceActivityDetection.js +43 -0
- package/dist/audio/AudioConverter.cjs +331 -0
- package/dist/audio/AudioConverter.d.cts +53 -0
- package/dist/audio/AudioConverter.d.ts +53 -0
- package/dist/audio/AudioConverter.js +310 -0
- package/dist/audio/AudioFormat.cjs +151 -0
- package/dist/audio/AudioFormat.d.cts +25 -0
- package/dist/audio/AudioFormat.d.ts +25 -0
- package/dist/audio/AudioFormat.js +123 -0
- package/dist/audio/AudioSource.cjs +119 -0
- package/dist/audio/AudioSource.d.cts +33 -0
- package/dist/audio/AudioSource.d.ts +33 -0
- package/dist/audio/AudioSource.js +88 -0
- package/dist/audio/index.cjs +74 -0
- package/dist/audio/index.d.cts +6 -0
- package/dist/audio/index.d.ts +6 -0
- package/dist/audio/index.js +54 -0
- package/dist/cli/bin.cjs +277 -0
- package/dist/cli/bin.d.cts +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +275 -0
- package/dist/cli/config.cjs +347 -0
- package/dist/cli/config.d.cts +33 -0
- package/dist/cli/config.d.ts +33 -0
- package/dist/cli/config.js +285 -0
- package/dist/cli/install.cjs +334 -0
- package/dist/cli/install.d.cts +62 -0
- package/dist/cli/install.d.ts +62 -0
- package/dist/cli/install.js +316 -0
- package/dist/cli/whisper-server.cjs +172 -0
- package/dist/cli/whisper-server.d.cts +24 -0
- package/dist/cli/whisper-server.d.ts +24 -0
- package/dist/cli/whisper-server.js +152 -0
- package/dist/config.cjs +60 -0
- package/dist/config.d.cts +12 -0
- package/dist/config.d.ts +12 -0
- package/dist/config.js +32 -0
- package/dist/convert.cjs +88 -0
- package/dist/convert.d.cts +12 -0
- package/dist/convert.d.ts +12 -0
- package/dist/convert.js +63 -0
- package/dist/encodings/Ascii.cjs +75 -0
- package/dist/encodings/Ascii.d.cts +13 -0
- package/dist/encodings/Ascii.d.ts +13 -0
- package/dist/encodings/Ascii.js +48 -0
- package/dist/encodings/Base64.cjs +155 -0
- package/dist/encodings/Base64.d.cts +5 -0
- package/dist/encodings/Base64.d.ts +5 -0
- package/dist/encodings/Base64.js +129 -0
- package/dist/encodings/TextEncodingsCommon.cjs +16 -0
- package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
- package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
- package/dist/encodings/TextEncodingsCommon.js +0 -0
- package/dist/index.cjs +153 -0
- package/dist/index.d.cts +15 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.js +140 -0
- package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
- package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
- package/dist/recognition/AmazonTranscribeSTT.js +160 -0
- package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
- package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
- package/dist/recognition/DeepgramSTT.cjs +172 -0
- package/dist/recognition/DeepgramSTT.d.cts +23 -0
- package/dist/recognition/DeepgramSTT.d.ts +23 -0
- package/dist/recognition/DeepgramSTT.js +153 -0
- package/dist/recognition/GoogleCloudSTT.cjs +125 -0
- package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
- package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
- package/dist/recognition/GoogleCloudSTT.js +107 -0
- package/dist/recognition/OpenAICloudSTT.cjs +180 -0
- package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
- package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
- package/dist/recognition/OpenAICloudSTT.js +150 -0
- package/dist/recognition/WhisperCppSTT.cjs +296 -0
- package/dist/recognition/WhisperCppSTT.d.cts +40 -0
- package/dist/recognition/WhisperCppSTT.d.ts +40 -0
- package/dist/recognition/WhisperCppSTT.js +275 -0
- package/dist/recognition/WhisperServerSTT.cjs +119 -0
- package/dist/recognition/WhisperServerSTT.d.cts +24 -0
- package/dist/recognition/WhisperServerSTT.d.ts +24 -0
- package/dist/recognition/WhisperServerSTT.js +105 -0
- package/dist/utilities/FileSystem.cjs +54 -0
- package/dist/utilities/FileSystem.d.cts +3 -0
- package/dist/utilities/FileSystem.d.ts +3 -0
- package/dist/utilities/FileSystem.js +20 -0
- package/dist/utilities/Locale.cjs +46 -0
- package/dist/utilities/Locale.d.cts +9 -0
- package/dist/utilities/Locale.d.ts +9 -0
- package/dist/utilities/Locale.js +20 -0
- package/dist/utilities/ObjectUtilities.cjs +41 -0
- package/dist/utilities/ObjectUtilities.d.cts +3 -0
- package/dist/utilities/ObjectUtilities.d.ts +3 -0
- package/dist/utilities/ObjectUtilities.js +7 -0
- package/dist/utilities/Timeline.cjs +120 -0
- package/dist/utilities/Timeline.d.cts +23 -0
- package/dist/utilities/Timeline.d.ts +23 -0
- package/dist/utilities/Timeline.js +94 -0
- package/dist/utilities/Timing.cjs +287 -0
- package/dist/utilities/Timing.d.cts +64 -0
- package/dist/utilities/Timing.d.ts +64 -0
- package/dist/utilities/Timing.js +256 -0
- package/dist/utilities/WhisperTimeline.cjs +344 -0
- package/dist/utilities/WhisperTimeline.d.cts +86 -0
- package/dist/utilities/WhisperTimeline.d.ts +86 -0
- package/dist/utilities/WhisperTimeline.js +313 -0
- package/dist/vad/ActiveGate.cjs +357 -0
- package/dist/vad/ActiveGate.d.cts +53 -0
- package/dist/vad/ActiveGate.d.ts +53 -0
- package/dist/vad/ActiveGate.js +329 -0
- package/dist/vad/ActiveGateOg.cjs +1366 -0
- package/dist/vad/ActiveGateOg.d.cts +33 -0
- package/dist/vad/ActiveGateOg.d.ts +33 -0
- package/dist/vad/ActiveGateOg.js +1341 -0
- package/dist/vad/Silero.cjs +174 -0
- package/dist/vad/Silero.d.cts +25 -0
- package/dist/vad/Silero.d.ts +25 -0
- package/dist/vad/Silero.js +153 -0
- package/package.json +125 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
var AmazonTranscribeSTT_exports = {};
|
|
30
|
+
__export(AmazonTranscribeSTT_exports, {
|
|
31
|
+
languageCodeDefaultDialects: () => languageCodeDefaultDialects,
|
|
32
|
+
recognize: () => recognize
|
|
33
|
+
});
|
|
34
|
+
module.exports = __toCommonJS(AmazonTranscribeSTT_exports);
|
|
35
|
+
var import_client_transcribe_streaming = require("@aws-sdk/client-transcribe-streaming");
|
|
36
|
+
var import_audio = require("../audio/index.cjs");
|
|
37
|
+
const wordCharacterRegExp = new RegExp("\\p{L}|\\p{N}", "u");
|
|
38
|
+
/**
 * Transcribe audio with the Amazon Transcribe streaming API.
 *
 * FLAC and Opus sources are streamed unchanged; every other format is
 * converted on the fly to 16 kHz mono FLAC before being sent.
 *
 * @param input Raw audio input, or an object already recognised by `isAudioSource`.
 * @param {string} languageCode Language code; resolved by `resolveLanguageCode`.
 * @param options `region`, `accessKeyId`, `secretAccessKey`, plus optional
 *   `inputFormat` (used when normalising raw input) and `timing`.
 * @returns {Promise<{transcript: string, timeline: Array}>} The joined transcript
 *   and one `"word"` timeline entry per item that contains a letter or digit.
 * @throws {Error} If the language code cannot be resolved or the service
 *   response carries no transcript result stream.
 */
async function recognize(input, languageCode, options) {
  const timing = options.timing;
  const source = (0, import_audio.isAudioSource)(input)
    ? input
    : (0, import_audio.normalizeToAudioSource)(input, options.inputFormat);
  const resolvedLanguageCode = resolveLanguageCode(languageCode);

  const conversionNeeded = source.format !== "flac" && source.format !== "opus";
  if (timing) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", conversionNeeded ? "flac" : source.format);
  }

  let audioStream;
  let runConversion = null;
  if (conversionNeeded) {
    const conversion = (0, import_audio.createStreamingConversion)(source, {
      targetFormat: "flac",
      sampleRate: 16e3,
      channels: 1
    });
    audioStream = conversion.stream;
    runConversion = conversion.start;
  } else {
    audioStream = (0, import_audio.toReadStream)(source);
  }

  const streamingTranscribeSdk = await import("@aws-sdk/client-transcribe-streaming");
  const client = new streamingTranscribeSdk.TranscribeStreamingClient({
    region: options.region,
    credentials: {
      accessKeyId: options.accessKeyId,
      secretAccessKey: options.secretAccessKey
    }
  });
  const command = new streamingTranscribeSdk.StartStreamTranscriptionCommand({
    LanguageCode: resolvedLanguageCode,
    // Opus sources are passed through unconverted, so they must not be labelled
    // as FLAC. NOTE(review): assumes "opus" sources are Ogg-encapsulated - confirm.
    MediaEncoding: source.format === "opus" ? "ogg-opus" : "flac",
    // NOTE(review): only converted audio is known to be 16 kHz; passthrough
    // FLAC/Opus sources may use a different sample rate - confirm.
    MediaSampleRateHertz: 16e3,
    AudioStream: createAsyncIterableFromStream(audioStream)
  });

  // Sends the request and collects every finalised (non-partial) result.
  const runUpload = async () => {
    const conversionPromise = runConversion ? Promise.resolve(runConversion()) : null;
    try {
      const resp = await client.send(command);
      if (!resp.TranscriptResultStream) {
        throw new Error("No transcript result stream");
      }
      let collectedTranscript = "";
      const collectedItems = [];
      for await (const event of resp.TranscriptResultStream) {
        if (!event.TranscriptEvent) continue;
        const payload = event.TranscriptEvent.Transcript;
        const results = payload == null ? void 0 : payload.Results;
        if (!(results == null ? void 0 : results.length) || !results[0]) continue;
        const firstResult = results[0];
        const alternatives = firstResult.Alternatives;
        if (!(alternatives == null ? void 0 : alternatives.length)) continue;
        const firstAlternative = alternatives[0];
        if (
          firstResult.IsPartial === false &&
          (firstAlternative == null ? void 0 : firstAlternative.Items) &&
          firstAlternative.Transcript
        ) {
          // Append in place instead of re-spreading the whole list per result.
          for (const item of firstAlternative.Items) {
            collectedItems.push(item);
          }
          collectedTranscript += " " + firstAlternative.Transcript;
        }
      }
      await conversionPromise;
      return { transcript: collectedTranscript, events: collectedItems };
    } catch (error) {
      // The original failure is rethrown; mark the conversion promise as handled
      // so a later conversion failure does not surface as an unhandled rejection.
      if (conversionPromise) conversionPromise.catch(() => {});
      throw error;
    }
  };

  // Run the upload whether or not a timing recorder was supplied; previously a
  // missing `options.timing` skipped the request and returned an empty result.
  const response = timing ? await timing.timeAsync("upload", runUpload) : await runUpload();

  const transcript = response.transcript.replace(/ +/g, " ").trim();
  const timeline = [];
  for (const event of response.events) {
    const text = event.Content;
    // Skip items without any letter or digit (e.g. punctuation).
    if (!text || !wordCharacterRegExp.test(text)) continue;
    const startTime = event.StartTime ?? 0;
    const endTime = event.EndTime ?? 0;
    const confidence = event.Confidence ?? 0;
    const lastEntry = timeline[timeline.length - 1];
    // Stretch the previous word so it ends where this one starts.
    if (lastEntry && startTime) {
      lastEntry.endTime = startTime;
    }
    timeline.push({
      type: "word",
      text,
      startTime,
      endTime,
      confidence
    });
  }
  return { transcript, timeline };
}
|
|
123
|
+
/**
 * Wrap every chunk read from `stream` in the `{ AudioEvent: { AudioChunk } }`
 * envelope and yield the envelopes in the order the chunks arrive.
 *
 * @param stream Anything usable with `for await`.
 */
async function* createAsyncIterableFromStream(stream) {
  for await (const audioChunk of stream) {
    const audioEvent = { AudioChunk: audioChunk };
    yield { AudioEvent: audioEvent };
  }
}
|
|
128
|
+
/**
 * Resolve a caller-supplied language code to one Amazon Transcribe accepts.
 *
 * A code that is one of the SDK's `LanguageCode` values is returned unchanged.
 * A two-letter code is mapped to the first entry of
 * `languageCodeDefaultDialects` that starts with it.
 *
 * @param {string} languageCode Code such as "en-US" or "en".
 * @returns {string} The resolved language code.
 * @throws {Error} If the code matches neither a supported value nor a default dialect.
 */
function resolveLanguageCode(languageCode) {
  // Compare against the enum's values ("en-US"). The `in` operator tests its
  // keys, which the SDK spells like "EN_US", and also matches inherited names.
  const supportedCodes = Object.values(import_client_transcribe_streaming.LanguageCode);
  if (supportedCodes.includes(languageCode)) {
    return languageCode;
  }
  if (languageCode.length === 2) {
    const matchingDialect = languageCodeDefaultDialects.find(
      (dialect) => dialect.startsWith(languageCode)
    );
    if (matchingDialect) {
      return matchingDialect;
    }
  }
  throw new Error(
    `Language code ${languageCode} is not supported by Amazon Transcribe`
  );
}
|
|
144
|
+
// Dialect used when a caller passes only a two-letter language code; the first
// entry that starts with that code is chosen, so there is one entry per language.
const languageCodeDefaultDialects = [
  "af-ZA", "ar-SA", "ca-ES", "cs-CZ", "da-DK", "de-DE", "el-GR", "en-US",
  "es-ES", "eu-ES", "fa-IR", "fi-FI", "fr-FR", "gl-ES", "he-IL", "hi-IN",
  "hr-HR", "id-ID", "it-IT", "ja-JP", "ko-KR", "lv-LV", "ms-MY", "nl-NL",
  "no-NO", "pl-PL", "pt-BR", "ro-RO", "ru-RU", "sk-SK", "so-SO", "sr-RS",
  "sv-SE", "th-TH", "tl-PH", "uk-UA", "vi-VN", "zh-CN"
];
|
|
184
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
185
|
+
0 && (module.exports = {
|
|
186
|
+
languageCodeDefaultDialects,
|
|
187
|
+
recognize
|
|
188
|
+
});
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.cjs';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
|
|
3
|
+
import { Timeline } from '../utilities/Timeline.cjs';
|
|
4
|
+
import { Timing } from '../utilities/Timing.cjs';
|
|
5
|
+
import 'node:fs';
|
|
6
|
+
import 'node:stream';
|
|
7
|
+
|
|
8
|
+
interface AmazonTranscribeOptions {
|
|
9
|
+
region: string;
|
|
10
|
+
accessKeyId: string;
|
|
11
|
+
secretAccessKey: string;
|
|
12
|
+
inputFormat?: AudioFormat | undefined;
|
|
13
|
+
timing?: Timing | undefined;
|
|
14
|
+
}
|
|
15
|
+
declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options: AmazonTranscribeOptions): Promise<{
|
|
16
|
+
transcript: string;
|
|
17
|
+
timeline: Timeline;
|
|
18
|
+
}>;
|
|
19
|
+
declare const languageCodeDefaultDialects: string[];
|
|
20
|
+
|
|
21
|
+
export { type AmazonTranscribeOptions, languageCodeDefaultDialects, recognize };
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { AudioFormat } from '../audio/AudioFormat.js';
|
|
2
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
|
|
3
|
+
import { Timeline } from '../utilities/Timeline.js';
|
|
4
|
+
import { Timing } from '../utilities/Timing.js';
|
|
5
|
+
import 'node:fs';
|
|
6
|
+
import 'node:stream';
|
|
7
|
+
|
|
8
|
+
interface AmazonTranscribeOptions {
|
|
9
|
+
region: string;
|
|
10
|
+
accessKeyId: string;
|
|
11
|
+
secretAccessKey: string;
|
|
12
|
+
inputFormat?: AudioFormat | undefined;
|
|
13
|
+
timing?: Timing | undefined;
|
|
14
|
+
}
|
|
15
|
+
declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options: AmazonTranscribeOptions): Promise<{
|
|
16
|
+
transcript: string;
|
|
17
|
+
timeline: Timeline;
|
|
18
|
+
}>;
|
|
19
|
+
declare const languageCodeDefaultDialects: string[];
|
|
20
|
+
|
|
21
|
+
export { type AmazonTranscribeOptions, languageCodeDefaultDialects, recognize };
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import {
|
|
2
|
+
LanguageCode
|
|
3
|
+
} from "@aws-sdk/client-transcribe-streaming";
|
|
4
|
+
import {
|
|
5
|
+
createStreamingConversion,
|
|
6
|
+
isAudioSource,
|
|
7
|
+
normalizeToAudioSource,
|
|
8
|
+
toReadStream
|
|
9
|
+
} from "../audio/index.js";
|
|
10
|
+
const wordCharacterRegExp = new RegExp("\\p{L}|\\p{N}", "u");
|
|
11
|
+
/**
 * Transcribe audio with the Amazon Transcribe streaming API.
 *
 * FLAC and Opus sources are streamed unchanged; every other format is
 * converted on the fly to 16 kHz mono FLAC before being sent.
 *
 * @param input Raw audio input, or an object already recognised by `isAudioSource`.
 * @param {string} languageCode Language code; resolved by `resolveLanguageCode`.
 * @param options `region`, `accessKeyId`, `secretAccessKey`, plus optional
 *   `inputFormat` (used when normalising raw input) and `timing`.
 * @returns {Promise<{transcript: string, timeline: Array}>} The joined transcript
 *   and one `"word"` timeline entry per item that contains a letter or digit.
 * @throws {Error} If the language code cannot be resolved or the service
 *   response carries no transcript result stream.
 */
async function recognize(input, languageCode, options) {
  const timing = options.timing;
  const source = isAudioSource(input)
    ? input
    : normalizeToAudioSource(input, options.inputFormat);
  const resolvedLanguageCode = resolveLanguageCode(languageCode);

  const conversionNeeded = source.format !== "flac" && source.format !== "opus";
  if (timing) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", conversionNeeded ? "flac" : source.format);
  }

  let audioStream;
  let runConversion = null;
  if (conversionNeeded) {
    const conversion = createStreamingConversion(source, {
      targetFormat: "flac",
      sampleRate: 16e3,
      channels: 1
    });
    audioStream = conversion.stream;
    runConversion = conversion.start;
  } else {
    audioStream = toReadStream(source);
  }

  const streamingTranscribeSdk = await import("@aws-sdk/client-transcribe-streaming");
  const client = new streamingTranscribeSdk.TranscribeStreamingClient({
    region: options.region,
    credentials: {
      accessKeyId: options.accessKeyId,
      secretAccessKey: options.secretAccessKey
    }
  });
  const command = new streamingTranscribeSdk.StartStreamTranscriptionCommand({
    LanguageCode: resolvedLanguageCode,
    // Opus sources are passed through unconverted, so they must not be labelled
    // as FLAC. NOTE(review): assumes "opus" sources are Ogg-encapsulated - confirm.
    MediaEncoding: source.format === "opus" ? "ogg-opus" : "flac",
    // NOTE(review): only converted audio is known to be 16 kHz; passthrough
    // FLAC/Opus sources may use a different sample rate - confirm.
    MediaSampleRateHertz: 16e3,
    AudioStream: createAsyncIterableFromStream(audioStream)
  });

  // Sends the request and collects every finalised (non-partial) result.
  const runUpload = async () => {
    const conversionPromise = runConversion ? Promise.resolve(runConversion()) : null;
    try {
      const resp = await client.send(command);
      if (!resp.TranscriptResultStream) {
        throw new Error("No transcript result stream");
      }
      let collectedTranscript = "";
      const collectedItems = [];
      for await (const event of resp.TranscriptResultStream) {
        if (!event.TranscriptEvent) continue;
        const payload = event.TranscriptEvent.Transcript;
        const results = payload == null ? void 0 : payload.Results;
        if (!(results == null ? void 0 : results.length) || !results[0]) continue;
        const firstResult = results[0];
        const alternatives = firstResult.Alternatives;
        if (!(alternatives == null ? void 0 : alternatives.length)) continue;
        const firstAlternative = alternatives[0];
        if (
          firstResult.IsPartial === false &&
          (firstAlternative == null ? void 0 : firstAlternative.Items) &&
          firstAlternative.Transcript
        ) {
          // Append in place instead of re-spreading the whole list per result.
          for (const item of firstAlternative.Items) {
            collectedItems.push(item);
          }
          collectedTranscript += " " + firstAlternative.Transcript;
        }
      }
      await conversionPromise;
      return { transcript: collectedTranscript, events: collectedItems };
    } catch (error) {
      // The original failure is rethrown; mark the conversion promise as handled
      // so a later conversion failure does not surface as an unhandled rejection.
      if (conversionPromise) conversionPromise.catch(() => {});
      throw error;
    }
  };

  // Run the upload whether or not a timing recorder was supplied; previously a
  // missing `options.timing` skipped the request and returned an empty result.
  const response = timing ? await timing.timeAsync("upload", runUpload) : await runUpload();

  const transcript = response.transcript.replace(/ +/g, " ").trim();
  const timeline = [];
  for (const event of response.events) {
    const text = event.Content;
    // Skip items without any letter or digit (e.g. punctuation).
    if (!text || !wordCharacterRegExp.test(text)) continue;
    const startTime = event.StartTime ?? 0;
    const endTime = event.EndTime ?? 0;
    const confidence = event.Confidence ?? 0;
    const lastEntry = timeline[timeline.length - 1];
    // Stretch the previous word so it ends where this one starts.
    if (lastEntry && startTime) {
      lastEntry.endTime = startTime;
    }
    timeline.push({
      type: "word",
      text,
      startTime,
      endTime,
      confidence
    });
  }
  return { transcript, timeline };
}
|
|
96
|
+
/**
 * Wrap every chunk read from `stream` in the `{ AudioEvent: { AudioChunk } }`
 * envelope and yield the envelopes in the order the chunks arrive.
 *
 * @param stream Anything usable with `for await`.
 */
async function* createAsyncIterableFromStream(stream) {
  for await (const audioChunk of stream) {
    const audioEvent = { AudioChunk: audioChunk };
    yield { AudioEvent: audioEvent };
  }
}
|
|
101
|
+
/**
 * Resolve a caller-supplied language code to one Amazon Transcribe accepts.
 *
 * A code that is one of the SDK's `LanguageCode` values is returned unchanged.
 * A two-letter code is mapped to the first entry of
 * `languageCodeDefaultDialects` that starts with it.
 *
 * @param {string} languageCode Code such as "en-US" or "en".
 * @returns {string} The resolved language code.
 * @throws {Error} If the code matches neither a supported value nor a default dialect.
 */
function resolveLanguageCode(languageCode) {
  // Compare against the enum's values ("en-US"). The `in` operator tests its
  // keys, which the SDK spells like "EN_US", and also matches inherited names.
  const supportedCodes = Object.values(LanguageCode);
  if (supportedCodes.includes(languageCode)) {
    return languageCode;
  }
  if (languageCode.length === 2) {
    const matchingDialect = languageCodeDefaultDialects.find(
      (dialect) => dialect.startsWith(languageCode)
    );
    if (matchingDialect) {
      return matchingDialect;
    }
  }
  throw new Error(
    `Language code ${languageCode} is not supported by Amazon Transcribe`
  );
}
|
|
117
|
+
// Dialect used when a caller passes only a two-letter language code; the first
// entry that starts with that code is chosen, so there is one entry per language.
const languageCodeDefaultDialects = [
  "af-ZA", "ar-SA", "ca-ES", "cs-CZ", "da-DK", "de-DE", "el-GR", "en-US",
  "es-ES", "eu-ES", "fa-IR", "fi-FI", "fr-FR", "gl-ES", "he-IL", "hi-IN",
  "hr-HR", "id-ID", "it-IT", "ja-JP", "ko-KR", "lv-LV", "ms-MY", "nl-NL",
  "no-NO", "pl-PL", "pt-BR", "ro-RO", "ru-RU", "sk-SK", "so-SO", "sr-RS",
  "sv-SE", "th-TH", "tl-PH", "uk-UA", "vi-VN", "zh-CN"
];
|
|
157
|
+
export {
|
|
158
|
+
languageCodeDefaultDialects,
|
|
159
|
+
recognize
|
|
160
|
+
};
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
var AzureCognitiveServicesSTT_exports = {};
|
|
30
|
+
__export(AzureCognitiveServicesSTT_exports, {
|
|
31
|
+
recognize: () => recognize
|
|
32
|
+
});
|
|
33
|
+
module.exports = __toCommonJS(AzureCognitiveServicesSTT_exports);
|
|
34
|
+
var SpeechSDK = __toESM(require("microsoft-cognitiveservices-speech-sdk"), 1);
|
|
35
|
+
var import_audio = require("../audio/index.cjs");
|
|
36
|
+
/**
 * Transcribe audio with the Azure Cognitive Services Speech SDK.
 *
 * The source is first prepared as 16 kHz mono WAV via `prepareWavForService`,
 * then recognised; the prepared resource is always cleaned up afterwards.
 *
 * @param input Raw audio input, or an object already recognised by `isAudioSource`.
 * @param options `subscriptionKey`, `serviceRegion`, plus optional `profanity`
 *   (defaults to `ProfanityOption.Raw`), `inputFormat` and `timing`.
 * @param {string} languageCode Recognition language passed to the Speech SDK.
 * @returns {Promise<{transcript: string, timeline: Array}>} The recognised text
 *   and one `"word"` timeline entry per word of the first N-best candidate.
 */
async function recognize(input, options, languageCode) {
  const timing = options.timing;
  const source = (0, import_audio.isAudioSource)(input)
    ? input
    : (0, import_audio.normalizeToAudioSource)(input, options.inputFormat);
  const conversionNeeded = source.format !== "wav";
  if (timing) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", "wav");
  }
  const doPrepare = () => (0, import_audio.prepareWavForService)(source, { sampleRate: 16e3, channels: 1 });
  const prepared = timing ? await timing.timeAsync("conversion", doPrepare) : await doPrepare();
  try {
    const doRecognition = () => runRecognition(
      prepared.source,
      options.subscriptionKey,
      options.serviceRegion,
      languageCode,
      options.profanity ?? SpeechSDK.ProfanityOption.Raw
    );
    const result = timing ? await timing.timeAsync("upload", doRecognition) : await doRecognition();
    // A result without recognised speech may carry no text, no JSON payload or
    // no NBest list; return an empty transcript/timeline instead of throwing.
    const transcript = result.text ?? "";
    const resultObject = (result.json ? JSON.parse(result.json) : null) ?? {};
    const candidates = Array.isArray(resultObject.NBest) ? resultObject.NBest : [];
    const bestResult = candidates[0];
    const words = (bestResult == null ? void 0 : bestResult.Words) ?? [];
    const timeline = [];
    for (const wordEntry of words) {
      // Offset and Duration are divided by 1e7 to get the timeline values
      // (presumably 100-nanosecond ticks to seconds - confirm against SDK docs).
      const startTime = wordEntry.Offset / 1e7;
      const endTime = (wordEntry.Offset + wordEntry.Duration) / 1e7;
      timeline.push({ type: "word", text: wordEntry.Word, startTime, endTime });
    }
    return { transcript, timeline };
  } finally {
    await prepared.cleanup();
  }
}
|
|
68
|
+
/**
 * Feed `source` into an Azure push stream and run a single
 * `recognizeOnceAsync` call over it.
 *
 * @param source Audio source readable through `toReadStream`; the push stream is
 *   declared as 16 kHz, 16-bit, mono PCM.
 * @param {string} subscriptionKey Azure Speech subscription key.
 * @param {string} serviceRegion Azure Speech service region.
 * @param {string} languageCode Recognition language.
 * @param profanity Value handed to `SpeechConfig.setProfanity`.
 * @returns {Promise<object>} The result passed to the SDK's success callback.
 * @throws {Error} If reading the source fails or the SDK reports an error.
 */
async function runRecognition(source, subscriptionKey, serviceRegion, languageCode, profanity) {
  const audioFormat = SpeechSDK.AudioStreamFormat.getWaveFormat(
    16e3,
    16,
    1,
    SpeechSDK.AudioFormatTag.PCM
  );
  const pushStream = SpeechSDK.AudioInputStream.createPushStream(audioFormat);
  const readable = (0, import_audio.toReadStream)(source);

  // Copies each chunk into a fresh ArrayBuffer for the push stream, which is
  // closed once the source ends or fails.
  const streamPromise = new Promise((resolve, reject) => {
    readable.on("data", (chunk) => {
      const arrayBuffer = new ArrayBuffer(chunk.length);
      const view = new Uint8Array(arrayBuffer);
      chunk.copy(view);
      pushStream.write(arrayBuffer);
    }).on("end", () => {
      pushStream.close();
      resolve();
    }).on("error", (err) => {
      pushStream.close();
      reject(err);
    });
  });

  const recognitionPromise = new Promise((resolve, reject) => {
    const audioConfig = SpeechSDK.AudioConfig.fromStreamInput(pushStream);
    const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(
      subscriptionKey,
      serviceRegion
    );
    speechConfig.speechRecognitionLanguage = languageCode;
    speechConfig.setProfanity(profanity);
    speechConfig.requestWordLevelTimestamps();
    speechConfig.outputFormat = SpeechSDK.OutputFormat.Detailed;
    const recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
    recognizer.recognizeOnceAsync(
      (result) => {
        recognizer.close();
        resolve(result);
      },
      (error) => {
        recognizer.close();
        reject(new Error(error));
      }
    );
  });

  try {
    // Observe both promises from the start: awaiting them one after the other
    // left whichever was not yet awaited without a handler, so its rejection
    // surfaced as an unhandled rejection.
    const [, result] = await Promise.all([streamPromise, recognitionPromise]);
    return result;
  } catch (error) {
    // Stop reading the source once either side has failed.
    readable.destroy();
    throw error;
  }
}
|
|
121
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
122
|
+
0 && (module.exports = {
|
|
123
|
+
recognize
|
|
124
|
+
});
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk';
|
|
2
|
+
import { AudioFormat } from '../audio/AudioFormat.cjs';
|
|
3
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
|
|
4
|
+
import { Timeline } from '../utilities/Timeline.cjs';
|
|
5
|
+
import { Timing } from '../utilities/Timing.cjs';
|
|
6
|
+
import 'node:fs';
|
|
7
|
+
import 'node:stream';
|
|
8
|
+
|
|
9
|
+
interface AzureSTTOptions {
|
|
10
|
+
subscriptionKey: string;
|
|
11
|
+
serviceRegion: string;
|
|
12
|
+
profanity?: SpeechSDK.ProfanityOption | undefined;
|
|
13
|
+
inputFormat?: AudioFormat | undefined;
|
|
14
|
+
timing?: Timing | undefined;
|
|
15
|
+
}
|
|
16
|
+
declare function recognize(input: RawAudioInput | AudioSource, options: AzureSTTOptions, languageCode: string): Promise<{
|
|
17
|
+
transcript: string;
|
|
18
|
+
timeline: Timeline;
|
|
19
|
+
}>;
|
|
20
|
+
|
|
21
|
+
export { type AzureSTTOptions, recognize };
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk';
|
|
2
|
+
import { AudioFormat } from '../audio/AudioFormat.js';
|
|
3
|
+
import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
|
|
4
|
+
import { Timeline } from '../utilities/Timeline.js';
|
|
5
|
+
import { Timing } from '../utilities/Timing.js';
|
|
6
|
+
import 'node:fs';
|
|
7
|
+
import 'node:stream';
|
|
8
|
+
|
|
9
|
+
interface AzureSTTOptions {
|
|
10
|
+
subscriptionKey: string;
|
|
11
|
+
serviceRegion: string;
|
|
12
|
+
profanity?: SpeechSDK.ProfanityOption | undefined;
|
|
13
|
+
inputFormat?: AudioFormat | undefined;
|
|
14
|
+
timing?: Timing | undefined;
|
|
15
|
+
}
|
|
16
|
+
declare function recognize(input: RawAudioInput | AudioSource, options: AzureSTTOptions, languageCode: string): Promise<{
|
|
17
|
+
transcript: string;
|
|
18
|
+
timeline: Timeline;
|
|
19
|
+
}>;
|
|
20
|
+
|
|
21
|
+
export { type AzureSTTOptions, recognize };
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import * as SpeechSDK from "microsoft-cognitiveservices-speech-sdk";
|
|
2
|
+
import {
|
|
3
|
+
isAudioSource,
|
|
4
|
+
normalizeToAudioSource,
|
|
5
|
+
prepareWavForService,
|
|
6
|
+
toReadStream
|
|
7
|
+
} from "../audio/index.js";
|
|
8
|
+
/**
 * Transcribe audio with the Azure Cognitive Services Speech SDK.
 *
 * The source is first prepared as 16 kHz mono WAV via `prepareWavForService`,
 * then recognised; the prepared resource is always cleaned up afterwards.
 *
 * @param input Raw audio input, or an object already recognised by `isAudioSource`.
 * @param options `subscriptionKey`, `serviceRegion`, plus optional `profanity`
 *   (defaults to `ProfanityOption.Raw`), `inputFormat` and `timing`.
 * @param {string} languageCode Recognition language passed to the Speech SDK.
 * @returns {Promise<{transcript: string, timeline: Array}>} The recognised text
 *   and one `"word"` timeline entry per word of the first N-best candidate.
 */
async function recognize(input, options, languageCode) {
  const timing = options.timing;
  const source = isAudioSource(input)
    ? input
    : normalizeToAudioSource(input, options.inputFormat);
  const conversionNeeded = source.format !== "wav";
  if (timing) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", "wav");
  }
  const doPrepare = () => prepareWavForService(source, { sampleRate: 16e3, channels: 1 });
  const prepared = timing ? await timing.timeAsync("conversion", doPrepare) : await doPrepare();
  try {
    const doRecognition = () => runRecognition(
      prepared.source,
      options.subscriptionKey,
      options.serviceRegion,
      languageCode,
      options.profanity ?? SpeechSDK.ProfanityOption.Raw
    );
    const result = timing ? await timing.timeAsync("upload", doRecognition) : await doRecognition();
    // A result without recognised speech may carry no text, no JSON payload or
    // no NBest list; return an empty transcript/timeline instead of throwing.
    const transcript = result.text ?? "";
    const resultObject = (result.json ? JSON.parse(result.json) : null) ?? {};
    const candidates = Array.isArray(resultObject.NBest) ? resultObject.NBest : [];
    const bestResult = candidates[0];
    const words = (bestResult == null ? void 0 : bestResult.Words) ?? [];
    const timeline = [];
    for (const wordEntry of words) {
      // Offset and Duration are divided by 1e7 to get the timeline values
      // (presumably 100-nanosecond ticks to seconds - confirm against SDK docs).
      const startTime = wordEntry.Offset / 1e7;
      const endTime = (wordEntry.Offset + wordEntry.Duration) / 1e7;
      timeline.push({ type: "word", text: wordEntry.Word, startTime, endTime });
    }
    return { transcript, timeline };
  } finally {
    await prepared.cleanup();
  }
}
|
|
40
|
+
/**
 * Feed `source` into an Azure push stream and run a single
 * `recognizeOnceAsync` call over it.
 *
 * @param source Audio source readable through `toReadStream`; the push stream is
 *   declared as 16 kHz, 16-bit, mono PCM.
 * @param {string} subscriptionKey Azure Speech subscription key.
 * @param {string} serviceRegion Azure Speech service region.
 * @param {string} languageCode Recognition language.
 * @param profanity Value handed to `SpeechConfig.setProfanity`.
 * @returns {Promise<object>} The result passed to the SDK's success callback.
 * @throws {Error} If reading the source fails or the SDK reports an error.
 */
async function runRecognition(source, subscriptionKey, serviceRegion, languageCode, profanity) {
  const audioFormat = SpeechSDK.AudioStreamFormat.getWaveFormat(
    16e3,
    16,
    1,
    SpeechSDK.AudioFormatTag.PCM
  );
  const pushStream = SpeechSDK.AudioInputStream.createPushStream(audioFormat);
  const readable = toReadStream(source);

  // Copies each chunk into a fresh ArrayBuffer for the push stream, which is
  // closed once the source ends or fails.
  const streamPromise = new Promise((resolve, reject) => {
    readable.on("data", (chunk) => {
      const arrayBuffer = new ArrayBuffer(chunk.length);
      const view = new Uint8Array(arrayBuffer);
      chunk.copy(view);
      pushStream.write(arrayBuffer);
    }).on("end", () => {
      pushStream.close();
      resolve();
    }).on("error", (err) => {
      pushStream.close();
      reject(err);
    });
  });

  const recognitionPromise = new Promise((resolve, reject) => {
    const audioConfig = SpeechSDK.AudioConfig.fromStreamInput(pushStream);
    const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(
      subscriptionKey,
      serviceRegion
    );
    speechConfig.speechRecognitionLanguage = languageCode;
    speechConfig.setProfanity(profanity);
    speechConfig.requestWordLevelTimestamps();
    speechConfig.outputFormat = SpeechSDK.OutputFormat.Detailed;
    const recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
    recognizer.recognizeOnceAsync(
      (result) => {
        recognizer.close();
        resolve(result);
      },
      (error) => {
        recognizer.close();
        reject(new Error(error));
      }
    );
  });

  try {
    // Observe both promises from the start: awaiting them one after the other
    // left whichever was not yet awaited without a handler, so its rejection
    // surfaced as an unhandled rejection.
    const [, result] = await Promise.all([streamPromise, recognitionPromise]);
    return result;
  } catch (error) {
    // Stop reading the source once either side has failed.
    readable.destroy();
    throw error;
  }
}
|
|
93
|
+
export {
|
|
94
|
+
recognize
|
|
95
|
+
};
|