@storyteller-platform/ghost-story 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/LICENSE.md +611 -0
  2. package/README.md +18 -0
  3. package/dist/api/APIOptions.cjs +16 -0
  4. package/dist/api/APIOptions.d.cts +18 -0
  5. package/dist/api/APIOptions.d.ts +18 -0
  6. package/dist/api/APIOptions.js +0 -0
  7. package/dist/api/Recognition.cjs +263 -0
  8. package/dist/api/Recognition.d.cts +77 -0
  9. package/dist/api/Recognition.d.ts +77 -0
  10. package/dist/api/Recognition.js +233 -0
  11. package/dist/api/VoiceActivityDetection.cjs +77 -0
  12. package/dist/api/VoiceActivityDetection.d.cts +24 -0
  13. package/dist/api/VoiceActivityDetection.d.ts +24 -0
  14. package/dist/api/VoiceActivityDetection.js +43 -0
  15. package/dist/audio/AudioConverter.cjs +331 -0
  16. package/dist/audio/AudioConverter.d.cts +53 -0
  17. package/dist/audio/AudioConverter.d.ts +53 -0
  18. package/dist/audio/AudioConverter.js +310 -0
  19. package/dist/audio/AudioFormat.cjs +151 -0
  20. package/dist/audio/AudioFormat.d.cts +25 -0
  21. package/dist/audio/AudioFormat.d.ts +25 -0
  22. package/dist/audio/AudioFormat.js +123 -0
  23. package/dist/audio/AudioSource.cjs +119 -0
  24. package/dist/audio/AudioSource.d.cts +33 -0
  25. package/dist/audio/AudioSource.d.ts +33 -0
  26. package/dist/audio/AudioSource.js +88 -0
  27. package/dist/audio/index.cjs +74 -0
  28. package/dist/audio/index.d.cts +6 -0
  29. package/dist/audio/index.d.ts +6 -0
  30. package/dist/audio/index.js +54 -0
  31. package/dist/cli/bin.cjs +277 -0
  32. package/dist/cli/bin.d.cts +1 -0
  33. package/dist/cli/bin.d.ts +1 -0
  34. package/dist/cli/bin.js +275 -0
  35. package/dist/cli/config.cjs +347 -0
  36. package/dist/cli/config.d.cts +33 -0
  37. package/dist/cli/config.d.ts +33 -0
  38. package/dist/cli/config.js +285 -0
  39. package/dist/cli/install.cjs +334 -0
  40. package/dist/cli/install.d.cts +62 -0
  41. package/dist/cli/install.d.ts +62 -0
  42. package/dist/cli/install.js +316 -0
  43. package/dist/cli/whisper-server.cjs +172 -0
  44. package/dist/cli/whisper-server.d.cts +24 -0
  45. package/dist/cli/whisper-server.d.ts +24 -0
  46. package/dist/cli/whisper-server.js +152 -0
  47. package/dist/config.cjs +60 -0
  48. package/dist/config.d.cts +12 -0
  49. package/dist/config.d.ts +12 -0
  50. package/dist/config.js +32 -0
  51. package/dist/convert.cjs +88 -0
  52. package/dist/convert.d.cts +12 -0
  53. package/dist/convert.d.ts +12 -0
  54. package/dist/convert.js +63 -0
  55. package/dist/encodings/Ascii.cjs +75 -0
  56. package/dist/encodings/Ascii.d.cts +13 -0
  57. package/dist/encodings/Ascii.d.ts +13 -0
  58. package/dist/encodings/Ascii.js +48 -0
  59. package/dist/encodings/Base64.cjs +155 -0
  60. package/dist/encodings/Base64.d.cts +5 -0
  61. package/dist/encodings/Base64.d.ts +5 -0
  62. package/dist/encodings/Base64.js +129 -0
  63. package/dist/encodings/TextEncodingsCommon.cjs +16 -0
  64. package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
  65. package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
  66. package/dist/encodings/TextEncodingsCommon.js +0 -0
  67. package/dist/index.cjs +153 -0
  68. package/dist/index.d.cts +15 -0
  69. package/dist/index.d.ts +15 -0
  70. package/dist/index.js +140 -0
  71. package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
  72. package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
  73. package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
  74. package/dist/recognition/AmazonTranscribeSTT.js +160 -0
  75. package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
  76. package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
  77. package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
  78. package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
  79. package/dist/recognition/DeepgramSTT.cjs +172 -0
  80. package/dist/recognition/DeepgramSTT.d.cts +23 -0
  81. package/dist/recognition/DeepgramSTT.d.ts +23 -0
  82. package/dist/recognition/DeepgramSTT.js +153 -0
  83. package/dist/recognition/GoogleCloudSTT.cjs +125 -0
  84. package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
  85. package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
  86. package/dist/recognition/GoogleCloudSTT.js +107 -0
  87. package/dist/recognition/OpenAICloudSTT.cjs +180 -0
  88. package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
  89. package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
  90. package/dist/recognition/OpenAICloudSTT.js +150 -0
  91. package/dist/recognition/WhisperCppSTT.cjs +296 -0
  92. package/dist/recognition/WhisperCppSTT.d.cts +40 -0
  93. package/dist/recognition/WhisperCppSTT.d.ts +40 -0
  94. package/dist/recognition/WhisperCppSTT.js +275 -0
  95. package/dist/recognition/WhisperServerSTT.cjs +119 -0
  96. package/dist/recognition/WhisperServerSTT.d.cts +24 -0
  97. package/dist/recognition/WhisperServerSTT.d.ts +24 -0
  98. package/dist/recognition/WhisperServerSTT.js +105 -0
  99. package/dist/utilities/FileSystem.cjs +54 -0
  100. package/dist/utilities/FileSystem.d.cts +3 -0
  101. package/dist/utilities/FileSystem.d.ts +3 -0
  102. package/dist/utilities/FileSystem.js +20 -0
  103. package/dist/utilities/Locale.cjs +46 -0
  104. package/dist/utilities/Locale.d.cts +9 -0
  105. package/dist/utilities/Locale.d.ts +9 -0
  106. package/dist/utilities/Locale.js +20 -0
  107. package/dist/utilities/ObjectUtilities.cjs +41 -0
  108. package/dist/utilities/ObjectUtilities.d.cts +3 -0
  109. package/dist/utilities/ObjectUtilities.d.ts +3 -0
  110. package/dist/utilities/ObjectUtilities.js +7 -0
  111. package/dist/utilities/Timeline.cjs +120 -0
  112. package/dist/utilities/Timeline.d.cts +23 -0
  113. package/dist/utilities/Timeline.d.ts +23 -0
  114. package/dist/utilities/Timeline.js +94 -0
  115. package/dist/utilities/Timing.cjs +287 -0
  116. package/dist/utilities/Timing.d.cts +64 -0
  117. package/dist/utilities/Timing.d.ts +64 -0
  118. package/dist/utilities/Timing.js +256 -0
  119. package/dist/utilities/WhisperTimeline.cjs +344 -0
  120. package/dist/utilities/WhisperTimeline.d.cts +86 -0
  121. package/dist/utilities/WhisperTimeline.d.ts +86 -0
  122. package/dist/utilities/WhisperTimeline.js +313 -0
  123. package/dist/vad/ActiveGate.cjs +357 -0
  124. package/dist/vad/ActiveGate.d.cts +53 -0
  125. package/dist/vad/ActiveGate.d.ts +53 -0
  126. package/dist/vad/ActiveGate.js +329 -0
  127. package/dist/vad/ActiveGateOg.cjs +1366 -0
  128. package/dist/vad/ActiveGateOg.d.cts +33 -0
  129. package/dist/vad/ActiveGateOg.d.ts +33 -0
  130. package/dist/vad/ActiveGateOg.js +1341 -0
  131. package/dist/vad/Silero.cjs +174 -0
  132. package/dist/vad/Silero.d.cts +25 -0
  133. package/dist/vad/Silero.d.ts +25 -0
  134. package/dist/vad/Silero.js +153 -0
  135. package/package.json +125 -0
@@ -0,0 +1,188 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+ var AmazonTranscribeSTT_exports = {};
30
+ __export(AmazonTranscribeSTT_exports, {
31
+ languageCodeDefaultDialects: () => languageCodeDefaultDialects,
32
+ recognize: () => recognize
33
+ });
34
+ module.exports = __toCommonJS(AmazonTranscribeSTT_exports);
35
+ var import_client_transcribe_streaming = require("@aws-sdk/client-transcribe-streaming");
36
+ var import_audio = require("../audio/index.cjs");
37
+ const wordCharacterRegExp = new RegExp("\\p{L}|\\p{N}", "u");
/**
 * Transcribes audio with Amazon Transcribe streaming and builds a word-level
 * timeline from the finalized (non-partial) results.
 *
 * @param input Audio source, or raw audio input that is normalized using
 *   `options.inputFormat`.
 * @param {string} languageCode Language code; resolved via `resolveLanguageCode`.
 * @param options `region`, `accessKeyId`, `secretAccessKey`, plus optional
 *   `inputFormat` and `timing`.
 * @returns {Promise<{transcript: string, timeline: object[]}>} The
 *   whitespace-normalized transcript and one `"word"` entry per returned item
 *   that contains a letter or digit.
 * @throws {Error} When the language code cannot be resolved or the service
 *   response carries no transcript result stream.
 */
async function recognize(input, languageCode, options) {
  const timing = options.timing;
  const source = (0, import_audio.isAudioSource)(input) ? input : (0, import_audio.normalizeToAudioSource)(input, options.inputFormat);
  const resolvedLanguageCode = resolveLanguageCode(languageCode);

  // FLAC and Opus inputs are streamed as-is; everything else is converted to FLAC.
  const conversionNeeded = source.format !== "flac" && source.format !== "opus";
  if (timing != null) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", conversionNeeded ? "flac" : source.format);
  }

  let audioStream;
  let runConversion = null;
  if (conversionNeeded) {
    const conversion = (0, import_audio.createStreamingConversion)(source, {
      targetFormat: "flac",
      sampleRate: 16e3,
      channels: 1
    });
    audioStream = conversion.stream;
    runConversion = conversion.start;
  } else {
    audioStream = (0, import_audio.toReadStream)(source);
  }

  const streamingTranscribeSdk = await import("@aws-sdk/client-transcribe-streaming");
  const client = new streamingTranscribeSdk.TranscribeStreamingClient({
    region: options.region,
    credentials: {
      accessKeyId: options.accessKeyId,
      secretAccessKey: options.secretAccessKey
    }
  });
  const params = {
    LanguageCode: resolvedLanguageCode,
    // Pass-through Opus must not be announced as FLAC; "ogg-opus" is the
    // service's Opus encoding.
    // NOTE(review): assumes "opus" sources are Ogg-wrapped - confirm.
    MediaEncoding: source.format === "opus" ? "ogg-opus" : "flac",
    // NOTE(review): pass-through input is not resampled here, so this rate is
    // only guaranteed for converted audio - confirm against callers.
    MediaSampleRateHertz: 16e3,
    AudioStream: createAsyncIterableFromStream(audioStream)
  };
  const command = new streamingTranscribeSdk.StartStreamTranscriptionCommand(
    params
  );

  // Sends the audio and collects every finalized result.
  const transcribe = async () => {
    const conversionPromise = runConversion ? runConversion() : undefined;
    const resp = await client.send(command);
    if (!resp.TranscriptResultStream) {
      throw new Error("No transcript result stream");
    }
    const transcriptParts = [];
    const items = [];
    for await (const event of resp.TranscriptResultStream) {
      const transcriptEvent = event.TranscriptEvent;
      if (!transcriptEvent) {
        continue;
      }
      const results = transcriptEvent.Transcript == null ? undefined : transcriptEvent.Transcript.Results;
      const firstResult = results == null ? undefined : results[0];
      if (!firstResult) {
        continue;
      }
      const alternatives = firstResult.Alternatives;
      const firstAlternative = alternatives == null ? undefined : alternatives[0];
      // Partial results are superseded by later events; keep only final ones.
      if (firstResult.IsPartial === false && firstAlternative && firstAlternative.Items && firstAlternative.Transcript) {
        items.push(...firstAlternative.Items);
        transcriptParts.push(firstAlternative.Transcript);
      }
    }
    await conversionPromise;
    return [{ transcript: transcriptParts.join(" "), events: items }];
  };

  // `timing` is optional: transcription must run whether or not it is supplied.
  const [response] = (timing != null ? await timing.timeAsync("upload", transcribe) : await transcribe()) ?? [{ transcript: "", events: [] }];

  const transcript = response.transcript.replace(/ +/g, " ").trim();
  const timeline = [];
  for (const event of response.events) {
    const text = event.Content;
    // Skip items without any letter or digit (e.g. punctuation).
    if (!text || !wordCharacterRegExp.test(text)) {
      continue;
    }
    const startTime = event.StartTime ?? 0;
    const endTime = event.EndTime ?? 0;
    const confidence = event.Confidence ?? 0;
    const lastEntry = timeline[timeline.length - 1];
    if (lastEntry && startTime) {
      // Stretch the previous word so it ends where this one starts.
      lastEntry.endTime = startTime;
    }
    timeline.push({
      type: "word",
      text,
      startTime,
      endTime,
      confidence
    });
  }
  return { transcript, timeline };
}
/**
 * Wraps every chunk read from `stream` in the
 * `{ AudioEvent: { AudioChunk } }` envelope, yielding one envelope per chunk.
 *
 * @param stream Iterable or async-iterable source of audio chunks.
 */
async function* createAsyncIterableFromStream(stream) {
  const toAudioEvent = (audioChunk) => ({ AudioEvent: { AudioChunk: audioChunk } });
  for await (const audioChunk of stream) {
    yield toAudioEvent(audioChunk);
  }
}
/**
 * Resolves a caller-supplied language code to one accepted by Amazon Transcribe.
 *
 * Accepts, in order: a code that is one of the SDK `LanguageCode` values
 * (e.g. "en-US"), a `LanguageCode` member name (e.g. "EN_US", mapped to its
 * value), or a two-letter code, which is mapped to the first entry of
 * `languageCodeDefaultDialects` that starts with it.
 *
 * @param {string} languageCode
 * @returns {string} The resolved language code.
 * @throws {Error} If the code cannot be resolved.
 */
function resolveLanguageCode(languageCode) {
  const knownCodes = import_client_transcribe_streaming.LanguageCode;
  // The enum maps member names (EN_US) to codes (en-US), so the code must be
  // looked up among the values; `in` would only test member names.
  if (Object.values(knownCodes).includes(languageCode)) {
    return languageCode;
  }
  // Member names were accepted before; keep accepting them, but hand back the
  // code the service understands. Own-property check excludes inherited names.
  if (Object.prototype.hasOwnProperty.call(knownCodes, languageCode)) {
    return knownCodes[languageCode];
  }
  if (languageCode.length === 2) {
    const matchingDialect = languageCodeDefaultDialects.find(
      (value) => value.startsWith(languageCode)
    );
    if (matchingDialect) {
      return matchingDialect;
    }
  }
  throw new Error(
    `Language code ${languageCode} is not supported by Amazon Transcribe`
  );
}
144
+ const languageCodeDefaultDialects = [
145
+ "af-ZA",
146
+ "ar-SA",
147
+ "ca-ES",
148
+ "cs-CZ",
149
+ "da-DK",
150
+ "de-DE",
151
+ "el-GR",
152
+ "en-US",
153
+ "es-ES",
154
+ "eu-ES",
155
+ "fa-IR",
156
+ "fi-FI",
157
+ "fr-FR",
158
+ "gl-ES",
159
+ "he-IL",
160
+ "hi-IN",
161
+ "hr-HR",
162
+ "id-ID",
163
+ "it-IT",
164
+ "ja-JP",
165
+ "ko-KR",
166
+ "lv-LV",
167
+ "ms-MY",
168
+ "nl-NL",
169
+ "no-NO",
170
+ "pl-PL",
171
+ "pt-BR",
172
+ "ro-RO",
173
+ "ru-RU",
174
+ "sk-SK",
175
+ "so-SO",
176
+ "sr-RS",
177
+ "sv-SE",
178
+ "th-TH",
179
+ "tl-PH",
180
+ "uk-UA",
181
+ "vi-VN",
182
+ "zh-CN"
183
+ ];
184
+ // Annotate the CommonJS export names for ESM import in node:
185
+ 0 && (module.exports = {
186
+ languageCodeDefaultDialects,
187
+ recognize
188
+ });
@@ -0,0 +1,21 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.cjs';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
3
+ import { Timeline } from '../utilities/Timeline.cjs';
4
+ import { Timing } from '../utilities/Timing.cjs';
5
+ import 'node:fs';
6
+ import 'node:stream';
7
+
8
+ interface AmazonTranscribeOptions {
9
+ region: string;
10
+ accessKeyId: string;
11
+ secretAccessKey: string;
12
+ inputFormat?: AudioFormat | undefined;
13
+ timing?: Timing | undefined;
14
+ }
15
+ declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options: AmazonTranscribeOptions): Promise<{
16
+ transcript: string;
17
+ timeline: Timeline;
18
+ }>;
19
+ declare const languageCodeDefaultDialects: string[];
20
+
21
+ export { type AmazonTranscribeOptions, languageCodeDefaultDialects, recognize };
@@ -0,0 +1,21 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.js';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
3
+ import { Timeline } from '../utilities/Timeline.js';
4
+ import { Timing } from '../utilities/Timing.js';
5
+ import 'node:fs';
6
+ import 'node:stream';
7
+
8
+ interface AmazonTranscribeOptions {
9
+ region: string;
10
+ accessKeyId: string;
11
+ secretAccessKey: string;
12
+ inputFormat?: AudioFormat | undefined;
13
+ timing?: Timing | undefined;
14
+ }
15
+ declare function recognize(input: RawAudioInput | AudioSource, languageCode: string, options: AmazonTranscribeOptions): Promise<{
16
+ transcript: string;
17
+ timeline: Timeline;
18
+ }>;
19
+ declare const languageCodeDefaultDialects: string[];
20
+
21
+ export { type AmazonTranscribeOptions, languageCodeDefaultDialects, recognize };
@@ -0,0 +1,160 @@
1
+ import {
2
+ LanguageCode
3
+ } from "@aws-sdk/client-transcribe-streaming";
4
+ import {
5
+ createStreamingConversion,
6
+ isAudioSource,
7
+ normalizeToAudioSource,
8
+ toReadStream
9
+ } from "../audio/index.js";
10
+ const wordCharacterRegExp = new RegExp("\\p{L}|\\p{N}", "u");
/**
 * Transcribes audio with Amazon Transcribe streaming and builds a word-level
 * timeline from the finalized (non-partial) results.
 *
 * @param input Audio source, or raw audio input that is normalized using
 *   `options.inputFormat`.
 * @param {string} languageCode Language code; resolved via `resolveLanguageCode`.
 * @param options `region`, `accessKeyId`, `secretAccessKey`, plus optional
 *   `inputFormat` and `timing`.
 * @returns {Promise<{transcript: string, timeline: object[]}>} The
 *   whitespace-normalized transcript and one `"word"` entry per returned item
 *   that contains a letter or digit.
 * @throws {Error} When the language code cannot be resolved or the service
 *   response carries no transcript result stream.
 */
async function recognize(input, languageCode, options) {
  const timing = options.timing;
  const source = isAudioSource(input) ? input : normalizeToAudioSource(input, options.inputFormat);
  const resolvedLanguageCode = resolveLanguageCode(languageCode);

  // FLAC and Opus inputs are streamed as-is; everything else is converted to FLAC.
  const conversionNeeded = source.format !== "flac" && source.format !== "opus";
  if (timing != null) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", conversionNeeded ? "flac" : source.format);
  }

  let audioStream;
  let runConversion = null;
  if (conversionNeeded) {
    const conversion = createStreamingConversion(source, {
      targetFormat: "flac",
      sampleRate: 16e3,
      channels: 1
    });
    audioStream = conversion.stream;
    runConversion = conversion.start;
  } else {
    audioStream = toReadStream(source);
  }

  const streamingTranscribeSdk = await import("@aws-sdk/client-transcribe-streaming");
  const client = new streamingTranscribeSdk.TranscribeStreamingClient({
    region: options.region,
    credentials: {
      accessKeyId: options.accessKeyId,
      secretAccessKey: options.secretAccessKey
    }
  });
  const params = {
    LanguageCode: resolvedLanguageCode,
    // Pass-through Opus must not be announced as FLAC; "ogg-opus" is the
    // service's Opus encoding.
    // NOTE(review): assumes "opus" sources are Ogg-wrapped - confirm.
    MediaEncoding: source.format === "opus" ? "ogg-opus" : "flac",
    // NOTE(review): pass-through input is not resampled here, so this rate is
    // only guaranteed for converted audio - confirm against callers.
    MediaSampleRateHertz: 16e3,
    AudioStream: createAsyncIterableFromStream(audioStream)
  };
  const command = new streamingTranscribeSdk.StartStreamTranscriptionCommand(
    params
  );

  // Sends the audio and collects every finalized result.
  const transcribe = async () => {
    const conversionPromise = runConversion ? runConversion() : undefined;
    const resp = await client.send(command);
    if (!resp.TranscriptResultStream) {
      throw new Error("No transcript result stream");
    }
    const transcriptParts = [];
    const items = [];
    for await (const event of resp.TranscriptResultStream) {
      const transcriptEvent = event.TranscriptEvent;
      if (!transcriptEvent) {
        continue;
      }
      const results = transcriptEvent.Transcript == null ? undefined : transcriptEvent.Transcript.Results;
      const firstResult = results == null ? undefined : results[0];
      if (!firstResult) {
        continue;
      }
      const alternatives = firstResult.Alternatives;
      const firstAlternative = alternatives == null ? undefined : alternatives[0];
      // Partial results are superseded by later events; keep only final ones.
      if (firstResult.IsPartial === false && firstAlternative && firstAlternative.Items && firstAlternative.Transcript) {
        items.push(...firstAlternative.Items);
        transcriptParts.push(firstAlternative.Transcript);
      }
    }
    await conversionPromise;
    return [{ transcript: transcriptParts.join(" "), events: items }];
  };

  // `timing` is optional: transcription must run whether or not it is supplied.
  const [response] = (timing != null ? await timing.timeAsync("upload", transcribe) : await transcribe()) ?? [{ transcript: "", events: [] }];

  const transcript = response.transcript.replace(/ +/g, " ").trim();
  const timeline = [];
  for (const event of response.events) {
    const text = event.Content;
    // Skip items without any letter or digit (e.g. punctuation).
    if (!text || !wordCharacterRegExp.test(text)) {
      continue;
    }
    const startTime = event.StartTime ?? 0;
    const endTime = event.EndTime ?? 0;
    const confidence = event.Confidence ?? 0;
    const lastEntry = timeline[timeline.length - 1];
    if (lastEntry && startTime) {
      // Stretch the previous word so it ends where this one starts.
      lastEntry.endTime = startTime;
    }
    timeline.push({
      type: "word",
      text,
      startTime,
      endTime,
      confidence
    });
  }
  return { transcript, timeline };
}
/**
 * Wraps every chunk read from `stream` in the
 * `{ AudioEvent: { AudioChunk } }` envelope, yielding one envelope per chunk.
 *
 * @param stream Iterable or async-iterable source of audio chunks.
 */
async function* createAsyncIterableFromStream(stream) {
  const toAudioEvent = (audioChunk) => ({ AudioEvent: { AudioChunk: audioChunk } });
  for await (const audioChunk of stream) {
    yield toAudioEvent(audioChunk);
  }
}
/**
 * Resolves a caller-supplied language code to one accepted by Amazon Transcribe.
 *
 * Accepts, in order: a code that is one of the SDK `LanguageCode` values
 * (e.g. "en-US"), a `LanguageCode` member name (e.g. "EN_US", mapped to its
 * value), or a two-letter code, which is mapped to the first entry of
 * `languageCodeDefaultDialects` that starts with it.
 *
 * @param {string} languageCode
 * @returns {string} The resolved language code.
 * @throws {Error} If the code cannot be resolved.
 */
function resolveLanguageCode(languageCode) {
  // The enum maps member names (EN_US) to codes (en-US), so the code must be
  // looked up among the values; `in` would only test member names.
  if (Object.values(LanguageCode).includes(languageCode)) {
    return languageCode;
  }
  // Member names were accepted before; keep accepting them, but hand back the
  // code the service understands. Own-property check excludes inherited names.
  if (Object.prototype.hasOwnProperty.call(LanguageCode, languageCode)) {
    return LanguageCode[languageCode];
  }
  if (languageCode.length === 2) {
    const matchingDialect = languageCodeDefaultDialects.find(
      (value) => value.startsWith(languageCode)
    );
    if (matchingDialect) {
      return matchingDialect;
    }
  }
  throw new Error(
    `Language code ${languageCode} is not supported by Amazon Transcribe`
  );
}
117
+ const languageCodeDefaultDialects = [
118
+ "af-ZA",
119
+ "ar-SA",
120
+ "ca-ES",
121
+ "cs-CZ",
122
+ "da-DK",
123
+ "de-DE",
124
+ "el-GR",
125
+ "en-US",
126
+ "es-ES",
127
+ "eu-ES",
128
+ "fa-IR",
129
+ "fi-FI",
130
+ "fr-FR",
131
+ "gl-ES",
132
+ "he-IL",
133
+ "hi-IN",
134
+ "hr-HR",
135
+ "id-ID",
136
+ "it-IT",
137
+ "ja-JP",
138
+ "ko-KR",
139
+ "lv-LV",
140
+ "ms-MY",
141
+ "nl-NL",
142
+ "no-NO",
143
+ "pl-PL",
144
+ "pt-BR",
145
+ "ro-RO",
146
+ "ru-RU",
147
+ "sk-SK",
148
+ "so-SO",
149
+ "sr-RS",
150
+ "sv-SE",
151
+ "th-TH",
152
+ "tl-PH",
153
+ "uk-UA",
154
+ "vi-VN",
155
+ "zh-CN"
156
+ ];
157
+ export {
158
+ languageCodeDefaultDialects,
159
+ recognize
160
+ };
@@ -0,0 +1,124 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+ var AzureCognitiveServicesSTT_exports = {};
30
+ __export(AzureCognitiveServicesSTT_exports, {
31
+ recognize: () => recognize
32
+ });
33
+ module.exports = __toCommonJS(AzureCognitiveServicesSTT_exports);
34
+ var SpeechSDK = __toESM(require("microsoft-cognitiveservices-speech-sdk"), 1);
35
+ var import_audio = require("../audio/index.cjs");
/**
 * Transcribes audio with Azure Cognitive Services speech recognition and
 * builds a word-level timeline from the best recognition hypothesis.
 *
 * @param input Audio source, or raw audio input that is normalized using
 *   `options.inputFormat`.
 * @param options `subscriptionKey`, `serviceRegion`, plus optional `profanity`
 *   (defaults to `ProfanityOption.Raw`), `inputFormat` and `timing`.
 * @param {string} languageCode Recognition language passed to the service.
 * @returns {Promise<{transcript: string, timeline: object[]}>} Transcript and
 *   word entries; both are empty when nothing was recognized.
 * @throws {Error} When the service cancels recognition because of an error.
 */
async function recognize(input, options, languageCode) {
  const timing = options.timing;
  const source = (0, import_audio.isAudioSource)(input) ? input : (0, import_audio.normalizeToAudioSource)(input, options.inputFormat);
  const conversionNeeded = source.format !== "wav";
  if (timing != null) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", "wav");
  }
  const doPrepare = () => (0, import_audio.prepareWavForService)(source, { sampleRate: 16e3, channels: 1 });
  const prepared = timing ? await timing.timeAsync("conversion", doPrepare) : await doPrepare();
  try {
    const doRecognition = () => runRecognition(
      prepared.source,
      options.subscriptionKey,
      options.serviceRegion,
      languageCode,
      options.profanity ?? SpeechSDK.ProfanityOption.Raw
    );
    const result = timing ? await timing.timeAsync("upload", doRecognition) : await doRecognition();

    // A canceled result is delivered through the success callback; surface
    // real failures with their details instead of an opaque TypeError below.
    if (result.reason === SpeechSDK.ResultReason.Canceled) {
      const cancellation = SpeechSDK.CancellationDetails.fromResult(result);
      if (cancellation.reason === SpeechSDK.CancellationReason.Error) {
        throw new Error(
          `Azure speech recognition was canceled: ${cancellation.errorDetails}`
        );
      }
    }

    // Results without recognized speech carry no text and no NBest list.
    const transcript = result.text ?? "";
    const resultObject = result.json ? JSON.parse(result.json) : {};
    const bestResult = Array.isArray(resultObject.NBest) ? resultObject.NBest[0] : undefined;
    const words = (bestResult == null ? undefined : bestResult.Words) ?? [];
    const timeline = [];
    for (const wordEntry of words) {
      const text = wordEntry.Word;
      // Offsets and durations are divided by 1e7 to obtain seconds.
      const startTime = wordEntry.Offset / 1e7;
      const endTime = (wordEntry.Offset + wordEntry.Duration) / 1e7;
      timeline.push({ type: "word", text, startTime, endTime });
    }
    return { transcript, timeline };
  } finally {
    // Always release whatever the WAV preparation step allocated.
    await prepared.cleanup();
  }
}
/**
 * Streams `source` into an Azure push stream and runs a single
 * `recognizeOnceAsync` pass over it.
 *
 * The push stream is declared as 16 kHz, 16-bit, single-channel PCM, and the
 * recognizer is configured for detailed output with word-level timestamps.
 *
 * NOTE(review): the SDK reference describes `recognizeOnceAsync` as stopping
 * after the first recognized utterance - confirm this is intended for long audio.
 *
 * @param source Audio source to read.
 * @param {string} subscriptionKey Azure subscription key.
 * @param {string} serviceRegion Azure service region.
 * @param {string} languageCode Recognition language.
 * @param profanity Profanity option forwarded to the speech configuration.
 * @returns The recognition result passed to the SDK success callback.
 * @throws {Error} When reading the source fails or the recognizer reports an error.
 */
async function runRecognition(source, subscriptionKey, serviceRegion, languageCode, profanity) {
  const audioFormat = SpeechSDK.AudioStreamFormat.getWaveFormat(
    16e3,
    16,
    1,
    SpeechSDK.AudioFormatTag.PCM
  );
  const pushStream = SpeechSDK.AudioInputStream.createPushStream(audioFormat);
  const readable = (0, import_audio.toReadStream)(source);
  const streamPromise = new Promise((resolve, reject) => {
    readable.on("data", (chunk) => {
      // The push stream takes ArrayBuffers; copy the chunk into a fresh one.
      const arrayBuffer = new ArrayBuffer(chunk.length);
      const view = new Uint8Array(arrayBuffer);
      chunk.copy(view);
      pushStream.write(arrayBuffer);
    }).on("end", () => {
      pushStream.close();
      resolve();
    }).on("error", (err) => {
      pushStream.close();
      reject(err);
    });
  });
  const recognitionPromise = new Promise(
    (resolve, reject) => {
      const audioConfig = SpeechSDK.AudioConfig.fromStreamInput(pushStream);
      const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(
        subscriptionKey,
        serviceRegion
      );
      speechConfig.speechRecognitionLanguage = languageCode;
      speechConfig.setProfanity(profanity);
      speechConfig.requestWordLevelTimestamps();
      speechConfig.outputFormat = SpeechSDK.OutputFormat.Detailed;
      const recognizer = new SpeechSDK.SpeechRecognizer(
        speechConfig,
        audioConfig
      );
      recognizer.recognizeOnceAsync(
        (result) => {
          recognizer.close();
          resolve(result);
        },
        (error) => {
          recognizer.close();
          reject(new Error(error));
        }
      );
    }
  );
  // Observe both promises from the start: awaiting them one after the other
  // left a recognition failure unhandled while the stream was still being
  // read, and hung forever if the stream never ended.
  const [result] = await Promise.all([recognitionPromise, streamPromise]);
  return result;
}
121
+ // Annotate the CommonJS export names for ESM import in node:
122
+ 0 && (module.exports = {
123
+ recognize
124
+ });
@@ -0,0 +1,21 @@
1
+ import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk';
2
+ import { AudioFormat } from '../audio/AudioFormat.cjs';
3
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
4
+ import { Timeline } from '../utilities/Timeline.cjs';
5
+ import { Timing } from '../utilities/Timing.cjs';
6
+ import 'node:fs';
7
+ import 'node:stream';
8
+
9
+ interface AzureSTTOptions {
10
+ subscriptionKey: string;
11
+ serviceRegion: string;
12
+ profanity?: SpeechSDK.ProfanityOption | undefined;
13
+ inputFormat?: AudioFormat | undefined;
14
+ timing?: Timing | undefined;
15
+ }
16
+ declare function recognize(input: RawAudioInput | AudioSource, options: AzureSTTOptions, languageCode: string): Promise<{
17
+ transcript: string;
18
+ timeline: Timeline;
19
+ }>;
20
+
21
+ export { type AzureSTTOptions, recognize };
@@ -0,0 +1,21 @@
1
+ import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk';
2
+ import { AudioFormat } from '../audio/AudioFormat.js';
3
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
4
+ import { Timeline } from '../utilities/Timeline.js';
5
+ import { Timing } from '../utilities/Timing.js';
6
+ import 'node:fs';
7
+ import 'node:stream';
8
+
9
+ interface AzureSTTOptions {
10
+ subscriptionKey: string;
11
+ serviceRegion: string;
12
+ profanity?: SpeechSDK.ProfanityOption | undefined;
13
+ inputFormat?: AudioFormat | undefined;
14
+ timing?: Timing | undefined;
15
+ }
16
+ declare function recognize(input: RawAudioInput | AudioSource, options: AzureSTTOptions, languageCode: string): Promise<{
17
+ transcript: string;
18
+ timeline: Timeline;
19
+ }>;
20
+
21
+ export { type AzureSTTOptions, recognize };
@@ -0,0 +1,95 @@
1
+ import * as SpeechSDK from "microsoft-cognitiveservices-speech-sdk";
2
+ import {
3
+ isAudioSource,
4
+ normalizeToAudioSource,
5
+ prepareWavForService,
6
+ toReadStream
7
+ } from "../audio/index.js";
/**
 * Transcribes audio with Azure Cognitive Services speech recognition and
 * builds a word-level timeline from the best recognition hypothesis.
 *
 * @param input Audio source, or raw audio input that is normalized using
 *   `options.inputFormat`.
 * @param options `subscriptionKey`, `serviceRegion`, plus optional `profanity`
 *   (defaults to `ProfanityOption.Raw`), `inputFormat` and `timing`.
 * @param {string} languageCode Recognition language passed to the service.
 * @returns {Promise<{transcript: string, timeline: object[]}>} Transcript and
 *   word entries; both are empty when nothing was recognized.
 * @throws {Error} When the service cancels recognition because of an error.
 */
async function recognize(input, options, languageCode) {
  const timing = options.timing;
  const source = isAudioSource(input) ? input : normalizeToAudioSource(input, options.inputFormat);
  const conversionNeeded = source.format !== "wav";
  if (timing != null) {
    timing.setMetadata("conversionRequired", conversionNeeded);
    timing.setMetadata("targetFormat", "wav");
  }
  const doPrepare = () => prepareWavForService(source, { sampleRate: 16e3, channels: 1 });
  const prepared = timing ? await timing.timeAsync("conversion", doPrepare) : await doPrepare();
  try {
    const doRecognition = () => runRecognition(
      prepared.source,
      options.subscriptionKey,
      options.serviceRegion,
      languageCode,
      options.profanity ?? SpeechSDK.ProfanityOption.Raw
    );
    const result = timing ? await timing.timeAsync("upload", doRecognition) : await doRecognition();

    // A canceled result is delivered through the success callback; surface
    // real failures with their details instead of an opaque TypeError below.
    if (result.reason === SpeechSDK.ResultReason.Canceled) {
      const cancellation = SpeechSDK.CancellationDetails.fromResult(result);
      if (cancellation.reason === SpeechSDK.CancellationReason.Error) {
        throw new Error(
          `Azure speech recognition was canceled: ${cancellation.errorDetails}`
        );
      }
    }

    // Results without recognized speech carry no text and no NBest list.
    const transcript = result.text ?? "";
    const resultObject = result.json ? JSON.parse(result.json) : {};
    const bestResult = Array.isArray(resultObject.NBest) ? resultObject.NBest[0] : undefined;
    const words = (bestResult == null ? undefined : bestResult.Words) ?? [];
    const timeline = [];
    for (const wordEntry of words) {
      const text = wordEntry.Word;
      // Offsets and durations are divided by 1e7 to obtain seconds.
      const startTime = wordEntry.Offset / 1e7;
      const endTime = (wordEntry.Offset + wordEntry.Duration) / 1e7;
      timeline.push({ type: "word", text, startTime, endTime });
    }
    return { transcript, timeline };
  } finally {
    // Always release whatever the WAV preparation step allocated.
    await prepared.cleanup();
  }
}
/**
 * Streams `source` into an Azure push stream and runs a single
 * `recognizeOnceAsync` pass over it.
 *
 * The push stream is declared as 16 kHz, 16-bit, single-channel PCM, and the
 * recognizer is configured for detailed output with word-level timestamps.
 *
 * NOTE(review): the SDK reference describes `recognizeOnceAsync` as stopping
 * after the first recognized utterance - confirm this is intended for long audio.
 *
 * @param source Audio source to read.
 * @param {string} subscriptionKey Azure subscription key.
 * @param {string} serviceRegion Azure service region.
 * @param {string} languageCode Recognition language.
 * @param profanity Profanity option forwarded to the speech configuration.
 * @returns The recognition result passed to the SDK success callback.
 * @throws {Error} When reading the source fails or the recognizer reports an error.
 */
async function runRecognition(source, subscriptionKey, serviceRegion, languageCode, profanity) {
  const audioFormat = SpeechSDK.AudioStreamFormat.getWaveFormat(
    16e3,
    16,
    1,
    SpeechSDK.AudioFormatTag.PCM
  );
  const pushStream = SpeechSDK.AudioInputStream.createPushStream(audioFormat);
  const readable = toReadStream(source);
  const streamPromise = new Promise((resolve, reject) => {
    readable.on("data", (chunk) => {
      // The push stream takes ArrayBuffers; copy the chunk into a fresh one.
      const arrayBuffer = new ArrayBuffer(chunk.length);
      const view = new Uint8Array(arrayBuffer);
      chunk.copy(view);
      pushStream.write(arrayBuffer);
    }).on("end", () => {
      pushStream.close();
      resolve();
    }).on("error", (err) => {
      pushStream.close();
      reject(err);
    });
  });
  const recognitionPromise = new Promise(
    (resolve, reject) => {
      const audioConfig = SpeechSDK.AudioConfig.fromStreamInput(pushStream);
      const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(
        subscriptionKey,
        serviceRegion
      );
      speechConfig.speechRecognitionLanguage = languageCode;
      speechConfig.setProfanity(profanity);
      speechConfig.requestWordLevelTimestamps();
      speechConfig.outputFormat = SpeechSDK.OutputFormat.Detailed;
      const recognizer = new SpeechSDK.SpeechRecognizer(
        speechConfig,
        audioConfig
      );
      recognizer.recognizeOnceAsync(
        (result) => {
          recognizer.close();
          resolve(result);
        },
        (error) => {
          recognizer.close();
          reject(new Error(error));
        }
      );
    }
  );
  // Observe both promises from the start: awaiting them one after the other
  // left a recognition failure unhandled while the stream was still being
  // read, and hung forever if the stream never ended.
  const [result] = await Promise.all([recognitionPromise, streamPromise]);
  return result;
}
93
+ export {
94
+ recognize
95
+ };