@storyteller-platform/ghost-story 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. package/LICENSE.md +611 -0
  2. package/README.md +18 -0
  3. package/dist/api/APIOptions.cjs +16 -0
  4. package/dist/api/APIOptions.d.cts +18 -0
  5. package/dist/api/APIOptions.d.ts +18 -0
  6. package/dist/api/APIOptions.js +0 -0
  7. package/dist/api/Recognition.cjs +263 -0
  8. package/dist/api/Recognition.d.cts +77 -0
  9. package/dist/api/Recognition.d.ts +77 -0
  10. package/dist/api/Recognition.js +233 -0
  11. package/dist/api/VoiceActivityDetection.cjs +77 -0
  12. package/dist/api/VoiceActivityDetection.d.cts +24 -0
  13. package/dist/api/VoiceActivityDetection.d.ts +24 -0
  14. package/dist/api/VoiceActivityDetection.js +43 -0
  15. package/dist/audio/AudioConverter.cjs +331 -0
  16. package/dist/audio/AudioConverter.d.cts +53 -0
  17. package/dist/audio/AudioConverter.d.ts +53 -0
  18. package/dist/audio/AudioConverter.js +310 -0
  19. package/dist/audio/AudioFormat.cjs +151 -0
  20. package/dist/audio/AudioFormat.d.cts +25 -0
  21. package/dist/audio/AudioFormat.d.ts +25 -0
  22. package/dist/audio/AudioFormat.js +123 -0
  23. package/dist/audio/AudioSource.cjs +119 -0
  24. package/dist/audio/AudioSource.d.cts +33 -0
  25. package/dist/audio/AudioSource.d.ts +33 -0
  26. package/dist/audio/AudioSource.js +88 -0
  27. package/dist/audio/index.cjs +74 -0
  28. package/dist/audio/index.d.cts +6 -0
  29. package/dist/audio/index.d.ts +6 -0
  30. package/dist/audio/index.js +54 -0
  31. package/dist/cli/bin.cjs +277 -0
  32. package/dist/cli/bin.d.cts +1 -0
  33. package/dist/cli/bin.d.ts +1 -0
  34. package/dist/cli/bin.js +275 -0
  35. package/dist/cli/config.cjs +347 -0
  36. package/dist/cli/config.d.cts +33 -0
  37. package/dist/cli/config.d.ts +33 -0
  38. package/dist/cli/config.js +285 -0
  39. package/dist/cli/install.cjs +334 -0
  40. package/dist/cli/install.d.cts +62 -0
  41. package/dist/cli/install.d.ts +62 -0
  42. package/dist/cli/install.js +316 -0
  43. package/dist/cli/whisper-server.cjs +172 -0
  44. package/dist/cli/whisper-server.d.cts +24 -0
  45. package/dist/cli/whisper-server.d.ts +24 -0
  46. package/dist/cli/whisper-server.js +152 -0
  47. package/dist/config.cjs +60 -0
  48. package/dist/config.d.cts +12 -0
  49. package/dist/config.d.ts +12 -0
  50. package/dist/config.js +32 -0
  51. package/dist/convert.cjs +88 -0
  52. package/dist/convert.d.cts +12 -0
  53. package/dist/convert.d.ts +12 -0
  54. package/dist/convert.js +63 -0
  55. package/dist/encodings/Ascii.cjs +75 -0
  56. package/dist/encodings/Ascii.d.cts +13 -0
  57. package/dist/encodings/Ascii.d.ts +13 -0
  58. package/dist/encodings/Ascii.js +48 -0
  59. package/dist/encodings/Base64.cjs +155 -0
  60. package/dist/encodings/Base64.d.cts +5 -0
  61. package/dist/encodings/Base64.d.ts +5 -0
  62. package/dist/encodings/Base64.js +129 -0
  63. package/dist/encodings/TextEncodingsCommon.cjs +16 -0
  64. package/dist/encodings/TextEncodingsCommon.d.cts +6 -0
  65. package/dist/encodings/TextEncodingsCommon.d.ts +6 -0
  66. package/dist/encodings/TextEncodingsCommon.js +0 -0
  67. package/dist/index.cjs +153 -0
  68. package/dist/index.d.cts +15 -0
  69. package/dist/index.d.ts +15 -0
  70. package/dist/index.js +140 -0
  71. package/dist/recognition/AmazonTranscribeSTT.cjs +188 -0
  72. package/dist/recognition/AmazonTranscribeSTT.d.cts +21 -0
  73. package/dist/recognition/AmazonTranscribeSTT.d.ts +21 -0
  74. package/dist/recognition/AmazonTranscribeSTT.js +160 -0
  75. package/dist/recognition/AzureCognitiveServicesSTT.cjs +124 -0
  76. package/dist/recognition/AzureCognitiveServicesSTT.d.cts +21 -0
  77. package/dist/recognition/AzureCognitiveServicesSTT.d.ts +21 -0
  78. package/dist/recognition/AzureCognitiveServicesSTT.js +95 -0
  79. package/dist/recognition/DeepgramSTT.cjs +172 -0
  80. package/dist/recognition/DeepgramSTT.d.cts +23 -0
  81. package/dist/recognition/DeepgramSTT.d.ts +23 -0
  82. package/dist/recognition/DeepgramSTT.js +153 -0
  83. package/dist/recognition/GoogleCloudSTT.cjs +125 -0
  84. package/dist/recognition/GoogleCloudSTT.d.cts +35 -0
  85. package/dist/recognition/GoogleCloudSTT.d.ts +35 -0
  86. package/dist/recognition/GoogleCloudSTT.js +107 -0
  87. package/dist/recognition/OpenAICloudSTT.cjs +180 -0
  88. package/dist/recognition/OpenAICloudSTT.d.cts +29 -0
  89. package/dist/recognition/OpenAICloudSTT.d.ts +29 -0
  90. package/dist/recognition/OpenAICloudSTT.js +150 -0
  91. package/dist/recognition/WhisperCppSTT.cjs +296 -0
  92. package/dist/recognition/WhisperCppSTT.d.cts +40 -0
  93. package/dist/recognition/WhisperCppSTT.d.ts +40 -0
  94. package/dist/recognition/WhisperCppSTT.js +275 -0
  95. package/dist/recognition/WhisperServerSTT.cjs +119 -0
  96. package/dist/recognition/WhisperServerSTT.d.cts +24 -0
  97. package/dist/recognition/WhisperServerSTT.d.ts +24 -0
  98. package/dist/recognition/WhisperServerSTT.js +105 -0
  99. package/dist/utilities/FileSystem.cjs +54 -0
  100. package/dist/utilities/FileSystem.d.cts +3 -0
  101. package/dist/utilities/FileSystem.d.ts +3 -0
  102. package/dist/utilities/FileSystem.js +20 -0
  103. package/dist/utilities/Locale.cjs +46 -0
  104. package/dist/utilities/Locale.d.cts +9 -0
  105. package/dist/utilities/Locale.d.ts +9 -0
  106. package/dist/utilities/Locale.js +20 -0
  107. package/dist/utilities/ObjectUtilities.cjs +41 -0
  108. package/dist/utilities/ObjectUtilities.d.cts +3 -0
  109. package/dist/utilities/ObjectUtilities.d.ts +3 -0
  110. package/dist/utilities/ObjectUtilities.js +7 -0
  111. package/dist/utilities/Timeline.cjs +120 -0
  112. package/dist/utilities/Timeline.d.cts +23 -0
  113. package/dist/utilities/Timeline.d.ts +23 -0
  114. package/dist/utilities/Timeline.js +94 -0
  115. package/dist/utilities/Timing.cjs +287 -0
  116. package/dist/utilities/Timing.d.cts +64 -0
  117. package/dist/utilities/Timing.d.ts +64 -0
  118. package/dist/utilities/Timing.js +256 -0
  119. package/dist/utilities/WhisperTimeline.cjs +344 -0
  120. package/dist/utilities/WhisperTimeline.d.cts +86 -0
  121. package/dist/utilities/WhisperTimeline.d.ts +86 -0
  122. package/dist/utilities/WhisperTimeline.js +313 -0
  123. package/dist/vad/ActiveGate.cjs +357 -0
  124. package/dist/vad/ActiveGate.d.cts +53 -0
  125. package/dist/vad/ActiveGate.d.ts +53 -0
  126. package/dist/vad/ActiveGate.js +329 -0
  127. package/dist/vad/ActiveGateOg.cjs +1366 -0
  128. package/dist/vad/ActiveGateOg.d.cts +33 -0
  129. package/dist/vad/ActiveGateOg.d.ts +33 -0
  130. package/dist/vad/ActiveGateOg.js +1341 -0
  131. package/dist/vad/Silero.cjs +174 -0
  132. package/dist/vad/Silero.d.cts +25 -0
  133. package/dist/vad/Silero.d.ts +25 -0
  134. package/dist/vad/Silero.js +153 -0
  135. package/package.json +125 -0
@@ -0,0 +1,172 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var DeepgramSTT_exports = {};
20
+ __export(DeepgramSTT_exports, {
21
+ defaultDeepgramSTTOptions: () => defaultDeepgramSTTOptions,
22
+ recognize: () => recognize
23
+ });
24
+ module.exports = __toCommonJS(DeepgramSTT_exports);
25
+ var import_audio = require("../audio/index.cjs");
26
+ var import_config = require("../config.cjs");
27
+ var import_ObjectUtilities = require("../utilities/ObjectUtilities.cjs");
28
+ const SERVICE_ID = "deepgram";
29
/**
 * Map an audio format identifier to the value sent as the `encoding` query
 * parameter of the Deepgram request.
 *
 * "opus" and "ogg" both become "opus"; "flac", "mp3" and "webm" pass through
 * unchanged; everything else (including "wav") yields "wav".
 *
 * NOTE(review): presumably Deepgram accepts these container names for
 * `encoding` — verify against the Deepgram API reference.
 *
 * @param {string} format - Audio format identifier.
 * @returns {string} Encoding name for the request.
 */
function formatToDeepgramEncoding(format) {
  if (format === "opus" || format === "ogg") {
    return "opus";
  }
  if (format === "flac" || format === "mp3" || format === "webm") {
    return format;
  }
  // "wav" and every unrecognized format share the same answer.
  return "wav";
}
46
/**
 * Map an audio format identifier to the MIME type used for the request's
 * `Content-Type` header.
 *
 * "opus" and "ogg" share "audio/ogg"; "wav" and any unrecognized format yield
 * "audio/wav".
 *
 * @param {string} format - Audio format identifier.
 * @returns {string} MIME type string.
 */
function formatToContentType(format) {
  if (format === "flac") {
    return "audio/flac";
  }
  if (format === "opus" || format === "ogg") {
    return "audio/ogg";
  }
  if (format === "mp3") {
    return "audio/mpeg";
  }
  if (format === "webm") {
    return "audio/webm";
  }
  // "wav" and every unrecognized format share the same answer.
  return "audio/wav";
}
63
/**
 * Transcribe audio by POSTing it to Deepgram's `/v1/listen` endpoint.
 *
 * The input is normalized to an audio source, turned into an upload stream by
 * `createStreamForUpload` (targeting the service's preferred format when
 * `needsConversion` reports that conversion is required) and sent as the
 * request body. The words of the first alternative of the first channel become
 * the returned timeline.
 *
 * @param input - Raw audio input or an already-normalized audio source.
 * @param {string | undefined} languageCode - Sent as `language`; when falsy,
 *   `detect_language=true` is sent instead.
 * @param options - Overrides merged over `defaultDeepgramSTTOptions` via `extendDeep`.
 * @param {AbortSignal | null} [signal] - Optional signal forwarded to `fetch`.
 * @returns {Promise<{transcript: string, timeline: object[]}>} Transcript text
 *   and one `"word"` entry per recognized word.
 * @throws {Error} When no API key is configured or the response status is not OK.
 */
async function recognize(input, languageCode, options, signal) {
  const { extendDeep } = import_ObjectUtilities;
  const opts = extendDeep(defaultDeepgramSTTOptions, options);
  const timing = opts.timing;
  if (!opts.apiKey) {
    throw new Error("No Deepgram API key provided");
  }

  const {
    createStreamForUpload,
    isAudioSource,
    needsConversion,
    normalizeToAudioSource,
    serviceCapabilities,
  } = import_audio;
  const source = isAudioSource(input) ? input : normalizeToAudioSource(input, opts.inputFormat);
  const caps = serviceCapabilities[SERVICE_ID];
  const requiresConversion = needsConversion(source.format, SERVICE_ID);
  const targetFormat = requiresConversion ? caps?.preferredFormat ?? "wav" : source.format;
  const mode = opts.conversionMode ?? (0, import_config.getConversionMode)();

  timing?.setMetadata("targetFormat", targetFormat);
  timing?.setMetadata("conversionMode", mode);
  timing?.setMetadata("conversionRequired", requiresConversion);

  const doConversion = () => createStreamForUpload({
    source,
    targetFormat,
    sampleRate: caps?.preferredSampleRate,
    channels: caps?.preferredChannels,
    mode,
  });
  const uploadResult = timing ? await timing.timeAsync("conversion", doConversion) : await doConversion();

  try {
    const params = {
      model: opts.model,
      encoding: formatToDeepgramEncoding(uploadResult.format),
      punctuate: opts.punctuate ? "true" : "false",
    };
    if (languageCode) {
      params["language"] = languageCode;
    } else {
      params["detect_language"] = "true";
    }
    const searchParams = new URLSearchParams(params);
    const url = `https://api.deepgram.com/v1/listen?${searchParams.toString()}`;

    const doUpload = async () => {
      const fetchPromise = fetch(url, {
        method: "POST",
        duplex: "half",
        headers: {
          Authorization: `Token ${opts.apiKey}`,
          "Content-Type": formatToContentType(uploadResult.format),
        },
        body: uploadResult.stream,
        signal: signal ?? null,
      });
      // `start` is optional; it is invoked only after the request has been issued.
      const conversionPromise = uploadResult.start?.();
      // Await both together so that a failure of either one is observed.
      // Awaiting them one after the other would leave the conversion promise
      // unhandled whenever the request itself rejects.
      const [resp] = await Promise.all([fetchPromise, conversionPromise]);
      return resp;
    };
    const response = timing ? await timing.timeAsync("upload", doUpload) : await doUpload();

    if (!response.ok) {
      const text = await response.text();
      throw new Error(`Deepgram request failed: ${response.status} ${text}`);
    }

    const deepgramResponse = await response.json();
    // Tolerate responses that lack channels or alternatives instead of
    // failing with a TypeError on the index access.
    const firstAlternative = deepgramResponse.results?.channels?.[0]?.alternatives?.[0];
    const transcript = firstAlternative?.transcript || "";
    const words = firstAlternative?.words || [];
    const timeline = words.map((wordEntry) => ({
      type: "word",
      text: wordEntry.word,
      startTime: wordEntry.start,
      endTime: wordEntry.end,
      confidence: wordEntry.confidence,
    }));
    if (opts.punctuate) {
      applyPunctuationToTimeline(timeline, transcript);
    }
    return { transcript, timeline };
  } finally {
    // Always release whatever the upload stream holds, on success or failure.
    await uploadResult.cleanup();
  }
}
142
/**
 * Rewrite each timeline entry's `text` with the matching slice of the
 * transcript, so entries carry the transcript's own casing.
 *
 * Words are located left to right with a case-insensitive search that never
 * moves backwards; each entry's text is replaced in place by the transcript
 * characters at the matched position (same length as the word).
 *
 * Lower-casing is done with `toLowerCase()` rather than `toLocaleLowerCase()`.
 * The latter depends on the host's default locale, which is unrelated to the
 * audio language: under a Turkish locale "I" lower-cases to "ı", so an English
 * transcript would fail to match.
 *
 * NOTE(review): offsets found in the lower-cased transcript are applied to the
 * original transcript, which assumes lower-casing does not change its length.
 *
 * @param {{text: string}[]} timeline - Word entries, mutated in place.
 * @param {string} transcript - Full transcript text the words come from.
 * @throws {Error} When a word cannot be found at or after the current offset.
 */
function applyPunctuationToTimeline(timeline, transcript) {
  const lowerCaseTranscript = transcript.toLowerCase();
  let readOffset = 0;
  for (const wordEntry of timeline) {
    const lowerCaseWord = wordEntry.text.toLowerCase();
    const matchPosition = lowerCaseTranscript.indexOf(lowerCaseWord, readOffset);
    if (matchPosition === -1) {
      throw new Error(
        `Couldn't match the word '${wordEntry.text}' in the lowercase transcript`
      );
    }
    const matchEnd = matchPosition + lowerCaseWord.length;
    wordEntry.text = transcript.substring(matchPosition, matchEnd);
    // Continue after this word so that repeated words match successive occurrences.
    readOffset = matchEnd;
  }
}
163
/**
 * Baseline Deepgram options. `recognize` merges caller-supplied options over
 * these; the empty `apiKey` makes `recognize` throw unless the caller sets one.
 */
const defaultDeepgramSTTOptions = {
  apiKey: "",
  model: "nova-2",
  punctuate: true,
};
168
+ // Annotate the CommonJS export names for ESM import in node:
169
+ 0 && (module.exports = {
170
+ defaultDeepgramSTTOptions,
171
+ recognize
172
+ });
@@ -0,0 +1,23 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.cjs';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
3
+ import { ConversionMode } from '../config.cjs';
4
+ import { TimelineEntry } from '../utilities/Timeline.cjs';
5
+ import { Timing } from '../utilities/Timing.cjs';
6
+ import 'node:fs';
7
+ import 'node:stream';
8
+
9
+ declare function recognize(input: RawAudioInput | AudioSource, languageCode: string | undefined, options: DeepgramSTTOptions, signal?: AbortSignal | null): Promise<{
10
+ transcript: string;
11
+ timeline: TimelineEntry[];
12
+ }>;
13
+ interface DeepgramSTTOptions {
14
+ apiKey: string;
15
+ model: string;
16
+ punctuate: boolean;
17
+ inputFormat?: AudioFormat | undefined;
18
+ timing?: Timing | undefined;
19
+ conversionMode?: ConversionMode | undefined;
20
+ }
21
+ declare const defaultDeepgramSTTOptions: DeepgramSTTOptions;
22
+
23
+ export { type DeepgramSTTOptions, defaultDeepgramSTTOptions, recognize };
@@ -0,0 +1,23 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.js';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
3
+ import { ConversionMode } from '../config.js';
4
+ import { TimelineEntry } from '../utilities/Timeline.js';
5
+ import { Timing } from '../utilities/Timing.js';
6
+ import 'node:fs';
7
+ import 'node:stream';
8
+
9
+ declare function recognize(input: RawAudioInput | AudioSource, languageCode: string | undefined, options: DeepgramSTTOptions, signal?: AbortSignal | null): Promise<{
10
+ transcript: string;
11
+ timeline: TimelineEntry[];
12
+ }>;
13
+ interface DeepgramSTTOptions {
14
+ apiKey: string;
15
+ model: string;
16
+ punctuate: boolean;
17
+ inputFormat?: AudioFormat | undefined;
18
+ timing?: Timing | undefined;
19
+ conversionMode?: ConversionMode | undefined;
20
+ }
21
+ declare const defaultDeepgramSTTOptions: DeepgramSTTOptions;
22
+
23
+ export { type DeepgramSTTOptions, defaultDeepgramSTTOptions, recognize };
@@ -0,0 +1,153 @@
1
+ import {
2
+ createStreamForUpload,
3
+ isAudioSource,
4
+ needsConversion,
5
+ normalizeToAudioSource,
6
+ serviceCapabilities
7
+ } from "../audio/index.js";
8
+ import { getConversionMode } from "../config.js";
9
+ import { extendDeep } from "../utilities/ObjectUtilities.js";
10
+ const SERVICE_ID = "deepgram";
11
/**
 * Map an audio format identifier to the value sent as the `encoding` query
 * parameter of the Deepgram request.
 *
 * "opus" and "ogg" both become "opus"; "flac", "mp3" and "webm" pass through
 * unchanged; everything else (including "wav") yields "wav".
 *
 * NOTE(review): presumably Deepgram accepts these container names for
 * `encoding` — verify against the Deepgram API reference.
 *
 * @param {string} format - Audio format identifier.
 * @returns {string} Encoding name for the request.
 */
function formatToDeepgramEncoding(format) {
  if (format === "opus" || format === "ogg") {
    return "opus";
  }
  if (format === "flac" || format === "mp3" || format === "webm") {
    return format;
  }
  // "wav" and every unrecognized format share the same answer.
  return "wav";
}
28
/**
 * Map an audio format identifier to the MIME type used for the request's
 * `Content-Type` header.
 *
 * "opus" and "ogg" share "audio/ogg"; "wav" and any unrecognized format yield
 * "audio/wav".
 *
 * @param {string} format - Audio format identifier.
 * @returns {string} MIME type string.
 */
function formatToContentType(format) {
  if (format === "flac") {
    return "audio/flac";
  }
  if (format === "opus" || format === "ogg") {
    return "audio/ogg";
  }
  if (format === "mp3") {
    return "audio/mpeg";
  }
  if (format === "webm") {
    return "audio/webm";
  }
  // "wav" and every unrecognized format share the same answer.
  return "audio/wav";
}
45
/**
 * Transcribe audio by POSTing it to Deepgram's `/v1/listen` endpoint.
 *
 * The input is normalized to an audio source, turned into an upload stream by
 * `createStreamForUpload` (targeting the service's preferred format when
 * `needsConversion` reports that conversion is required) and sent as the
 * request body. The words of the first alternative of the first channel become
 * the returned timeline.
 *
 * @param input - Raw audio input or an already-normalized audio source.
 * @param {string | undefined} languageCode - Sent as `language`; when falsy,
 *   `detect_language=true` is sent instead.
 * @param options - Overrides merged over `defaultDeepgramSTTOptions` via `extendDeep`.
 * @param {AbortSignal | null} [signal] - Optional signal forwarded to `fetch`.
 * @returns {Promise<{transcript: string, timeline: object[]}>} Transcript text
 *   and one `"word"` entry per recognized word.
 * @throws {Error} When no API key is configured or the response status is not OK.
 */
async function recognize(input, languageCode, options, signal) {
  const opts = extendDeep(defaultDeepgramSTTOptions, options);
  const timing = opts.timing;
  if (!opts.apiKey) {
    throw new Error("No Deepgram API key provided");
  }

  const source = isAudioSource(input) ? input : normalizeToAudioSource(input, opts.inputFormat);
  const caps = serviceCapabilities[SERVICE_ID];
  const requiresConversion = needsConversion(source.format, SERVICE_ID);
  const targetFormat = requiresConversion ? caps?.preferredFormat ?? "wav" : source.format;
  const mode = opts.conversionMode ?? getConversionMode();

  timing?.setMetadata("targetFormat", targetFormat);
  timing?.setMetadata("conversionMode", mode);
  timing?.setMetadata("conversionRequired", requiresConversion);

  const doConversion = () => createStreamForUpload({
    source,
    targetFormat,
    sampleRate: caps?.preferredSampleRate,
    channels: caps?.preferredChannels,
    mode,
  });
  const uploadResult = timing ? await timing.timeAsync("conversion", doConversion) : await doConversion();

  try {
    const params = {
      model: opts.model,
      encoding: formatToDeepgramEncoding(uploadResult.format),
      punctuate: opts.punctuate ? "true" : "false",
    };
    if (languageCode) {
      params["language"] = languageCode;
    } else {
      params["detect_language"] = "true";
    }
    const searchParams = new URLSearchParams(params);
    const url = `https://api.deepgram.com/v1/listen?${searchParams.toString()}`;

    const doUpload = async () => {
      const fetchPromise = fetch(url, {
        method: "POST",
        duplex: "half",
        headers: {
          Authorization: `Token ${opts.apiKey}`,
          "Content-Type": formatToContentType(uploadResult.format),
        },
        body: uploadResult.stream,
        signal: signal ?? null,
      });
      // `start` is optional; it is invoked only after the request has been issued.
      const conversionPromise = uploadResult.start?.();
      // Await both together so that a failure of either one is observed.
      // Awaiting them one after the other would leave the conversion promise
      // unhandled whenever the request itself rejects.
      const [resp] = await Promise.all([fetchPromise, conversionPromise]);
      return resp;
    };
    const response = timing ? await timing.timeAsync("upload", doUpload) : await doUpload();

    if (!response.ok) {
      const text = await response.text();
      throw new Error(`Deepgram request failed: ${response.status} ${text}`);
    }

    const deepgramResponse = await response.json();
    // Tolerate responses that lack channels or alternatives instead of
    // failing with a TypeError on the index access.
    const firstAlternative = deepgramResponse.results?.channels?.[0]?.alternatives?.[0];
    const transcript = firstAlternative?.transcript || "";
    const words = firstAlternative?.words || [];
    const timeline = words.map((wordEntry) => ({
      type: "word",
      text: wordEntry.word,
      startTime: wordEntry.start,
      endTime: wordEntry.end,
      confidence: wordEntry.confidence,
    }));
    if (opts.punctuate) {
      applyPunctuationToTimeline(timeline, transcript);
    }
    return { transcript, timeline };
  } finally {
    // Always release whatever the upload stream holds, on success or failure.
    await uploadResult.cleanup();
  }
}
124
/**
 * Rewrite each timeline entry's `text` with the matching slice of the
 * transcript, so entries carry the transcript's own casing.
 *
 * Words are located left to right with a case-insensitive search that never
 * moves backwards; each entry's text is replaced in place by the transcript
 * characters at the matched position (same length as the word).
 *
 * Lower-casing is done with `toLowerCase()` rather than `toLocaleLowerCase()`.
 * The latter depends on the host's default locale, which is unrelated to the
 * audio language: under a Turkish locale "I" lower-cases to "ı", so an English
 * transcript would fail to match.
 *
 * NOTE(review): offsets found in the lower-cased transcript are applied to the
 * original transcript, which assumes lower-casing does not change its length.
 *
 * @param {{text: string}[]} timeline - Word entries, mutated in place.
 * @param {string} transcript - Full transcript text the words come from.
 * @throws {Error} When a word cannot be found at or after the current offset.
 */
function applyPunctuationToTimeline(timeline, transcript) {
  const lowerCaseTranscript = transcript.toLowerCase();
  let readOffset = 0;
  for (const wordEntry of timeline) {
    const lowerCaseWord = wordEntry.text.toLowerCase();
    const matchPosition = lowerCaseTranscript.indexOf(lowerCaseWord, readOffset);
    if (matchPosition === -1) {
      throw new Error(
        `Couldn't match the word '${wordEntry.text}' in the lowercase transcript`
      );
    }
    const matchEnd = matchPosition + lowerCaseWord.length;
    wordEntry.text = transcript.substring(matchPosition, matchEnd);
    // Continue after this word so that repeated words match successive occurrences.
    readOffset = matchEnd;
  }
}
145
/**
 * Baseline Deepgram options. `recognize` merges caller-supplied options over
 * these; the empty `apiKey` makes `recognize` throw unless the caller sets one.
 */
const defaultDeepgramSTTOptions = {
  apiKey: "",
  model: "nova-2",
  punctuate: true,
};
150
+ export {
151
+ defaultDeepgramSTTOptions,
152
+ recognize
153
+ };
@@ -0,0 +1,125 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var GoogleCloudSTT_exports = {};
20
+ __export(GoogleCloudSTT_exports, {
21
+ recognize: () => recognize
22
+ });
23
+ module.exports = __toCommonJS(GoogleCloudSTT_exports);
24
+ var import_audio = require("../audio/index.cjs");
25
+ var import_Base64 = require("../encodings/Base64.cjs");
26
+ const SERVICE_ID = "google-cloud";
27
/**
 * Transcribe audio with the Google Cloud Speech `speech:recognize` REST endpoint.
 *
 * The input is normalized to an audio source. When `needsConversion` reports
 * that conversion is required, the audio is converted to 16 kHz mono FLAC;
 * otherwise it is buffered as-is. The bytes are sent base64-encoded in a JSON
 * request and the JSON response is handed to `parseResponseBody`.
 *
 * NOTE(review): `sampleRateHertz: 16000` and `audioChannelCount: 1` are sent
 * even when no conversion takes place — confirm that unconverted sources
 * always match these values.
 *
 * @param input - Raw audio input or an already-normalized audio source.
 * @param options - Service options (`apiKey`, optional flags, `inputFormat`, `timing`).
 * @param {string} [languageCode="en-US"] - Language code sent in the request config.
 * @returns {Promise<{transcript: string, timeline: object[]}>} Result of `parseResponseBody`.
 * @throws {Error} When the response status is not OK.
 */
async function recognize(input, options, languageCode = "en-US") {
  const {
    convertToBuffer,
    isAudioSource,
    needsConversion,
    normalizeToAudioSource,
    toBuffer,
  } = import_audio;
  const { timing } = options;

  const source = isAudioSource(input) ? input : normalizeToAudioSource(input, options.inputFormat);
  const nativeEncoding = formatToGoogleEncoding(source.format);
  const conversionNeeded = needsConversion(source.format, SERVICE_ID);
  timing?.setMetadata("conversionRequired", conversionNeeded);
  timing?.setMetadata("targetFormat", conversionNeeded ? "flac" : source.format);

  const prepareAudio = async () => {
    if (!conversionNeeded) {
      return toBuffer(source);
    }
    const converted = await convertToBuffer(source, {
      targetFormat: "flac",
      sampleRate: 16e3,
      channels: 1,
    });
    return converted.source.buffer;
  };
  const audioBuffer = timing ? await timing.timeAsync("conversion", prepareAudio) : await prepareAudio();
  // Converted audio is always FLAC; otherwise the source's own encoding applies.
  const encoding = conversionNeeded ? "FLAC" : nativeEncoding;

  const config = {
    encoding,
    sampleRateHertz: 16e3,
    audioChannelCount: 1,
    languageCode,
    alternativeLanguageCodes: options.alternativeLanguageCodes ?? [],
    maxAlternatives: 1,
    profanityFilter: options.profanityFilter ?? false,
    enableWordTimeOffsets: true,
    enableWordConfidence: true,
    enableAutomaticPunctuation: options.autoPunctuation ?? true,
    model: "latest_long",
    useEnhanced: options.useEnhancedModel ?? true,
  };
  const requestBody = {
    config,
    audio: { content: (0, import_Base64.encodeBase64)(audioBuffer) },
  };

  const sendRequest = () => fetch(`https://speech.googleapis.com/v1p1beta1/speech:recognize`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${options.apiKey}`,
    },
    body: JSON.stringify(requestBody),
  });
  const response = timing ? await timing.timeAsync("upload", sendRequest) : await sendRequest();

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Google Cloud STT error: ${response.status} ${errorText}`);
  }
  return parseResponseBody(await response.json());
}
84
/**
 * Map an audio format identifier to the `encoding` value placed in the
 * recognition request config.
 *
 * "opus" and "ogg" share "OGG_OPUS"; "flac" and any unrecognized format
 * yield "FLAC".
 *
 * @param {string} format - Audio format identifier.
 * @returns {string} Encoding name for the request config.
 */
function formatToGoogleEncoding(format) {
  if (format === "wav") {
    return "LINEAR16";
  }
  if (format === "mp3") {
    return "MP3";
  }
  if (format === "opus" || format === "ogg") {
    return "OGG_OPUS";
  }
  if (format === "webm") {
    return "WEBM_OPUS";
  }
  // "flac" and every unrecognized format share the same answer.
  return "FLAC";
}
101
/**
 * Flatten a recognition response into a transcript and a word timeline.
 *
 * For every result whose first alternative has a non-empty transcript, that
 * transcript is appended to the output and each of its words becomes a
 * `"word"` timeline entry. Word offsets arrive as strings with an "s" suffix
 * and are converted to numbers.
 *
 * A response without `results`, or an alternative without `words`, is treated
 * as empty rather than raising a TypeError.
 *
 * @param {{results?: object[]}} responseBody - Parsed JSON response body.
 * @returns {{transcript: string, timeline: object[]}} Concatenated transcript
 *   and the word entries in response order.
 */
function parseResponseBody(responseBody) {
  let transcript = "";
  const timeline = [];
  for (const result of responseBody.results ?? []) {
    const firstAlternative = result.alternatives?.[0];
    if (!firstAlternative?.transcript) continue;
    transcript += firstAlternative.transcript;
    for (const wordEvent of firstAlternative.words ?? []) {
      timeline.push({
        type: "word",
        text: wordEvent.word,
        startTime: parseFloat(wordEvent.startTime.replace("s", "")),
        endTime: parseFloat(wordEvent.endTime.replace("s", "")),
        confidence: wordEvent.confidence,
      });
    }
  }
  return { transcript, timeline };
}
122
+ // Annotate the CommonJS export names for ESM import in node:
123
+ 0 && (module.exports = {
124
+ recognize
125
+ });
@@ -0,0 +1,35 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.cjs';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.cjs';
3
+ import { Timeline } from '../utilities/Timeline.cjs';
4
+ import { Timing } from '../utilities/Timing.cjs';
5
+ import 'node:fs';
6
+ import 'node:stream';
7
+
8
+ interface GoogleCloudSTTOptions {
9
+ apiKey: string;
10
+ alternativeLanguageCodes?: string[] | undefined;
11
+ profanityFilter?: boolean | undefined;
12
+ autoPunctuation?: boolean | undefined;
13
+ useEnhancedModel?: boolean | undefined;
14
+ inputFormat?: AudioFormat | undefined;
15
+ timing?: Timing | undefined;
16
+ }
17
+ declare function recognize(input: RawAudioInput | AudioSource, options: GoogleCloudSTTOptions, languageCode?: string): Promise<{
18
+ transcript: string;
19
+ timeline: Timeline;
20
+ }>;
21
+ interface GoogleCloudSTTResponse {
22
+ results: {
23
+ alternatives?: {
24
+ transcript: string;
25
+ words: {
26
+ word: string;
27
+ startTime: string;
28
+ endTime: string;
29
+ confidence: number;
30
+ }[];
31
+ }[];
32
+ }[];
33
+ }
34
+
35
+ export { type GoogleCloudSTTOptions, type GoogleCloudSTTResponse, recognize };
@@ -0,0 +1,35 @@
1
+ import { AudioFormat } from '../audio/AudioFormat.js';
2
+ import { RawAudioInput, AudioSource } from '../audio/AudioSource.js';
3
+ import { Timeline } from '../utilities/Timeline.js';
4
+ import { Timing } from '../utilities/Timing.js';
5
+ import 'node:fs';
6
+ import 'node:stream';
7
+
8
+ interface GoogleCloudSTTOptions {
9
+ apiKey: string;
10
+ alternativeLanguageCodes?: string[] | undefined;
11
+ profanityFilter?: boolean | undefined;
12
+ autoPunctuation?: boolean | undefined;
13
+ useEnhancedModel?: boolean | undefined;
14
+ inputFormat?: AudioFormat | undefined;
15
+ timing?: Timing | undefined;
16
+ }
17
+ declare function recognize(input: RawAudioInput | AudioSource, options: GoogleCloudSTTOptions, languageCode?: string): Promise<{
18
+ transcript: string;
19
+ timeline: Timeline;
20
+ }>;
21
+ interface GoogleCloudSTTResponse {
22
+ results: {
23
+ alternatives?: {
24
+ transcript: string;
25
+ words: {
26
+ word: string;
27
+ startTime: string;
28
+ endTime: string;
29
+ confidence: number;
30
+ }[];
31
+ }[];
32
+ }[];
33
+ }
34
+
35
+ export { type GoogleCloudSTTOptions, type GoogleCloudSTTResponse, recognize };
@@ -0,0 +1,107 @@
1
+ import {
2
+ convertToBuffer,
3
+ isAudioSource,
4
+ needsConversion,
5
+ normalizeToAudioSource,
6
+ toBuffer
7
+ } from "../audio/index.js";
8
+ import { encodeBase64 } from "../encodings/Base64.js";
9
+ const SERVICE_ID = "google-cloud";
10
/**
 * Transcribe audio with the Google Cloud Speech `speech:recognize` REST endpoint.
 *
 * The input is normalized to an audio source. When `needsConversion` reports
 * that conversion is required, the audio is converted to 16 kHz mono FLAC;
 * otherwise it is buffered as-is. The bytes are sent base64-encoded in a JSON
 * request and the JSON response is handed to `parseResponseBody`.
 *
 * NOTE(review): `sampleRateHertz: 16000` and `audioChannelCount: 1` are sent
 * even when no conversion takes place — confirm that unconverted sources
 * always match these values.
 *
 * @param input - Raw audio input or an already-normalized audio source.
 * @param options - Service options (`apiKey`, optional flags, `inputFormat`, `timing`).
 * @param {string} [languageCode="en-US"] - Language code sent in the request config.
 * @returns {Promise<{transcript: string, timeline: object[]}>} Result of `parseResponseBody`.
 * @throws {Error} When the response status is not OK.
 */
async function recognize(input, options, languageCode = "en-US") {
  const { timing } = options;

  const source = isAudioSource(input) ? input : normalizeToAudioSource(input, options.inputFormat);
  const nativeEncoding = formatToGoogleEncoding(source.format);
  const conversionNeeded = needsConversion(source.format, SERVICE_ID);
  timing?.setMetadata("conversionRequired", conversionNeeded);
  timing?.setMetadata("targetFormat", conversionNeeded ? "flac" : source.format);

  const prepareAudio = async () => {
    if (!conversionNeeded) {
      return toBuffer(source);
    }
    const converted = await convertToBuffer(source, {
      targetFormat: "flac",
      sampleRate: 16e3,
      channels: 1,
    });
    return converted.source.buffer;
  };
  const audioBuffer = timing ? await timing.timeAsync("conversion", prepareAudio) : await prepareAudio();
  // Converted audio is always FLAC; otherwise the source's own encoding applies.
  const encoding = conversionNeeded ? "FLAC" : nativeEncoding;

  const config = {
    encoding,
    sampleRateHertz: 16e3,
    audioChannelCount: 1,
    languageCode,
    alternativeLanguageCodes: options.alternativeLanguageCodes ?? [],
    maxAlternatives: 1,
    profanityFilter: options.profanityFilter ?? false,
    enableWordTimeOffsets: true,
    enableWordConfidence: true,
    enableAutomaticPunctuation: options.autoPunctuation ?? true,
    model: "latest_long",
    useEnhanced: options.useEnhancedModel ?? true,
  };
  const requestBody = {
    config,
    audio: { content: encodeBase64(audioBuffer) },
  };

  const sendRequest = () => fetch(`https://speech.googleapis.com/v1p1beta1/speech:recognize`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${options.apiKey}`,
    },
    body: JSON.stringify(requestBody),
  });
  const response = timing ? await timing.timeAsync("upload", sendRequest) : await sendRequest();

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Google Cloud STT error: ${response.status} ${errorText}`);
  }
  return parseResponseBody(await response.json());
}
67
/**
 * Map an audio format identifier to the `encoding` value placed in the
 * recognition request config.
 *
 * "opus" and "ogg" share "OGG_OPUS"; "flac" and any unrecognized format
 * yield "FLAC".
 *
 * @param {string} format - Audio format identifier.
 * @returns {string} Encoding name for the request config.
 */
function formatToGoogleEncoding(format) {
  if (format === "wav") {
    return "LINEAR16";
  }
  if (format === "mp3") {
    return "MP3";
  }
  if (format === "opus" || format === "ogg") {
    return "OGG_OPUS";
  }
  if (format === "webm") {
    return "WEBM_OPUS";
  }
  // "flac" and every unrecognized format share the same answer.
  return "FLAC";
}
84
/**
 * Flatten a recognition response into a transcript and a word timeline.
 *
 * For every result whose first alternative has a non-empty transcript, that
 * transcript is appended to the output and each of its words becomes a
 * `"word"` timeline entry. Word offsets arrive as strings with an "s" suffix
 * and are converted to numbers.
 *
 * A response without `results`, or an alternative without `words`, is treated
 * as empty rather than raising a TypeError.
 *
 * @param {{results?: object[]}} responseBody - Parsed JSON response body.
 * @returns {{transcript: string, timeline: object[]}} Concatenated transcript
 *   and the word entries in response order.
 */
function parseResponseBody(responseBody) {
  let transcript = "";
  const timeline = [];
  for (const result of responseBody.results ?? []) {
    const firstAlternative = result.alternatives?.[0];
    if (!firstAlternative?.transcript) continue;
    transcript += firstAlternative.transcript;
    for (const wordEvent of firstAlternative.words ?? []) {
      timeline.push({
        type: "word",
        text: wordEvent.word,
        startTime: parseFloat(wordEvent.startTime.replace("s", "")),
        endTime: parseFloat(wordEvent.endTime.replace("s", "")),
        confidence: wordEvent.confidence,
      });
    }
  }
  return { transcript, timeline };
}
105
+ export {
106
+ recognize
107
+ };