utilitas 2000.3.27 → 2000.3.29
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +7 -11
- package/dist/utilitas.lite.mjs +1 -1
- package/dist/utilitas.lite.mjs.map +1 -1
- package/lib/alan.mjs +287 -159
- package/lib/manifest.mjs +1 -1
- package/lib/speech.mjs +15 -139
- package/lib/storage.mjs +6 -4
- package/package.json +1 -1
package/lib/manifest.mjs
CHANGED
package/lib/speech.mjs
CHANGED
@@ -1,36 +1,20 @@
-import { DEFAULT_MODELS, OPENAI_VOICE, countTokens, k } from './alan.mjs';
-import { getFfmpeg, packPcmToWav } from './media.mjs';
 import { get } from './web.mjs';
-import {
-import {
+import { getFfmpeg } from './media.mjs';
+import { getTempPath } from './storage.mjs';
+import { hash } from './encryption.mjs';
 
 import {
-    call,
-    need, throwError
+    call, ignoreErrFunc, inBrowser, need, throwError,
 } from './utilitas.mjs';
 
 import {
-    convertAudioTo16kNanoOpusOgg,
-    convertAudioTo16kNanoPcmWave,
+    convertAudioTo16kNanoOpusOgg, convertAudioTo16kNanoPcmWave,
 } from './media.mjs';
 
-const _NEED = ['
+const _NEED = ['whisper-node'];
 
-const [
-
-    GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS,
-    OPENAI_TTS_MAX_LENGTH, WHISPER_DEFAULT_MODEL, errorMessage
-] = [
-    'BUFFER', 'STREAM', 'BASE64', 'FILE', {}, 'ogg', 'SPEAKER', true, 'wav',
-    'gpt-4o-mini-tts', 'gpt-4o-transcribe', 'gemini-2.5-flash-preview-tts',
-    4096, 'base', 'Invalid audio data.',
-];
-
-const [
-    defaultOpenAITtsModel, defaultOpenAISttModel, defaultGeminiTtsModel,
-] = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS];
-
-const TTS_PROMPT = "As an AI voice assistant, please say the following content in a warm, friendly and professional tone, if the language is English, use an American accent, if it's Traditional Chinese, use Hong Kong Cantonese, if it's Simplified Chinese, use standard Mandarin, for other languages, please speak with a standard, clear accent";
+const [FILE, suffix, SPEAKER, cleanup, WHISPER_DEFAULT_MODEL, errorMessage]
+    = ['FILE', 'ogg', 'SPEAKER', true, 'base', 'Invalid audio data.'];
 
 const WHISPER_MODELS = [
     // npx whisper-node download tiny.en
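
The slimmed-down `_NEED` list means the module now declares whisper-node as its only optional dependency; the OpenAI and Google SDKs are no longer pulled in here. As a rough sketch of the pattern (this `need` is a simplified stand-in for the helper imported from './utilitas.mjs', not its actual implementation):

    // Simplified stand-in for utilitas' `need`: lazily import an optional
    // dependency and fail with an actionable message when it is absent.
    const need = async (name) => {
        try {
            const mod = await import(name);
            return mod.default ?? mod;
        } catch {
            throw new Error(`Missing optional dependency: ${name}. Try: npm i ${name}`);
        }
    };

    // With _NEED = ['whisper-node'], this is the only lazy import left here:
    const whisper = await need('whisper-node');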
@@ -83,105 +67,22 @@ const getWhisperModelReady = async (model, options) => {
     return (await get(getWhisperModelUrl(model), { fuzzy: true }))?.cache?.content;
 };
 
-const
-    if (options) {
-        assert(
-            options?.tts || options?.stt,
-            'At least one of TTS or STT is selected.', 500
-        );
-        const provider = ensureString(options?.provider, { case: 'UP' });
-        switch (provider) {
-            case 'OPENAI':
-                clients._provider = provider;
-                const OpenAI = await need('openai');
-                const openai = new OpenAI(options);
-                if (options?.tts) {
-                    clients.tts = openai.audio.speech;
-                }
-                if (options?.stt) {
-                    clients.stt = openai.audio.transcriptions;
-                    clients.toFile = OpenAI.toFile;
-                }
-                break;
-            case 'GOOGLE':
-                clients._provider = provider;
-                const { GoogleGenAI } = await need('@google/genai');
-                const client = new GoogleGenAI(options);
-                if (options?.tts) {
-                    clients.tts = client.models.generateContent;
-                }
-                break;
-            case '':
-                clients._provider = 'LOCAL';
-                options?.tts && await checkSay({ assert: true });
-                options?.stt && await checkWhisper({ assert: true });
-                break;
-            default:
-                throwError('Invalid speech provider.', 500);
-        }
-    }
-    assert(
-        countKeys(clients), 'Speech API client has not been initialized.', 501
-    );
-    return clients;
-};
-
-const checkSay = async (options) => {
+const checkSay = async () => {
     const result = !!(await ignoreErrFunc(async () => (
         await Promise.all([need('node:os'), need('say'), getFfmpeg()])
     )[0].platform() === 'darwin'));
-
+    assert(result, 'Say API is not available.', 500);
     return result;
 };
 
-const checkWhisper = async (
+const checkWhisper = async () => {
     const result = !!(await ignoreErrFunc(() => Promise.all([
         need('whisper-node'), getFfmpeg()
     ])));
-
+    assert(result, 'Whisper API is not available.', 500);
     return result;
 };
 
-const ttsOpenAI = async (input, options) => {
-    assert(clients.tts, 'OpenAI TTS API has not been initialized.', 500);
-    assert(input, 'Text is required.', 400);
-    assert(input.length <= OPENAI_TTS_MAX_LENGTH, 'Text is too long.', 400);
-    // https://platform.openai.com/docs/api-reference/audio/createSpeech
-    const content = await clients.tts.create({
-        model: defaultOpenAITtsModel, voice: DEFAULT_MODELS[OPENAI_VOICE],
-        instructions: 'Speak in a friendly and sweet tone.',
-        response_format: 'opus', input, ...options?.params || {},
-    });
-    const buffer = Buffer.from(await content.arrayBuffer());
-    return await convert(buffer, { suffix, ...options || {} });
-};
-
-// https://ai.google.dev/gemini-api/docs/speech-generation#voices
-const ttsGoogle = async (contents, options) => {
-    assert(clients.tts, 'Google TTS API has not been initialized.', 500);
-    assert(contents, 'Text is required.', 400);
-    assert(await countTokens(contents) <= k(32), 'Text is too long.', 400);
-    const resp = await clients.tts({
-        model: options?.model || defaultGeminiTtsModel,
-        contents: `${options?.prompt || TTS_PROMPT}: ${contents}`,
-        config: mergeAtoB(options?.config, {
-            responseModalities: ['AUDIO'],
-            speechConfig: {
-                voiceConfig: {
-                    prebuiltVoiceConfig: {
-                        voiceName: options?.voice || 'Zephyr',
-                    },
-                },
-            },
-        }),
-    });
-    const rawAudio = resp?.candidates?.[0]?.content?.parts?.[0]?.inlineData;
-    assert(rawAudio, 'Failed to generate audio.', 500);
-    return options?.raw ? rawAudio : await packPcmToWav(rawAudio?.data, {
-        input: BASE64, expected: 'FILE', suffix: wav, ...options || {},
-    });
-};
-
 const ttsSay = async (text, options) => {
     const say = await need('say');
     assert(text, 'Text is required.', 400);
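
Note that `checkSay` and `checkWhisper` now assert on failure rather than just returning `false`, so a caller that only wants to probe availability has to absorb the thrown error. A minimal sketch of that calling pattern, assuming the behavior shown in this hunk:

    import { checkWhisper } from './lib/speech.mjs';

    // checkWhisper() throws 'Whisper API is not available.' when whisper-node
    // or ffmpeg is missing; catch it to turn the probe into a boolean.
    const hasWhisper = await checkWhisper().catch(() => false);
    console.log(hasWhisper ? 'STT ready' : 'STT unavailable');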
@@ -208,22 +109,6 @@ const ttsBrowser = async (text) => {
     return speechSynthesis.speak(new SpeechSynthesisUtterance(text));
 };
 
-const sttOpenAI = async (audio, options) => {
-    assert(clients.stt, 'OpenAI STT API has not been initialized.', 500);
-    const input = ensureString(options?.input, { case: 'UP' });
-    const { content, cleanup } = await convert(audio, {
-        input: options?.input, ...options || {}, expected: STREAM, errorMessage,
-        suffix: ['', BUFFER].includes(input) ? suffix : null,
-        withCleanupFunc: true,
-    });
-    const result = await clients.stt.create({
-        file: await clients.toFile(content), model: defaultOpenAISttModel,
-        response_format: 'text', ...options?.params || {},
-    });
-    await cleanup();
-    return result;
-};
-
 // This function is not working properly, a pull request is filed:
 // https://github.com/ariym/whisper-node/pull/58
 const sttWhisper = async (audio, options) => {
@@ -253,33 +138,24 @@ const sttWhisper = async (audio, options) => {
 const tts = async (text, options) => {
     let engine;
     if (inBrowser()) { engine = ttsBrowser }
-    else if (clients?.tts && clients._provider === 'GOOGLE') { engine = ttsGoogle; }
-    else if (clients?.tts && clients._provider === 'OPENAI') { engine = ttsOpenAI; }
     else if (await checkSay()) { engine = ttsSay; }
-    else { throwError('Text-to-Speech engine
+    else { throwError('Text-to-Speech engine is not available.', 500); }
     return await engine(text, options);
 };
 
 const stt = async (audio, options) => {
     let engine;
-    if (
-    else
-    else { throwError('Speech-to-Text engine has not been initialized.', 500); }
+    if (await checkWhisper()) { engine = sttWhisper; }
+    else { throwError('Speech-to-Text engine is not available.', 500); }
     return await engine(audio, options);
 };
 
-export default init;
 export {
     _NEED,
-    OPENAI_TTS_MAX_LENGTH,
     checkSay,
     checkWhisper,
-    init,
     stt,
-    sttOpenAI,
     sttWhisper,
     tts,
-    ttsGoogle,
-    ttsOpenAI,
     ttsSay,
 };
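
With `init` and the cloud engines gone, the module's public surface is local-only: `tts` dispatches to the browser's SpeechSynthesis or macOS `say`, and `stt` dispatches to whisper-node. A hedged usage sketch (the file path and the `input` option are illustrative, inferred from the `convert` signature in storage.mjs):

    import { stt, tts } from './lib/speech.mjs';

    // TTS: SpeechSynthesis in a browser, the `say` package on macOS.
    await tts('Hello from utilitas.');

    // STT: routed through checkWhisper() and sttWhisper.
    const text = await stt('/tmp/sample.ogg', { input: 'FILE' });
    console.log(text);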
package/lib/storage.mjs
CHANGED
@@ -240,7 +240,7 @@ const blobToBuffer = async blob => {
 
 const convert = async (any, options) => {
     assert(any, options?.errorMessage || 'Invalid input.', 400);
-
+    let result = {};
     let [input, expected] = [(
         Buffer.isBuffer(any)
         || ArrayBuffer.isArrayBuffer(any)
@@ -248,7 +248,7 @@ const convert = async (any, options) => {
     ) ? BUFFER : options?.input, options?.expected || BUFFER].map(
         x => ensureString(x, { case: 'UP' })
     );
-    let [oriFile, meta, mime, subExp] = [null, null,
+    let [oriFile, meta, mime, subExp] = [null, null, null, expected];
     switch (input) {
         case FILE:
             oriFile = any;
@@ -269,6 +269,7 @@ const convert = async (any, options) => {
             input = BUFFER;
             break;
     }
+    mime || (mime = (await getMime(any, any))?.mime || MIME_BINARY);
     switch (expected) {
         case STREAM: subExp = FILE; break;
         case DATAURL: subExp = BUFFER; break;
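
The added line back-fills `mime` by sniffing the value itself before the `expected` switch runs, so downstream consumers always see a concrete type. A reduced sketch of the precedence (mime already set by an earlier branch, then content sniffing, then a generic binary fallback); the getMime stub and the MIME_BINARY value here are assumptions:

    // Stand-in for the patched getMime (a fuller sketch follows the next hunk).
    const getMime = async () => ({ mime: 'audio/ogg' });
    const MIME_BINARY = 'application/octet-stream'; // assumed constant value
    const any = Buffer.from([0x4f, 0x67, 0x67, 0x53]); // 'OggS' magic bytes
    let mime = null; // e.g. the FILE branch found no usable metadata
    mime || (mime = (await getMime(any, any))?.mime || MIME_BINARY);
    console.log(mime); // 'audio/ogg'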
@@ -313,8 +314,9 @@ const convert = async (any, options) => {
 
 const getMime = async (buf, filename) => {
     const mimeType = await ignoreErrFunc(() => need('mime-types'));
-    const mime = extract(await fileTypeFromBuffer(buf), 'mime')
-        || (filename && mimeType?.lookup?.(filename))
+    const mime = (buf && Buffer.isBuffer(buf) && extract(await fileTypeFromBuffer(buf), 'mime'))
+        || (filename && String.isString(filename) && mimeType?.lookup?.(filename))
+        || MIME_BINARY;
     return { mime, extension: mimeType?.extension?.(mime) || 'bin' };
 };
 
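
The hardened `getMime` only sniffs real Buffers and only does filename lookups on real strings, so malformed arguments degrade to the binary fallback instead of throwing. A standalone sketch of the same guard chain, using the file-type and mime-types packages directly (`String.isString` and `extract` are utilitas helpers, approximated here with plain JavaScript):

    import { fileTypeFromBuffer } from 'file-type';
    import mimeTypes from 'mime-types';

    const MIME_BINARY = 'application/octet-stream';

    // Guard each probe by input type, mirroring the patched getMime:
    // sniff only Buffers, look up only string filenames, else binary.
    const getMimeSketch = async (buf, filename) => {
        const mime = (Buffer.isBuffer(buf)
            && (await fileTypeFromBuffer(buf))?.mime)
            || (typeof filename === 'string' && mimeTypes.lookup(filename))
            || MIME_BINARY;
        return { mime, extension: mimeTypes.extension(mime) || 'bin' };
    };

    console.log(await getMimeSketch(null, 'clip.ogg')); // { mime: 'audio/ogg', extension: 'ogg' }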