utilitas 1999.1.70 → 1999.1.72

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/lib/alan.mjs CHANGED
@@ -1,8 +1,8 @@
1
1
  import { checkSearch, distill, search } from './web.mjs';
2
2
  import { create as createUoid } from './uoid.mjs';
3
- import { createWavHeader } from './media.mjs';
4
3
  import { end, loop } from './event.mjs';
5
4
  import { fileTypeFromBuffer } from 'file-type';
5
+ import { packPcmToWav } from './media.mjs';
6
6
  import { v4 as uuidv4 } from 'uuid';
7
7
 
8
8
  import {
@@ -707,12 +707,8 @@ const packResp = async (resp, options) => {
707
707
  const str = simpleText.indexOf(x);
708
708
  str >= 0 && (simpleText = simpleText.slice(0, str).trim());
709
709
  });
710
- audio && (audio = Buffer.isBuffer(audio) ? audio : await convert(audio, {
711
- input: BASE64, expected: BUFFER,
712
- })) && audio.length && (audio = Buffer.concat([
713
- createWavHeader(audio.length), audio
714
- ])) && (audio = await convert(audio, {
715
- input: BUFFER, expected: BUFFER, ...options || {},
710
+ audio = await ignoreErrFunc(async () => await packPcmToWav(audio, {
711
+ input: Buffer.isBuffer(audio) ? BUFFER : BASE64, expected: BUFFER,
716
712
  }));
717
713
  if (images?.length) {
718
714
  for (let i in images) {
@@ -1592,6 +1588,7 @@ export {
1592
1588
  getSession,
1593
1589
  init,
1594
1590
  initChat,
1591
+ k,
1595
1592
  listFiles,
1596
1593
  listGptFineTuningEvents,
1597
1594
  listGptFineTuningJobs,
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  const manifest = {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "1999.1.70",
4
+ "version": "1999.1.72",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -30,7 +30,6 @@ const manifest = {
30
30
  "@ffprobe-installer/ffprobe": "^2.1.2",
31
31
  "@google-cloud/speech": "^7.1.0",
32
32
  "@google-cloud/storage": "^7.16.0",
33
- "@google-cloud/text-to-speech": "^6.1.0",
34
33
  "@google-cloud/vision": "^5.1.0",
35
34
  "@google/genai": "^1.0.0",
36
35
  "@mozilla/readability": "github:mozilla/readability",
package/lib/media.mjs CHANGED
@@ -38,6 +38,17 @@ const createWavHeader = (
38
38
  return header;
39
39
  };
40
40
 
41
+ const packPcmToWav = async (audio, options) => {
42
+ (audio = await convert(audio, { ...options || {}, expected: BUFFER })) // DON'T override expected
43
+ && audio.length
44
+ && (audio = Buffer.concat([createWavHeader(audio.length), audio]))
45
+ && (audio = await convert(audio, {
46
+ expected: BUFFER, ...options || {}, input: BUFFER, // DON'T override input
47
+ }));
48
+ assert(audio, 'Failed to pack PCM to WAV.', 500);
49
+ return audio;
50
+ };
51
+
41
52
  // https://codex.so/ffmpeg-node-js
42
53
  const getFfmpeg = async (options) => {
43
54
  const ffmpeg = await need('fluent-ffmpeg');
@@ -92,4 +103,5 @@ export {
92
103
  convertAudioTo16kNanoPcmWave,
93
104
  createWavHeader,
94
105
  getFfmpeg,
106
+ packPcmToWav,
95
107
  };
package/lib/speech.mjs CHANGED
@@ -1,9 +1,9 @@
1
- import { DEFAULT_MODELS, OPENAI_VOICE } from './alan.mjs';
1
+ import { DEFAULT_MODELS, OPENAI_VOICE, countTokens, k } from './alan.mjs';
2
2
  import { getApiKeyCredentials, hash } from './encryption.mjs';
3
- import { getFfmpeg } from './media.mjs';
3
+ import { getFfmpeg, packPcmToWav } from './media.mjs';
4
4
  import { get } from './web.mjs';
5
5
  import { convert, getTempPath } from './storage.mjs';
6
- import { ensureString } from './utilitas.mjs';
6
+ import { ensureString, mergeAtoB } from './utilitas.mjs';
7
7
 
8
8
  import {
9
9
  call, countKeys, ignoreErrFunc, inBrowser,
@@ -17,19 +17,32 @@ import {
17
17
 
18
18
  const _NEED = [
19
19
  '@google-cloud/speech',
20
- '@google-cloud/text-to-speech',
20
+ '@google/genai',
21
21
  'OpenAI',
22
22
  'whisper-node',
23
23
  ];
24
24
 
25
25
  const WHISPER_DEFAULT_MODEL = 'base';
26
26
  const errorMessage = 'Invalid audio data.';
27
- const [BUFFER, STREAM, BASE64, FILE, clients, languageCode, audioEncoding, suffix, SPEAKER, cleanup]
28
- = ['BUFFER', 'STREAM', 'BASE64', 'FILE', {}, 'en-US', 'OGG_OPUS', 'ogg', 'SPEAKER', true];
29
- const [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, OPENAI_TTS_MAX_LENGTH]
30
- = ['gpt-4o-mini-tts', 'gpt-4o-transcribe', 4096];
31
- const [defaultOpenAITtsModel, defaultOpenAISttModel]
32
- = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE];
27
+
28
+ const [
29
+ BUFFER, STREAM, BASE64, FILE, clients, languageCode, audioEncoding, suffix,
30
+ SPEAKER, cleanup, wav,
31
+ ] = [
32
+ 'BUFFER', 'STREAM', 'BASE64', 'FILE', {}, 'en-US', 'OGG_OPUS', 'ogg',
33
+ 'SPEAKER', true, 'wav'
34
+ ];
35
+
36
+ const [
37
+ GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_PRO_TTS, GEMINI_25_FLASH_TTS,
38
+ OPENAI_TTS_MAX_LENGTH,
39
+ ] = [
40
+ 'gpt-4o-mini-tts', 'gpt-4o-transcribe', 'gemini-2.5-pro-preview-tts',
41
+ 'gemini-2.5-flash-preview-tts', 4096
42
+ ];
43
+
44
+ const [defaultOpenAITtsModel, defaultOpenAISttModel, defaultGeminiTtsModel]
45
+ = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_PRO_TTS];
33
46
 
34
47
  const WHISPER_MODELS = [
35
48
  // npx whisper-node download tiny.en
@@ -104,13 +117,14 @@ const init = async (options) => {
104
117
  break;
105
118
  case 'GOOGLE':
106
119
  clients._provider = provider;
107
- const sslCreds = await getApiKeyCredentials(options);
108
120
  if (options?.tts) {
109
- const tts = (await need('@google-cloud/text-to-speech')).default;
110
- clients.tts = new tts.TextToSpeechClient({ sslCreds });
121
+ let { GoogleGenAI } = await need('@google/genai');
122
+ let client = new GoogleGenAI(options);
123
+ clients.tts = client.models.generateContent;
111
124
  }
112
125
  if (options?.stt) {
113
126
  const stt = (await need('@google-cloud/speech')).default;
127
+ const sslCreds = await getApiKeyCredentials(options);
114
128
  clients.stt = new stt.SpeechClient({ sslCreds });
115
129
  }
116
130
  break;
@@ -159,15 +173,29 @@ const ttsOpenAI = async (input, options) => {
159
173
  return await convert(buffer, { suffix, ...options || {} });
160
174
  };
161
175
 
162
- const ttsGoogle = async (text, options) => {
176
+ // https://ai.google.dev/gemini-api/docs/speech-generation#voices
177
+ const ttsGoogle = async (contents, options) => {
163
178
  assert(clients.tts, 'Google TTS API has not been initialized.', 500);
164
- assert(text, 'Text is required.', 400);
165
- const [response] = await clients.tts.synthesizeSpeech({
166
- input: { text, ...options?.input || {} },
167
- voice: { languageCode, name: 'en-US-Wavenet-F', ...options?.voice || {} },
168
- audioConfig: { audioEncoding, ...options?.audioConfig || {} },
179
+ assert(contents, 'Text is required.', 400);
180
+ assert(await countTokens(contents) <= k(32), 'Text is too long.', 400);
181
+ const resp = await clients.tts({
182
+ model: options?.model || defaultGeminiTtsModel, contents,
183
+ config: mergeAtoB(options?.config, {
184
+ responseModalities: ['AUDIO'],
185
+ speechConfig: {
186
+ voiceConfig: {
187
+ prebuiltVoiceConfig: {
188
+ voiceName: options?.voice || 'Leda',
189
+ },
190
+ },
191
+ },
192
+ }),
193
+ });
194
+ const rawAudio = resp?.candidates?.[0]?.content?.parts?.[0]?.inlineData;
195
+ assert(rawAudio, 'Failed to generate audio.', 500);
196
+ return options?.raw ? rawAudio : await packPcmToWav(rawAudio?.data, {
197
+ input: BASE64, expected: 'FILE', suffix: wav, ...options || {},
169
198
  });
170
- return await convert(response.audioContent, { suffix, ...options || {} });
171
199
  };
172
200
 
173
201
  const ttsSay = async (text, options) => {
@@ -275,13 +303,16 @@ const stt = async (audio, options) => {
275
303
  export default init;
276
304
  export {
277
305
  _NEED,
306
+ OPENAI_TTS_MAX_LENGTH,
278
307
  checkSay,
279
308
  checkWhisper,
280
- init, OPENAI_TTS_MAX_LENGTH, stt, sttGoogle,
309
+ init,
310
+ stt,
311
+ sttGoogle,
281
312
  sttOpenAI,
282
313
  sttWhisper,
283
314
  tts,
284
315
  ttsGoogle,
285
316
  ttsOpenAI,
286
- ttsSay
317
+ ttsSay,
287
318
  };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "1999.1.70",
4
+ "version": "1999.1.72",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -41,7 +41,6 @@
41
41
  "@ffprobe-installer/ffprobe": "^2.1.2",
42
42
  "@google-cloud/speech": "^7.1.0",
43
43
  "@google-cloud/storage": "^7.16.0",
44
- "@google-cloud/text-to-speech": "^6.1.0",
45
44
  "@google-cloud/vision": "^5.1.0",
46
45
  "@google/genai": "^1.0.0",
47
46
  "@mozilla/readability": "github:mozilla/readability",