utilitas 2000.3.25 → 2000.3.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -15,7 +15,6 @@ import * as email from './lib/email.mjs';
 import * as embedding from './lib/embedding.mjs';
 import * as encryption from './lib/encryption.mjs';
 import * as event from './lib/event.mjs';
-import * as gen from './lib/gen.mjs';
 import * as media from './lib/media.mjs';
 import * as memory from './lib/memory.mjs';
 import * as network from './lib/network.mjs';
@@ -40,8 +39,8 @@ export {
     fileType, math, uuid,
     // features
     alan, bee, bot, boxes, cache, callosum, color, dbio, email, embedding,
-    encryption, event, gen, manifest, media, memory, network, sentinel, shell,
-    sms, speech, ssl, storage, tape, uoid, utilitas, vision, web
+    encryption, event, manifest, media, memory, network, sentinel, shell, sms,
+    speech, ssl, storage, tape, uoid, utilitas, vision, web
 };

 if (utilitas.inBrowser() && !globalThis.utilitas) {
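
The `gen` module and its re-export are removed from the public surface; generation now flows through `alan`, which this release extends with Imagen and Veo models (see package/lib/alan.mjs below). A hypothetical consumer migration sketch, not package code; it assumes `alan.init` keeps its prior options shape:

    // was: import { gen } from 'utilitas';
    import { alan } from 'utilitas';

    // `apiKey` is forwarded into the provider client (see the GOOGLE init
    // branch later in this diff); the exact option names are assumptions.
    await alan.init({
        provider: 'Google', apiKey: process.env.GEMINI_API_KEY,
    });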
package/lib/alan.mjs CHANGED
@@ -5,17 +5,18 @@ import { packPcmToWav } from './media.mjs';
 import { v4 as uuidv4 } from 'uuid';

 import {
-    BASE64, BUFFER, DATAURL, MIME_BINARY, MIME_TEXT, MIME_PNG, MIME_JPEG,
+    FILE, BASE64, BUFFER, DATAURL, MIME_BINARY, MIME_TEXT, MIME_PNG, MIME_JPEG,
     MIME_MOV, MIME_MPEG, MIME_MP4, MIME_MPG, MIME_AVI, MIME_WMV, MIME_MPEGPS,
     MIME_FLV, MIME_GIF, MIME_WEBP, MIME_PDF, MIME_AAC, MIME_FLAC, MIME_MP3,
     MIME_MPEGA, MIME_M4A, MIME_MPGA, MIME_OPUS, MIME_PCM, MIME_WAV, MIME_WEBM,
-    MIME_TGPP, MIME_PCM16, MIME_OGG, convert, formatDataURL, decodeBase64DataURL,
+    MIME_TGPP, MIME_PCM16, MIME_OGG, convert, formatDataURL, getTempPath,
+    decodeBase64DataURL,
 } from './storage.mjs';

 import {
     log as _log, renderText as _renderText, base64Encode, ensureArray,
     ensureString, extract, ignoreErrFunc, insensitiveCompare, isSet, need,
-    parseJson, throwError
+    parseJson, throwError, tryUntil, timeout,
 } from './utilitas.mjs';

 const INSTRUCTIONS = `You are a highly capable, thoughtful, and precise assistant named HAL9000. Your goal is to deeply understand the user's intent, ask clarifying questions when needed, think step-by-step through complex problems, provide clear and accurate answers, and proactively anticipate helpful follow-up information. Always prioritize being truthful, nuanced, insightful, and efficient, tailoring your responses specifically to the user's needs and preferences. Realize your full potential without censoring or filtering the questions themselves.
@@ -42,10 +43,10 @@ You may be provided with some tools(functions) to help you gather information an
 - When the tools are not suitable and you have to answer questions based on your understanding, please do not mention any tool-related information in your response.
 - Unless otherwise specified to require the original result, in most cases, you may reorganize the information obtained after using the tool to solve the problem as needed.`;

-const _NEED = ['js-tiktoken', 'OpenAI'];
+const _NEED = ['js-tiktoken', 'OpenAI', '@google/genai'];

 const [
-    OPENAI, GEMINI, OLLAMA, NOVA, DEEPSEEK_32, MD_CODE, CLOUD_OPUS_45, AUDIO,
+    OPENAI, GOOGLE, OLLAMA, NOVA, DEEPSEEK_32, MD_CODE, CLOUD_OPUS_45, AUDIO,
     WAV, ATTACHMENTS, OPENAI_VOICE, GPT_REASONING_EFFORT, THINK, THINK_STR,
     THINK_END, TOOLS_STR, TOOLS_END, TOOLS, TEXT, OK, FUNC, GPT_51,
     GPT_51_CODEX, GPT_5_IMAGE, GEMMA_3_27B, ANTHROPIC, v8k, ais,
@@ -54,9 +55,10 @@ const [
     hour, gb, trimTailing, trimBeginning, GEMINI_30_PRO_IMAGE, IMAGE, JINA,
     JINA_DEEPSEARCH, SILICONFLOW, SF_DEEPSEEK_32, MAX_TIRE, OPENROUTER_API,
     OPENROUTER, AUTO, TOOL, S_OPENAI, S_GOOGLE, S_ANTHROPIC, ONLINE,
-    GEMINI_30_PRO, GEMINI_25_FLASH,
+    GEMINI_30_PRO, GEMINI_25_FLASH, IMAGEN_4_ULTRA, VEO_31, IMAGEN_4_UPSCALE,
+    ERROR_GENERATING,
 ] = [
-    'OpenAI', 'Gemini', 'Ollama', 'nova', 'deepseek-3.2-speciale', '```',
+    'OpenAI', 'Google', 'Ollama', 'nova', 'deepseek-3.2-speciale', '```',
     'claude-opus-4.5', 'audio', 'wav', '[ATTACHMENTS]', 'OPENAI_VOICE',
     'medium', 'think', '<think>', '</think>', '<tools>', '</tools>',
     'tools', 'text', 'OK', 'function', 'gpt-5.1', 'gpt-5.1-codex',
@@ -70,7 +72,9 @@ const [
     'deepseek-ai/DeepSeek-V3.2-exp', 768 * 768,
     'https://openrouter.ai/api/v1', 'OpenRouter', 'openrouter/auto', 'tool',
     'openai', 'google', 'anthropic', ':online', 'gemini-3-pro-preview',
-    'gemini-2.5-flash-preview-09-2025',
+    'gemini-2.5-flash-preview-09-2025', 'imagen-4.0-ultra-generate-001',
+    'veo-3.1-generate-preview', 'imagen-4.0-upscale-preview',
+    'Error generating content.',
 ];

 const [tool, messages, text]
@@ -143,23 +147,27 @@ const MODELS = {
         ...GEMINI_RULES, contextWindow: m(1), maxOutputTokens: k(64),
         reasoning: true, tools: true,
     },
-    // models with unique capabilities
+    // models with generation capabilities
     [GEMINI_30_PRO_IMAGE]: {
         ...GEMINI_RULES, icon: '🍌', label: 'Nano Banana Pro',
         contextWindow: k(64), maxOutputTokens: k(32),
         fast: true, image: true,
     },
-    [GPT_51_CODEX]: { ...OPENAI_RULES },
-    [GPT_5_IMAGE]: { ...OPENAI_RULES, image: true },
-    [JINA_DEEPSEARCH]: { // @todo: parse more details from results, eg: "reed urls".
-        icon: '✴️', contextWindow: Infinity, maxInputTokens: Infinity,
-        maxOutputTokens: Infinity, imageCostTokens: 0, maxImageSize: Infinity,
-        supportedMimeTypes: [MIME_PNG, MIME_JPEG, MIME_TEXT, MIME_WEBP, MIME_PDF],
-        reasoning: true, json: true, vision: true,
-        deepsearch: true, defaultProvider: JINA,
+    [IMAGEN_4_ULTRA]: {
+        source: S_GOOGLE, icon: '🎨', maxInputTokens: 480,
+        image: true, defaultProvider: GOOGLE,
     },
-    [DEEPSEEK_32]: DEEPSEEK_32_RULES,
-    [SF_DEEPSEEK_32]: { ...DEEPSEEK_32_RULES, defaultProvider: SILICONFLOW },
+    [VEO_31]: {
+        source: S_GOOGLE, icon: '🎥', maxInputTokens: 1024,
+        imageCostTokens: 0, maxImagePerPrompt: 1,
+        maxImageSize: Infinity, supportedMimeTypes: [MIME_PNG, MIME_JPEG],
+        vision: true, image: true, defaultProvider: GOOGLE,
+    },
+    [GPT_5_IMAGE]: {
+        ...OPENAI_RULES, icon: '🎨', label: 'gpt-image-1', image: true,
+    },
+    // models with code capabilities
+    [GPT_51_CODEX]: { ...OPENAI_RULES },
     [CLOUD_OPUS_45]: {
         source: S_ANTHROPIC, icon: '✳️',
         contextWindow: kT(200), maxOutputTokens: kT(64),
@@ -170,6 +178,17 @@ const MODELS = {
         json: true, reasoning: true, tools: true, vision: true,
         defaultProvider: OPENROUTER,
     },
+    // models with deepsearch capabilities
+    [JINA_DEEPSEARCH]: { // @todo: parse more details from results, eg: "reed urls".
+        icon: '✴️', contextWindow: Infinity, maxInputTokens: Infinity,
+        maxOutputTokens: Infinity, imageCostTokens: 0, maxImageSize: Infinity,
+        supportedMimeTypes: [MIME_PNG, MIME_JPEG, MIME_TEXT, MIME_WEBP, MIME_PDF],
+        reasoning: true, json: true, vision: true,
+        deepsearch: true, defaultProvider: JINA,
+    },
+    // best Chinese models
+    [DEEPSEEK_32]: DEEPSEEK_32_RULES,
+    [SF_DEEPSEEK_32]: { ...DEEPSEEK_32_RULES, defaultProvider: SILICONFLOW },
     // best local model
     [GEMMA_3_27B]: {
         icon: '❇️', contextWindow: kT(128), maxOutputTokens: k(8),
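
The MODELS table is regrouped by capability (generation, code, deepsearch, Chinese, local) and gains Google-hosted entries for Imagen 4 Ultra and Veo 3.1. A sketch of reading those capability flags downstream; the selection logic is illustrative, though `MODELS`, `IMAGEN_4_ULTRA`, and `VEO_31` are real exports added at the bottom of this file:

    import { MODELS, IMAGEN_4_ULTRA, VEO_31 } from './lib/alan.mjs';

    // Both new Google models advertise `image: true`; only Veo also accepts
    // an input image (`vision: true`, one PNG/JPEG per prompt).
    const imageCapable = Object.keys(MODELS).filter(n => MODELS[n].image);
    console.log(imageCapable.includes(IMAGEN_4_ULTRA)); // true
    console.log(MODELS[VEO_31].vision, MODELS[VEO_31].maxImagePerPrompt); // true 1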
@@ -249,7 +268,7 @@ const DEFAULT_MODELS = {
 };

 const PROVIDER_ICONS = {
-    [OPENROUTER]: '🔀', [OPENAI]: '⚛️', [JINA]: '✴️', [GEMINI]: '♊️',
+    [OPENROUTER]: '🔀', [OPENAI]: '⚛️', [JINA]: '✴️', [GOOGLE]: '♊️',
     [OLLAMA]: '🦙', [ANTHROPIC]: '✳️', [SILICONFLOW]: '🧬',
 };
@@ -273,7 +292,7 @@ let tokeniser, _tools;

 const unifyProvider = provider => {
     assert(provider = (provider || '').trim(), 'AI provider is required.');
-    for (let type of [OPENROUTER, JINA, OLLAMA, SILICONFLOW]) {
+    for (let type of [OPENROUTER, GOOGLE, JINA, OLLAMA, SILICONFLOW]) {
         if (insensitiveCompare(provider, type)) { return type; }
     }
     throwError(`Invalid AI provider: ${provider}.`);
@@ -406,6 +425,16 @@ const init = async (options = {}) => {
         `Model name or description is required for provider: ${provider}.`);
     _tools || (_tools = await packTools());
     switch (provider) {
+        case GOOGLE:
+            assertApiKey(provider, options);
+            const { GoogleGenAI } = await need('@google/genai');
+            var client = new GoogleGenAI({ vertexai: false, ...options });
+            for (let model of models) {
+                setupAi({
+                    provider, model, client, prompt: promptGoogle, priority,
+                });
+            }
+            break;
         case JINA:
             assertApiKey(provider, options);
             var client = await OpenAI({
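
The new GOOGLE branch registers each model against a shared GoogleGenAI client and routes prompts to `promptGoogle`. A sketch of driving it, assuming `init` stays exported as before; `apiKey` and `models` are inferred from the spread into `new GoogleGenAI({ vertexai: false, ...options })` and the `models` loop, and `@google/genai` must be installed since it is loaded via `need`:

    import { init } from './lib/alan.mjs';

    await init({
        provider: 'Google',                 // normalized by unifyProvider
        apiKey: process.env.GEMINI_API_KEY, // forwarded into GoogleGenAI
        models: ['imagen-4.0-ultra-generate-001', 'veo-3.1-generate-preview'],
    });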
@@ -588,7 +617,9 @@ const listOpenAIModels = async (aiId, options) => {
 };

 const streamResp = async (resp, options) => {
-    const msg = await packResp(resp, { ...options, processing: true });
+    const msg = options?.noPack ? resp : await packResp(
+        resp, { ...options, processing: true }
+    );
     return options?.stream
         && (msg?.text || msg?.audio?.length || msg?.images?.length)
         && await ignoreErrFunc(async () => await options.stream(msg), LOG);
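
`streamResp` gains a `noPack` escape hatch: the response is handed to the stream callback as-is instead of going through `packResp`. The new `promptGoogle` (later in this diff) uses it to stream payloads it has already packed itself. A usage sketch; the callback shape is inferred from the guards above:

    await streamResp(resp, {
        noPack: true, // resp must already look like a packed message
        stream: async msg => console.log(msg.text, msg.images?.length),
    });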
@@ -606,13 +637,13 @@ const packResp = async (resp, options) => {
     if (options?.raw) { return resp; }
     let [
         txt, audio, images, annotations, simpleText, annotationsMarkdown, end,
-        json, audioMimeType, catched,
+        json, audioMimeType,
     ] = [
         resp.text || '', // ChatGPT / Claude / Gemini / Ollama
         resp?.audio?.data, // ChatGPT audio mode
         resp?.images || [], // Gemini images via Openrouter
         resp?.references, // Gemini references
-        '', '', '', null, MIME_PCM16, new Set(),
+        '', '', '', null, MIME_PCM16,
     ];
     simpleText = txt;
     while ((end = getInfoEnd(simpleText))) {
@@ -698,18 +729,23 @@ const packResp = async (resp, options) => {
         ...annotationsMarkdown ? { annotationsMarkdown } : {},
         ...audio ? { audio } : {}, ...images?.length ? { images } : {},
         processing: !!options?.processing,
-        model: [
+        model: packModelLabel([
             options.provider, options?.router?.provider,
             options?.router?.model || options?.model,
-        ].join('/').split('/').map(x => {
-            const key = ensureString(x, { case: 'UP' });
-            if (catched.has(key)) { return null; }
-            catched.add(key);
-            return x;
-        }).filter(x => x).join('/'),
+        ]),
     };
 };

+const packModelLabel = (model_reference) => {
+    const catched = new Set();
+    return model_reference.join('/').split('/').map(x => {
+        const key = ensureString(x, { case: 'UP' });
+        if (catched.has(key)) { return null; }
+        catched.add(key);
+        return x;
+    }).filter(x => x).join('/');
+};
+
 const buildPrompts = async (model, input, options = {}) => {
     assert(!(
         options.jsonMode && !model?.json
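
The inline label-dedup logic moves into a reusable `packModelLabel` helper so the new `promptGoogle` can share it. It joins the segments, splits on `/`, then drops case-insensitive duplicates while preserving first-seen order:

    packModelLabel(['OpenRouter', 'openrouter', 'google/gemini-3-pro-preview']);
    // → 'OpenRouter/google/gemini-3-pro-preview'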
@@ -847,6 +883,18 @@ const promptOpenAI = async (aiId, content, options = {}) => {
         x => x.function.name === 'searchWeb'
     ) && !options.jsonMode ? ONLINE : '';
     const targetModel = `${isOpenrouter(provider, model) ? `${source}/` : ''}${options.model}${ext}`;
+    if (provider === OPENAI) {
+        // need more debug, currently openrouter is priority
+        packedTools.push(...[
+            // https://platform.openai.com/docs/guides/tools?tool-type=web-search
+            { type: 'web_search', },
+            // https://platform.openai.com/docs/guides/tools-image-generation?lang=javascript
+            // https://platform.openai.com/docs/api-reference/responses/create#responses-create-tools
+            { type: 'image_generation', input_fidelity: 'high', partial_images: 3, quality: 'high', size: '1536x1024' },
+            // https://platform.openai.com/docs/guides/tools-code-interpreter
+            { type: 'code_interpreter', container: { type: 'auto', memory_limit: '8g' } },
+        ]);
+    }
     if (source === S_GOOGLE) {
         packedTools.push(...[
            { googleSearch: {} }, { codeExecution: {} }, { urlContext: {} },
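
When talking to OpenAI directly, the provider's built-in web-search, image-generation, and code-interpreter tools are now appended (the inline comment flags this as still under debug, with OpenRouter taking priority). For context, a sketch of where `packedTools` would end up; this call shape is an assumption, as the hunk only shows the array being built:

    const resp = await client.responses.create({
        model: targetModel, input: content,
        tools: packedTools, stream: !!options?.stream,
    });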
@@ -966,6 +1014,103 @@ const promptOpenAI = async (aiId, content, options = {}) => {
     return await packResp(event, options);
 };

+const promptGoogle = async (aiId, prompt, options = {}) => {
+    let { provider, client, model } = await getAi(aiId);
+    const M = MODELS[model.name];
+    prompt = ensureString(prompt, { trim: true });
+    assert(await countTokens(prompt, { fast: true })
+        <= M.maxInputTokens,
+        `Prompt must be less than ${M.maxInputTokens} tokens.`, 400
+    );
+    switch (model?.name) {
+        case IMAGEN_4_ULTRA:
+            var resp = await client.models.generateImages({
+                model: model.name, prompt, config: {
+                    numberOfImages: options?.n || 4, sampleImageSize: '2K',
+                    includeRaiReason: true,
+                    // "1:1" (default), "3:4", "4:3", "9:16", and "16:9"
+                    aspectRatio: '16:9', personGeneration: 'allow_adult',
+                    ...options?.config || {},
+                },
+            });
+            var generated = resp?.generatedImages;
+            assert(!resp?.error && generated?.filter(
+                x => !x.raiFilteredReason
+            ).length, resp?.error?.message || generated?.find(
+                x => x.raiFilteredReason
+            )?.raiFilteredReason || ERROR_GENERATING);
+            if (!options?.raw) {
+                resp = {
+                    text: '', images: await Promise.all((
+                        resp?.generatedImages || []
+                    ).map(async x => ({
+                        data: await convert(x.image.imageBytes, {
+                            input: BASE64, suffix: 'png', ...options || {}
+                        }), mimeType: x.image.mimeType,
+                    }))), model: packModelLabel([
+                        provider, M.source, model.name,
+                    ]),
+                }
+            }
+            break;
+        case VEO_31:
+            var resp = await client.models.generateVideos({
+                model: model.name, prompt, config: {
+                    aspectRatio: '16:9', numberOfVideos: 1,
+                    // personGeneration: 'allow_adult',
+                    enablePromptRewriting: true, addWatermark: false,
+                    includeRaiReason: true, ...options?.config || {},
+                },
+            });
+            assert(!resp?.error, resp?.error?.message || ERROR_GENERATING);
+            if (options?.generateRaw) { return resp; }
+            await tryUntil(async () => {
+                resp = await client.operations.getVideosOperation({
+                    operation: resp,
+                });
+                assert(
+                    resp?.done,
+                    `Waiting for Google video generation: ${resp.name}`,
+                );
+            }, { maxTry: 60 * 10, log });
+            assert(!resp?.error && resp?.response?.generatedVideos?.filter(
+                x => !x.raiFilteredReason
+            ).length, resp?.error?.message || resp?.response?.generatedVideos?.find(
+                x => x.raiFilteredReason
+            )?.raiFilteredReason || ERROR_GENERATING);
+            if (options?.videoRaw) {
+                resp = resp?.response?.generatedVideos;
+            } else if (!options?.videoRaw) {
+                resp = {
+                    text: '', videos: await Promise.all(resp?.response?.generatedVideos?.filter(
+                        x => x?.video?.uri
+                    ).map(async x => {
+                        const downloadPath = `${getTempPath({
+                            seed: x?.video?.uri
+                        })}.mp4`;
+                        // @todo: fix this
+                        // https://github.com/googleapis/js-genai/compare/main...Leask:js-genai:main
+                        await client.files.download({ file: x, downloadPath });
+                        await timeout(1000 * 10); // hack to wait for file to be downloaded
+                        return {
+                            data: await convert(downloadPath, {
+                                input: FILE, suffix: 'mp4', ...options || {}
+                            }), mimeType: MIME_MP4, jobId: resp.name,
+                        };
+                    })), model: packModelLabel([
+                        provider, M.source, model.name,
+                    ]),
+                };
+            }
+            break;
+        default:
+            throw new Error('Unsupported model.');
+    }
+    await streamResp(
+        { ...resp, processing: true }, { ...options, noPack: true }
+    );
+    return { ...resp, processing: false };
+};

 const initChat = async (options = {}) => {
     if (options.sessions) {
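
`promptGoogle` is the centerpiece of this release: Imagen 4 Ultra returns base64 image bytes normalized through `convert`, while Veo 3.1 polls `getVideosOperation` via `tryUntil` (up to 600 attempts) and then downloads each MP4 to a temp path. A usage sketch; the `aiId` handle and option names follow the code above, and nothing here is package documentation:

    const resp = await promptGoogle(aiId, 'A watercolor lighthouse at dawn', {
        n: 2, config: { aspectRatio: '1:1' },
    });
    // resp.images → [{ data, mimeType }, ...]
    // resp.model  → 'Google/imagen-4.0-ultra-generate-001'
    // ('Google' and the 'google' source dedupe case-insensitively
    //  via packModelLabel)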
@@ -1063,7 +1208,7 @@ const distillFile = async (attachments, o) => {
     '- You will receive various multimedia files, including images, audio, and videos.',
     '- Please analyze these documents, extract the information, and organize it into an easy-to-read format.',
     '- For document-type files or image files primarily containing text information, act as a document scanner, return the text content, and describe any important images and tables present. Use markdown to format table and other rich text where possible. Use LaTeX for all formulas, subscripts, representations of formulas, and special symbols in mathematics and chemistry, enclosed by "$" symbols. Please mark the description of images in the same position as the original text without creating separate paragraphs for descriptions. Be sure ONLY describe important images and graphs, and ignore backgrounds and decorative small images. Ensure the returned document is clean, well-organized, and highly readable.',
-    '- For audio files, please provide a transcript of the spoken voices. If there are background noises or music, attempt to briefly describe the environmental sounds and music sections.',
+    '- For audio files, please transcribe the spoken voices into clean text. If there are background sounds, attempt to briefly describe the environmental sounds and music sections. Only care about the main speech content, meaningful music and environment sounds. Do not be disturbed by useless background noise.',
     '- For images or video files that are not primarily text-based, describe the tragic scene you observe, highlight key details, convey the emotional tone of the setting, and share your impressions.',
     '- For video files, please describe the content, including the theme, subjects, characters, scenes, objects, storyline, and emotional tone.',
     '- Please RETURN ONLY your analysis results without including your thought process or other unrelated information.',
@@ -1161,11 +1306,14 @@ export {
     FUNCTION,
     GEMINI_25_FLASH,
     GEMINI_30_PRO_IMAGE,
+    GPT_5_IMAGE,
     GPT_51,
+    IMAGEN_4_ULTRA,
     INSTRUCTIONS,
     MODELS,
     OPENAI_VOICE,
     RETRIEVAL,
+    VEO_31,
     analyzeSessions,
     countTokens,
     distillFile,
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
 const manifest = {
     "name": "utilitas",
     "description": "Just another common utility for JavaScript.",
-    "version": "2000.3.25",
+    "version": "2000.3.27",
     "private": false,
     "homepage": "https://github.com/Leask/utilitas",
     "main": "index.mjs",
package/lib/speech.mjs CHANGED
@@ -1,7 +1,7 @@
 import { DEFAULT_MODELS, OPENAI_VOICE, countTokens, k } from './alan.mjs';
 import { getFfmpeg, packPcmToWav } from './media.mjs';
 import { get } from './web.mjs';
-import { convert, getTempPath, MIME_WAV } from './storage.mjs';
+import { convert, getTempPath } from './storage.mjs';
 import { ensureString, mergeAtoB } from './utilitas.mjs';

 import {
@@ -18,20 +18,19 @@ const _NEED = ['@google/genai', 'OpenAI', 'whisper-node'];

 const [
     BUFFER, STREAM, BASE64, FILE, clients, suffix, SPEAKER, cleanup, wav,
-    GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS, GEMINI_FLASH,
+    GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS,
     OPENAI_TTS_MAX_LENGTH, WHISPER_DEFAULT_MODEL, errorMessage
 ] = [
     'BUFFER', 'STREAM', 'BASE64', 'FILE', {}, 'ogg', 'SPEAKER', true, 'wav',
     'gpt-4o-mini-tts', 'gpt-4o-transcribe', 'gemini-2.5-flash-preview-tts',
-    'gemini-flash-latest', 4096, 'base', 'Invalid audio data.',
+    4096, 'base', 'Invalid audio data.',
 ];

 const [
     defaultOpenAITtsModel, defaultOpenAISttModel, defaultGeminiTtsModel,
-    defaultGeminiSttModel,
-] = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS, GEMINI_FLASH];
+] = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS];

-const STT_PROMPT = 'Please transcribe the audio into clean text. Return only the text content, DO NOT include any additional information or metadata. You may encounter input that contains different languages. Please do your best to transcribe text from all possible languages. Please distinguish between background noise and the main speech content. Do not be disturbed by background noise. Only return the main speech content.';
+const TTS_PROMPT = "As an AI voice assistant, please say the following content in a warm, friendly and professional tone, if the language is English, use an American accent, if it's Traditional Chinese, use Hong Kong Cantonese, if it's Simplified Chinese, use standard Mandarin, for other languages, please speak with a standard, clear accent";

 const WHISPER_MODELS = [
     // npx whisper-node download tiny.en
@@ -111,9 +110,6 @@ const init = async (options) => {
             if (options?.tts) {
                 clients.tts = client.models.generateContent;
             }
-            if (options?.stt) {
-                clients.stt = client.models.generateContent;
-            }
             break;
         case '':
             clients._provider = 'LOCAL';
@@ -166,13 +162,14 @@ const ttsGoogle = async (contents, options) => {
     assert(contents, 'Text is required.', 400);
     assert(await countTokens(contents) <= k(32), 'Text is too long.', 400);
     const resp = await clients.tts({
-        model: options?.model || defaultGeminiTtsModel, contents,
+        model: options?.model || defaultGeminiTtsModel,
+        contents: `${options?.prompt || TTS_PROMPT}: ${contents}`,
         config: mergeAtoB(options?.config, {
             responseModalities: ['AUDIO'],
             speechConfig: {
                 voiceConfig: {
                     prebuiltVoiceConfig: {
-                        voiceName: options?.voice || 'Leda',
+                        voiceName: options?.voice || 'Zephyr',
                     },
                 },
             },
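
Gemini TTS now prefixes the text with a steering prompt (`TTS_PROMPT`, overridable via `options.prompt`) and switches the default voice from Leda to Zephyr. A usage sketch, assuming the Google TTS client was initialized via this module's `init`:

    import { tts } from './lib/speech.mjs';

    const audio = await tts('Hello from utilitas.', {
        voice: 'Leda',                       // restore the previous default voice
        prompt: 'Read this in a calm tone',  // replaces the built-in TTS_PROMPT
    });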
@@ -227,29 +224,6 @@ const sttOpenAI = async (audio, options) => {
     return result;
 };

-const sttGoogle = async (audio, options) => {
-    assert(clients.stt, 'Google STT API has not been initialized.', 500);
-    const data = await convert(audio, {
-        input: options?.input, expected: BASE64, errorMessage,
-    });
-    const resp = await clients.stt({
-        model: options?.model || defaultGeminiSttModel, contents: {
-            parts: [{
-                inlineData: {
-                    mimeType: options?.mimeType || MIME_WAV, data,
-                },
-            }, { text: STT_PROMPT }],
-        },
-        config: { ...options?.config || {} },
-    });
-    assert(
-        resp?.candidates?.[0]?.content?.parts?.[0],
-        'Failed to transcribe audio.', 500
-    );
-    return options?.raw ? resp.candidates
-        : (resp.candidates[0].content.parts[0].text?.trim?.() || '');
-};
-
 // This function is not working properly, a pull request is filed:
 // https://github.com/ariym/whisper-node/pull/58
 const sttWhisper = async (audio, options) => {
@@ -288,8 +262,7 @@ const tts = async (text, options) => {

 const stt = async (audio, options) => {
     let engine;
-    if (clients?.stt && clients._provider === 'GOOGLE') { engine = sttGoogle; }
-    else if (clients?.stt && clients._provider === 'OPENAI') { engine = sttOpenAI; }
+    if (clients?.stt && clients._provider === 'OPENAI') { engine = sttOpenAI; }
     else if (await checkWhisper()) { engine = sttWhisper; }
     else { throwError('Speech-to-Text engine has not been initialized.', 500); }
     return await engine(audio, options);
@@ -303,7 +276,6 @@ export {
     checkWhisper,
     init,
     stt,
-    sttGoogle,
     sttOpenAI,
     sttWhisper,
     tts,
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
     "name": "utilitas",
     "description": "Just another common utility for JavaScript.",
-    "version": "2000.3.25",
+    "version": "2000.3.27",
     "private": false,
     "homepage": "https://github.com/Leask/utilitas",
     "main": "index.mjs",