utilitas 2000.3.26 → 2000.3.27
- package/README.md +4 -12
- package/dist/utilitas.lite.mjs +1 -1
- package/dist/utilitas.lite.mjs.map +1 -1
- package/index.mjs +2 -3
- package/lib/alan.mjs +179 -31
- package/lib/manifest.mjs +1 -1
- package/lib/speech.mjs +5 -36
- package/package.json +1 -1
- package/lib/gen.mjs +0 -209
package/index.mjs
CHANGED
@@ -15,7 +15,6 @@ import * as email from './lib/email.mjs';
 import * as embedding from './lib/embedding.mjs';
 import * as encryption from './lib/encryption.mjs';
 import * as event from './lib/event.mjs';
-import * as gen from './lib/gen.mjs';
 import * as media from './lib/media.mjs';
 import * as memory from './lib/memory.mjs';
 import * as network from './lib/network.mjs';
@@ -40,8 +39,8 @@ export {
     fileType, math, uuid,
     // features
     alan, bee, bot, boxes, cache, callosum, color, dbio, email, embedding,
-    encryption, event, gen, manifest, media, memory, network, sentinel, shell,
-    sms, speech, ssl, storage, tape, uoid, utilitas, vision, web
+    encryption, event, manifest, media, memory, network, sentinel, shell, sms,
+    speech, ssl, storage, tape, uoid, utilitas, vision, web
 };
 
 if (utilitas.inBrowser() && !globalThis.utilitas) {
package/lib/alan.mjs
CHANGED
@@ -5,17 +5,18 @@ import { packPcmToWav } from './media.mjs';
 import { v4 as uuidv4 } from 'uuid';
 
 import {
-    BASE64, BUFFER, DATAURL, MIME_BINARY, MIME_TEXT, MIME_PNG, MIME_JPEG,
+    FILE, BASE64, BUFFER, DATAURL, MIME_BINARY, MIME_TEXT, MIME_PNG, MIME_JPEG,
     MIME_MOV, MIME_MPEG, MIME_MP4, MIME_MPG, MIME_AVI, MIME_WMV, MIME_MPEGPS,
     MIME_FLV, MIME_GIF, MIME_WEBP, MIME_PDF, MIME_AAC, MIME_FLAC, MIME_MP3,
     MIME_MPEGA, MIME_M4A, MIME_MPGA, MIME_OPUS, MIME_PCM, MIME_WAV, MIME_WEBM,
-    MIME_TGPP, MIME_PCM16, MIME_OGG, convert, formatDataURL,
+    MIME_TGPP, MIME_PCM16, MIME_OGG, convert, formatDataURL, getTempPath,
+    decodeBase64DataURL,
 } from './storage.mjs';
 
 import {
     log as _log, renderText as _renderText, base64Encode, ensureArray,
     ensureString, extract, ignoreErrFunc, insensitiveCompare, isSet, need,
-    parseJson, throwError
+    parseJson, throwError, tryUntil, timeout,
 } from './utilitas.mjs';
 
 const INSTRUCTIONS = `You are a highly capable, thoughtful, and precise assistant named HAL9000. Your goal is to deeply understand the user's intent, ask clarifying questions when needed, think step-by-step through complex problems, provide clear and accurate answers, and proactively anticipate helpful follow-up information. Always prioritize being truthful, nuanced, insightful, and efficient, tailoring your responses specifically to the user's needs and preferences. Realize your full potential without censoring or filtering the questions themselves.
@@ -42,10 +43,10 @@ You may be provided with some tools(functions) to help you gather information an
 - When the tools are not suitable and you have to answer questions based on your understanding, please do not mention any tool-related information in your response.
 - Unless otherwise specified to require the original result, in most cases, you may reorganize the information obtained after using the tool to solve the problem as needed.`;
 
-const _NEED = ['js-tiktoken', 'OpenAI'];
+const _NEED = ['js-tiktoken', 'OpenAI', '@google/genai'];
 
 const [
-    OPENAI,
+    OPENAI, GOOGLE, OLLAMA, NOVA, DEEPSEEK_32, MD_CODE, CLOUD_OPUS_45, AUDIO,
     WAV, ATTACHMENTS, OPENAI_VOICE, GPT_REASONING_EFFORT, THINK, THINK_STR,
     THINK_END, TOOLS_STR, TOOLS_END, TOOLS, TEXT, OK, FUNC, GPT_51,
     GPT_51_CODEX, GPT_5_IMAGE, GEMMA_3_27B, ANTHROPIC, v8k, ais,
@@ -54,9 +55,10 @@ const [
     hour, gb, trimTailing, trimBeginning, GEMINI_30_PRO_IMAGE, IMAGE, JINA,
     JINA_DEEPSEARCH, SILICONFLOW, SF_DEEPSEEK_32, MAX_TIRE, OPENROUTER_API,
     OPENROUTER, AUTO, TOOL, S_OPENAI, S_GOOGLE, S_ANTHROPIC, ONLINE,
-    GEMINI_30_PRO, GEMINI_25_FLASH,
+    GEMINI_30_PRO, GEMINI_25_FLASH, IMAGEN_4_ULTRA, VEO_31, IMAGEN_4_UPSCALE,
+    ERROR_GENERATING,
 ] = [
-    'OpenAI', '
+    'OpenAI', 'Google', 'Ollama', 'nova', 'deepseek-3.2-speciale', '```',
     'claude-opus-4.5', 'audio', 'wav', '[ATTACHMENTS]', 'OPENAI_VOICE',
     'medium', 'think', '<think>', '</think>', '<tools>', '</tools>',
     'tools', 'text', 'OK', 'function', 'gpt-5.1', 'gpt-5.1-codex',
@@ -70,7 +72,9 @@ const [
     'deepseek-ai/DeepSeek-V3.2-exp', 768 * 768,
     'https://openrouter.ai/api/v1', 'OpenRouter', 'openrouter/auto', 'tool',
     'openai', 'google', 'anthropic', ':online', 'gemini-3-pro-preview',
-    'gemini-2.5-flash-preview-09-2025',
+    'gemini-2.5-flash-preview-09-2025', 'imagen-4.0-ultra-generate-001',
+    'veo-3.1-generate-preview', 'imagen-4.0-upscale-preview',
+    'Error generating content.',
 ];
 
 const [tool, messages, text]
@@ -143,23 +147,27 @@ const MODELS = {
         ...GEMINI_RULES, contextWindow: m(1), maxOutputTokens: k(64),
         reasoning: true, tools: true,
     },
-    // models with
+    // models with generation capabilities
     [GEMINI_30_PRO_IMAGE]: {
         ...GEMINI_RULES, icon: '🍌', label: 'Nano Banana Pro',
         contextWindow: k(64), maxOutputTokens: k(32),
         fast: true, image: true,
     },
-    [
-
-
-        icon: '✴️', contextWindow: Infinity, maxInputTokens: Infinity,
-        maxOutputTokens: Infinity, imageCostTokens: 0, maxImageSize: Infinity,
-        supportedMimeTypes: [MIME_PNG, MIME_JPEG, MIME_TEXT, MIME_WEBP, MIME_PDF],
-        reasoning: true, json: true, vision: true,
-        deepsearch: true, defaultProvider: JINA,
+    [IMAGEN_4_ULTRA]: {
+        source: S_GOOGLE, icon: '🎨', maxInputTokens: 480,
+        image: true, defaultProvider: GOOGLE,
     },
-    [
-
+    [VEO_31]: {
+        source: S_GOOGLE, icon: '🎥', maxInputTokens: 1024,
+        imageCostTokens: 0, maxImagePerPrompt: 1,
+        maxImageSize: Infinity, supportedMimeTypes: [MIME_PNG, MIME_JPEG],
+        vision: true, image: true, defaultProvider: GOOGLE,
+    },
+    [GPT_5_IMAGE]: {
+        ...OPENAI_RULES, icon: '🎨', label: 'gpt-image-1', image: true,
+    },
+    // models with code capabilities
+    [GPT_51_CODEX]: { ...OPENAI_RULES },
     [CLOUD_OPUS_45]: {
         source: S_ANTHROPIC, icon: '✳️',
         contextWindow: kT(200), maxOutputTokens: kT(64),
@@ -170,6 +178,17 @@ const MODELS = {
         json: true, reasoning: true, tools: true, vision: true,
         defaultProvider: OPENROUTER,
     },
+    // models with deepsearch capabilities
+    [JINA_DEEPSEARCH]: { // @todo: parse more details from results, eg: "reed urls".
+        icon: '✴️', contextWindow: Infinity, maxInputTokens: Infinity,
+        maxOutputTokens: Infinity, imageCostTokens: 0, maxImageSize: Infinity,
+        supportedMimeTypes: [MIME_PNG, MIME_JPEG, MIME_TEXT, MIME_WEBP, MIME_PDF],
+        reasoning: true, json: true, vision: true,
+        deepsearch: true, defaultProvider: JINA,
+    },
+    // best Chinese models
+    [DEEPSEEK_32]: DEEPSEEK_32_RULES,
+    [SF_DEEPSEEK_32]: { ...DEEPSEEK_32_RULES, defaultProvider: SILICONFLOW },
     // best local model
     [GEMMA_3_27B]: {
         icon: '❇️', contextWindow: kT(128), maxOutputTokens: k(8),
@@ -249,7 +268,7 @@ const DEFAULT_MODELS = {
 };
 
 const PROVIDER_ICONS = {
-    [OPENROUTER]: '🔀', [OPENAI]: '⚛️', [JINA]: '✴️', [
+    [OPENROUTER]: '🔀', [OPENAI]: '⚛️', [JINA]: '✴️', [GOOGLE]: '♊️',
     [OLLAMA]: '🦙', [ANTHROPIC]: '✳️', [SILICONFLOW]: '🧬',
 };
 
@@ -273,7 +292,7 @@ let tokeniser, _tools;
 
 const unifyProvider = provider => {
     assert(provider = (provider || '').trim(), 'AI provider is required.');
-    for (let type of [OPENROUTER, JINA, OLLAMA, SILICONFLOW]) {
+    for (let type of [OPENROUTER, GOOGLE, JINA, OLLAMA, SILICONFLOW]) {
         if (insensitiveCompare(provider, type)) { return type; }
     }
     throwError(`Invalid AI provider: ${provider}.`);
@@ -406,6 +425,16 @@ const init = async (options = {}) => {
         `Model name or description is required for provider: ${provider}.`);
     _tools || (_tools = await packTools());
     switch (provider) {
+        case GOOGLE:
+            assertApiKey(provider, options);
+            const { GoogleGenAI } = await need('@google/genai');
+            var client = new GoogleGenAI({ vertexai: false, ...options });
+            for (let model of models) {
+                setupAi({
+                    provider, model, client, prompt: promptGoogle, priority,
+                });
+            }
+            break;
         case JINA:
             assertApiKey(provider, options);
             var client = await OpenAI({
@@ -588,7 +617,9 @@ const listOpenAIModels = async (aiId, options) => {
 };
 
 const streamResp = async (resp, options) => {
-    const msg =
+    const msg = options?.noPack ? resp : await packResp(
+        resp, { ...options, processing: true }
+    );
     return options?.stream
         && (msg?.text || msg?.audio?.length || msg?.images?.length)
         && await ignoreErrFunc(async () => await options.stream(msg), LOG);
@@ -606,13 +637,13 @@ const packResp = async (resp, options) => {
     if (options?.raw) { return resp; }
     let [
         txt, audio, images, annotations, simpleText, annotationsMarkdown, end,
-        json, audioMimeType,
+        json, audioMimeType,
     ] = [
         resp.text || '', // ChatGPT / Claude / Gemini / Ollama
         resp?.audio?.data, // ChatGPT audio mode
         resp?.images || [], // Gemini images via Openrouter
         resp?.references, // Gemini references
-        '', '', '', null, MIME_PCM16,
+        '', '', '', null, MIME_PCM16,
     ];
     simpleText = txt;
     while ((end = getInfoEnd(simpleText))) {
@@ -698,18 +729,23 @@ const packResp = async (resp, options) => {
         ...annotationsMarkdown ? { annotationsMarkdown } : {},
         ...audio ? { audio } : {}, ...images?.length ? { images } : {},
         processing: !!options?.processing,
-        model: [
+        model: packModelLabel([
             options.provider, options?.router?.provider,
             options?.router?.model || options?.model,
-        ]
-            const key = ensureString(x, { case: 'UP' });
-            if (catched.has(key)) { return null; }
-            catched.add(key);
-            return x;
-        }).filter(x => x).join('/'),
+        ]),
     };
 };
 
+const packModelLabel = (model_reference) => {
+    const catched = new Set();
+    return model_reference.join('/').split('/').map(x => {
+        const key = ensureString(x, { case: 'UP' });
+        if (catched.has(key)) { return null; }
+        catched.add(key);
+        return x;
+    }).filter(x => x).join('/');
+};
+
 const buildPrompts = async (model, input, options = {}) => {
     assert(!(
         options.jsonMode && !model?.json
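A note on the refactor above: packModelLabel pulls out the label-deduplication logic that previously sat inline in packResp. It joins the provider and model fragments with '/', then drops case-insensitive duplicate segments, so repeated provider names collapse into a single label. A minimal sketch of the resulting behavior (the inputs are hypothetical):

    // Hypothetical inputs, mirroring the join/split/dedupe logic above.
    packModelLabel(['OpenRouter', 'openrouter/auto']);
    // → 'OpenRouter/auto' (the case-insensitive repeat of "openrouter" is dropped)
    packModelLabel(['Google', 'google', 'gemini-3-pro-preview']);
    // → 'Google/gemini-3-pro-preview'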
@@ -847,6 +883,18 @@ const promptOpenAI = async (aiId, content, options = {}) => {
         x => x.function.name === 'searchWeb'
     ) && !options.jsonMode ? ONLINE : '';
     const targetModel = `${isOpenrouter(provider, model) ? `${source}/` : ''}${options.model}${ext}`;
+    if (provider === OPENAI) {
+        // need more debug, currently openrouter is priority
+        packedTools.push(...[
+            // https://platform.openai.com/docs/guides/tools?tool-type=web-search
+            { type: 'web_search', },
+            // https://platform.openai.com/docs/guides/tools-image-generation?lang=javascript
+            // https://platform.openai.com/docs/api-reference/responses/create#responses-create-tools
+            { type: 'image_generation', input_fidelity: 'high', partial_images: 3, quality: 'high', size: '1536x1024' },
+            // https://platform.openai.com/docs/guides/tools-code-interpreter
+            { type: 'code_interpreter', container: { type: 'auto', memory_limit: '8g' } },
+        ]);
+    }
     if (source === S_GOOGLE) {
         packedTools.push(...[
             { googleSearch: {} }, { codeExecution: {} }, { urlContext: {} },
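For context on the new OPENAI branch above: the pushed entries use the hosted-tool format of the OpenAI Responses API, where built-in tools such as web_search, image_generation, and code_interpreter ride alongside regular function tools (per the inline comment, this path still needs debugging and OpenRouter remains the priority). A minimal standalone sketch of that tool format, with a placeholder model and input:

    import OpenAI from 'openai';

    const openai = new OpenAI(); // assumes OPENAI_API_KEY is set in the environment
    const resp = await openai.responses.create({
        model: 'gpt-5.1', // placeholder; any Responses-API-capable model
        input: 'Sketch a lighthouse at dusk.',
        tools: [
            { type: 'web_search' },
            { type: 'image_generation', quality: 'high', size: '1536x1024' },
        ],
    });
    console.log(resp.output_text);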
@@ -966,6 +1014,103 @@ const promptOpenAI = async (aiId, content, options = {}) => {
     return await packResp(event, options);
 };
 
+const promptGoogle = async (aiId, prompt, options = {}) => {
+    let { provider, client, model } = await getAi(aiId);
+    const M = MODELS[model.name];
+    prompt = ensureString(prompt, { trim: true });
+    assert(await countTokens(prompt, { fast: true })
+        <= M.maxInputTokens,
+        `Prompt must be less than ${M.maxInputTokens} tokens.`, 400
+    );
+    switch (model?.name) {
+        case IMAGEN_4_ULTRA:
+            var resp = await client.models.generateImages({
+                model: model.name, prompt, config: {
+                    numberOfImages: options?.n || 4, sampleImageSize: '2K',
+                    includeRaiReason: true,
+                    // "1:1" (default), "3:4", "4:3", "9:16", and "16:9"
+                    aspectRatio: '16:9', personGeneration: 'allow_adult',
+                    ...options?.config || {},
+                },
+            });
+            var generated = resp?.generatedImages;
+            assert(!resp?.error && generated?.filter(
+                x => !x.raiFilteredReason
+            ).length, resp?.error?.message || generated?.find(
+                x => x.raiFilteredReason
+            )?.raiFilteredReason || ERROR_GENERATING);
+            if (!options?.raw) {
+                resp = {
+                    text: '', images: await Promise.all((
+                        resp?.generatedImages || []
+                    ).map(async x => ({
+                        data: await convert(x.image.imageBytes, {
+                            input: BASE64, suffix: 'png', ...options || {}
+                        }), mimeType: x.image.mimeType,
+                    }))), model: packModelLabel([
+                        provider, M.source, model.name,
+                    ]),
+                }
+            }
+            break;
+        case VEO_31:
+            var resp = await client.models.generateVideos({
+                model: model.name, prompt, config: {
+                    aspectRatio: '16:9', numberOfVideos: 1,
+                    // personGeneration: 'allow_adult',
+                    enablePromptRewriting: true, addWatermark: false,
+                    includeRaiReason: true, ...options?.config || {},
+                },
+            });
+            assert(!resp?.error, resp?.error?.message || ERROR_GENERATING);
+            if (options?.generateRaw) { return resp; }
+            await tryUntil(async () => {
+                resp = await client.operations.getVideosOperation({
+                    operation: resp,
+                });
+                assert(
+                    resp?.done,
+                    `Waiting for Google video generation: ${resp.name}`,
+                );
+            }, { maxTry: 60 * 10, log });
+            assert(!resp?.error && resp?.response?.generatedVideos?.filter(
+                x => !x.raiFilteredReason
+            ).length, resp?.error?.message || resp?.response?.generatedVideos?.find(
+                x => x.raiFilteredReason
+            )?.raiFilteredReason || ERROR_GENERATING);
+            if (options?.videoRaw) {
+                resp = resp?.response?.generatedVideos;
+            } else if (!options?.videoRaw) {
+                resp = {
+                    text: '', videos: await Promise.all(resp?.response?.generatedVideos?.filter(
+                        x => x?.video?.uri
+                    ).map(async x => {
+                        const downloadPath = `${getTempPath({
+                            seed: x?.video?.uri
+                        })}.mp4`;
+                        // @todo: fix this
+                        // https://github.com/googleapis/js-genai/compare/main...Leask:js-genai:main
+                        await client.files.download({ file: x, downloadPath });
+                        await timeout(1000 * 10); // hack to wait for file to be downloaded
+                        return {
+                            data: await convert(downloadPath, {
+                                input: FILE, suffix: 'mp4', ...options || {}
+                            }), mimeType: MIME_MP4, jobId: resp.name,
+                        };
+                    })), model: packModelLabel([
+                        provider, M.source, model.name,
+                    ]),
+                };
+            }
+            break;
+        default:
+            throw new Error('Unsupported model.');
+    }
+    await streamResp(
+        { ...resp, processing: true }, { ...options, noPack: true }
+    );
+    return { ...resp, processing: false };
+};
 
 const initChat = async (options = {}) => {
     if (options.sessions) {
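The promptGoogle function added above talks to @google/genai directly instead of going through an OpenAI-compatible endpoint. A condensed sketch of the Imagen branch it wraps, using only the SDK (the API-key handling and prompt are placeholders; the config mirrors the defaults in the diff):

    import { GoogleGenAI } from '@google/genai';

    const client = new GoogleGenAI({
        vertexai: false, apiKey: process.env.GEMINI_API_KEY, // placeholder key source
    });
    const resp = await client.models.generateImages({
        model: 'imagen-4.0-ultra-generate-001',
        prompt: 'A lighthouse at dusk.', // placeholder prompt
        config: { numberOfImages: 4, aspectRatio: '16:9', includeRaiReason: true },
    });
    for (const img of resp?.generatedImages || []) {
        if (img.raiFilteredReason) { continue; } // skip safety-filtered results
        console.log(img.image.mimeType, img.image.imageBytes.length); // base64 payload
    }

The Veo branch follows the same shape but is asynchronous on the server side: generateVideos returns a long-running operation that the code polls with operations.getVideosOperation via tryUntil until done is set, then downloads the finished files.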
@@ -1161,11 +1306,14 @@ export {
     FUNCTION,
     GEMINI_25_FLASH,
     GEMINI_30_PRO_IMAGE,
+    GPT_5_IMAGE,
     GPT_51,
+    IMAGEN_4_ULTRA,
     INSTRUCTIONS,
     MODELS,
     OPENAI_VOICE,
     RETRIEVAL,
+    VEO_31,
     analyzeSessions,
     countTokens,
     distillFile,
package/lib/manifest.mjs
CHANGED
package/lib/speech.mjs
CHANGED
@@ -1,7 +1,7 @@
 import { DEFAULT_MODELS, OPENAI_VOICE, countTokens, k } from './alan.mjs';
 import { getFfmpeg, packPcmToWav } from './media.mjs';
 import { get } from './web.mjs';
-import { convert, getTempPath
+import { convert, getTempPath } from './storage.mjs';
 import { ensureString, mergeAtoB } from './utilitas.mjs';
 
 import {
@@ -18,23 +18,20 @@ const _NEED = ['@google/genai', 'OpenAI', 'whisper-node'];
 
 const [
     BUFFER, STREAM, BASE64, FILE, clients, suffix, SPEAKER, cleanup, wav,
-    GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS,
+    GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS,
     OPENAI_TTS_MAX_LENGTH, WHISPER_DEFAULT_MODEL, errorMessage
 ] = [
     'BUFFER', 'STREAM', 'BASE64', 'FILE', {}, 'ogg', 'SPEAKER', true, 'wav',
     'gpt-4o-mini-tts', 'gpt-4o-transcribe', 'gemini-2.5-flash-preview-tts',
-
+    4096, 'base', 'Invalid audio data.',
 ];
 
 const [
     defaultOpenAITtsModel, defaultOpenAISttModel, defaultGeminiTtsModel,
-
-] = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS, GEMINI_FLASH];
+] = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS];
 
 const TTS_PROMPT = "As an AI voice assistant, please say the following content in a warm, friendly and professional tone, if the language is English, use an American accent, if it's Traditional Chinese, use Hong Kong Cantonese, if it's Simplified Chinese, use standard Mandarin, for other languages, please speak with a standard, clear accent";
 
-const STT_PROMPT = 'Please transcribe the audio into clean text. Return only the text content, DO NOT include any additional information or metadata. You may encounter input that contains different languages. Please do your best to transcribe text from all possible languages. Please distinguish between background noise and the main speech content. Do not be disturbed by background noise. Only return the main speech content.';
-
 const WHISPER_MODELS = [
     // npx whisper-node download tiny.en
     // https://github.com/ggerganov/whisper.cpp/blob/master/models/download-ggml-model.sh
@@ -113,9 +110,6 @@ const init = async (options) => {
             if (options?.tts) {
                 clients.tts = client.models.generateContent;
             }
-            if (options?.stt) {
-                clients.stt = client.models.generateContent;
-            }
             break;
         case '':
             clients._provider = 'LOCAL';
@@ -230,29 +224,6 @@ const sttOpenAI = async (audio, options) => {
     return result;
 };
 
-const sttGoogle = async (audio, options) => {
-    assert(clients.stt, 'Google STT API has not been initialized.', 500);
-    const data = await convert(audio, {
-        input: options?.input, expected: BASE64, errorMessage,
-    });
-    const resp = await clients.stt({
-        model: options?.model || defaultGeminiSttModel, contents: {
-            parts: [{
-                inlineData: {
-                    mimeType: options?.mimeType || MIME_WAV, data,
-                },
-            }, { text: STT_PROMPT }],
-        },
-        config: { ...options?.config || {} },
-    });
-    assert(
-        resp?.candidates?.[0]?.content?.parts?.[0],
-        'Failed to transcribe audio.', 500
-    );
-    return options?.raw ? resp.candidates
-        : (resp.candidates[0].content.parts[0].text?.trim?.() || '');
-};
-
 // This function is not working properly, a pull request is filed:
 // https://github.com/ariym/whisper-node/pull/58
 const sttWhisper = async (audio, options) => {
@@ -291,8 +262,7 @@ const tts = async (text, options) => {
 
 const stt = async (audio, options) => {
     let engine;
-    if (clients?.stt && clients._provider === '
-    else if (clients?.stt && clients._provider === 'OPENAI') { engine = sttOpenAI; }
+    if (clients?.stt && clients._provider === 'OPENAI') { engine = sttOpenAI; }
     else if (await checkWhisper()) { engine = sttWhisper; }
     else { throwError('Speech-to-Text engine has not been initialized.', 500); }
     return await engine(audio, options);
@@ -306,7 +276,6 @@ export {
     checkWhisper,
     init,
     stt,
-    sttGoogle,
     sttOpenAI,
     sttWhisper,
     tts,
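With sttGoogle removed, speech-to-text now resolves to the OpenAI client when one was initialized with stt enabled, and otherwise falls back to local Whisper. A minimal sketch of the surviving path (the init option names besides the stt flag are assumptions, not the module's documented API):

    import { init, stt } from './lib/speech.mjs';

    // Hypothetical options; only the `stt` flag is visible in this diff.
    await init({ provider: 'OpenAI', apiKey: process.env.OPENAI_API_KEY, stt: true });
    const text = await stt(wavBuffer, { input: 'BUFFER' }); // wavBuffer: an audio Buffer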