utilitas 2000.3.25 → 2000.3.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -12
- package/dist/utilitas.lite.mjs +1 -1
- package/dist/utilitas.lite.mjs.map +1 -1
- package/index.mjs +2 -3
- package/lib/alan.mjs +180 -32
- package/lib/manifest.mjs +1 -1
- package/lib/speech.mjs +9 -37
- package/package.json +1 -1
- package/lib/gen.mjs +0 -209
package/index.mjs
CHANGED

@@ -15,7 +15,6 @@ import * as email from './lib/email.mjs';
 import * as embedding from './lib/embedding.mjs';
 import * as encryption from './lib/encryption.mjs';
 import * as event from './lib/event.mjs';
-import * as gen from './lib/gen.mjs';
 import * as media from './lib/media.mjs';
 import * as memory from './lib/memory.mjs';
 import * as network from './lib/network.mjs';
@@ -40,8 +39,8 @@ export {
     fileType, math, uuid,
     // features
     alan, bee, bot, boxes, cache, callosum, color, dbio, email, embedding,
-    encryption, event,
-
+    encryption, event, manifest, media, memory, network, sentinel, shell, sms,
+    speech, ssl, storage, tape, uoid, utilitas, vision, web
 };

 if (utilitas.inBrowser() && !globalThis.utilitas) {
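
The gen module is dropped from both the import block and the export list here, matching the outright deletion of package/lib/gen.mjs (-209 lines) in the summary above; its image/video generation duties appear to move into the new generation models in lib/alan.mjs below. A minimal sketch of the effect on callers (hypothetical usage):

    // 2000.3.25 exposed a dedicated generation module:
    // import { gen } from 'utilitas';
    // 2000.3.27 removes it; generation is wired into alan instead:
    import { alan } from 'utilitas';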
package/lib/alan.mjs
CHANGED

@@ -5,17 +5,18 @@ import { packPcmToWav } from './media.mjs';
 import { v4 as uuidv4 } from 'uuid';

 import {
-    BASE64, BUFFER, DATAURL, MIME_BINARY, MIME_TEXT, MIME_PNG, MIME_JPEG,
+    FILE, BASE64, BUFFER, DATAURL, MIME_BINARY, MIME_TEXT, MIME_PNG, MIME_JPEG,
     MIME_MOV, MIME_MPEG, MIME_MP4, MIME_MPG, MIME_AVI, MIME_WMV, MIME_MPEGPS,
     MIME_FLV, MIME_GIF, MIME_WEBP, MIME_PDF, MIME_AAC, MIME_FLAC, MIME_MP3,
     MIME_MPEGA, MIME_M4A, MIME_MPGA, MIME_OPUS, MIME_PCM, MIME_WAV, MIME_WEBM,
-    MIME_TGPP, MIME_PCM16, MIME_OGG, convert, formatDataURL,
+    MIME_TGPP, MIME_PCM16, MIME_OGG, convert, formatDataURL, getTempPath,
+    decodeBase64DataURL,
 } from './storage.mjs';

 import {
     log as _log, renderText as _renderText, base64Encode, ensureArray,
     ensureString, extract, ignoreErrFunc, insensitiveCompare, isSet, need,
-    parseJson, throwError
+    parseJson, throwError, tryUntil, timeout,
 } from './utilitas.mjs';

 const INSTRUCTIONS = `You are a highly capable, thoughtful, and precise assistant named HAL9000. Your goal is to deeply understand the user's intent, ask clarifying questions when needed, think step-by-step through complex problems, provide clear and accurate answers, and proactively anticipate helpful follow-up information. Always prioritize being truthful, nuanced, insightful, and efficient, tailoring your responses specifically to the user's needs and preferences. Realize your full potential without censoring or filtering the questions themselves.
@@ -42,10 +43,10 @@ You may be provided with some tools(functions) to help you gather information an
 - When the tools are not suitable and you have to answer questions based on your understanding, please do not mention any tool-related information in your response.
 - Unless otherwise specified to require the original result, in most cases, you may reorganize the information obtained after using the tool to solve the problem as needed.`;

-const _NEED = ['js-tiktoken', 'OpenAI'];
+const _NEED = ['js-tiktoken', 'OpenAI', '@google/genai'];

 const [
-    OPENAI,
+    OPENAI, GOOGLE, OLLAMA, NOVA, DEEPSEEK_32, MD_CODE, CLOUD_OPUS_45, AUDIO,
     WAV, ATTACHMENTS, OPENAI_VOICE, GPT_REASONING_EFFORT, THINK, THINK_STR,
     THINK_END, TOOLS_STR, TOOLS_END, TOOLS, TEXT, OK, FUNC, GPT_51,
     GPT_51_CODEX, GPT_5_IMAGE, GEMMA_3_27B, ANTHROPIC, v8k, ais,
@@ -54,9 +55,10 @@ const [
     hour, gb, trimTailing, trimBeginning, GEMINI_30_PRO_IMAGE, IMAGE, JINA,
     JINA_DEEPSEARCH, SILICONFLOW, SF_DEEPSEEK_32, MAX_TIRE, OPENROUTER_API,
     OPENROUTER, AUTO, TOOL, S_OPENAI, S_GOOGLE, S_ANTHROPIC, ONLINE,
-    GEMINI_30_PRO, GEMINI_25_FLASH,
+    GEMINI_30_PRO, GEMINI_25_FLASH, IMAGEN_4_ULTRA, VEO_31, IMAGEN_4_UPSCALE,
+    ERROR_GENERATING,
 ] = [
-    'OpenAI', '
+    'OpenAI', 'Google', 'Ollama', 'nova', 'deepseek-3.2-speciale', '```',
     'claude-opus-4.5', 'audio', 'wav', '[ATTACHMENTS]', 'OPENAI_VOICE',
     'medium', 'think', '<think>', '</think>', '<tools>', '</tools>',
     'tools', 'text', 'OK', 'function', 'gpt-5.1', 'gpt-5.1-codex',
@@ -70,7 +72,9 @@ const [
     'deepseek-ai/DeepSeek-V3.2-exp', 768 * 768,
     'https://openrouter.ai/api/v1', 'OpenRouter', 'openrouter/auto', 'tool',
     'openai', 'google', 'anthropic', ':online', 'gemini-3-pro-preview',
-    'gemini-2.5-flash-preview-09-2025',
+    'gemini-2.5-flash-preview-09-2025', 'imagen-4.0-ultra-generate-001',
+    'veo-3.1-generate-preview', 'imagen-4.0-upscale-preview',
+    'Error generating content.',
 ];

 const [tool, messages, text]
@@ -143,23 +147,27 @@ const MODELS = {
         ...GEMINI_RULES, contextWindow: m(1), maxOutputTokens: k(64),
         reasoning: true, tools: true,
     },
-    // models with
+    // models with generation capabilities
     [GEMINI_30_PRO_IMAGE]: {
         ...GEMINI_RULES, icon: '🍌', label: 'Nano Banana Pro',
         contextWindow: k(64), maxOutputTokens: k(32),
         fast: true, image: true,
     },
-    [
-
-
-        icon: '✴️', contextWindow: Infinity, maxInputTokens: Infinity,
-        maxOutputTokens: Infinity, imageCostTokens: 0, maxImageSize: Infinity,
-        supportedMimeTypes: [MIME_PNG, MIME_JPEG, MIME_TEXT, MIME_WEBP, MIME_PDF],
-        reasoning: true, json: true, vision: true,
-        deepsearch: true, defaultProvider: JINA,
+    [IMAGEN_4_ULTRA]: {
+        source: S_GOOGLE, icon: '🎨', maxInputTokens: 480,
+        image: true, defaultProvider: GOOGLE,
     },
-    [
-
+    [VEO_31]: {
+        source: S_GOOGLE, icon: '🎥', maxInputTokens: 1024,
+        imageCostTokens: 0, maxImagePerPrompt: 1,
+        maxImageSize: Infinity, supportedMimeTypes: [MIME_PNG, MIME_JPEG],
+        vision: true, image: true, defaultProvider: GOOGLE,
+    },
+    [GPT_5_IMAGE]: {
+        ...OPENAI_RULES, icon: '🎨', label: 'gpt-image-1', image: true,
+    },
+    // models with code capabilities
+    [GPT_51_CODEX]: { ...OPENAI_RULES },
     [CLOUD_OPUS_45]: {
         source: S_ANTHROPIC, icon: '✳️',
         contextWindow: kT(200), maxOutputTokens: kT(64),
@@ -170,6 +178,17 @@ const MODELS = {
         json: true, reasoning: true, tools: true, vision: true,
         defaultProvider: OPENROUTER,
     },
+    // models with deepsearch capabilities
+    [JINA_DEEPSEARCH]: { // @todo: parse more details from results, eg: "reed urls".
+        icon: '✴️', contextWindow: Infinity, maxInputTokens: Infinity,
+        maxOutputTokens: Infinity, imageCostTokens: 0, maxImageSize: Infinity,
+        supportedMimeTypes: [MIME_PNG, MIME_JPEG, MIME_TEXT, MIME_WEBP, MIME_PDF],
+        reasoning: true, json: true, vision: true,
+        deepsearch: true, defaultProvider: JINA,
+    },
+    // best Chinese models
+    [DEEPSEEK_32]: DEEPSEEK_32_RULES,
+    [SF_DEEPSEEK_32]: { ...DEEPSEEK_32_RULES, defaultProvider: SILICONFLOW },
     // best local model
     [GEMMA_3_27B]: {
         icon: '❇️', contextWindow: kT(128), maxOutputTokens: k(8),
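
The new entries follow the file's existing MODELS convention: each key maps a model name to capability flags (image, vision, reasoning, deepsearch) and token budgets that callers can inspect before prompting. A minimal sketch of reading the new flags (usage assumed; all three names are exported by this file):

    import { MODELS, IMAGEN_4_ULTRA, VEO_31 } from './lib/alan.mjs';
    // Imagen 4 Ultra only generates images and caps prompts at 480 tokens:
    console.log(MODELS[IMAGEN_4_ULTRA].image);          // true
    console.log(MODELS[IMAGEN_4_ULTRA].maxInputTokens); // 480
    // Veo 3.1 additionally accepts one PNG/JPEG reference image per prompt:
    console.log(MODELS[VEO_31].maxImagePerPrompt);      // 1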
@@ -249,7 +268,7 @@ const DEFAULT_MODELS = {
 };

 const PROVIDER_ICONS = {
-    [OPENROUTER]: '🔀', [OPENAI]: '⚛️', [JINA]: '✴️', [
+    [OPENROUTER]: '🔀', [OPENAI]: '⚛️', [JINA]: '✴️', [GOOGLE]: '♊️',
     [OLLAMA]: '🦙', [ANTHROPIC]: '✳️', [SILICONFLOW]: '🧬',
 };

@@ -273,7 +292,7 @@ let tokeniser, _tools;

 const unifyProvider = provider => {
     assert(provider = (provider || '').trim(), 'AI provider is required.');
-    for (let type of [OPENROUTER, JINA, OLLAMA, SILICONFLOW]) {
+    for (let type of [OPENROUTER, GOOGLE, JINA, OLLAMA, SILICONFLOW]) {
         if (insensitiveCompare(provider, type)) { return type; }
     }
     throwError(`Invalid AI provider: ${provider}.`);
@@ -406,6 +425,16 @@ const init = async (options = {}) => {
         `Model name or description is required for provider: ${provider}.`);
     _tools || (_tools = await packTools());
     switch (provider) {
+        case GOOGLE:
+            assertApiKey(provider, options);
+            const { GoogleGenAI } = await need('@google/genai');
+            var client = new GoogleGenAI({ vertexai: false, ...options });
+            for (let model of models) {
+                setupAi({
+                    provider, model, client, prompt: promptGoogle, priority,
+                });
+            }
+            break;
         case JINA:
             assertApiKey(provider, options);
             var client = await OpenAI({
@@ -588,7 +617,9 @@ const listOpenAIModels = async (aiId, options) => {
 };

 const streamResp = async (resp, options) => {
-    const msg =
+    const msg = options?.noPack ? resp : await packResp(
+        resp, { ...options, processing: true }
+    );
     return options?.stream
         && (msg?.text || msg?.audio?.length || msg?.images?.length)
         && await ignoreErrFunc(async () => await options.stream(msg), LOG);
@@ -606,13 +637,13 @@ const packResp = async (resp, options) => {
     if (options?.raw) { return resp; }
     let [
         txt, audio, images, annotations, simpleText, annotationsMarkdown, end,
-        json, audioMimeType,
+        json, audioMimeType,
     ] = [
         resp.text || '', // ChatGPT / Claude / Gemini / Ollama
         resp?.audio?.data, // ChatGPT audio mode
         resp?.images || [], // Gemini images via Openrouter
         resp?.references, // Gemini references
-        '', '', '', null, MIME_PCM16,
+        '', '', '', null, MIME_PCM16,
     ];
     simpleText = txt;
     while ((end = getInfoEnd(simpleText))) {
@@ -698,18 +729,23 @@ const packResp = async (resp, options) => {
         ...annotationsMarkdown ? { annotationsMarkdown } : {},
         ...audio ? { audio } : {}, ...images?.length ? { images } : {},
         processing: !!options?.processing,
-        model: [
+        model: packModelLabel([
             options.provider, options?.router?.provider,
             options?.router?.model || options?.model,
-        ]
-            const key = ensureString(x, { case: 'UP' });
-            if (catched.has(key)) { return null; }
-            catched.add(key);
-            return x;
-        }).filter(x => x).join('/'),
+        ]),
     };
 };

+const packModelLabel = (model_reference) => {
+    const catched = new Set();
+    return model_reference.join('/').split('/').map(x => {
+        const key = ensureString(x, { case: 'UP' });
+        if (catched.has(key)) { return null; }
+        catched.add(key);
+        return x;
+    }).filter(x => x).join('/');
+};
+
 const buildPrompts = async (model, input, options = {}) => {
     assert(!(
         options.jsonMode && !model?.json
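
The model label that packResp previously built inline is now the reusable packModelLabel helper: it joins the provider/model segments into a path, then drops case-insensitive duplicate segments. For example (inputs assumed):

    packModelLabel(['OpenRouter', 'openrouter/auto']);
    // → 'OpenRouter/auto' ('openrouter' deduplicates against 'OpenRouter')
    packModelLabel(['Google', 'google', 'veo-3.1-generate-preview']);
    // → 'Google/veo-3.1-generate-preview'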
@@ -847,6 +883,18 @@ const promptOpenAI = async (aiId, content, options = {}) => {
         x => x.function.name === 'searchWeb'
     ) && !options.jsonMode ? ONLINE : '';
     const targetModel = `${isOpenrouter(provider, model) ? `${source}/` : ''}${options.model}${ext}`;
+    if (provider === OPENAI) {
+        // need more debug, currently openrouter is priority
+        packedTools.push(...[
+            // https://platform.openai.com/docs/guides/tools?tool-type=web-search
+            { type: 'web_search', },
+            // https://platform.openai.com/docs/guides/tools-image-generation?lang=javascript
+            // https://platform.openai.com/docs/api-reference/responses/create#responses-create-tools
+            { type: 'image_generation', input_fidelity: 'high', partial_images: 3, quality: 'high', size: '1536x1024' },
+            // https://platform.openai.com/docs/guides/tools-code-interpreter
+            { type: 'code_interpreter', container: { type: 'auto', memory_limit: '8g' } },
+        ]);
+    }
     if (source === S_GOOGLE) {
         packedTools.push(...[
             { googleSearch: {} }, { codeExecution: {} }, { urlContext: {} },
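
These entries use the OpenAI Responses API built-in tool schema (web search, image generation, code interpreter) rather than utilitas function tools; the "need more debug" comment marks the branch as experimental while OpenRouter remains the preferred route. A standalone sketch of sending the same kind of tool list through the SDK directly (model name assumed):

    import OpenAI from 'openai';
    const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment
    const resp = await openai.responses.create({
        model: 'gpt-5.1',
        tools: [{ type: 'web_search' }],
        input: 'Summarize the latest utilitas release.',
    });
    console.log(resp.output_text);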
@@ -966,6 +1014,103 @@ const promptOpenAI = async (aiId, content, options = {}) => {
     return await packResp(event, options);
 };

+const promptGoogle = async (aiId, prompt, options = {}) => {
+    let { provider, client, model } = await getAi(aiId);
+    const M = MODELS[model.name];
+    prompt = ensureString(prompt, { trim: true });
+    assert(await countTokens(prompt, { fast: true })
+        <= M.maxInputTokens,
+        `Prompt must be less than ${M.maxInputTokens} tokens.`, 400
+    );
+    switch (model?.name) {
+        case IMAGEN_4_ULTRA:
+            var resp = await client.models.generateImages({
+                model: model.name, prompt, config: {
+                    numberOfImages: options?.n || 4, sampleImageSize: '2K',
+                    includeRaiReason: true,
+                    // "1:1" (default), "3:4", "4:3", "9:16", and "16:9"
+                    aspectRatio: '16:9', personGeneration: 'allow_adult',
+                    ...options?.config || {},
+                },
+            });
+            var generated = resp?.generatedImages;
+            assert(!resp?.error && generated?.filter(
+                x => !x.raiFilteredReason
+            ).length, resp?.error?.message || generated?.find(
+                x => x.raiFilteredReason
+            )?.raiFilteredReason || ERROR_GENERATING);
+            if (!options?.raw) {
+                resp = {
+                    text: '', images: await Promise.all((
+                        resp?.generatedImages || []
+                    ).map(async x => ({
+                        data: await convert(x.image.imageBytes, {
+                            input: BASE64, suffix: 'png', ...options || {}
+                        }), mimeType: x.image.mimeType,
+                    }))), model: packModelLabel([
+                        provider, M.source, model.name,
+                    ]),
+                }
+            }
+            break;
+        case VEO_31:
+            var resp = await client.models.generateVideos({
+                model: model.name, prompt, config: {
+                    aspectRatio: '16:9', numberOfVideos: 1,
+                    // personGeneration: 'allow_adult',
+                    enablePromptRewriting: true, addWatermark: false,
+                    includeRaiReason: true, ...options?.config || {},
+                },
+            });
+            assert(!resp?.error, resp?.error?.message || ERROR_GENERATING);
+            if (options?.generateRaw) { return resp; }
+            await tryUntil(async () => {
+                resp = await client.operations.getVideosOperation({
+                    operation: resp,
+                });
+                assert(
+                    resp?.done,
+                    `Waiting for Google video generation: ${resp.name}`,
+                );
+            }, { maxTry: 60 * 10, log });
+            assert(!resp?.error && resp?.response?.generatedVideos?.filter(
+                x => !x.raiFilteredReason
+            ).length, resp?.error?.message || resp?.response?.generatedVideos?.find(
+                x => x.raiFilteredReason
+            )?.raiFilteredReason || ERROR_GENERATING);
+            if (options?.videoRaw) {
+                resp = resp?.response?.generatedVideos;
+            } else if (!options?.videoRaw) {
+                resp = {
+                    text: '', videos: await Promise.all(resp?.response?.generatedVideos?.filter(
+                        x => x?.video?.uri
+                    ).map(async x => {
+                        const downloadPath = `${getTempPath({
+                            seed: x?.video?.uri
+                        })}.mp4`;
+                        // @todo: fix this
+                        // https://github.com/googleapis/js-genai/compare/main...Leask:js-genai:main
+                        await client.files.download({ file: x, downloadPath });
+                        await timeout(1000 * 10); // hack to wait for file to be downloaded
+                        return {
+                            data: await convert(downloadPath, {
+                                input: FILE, suffix: 'mp4', ...options || {}
+                            }), mimeType: MIME_MP4, jobId: resp.name,
+                        };
+                    })), model: packModelLabel([
+                        provider, M.source, model.name,
+                    ]),
+                };
+            }
+            break;
+        default:
+            throw new Error('Unsupported model.');
+    }
+    await streamResp(
+        { ...resp, processing: true }, { ...options, noPack: true }
+    );
+    return { ...resp, processing: false };
+};

 const initChat = async (options = {}) => {
     if (options.sessions) {
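
promptGoogle is the native prompt path for the new Google models: Imagen requests return base64 image bytes synchronously, while Veo requests return a long-running operation that is polled through tryUntil (up to 600 attempts) and downloaded to a temp file, with a ten-second timeout() acknowledged in the comments as a hack pending the linked upstream fix. A minimal sketch of the underlying Imagen call it wraps (API key and prompt assumed):

    import { GoogleGenAI } from '@google/genai';
    const client = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });
    const resp = await client.models.generateImages({
        model: 'imagen-4.0-ultra-generate-001',
        prompt: 'A watercolor lighthouse at dawn',
        config: { numberOfImages: 1, aspectRatio: '16:9' },
    });
    // each generated image carries base64 bytes plus a MIME type
    console.log(resp.generatedImages?.[0]?.image?.mimeType);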
@@ -1063,7 +1208,7 @@ const distillFile = async (attachments, o) => {
         '- You will receive various multimedia files, including images, audio, and videos.',
         '- Please analyze these documents, extract the information, and organize it into an easy-to-read format.',
         '- For document-type files or image files primarily containing text information, act as a document scanner, return the text content, and describe any important images and tables present. Use markdown to format table and other rich text where possible. Use LaTeX for all formulas, subscripts, representations of formulas, and special symbols in mathematics and chemistry, enclosed by "$" symbols. Please mark the description of images in the same position as the original text without creating separate paragraphs for descriptions. Be sure ONLY describe important images and graphs, and ignore backgrounds and decorative small images. Ensure the returned document is clean, well-organized, and highly readable.',
-        '- For audio files, please
+        '- For audio files, please transcribe the spoken voices into clean text. If there are background sounds, attempt to briefly describe the environmental sounds and music sections. Only care about the main speech content, meaningful music and environment sounds. Do not be disturbed by useless background noise.',
         '- For images or video files that are not primarily text-based, describe the tragic scene you observe, highlight key details, convey the emotional tone of the setting, and share your impressions.',
         '- For video files, please describe the content, including the theme, subjects, characters, scenes, objects, storyline, and emotional tone.',
         '- Please RETURN ONLY your analysis results without including your thought process or other unrelated information.',
@@ -1161,11 +1306,14 @@ export {
     FUNCTION,
     GEMINI_25_FLASH,
     GEMINI_30_PRO_IMAGE,
+    GPT_5_IMAGE,
     GPT_51,
+    IMAGEN_4_ULTRA,
     INSTRUCTIONS,
     MODELS,
     OPENAI_VOICE,
     RETRIEVAL,
+    VEO_31,
     analyzeSessions,
     countTokens,
     distillFile,
package/lib/manifest.mjs
CHANGED
package/lib/speech.mjs
CHANGED

@@ -1,7 +1,7 @@
 import { DEFAULT_MODELS, OPENAI_VOICE, countTokens, k } from './alan.mjs';
 import { getFfmpeg, packPcmToWav } from './media.mjs';
 import { get } from './web.mjs';
-import { convert, getTempPath
+import { convert, getTempPath } from './storage.mjs';
 import { ensureString, mergeAtoB } from './utilitas.mjs';

 import {
@@ -18,20 +18,19 @@ const _NEED = ['@google/genai', 'OpenAI', 'whisper-node'];

 const [
     BUFFER, STREAM, BASE64, FILE, clients, suffix, SPEAKER, cleanup, wav,
-    GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS,
+    GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS,
     OPENAI_TTS_MAX_LENGTH, WHISPER_DEFAULT_MODEL, errorMessage
 ] = [
     'BUFFER', 'STREAM', 'BASE64', 'FILE', {}, 'ogg', 'SPEAKER', true, 'wav',
     'gpt-4o-mini-tts', 'gpt-4o-transcribe', 'gemini-2.5-flash-preview-tts',
-
+    4096, 'base', 'Invalid audio data.',
 ];

 const [
     defaultOpenAITtsModel, defaultOpenAISttModel, defaultGeminiTtsModel,
-
-] = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS, GEMINI_FLASH];
+] = [GPT_4O_MIMI_TTS, GPT_4O_TRANSCRIBE, GEMINI_25_FLASH_TTS];

-const
+const TTS_PROMPT = "As an AI voice assistant, please say the following content in a warm, friendly and professional tone, if the language is English, use an American accent, if it's Traditional Chinese, use Hong Kong Cantonese, if it's Simplified Chinese, use standard Mandarin, for other languages, please speak with a standard, clear accent";

 const WHISPER_MODELS = [
     // npx whisper-node download tiny.en
@@ -111,9 +110,6 @@ const init = async (options) => {
             if (options?.tts) {
                 clients.tts = client.models.generateContent;
             }
-            if (options?.stt) {
-                clients.stt = client.models.generateContent;
-            }
             break;
         case '':
             clients._provider = 'LOCAL';
@@ -166,13 +162,14 @@ const ttsGoogle = async (contents, options) => {
     assert(contents, 'Text is required.', 400);
     assert(await countTokens(contents) <= k(32), 'Text is too long.', 400);
     const resp = await clients.tts({
-        model: options?.model || defaultGeminiTtsModel,
+        model: options?.model || defaultGeminiTtsModel,
+        contents: `${options?.prompt || TTS_PROMPT}: ${contents}`,
         config: mergeAtoB(options?.config, {
             responseModalities: ['AUDIO'],
             speechConfig: {
                 voiceConfig: {
                     prebuiltVoiceConfig: {
-                        voiceName: options?.voice || '
+                        voiceName: options?.voice || 'Zephyr',
                     },
                 },
             },
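
ttsGoogle now prepends a steerable TTS_PROMPT (overridable via options.prompt) to the text it synthesizes and defaults the prebuilt voice to 'Zephyr'. A minimal sketch of the Gemini TTS request this wraps (API key assumed):

    import { GoogleGenAI } from '@google/genai';
    const client = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });
    const resp = await client.models.generateContent({
        model: 'gemini-2.5-flash-preview-tts',
        contents: 'Hello from utilitas!',
        config: {
            responseModalities: ['AUDIO'],
            speechConfig: {
                voiceConfig: { prebuiltVoiceConfig: { voiceName: 'Zephyr' } },
            },
        },
    });
    // the synthesized audio arrives base64-encoded in the first candidate part
    const audioB64 = resp.candidates[0].content.parts[0].inlineData.data;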
@@ -227,29 +224,6 @@ const sttOpenAI = async (audio, options) => {
     return result;
 };

-const sttGoogle = async (audio, options) => {
-    assert(clients.stt, 'Google STT API has not been initialized.', 500);
-    const data = await convert(audio, {
-        input: options?.input, expected: BASE64, errorMessage,
-    });
-    const resp = await clients.stt({
-        model: options?.model || defaultGeminiSttModel, contents: {
-            parts: [{
-                inlineData: {
-                    mimeType: options?.mimeType || MIME_WAV, data,
-                },
-            }, { text: STT_PROMPT }],
-        },
-        config: { ...options?.config || {} },
-    });
-    assert(
-        resp?.candidates?.[0]?.content?.parts?.[0],
-        'Failed to transcribe audio.', 500
-    );
-    return options?.raw ? resp.candidates
-        : (resp.candidates[0].content.parts[0].text?.trim?.() || '');
-};
-
 // This function is not working properly, a pull request is filed:
 // https://github.com/ariym/whisper-node/pull/58
 const sttWhisper = async (audio, options) => {
@@ -288,8 +262,7 @@ const tts = async (text, options) => {

 const stt = async (audio, options) => {
     let engine;
-    if (clients?.stt && clients._provider === '
-    else if (clients?.stt && clients._provider === 'OPENAI') { engine = sttOpenAI; }
+    if (clients?.stt && clients._provider === 'OPENAI') { engine = sttOpenAI; }
     else if (await checkWhisper()) { engine = sttWhisper; }
     else { throwError('Speech-to-Text engine has not been initialized.', 500); }
     return await engine(audio, options);
@@ -303,7 +276,6 @@ export {
     checkWhisper,
     init,
     stt,
-    sttGoogle,
     sttOpenAI,
     sttWhisper,
     tts,