utilitas 1999.1.93 → 1999.1.96
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +16 -0
- package/README.md +12 -20
- package/dist/utilitas.lite.mjs +1 -1
- package/dist/utilitas.lite.mjs.map +1 -1
- package/lib/alan.mjs +340 -757
- package/lib/embedding.mjs +174 -0
- package/lib/manifest.mjs +17 -19
- package/lib/storage.mjs +11 -5
- package/package.json +18 -20
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import { convert } from './storage.mjs';
|
|
2
|
+
import { countTokens } from './alan.mjs';
|
|
3
|
+
import { ensureArray, ensureString, need } from './utilitas.mjs';
|
|
4
|
+
|
|
5
|
+
// Names of external packages this module relies on (loaded on demand in `init`).
const _NEED = ['openai'];

// Initialized embedding clients, keyed by upper-cased provider name.
const clients = {};

// Marker appended to text that had to be shortened.
const ELLIPSIS = '...';

// Return `txt`, followed by the ellipsis marker when `trim` is truthy.
const buildTextWithEllipsis = (txt, trim) => {
    const suffix = trim ? ELLIPSIS : '';
    return `${txt}${suffix}`;
};
|
|
9
|
+
|
|
10
|
+
// Provider identifiers; they double as keys of the provider lookup tables.
const OPENAI = 'OPENAI';
const OPENROUTER = 'OPENROUTER';
const JINA = 'JINA';

// Embedding model identifiers, annotated with their vector dimensions.
const OPENAI_MODEL_EMBED_SMALL = 'text-embedding-3-small'; // dim: 1536
const OPENAI_MODEL_EMBED_LARGE = 'text-embedding-3-large'; // dim: 3072
const GOOGLE_MODEL_GEMINI_EMBED = 'gemini-embedding-001'; // dim: 768, 1536, or 3072(default)
const JINA_MODEL_CLIP_2 = 'jina-clip-v2'; // dim: 1024
const JINA_MODEL_V_3 = 'jina-embeddings-v3'; // dim: 256-1024
|
|
29
|
+
|
|
30
|
+
// Default API endpoint per provider. OPENAI has no entry, so `init` passes an
// undefined baseURL to the client unless the caller supplies one.
const PROVIDER_BASE_URL = Object.fromEntries([
    [OPENROUTER, 'https://openrouter.ai/api/v1'],
    [JINA, 'https://api.jina.ai/v1/'],
]);
|
|
34
|
+
|
|
35
|
+
// Model used for a provider when the caller does not name one. The key set
// also serves as the list of supported providers (see `ensureProvider`).
const DEFAULT_MODELS = Object.fromEntries([
    [OPENAI, OPENAI_MODEL_EMBED_SMALL],
    [OPENROUTER, GOOGLE_MODEL_GEMINI_EMBED],
    [JINA, JINA_MODEL_CLIP_2],
]);
|
|
40
|
+
|
|
41
|
+
// Per-model settings read by `init` and `embedding`:
//   source    - vendor prefix used to qualify the model id on OpenRouter
//   maxTokens - token budget each text input is trimmed to
//   image     - whether image inputs are accepted
//   options   - default fields merged into the request body
const MODEL_CONFIG = {};

MODEL_CONFIG[OPENAI_MODEL_EMBED_SMALL] = { source: 'openai', maxTokens: 8192 };

MODEL_CONFIG[OPENAI_MODEL_EMBED_LARGE] = { source: 'openai', maxTokens: 8192 };

MODEL_CONFIG[GOOGLE_MODEL_GEMINI_EMBED] = { source: 'google', maxTokens: 20000 };

MODEL_CONFIG[JINA_MODEL_CLIP_2] = {
    maxTokens: 8192,
    image: true,
    options: {
        task: 'retrieval.query',
        dimensions: 1024,
        normalized: true,
        embedding_type: 'float',
    },
};

// Token calculation may be incorrect because its limitation applies to the
// entire request rather than individual entries.
MODEL_CONFIG[JINA_MODEL_V_3] = {
    maxTokens: 8192,
    image: false,
    options: {
        task: 'retrieval.query',
        dimensions: 1024,
        normalized: true,
        late_chunking: true,
        embedding_type: 'float',
    },
};
|
|
69
|
+
|
|
70
|
+
// Normalize `options.provider` to upper case (written back onto `options`)
// and assert that it is one of the providers listed in DEFAULT_MODELS.
// Returns the normalized provider name.
const ensureProvider = (options) => {
    const normalized = ensureString(options?.provider, { case: 'UP' });
    options.provider = normalized;
    const knownModel = DEFAULT_MODELS?.[options.provider];
    assert(knownModel, 'Provider is required.', 400);
    return options.provider;
};
|
|
77
|
+
|
|
78
|
+
// Assert that `options.apiKey` is present and return it.
const ensureApiKey = (options) => {
    const hasKey = Boolean(options?.apiKey);
    assert(hasKey, 'API key is required.', 400);
    return options.apiKey;
};
|
|
82
|
+
|
|
83
|
+
// Shorten `text` until its token count (including the trailing ellipsis that
// marks a shortened text) fits within `limit`.
//
// text  - value to trim; normalized to a trimmed string first.
// limit - maximum token count; defaults to Infinity (no trimming).
//
// Returns the (possibly shortened) text, suffixed with '...' when anything
// was removed.
//
// Each pass removes roughly a tenth of the token overflow, measured in words.
// Text made of a single unbroken run (no spaces, e.g. CJK prose) is shortened
// by characters instead; dropping its only "word" would discard everything.
const trimTextToLimit = async (text, limit = Infinity) => {
    text = ensureString(text, { trim: true });
    let trimmed = false;
    let lastCheck = null;
    while ((lastCheck = await countTokens(
        buildTextWithEllipsis(text, trimmed), { fast: true }
    )) > limit) {
        // Nothing left to remove: stop rather than loop forever when even the
        // bare ellipsis exceeds the limit.
        if (!text.length) { break; }
        const step = Math.ceil(Math.abs(lastCheck - limit) / 10);
        const words = text.split(' ');
        if (words.length > 1) {
            text = words.slice(0, -step).join(' ').trimEnd();
        } else {
            // Strip a dangling high surrogate so a code point is never halved.
            text = text.slice(0, -step)
                .replace(/[\uD800-\uDBFF]$/, '').trimEnd();
        }
        trimmed = true;
    }
    return buildTextWithEllipsis(text, trimmed);
};
|
|
97
|
+
|
|
98
|
+
// Look up an initialized embedding client.
//
// provider - provider name (case-insensitive); when omitted, the first
//            provider that was initialized is used.
//
// Returns a copy of the stored entry ({ client, model, source }) plus the
// resolved `provider` name. Fails with status 500 when no provider has been
// initialized at all, or when the requested one has not been initialized;
// without the second check, callers would receive an entry with no client
// and crash later with an unrelated TypeError.
const getClient = (provider) => {
    provider = ensureString(provider, { case: 'UP' })
        || Object.keys(clients || {})[0];
    assert(provider, 'No embedding provider has been initialized.', 500);
    const entry = clients?.[provider];
    assert(
        entry, `Embedding provider ${provider} has not been initialized.`, 500
    );
    return { ...entry, provider };
};
|
|
104
|
+
|
|
105
|
+
// Create and register an embedding client for `options.provider`.
//
// options.apiKey   - required.
// options.provider - required; must be a key of DEFAULT_MODELS.
// options.baseURL  - optional; falls back to PROVIDER_BASE_URL.
// options.model    - optional; falls back to DEFAULT_MODELS.
//
// The whole options object (with the resolved baseURL) is handed to the
// OpenAI client constructor. Returns the registered entry via `getClient`.
const init = async (options = {}) => {
    ensureApiKey(options);
    const provider = ensureProvider(options);
    const OpenAI = await need('openai');
    const model = options?.model || DEFAULT_MODELS[provider];
    const baseURL = options?.baseURL || PROVIDER_BASE_URL[provider];
    const client = new OpenAI({ ...options, baseURL });
    const source = MODEL_CONFIG[model]?.source;
    clients[provider] = { client, model, source };
    return getClient(provider);
};
|
|
117
|
+
|
|
118
|
+
// Compute embedding vectors for one or more inputs.
//
// input   - a single item or an array of items. Each item is either a plain
//           value (treated as text) or an object with exactly one key, such
//           as { text } or { image }.
// options.provider       - provider to use (see `getClient` for the default).
// options.model          - overrides the model selected at init time.
// options.source         - OpenRouter only: explicit vendor prefix.
// options.input          - when truthy, images are passed through `convert`
//                          (with expected: 'base64') before being sent.
// options.requestOptions - extra fields merged last into the request body.
// options.raw            - return the provider response as-is.
//
// Returns one vector for a single input, an array of vectors for an array
// input, or the raw response when `options.raw` is set. Fails with status 400
// on invalid input and 500 when the provider returns no embeddings.
const embedding = async (input, options = {}) => {
    const { client, model: selectedModel, provider, source: clientSource }
        = getClient(options?.provider);
    const model = options?.model || selectedModel;
    const config = MODEL_CONFIG[model] || {};
    const multiple = Array.isArray(input);
    input = await Promise.all(ensureArray(input).map(async item => {
        // Shallow copy so the caller's object is never modified in place.
        const x = Object.isObject(item) ? { ...item } : { text: item };
        assert(
            Object.keys(x).length === 1,
            'Only one type of input is allowed at a time.', 400
        );
        if (x.text) {
            x.text = await trimTextToLimit(x.text, config.maxTokens);
        } else if (x.image) {
            assert(
                config.image,
                `Model ${model} does not support image embeddings.`, 400
            );
            if (options?.input) {
                x.image = await convert(
                    x.image, { ...options, expected: 'base64' }
                );
            }
        }
        return x;
    }));
    // Models without image support take plain strings, not { text } objects.
    config.image || (input = input.map(x => x.text));
    assert(input.length, 'Input is required.', 400);
    const body = {
        model, input, ...config.options || {},
        ...options?.requestOptions || {},
    };
    let resp = null;
    switch (provider) {
        case JINA:
            resp = await client.post('/embeddings', { body });
            break;
        case OPENROUTER: {
            // Qualify the model id with its vendor. The requested model's own
            // vendor wins over the one remembered at init time, which may
            // belong to a different model. An id that already carries a
            // vendor is left alone unless the caller forces one.
            const id = String(body.model);
            const vendor = options?.source
                || MODEL_CONFIG[body.model]?.source || clientSource;
            if (vendor && (options?.source || !id.includes('/'))) {
                body.model = `${vendor}/${id}`;
            }
        }
        // falls through: OpenRouter is called via the same embeddings method
        case OPENAI:
            resp = await client.embeddings.create(body);
            break;
    }
    assert(resp?.data?.length, 'No embeddings returned.', 500);
    if (options?.raw) { return resp; }
    const vectors = resp.data.map(x => x.embedding);
    return multiple ? vectors : vectors[0];
};
|
|
168
|
+
|
|
169
|
+
export default init;
|
|
170
|
+
export {
|
|
171
|
+
_NEED,
|
|
172
|
+
embedding,
|
|
173
|
+
init,
|
|
174
|
+
};
|
package/lib/manifest.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
const manifest = {
|
|
2
2
|
"name": "utilitas",
|
|
3
3
|
"description": "Just another common utility for JavaScript.",
|
|
4
|
-
"version": "1999.1.
|
|
4
|
+
"version": "1999.1.96",
|
|
5
5
|
"private": false,
|
|
6
6
|
"homepage": "https://github.com/Leask/utilitas",
|
|
7
7
|
"main": "index.mjs",
|
|
@@ -19,55 +19,53 @@ const manifest = {
|
|
|
19
19
|
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"file-type": "^21.
|
|
23
|
-
"mathjs": "^15.
|
|
22
|
+
"file-type": "^21.1.0",
|
|
23
|
+
"mathjs": "^15.1.0",
|
|
24
24
|
"uuid": "^13.0.0"
|
|
25
25
|
},
|
|
26
26
|
"devDependencies": {
|
|
27
|
-
"@anthropic-ai/sdk": "^0.67.0",
|
|
28
|
-
"@anthropic-ai/vertex-sdk": "^0.14.0",
|
|
29
27
|
"@ffmpeg-installer/ffmpeg": "^1.1.0",
|
|
30
28
|
"@ffprobe-installer/ffprobe": "^2.1.2",
|
|
31
29
|
"@google-cloud/speech": "^7.2.1",
|
|
32
|
-
"@google-cloud/storage": "^7.17.
|
|
30
|
+
"@google-cloud/storage": "^7.17.3",
|
|
33
31
|
"@google-cloud/vision": "^5.3.4",
|
|
34
|
-
"@google/genai": "^1.
|
|
32
|
+
"@google/genai": "^1.29.0",
|
|
35
33
|
"@mozilla/readability": "github:mozilla/readability",
|
|
36
|
-
"@sentry/node": "^10.
|
|
37
|
-
"@sentry/profiling-node": "^10.
|
|
34
|
+
"@sentry/node": "^10.25.0",
|
|
35
|
+
"@sentry/profiling-node": "^10.25.0",
|
|
38
36
|
"acme-client": "^5.4.0",
|
|
39
37
|
"browserify-fs": "^1.0.0",
|
|
40
38
|
"buffer": "^6.0.3",
|
|
41
39
|
"fast-geoip": "^1.1.88",
|
|
42
40
|
"fluent-ffmpeg": "^2.1.3",
|
|
43
41
|
"form-data": "^4.0.4",
|
|
44
|
-
"ioredis": "^5.8.
|
|
42
|
+
"ioredis": "^5.8.2",
|
|
45
43
|
"js-tiktoken": "^1.0.21",
|
|
46
|
-
"jsdom": "^27.0
|
|
44
|
+
"jsdom": "^27.2.0",
|
|
47
45
|
"lorem-ipsum": "^2.0.8",
|
|
48
46
|
"mailgun.js": "^12.1.1",
|
|
49
|
-
"mailparser": "^3.
|
|
47
|
+
"mailparser": "^3.9.0",
|
|
50
48
|
"mime": "^4.1.0",
|
|
51
|
-
"mysql2": "^3.15.
|
|
52
|
-
"node-mailjet": "^6.0.
|
|
49
|
+
"mysql2": "^3.15.3",
|
|
50
|
+
"node-mailjet": "^6.0.11",
|
|
53
51
|
"node-polyfill-webpack-plugin": "^4.1.0",
|
|
54
52
|
"office-text-extractor": "^3.0.3",
|
|
55
|
-
"openai": "^6.
|
|
56
|
-
"pdfjs-dist": "^5.4.
|
|
53
|
+
"openai": "^6.8.1",
|
|
54
|
+
"pdfjs-dist": "^5.4.394",
|
|
57
55
|
"pg": "^8.16.3",
|
|
58
56
|
"pgvector": "^0.2.1",
|
|
59
57
|
"ping": "^1.0.0",
|
|
60
58
|
"process": "^0.11.10",
|
|
61
|
-
"puppeteer": "^24.
|
|
59
|
+
"puppeteer": "^24.29.1",
|
|
62
60
|
"say": "^0.16.0",
|
|
63
61
|
"telegraf": "^4.16.3",
|
|
64
62
|
"telesignsdk": "^3.0.4",
|
|
65
63
|
"tesseract.js": "^6.0.1",
|
|
66
|
-
"twilio": "^5.10.
|
|
64
|
+
"twilio": "^5.10.5",
|
|
67
65
|
"url": "github:Leask/node-url",
|
|
68
66
|
"webpack-cli": "^6.0.1",
|
|
69
67
|
"whisper-node": "^1.1.1",
|
|
70
|
-
"wrangler": "^4.
|
|
68
|
+
"wrangler": "^4.47.0",
|
|
71
69
|
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz",
|
|
72
70
|
"youtube-transcript": "^1.2.1"
|
|
73
71
|
}
|
package/lib/storage.mjs
CHANGED
|
@@ -208,9 +208,10 @@ const outputFile = async (buffer, options) => {
|
|
|
208
208
|
await writeFile(options?.file, buffer, _encoding);
|
|
209
209
|
return options?.file;
|
|
210
210
|
}
|
|
211
|
+
const { extension } = await getMime(buffer, options?.filename);
|
|
211
212
|
return await writeTempFile(buffer, {
|
|
212
213
|
filename: options?.filename,
|
|
213
|
-
encoding: _encoding, suffix: options?.suffix,
|
|
214
|
+
encoding: _encoding, suffix: options?.suffix || extension,
|
|
214
215
|
});
|
|
215
216
|
};
|
|
216
217
|
|
|
@@ -310,17 +311,22 @@ const convert = async (any, options) => {
|
|
|
310
311
|
return Object.keys(result).length === 1 ? result.content : result;
|
|
311
312
|
};
|
|
312
313
|
|
|
314
|
+
// Determine the MIME type and a matching file extension for a buffer.
//
// The type detected from the buffer content wins; otherwise the filename is
// looked up via the `mime-types` package; otherwise MIME_BINARY is used.
// The extension falls back to 'bin'.
// NOTE(review): `ignoreErrFunc` presumably yields undefined when `mime-types`
// cannot be loaded, which the optional chaining below tolerates - confirm.
const getMime = async (buf, filename) => {
    const mimeType = await ignoreErrFunc(() => need('mime-types'));
    let mime = extract(await fileTypeFromBuffer(buf), 'mime');
    if (!mime && filename) {
        mime = mimeType?.lookup?.(filename);
    }
    mime = mime || MIME_BINARY;
    const extension = mimeType?.extension?.(mime) || 'bin';
    return { mime, extension };
};
|
|
320
|
+
|
|
313
321
|
// Load `any` through `convert` and describe the resulting content.
//
// options.hashAlgorithm - hash algorithm; defaults to `defaultAlgorithm`.
// options.filename      - explicit filename; otherwise the basename of `any`
//                         is used when `convert` reports metadata, and the
//                         content hash is the last resort.
// Remaining options are forwarded to `convert`.
//
// Returns { content, extname, filename, hashAlgorithm, hash, mime, size }.
const analyzeFile = async (any, options) => {
    const { meta, content } = await convert(any, { meta: 1, ...options || {} });
    const hashAlgorithm = options?.hashAlgorithm || defaultAlgorithm;
    const _hash = hash(content, hashAlgorithm);
    // Fall back to the computed digest. The previous fallback was the `hash`
    // function itself, which made `extname(filename)` throw.
    const filename = options?.filename || (meta ? basename(any) : null) || _hash;
    const { mime } = await getMime(content, filename);
    return {
        content, extname: extname(filename).replace(/^\.|\.$/g, ''),
        filename, hashAlgorithm, hash: _hash, mime, size: content.length,
    };
};
|
|
326
332
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "utilitas",
|
|
3
3
|
"description": "Just another common utility for JavaScript.",
|
|
4
|
-
"version": "1999.1.
|
|
4
|
+
"version": "1999.1.96",
|
|
5
5
|
"private": false,
|
|
6
6
|
"homepage": "https://github.com/Leask/utilitas",
|
|
7
7
|
"main": "index.mjs",
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
"scripts": {
|
|
13
13
|
"start": "node index.mjs",
|
|
14
14
|
"debug": "node --inspect --trace-warnings debug.mjs",
|
|
15
|
-
"test": "node
|
|
15
|
+
"test": "node test.mjs",
|
|
16
16
|
"updep": "npx npm-check-updates -u && npm install && wget https://raw.githubusercontent.com/Marak/colors.js/master/lib/styles.js -O ./lib/style.cjs && wget https://raw.githubusercontent.com/sindresorhus/cli-boxes/refs/heads/main/boxes.json -O ./lib/boxes.json",
|
|
17
17
|
"gitsync": "( git commit -am \"Released @ `date`\" || true ) && git pull && git push",
|
|
18
18
|
"pack": "./node_modules/.bin/webpack-cli --config webpack.config.mjs",
|
|
@@ -30,55 +30,53 @@
|
|
|
30
30
|
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz"
|
|
31
31
|
},
|
|
32
32
|
"dependencies": {
|
|
33
|
-
"file-type": "^21.
|
|
34
|
-
"mathjs": "^15.
|
|
33
|
+
"file-type": "^21.1.0",
|
|
34
|
+
"mathjs": "^15.1.0",
|
|
35
35
|
"uuid": "^13.0.0"
|
|
36
36
|
},
|
|
37
37
|
"devDependencies": {
|
|
38
|
-
"@anthropic-ai/sdk": "^0.67.0",
|
|
39
|
-
"@anthropic-ai/vertex-sdk": "^0.14.0",
|
|
40
38
|
"@ffmpeg-installer/ffmpeg": "^1.1.0",
|
|
41
39
|
"@ffprobe-installer/ffprobe": "^2.1.2",
|
|
42
40
|
"@google-cloud/speech": "^7.2.1",
|
|
43
|
-
"@google-cloud/storage": "^7.17.
|
|
41
|
+
"@google-cloud/storage": "^7.17.3",
|
|
44
42
|
"@google-cloud/vision": "^5.3.4",
|
|
45
|
-
"@google/genai": "^1.
|
|
43
|
+
"@google/genai": "^1.29.0",
|
|
46
44
|
"@mozilla/readability": "github:mozilla/readability",
|
|
47
|
-
"@sentry/node": "^10.
|
|
48
|
-
"@sentry/profiling-node": "^10.
|
|
45
|
+
"@sentry/node": "^10.25.0",
|
|
46
|
+
"@sentry/profiling-node": "^10.25.0",
|
|
49
47
|
"acme-client": "^5.4.0",
|
|
50
48
|
"browserify-fs": "^1.0.0",
|
|
51
49
|
"buffer": "^6.0.3",
|
|
52
50
|
"fast-geoip": "^1.1.88",
|
|
53
51
|
"fluent-ffmpeg": "^2.1.3",
|
|
54
52
|
"form-data": "^4.0.4",
|
|
55
|
-
"ioredis": "^5.8.
|
|
53
|
+
"ioredis": "^5.8.2",
|
|
56
54
|
"js-tiktoken": "^1.0.21",
|
|
57
|
-
"jsdom": "^27.0
|
|
55
|
+
"jsdom": "^27.2.0",
|
|
58
56
|
"lorem-ipsum": "^2.0.8",
|
|
59
57
|
"mailgun.js": "^12.1.1",
|
|
60
|
-
"mailparser": "^3.
|
|
58
|
+
"mailparser": "^3.9.0",
|
|
61
59
|
"mime": "^4.1.0",
|
|
62
|
-
"mysql2": "^3.15.
|
|
63
|
-
"node-mailjet": "^6.0.
|
|
60
|
+
"mysql2": "^3.15.3",
|
|
61
|
+
"node-mailjet": "^6.0.11",
|
|
64
62
|
"node-polyfill-webpack-plugin": "^4.1.0",
|
|
65
63
|
"office-text-extractor": "^3.0.3",
|
|
66
|
-
"openai": "^6.
|
|
67
|
-
"pdfjs-dist": "^5.4.
|
|
64
|
+
"openai": "^6.8.1",
|
|
65
|
+
"pdfjs-dist": "^5.4.394",
|
|
68
66
|
"pg": "^8.16.3",
|
|
69
67
|
"pgvector": "^0.2.1",
|
|
70
68
|
"ping": "^1.0.0",
|
|
71
69
|
"process": "^0.11.10",
|
|
72
|
-
"puppeteer": "^24.
|
|
70
|
+
"puppeteer": "^24.29.1",
|
|
73
71
|
"say": "^0.16.0",
|
|
74
72
|
"telegraf": "^4.16.3",
|
|
75
73
|
"telesignsdk": "^3.0.4",
|
|
76
74
|
"tesseract.js": "^6.0.1",
|
|
77
|
-
"twilio": "^5.10.
|
|
75
|
+
"twilio": "^5.10.5",
|
|
78
76
|
"url": "github:Leask/node-url",
|
|
79
77
|
"webpack-cli": "^6.0.1",
|
|
80
78
|
"whisper-node": "^1.1.1",
|
|
81
|
-
"wrangler": "^4.
|
|
79
|
+
"wrangler": "^4.47.0",
|
|
82
80
|
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz",
|
|
83
81
|
"youtube-transcript": "^1.2.1"
|
|
84
82
|
}
|