utilitas 1999.1.93 → 1999.1.96

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ import { convert } from './storage.mjs';
2
+ import { countTokens } from './alan.mjs';
3
+ import { ensureArray, ensureString, need } from './utilitas.mjs';
4
+
5
+ const _NEED = ['openai'];
6
+ const clients = {};
7
+ const ELLIPSIS = '...';
8
+ const buildTextWithEllipsis = (txt, trim) => `${txt}${(trim ? ELLIPSIS : '')}`;
9
+
10
+ const [
11
+ OPENAI,
12
+ OPENROUTER,
13
+ JINA,
14
+ OPENAI_MODEL_EMBED_SMALL,
15
+ OPENAI_MODEL_EMBED_LARGE,
16
+ GOOGLE_MODEL_GEMINI_EMBED,
17
+ JINA_MODEL_CLIP_2,
18
+ JINA_MODEL_V_3,
19
+ ] = [
20
+ 'OPENAI',
21
+ 'OPENROUTER',
22
+ 'JINA',
23
+ 'text-embedding-3-small', // dim: 1536
24
+ 'text-embedding-3-large', // dim: 3072
25
+ 'gemini-embedding-001', // dim: 768, 1536, or 3072(default)
26
+ 'jina-clip-v2', // dim: 1024
27
+ 'jina-embeddings-v3', // dim: 256‑1024
28
+ ];
29
+
30
+ const PROVIDER_BASE_URL = {
31
+ [OPENROUTER]: 'https://openrouter.ai/api/v1',
32
+ [JINA]: 'https://api.jina.ai/v1/',
33
+ };
34
+
35
+ const DEFAULT_MODELS = {
36
+ [OPENAI]: OPENAI_MODEL_EMBED_SMALL,
37
+ [OPENROUTER]: GOOGLE_MODEL_GEMINI_EMBED,
38
+ [JINA]: JINA_MODEL_CLIP_2,
39
+ };
40
+
41
+ const MODEL_CONFIG = {
42
+ [OPENAI_MODEL_EMBED_SMALL]: { source: 'openai', maxTokens: 8192 },
43
+ [OPENAI_MODEL_EMBED_LARGE]: { source: 'openai', maxTokens: 8192 },
44
+ [GOOGLE_MODEL_GEMINI_EMBED]: { source: 'google', maxTokens: 20000 },
45
+ [JINA_MODEL_CLIP_2]: {
46
+ maxTokens: 8192,
47
+ image: true,
48
+ options: {
49
+ task: 'retrieval.query',
50
+ dimensions: 1024,
51
+ normalized: true,
52
+ embedding_type: 'float',
53
+ },
54
+ },
55
+ // Token calculation may be incorrect because its limitation applies to the
56
+ // entire request rather than individual entries.
57
+ [JINA_MODEL_V_3]: {
58
+ maxTokens: 8192,
59
+ image: false,
60
+ options: {
61
+ task: 'retrieval.query',
62
+ dimensions: 1024,
63
+ normalized: true,
64
+ late_chunking: true,
65
+ embedding_type: 'float',
66
+ },
67
+ },
68
+ };
69
+
70
+ const ensureProvider = (options) => {
71
+ options.provider = ensureString(options?.provider, { case: 'UP' });
72
+ assert(
73
+ DEFAULT_MODELS?.[options.provider], 'Provider is required.', 400
74
+ );
75
+ return options.provider;
76
+ };
77
+
78
+ const ensureApiKey = (options) => {
79
+ assert(options?.apiKey, 'API key is required.', 400);
80
+ return options.apiKey;
81
+ };
82
+
83
+ const trimTextToLimit = async (text, limit = Infinity) => {
84
+ text = ensureString(text, { trim: true });
85
+ let trimmed = false;
86
+ let lastCheck = null;
87
+ while ((lastCheck = await countTokens(
88
+ buildTextWithEllipsis(text, trimmed), { fast: true }
89
+ )) > limit) {
90
+ text = text.split(' ').slice(
91
+ 0, -Math.ceil((Math.abs(lastCheck - limit) / 10))
92
+ ).join(' ').trimEnd();
93
+ trimmed = true;
94
+ }
95
+ return buildTextWithEllipsis(text, trimmed);
96
+ };
97
+
98
+ const getClient = (provider) => {
99
+ provider = ensureString(provider, { case: 'UP' })
100
+ || Object.keys(clients || {})[0];
101
+ assert(provider, 'No embedding provider has been initialized.', 500);
102
+ return { ...clients?.[provider], provider };
103
+ };
104
+
105
+ const init = async (options = {}) => {
106
+ ensureApiKey(options);
107
+ const provider = ensureProvider(options);
108
+ const OpenAI = await need('openai');
109
+ const baseURL = options?.baseURL || PROVIDER_BASE_URL[provider];
110
+ const model = options?.model || DEFAULT_MODELS[provider];
111
+ clients[provider] = {
112
+ client: new OpenAI({ ...options, baseURL }),
113
+ model, source: MODEL_CONFIG[model]?.source,
114
+ };
115
+ return getClient(provider);
116
+ };
117
+
118
+ const embedding = async (input, options = {}) => {
119
+ let [{ client, model: selectedModel, provider, source }, resp]
120
+ = [getClient(options?.provider), null];
121
+ const model = options?.model || selectedModel;
122
+ const multiple = Array.isArray(input);
123
+ input = await Promise.all(ensureArray(input).map(async x => {
124
+ x = Object.isObject(x) ? x : { text: x };
125
+ assert(
126
+ Object.keys(x).length == 1,
127
+ 'Only one type of input is allowed at a time.', 400
128
+ );
129
+ if (x.text) {
130
+ x.text = await trimTextToLimit(
131
+ x.text, MODEL_CONFIG[model]?.maxTokens
132
+ );
133
+ } else if (x.image) {
134
+ assert(
135
+ MODEL_CONFIG[model]?.image,
136
+ `Model ${model} does not support image embeddings.`, 400
137
+ );
138
+ if (options?.input) {
139
+ x.image = await convert(
140
+ x.image, { ...options, expected: 'base64' }
141
+ );
142
+ }
143
+ }
144
+ return x;
145
+ }));
146
+ MODEL_CONFIG[model]?.image || (input = input.map(x => x.text));
147
+ assert(input.length, 'Input is required.', 400);
148
+ const body = {
149
+ model, input, ...MODEL_CONFIG[model]?.options || {},
150
+ ...options?.requestOptions || {},
151
+ };
152
+ switch (provider) {
153
+ case JINA:
154
+ resp = await client.post('/embeddings', { body });
155
+ break;
156
+ case OPENROUTER:
157
+ source = options?.source || source
158
+ || MODEL_CONFIG[body.model]?.source;
159
+ body.model = `${source ? `${source}/` : ''}${body.model}`;
160
+ case OPENAI:
161
+ resp = await client.embeddings.create(body);
162
+ }
163
+ assert(resp?.data?.length, 'No embeddings returned.', 500);
164
+ if (options?.raw) { return resp; }
165
+ const vectors = resp.data.map(x => x.embedding);
166
+ return multiple ? vectors : vectors[0];
167
+ };
168
+
169
+ export default init;
170
+ export {
171
+ _NEED,
172
+ embedding,
173
+ init,
174
+ };
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  const manifest = {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "1999.1.93",
4
+ "version": "1999.1.96",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -19,55 +19,53 @@ const manifest = {
19
19
  "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz"
20
20
  },
21
21
  "dependencies": {
22
- "file-type": "^21.0.0",
23
- "mathjs": "^15.0.0",
22
+ "file-type": "^21.1.0",
23
+ "mathjs": "^15.1.0",
24
24
  "uuid": "^13.0.0"
25
25
  },
26
26
  "devDependencies": {
27
- "@anthropic-ai/sdk": "^0.67.0",
28
- "@anthropic-ai/vertex-sdk": "^0.14.0",
29
27
  "@ffmpeg-installer/ffmpeg": "^1.1.0",
30
28
  "@ffprobe-installer/ffprobe": "^2.1.2",
31
29
  "@google-cloud/speech": "^7.2.1",
32
- "@google-cloud/storage": "^7.17.2",
30
+ "@google-cloud/storage": "^7.17.3",
33
31
  "@google-cloud/vision": "^5.3.4",
34
- "@google/genai": "^1.25.0",
32
+ "@google/genai": "^1.29.0",
35
33
  "@mozilla/readability": "github:mozilla/readability",
36
- "@sentry/node": "^10.20.0",
37
- "@sentry/profiling-node": "^10.20.0",
34
+ "@sentry/node": "^10.25.0",
35
+ "@sentry/profiling-node": "^10.25.0",
38
36
  "acme-client": "^5.4.0",
39
37
  "browserify-fs": "^1.0.0",
40
38
  "buffer": "^6.0.3",
41
39
  "fast-geoip": "^1.1.88",
42
40
  "fluent-ffmpeg": "^2.1.3",
43
41
  "form-data": "^4.0.4",
44
- "ioredis": "^5.8.1",
42
+ "ioredis": "^5.8.2",
45
43
  "js-tiktoken": "^1.0.21",
46
- "jsdom": "^27.0.1",
44
+ "jsdom": "^27.2.0",
47
45
  "lorem-ipsum": "^2.0.8",
48
46
  "mailgun.js": "^12.1.1",
49
- "mailparser": "^3.7.5",
47
+ "mailparser": "^3.9.0",
50
48
  "mime": "^4.1.0",
51
- "mysql2": "^3.15.2",
52
- "node-mailjet": "^6.0.9",
49
+ "mysql2": "^3.15.3",
50
+ "node-mailjet": "^6.0.11",
53
51
  "node-polyfill-webpack-plugin": "^4.1.0",
54
52
  "office-text-extractor": "^3.0.3",
55
- "openai": "^6.5.0",
56
- "pdfjs-dist": "^5.4.296",
53
+ "openai": "^6.8.1",
54
+ "pdfjs-dist": "^5.4.394",
57
55
  "pg": "^8.16.3",
58
56
  "pgvector": "^0.2.1",
59
57
  "ping": "^1.0.0",
60
58
  "process": "^0.11.10",
61
- "puppeteer": "^24.25.0",
59
+ "puppeteer": "^24.29.1",
62
60
  "say": "^0.16.0",
63
61
  "telegraf": "^4.16.3",
64
62
  "telesignsdk": "^3.0.4",
65
63
  "tesseract.js": "^6.0.1",
66
- "twilio": "^5.10.3",
64
+ "twilio": "^5.10.5",
67
65
  "url": "github:Leask/node-url",
68
66
  "webpack-cli": "^6.0.1",
69
67
  "whisper-node": "^1.1.1",
70
- "wrangler": "^4.43.0",
68
+ "wrangler": "^4.47.0",
71
69
  "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz",
72
70
  "youtube-transcript": "^1.2.1"
73
71
  }
package/lib/storage.mjs CHANGED
@@ -208,9 +208,10 @@ const outputFile = async (buffer, options) => {
208
208
  await writeFile(options?.file, buffer, _encoding);
209
209
  return options?.file;
210
210
  }
211
+ const { extension } = await getMime(buffer, options?.filename);
211
212
  return await writeTempFile(buffer, {
212
213
  filename: options?.filename,
213
- encoding: _encoding, suffix: options?.suffix,
214
+ encoding: _encoding, suffix: options?.suffix || extension,
214
215
  });
215
216
  };
216
217
 
@@ -310,17 +311,22 @@ const convert = async (any, options) => {
310
311
  return Object.keys(result).length === 1 ? result.content : result;
311
312
  };
312
313
 
314
+ const getMime = async (buf, filename) => {
315
+ const mimeType = await ignoreErrFunc(() => need('mime-types'));
316
+ const mime = extract(await fileTypeFromBuffer(buf), 'mime')
317
+ || (filename && mimeType?.lookup?.(filename)) || MIME_BINARY;
318
+ return { mime, extension: mimeType?.extension?.(mime) || 'bin' };
319
+ };
320
+
313
321
  const analyzeFile = async (any, options) => {
314
322
  const { meta, content } = await convert(any, { meta: 1, ...options || {} });
315
323
  const hashAlgorithm = options?.hashAlgorithm || defaultAlgorithm;
316
324
  const _hash = hash(content, hashAlgorithm);
317
325
  const filename = options?.filename || (meta ? basename(any) : null) || hash;
318
- const lookup = (await ignoreErrFunc(() => need('mime-types')))?.lookup || voidFunc;
326
+ const { mime } = await getMime(content, filename);
319
327
  return {
320
328
  content, extname: extname(filename).replace(/^\.|\.$/g, ''),
321
- filename, hashAlgorithm, hash: _hash,
322
- mime: extract(await fileTypeFromBuffer(content), 'mime') || lookup(filename) || MIME_BINARY,
323
- size: content.length,
329
+ filename, hashAlgorithm, hash: _hash, mime, size: content.length,
324
330
  };
325
331
  };
326
332
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "1999.1.93",
4
+ "version": "1999.1.96",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -12,7 +12,7 @@
12
12
  "scripts": {
13
13
  "start": "node index.mjs",
14
14
  "debug": "node --inspect --trace-warnings debug.mjs",
15
- "test": "node --inspect --trace-warnings test.mjs",
15
+ "test": "node test.mjs",
16
16
  "updep": "npx npm-check-updates -u && npm install && wget https://raw.githubusercontent.com/Marak/colors.js/master/lib/styles.js -O ./lib/style.cjs && wget https://raw.githubusercontent.com/sindresorhus/cli-boxes/refs/heads/main/boxes.json -O ./lib/boxes.json",
17
17
  "gitsync": "( git commit -am \"Released @ `date`\" || true ) && git pull && git push",
18
18
  "pack": "./node_modules/.bin/webpack-cli --config webpack.config.mjs",
@@ -30,55 +30,53 @@
30
30
  "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz"
31
31
  },
32
32
  "dependencies": {
33
- "file-type": "^21.0.0",
34
- "mathjs": "^15.0.0",
33
+ "file-type": "^21.1.0",
34
+ "mathjs": "^15.1.0",
35
35
  "uuid": "^13.0.0"
36
36
  },
37
37
  "devDependencies": {
38
- "@anthropic-ai/sdk": "^0.67.0",
39
- "@anthropic-ai/vertex-sdk": "^0.14.0",
40
38
  "@ffmpeg-installer/ffmpeg": "^1.1.0",
41
39
  "@ffprobe-installer/ffprobe": "^2.1.2",
42
40
  "@google-cloud/speech": "^7.2.1",
43
- "@google-cloud/storage": "^7.17.2",
41
+ "@google-cloud/storage": "^7.17.3",
44
42
  "@google-cloud/vision": "^5.3.4",
45
- "@google/genai": "^1.25.0",
43
+ "@google/genai": "^1.29.0",
46
44
  "@mozilla/readability": "github:mozilla/readability",
47
- "@sentry/node": "^10.20.0",
48
- "@sentry/profiling-node": "^10.20.0",
45
+ "@sentry/node": "^10.25.0",
46
+ "@sentry/profiling-node": "^10.25.0",
49
47
  "acme-client": "^5.4.0",
50
48
  "browserify-fs": "^1.0.0",
51
49
  "buffer": "^6.0.3",
52
50
  "fast-geoip": "^1.1.88",
53
51
  "fluent-ffmpeg": "^2.1.3",
54
52
  "form-data": "^4.0.4",
55
- "ioredis": "^5.8.1",
53
+ "ioredis": "^5.8.2",
56
54
  "js-tiktoken": "^1.0.21",
57
- "jsdom": "^27.0.1",
55
+ "jsdom": "^27.2.0",
58
56
  "lorem-ipsum": "^2.0.8",
59
57
  "mailgun.js": "^12.1.1",
60
- "mailparser": "^3.7.5",
58
+ "mailparser": "^3.9.0",
61
59
  "mime": "^4.1.0",
62
- "mysql2": "^3.15.2",
63
- "node-mailjet": "^6.0.9",
60
+ "mysql2": "^3.15.3",
61
+ "node-mailjet": "^6.0.11",
64
62
  "node-polyfill-webpack-plugin": "^4.1.0",
65
63
  "office-text-extractor": "^3.0.3",
66
- "openai": "^6.5.0",
67
- "pdfjs-dist": "^5.4.296",
64
+ "openai": "^6.8.1",
65
+ "pdfjs-dist": "^5.4.394",
68
66
  "pg": "^8.16.3",
69
67
  "pgvector": "^0.2.1",
70
68
  "ping": "^1.0.0",
71
69
  "process": "^0.11.10",
72
- "puppeteer": "^24.25.0",
70
+ "puppeteer": "^24.29.1",
73
71
  "say": "^0.16.0",
74
72
  "telegraf": "^4.16.3",
75
73
  "telesignsdk": "^3.0.4",
76
74
  "tesseract.js": "^6.0.1",
77
- "twilio": "^5.10.3",
75
+ "twilio": "^5.10.5",
78
76
  "url": "github:Leask/node-url",
79
77
  "webpack-cli": "^6.0.1",
80
78
  "whisper-node": "^1.1.1",
81
- "wrangler": "^4.43.0",
79
+ "wrangler": "^4.47.0",
82
80
  "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.1/xlsx-0.20.1.tgz",
83
81
  "youtube-transcript": "^1.2.1"
84
82
  }