utilitas 2000.3.46 → 2000.3.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -12,7 +12,7 @@ import * as cache from './lib/cache.mjs';
12
12
  import * as callosum from './lib/callosum.mjs';
13
13
  import * as dbio from './lib/dbio.mjs';
14
14
  import * as email from './lib/email.mjs';
15
- import * as embedding from './lib/embedding.mjs';
15
+ import * as rag from './lib/rag.mjs';
16
16
  import * as encryption from './lib/encryption.mjs';
17
17
  import * as event from './lib/event.mjs';
18
18
  import * as media from './lib/media.mjs';
@@ -38,9 +38,9 @@ export {
38
38
  // dependencies
39
39
  fileType, math, uuid,
40
40
  // features
41
- alan, bee, bot, boxes, cache, callosum, color, dbio, email, embedding,
42
- encryption, event, manifest, media, memory, network, sentinel, shell, sms,
43
- speech, ssl, storage, tape, uoid, utilitas, vision, web
41
+ alan, bee, bot, boxes, cache, callosum, color, dbio, email, rag, encryption,
42
+ event, manifest, media, memory, network, sentinel, shell, sms, speech, ssl,
43
+ storage, tape, uoid, utilitas, vision, web
44
44
  };
45
45
 
46
46
  if (utilitas.inBrowser() && !globalThis.utilitas) {
package/lib/alan.mjs CHANGED
@@ -1398,6 +1398,7 @@ const trimText = async (text, limit = Infinity) => {
1398
1398
  text = ensureString(text, { trim: true });
1399
1399
  let trimmed = false;
1400
1400
  let lastCheck = null;
1401
+ limit = Math.max(limit, 0);
1401
1402
  while ((lastCheck = await countTokens(
1402
1403
  buildTextWithEllipsis(text, trimmed), { fast: true }
1403
1404
  )) > limit) {
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  const manifest = {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.46",
4
+ "version": "2000.3.48",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -26,6 +26,7 @@ const manifest = {
26
26
  "devDependencies": {
27
27
  "@ffmpeg-installer/ffmpeg": "^1.1.0",
28
28
  "@ffprobe-installer/ffprobe": "^2.1.2",
29
+ "@google-cloud/discoveryengine": "^2.5.2",
29
30
  "@google-cloud/storage": "^7.18.0",
30
31
  "@google/genai": "^1.31.0",
31
32
  "@mozilla/readability": "github:mozilla/readability",
package/lib/rag.mjs ADDED
@@ -0,0 +1,252 @@
1
+ import { countTokens, trimText } from './alan.mjs';
2
+ import { convert } from './storage.mjs';
3
+ import { ensureArray, ensureString, need } from './utilitas.mjs';
4
+
5
// Names of the packages this module loads on demand through `need()`.
const _NEED = ['openai', '@google-cloud/discoveryengine'];

// Registries of initialized clients, keyed by provider identifier.
const embeddingClients = {};
const rerankerClients = {};

// Provider identifiers.
const OPENAI = 'OPENAI';
const GOOGLE = 'GOOGLE';
const OPENROUTER = 'OPENROUTER';
const JINA = 'JINA';

// Google reranker defaults.
const GOOGLE_DEFAULT_LOCATION = 'global';
const GOOGLE_RERANK_CONFIG_ID = 'default_ranking_config';

// Model identifiers.
const OPENAI_MODEL_EMBED_SMALL = 'text-embedding-3-small'; // dim: 1536
const OPENAI_MODEL_EMBED_LARGE = 'text-embedding-3-large'; // dim: 3072
// dim: 768(default), 1536, or 3072(google default)
const GOOGLE_MODEL_GEMINI_EMBED = 'gemini-embedding-001';
const JINA_MODEL_V_4 = 'jina-embeddings-v4'; // dim: 256-2048
const GOOGLE_MODEL_SEMANTIC_RANKER = 'semantic-ranker-default@latest';

// Base URL overrides; providers without an entry get no override from here.
const PROVIDER_BASE_URL = {
    [OPENROUTER]: 'https://openrouter.ai/api/v1',
    [JINA]: 'https://api.jina.ai/v1/',
};

// Model used when an embedding provider is initialized without `model`.
// This table also defines which embedding providers are accepted.
const DEFAULT_EMBEDDING_MODELS = {
    [OPENAI]: OPENAI_MODEL_EMBED_SMALL,
    [OPENROUTER]: GOOGLE_MODEL_GEMINI_EMBED,
    [JINA]: JINA_MODEL_V_4,
};

// Model used when a reranker provider is initialized without `model`.
// This table also defines which reranker providers are accepted.
const DEFAULT_RERANKER_MODELS = {
    [GOOGLE]: GOOGLE_MODEL_SEMANTIC_RANKER,
};

// Per-model settings:
//   source       - vendor prefix used to build `source/model` ids on OpenRouter
//   image        - whether image inputs are accepted
//   maxTokens    - token budget texts are trimmed to
//   recordsLimit - maximum number of records kept per rerank request
//   options      - default request-body fields merged into embedding requests
const MODEL_CONFIG = {
    [OPENAI_MODEL_EMBED_SMALL]: {
        source: 'openai',
        image: false,
        maxTokens: 8192,
    },
    [OPENAI_MODEL_EMBED_LARGE]: {
        source: 'openai',
        image: false,
        maxTokens: 8192,
    },
    [GOOGLE_MODEL_GEMINI_EMBED]: {
        source: 'google',
        image: false,
        maxTokens: 2048,
        options: { dimensions: 768 },
    },
    // Token calculation may be incorrect because its limitation applies to the
    // entire request rather than individual entries.
    // https://jina.ai/embeddings
    [JINA_MODEL_V_4]: {
        source: 'jina',
        image: true,
        maxTokens: 8192,
        recordsLimit: 512,
        options: {
            // alternatives: 'retrieval.query', 'retrieval.passage'
            task: 'text-matching',
            // `normalized: true` is the default; do not submit it
            dimensions: 768,
            // `late_chunking: true` is the default; do not submit it
            truncate: true,
            embedding_type: 'float',
        },
    },
    [GOOGLE_MODEL_SEMANTIC_RANKER]: {
        source: 'google',
        image: false,
        maxTokens: 1024,
        recordsLimit: 200,
    },
};
69
+
70
/**
 * Validate the embedding provider named in `options` and return it.
 *
 * The name is passed through `ensureString(..., { case: 'UP' })` (presumably
 * upper-casing it -- confirm in utilitas.mjs) and must be a key of
 * `DEFAULT_EMBEDDING_MODELS`. On success the normalized name is written back
 * to `options.provider`.
 *
 * @param {object} options - Caller options; `options.provider` is read.
 * @returns {string} The normalized provider name.
 * @throws Whatever `assert` raises (message 'Embedding provider is
 *     required.', status 400) when the provider is missing or unknown.
 *     `assert` is not imported in this file; presumably a project-wide helper.
 */
const ensureEmbeddingProvider = (options) => {
    // Validate first: writing to `options` before the check would raise a
    // TypeError for nullish options instead of the intended 400 error.
    const provider = ensureString(options?.provider, { case: 'UP' });
    assert(
        DEFAULT_EMBEDDING_MODELS[provider],
        'Embedding provider is required.', 400
    );
    options.provider = provider;
    return provider;
};
78
+
79
/**
 * Validate the reranker provider named in `options` and return it.
 *
 * The name is passed through `ensureString(..., { case: 'UP' })` (presumably
 * upper-casing it -- confirm in utilitas.mjs) and must be a key of
 * `DEFAULT_RERANKER_MODELS`. On success the normalized name is written back
 * to `options.provider`.
 *
 * @param {object} options - Caller options; `options.provider` is read.
 * @returns {string} The normalized provider name.
 * @throws Whatever `assert` raises (message 'Reranker provider is required.',
 *     status 400) when the provider is missing or unknown. `assert` is not
 *     imported in this file; presumably a project-wide helper.
 */
const ensureRerankerProvider = (options) => {
    // Validate first: writing to `options` before the check would raise a
    // TypeError for nullish options instead of the intended 400 error.
    const provider = ensureString(options?.provider, { case: 'UP' });
    assert(
        DEFAULT_RERANKER_MODELS[provider],
        'Reranker provider is required.', 400
    );
    options.provider = provider;
    return provider;
};
87
+
88
/**
 * Return the API key carried by `options`, failing when it is absent.
 *
 * @param {object} options - Caller options; `options.apiKey` must be truthy.
 * @returns {*} The value of `options.apiKey`.
 * @throws Whatever `assert` raises (message 'API key is required.',
 *     status 400) when the key is missing or `options` is nullish.
 */
const ensureApiKey = (options) => {
    const { apiKey } = options ?? {};
    assert(apiKey, 'API key is required.', 400);
    return apiKey;
};
92
+
93
/**
 * Check that `options` carries the fields needed to talk to Google.
 *
 * `googleCredentials` is checked before `projectId`, so a call missing both
 * reports the credentials error.
 *
 * @param {object} options - Caller options.
 * @returns {object} The same `options` object, unchanged.
 * @throws Whatever `assert` raises (status 400) for the first missing field.
 */
const ensureGoogleCredentials = (options) => {
    const requiredFields = [
        ['googleCredentials', 'Google credentials are required.'],
        ['projectId', 'Google project ID is required.'],
    ];
    for (const [field, message] of requiredFields) {
        assert(options?.[field], message, 400);
    }
    return options;
};
98
+
99
/**
 * Look up an initialized embedding client.
 *
 * @param {string} [provider] - Provider name. When empty, the first provider
 *     registered in `embeddingClients` is used.
 * @returns {object} A shallow copy of the registry entry (`client`, `model`,
 *     `source`) plus the resolved `provider` name.
 * @throws Whatever `assert` raises (status 500) when no provider can be
 *     resolved, or when the named provider has not been initialized.
 */
const getEmbeddingClient = (provider) => {
    const name = ensureString(provider, { case: 'UP' })
        || Object.keys(embeddingClients)[0];
    assert(name, 'No embedding provider has been initialized.', 500);
    const entry = embeddingClients[name];
    // Fail here with a clear message; otherwise callers receive an entry
    // without `client` and crash later on a property access of undefined.
    assert(
        entry, `Embedding provider has not been initialized: ${name}`, 500
    );
    return { ...entry, provider: name };
};
105
+
106
/**
 * Look up an initialized reranker client.
 *
 * @param {string} [provider] - Provider name. When empty, the first provider
 *     registered in `rerankerClients` is used.
 * @returns {object} A shallow copy of the registry entry (`model`, `client`,
 *     `rankingConfigPath`) plus the resolved `provider` name.
 * @throws Whatever `assert` raises (status 500) when no provider can be
 *     resolved, or when the named provider has not been initialized.
 */
const getRerankerClient = (provider) => {
    const name = ensureString(provider, { case: 'UP' })
        || Object.keys(rerankerClients)[0];
    assert(name, 'No reranker provider has been initialized.', 500);
    const entry = rerankerClients[name];
    // Fail here with a clear message; otherwise callers receive an entry
    // without `client` and crash later on a property access of undefined.
    assert(
        entry, `Reranker provider has not been initialized: ${name}`, 500
    );
    return { ...entry, provider: name };
};
112
+
113
/**
 * Create and register an embedding client for one provider.
 *
 * All providers are driven through the `openai` package, pointed at the
 * provider's base URL when `PROVIDER_BASE_URL` has one.
 *
 * @param {object} [options] - Spread into the OpenAI constructor. Read here:
 *     `apiKey` (required), `provider` (required), `baseURL`, `model`, `debug`.
 *     With `debug` set, `options.logLevel` is set to 'debug' and the
 *     process-wide `util.inspect` default depth is set to null.
 * @returns {Promise<object>} The registered entry as returned by
 *     `getEmbeddingClient` (`client`, `model`, `source`, `provider`).
 * @throws Whatever `ensureApiKey` / `ensureEmbeddingProvider` raise for
 *     missing or unknown values.
 */
const initEmbedding = async (options = {}) => {
    if (options?.debug) {
        const util = await need('node:util');
        util.inspect.defaultOptions.depth = null;
        options.logLevel = 'debug';
    }
    ensureApiKey(options);
    const provider = ensureEmbeddingProvider(options);
    const OpenAI = await need('openai');
    const baseURL = options?.baseURL || PROVIDER_BASE_URL[provider];
    const model = options?.model || DEFAULT_EMBEDDING_MODELS[provider];
    const client = new OpenAI({ ...options, baseURL });
    const source = MODEL_CONFIG[model]?.source;
    embeddingClients[provider] = { client, model, source };
    return getEmbeddingClient(provider);
};
129
+
130
/**
 * Compute embeddings for one input or an array of inputs.
 *
 * Each input is either a bare value (treated as `{ text: value }`) or an
 * object with exactly one key, `text` or `image`. Texts are trimmed to the
 * model's `maxTokens`. Images are accepted only by models flagged `image` in
 * `MODEL_CONFIG`, and are converted to base64 through `convert` only when
 * `options.input` is set.
 *
 * @param {*|Array} input - A single input or an array of inputs.
 * @param {object} [options]
 * @param {string} [options.provider] - Provider to use; defaults to the first
 *     initialized one.
 * @param {string} [options.model] - Overrides the client's model.
 * @param {string} [options.source] - OpenRouter only: vendor prefix for the
 *     model id.
 * @param {object} [options.requestOptions] - Extra request-body fields; these
 *     win over the model defaults.
 * @param {boolean} [options.raw] - Return the provider response unchanged.
 * @returns {Promise<*>} An array of vectors for array input, one vector for a
 *     single input, or the raw response when `options.raw` is set.
 * @throws Status-400 assertion errors for invalid input, status 500 when the
 *     provider returns no data, and `Error` for an unsupported provider.
 */
const embed = async (input, options = {}) => {
    const {
        client, model: selectedModel, provider, source: selectedSource,
    } = getEmbeddingClient(options?.provider);
    const model = options?.model || selectedModel;
    const config = MODEL_CONFIG[model];
    const multiple = Array.isArray(input);
    let entries = await Promise.all(ensureArray(input).map(async (item) => {
        // `Object.isObject` is not standard JS; presumably a project-wide
        // extension. Objects are copied so trimming and conversion never
        // modify caller-owned data.
        const entry = Object.isObject(item) ? { ...item } : { text: item };
        assert(
            Object.keys(entry).length === 1,
            'Only one type of input is allowed at a time.', 400
        );
        if (entry.text) {
            entry.text = await trimText(entry.text, config?.maxTokens);
        } else if (entry.image) {
            assert(
                config?.image,
                `Model ${model} does not support image embeddings.`, 400
            );
            if (options?.input) {
                entry.image = await convert(
                    entry.image, { ...options, expected: 'base64' }
                );
            }
        }
        return entry;
    }));
    // Text-only models receive plain strings, not `{ text }` objects.
    if (!config?.image) { entries = entries.map((entry) => entry.text); }
    assert(entries.length, 'Input is required.', 400);
    const body = {
        model, input: entries, ...config?.options || {},
        ...options?.requestOptions || {},
    };
    let resp = null;
    switch (provider) {
        case JINA:
            resp = await client.post('/embeddings', { body });
            break;
        case OPENROUTER:
        case OPENAI:
            if (provider === OPENROUTER) {
                // OpenRouter model ids are `source/model`. The source must
                // follow the model actually requested: the client's stored
                // source describes only the model it was initialized with and
                // would mis-prefix a per-call model override.
                const source = options?.source
                    || MODEL_CONFIG[body.model]?.source
                    || (body.model === selectedModel ? selectedSource : '');
                body.model = source ? `${source}/${body.model}` : body.model;
            }
            resp = await client.embeddings.create(body);
            break;
        default:
            throw new Error(`Unsupported embedding provider: ${provider}`);
    }
    assert(resp?.data?.length, 'No embeddings returned.', 500);
    if (options?.raw) { return resp; }
    const vectors = resp.data.map((item) => item.embedding);
    return multiple ? vectors : vectors[0];
};
181
+
182
/**
 * Create and register a reranker client for one provider.
 *
 * Only the Google provider is implemented; it uses `RankServiceClient` from
 * `@google-cloud/discoveryengine`.
 *
 * @param {object} [options]
 * @param {string} options.provider - Reranker provider name.
 * @param {string} options.googleCredentials - Passed to the client as
 *     `keyFilename`.
 * @param {string} options.projectId - Google project id.
 * @param {string} [options.location] - Defaults to 'global'.
 * @param {string} [options.apiEndpoint] - Explicit endpoint; wins over the
 *     one derived from `location`.
 * @param {string} [options.model] - Defaults to the provider's default model.
 * @param {string} [options.rerankerConfigId] - Defaults to
 *     'default_ranking_config'.
 * @returns {Promise<object>} The registered entry as returned by
 *     `getRerankerClient` (`model`, `client`, `rankingConfigPath`,
 *     `provider`).
 * @throws Status-400 assertion errors for missing options, and `Error` for an
 *     unsupported provider.
 */
const initReranker = async (options = {}) => {
    const provider = ensureRerankerProvider(options);
    switch (provider) {
        // Braces keep the `const` declarations scoped to this case.
        case GOOGLE: {
            ensureGoogleCredentials(options);
            const { RankServiceClient } = await need(
                '@google-cloud/discoveryengine', { raw: true }
            );
            const location = options?.location || GOOGLE_DEFAULT_LOCATION;
            // Google's client samples use the bare host for the `global`
            // location and a `<location>-` prefixed host otherwise.
            const apiEndpoint = options?.apiEndpoint || (
                location === 'global'
                    ? 'discoveryengine.googleapis.com'
                    : `${location}-discoveryengine.googleapis.com`
            );
            const client = new RankServiceClient({
                apiEndpoint, keyFilename: options.googleCredentials,
            });
            rerankerClients[provider] = {
                model: options?.model || DEFAULT_RERANKER_MODELS[provider],
                client, rankingConfigPath: client.rankingConfigPath(
                    options.projectId, location,
                    options?.rerankerConfigId || GOOGLE_RERANK_CONFIG_ID
                ),
            };
            break;
        }
        default:
            throw new Error(`Unsupported reranker provider: ${provider}`);
    }
    return getRerankerClient(provider);
};
210
+
211
/**
 * Rank `records` by relevance to `query` using an initialized reranker.
 *
 * Records may be plain values (wrapped as `{ id: <index>, content }`) or
 * objects passed through as given. At most the model's `recordsLimit` records
 * are sent. Each record's `title` and `content` are trimmed so that together
 * they fit the model's `maxTokens`; the title is budgeted first.
 *
 * @param {string} query - Search query; must be truthy.
 * @param {Array} records - Non-empty array of records.
 * @param {object} [options]
 * @param {string} [options.provider] - Provider to use; defaults to the first
 *     initialized one.
 * @param {number} [options.topN] - Number of results requested; defaults to
 *     the number of records sent.
 * @param {object} [options.requestOptions] - Extra request fields; these win
 *     over the defaults built here.
 * @returns {Promise<Array>} The `records` of the first element of the rank
 *     response, or an empty array when there are none.
 * @throws Status-400 assertion errors for a missing query or records, and
 *     `Error` for an unsupported provider.
 */
const rerank = async (query, records, options = {}) => {
    assert(query, 'Query is required.', 400);
    assert(records?.length, 'Records are required.', 400);
    const { provider, model, client, rankingConfigPath }
        = getRerankerClient(options?.provider);
    const config = MODEL_CONFIG[model];
    const maxTokens = config?.maxTokens || Infinity;
    const prepared = await Promise.all(records.slice(
        0, config?.recordsLimit || records.length
    ).map(async (record, index) => {
        // `Object.isObject` is not standard JS; presumably a project-wide
        // extension. Objects are copied so trimming never modifies
        // caller-owned records.
        const entry = Object.isObject(record)
            ? { ...record } : { id: String(index), content: record };
        entry.title = await trimText(entry.title || '', maxTokens);
        const availableTokens = maxTokens - await countTokens(entry.title);
        entry.content = availableTokens > 0
            ? await trimText(entry.content, availableTokens) : '';
        return entry;
    }));
    let result;
    switch (provider) {
        // Braces keep the `const` declaration scoped to this case.
        case GOOGLE: {
            const request = {
                model, query, rankingConfig: rankingConfigPath,
                records: prepared,
                topN: ~~options?.topN || prepared.length,
                ...options?.requestOptions || {},
            };
            result = (await client.rank(request))?.[0]?.records;
            break;
        }
        default:
            throw new Error(`Unsupported reranker provider: ${provider}`);
    }
    return result || [];
};
245
+
246
+ export {
247
+ _NEED,
248
+ embed,
249
+ initEmbedding,
250
+ initReranker,
251
+ rerank,
252
+ };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.46",
4
+ "version": "2000.3.48",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -37,6 +37,7 @@
37
37
  "devDependencies": {
38
38
  "@ffmpeg-installer/ffmpeg": "^1.1.0",
39
39
  "@ffprobe-installer/ffprobe": "^2.1.2",
40
+ "@google-cloud/discoveryengine": "^2.5.2",
40
41
  "@google-cloud/storage": "^7.18.0",
41
42
  "@google/genai": "^1.31.0",
42
43
  "@mozilla/readability": "github:mozilla/readability",
package/lib/embedding.mjs DELETED
@@ -1,160 +0,0 @@
1
- import { convert } from './storage.mjs';
2
- import { ensureArray, ensureString, need } from './utilitas.mjs';
3
- import { trimText } from './alan.mjs';
4
-
5
- const _NEED = ['openai'];
6
- const clients = {};
7
-
8
- const [
9
- OPENAI,
10
- OPENROUTER,
11
- JINA,
12
- OPENAI_MODEL_EMBED_SMALL,
13
- OPENAI_MODEL_EMBED_LARGE,
14
- GOOGLE_MODEL_GEMINI_EMBED,
15
- JINA_MODEL_CLIP_2,
16
- JINA_MODEL_V_3,
17
- ] = [
18
- 'OPENAI',
19
- 'OPENROUTER',
20
- 'JINA',
21
- 'text-embedding-3-small', // dim: 1536
22
- 'text-embedding-3-large', // dim: 3072
23
- 'gemini-embedding-001', // dim: 768(default), 1536, or 3072(google default)
24
- 'jina-clip-v2', // dim: 1024
25
- 'jina-embeddings-v3', // dim: 256‑1024
26
- ];
27
-
28
- const PROVIDER_BASE_URL = {
29
- [OPENROUTER]: 'https://openrouter.ai/api/v1',
30
- [JINA]: 'https://api.jina.ai/v1/',
31
- };
32
-
33
- const DEFAULT_MODELS = {
34
- [OPENAI]: OPENAI_MODEL_EMBED_SMALL,
35
- [OPENROUTER]: GOOGLE_MODEL_GEMINI_EMBED,
36
- [JINA]: JINA_MODEL_CLIP_2,
37
- };
38
-
39
- const MODEL_CONFIG = {
40
- [OPENAI_MODEL_EMBED_SMALL]: { source: 'openai', maxTokens: 8192 },
41
- [OPENAI_MODEL_EMBED_LARGE]: { source: 'openai', maxTokens: 8192 },
42
- [GOOGLE_MODEL_GEMINI_EMBED]: {
43
- source: 'google', maxTokens: 2048, options: { dimensions: 768 },
44
- },
45
- [JINA_MODEL_CLIP_2]: {
46
- maxTokens: 8192,
47
- image: true,
48
- options: {
49
- task: 'retrieval.query',
50
- dimensions: 1024,
51
- normalized: true,
52
- embedding_type: 'float',
53
- },
54
- },
55
- // Token calculation may be incorrect because its limitation applies to the
56
- // entire request rather than individual entries.
57
- [JINA_MODEL_V_3]: {
58
- maxTokens: 8192,
59
- image: false,
60
- options: {
61
- task: 'retrieval.query',
62
- dimensions: 1024,
63
- normalized: true,
64
- late_chunking: true,
65
- embedding_type: 'float',
66
- },
67
- },
68
- };
69
-
70
- const ensureProvider = (options) => {
71
- options.provider = ensureString(options?.provider, { case: 'UP' });
72
- assert(
73
- DEFAULT_MODELS?.[options.provider], 'Provider is required.', 400
74
- );
75
- return options.provider;
76
- };
77
-
78
- const ensureApiKey = (options) => {
79
- assert(options?.apiKey, 'API key is required.', 400);
80
- return options.apiKey;
81
- };
82
-
83
- const getClient = (provider) => {
84
- provider = ensureString(provider, { case: 'UP' })
85
- || Object.keys(clients || {})[0];
86
- assert(provider, 'No embedding provider has been initialized.', 500);
87
- return { ...clients?.[provider], provider };
88
- };
89
-
90
- const init = async (options = {}) => {
91
- ensureApiKey(options);
92
- const provider = ensureProvider(options);
93
- const OpenAI = await need('openai');
94
- const baseURL = options?.baseURL || PROVIDER_BASE_URL[provider];
95
- const model = options?.model || DEFAULT_MODELS[provider];
96
- clients[provider] = {
97
- client: new OpenAI({ ...options, baseURL }),
98
- model, source: MODEL_CONFIG[model]?.source,
99
- };
100
- return getClient(provider);
101
- };
102
-
103
- const embed = async (input, options = {}) => {
104
- let [{ client, model: selectedModel, provider, source }, resp]
105
- = [getClient(options?.provider), null];
106
- const model = options?.model || selectedModel;
107
- const multiple = Array.isArray(input);
108
- input = await Promise.all(ensureArray(input).map(async x => {
109
- x = Object.isObject(x) ? x : { text: x };
110
- assert(
111
- Object.keys(x).length == 1,
112
- 'Only one type of input is allowed at a time.', 400
113
- );
114
- if (x.text) {
115
- x.text = await trimText(x.text, MODEL_CONFIG[model]?.maxTokens);
116
- } else if (x.image) {
117
- assert(
118
- MODEL_CONFIG[model]?.image,
119
- `Model ${model} does not support image embeddings.`, 400
120
- );
121
- if (options?.input) {
122
- x.image = await convert(
123
- x.image, { ...options, expected: 'base64' }
124
- );
125
- }
126
- }
127
- return x;
128
- }));
129
- MODEL_CONFIG[model]?.image || (input = input.map(x => x.text));
130
- assert(input.length, 'Input is required.', 400);
131
- const body = {
132
- model, input, ...MODEL_CONFIG[model]?.options || {},
133
- ...options?.requestOptions || {},
134
- };
135
- switch (provider) {
136
- case JINA:
137
- resp = await client.post('/embeddings', { body });
138
- break;
139
- case OPENROUTER:
140
- source = options?.source || source
141
- || MODEL_CONFIG[body.model]?.source;
142
- body.model = `${source ? `${source}/` : ''}${body.model}`;
143
- case OPENAI:
144
- resp = await client.embeddings.create(body);
145
- break;
146
- default:
147
- throw new Error(`Unsupported provider: ${provider}`);
148
- }
149
- assert(resp?.data?.length, 'No embeddings returned.', 500);
150
- if (options?.raw) { return resp; }
151
- const vectors = resp.data.map(x => x.embedding);
152
- return multiple ? vectors : vectors[0];
153
- };
154
-
155
- export default init;
156
- export {
157
- _NEED,
158
- embed,
159
- init,
160
- };