utilitas 2000.3.45 → 2000.3.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.mjs CHANGED
@@ -12,7 +12,7 @@ import * as cache from './lib/cache.mjs';
12
12
  import * as callosum from './lib/callosum.mjs';
13
13
  import * as dbio from './lib/dbio.mjs';
14
14
  import * as email from './lib/email.mjs';
15
- import * as embedding from './lib/embedding.mjs';
15
+ import * as rag from './lib/rag.mjs';
16
16
  import * as encryption from './lib/encryption.mjs';
17
17
  import * as event from './lib/event.mjs';
18
18
  import * as media from './lib/media.mjs';
@@ -38,9 +38,9 @@ export {
38
38
  // dependencies
39
39
  fileType, math, uuid,
40
40
  // features
41
- alan, bee, bot, boxes, cache, callosum, color, dbio, email, embedding,
42
- encryption, event, manifest, media, memory, network, sentinel, shell, sms,
43
- speech, ssl, storage, tape, uoid, utilitas, vision, web
41
+ alan, bee, bot, boxes, cache, callosum, color, dbio, email, rag, encryption,
42
+ event, manifest, media, memory, network, sentinel, shell, sms, speech, ssl,
43
+ storage, tape, uoid, utilitas, vision, web
44
44
  };
45
45
 
46
46
  if (utilitas.inBrowser() && !globalThis.utilitas) {
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  const manifest = {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.45",
4
+ "version": "2000.3.47",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -26,6 +26,7 @@ const manifest = {
26
26
  "devDependencies": {
27
27
  "@ffmpeg-installer/ffmpeg": "^1.1.0",
28
28
  "@ffprobe-installer/ffprobe": "^2.1.2",
29
+ "@google-cloud/discoveryengine": "^2.5.2",
29
30
  "@google-cloud/storage": "^7.18.0",
30
31
  "@google/genai": "^1.31.0",
31
32
  "@mozilla/readability": "github:mozilla/readability",
package/lib/rag.mjs ADDED
@@ -0,0 +1,248 @@
1
+ import { convert } from './storage.mjs';
2
+ import { ensureArray, ensureString, need } from './utilitas.mjs';
3
+ import { trimText } from './alan.mjs';
4
+
5
// Packages this module loads lazily through need().
const _NEED = ['openai', '@google-cloud/discoveryengine'];

// Client registries keyed by provider id; initEmbedding() and initReranker()
// store their clients here.
const embeddingClients = {};
const rerankerClients = {};

// Provider identifiers (provider names are upper-cased before lookup).
const OPENAI = 'OPENAI';
const GOOGLE = 'GOOGLE';
const OPENROUTER = 'OPENROUTER';
const JINA = 'JINA';

// Defaults used by the Google reranker setup.
const GOOGLE_DEFAULT_LOCATION = 'global';
const GOOGLE_RERANK_CONFIG_ID = 'default_ranking_config';

// Model identifiers.
const OPENAI_MODEL_EMBED_SMALL = 'text-embedding-3-small'; // dim: 1536
const OPENAI_MODEL_EMBED_LARGE = 'text-embedding-3-large'; // dim: 3072
// dim: 768(default), 1536, or 3072(google default)
const GOOGLE_MODEL_GEMINI_EMBED = 'gemini-embedding-001';
const JINA_MODEL_CLIP_2 = 'jina-clip-v2'; // dim: 1024
const JINA_MODEL_V_3 = 'jina-embeddings-v3'; // dim: 256-1024
const GOOGLE_MODEL_SEMANTIC_RANKER = 'semantic-ranker-default@latest';

// Base URLs for providers reached through the OpenAI client; providers
// without an entry get no explicit base URL.
const PROVIDER_BASE_URL = {
    [OPENROUTER]: 'https://openrouter.ai/api/v1',
    [JINA]: 'https://api.jina.ai/v1/',
};

// Model used when initEmbedding() receives no `model` option. The key set
// also defines which embedding providers are accepted.
const DEFAULT_EMBEDDING_MODELS = {
    [OPENAI]: OPENAI_MODEL_EMBED_SMALL,
    [OPENROUTER]: GOOGLE_MODEL_GEMINI_EMBED,
    [JINA]: JINA_MODEL_CLIP_2,
};

// Model used when initReranker() receives no `model` option. The key set
// also defines which reranker providers are accepted.
const DEFAULT_RERANKER_MODELS = {
    [GOOGLE]: GOOGLE_MODEL_SEMANTIC_RANKER,
};

// Per-model settings:
//   source    - vendor prefix used to build OpenRouter model ids
//   image     - whether image inputs are accepted
//   maxTokens - limit handed to trimText() for text inputs
//   options   - extra fields merged into the embedding request body
const MODEL_CONFIG = {
    [OPENAI_MODEL_EMBED_SMALL]: {
        source: 'openai',
        image: false,
        maxTokens: 8192,
    },
    [OPENAI_MODEL_EMBED_LARGE]: {
        source: 'openai',
        image: false,
        maxTokens: 8192,
    },
    [GOOGLE_MODEL_GEMINI_EMBED]: {
        source: 'google',
        image: false,
        maxTokens: 2048,
        options: { dimensions: 768 },
    },
    [JINA_MODEL_CLIP_2]: {
        source: 'jina',
        image: true,
        maxTokens: 8192,
        options: {
            task: 'retrieval.query',
            dimensions: 1024,
            normalized: true,
            embedding_type: 'float',
        },
    },
    // Token calculation may be incorrect because its limitation applies to the
    // entire request rather than individual entries.
    [JINA_MODEL_V_3]: {
        source: 'jina',
        image: false,
        maxTokens: 8192,
        options: {
            task: 'retrieval.query',
            dimensions: 1024,
            normalized: true,
            late_chunking: true,
            embedding_type: 'float',
        },
    },
    [GOOGLE_MODEL_SEMANTIC_RANKER]: {
        source: 'google',
        image: false,
        maxTokens: 8192,
        // Intentionally empty: no request options are defined for the ranker.
        options: {},
    },
};
80
+
81
// Normalizes and validates the embedding provider named in `options`.
//
// The provider name is passed through ensureString() with `case: 'UP'` and
// must be a key of DEFAULT_EMBEDDING_MODELS, otherwise a 400 assertion with
// the message 'Embedding provider is required.' is raised.
//
// On success the normalized name is written back to `options.provider` and
// returned.
const ensureEmbeddingProvider = (options) => {
    const provider = ensureString(options?.provider, { case: 'UP' });
    assert(
        DEFAULT_EMBEDDING_MODELS?.[provider],
        'Embedding provider is required.', 400
    );
    // Write back only after validation, so a missing `options` object is
    // reported through the assertion above instead of a TypeError on the
    // assignment. The guard keeps the write safe in every case.
    if (options) { options.provider = provider; }
    return provider;
};
89
+
90
// Normalizes and validates the reranker provider named in `options`.
//
// The provider name is passed through ensureString() with `case: 'UP'` and
// must be a key of DEFAULT_RERANKER_MODELS, otherwise a 400 assertion with
// the message 'Reranker provider is required.' is raised.
//
// On success the normalized name is written back to `options.provider` and
// returned.
const ensureRerankerProvider = (options) => {
    const provider = ensureString(options?.provider, { case: 'UP' });
    assert(
        DEFAULT_RERANKER_MODELS?.[provider],
        'Reranker provider is required.', 400
    );
    // Write back only after validation, so a missing `options` object (for
    // example initReranker(null)) is reported through the assertion above
    // instead of a TypeError on the assignment.
    if (options) { options.provider = provider; }
    return provider;
};
98
+
99
// Returns `options.apiKey`; raises a 400 assertion when `options` is missing
// or the key is falsy.
const ensureApiKey = (options) => {
    const key = options?.apiKey;
    assert(key, 'API key is required.', 400);
    return key;
};
103
+
104
// Checks that `options` carries both `googleCredentials` and `projectId`
// (in that order), raising a 400 assertion for the first one missing.
// Returns the same `options` object.
const ensureGoogleCredentials = (options) => {
    const required = [
        ['googleCredentials', 'Google credentials are required.'],
        ['projectId', 'Google project ID is required.'],
    ];
    for (const [field, message] of required) {
        assert(options?.[field], message, 400);
    }
    return options;
};
109
+
110
// Looks up the registered embedding client for `provider`.
//
// `provider` is normalized with ensureString() (`case: 'UP'`); when it
// normalizes to a falsy value, the first key of `embeddingClients` is used.
//
// Returns a shallow copy of the registry entry with `provider` added.
// Raises a 500 assertion when nothing has been initialized at all, or when
// the requested provider has no registered client.
const getEmbeddingClient = (provider) => {
    const name = ensureString(provider, { case: 'UP' })
        || Object.keys(embeddingClients || {})[0];
    assert(name, 'No embedding provider has been initialized.', 500);
    const entry = embeddingClients?.[name];
    // Fail here with a clear message rather than letting callers hit a
    // TypeError later when they dereference a missing `client`.
    assert(
        entry?.client,
        `Embedding provider ${name} has not been initialized.`, 500
    );
    return { ...entry, provider: name };
};
116
+
117
// Looks up the registered reranker client for `provider`.
//
// `provider` is normalized with ensureString() (`case: 'UP'`); when it
// normalizes to a falsy value, the first key of `rerankerClients` is used.
//
// Returns a shallow copy of the registry entry with `provider` added.
// Raises a 500 assertion when nothing has been initialized at all, or when
// the requested provider has no registered client.
const getRerankerClient = (provider) => {
    const name = ensureString(provider, { case: 'UP' })
        || Object.keys(rerankerClients || {})[0];
    assert(name, 'No reranker provider has been initialized.', 500);
    const entry = rerankerClients?.[name];
    // Fail here with a clear message rather than letting callers hit a
    // TypeError later when they dereference a missing `client`.
    assert(
        entry?.client,
        `Reranker provider ${name} has not been initialized.`, 500
    );
    return { ...entry, provider: name };
};
123
+
124
// Creates and registers the embedding client for one provider.
//
// Requires `options.apiKey` and a supported `options.provider`. The 'openai'
// package is loaded through need() and instantiated with all of `options`
// plus the resolved base URL (`options.baseURL`, else the provider's entry
// in PROVIDER_BASE_URL). The model is `options.model`, else the provider's
// default.
//
// Returns the registered entry as reported by getEmbeddingClient().
const initEmbedding = async (options = {}) => {
    ensureApiKey(options);
    const provider = ensureEmbeddingProvider(options);
    const OpenAI = await need('openai');
    const model = options?.model || DEFAULT_EMBEDDING_MODELS[provider];
    const client = new OpenAI({
        ...options,
        baseURL: options?.baseURL || PROVIDER_BASE_URL[provider],
    });
    embeddingClients[provider] = {
        client,
        model,
        source: MODEL_CONFIG[model]?.source,
    };
    return getEmbeddingClient(provider);
};
136
+
137
// Computes embeddings for one input or an array of inputs.
//
// input   - a text value, an object with exactly one key (`{ text }` or
//           `{ image }`), or an array of those.
// options - provider:       which initialized client to use (default: the
//                           first one registered)
//           model:          overrides the model chosen at init time
//           source:         overrides the vendor prefix for OpenRouter ids
//           input:          when set, images are run through convert() with
//                           `expected: 'base64'` (all options are forwarded)
//           requestOptions: extra fields merged last into the request body
//           raw:            return the provider response instead of vectors
//
// Returns an array of vectors when `input` was an array, otherwise a single
// vector. Raises 400 assertions for invalid input and a 500 assertion when
// the provider returns no data.
const embed = async (input, options = {}) => {
    const { client, model: selectedModel, provider, source: clientSource }
        = getEmbeddingClient(options?.provider);
    const model = options?.model || selectedModel;
    const config = MODEL_CONFIG[model];
    const multiple = Array.isArray(input);
    const entries = await Promise.all(ensureArray(input).map(async item => {
        // Work on a shallow copy so the caller's objects are never modified
        // by the trimming / conversion below.
        const entry = Object.isObject(item) ? { ...item } : { text: item };
        assert(
            Object.keys(entry).length === 1,
            'Only one type of input is allowed at a time.', 400
        );
        if (entry.text) {
            entry.text = await trimText(entry.text, config?.maxTokens);
        } else if (entry.image) {
            assert(
                config?.image,
                `Model ${model} does not support image embeddings.`, 400
            );
            if (options?.input) {
                entry.image = await convert(
                    entry.image, { ...options, expected: 'base64' }
                );
            }
        }
        return entry;
    }));
    assert(entries.length, 'Input is required.', 400);
    const body = {
        model,
        // Image-capable models take the `{ text }` / `{ image }` objects;
        // all other models take plain text values.
        input: config?.image ? entries : entries.map(x => x.text),
        ...config?.options || {},
        ...options?.requestOptions || {},
    };
    if (provider === OPENROUTER) {
        const id = String(body.model);
        // The source of the model actually being requested wins over the
        // source cached at init time; otherwise overriding `model` per call
        // would be prefixed with the init model's vendor. Ids that already
        // contain a '/' are treated as qualified and left untouched.
        const source = options?.source
            || MODEL_CONFIG[id]?.source || clientSource;
        body.model = source && !id.includes('/') ? `${source}/${id}` : id;
    }
    let resp = null;
    switch (provider) {
        case JINA:
            resp = await client.post('/embeddings', { body });
            break;
        case OPENROUTER:
        case OPENAI:
            resp = await client.embeddings.create(body);
            break;
        default:
            throw new Error(`Unsupported embedding provider: ${provider}`);
    }
    assert(resp?.data?.length, 'No embeddings returned.', 500);
    if (options?.raw) { return resp; }
    const vectors = resp.data.map(x => x.embedding);
    return multiple ? vectors : vectors[0];
};
188
+
189
// Creates and registers the reranker client for one provider.
//
// For GOOGLE, `options.googleCredentials` (used as the client's
// `keyFilename`) and `options.projectId` are required. Optional settings:
//   location         - defaults to GOOGLE_DEFAULT_LOCATION
//   apiEndpoint      - explicit endpoint, overrides the one derived from
//                      `location`
//   model            - defaults to the provider's default reranker model
//   rerankerConfigId - defaults to GOOGLE_RERANK_CONFIG_ID
//
// Returns the registered entry as reported by getRerankerClient().
// Throws for providers without an implementation.
const initReranker = async (options = {}) => {
    const provider = ensureRerankerProvider(options);
    switch (provider) {
        // Braces give the `const` declarations their own scope instead of
        // sharing the whole switch body.
        case GOOGLE: {
            ensureGoogleCredentials(options);
            const { RankServiceClient } = await need(
                '@google-cloud/discoveryengine', { raw: true }
            );
            const location = options?.location || GOOGLE_DEFAULT_LOCATION;
            // Only non-global locations get a location-prefixed host; for
            // 'global' the client library's default endpoint is used. An
            // explicit `options.apiEndpoint` always wins.
            const apiEndpoint = options?.apiEndpoint || (
                location === GOOGLE_DEFAULT_LOCATION
                    ? null : `${location}-discoveryengine.googleapis.com`
            );
            const client = new RankServiceClient({
                ...apiEndpoint ? { apiEndpoint } : {},
                keyFilename: options.googleCredentials,
            });
            rerankerClients[provider] = {
                model: options?.model || DEFAULT_RERANKER_MODELS[provider],
                client, rankingConfigPath: client.rankingConfigPath(
                    options.projectId, location,
                    options?.rerankerConfigId || GOOGLE_RERANK_CONFIG_ID
                ),
            };
            break;
        }
        default:
            throw new Error(`Unsupported reranker provider: ${provider}`);
    }
    return getRerankerClient(provider);
};
217
+
218
// Ranks `documents` against `query` with an initialized reranker.
//
// query     - required (400 assertion when falsy)
// documents - required, non-empty (400 assertion otherwise); each entry is
//             sent as a record whose id is its index as a string
// options   - provider: which initialized reranker to use
//             topN:     number of records requested; falls back to the
//                       number of documents when it truncates to 0
//
// Returns the `records` of the first element of the rank() response, or an
// empty array when there are none. Throws for unsupported providers.
const rerank = async (query, documents, options = {}) => {
    assert(query, 'Query is required.', 400);
    assert(documents?.length, 'Documents are required.', 400);
    const records = documents.map(
        (content, index) => ({ id: String(index), content })
    );
    const { provider, model, client, rankingConfigPath }
        = getRerankerClient(options?.provider);
    // GOOGLE is the only provider implemented here.
    if (provider !== GOOGLE) {
        throw new Error(`Unsupported reranker provider: ${provider}`);
    }
    const response = await client.rank({
        model,
        query,
        rankingConfig: rankingConfigPath,
        records,
        topN: ~~options?.topN || documents.length,
    });
    const ranked = response?.[0]?.records;
    return ranked || [];
};
241
+
242
+ export {
243
+ _NEED,
244
+ embed,
245
+ initEmbedding,
246
+ initReranker,
247
+ rerank,
248
+ };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.45",
4
+ "version": "2000.3.47",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
@@ -37,6 +37,7 @@
37
37
  "devDependencies": {
38
38
  "@ffmpeg-installer/ffmpeg": "^1.1.0",
39
39
  "@ffprobe-installer/ffprobe": "^2.1.2",
40
+ "@google-cloud/discoveryengine": "^2.5.2",
40
41
  "@google-cloud/storage": "^7.18.0",
41
42
  "@google/genai": "^1.31.0",
42
43
  "@mozilla/readability": "github:mozilla/readability",
package/lib/embedding.mjs DELETED
@@ -1,157 +0,0 @@
1
- import { convert } from './storage.mjs';
2
- import { ensureArray, ensureString, need } from './utilitas.mjs';
3
- import { trimText } from './alan.mjs';
4
-
5
- const _NEED = ['openai'];
6
- const clients = {};
7
-
8
- const [
9
- OPENAI,
10
- OPENROUTER,
11
- JINA,
12
- OPENAI_MODEL_EMBED_SMALL,
13
- OPENAI_MODEL_EMBED_LARGE,
14
- GOOGLE_MODEL_GEMINI_EMBED,
15
- JINA_MODEL_CLIP_2,
16
- JINA_MODEL_V_3,
17
- ] = [
18
- 'OPENAI',
19
- 'OPENROUTER',
20
- 'JINA',
21
- 'text-embedding-3-small', // dim: 1536
22
- 'text-embedding-3-large', // dim: 3072
23
- 'gemini-embedding-001', // dim: 768(default), 1536, or 3072(google default)
24
- 'jina-clip-v2', // dim: 1024
25
- 'jina-embeddings-v3', // dim: 256‑1024
26
- ];
27
-
28
- const PROVIDER_BASE_URL = {
29
- [OPENROUTER]: 'https://openrouter.ai/api/v1',
30
- [JINA]: 'https://api.jina.ai/v1/',
31
- };
32
-
33
- const DEFAULT_MODELS = {
34
- [OPENAI]: OPENAI_MODEL_EMBED_SMALL,
35
- [OPENROUTER]: GOOGLE_MODEL_GEMINI_EMBED,
36
- [JINA]: JINA_MODEL_CLIP_2,
37
- };
38
-
39
- const MODEL_CONFIG = {
40
- [OPENAI_MODEL_EMBED_SMALL]: { source: 'openai', maxTokens: 8192 },
41
- [OPENAI_MODEL_EMBED_LARGE]: { source: 'openai', maxTokens: 8192 },
42
- [GOOGLE_MODEL_GEMINI_EMBED]: {
43
- source: 'google', maxTokens: 2048, options: { dimensions: 768 },
44
- },
45
- [JINA_MODEL_CLIP_2]: {
46
- maxTokens: 8192,
47
- image: true,
48
- options: {
49
- task: 'retrieval.query',
50
- dimensions: 1024,
51
- normalized: true,
52
- embedding_type: 'float',
53
- },
54
- },
55
- // Token calculation may be incorrect because its limitation applies to the
56
- // entire request rather than individual entries.
57
- [JINA_MODEL_V_3]: {
58
- maxTokens: 8192,
59
- image: false,
60
- options: {
61
- task: 'retrieval.query',
62
- dimensions: 1024,
63
- normalized: true,
64
- late_chunking: true,
65
- embedding_type: 'float',
66
- },
67
- },
68
- };
69
-
70
- const ensureProvider = (options) => {
71
- options.provider = ensureString(options?.provider, { case: 'UP' });
72
- assert(
73
- DEFAULT_MODELS?.[options.provider], 'Provider is required.', 400
74
- );
75
- return options.provider;
76
- };
77
-
78
- const ensureApiKey = (options) => {
79
- assert(options?.apiKey, 'API key is required.', 400);
80
- return options.apiKey;
81
- };
82
-
83
- const getClient = (provider) => {
84
- provider = ensureString(provider, { case: 'UP' })
85
- || Object.keys(clients || {})[0];
86
- assert(provider, 'No embedding provider has been initialized.', 500);
87
- return { ...clients?.[provider], provider };
88
- };
89
-
90
- const init = async (options = {}) => {
91
- ensureApiKey(options);
92
- const provider = ensureProvider(options);
93
- const OpenAI = await need('openai');
94
- const baseURL = options?.baseURL || PROVIDER_BASE_URL[provider];
95
- const model = options?.model || DEFAULT_MODELS[provider];
96
- clients[provider] = {
97
- client: new OpenAI({ ...options, baseURL }),
98
- model, source: MODEL_CONFIG[model]?.source,
99
- };
100
- return getClient(provider);
101
- };
102
-
103
- const embed = async (input, options = {}) => {
104
- let [{ client, model: selectedModel, provider, source }, resp]
105
- = [getClient(options?.provider), null];
106
- const model = options?.model || selectedModel;
107
- const multiple = Array.isArray(input);
108
- input = await Promise.all(ensureArray(input).map(async x => {
109
- x = Object.isObject(x) ? x : { text: x };
110
- assert(
111
- Object.keys(x).length == 1,
112
- 'Only one type of input is allowed at a time.', 400
113
- );
114
- if (x.text) {
115
- x.text = await trimText(x.text, MODEL_CONFIG[model]?.maxTokens);
116
- } else if (x.image) {
117
- assert(
118
- MODEL_CONFIG[model]?.image,
119
- `Model ${model} does not support image embeddings.`, 400
120
- );
121
- if (options?.input) {
122
- x.image = await convert(
123
- x.image, { ...options, expected: 'base64' }
124
- );
125
- }
126
- }
127
- return x;
128
- }));
129
- MODEL_CONFIG[model]?.image || (input = input.map(x => x.text));
130
- assert(input.length, 'Input is required.', 400);
131
- const body = {
132
- model, input, ...MODEL_CONFIG[model]?.options || {},
133
- ...options?.requestOptions || {},
134
- };
135
- switch (provider) {
136
- case JINA:
137
- resp = await client.post('/embeddings', { body });
138
- break;
139
- case OPENROUTER:
140
- source = options?.source || source
141
- || MODEL_CONFIG[body.model]?.source;
142
- body.model = `${source ? `${source}/` : ''}${body.model}`;
143
- case OPENAI:
144
- resp = await client.embeddings.create(body);
145
- }
146
- assert(resp?.data?.length, 'No embeddings returned.', 500);
147
- if (options?.raw) { return resp; }
148
- const vectors = resp.data.map(x => x.embedding);
149
- return multiple ? vectors : vectors[0];
150
- };
151
-
152
- export default init;
153
- export {
154
- _NEED,
155
- embed,
156
- init,
157
- };