utilitas 2000.3.47 → 2000.3.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/alan.mjs CHANGED
@@ -1398,6 +1398,7 @@ const trimText = async (text, limit = Infinity) => {
1398
1398
  text = ensureString(text, { trim: true });
1399
1399
  let trimmed = false;
1400
1400
  let lastCheck = null;
1401
+ limit = Math.max(limit, 0);
1401
1402
  while ((lastCheck = await countTokens(
1402
1403
  buildTextWithEllipsis(text, trimmed), { fast: true }
1403
1404
  )) > limit) {
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  const manifest = {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.47",
4
+ "version": "2000.3.49",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
package/lib/rag.mjs CHANGED
@@ -1,6 +1,6 @@
1
- import { convert } from './storage.mjs';
1
+ import { BASE64, convert } from './storage.mjs';
2
+ import { countTokens, trimText } from './alan.mjs';
2
3
  import { ensureArray, ensureString, need } from './utilitas.mjs';
3
- import { trimText } from './alan.mjs';
4
4
 
5
5
  const _NEED = ['openai', '@google-cloud/discoveryengine'];
6
6
  const embeddingClients = {};
@@ -12,18 +12,18 @@ const [
12
12
  OPENAI_MODEL_EMBED_SMALL,
13
13
  OPENAI_MODEL_EMBED_LARGE,
14
14
  GOOGLE_MODEL_GEMINI_EMBED,
15
- JINA_MODEL_CLIP_2,
16
- JINA_MODEL_V_3,
15
+ JINA_MODEL_V_4,
17
16
  GOOGLE_MODEL_SEMANTIC_RANKER,
17
+ JINA_MODEL_RERANKER_M0,
18
18
  ] = [
19
19
  'OPENAI', 'GOOGLE', 'OPENROUTER', 'JINA',
20
20
  'global', 'default_ranking_config',
21
21
  'text-embedding-3-small', // dim: 1536
22
22
  'text-embedding-3-large', // dim: 3072
23
23
  'gemini-embedding-001', // dim: 768(default), 1536, or 3072(google default)
24
- 'jina-clip-v2', // dim: 1024
25
- 'jina-embeddings-v3', // dim: 256‑1024
24
+ 'jina-embeddings-v4', // dim: 256‑2048
26
25
  'semantic-ranker-default@latest',
26
+ 'jina-reranker-m0',
27
27
  ];
28
28
 
29
29
  const PROVIDER_BASE_URL = {
@@ -34,11 +34,12 @@ const PROVIDER_BASE_URL = {
34
34
  const DEFAULT_EMBEDDING_MODELS = { // maps a provider key to an embedding model constant (presumably the per-provider default — usage is outside this hunk)
35
35
  [OPENAI]: OPENAI_MODEL_EMBED_SMALL,
36
36
  [OPENROUTER]: GOOGLE_MODEL_GEMINI_EMBED,
37
- [JINA]: JINA_MODEL_CLIP_2,
37
+ [JINA]: JINA_MODEL_V_4,
38
38
  };
39
39
 
40
40
  const DEFAULT_RERANKER_MODELS = { // provider key -> reranker model used by initReranker when options.model is not set
41
41
  [GOOGLE]: GOOGLE_MODEL_SEMANTIC_RANKER,
42
+ [JINA]: JINA_MODEL_RERANKER_M0,
42
43
  };
43
44
 
44
45
  const MODEL_CONFIG = {
@@ -52,29 +53,25 @@ const MODEL_CONFIG = {
52
53
  source: 'google', image: false, maxTokens: 2048,
53
54
  options: { dimensions: 768 },
54
55
  },
55
- [JINA_MODEL_CLIP_2]: {
56
- source: 'jina', image: true, maxTokens: 8192, options: {
57
- task: 'retrieval.query', dimensions: 1024,
58
- normalized: true, embedding_type: 'float',
59
- },
60
- },
61
56
  // Token calculation may be incorrect because its limitation applies to the
62
57
  // entire request rather than individual entries.
63
- [JINA_MODEL_V_3]: {
64
- source: 'jina', image: false, maxTokens: 8192, options: {
65
- task: 'retrieval.query', dimensions: 1024, normalized: true,
66
- late_chunking: true, embedding_type: 'float',
58
+ // https://jina.ai/embeddings
59
+ [JINA_MODEL_V_4]: {
60
+ source: 'jina', image: true, maxTokens: 8192, recordsLimit: 512,
61
+ options: {
62
+ task: 'text-matching', // 'retrieval.query', 'retrieval.passage'
63
+ dimensions: 768, // normalized: true, by default DONT submit
64
+ truncate: true, // late_chunking: true, by default DONT submit
65
+ embedding_type: 'float',
67
66
  },
68
67
  },
69
68
  [GOOGLE_MODEL_SEMANTIC_RANKER]: {
70
- source: 'google', image: false, maxTokens: 8192,
71
- options: {
72
- // task: 'retrieval.query',
73
- // dimensions: 1024,
74
- // normalized: true,
75
- // late_chunking: true,
76
- // embedding_type: 'float',
77
- },
69
+ source: 'google', image: false, maxTokens: 1024, recordsLimit: 200,
70
+ options: { ignoreRecordDetailsInResponse: true },
71
+ },
72
+ [JINA_MODEL_RERANKER_M0]: {
73
+ source: 'jina', image: true, maxTokens: 1024, recordsLimit: 200,
74
+ options: { return_documents: false },
78
75
  },
79
76
  };
80
77
 
@@ -122,6 +119,10 @@ const getRerankerClient = (provider) => {
122
119
  };
123
120
 
124
121
  const initEmbedding = async (options = {}) => {
122
+ if (options?.debug) {
123
+ (await need('node:util')).inspect.defaultOptions.depth = null;
124
+ options.logLevel = 'debug';
125
+ }
125
126
  ensureApiKey(options);
126
127
  const provider = ensureEmbeddingProvider(options);
127
128
  const OpenAI = await need('openai');
@@ -154,7 +155,7 @@ const embed = async (input, options = {}) => {
154
155
  );
155
156
  if (options?.input) {
156
157
  x.image = await convert(
157
- x.image, { ...options, expected: 'base64' }
158
+ x.image, { ...options, expected: BASE64 }
158
159
  );
159
160
  }
160
161
  }
@@ -188,6 +189,7 @@ const embed = async (input, options = {}) => {
188
189
 
189
190
  const initReranker = async (options = {}) => {
190
191
  const provider = ensureRerankerProvider(options);
192
+ const model = options?.model || DEFAULT_RERANKER_MODELS[provider];
191
193
  switch (provider) {
192
194
  case GOOGLE:
193
195
  ensureGoogleCredentials(options);
@@ -202,40 +204,81 @@ const initReranker = async (options = {}) => {
202
204
  };
203
205
  const client = new RankServiceClient(clientOptions);
204
206
  rerankerClients[provider] = {
205
- model: options?.model || DEFAULT_RERANKER_MODELS[provider],
206
- client, rankingConfigPath: client.rankingConfigPath(
207
+ client, model, rankingConfigPath: client.rankingConfigPath(
207
208
  options.projectId, location,
208
209
  options?.rerankerConfigId || GOOGLE_RERANK_CONFIG_ID
209
210
  ),
210
211
  };
211
212
  break;
213
+ case JINA:
214
+ const OpenAI = await need('openai');
215
+ const baseURL = options?.baseURL || PROVIDER_BASE_URL[provider];
216
+ rerankerClients[provider] = {
217
+ client: new OpenAI({ ...options, baseURL }),
218
+ model, source: MODEL_CONFIG[model]?.source,
219
+ };
220
+ break;
212
221
  default:
213
222
  throw new Error(`Unsupported reranker provider: ${provider}`);
214
223
  }
215
224
  return getRerankerClient(provider);
216
225
  };
217
226
 
218
/**
 * Rerank `records` against `query` using the reranker client registered for
 * the requested provider.
 *
 * @param {string} query - Search query; must be truthy.
 * @param {Array<string|object>} records - Candidates. Entries for which
 *   `Object.isObject` is false are wrapped as `{ id: String(position), content }`;
 *   object entries are shallow-copied, and their `id`, `title`, `content` and
 *   `image` fields are the ones read here. At most
 *   `MODEL_CONFIG[model].recordsLimit` leading records are used.
 * @param {object} [options]
 * @param {string} [options.provider] - Forwarded to `getRerankerClient`.
 * @param {number} [options.topN] - GOOGLE only; defaults to the record count.
 * @param {object} [options.requestOptions] - Merged last into the request body.
 * @param {boolean} [options.raw] - When truthy, provider records are returned
 *   without being normalized to `{ index, score }`.
 * @returns {Promise<Array>} Normalized `{ index, score }` entries (or raw
 *   provider entries) ordered by descending `score`; `[]` when the provider
 *   response carries no records.
 * @throws When `query` or `records` is empty, when no record yields text or an
 *   image (JINA), or when the provider is not supported.
 *
 * NOTE(review): `assert` (called with a third `400` argument) and
 * `Object.isObject` are not defined in this block; presumably both are provided
 * by the project elsewhere — confirm.
 */
const rerank = async (query, records, options = {}) => {
    assert(query, 'Query is required.', 400);
    assert(records?.length, 'Records are required.', 400);
    const { provider, model, client, rankingConfigPath }
        = getRerankerClient(options?.provider);
    const config = MODEL_CONFIG[model];
    const maxTokens = config?.maxTokens || Infinity;
    // Shallow-copy object records so the caller's objects are never mutated by
    // the trimming / image conversion below.
    const prepared = records
        .slice(0, config?.recordsLimit || records.length)
        .map((content, id) => Object.isObject(content)
            ? { ...content } : { id: String(id), content });
    for (const record of prepared) {
        record.title = await trimText(record.title || '', maxTokens);
        // Title and content share one token budget per record.
        const availableTokens = maxTokens - await countTokens(record.title);
        record.content = availableTokens > 0
            ? await trimText(record.content, availableTokens) : '';
        record.image = record.image
            ? await convert(record.image, { ...options, expected: BASE64 })
            : undefined;
    }
    // Model defaults first, caller overrides last.
    const extraOptions = { ...config?.options, ...options?.requestOptions };
    let result;
    switch (provider) {
        case GOOGLE: {
            const body = {
                model, query, rankingConfig: rankingConfigPath,
                records: prepared, topN: ~~options?.topN || prepared.length,
                ...extraOptions,
            };
            // Fall back to [] here so the map/sort below cannot hit undefined.
            result = (await client.rank(body))?.[0]?.records ?? [];
            options?.raw || (result = result.map(x => ({
                index: ~~x.id, score: x.score,
            })));
            break;
        }
        case JINA: {
            // Records with neither text nor image are not sent; remember where
            // each sent document came from so the returned index can be mapped
            // back to the caller's input position.
            const documents = [];
            const positions = [];
            prepared.forEach((x, position) => {
                const text = [x.title, x.content].filter(Boolean).join('\n');
                const doc = text ? { text } : (x.image ? { image: x.image } : null);
                if (doc) {
                    documents.push(doc);
                    positions.push(position);
                }
            });
            assert(documents.length, 'No valid records found.', 400);
            const body = { model, query, documents, ...extraOptions };
            result = (await client.post('/rerank', { body }))?.results ?? [];
            options?.raw || (result = result.map(x => ({
                index: positions[~~x.index] ?? ~~x.index,
                score: x.relevance_score,
            })));
            break;
        }
        default:
            throw new Error(`Unsupported reranker provider: ${provider}`);
    }
    return result.sort((a, b) => b.score - a.score);
};
241
284
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.47",
4
+ "version": "2000.3.49",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",