utilitas 2000.3.47 → 2000.3.48

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/lib/alan.mjs CHANGED
@@ -1398,6 +1398,7 @@ const trimText = async (text, limit = Infinity) => {
1398
1398
  text = ensureString(text, { trim: true });
1399
1399
  let trimmed = false;
1400
1400
  let lastCheck = null;
1401
+ limit = Math.max(limit, 0);
1401
1402
  while ((lastCheck = await countTokens(
1402
1403
  buildTextWithEllipsis(text, trimmed), { fast: true }
1403
1404
  )) > limit) {
package/lib/manifest.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  const manifest = {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.47",
4
+ "version": "2000.3.48",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",
package/lib/rag.mjs CHANGED
@@ -1,6 +1,6 @@
1
+ import { countTokens, trimText } from './alan.mjs';
1
2
  import { convert } from './storage.mjs';
2
3
  import { ensureArray, ensureString, need } from './utilitas.mjs';
3
- import { trimText } from './alan.mjs';
4
4
 
5
5
  const _NEED = ['openai', '@google-cloud/discoveryengine'];
6
6
  const embeddingClients = {};
@@ -12,8 +12,7 @@ const [
12
12
  OPENAI_MODEL_EMBED_SMALL,
13
13
  OPENAI_MODEL_EMBED_LARGE,
14
14
  GOOGLE_MODEL_GEMINI_EMBED,
15
- JINA_MODEL_CLIP_2,
16
- JINA_MODEL_V_3,
15
+ JINA_MODEL_V_4,
17
16
  GOOGLE_MODEL_SEMANTIC_RANKER,
18
17
  ] = [
19
18
  'OPENAI', 'GOOGLE', 'OPENROUTER', 'JINA',
@@ -21,8 +20,7 @@ const [
21
20
  'text-embedding-3-small', // dim: 1536
22
21
  'text-embedding-3-large', // dim: 3072
23
22
  'gemini-embedding-001', // dim: 768(default), 1536, or 3072(google default)
24
- 'jina-clip-v2', // dim: 1024
25
- 'jina-embeddings-v3', // dim: 256‑1024
23
+ 'jina-embeddings-v4', // dim: 256‑2048
26
24
  'semantic-ranker-default@latest',
27
25
  ];
28
26
 
@@ -34,7 +32,7 @@ const PROVIDER_BASE_URL = {
34
32
  const DEFAULT_EMBEDDING_MODELS = {
35
33
  [OPENAI]: OPENAI_MODEL_EMBED_SMALL,
36
34
  [OPENROUTER]: GOOGLE_MODEL_GEMINI_EMBED,
37
- [JINA]: JINA_MODEL_CLIP_2,
35
+ [JINA]: JINA_MODEL_V_4,
38
36
  };
39
37
 
40
38
  const DEFAULT_RERANKER_MODELS = {
@@ -52,29 +50,20 @@ const MODEL_CONFIG = {
52
50
  source: 'google', image: false, maxTokens: 2048,
53
51
  options: { dimensions: 768 },
54
52
  },
55
- [JINA_MODEL_CLIP_2]: {
56
- source: 'jina', image: true, maxTokens: 8192, options: {
57
- task: 'retrieval.query', dimensions: 1024,
58
- normalized: true, embedding_type: 'float',
59
- },
60
- },
61
53
  // Token calculation may be incorrect because its limitation applies to the
62
54
  // entire request rather than individual entries.
63
- [JINA_MODEL_V_3]: {
64
- source: 'jina', image: false, maxTokens: 8192, options: {
65
- task: 'retrieval.query', dimensions: 1024, normalized: true,
66
- late_chunking: true, embedding_type: 'float',
55
+ // https://jina.ai/embeddings
56
+ [JINA_MODEL_V_4]: {
57
+ source: 'jina', image: true, maxTokens: 8192, recordsLimit: 512,
58
+ options: {
59
+ task: 'text-matching', // 'retrieval.query', 'retrieval.passage'
60
+ dimensions: 768, // normalized: true, by default DONT submit
61
+ truncate: true, // late_chunking: true, by default DONT submit
62
+ embedding_type: 'float',
67
63
  },
68
64
  },
69
65
  [GOOGLE_MODEL_SEMANTIC_RANKER]: {
70
- source: 'google', image: false, maxTokens: 8192,
71
- options: {
72
- // task: 'retrieval.query',
73
- // dimensions: 1024,
74
- // normalized: true,
75
- // late_chunking: true,
76
- // embedding_type: 'float',
77
- },
66
+ source: 'google', image: false, maxTokens: 1024, recordsLimit: 200,
78
67
  },
79
68
  };
80
69
 
@@ -122,6 +111,10 @@ const getRerankerClient = (provider) => {
122
111
  };
123
112
 
124
113
  const initEmbedding = async (options = {}) => {
114
+ if (options?.debug) {
115
+ (await need('node:util')).inspect.defaultOptions.depth = null;
116
+ options.logLevel = 'debug';
117
+ }
125
118
  ensureApiKey(options);
126
119
  const provider = ensureEmbeddingProvider(options);
127
120
  const OpenAI = await need('openai');
@@ -215,20 +208,31 @@ const initReranker = async (options = {}) => {
215
208
  return getRerankerClient(provider);
216
209
  };
217
210
 
218
- const rerank = async (query, documents, options = {}) => {
211
+ const rerank = async (query, records, options = {}) => {
219
212
  assert(query, 'Query is required.', 400);
220
- assert(documents?.length, 'Documents are required.', 400);
221
- const records = documents.map((doc, idx) => ({
222
- id: String(idx), content: doc,
223
- }));
213
+ assert(records?.length, 'Records are required.', 400);
224
214
  const { provider, model, client, rankingConfigPath }
225
215
  = getRerankerClient(options?.provider);
216
+ records = records.map((content, id) => Object.isObject(content)
217
+ ? content : { id: String(id), content }).slice(
218
+ 0, MODEL_CONFIG[model]?.recordsLimit || records.length
219
+ );
220
+ const maxTokens = MODEL_CONFIG[model]?.maxTokens || Infinity;
226
221
  let result;
222
+ for (let i in records) {
223
+ records[i].title = await trimText(records[i]?.title || '', maxTokens);
224
+ const titleTokens = await countTokens(records[i].title);
225
+ const availableTokens = maxTokens - titleTokens;
226
+ records[i].content = availableTokens > 0 ? await trimText(
227
+ records[i].content, availableTokens
228
+ ) : '';
229
+ }
227
230
  switch (provider) {
228
231
  case GOOGLE:
229
232
  const request = {
230
233
  model, query, rankingConfig: rankingConfigPath,
231
- records, topN: ~~options?.topN || documents.length,
234
+ records, topN: ~~options?.topN || records.length,
235
+ ...options?.requestOptions || {},
232
236
  };
233
237
  result = (await client.rank(request))?.[0]?.records;
234
238
  break;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "utilitas",
3
3
  "description": "Just another common utility for JavaScript.",
4
- "version": "2000.3.47",
4
+ "version": "2000.3.48",
5
5
  "private": false,
6
6
  "homepage": "https://github.com/Leask/utilitas",
7
7
  "main": "index.mjs",