nodebb-plugin-search-agent 0.0.92 → 0.0.94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/searchHandler.js +118 -41
- package/library.js +2 -2
- package/package.json +1 -1
- package/services/embeddingService.js +438 -90
- package/services/vectorSearchService.js +379 -93
- package/templates/admin/plugins/search-agent.tpl +12 -0
- package/lib/cosineSimilarity.js +0 -42
- package/test/testCosine.js +0 -15
--- package/services/embeddingService.js (0.0.92)
+++ package/services/embeddingService.js (0.0.94)
@@ -3,7 +3,7 @@
 const https = require('https');

 function winston() {
-
+  return require.main.require('winston');
 }

 const OPENAI_EMBEDDINGS_HOSTNAME = 'api.openai.com';
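The only change in this hunk fills in the lazy `winston()` getter. Resolving the logger through `require.main.require` at call time, rather than a top-level `require`, reuses NodeBB's own winston instance instead of bundling a second one. A minimal sketch of the pattern (illustrative, not plugin code):

```js
// Lazy host-app dependency resolution: nothing is resolved at module load.
function winston() {
  // require.main is the host app's entry module, so this returns NodeBB's winston.
  return require.main.require('winston');
}

// Resolved only on first use:
winston().info('[search-agent] plugin loaded');
```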
@@ -11,15 +11,236 @@ const OPENAI_EMBEDDINGS_PATH = '/v1/embeddings';
 const EMBEDDING_MODEL = 'text-embedding-3-small';
 const MAX_RETRIES = 3;
 const RETRY_DELAY_MS = 500;
-
-//
-//
-const MAX_CHARS =
+
+// text-embedding-3-small supports 8,192 tokens.
+// Conservative char limits help avoid token overflows, especially for non-ASCII text.
+const MAX_CHARS = 10000;
+const CHUNK_OVERLAP = 300;
+const MIN_CHUNK_CHARS = 500;
+const TARGET_CHUNK_CHARS = 2200;
+
+// ─── Text cleanup ─────────────────────────────────────────────────────────────
+
+function extractPureText(text) {
+  if (typeof text !== 'string') return '';
+
+  // Remove Markdown images: ![alt](url)
+  let cleaned = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
+
+  // Remove HTML <img> tags
+  cleaned = cleaned.replace(/<img\b[^>]*>/gi, '');
+
+  // Remove links to files/images (common extensions)
+  cleaned = cleaned.replace(
+    /https?:\/\/(\S+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?))(\?\S*)?/gi,
+    ''
+  );
+
+  // Remove Markdown file links: [desc](url.ext)
+  cleaned = cleaned.replace(
+    /\[[^\]]*\]\([^)]*\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)\)/gi,
+    ''
+  );
+
+  // Remove any remaining <a href=...> tags to files
+  cleaned = cleaned.replace(
+    /<a\b[^>]*href=["']?[^"'>]+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)[^>]*>.*?<\/a>/gi,
+    ''
+  );
+
+  // Remove any remaining HTML tags
+  cleaned = cleaned.replace(/<[^>]+>/g, ' ');
+
+  // Remove extra whitespace
+  cleaned = cleaned
+    .replace(/[ \t]+/g, ' ')
+    .replace(/\n{2,}/g, '\n')
+    .trim();
+
+  return cleaned;
+}
+
+function normalizeWhitespace(text) {
+  return text
+    .replace(/\r\n/g, '\n')
+    .replace(/[ \t]+/g, ' ')
+    .replace(/\n{3,}/g, '\n\n')
+    .trim();
+}
+
+function splitIntoBlocks(text) {
+  const normalized = normalizeWhitespace(text);
+
+  const rawBlocks = normalized
+    .split(/\n{2,}|(?=^#{1,6}\s)|(?=^\s*[-*+]\s)|(?=^\s*\d+\.\s)|(?=^\s*>\s)|(?=^```)/gm)
+    .map(block => block.trim())
+    .filter(Boolean);
+
+  return rawBlocks;
+}
+
+function splitLargeBlock(block, maxLen = MAX_CHARS) {
+  if (block.length <= maxLen) return [block];
+
+  const sentences = block.match(/[^.!?\n]+[.!?\n]+|[^.!?\n]+$/g) || [block];
+  const chunks = [];
+  let current = '';
+
+  for (const sentence of sentences) {
+    const s = sentence.trim();
+    if (!s) continue;
+
+    if ((current + ' ' + s).trim().length <= maxLen) {
+      current = current ? `${current} ${s}` : s;
+      continue;
+    }
+
+    if (current) {
+      chunks.push(current);
+      current = '';
+    }
+
+    // Fallback for very long sentence
+    if (s.length > maxLen) {
+      let i = 0;
+      while (i < s.length) {
+        chunks.push(s.slice(i, i + maxLen).trim());
+        i += maxLen;
+      }
+    } else {
+      current = s;
+    }
+  }
+
+  if (current) chunks.push(current);
+
+  return chunks;
+}
+
+function buildOverlapPrefix(prevChunk, overlapChars = CHUNK_OVERLAP) {
+  if (!prevChunk) return '';
+  return prevChunk.slice(Math.max(0, prevChunk.length - overlapChars)).trim();
+}
+
+function splitIntoSemanticChunks(
+  text,
+  {
+    maxLen = MAX_CHARS,
+    targetLen = TARGET_CHUNK_CHARS,
+    minLen = MIN_CHUNK_CHARS,
+    overlap = CHUNK_OVERLAP,
+  } = {}
+) {
+  if (!text) return [];
+  if (text.length <= maxLen) return [text];
+
+  const blocks = splitIntoBlocks(text).flatMap(block => splitLargeBlock(block, maxLen));
+
+  const chunks = [];
+  let current = '';
+
+  for (const block of blocks) {
+    const next = current ? `${current}\n\n${block}` : block;
+
+    if (next.length <= targetLen || current.length < minLen) {
+      if (next.length <= maxLen) {
+        current = next;
+        continue;
+      }
+    }
+
+    if (current) {
+      chunks.push(current.trim());
+    }
+
+    current = block;
+  }
+
+  if (current) {
+    chunks.push(current.trim());
+  }
+
+  const withOverlap = chunks.map((chunk, i) => {
+    if (i === 0) return chunk;
+
+    const prefix = buildOverlapPrefix(chunks[i - 1], overlap);
+    const merged = prefix ? `${prefix}\n\n${chunk}` : chunk;
+
+    return merged.length <= maxLen ? merged : merged.slice(merged.length - maxLen);
+  });
+
+  const finalChunks = [];
+  for (const chunk of withOverlap) {
+    if (
+      finalChunks.length > 0 &&
+      chunk.length < minLen &&
+      finalChunks[finalChunks.length - 1].length + 2 + chunk.length <= maxLen
+    ) {
+      finalChunks[finalChunks.length - 1] += `\n\n${chunk}`;
+    } else {
+      finalChunks.push(chunk);
+    }
+  }
+
+  return finalChunks;
+}
+
+// ─── Vector helpers ───────────────────────────────────────────────────────────
+
+function averageVectors(vectors) {
+  if (!vectors.length) return [];
+  const len = vectors[0].length;
+  const sum = new Array(len).fill(0);
+
+  for (const v of vectors) {
+    for (let i = 0; i < len; i++) {
+      sum[i] += v[i];
+    }
+  }
+
+  return sum.map(x => x / vectors.length);
+}

 function truncate(text) {
-
+  return text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) : text;
 }

+function estimateTokens(str) {
+  const ascii = /^[\x00-\x7F]*$/.test(str);
+  return ascii ? Math.ceil(str.length / 4) : Math.ceil(str.length / 1.5);
+}
+
+// ─── Embedding cache ──────────────────────────────────────────────────────────
+
+const _embedCache = new Map();
+const EMBED_CACHE_MAX = 500;
+
+function getCachedEmbedding(key) {
+  if (!_embedCache.has(key)) return null;
+
+  const value = _embedCache.get(key);
+
+  // Refresh LRU-ish order
+  _embedCache.delete(key);
+  _embedCache.set(key, value);
+
+  return value;
+}
+
+function setCachedEmbedding(key, value) {
+  if (_embedCache.has(key)) {
+    _embedCache.delete(key);
+  }
+
+  while (_embedCache.size >= EMBED_CACHE_MAX) {
+    _embedCache.delete(_embedCache.keys().next().value);
+  }
+
+  _embedCache.set(key, value);
+}
+
+// ─── OpenAI request helpers ───────────────────────────────────────────────────
+
 /**
  * Performs an HTTPS POST request to the OpenAI embeddings endpoint.
  * @param {string} apiKey
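Taken together, the helpers added above form a fixed pipeline: `extractPureText` strips images, file links, and leftover HTML; `splitIntoSemanticChunks` splits on Markdown block boundaries, packs blocks toward `TARGET_CHUNK_CHARS` (never past `MAX_CHARS`), prefixes each later chunk with roughly `CHUNK_OVERLAP` characters of its predecessor, and folds sub-`MIN_CHUNK_CHARS` leftovers into the previous chunk. Both functions are exported at the end of the diff, so the behaviour can be exercised directly. A rough usage sketch, assuming the package's own module path:

```js
const {
  extractPureText,
  splitIntoSemanticChunks,
} = require('nodebb-plugin-search-agent/services/embeddingService');

const post = [
  '# Release notes',
  '',
  '![screenshot](https://example.org/shot.png)',
  '',
  'A long body of prose. '.repeat(700), // ~15,000 chars, over MAX_CHARS
].join('\n');

const pure = extractPureText(post);           // image stripped, whitespace collapsed
const chunks = splitIntoSemanticChunks(pure); // defaults: 10000/2200/500/300

// Every chunk fits within MAX_CHARS, and each chunk after the first repeats
// roughly the last 300 characters of its predecessor, so sentences that
// straddle a boundary remain searchable from both sides.
```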
@@ -27,119 +248,246 @@ function truncate(text) {
  * @returns {Promise<object>} Parsed JSON response body
  */
 function requestEmbeddings(apiKey, input) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  return new Promise((resolve, reject) => {
+    const body = JSON.stringify({ model: EMBEDDING_MODEL, input });
+
+    const options = {
+      hostname: OPENAI_EMBEDDINGS_HOSTNAME,
+      path: OPENAI_EMBEDDINGS_PATH,
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'Authorization': `Bearer ${apiKey}`,
+        'Content-Length': Buffer.byteLength(body),
+      },
+    };
+
+    const req = https.request(options, res => {
+      const chunks = [];
+
+      res.on('data', chunk => chunks.push(chunk));
+
+      res.on('end', () => {
+        let parsed;
+        try {
+          parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
+        } catch (e) {
+          return reject(new Error(`Failed to parse OpenAI response: ${e.message}`));
+        }
+
+        if (res.statusCode >= 400) {
+          const message = (parsed.error && parsed.error.message) || `HTTP ${res.statusCode}`;
+          return reject(new Error(`OpenAI API error: ${message}`));
+        }
+
+        resolve(parsed);
+      });
+    });
+
+    req.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
+    req.write(body);
+    req.end();
+  });
 }

 /**
  * Retries an async operation up to maxRetries times with exponential back-off.
- * @param {Function} fn
+ * @param {Function} fn
  * @param {number} retries
  * @returns {Promise<*>}
  */
 async function withRetry(fn, retries = MAX_RETRIES) {
-
-
-
-
-
-
-
-
-
-
-
-
-
+  let lastError;
+
+  for (let attempt = 1; attempt <= retries; attempt++) {
+    try {
+      return await fn();
+    } catch (err) {
+      lastError = err;
+
+      if (attempt < retries) {
+        winston().warn(
+          `[search-agent] embeddingService: attempt ${attempt} failed (${err.message}), retrying in ${RETRY_DELAY_MS * attempt} ms...`
+        );
+        await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS * attempt));
+      }
+    }
+  }
+
+  throw lastError;
 }

+// ─── Public API ───────────────────────────────────────────────────────────────
+
 /**
  * Converts a single text string into an embedding vector.
  * @param {string} text
  * @returns {Promise<number[]>}
  */
 async function embed(text) {
-
-
-
+  if (typeof text !== 'string' || text.trim() === '') {
+    throw new Error('embed() requires a non-empty string');
+  }
+
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    throw new Error('OPENAI_API_KEY environment variable is not set');
+  }
+
+  const pureText = extractPureText(text);
+  if (!pureText) {
+    throw new Error('embed() received no usable text after filtering');
+  }
+
+  const chunks = splitIntoSemanticChunks(pureText, {
+    maxLen: MAX_CHARS,
+    targetLen: TARGET_CHUNK_CHARS,
+    minLen: MIN_CHUNK_CHARS,
+    overlap: CHUNK_OVERLAP,
+  });
+
+  if (chunks.length === 1) {
+    const safe = truncate(pureText);
+    const cached = getCachedEmbedding(safe);
+    if (cached) {
+      winston().verbose('[search-agent] embeddingService: embedding cache hit');
+      return cached;
+    }
+
+    const tokenCount = estimateTokens(safe);
+    winston().info(
+      `[search-agent] embeddingService: generating embedding for text (${safe.length} chars, ~${tokenCount} tokens)`
+    );
+
+    const response = await withRetry(() => requestEmbeddings(apiKey, safe));
+    const embedding = response.data[0].embedding;
+
+    winston().verbose('[search-agent] embeddingService: embedding generated successfully');
+    setCachedEmbedding(safe, embedding);
+
+    return embedding;
+  }
+
+  winston().info(
+    `[search-agent] embeddingService: splitting long text into ${chunks.length} semantic chunks for embedding`
+  );
+
+  chunks.forEach((chunk, i) => {
+    const tokenCount = estimateTokens(chunk);
+    winston().info(
+      `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${tokenCount} tokens`
+    );
+  });
+
+  const vectors = await embedBatch(chunks);
+  const avg = averageVectors(vectors);
+  const safe = truncate(pureText);

-
-  if (!apiKey) {
-    throw new Error('OPENAI_API_KEY environment variable is not set');
-  }
+  setCachedEmbedding(safe, avg);

-
-  winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
-  const response = await withRetry(() => requestEmbeddings(apiKey, safe));
-  winston().verbose('[search-agent] embeddingService: embedding generated successfully');
-  return response.data[0].embedding;
+  return avg;
 }

 /**
  * Converts an array of text strings into an array of embedding vectors.
- * Texts are sent in
+ * Texts are sent in batched API requests after semantic chunking.
  * @param {string[]} texts
  * @returns {Promise<number[][]>}
  */
 async function embedBatch(texts) {
-
-
-
+  if (!Array.isArray(texts) || texts.length === 0) {
+    throw new Error('embedBatch() requires a non-empty array of strings');
+  }
+
+  const invalid = texts.findIndex(t => typeof t !== 'string' || t.trim() === '');
+  if (invalid !== -1) {
+    throw new Error(`embedBatch() received an empty or non-string value at index ${invalid}`);
+  }
+
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    throw new Error('OPENAI_API_KEY environment variable is not set');
+  }
+
+  const allChunks = [];
+  const chunkMap = [];
+
+  for (const [textIdx, text] of texts.entries()) {
+    const pureText = extractPureText(text);
+
+    if (!pureText) {
+      chunkMap.push({ count: 0 });
+      continue;
+    }
+
+    const chunks = splitIntoSemanticChunks(pureText, {
+      maxLen: MAX_CHARS,
+      targetLen: TARGET_CHUNK_CHARS,
+      minLen: MIN_CHUNK_CHARS,
+      overlap: CHUNK_OVERLAP,
+    });
+
+    chunkMap.push({ count: chunks.length });
+    allChunks.push(...chunks);
+
+    if (chunks.length === 1) {
+      const tokenCount = estimateTokens(chunks[0]);
+      winston().info(
+        `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - 1 chunk, ${chunks[0].length} chars, ~${tokenCount} tokens`
+      );
+    } else {
+      winston().info(
+        `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - ${chunks.length} chunks`
+      );
+      chunks.forEach((chunk, i) => {
+        const tokenCount = estimateTokens(chunk);
+        winston().info(
+          `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${tokenCount} tokens`
+        );
+      });
+    }
+  }
+
+  winston().verbose(
+    `[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`
+  );
+
+  if (allChunks.length === 0) {
+    return chunkMap.map(({ count }) => (count === 0 ? [] : null));
+  }
+
+  const safeChunks = allChunks.map(chunk => truncate(chunk));
+  const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
+
+  winston().verbose(
+    `[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`
+  );

-
-
-
-}
+  const vectors = response.data
+    .sort((a, b) => a.index - b.index)
+    .map(item => item.embedding);

-
-
-    throw new Error('OPENAI_API_KEY environment variable is not set');
-  }
+  const result = [];
+  let idx = 0;

-
-
-
-
+  for (const { count } of chunkMap) {
+    if (count === 0) {
+      result.push([]);
+    } else if (count === 1) {
+      result.push(vectors[idx]);
+      idx += 1;
+    } else {
+      result.push(averageVectors(vectors.slice(idx, idx + count)));
+      idx += count;
+    }
+  }

-
-  return response.data
-    .sort((a, b) => a.index - b.index)
-    .map(item => item.embedding);
+  return result;
 }

-module.exports = {
+module.exports = {
+  embed,
+  embedBatch,
+  extractPureText,
+  splitIntoSemanticChunks,
+};