nodebb-plugin-search-agent 0.0.92 → 0.0.94

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  const https = require('https');
4
4
 
5
5
  function winston() {
6
- return require.main.require('winston');
6
+ return require.main.require('winston');
7
7
  }
8
8
 
9
9
  const OPENAI_EMBEDDINGS_HOSTNAME = 'api.openai.com';
@@ -11,15 +11,236 @@ const OPENAI_EMBEDDINGS_PATH = '/v1/embeddings';
11
11
  const EMBEDDING_MODEL = 'text-embedding-3-small';
12
12
  const MAX_RETRIES = 3;
13
13
  const RETRY_DELAY_MS = 500;
14
- // text-embedding-3-small supports 8 192 tokens.
15
- // Hebrew/non-ASCII text tokenizes at ~1.5–2 chars/token (UTF-8 multibyte).
16
- // Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
17
- const MAX_CHARS = 12000;
14
+
15
+ // text-embedding-3-small supports 8,192 tokens.
16
+ // Conservative char limits help avoid token overflows, especially for non-ASCII text.
17
+ const MAX_CHARS = 10000;
18
+ const CHUNK_OVERLAP = 300;
19
+ const MIN_CHUNK_CHARS = 500;
20
+ const TARGET_CHUNK_CHARS = 2200;
21
+
22
+ // ─── Text cleanup ─────────────────────────────────────────────────────────────
23
+
24
// Alternation of file extensions whose links are treated as attachments and removed.
const FILE_EXTENSION_PATTERN =
  'jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?';

// Markdown link whose target ends in a file extension: [desc](url.ext)
const MARKDOWN_FILE_LINK_RE = new RegExp(
  String.raw`\[[^\]]*\]\([^)]*\.(?:${FILE_EXTENSION_PATTERN})\)`,
  'gi'
);

// <a href="...ext">...</a>. The extension must END the path (followed by a quote,
// whitespace, '>', '?' or '#'), so a domain such as example.ai does not count.
// [\s\S]*? lets the link text span several lines.
const ANCHOR_FILE_LINK_RE = new RegExp(
  String.raw`<a\b[^>]*href=["']?[^"'>]+\.(?:${FILE_EXTENSION_PATTERN})(?=["'\s>?#])[^>]*>[\s\S]*?</a>`,
  'gi'
);

// Bare http(s) URL ending in a file extension, with an optional query/fragment.
// The negative look-ahead rejects matches where the "extension" is only a prefix
// of a longer name (.jsonl), a host label (example.ai/about) or a sub-domain.
const BARE_FILE_URL_RE = new RegExp(
  String.raw`https?://\S+\.(?:${FILE_EXTENSION_PATTERN})(?:[?#]\S*)?(?![\w/-]|\.\w)`,
  'gi'
);

/**
 * Strips images, file/attachment links and HTML markup from a post body so that
 * only human-readable text is left for embedding.
 *
 * Structured links (Markdown links, <a> tags) are removed BEFORE bare URLs;
 * otherwise the bare-URL pass would eat the URL inside them and leave residue
 * such as `[desc]()` or the anchor's visible text behind.
 *
 * NOTE: runs of two or more newlines are collapsed to a single newline, so the
 * returned text contains no blank lines.
 *
 * @param {*} text Raw post content; anything that is not a string yields ''.
 * @returns {string} Cleaned, trimmed text (possibly empty).
 */
function extractPureText(text) {
  if (typeof text !== 'string') return '';

  return text
    // Markdown images: ![alt](url)
    .replace(/!\[[^\]]*\]\([^)]*\)/g, '')
    // HTML <img> tags
    .replace(/<img\b[^>]*>/gi, '')
    // Markdown links to files: [desc](url.ext)
    .replace(MARKDOWN_FILE_LINK_RE, '')
    // HTML anchors pointing at files, including their link text
    .replace(ANCHOR_FILE_LINK_RE, '')
    // Bare URLs pointing at files
    .replace(BARE_FILE_URL_RE, '')
    // Any remaining HTML tags become a space so adjacent words stay separated
    .replace(/<[^>]+>/g, ' ')
    // Whitespace cleanup
    .replace(/[ \t]+/g, ' ')
    .replace(/\n{2,}/g, '\n')
    .trim();
}
62
+
63
/**
 * Normalises line endings and whitespace ahead of block splitting.
 *
 * - CRLF and lone CR both become LF.
 * - Runs of spaces/tabs collapse to one space.
 * - Trailing spaces are dropped, so a line holding only whitespace counts as a
 *   blank line (otherwise "a\n \nb" would not be seen as a paragraph break).
 * - Three or more consecutive newlines collapse to exactly two.
 *
 * @param {string} text
 * @returns {string} Normalised, trimmed text.
 */
function normalizeWhitespace(text) {
  return text
    .replace(/\r\n?/g, '\n')
    .replace(/[ \t]+/g, ' ')
    .replace(/ +$/gm, '')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
70
+
71
/**
 * Breaks text into structural blocks after whitespace normalisation.
 *
 * A boundary is a run of two or more newlines, or the position just before a
 * line that starts like a heading (1-6 '#'), a bullet ('-', '*', '+'), a
 * numbered item, a block quote ('>') or a ``` fence.
 *
 * @param {string} text
 * @returns {string[]} Trimmed, non-empty blocks in document order.
 */
function splitIntoBlocks(text) {
  const pieces = normalizeWhitespace(text).split(
    /\n{2,}|(?=^#{1,6}\s)|(?=^\s*[-*+]\s)|(?=^\s*\d+\.\s)|(?=^\s*>\s)|(?=^```)/gm
  );

  const blocks = [];
  for (const piece of pieces) {
    const trimmed = piece.trim();
    if (trimmed) {
      blocks.push(trimmed);
    }
  }

  return blocks;
}
81
+
82
/**
 * Cuts text into pieces of at most `maxLen` UTF-16 units without separating the
 * two halves of a surrogate pair (which would leave invalid lone surrogates).
 * Pieces are trimmed and empty pieces are dropped.
 *
 * @param {string} text
 * @param {number} maxLen Must be >= 1.
 * @returns {string[]}
 */
function sliceOnCodePointBoundaries(text, maxLen) {
  const step = Math.max(1, Math.floor(maxLen));
  const pieces = [];
  let start = 0;

  while (start < text.length) {
    let end = Math.min(start + step, text.length);

    // Back off one unit when the cut would land between a high and a low
    // surrogate. `end - start > 1` guarantees forward progress.
    const cutsPair =
      end < text.length &&
      end - start > 1 &&
      (text.charCodeAt(end - 1) & 0xfc00) === 0xd800 &&
      (text.charCodeAt(end) & 0xfc00) === 0xdc00;
    if (cutsPair) end -= 1;

    const piece = text.slice(start, end).trim();
    if (piece) pieces.push(piece);
    start = end;
  }

  return pieces;
}

/**
 * Splits one block into chunks no longer than `maxLen` characters.
 *
 * Sentences (runs ending in '.', '!', '?' or a newline) are packed greedily,
 * joined by a single space. A sentence that is itself longer than `maxLen` is
 * cut into fixed-size pieces.
 *
 * @param {string} block
 * @param {number} [maxLen=MAX_CHARS] Maximum chunk length; must be >= 1 when the
 *   block needs splitting.
 * @returns {string[]} The block itself when it already fits, otherwise its chunks.
 * @throws {RangeError} If the block needs splitting and `maxLen` is not >= 1
 *   (previously this looped forever).
 */
function splitLargeBlock(block, maxLen = MAX_CHARS) {
  if (block.length <= maxLen) return [block];

  if (!(maxLen >= 1)) {
    throw new RangeError(`splitLargeBlock() requires maxLen >= 1, got ${maxLen}`);
  }

  const sentences = block.match(/[^.!?\n]+[.!?\n]+|[^.!?\n]+$/g) || [block];
  const chunks = [];
  let current = '';

  for (const sentence of sentences) {
    const s = sentence.trim();
    if (!s) continue;

    const candidate = current ? `${current} ${s}` : s;
    if (candidate.length <= maxLen) {
      current = candidate;
      continue;
    }

    // The sentence does not fit: flush what has been packed so far.
    if (current) {
      chunks.push(current);
      current = '';
    }

    if (s.length > maxLen) {
      // Fallback for a single over-long sentence.
      chunks.push(...sliceOnCodePointBoundaries(s, maxLen));
    } else {
      current = s;
    }
  }

  if (current) chunks.push(current);

  return chunks;
}
119
+
120
/**
 * Returns the tail of the previous chunk that is repeated at the start of the
 * next chunk to give it context.
 *
 * The tail is at most `overlapChars` long. It is shortened further so that it
 * never begins with the second half of a surrogate pair and, when the cut fell
 * inside a word and the tail contains whitespace, so that it begins on the next
 * word rather than on a word fragment.
 *
 * @param {string} prevChunk Preceding chunk; a falsy value yields ''.
 * @param {number} [overlapChars=CHUNK_OVERLAP] Maximum tail length in characters.
 * @returns {string} Trimmed overlap text (possibly empty).
 */
function buildOverlapPrefix(prevChunk, overlapChars = CHUNK_OVERLAP) {
  if (!prevChunk) return '';

  let start = Math.max(0, prevChunk.length - overlapChars);
  if (start >= prevChunk.length) return '';

  // Do not start on a low surrogate whose high half was cut off.
  if (
    start > 0 &&
    (prevChunk.charCodeAt(start) & 0xfc00) === 0xdc00 &&
    (prevChunk.charCodeAt(start - 1) & 0xfc00) === 0xd800
  ) {
    start += 1;
  }

  let tail = prevChunk.slice(start);

  // Cut landed mid-word: skip the fragment if a later word boundary exists.
  const cutInsideWord = start > 0 && /\S/.test(prevChunk[start - 1]) && /^\S/.test(tail);
  if (cutInsideWord) {
    const firstGap = tail.search(/\s/);
    if (firstGap !== -1) tail = tail.slice(firstGap);
  }

  return tail.trim();
}
124
+
125
/**
 * Splits long text into chunks along structural boundaries.
 *
 * Text that already fits in `maxLen` is returned as a single chunk. Otherwise:
 *  1. the text is split into blocks, and over-long blocks into sentence groups;
 *  2. blocks are packed greedily, joined by a blank line, until a chunk would
 *     pass `targetLen` (a chunk still under `minLen` keeps growing) and never
 *     beyond `maxLen`;
 *  3. chunks still shorter than `minLen` are folded into their predecessor when
 *     the result fits in `maxLen`;
 *  4. every chunk but the first is prefixed with the tail of the previous one.
 *
 * Step 3 runs before step 4 on purpose: folding a chunk that already carries
 * its overlap prefix would duplicate the predecessor's tail inside one chunk.
 *
 * @param {string} text
 * @param {object} [options]
 * @param {number} [options.maxLen=MAX_CHARS] Hard per-chunk character limit.
 * @param {number} [options.targetLen=TARGET_CHUNK_CHARS] Preferred chunk size.
 * @param {number} [options.minLen=MIN_CHUNK_CHARS] Size below which chunks are merged.
 * @param {number} [options.overlap=CHUNK_OVERLAP] Characters of context carried over.
 * @returns {string[]} Chunks in document order; [] for falsy input.
 */
function splitIntoSemanticChunks(
  text,
  {
    maxLen = MAX_CHARS,
    targetLen = TARGET_CHUNK_CHARS,
    minLen = MIN_CHUNK_CHARS,
    overlap = CHUNK_OVERLAP,
  } = {}
) {
  if (!text) return [];
  if (text.length <= maxLen) return [text];

  const blocks = splitIntoBlocks(text).flatMap(block => splitLargeBlock(block, maxLen));

  // 1-2. Greedy packing of blocks.
  const packed = [];
  let current = '';

  for (const block of blocks) {
    const candidate = current ? `${current}\n\n${block}` : block;
    const wantsMore = candidate.length <= targetLen || current.length < minLen;

    if (wantsMore && candidate.length <= maxLen) {
      current = candidate;
      continue;
    }

    if (current) packed.push(current.trim());
    current = block;
  }

  if (current) packed.push(current.trim());

  // 3. Fold undersized chunks into the previous chunk while it still fits.
  const merged = [];
  for (const chunk of packed) {
    const last = merged.length - 1;
    const fitsInPrevious =
      last >= 0 && chunk.length < minLen && merged[last].length + 2 + chunk.length <= maxLen;

    if (fitsInPrevious) {
      merged[last] = `${merged[last]}\n\n${chunk}`;
    } else {
      merged.push(chunk);
    }
  }

  // 4. Add overlap; if that overflows maxLen, trim from the front so the
  //    chunk's own content is kept and only overlap text is lost.
  return merged.map((chunk, i) => {
    if (i === 0) return chunk;

    const prefix = buildOverlapPrefix(merged[i - 1], overlap);
    const withPrefix = prefix ? `${prefix}\n\n${chunk}` : chunk;

    return withPrefix.length <= maxLen
      ? withPrefix
      : withPrefix.slice(withPrefix.length - maxLen);
  });
}
187
+
188
+ // ─── Vector helpers ───────────────────────────────────────────────────────────
189
+
190
/**
 * Computes the component-wise arithmetic mean of a list of vectors.
 *
 * Empty vectors are ignored: they carry no information, and including them used
 * to turn every component of the result into NaN.
 *
 * @param {number[][]} vectors
 * @returns {number[]} The mean vector, or [] when there is nothing to average.
 * @throws {RangeError} If the non-empty vectors do not all have the same length.
 */
function averageVectors(vectors) {
  const usable = vectors.filter(v => v.length > 0);
  if (usable.length === 0) return [];

  const dim = usable[0].length;
  const sum = new Array(dim).fill(0);

  for (const v of usable) {
    if (v.length !== dim) {
      throw new RangeError(
        `averageVectors() received vectors of different lengths (${dim} and ${v.length})`
      );
    }
    for (let i = 0; i < dim; i++) {
      sum[i] += v[i];
    }
  }

  return sum.map(x => x / usable.length);
}
18
203
 
19
204
  function truncate(text) {
20
- return text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) : text;
205
+ return text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) : text;
21
206
  }
22
207
 
208
/**
 * Rough token estimate used for logging: about 4 characters per token for
 * ASCII and about 1.5 characters per token for everything else.
 *
 * The two classes are counted separately, so one stray non-ASCII character in
 * an otherwise ASCII text no longer inflates the whole estimate. Pure-ASCII and
 * pure-non-ASCII inputs give the same result as before.
 *
 * @param {string} str
 * @returns {number} Estimated token count (0 for an empty string).
 */
function estimateTokens(str) {
  let nonAscii = 0;
  for (let i = 0; i < str.length; i++) {
    if (str.charCodeAt(i) > 0x7f) nonAscii += 1;
  }
  const ascii = str.length - nonAscii;

  return Math.ceil(ascii / 4 + nonAscii / 1.5);
}
212
+
213
+ // ─── Embedding cache ──────────────────────────────────────────────────────────
214
+
215
// Upper bound on the number of cached embeddings.
const EMBED_CACHE_MAX = 500;
// Insertion-ordered map; the first key is always the least recently used one.
const _embedCache = new Map();

/**
 * Looks up a cached embedding and marks it as most recently used.
 * @param {string} key
 * @returns {*} The cached value, or null when the key is not present.
 */
function getCachedEmbedding(key) {
  if (_embedCache.has(key)) {
    const hit = _embedCache.get(key);

    // Re-insert so the entry moves to the "newest" end of the map.
    _embedCache.delete(key);
    _embedCache.set(key, hit);

    return hit;
  }

  return null;
}

/**
 * Stores an embedding as the most recently used entry, evicting the oldest
 * entries first when the cache is full.
 * @param {string} key
 * @param {*} value
 */
function setCachedEmbedding(key, value) {
  // Deleting an absent key is a no-op, so no existence check is needed.
  _embedCache.delete(key);

  for (; _embedCache.size >= EMBED_CACHE_MAX; ) {
    const oldestKey = _embedCache.keys().next().value;
    _embedCache.delete(oldestKey);
  }

  _embedCache.set(key, value);
}
241
+
242
+ // ─── OpenAI request helpers ───────────────────────────────────────────────────
243
+
23
244
  /**
24
245
  * Performs an HTTPS POST request to the OpenAI embeddings endpoint.
25
246
  * @param {string} apiKey
@@ -27,119 +248,246 @@ function truncate(text) {
27
248
  * @returns {Promise<object>} Parsed JSON response body
28
249
  */
29
250
  function requestEmbeddings(apiKey, input) {
30
- return new Promise((resolve, reject) => {
31
- const body = JSON.stringify({ model: EMBEDDING_MODEL, input });
32
- const options = {
33
- hostname: OPENAI_EMBEDDINGS_HOSTNAME,
34
- path: OPENAI_EMBEDDINGS_PATH,
35
- method: 'POST',
36
- headers: {
37
- 'Content-Type': 'application/json',
38
- 'Authorization': `Bearer ${apiKey}`,
39
- 'Content-Length': Buffer.byteLength(body),
40
- },
41
- };
42
-
43
- const req = https.request(options, (res) => {
44
- const chunks = [];
45
- res.on('data', chunk => chunks.push(chunk));
46
- res.on('end', () => {
47
- let parsed;
48
- try {
49
- parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
50
- } catch (e) {
51
- return reject(new Error(`Failed to parse OpenAI response: ${e.message}`));
52
- }
53
-
54
- if (res.statusCode >= 400) {
55
- const message = (parsed.error && parsed.error.message) || `HTTP ${res.statusCode}`;
56
- return reject(new Error(`OpenAI API error: ${message}`));
57
- }
58
-
59
- resolve(parsed);
60
- });
61
- });
62
-
63
- req.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
64
- req.write(body);
65
- req.end();
66
- });
251
+ return new Promise((resolve, reject) => {
252
+ const body = JSON.stringify({ model: EMBEDDING_MODEL, input });
253
+
254
+ const options = {
255
+ hostname: OPENAI_EMBEDDINGS_HOSTNAME,
256
+ path: OPENAI_EMBEDDINGS_PATH,
257
+ method: 'POST',
258
+ headers: {
259
+ 'Content-Type': 'application/json',
260
+ 'Authorization': `Bearer ${apiKey}`,
261
+ 'Content-Length': Buffer.byteLength(body),
262
+ },
263
+ };
264
+
265
+ const req = https.request(options, res => {
266
+ const chunks = [];
267
+
268
+ res.on('data', chunk => chunks.push(chunk));
269
+
270
+ res.on('end', () => {
271
+ let parsed;
272
+ try {
273
+ parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
274
+ } catch (e) {
275
+ return reject(new Error(`Failed to parse OpenAI response: ${e.message}`));
276
+ }
277
+
278
+ if (res.statusCode >= 400) {
279
+ const message = (parsed.error && parsed.error.message) || `HTTP ${res.statusCode}`;
280
+ return reject(new Error(`OpenAI API error: ${message}`));
281
+ }
282
+
283
+ resolve(parsed);
284
+ });
285
+ });
286
+
287
+ req.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
288
+ req.write(body);
289
+ req.end();
290
+ });
67
291
  }
68
292
 
69
293
  /**
70
294
  * Retries an async operation up to maxRetries times with exponential back-off.
71
- * @param {Function} fn - Async function to retry
295
+ * @param {Function} fn
72
296
  * @param {number} retries
73
297
  * @returns {Promise<*>}
74
298
  */
75
299
  async function withRetry(fn, retries = MAX_RETRIES) {
76
- let lastError;
77
- for (let attempt = 1; attempt <= retries; attempt++) {
78
- try {
79
- return await fn();
80
- } catch (err) {
81
- lastError = err;
82
- if (attempt < retries) {
83
- winston().warn(`[search-agent] embeddingService: attempt ${attempt} failed (${err.message}), retrying in ${RETRY_DELAY_MS * attempt} ms…`);
84
- await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS * attempt));
85
- }
86
- }
87
- }
88
- throw lastError;
300
+ let lastError;
301
+
302
+ for (let attempt = 1; attempt <= retries; attempt++) {
303
+ try {
304
+ return await fn();
305
+ } catch (err) {
306
+ lastError = err;
307
+
308
+ if (attempt < retries) {
309
+ winston().warn(
310
+ `[search-agent] embeddingService: attempt ${attempt} failed (${err.message}), retrying in ${RETRY_DELAY_MS * attempt} ms...`
311
+ );
312
+ await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS * attempt));
313
+ }
314
+ }
315
+ }
316
+
317
+ throw lastError;
89
318
  }
90
319
 
320
+ // ─── Public API ───────────────────────────────────────────────────────────────
321
+
91
322
  /**
92
323
  * Converts a single text string into an embedding vector.
93
324
  * @param {string} text
94
325
  * @returns {Promise<number[]>}
95
326
  */
96
327
  async function embed(text) {
97
- if (typeof text !== 'string' || text.trim() === '') {
98
- throw new Error('embed() requires a non-empty string');
99
- }
328
+ if (typeof text !== 'string' || text.trim() === '') {
329
+ throw new Error('embed() requires a non-empty string');
330
+ }
331
+
332
+ const apiKey = process.env.OPENAI_API_KEY;
333
+ if (!apiKey) {
334
+ throw new Error('OPENAI_API_KEY environment variable is not set');
335
+ }
336
+
337
+ const pureText = extractPureText(text);
338
+ if (!pureText) {
339
+ throw new Error('embed() received no usable text after filtering');
340
+ }
341
+
342
+ const chunks = splitIntoSemanticChunks(pureText, {
343
+ maxLen: MAX_CHARS,
344
+ targetLen: TARGET_CHUNK_CHARS,
345
+ minLen: MIN_CHUNK_CHARS,
346
+ overlap: CHUNK_OVERLAP,
347
+ });
348
+
349
+ if (chunks.length === 1) {
350
+ const safe = truncate(pureText);
351
+ const cached = getCachedEmbedding(safe);
352
+ if (cached) {
353
+ winston().verbose('[search-agent] embeddingService: embedding cache hit');
354
+ return cached;
355
+ }
356
+
357
+ const tokenCount = estimateTokens(safe);
358
+ winston().info(
359
+ `[search-agent] embeddingService: generating embedding for text (${safe.length} chars, ~${tokenCount} tokens)`
360
+ );
361
+
362
+ const response = await withRetry(() => requestEmbeddings(apiKey, safe));
363
+ const embedding = response.data[0].embedding;
364
+
365
+ winston().verbose('[search-agent] embeddingService: embedding generated successfully');
366
+ setCachedEmbedding(safe, embedding);
367
+
368
+ return embedding;
369
+ }
370
+
371
+ winston().info(
372
+ `[search-agent] embeddingService: splitting long text into ${chunks.length} semantic chunks for embedding`
373
+ );
374
+
375
+ chunks.forEach((chunk, i) => {
376
+ const tokenCount = estimateTokens(chunk);
377
+ winston().info(
378
+ `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${tokenCount} tokens`
379
+ );
380
+ });
381
+
382
+ const vectors = await embedBatch(chunks);
383
+ const avg = averageVectors(vectors);
384
+ const safe = truncate(pureText);
100
385
 
101
- const apiKey = process.env.OPENAI_API_KEY;
102
- if (!apiKey) {
103
- throw new Error('OPENAI_API_KEY environment variable is not set');
104
- }
386
+ setCachedEmbedding(safe, avg);
105
387
 
106
- const safe = truncate(text);
107
- winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
108
- const response = await withRetry(() => requestEmbeddings(apiKey, safe));
109
- winston().verbose('[search-agent] embeddingService: embedding generated successfully');
110
- return response.data[0].embedding;
388
+ return avg;
111
389
  }
112
390
 
113
391
  /**
114
392
  * Converts an array of text strings into an array of embedding vectors.
115
- * Texts are sent in a single batched API request.
393
+ * Texts are sent in batched API requests after semantic chunking.
116
394
  * @param {string[]} texts
117
395
  * @returns {Promise<number[][]>}
118
396
  */
119
397
  async function embedBatch(texts) {
120
- if (!Array.isArray(texts) || texts.length === 0) {
121
- throw new Error('embedBatch() requires a non-empty array of strings');
122
- }
398
+ if (!Array.isArray(texts) || texts.length === 0) {
399
+ throw new Error('embedBatch() requires a non-empty array of strings');
400
+ }
401
+
402
+ const invalid = texts.findIndex(t => typeof t !== 'string' || t.trim() === '');
403
+ if (invalid !== -1) {
404
+ throw new Error(`embedBatch() received an empty or non-string value at index ${invalid}`);
405
+ }
406
+
407
+ const apiKey = process.env.OPENAI_API_KEY;
408
+ if (!apiKey) {
409
+ throw new Error('OPENAI_API_KEY environment variable is not set');
410
+ }
411
+
412
+ const allChunks = [];
413
+ const chunkMap = [];
414
+
415
+ for (const [textIdx, text] of texts.entries()) {
416
+ const pureText = extractPureText(text);
417
+
418
+ if (!pureText) {
419
+ chunkMap.push({ count: 0 });
420
+ continue;
421
+ }
422
+
423
+ const chunks = splitIntoSemanticChunks(pureText, {
424
+ maxLen: MAX_CHARS,
425
+ targetLen: TARGET_CHUNK_CHARS,
426
+ minLen: MIN_CHUNK_CHARS,
427
+ overlap: CHUNK_OVERLAP,
428
+ });
429
+
430
+ chunkMap.push({ count: chunks.length });
431
+ allChunks.push(...chunks);
432
+
433
+ if (chunks.length === 1) {
434
+ const tokenCount = estimateTokens(chunks[0]);
435
+ winston().info(
436
+ `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - 1 chunk, ${chunks[0].length} chars, ~${tokenCount} tokens`
437
+ );
438
+ } else {
439
+ winston().info(
440
+ `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - ${chunks.length} chunks`
441
+ );
442
+ chunks.forEach((chunk, i) => {
443
+ const tokenCount = estimateTokens(chunk);
444
+ winston().info(
445
+ `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${tokenCount} tokens`
446
+ );
447
+ });
448
+ }
449
+ }
450
+
451
+ winston().verbose(
452
+ `[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`
453
+ );
454
+
455
+ if (allChunks.length === 0) {
456
+ return chunkMap.map(({ count }) => (count === 0 ? [] : null));
457
+ }
458
+
459
+ const safeChunks = allChunks.map(chunk => truncate(chunk));
460
+ const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
461
+
462
+ winston().verbose(
463
+ `[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`
464
+ );
123
465
 
124
- const invalid = texts.findIndex(t => typeof t !== 'string' || t.trim() === '');
125
- if (invalid !== -1) {
126
- throw new Error(`embedBatch() received an empty or non-string value at index ${invalid}`);
127
- }
466
+ const vectors = response.data
467
+ .sort((a, b) => a.index - b.index)
468
+ .map(item => item.embedding);
128
469
 
129
- const apiKey = process.env.OPENAI_API_KEY;
130
- if (!apiKey) {
131
- throw new Error('OPENAI_API_KEY environment variable is not set');
132
- }
470
+ const result = [];
471
+ let idx = 0;
133
472
 
134
- const safeTexts = texts.map(truncate);
135
- winston().verbose(`[search-agent] embeddingService: generating batch embeddings for ${safeTexts.length} text(s)`);
136
- const response = await withRetry(() => requestEmbeddings(apiKey, safeTexts));
137
- winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${texts.length} vector(s))`);
473
+ for (const { count } of chunkMap) {
474
+ if (count === 0) {
475
+ result.push([]);
476
+ } else if (count === 1) {
477
+ result.push(vectors[idx]);
478
+ idx += 1;
479
+ } else {
480
+ result.push(averageVectors(vectors.slice(idx, idx + count)));
481
+ idx += count;
482
+ }
483
+ }
138
484
 
139
- // OpenAI returns items sorted by index field, but sort explicitly to be safe
140
- return response.data
141
- .sort((a, b) => a.index - b.index)
142
- .map(item => item.embedding);
485
+ return result;
143
486
  }
144
487
 
145
// Public surface of the module: the two embedding entry points plus the text
// helpers they are built on.
module.exports = { embed, embedBatch, extractPureText, splitIntoSemanticChunks };