nodebb-plugin-search-agent 0.0.941 → 0.0.943

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nodebb-plugin-search-agent",
3
- "version": "0.0.941",
3
+ "version": "0.0.943",
4
4
  "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
5
5
  "main": "library.js",
6
6
  "author": "Racheli Bayfus",
@@ -3,7 +3,7 @@
3
3
  const https = require('https');
4
4
 
5
5
/**
 * Resolves the `winston` module through the main module's `require`.
 * The lookup is performed on every call rather than once at load time.
 * @returns {object} Whatever `require.main.require('winston')` yields.
 */
function winston() {
  const logger = require.main.require('winston');
  return logger;
}
8
8
 
9
9
  const OPENAI_EMBEDDINGS_HOSTNAME = 'api.openai.com';
@@ -11,21 +11,236 @@ const OPENAI_EMBEDDINGS_PATH = '/v1/embeddings';
11
11
// Model name sent in every embeddings request body.
const EMBEDDING_MODEL = 'text-embedding-3-small';

// Retry policy used by withRetry(): attempt count and base delay in ms.
const MAX_RETRIES = 3;
const RETRY_DELAY_MS = 500;

// text-embedding-3-small supports 8,192 tokens.
// Conservative char limits help avoid token overflows, especially for non-ASCII text.
const MAX_CHARS = 10000;          // hard ceiling for any single embedded string
const CHUNK_OVERLAP = 300;        // chars of the previous chunk repeated at the start of the next
const MIN_CHUNK_CHARS = 500;      // chunks shorter than this are grown / merged when possible
const TARGET_CHUNK_CHARS = 2200;  // preferred chunk size when packing blocks
21
+
22
+ // ─── Text cleanup ─────────────────────────────────────────────────────────────
23
+
24
// Extensions treated as "file" links: their URLs carry no searchable text.
const FILE_LINK_EXTENSIONS =
  'jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?';

// Markdown link to a file: [desc](url.ext), optionally with ?query/#fragment and a "title".
const MARKDOWN_FILE_LINK_RE = new RegExp(
  String.raw`\[[^\]]*\]\([^)\s]*\.(?:${FILE_LINK_EXTENSIONS})(?:[?#][^)\s]*)?(?:\s+"[^"]*")?\)`,
  'gi'
);

// <a href="...ext">…</a>, including anchors whose text spans several lines.
const HTML_FILE_ANCHOR_RE = new RegExp(
  String.raw`<a\b[^>]*\bhref\s*=\s*["']?[^"'>\s]+\.(?:${FILE_LINK_EXTENSIONS})(?=["'\s>?#])[^>]*>[\s\S]*?<\/a>`,
  'gi'
);

// Bare http(s) URL whose *path* ends in a file extension. The look-aheads stop the
// extension from matching in the middle of a name or host ("example.ai/about", "x.json5")
// while still allowing compound suffixes ("a.tar.gz") and a trailing sentence period.
const FILE_URL_RE = new RegExp(
  String.raw`https?:\/\/[^\s/?#<>"')]+\/[^\s?#<>"')]*?\.(?:${FILE_LINK_EXTENSIONS})(?![\w/-])(?!\.\w)(?:[?#][^\s<>"')]*)?`,
  'gi'
);

/**
 * Strips images, file links and HTML markup from a post so that only prose remains.
 *
 * Link wrappers (markdown links, <a> elements) are removed BEFORE bare URLs; otherwise
 * the bare-URL pass guts the wrapper's URL first and leaves "[desc]()" or the anchor
 * text behind. Paragraph breaks are kept as one blank line so that later block
 * splitting on blank lines still has something to split on.
 *
 * @param {*} text - Raw post content; anything that is not a string yields ''.
 * @returns {string} Cleaned text, trimmed, with at most one blank line between paragraphs.
 */
function extractPureText(text) {
  if (typeof text !== 'string') return '';

  return text
    // Markdown images: ![alt](url)
    .replace(/!\[[^\]]*\]\([^)]*\)/g, '')
    // HTML <img> tags
    .replace(/<img\b[^>]*>/gi, '')
    // Link wrappers pointing at files (must precede the bare-URL pass)
    .replace(MARKDOWN_FILE_LINK_RE, '')
    .replace(HTML_FILE_ANCHOR_RE, '')
    // Remaining bare file URLs
    .replace(FILE_URL_RE, '')
    // Any other HTML tag becomes a space so adjacent words do not fuse
    .replace(/<[^>]+>/g, ' ')
    // Whitespace: unify line endings, collapse runs, drop whitespace-only lines,
    // keep at most one blank line
    .replace(/\r\n?/g, '\n')
    .replace(/[ \t]+/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
62
+
63
/**
 * Canonicalises whitespace ahead of block splitting.
 *
 * Handles CRLF and lone CR line endings, collapses runs of spaces/tabs, and removes
 * spaces next to newlines so that a "blank" line containing only spaces still counts
 * as a paragraph break. Runs of three or more newlines shrink to one blank line.
 *
 * @param {string} text
 * @returns {string} Trimmed text with normalised spacing.
 */
function normalizeWhitespace(text) {
  return text
    .replace(/\r\n?/g, '\n')
    .replace(/[ \t]+/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
70
+
71
/**
 * Breaks text into structural blocks.
 *
 * After whitespace normalisation the text is cut at blank lines and immediately
 * before any line that starts a heading (#), a bullet (-, *, +), a numbered item,
 * a block quote (>) or a code fence (```). Empty pieces are discarded.
 *
 * @param {string} text
 * @returns {string[]} Trimmed, non-empty blocks in document order.
 */
function splitIntoBlocks(text) {
  const boundary = /\n{2,}|(?=^#{1,6}\s)|(?=^\s*[-*+]\s)|(?=^\s*\d+\.\s)|(?=^\s*>\s)|(?=^```)/gm;
  const blocks = [];

  for (const piece of normalizeWhitespace(text).split(boundary)) {
    const trimmed = piece.trim();
    if (trimmed) {
      blocks.push(trimmed);
    }
  }

  return blocks;
}
81
+
82
/**
 * Splits one block into pieces of at most `maxLen` UTF-16 code units.
 *
 * Sentences (terminated by . ! ? or a newline) are packed greedily. A single sentence
 * longer than `maxLen` is hard-sliced; the slice point is moved back by one unit when
 * it would otherwise separate the two halves of a surrogate pair, and slices that are
 * empty after trimming are dropped.
 *
 * @param {string} block
 * @param {number} [maxLen=MAX_CHARS] - Must be > 0 whenever the block needs splitting.
 * @returns {string[]} The block itself when it already fits, otherwise its pieces.
 * @throws {RangeError} If the block needs splitting and `maxLen` is not positive
 *   (the hard-slice loop could never advance).
 */
function splitLargeBlock(block, maxLen = MAX_CHARS) {
  if (block.length <= maxLen) return [block];

  if (!(maxLen > 0)) {
    throw new RangeError('splitLargeBlock() requires a positive maxLen');
  }

  const sentences = block.match(/[^.!?\n]+[.!?\n]+|[^.!?\n]+$/g) || [block];
  const chunks = [];
  let current = '';

  for (const sentence of sentences) {
    const s = sentence.trim();
    if (!s) continue;

    const candidate = current ? `${current} ${s}` : s;
    if (candidate.length <= maxLen) {
      current = candidate;
      continue;
    }

    // The sentence does not fit next to what we have: flush and start over.
    if (current) {
      chunks.push(current);
      current = '';
    }

    if (s.length <= maxLen) {
      current = s;
      continue;
    }

    // Fallback for a very long sentence: fixed-width slices.
    let start = 0;
    while (start < s.length) {
      let end = Math.min(start + maxLen, s.length);
      const lastUnit = s.charCodeAt(end - 1);
      const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
      // Never leave a dangling high surrogate (only possible when the slice has room to shrink).
      if (end < s.length && end - start > 1 && endsOnHighSurrogate) {
        end -= 1;
      }

      const piece = s.slice(start, end).trim();
      if (piece) chunks.push(piece);
      start = end;
    }
  }

  if (current) chunks.push(current);

  return chunks;
}
119
+
120
/**
 * Returns the trimmed tail of the previous chunk, used as context at the start of the next.
 *
 * If the cut would land on the second half of a surrogate pair, the cut moves forward
 * by one unit so the prefix never begins with a lone low surrogate.
 *
 * @param {string} prevChunk
 * @param {number} [overlapChars=CHUNK_OVERLAP] - Maximum tail length in UTF-16 code units.
 * @returns {string} '' when there is no previous chunk.
 */
function buildOverlapPrefix(prevChunk, overlapChars = CHUNK_OVERLAP) {
  if (!prevChunk) return '';

  let start = Math.max(0, prevChunk.length - overlapChars);
  const firstUnit = prevChunk.charCodeAt(start);
  if (start > 0 && firstUnit >= 0xdc00 && firstUnit <= 0xdfff) {
    start += 1;
  }

  return prevChunk.slice(start).trim();
}
124
+
125
/**
 * Splits long text into overlapping chunks that each fit within `maxLen`.
 *
 * Text that already fits is returned as a single chunk, untouched. Otherwise:
 *   1. structural blocks are packed greedily up to about `targetLen`
 *      (and always up to `minLen` when the result still fits `maxLen`);
 *   2. any chunk still shorter than `minLen` is folded into its predecessor when the
 *      pair fits `maxLen`;
 *   3. every chunk after the first is prefixed with the tail of the chunk before it.
 *
 * Step 2 runs before step 3 on purpose: folding a chunk that already carries its
 * predecessor's tail would store that tail twice in the merged chunk.
 *
 * @param {string} text
 * @param {object} [options]
 * @param {number} [options.maxLen=MAX_CHARS]
 * @param {number} [options.targetLen=TARGET_CHUNK_CHARS]
 * @param {number} [options.minLen=MIN_CHUNK_CHARS]
 * @param {number} [options.overlap=CHUNK_OVERLAP]
 * @returns {string[]} [] for empty input, otherwise one or more chunks.
 */
function splitIntoSemanticChunks(
  text,
  {
    maxLen = MAX_CHARS,
    targetLen = TARGET_CHUNK_CHARS,
    minLen = MIN_CHUNK_CHARS,
    overlap = CHUNK_OVERLAP,
  } = {}
) {
  if (!text) return [];
  if (text.length <= maxLen) return [text];

  const blocks = splitIntoBlocks(text).flatMap(block => splitLargeBlock(block, maxLen));

  // Pass 1: greedy packing of blocks.
  const packed = [];
  let current = '';

  for (const block of blocks) {
    const next = current ? `${current}\n\n${block}` : block;
    const wantsMore = next.length <= targetLen || current.length < minLen;

    if (wantsMore && next.length <= maxLen) {
      current = next;
      continue;
    }

    if (current) packed.push(current.trim());
    current = block;
  }

  if (current) packed.push(current.trim());

  // Pass 2: fold undersized chunks into the previous one while it still fits.
  const merged = [];
  for (const chunk of packed) {
    const last = merged.length - 1;
    const fitsInPrevious =
      last >= 0 && merged[last].length + 2 + chunk.length <= maxLen;

    if (fitsInPrevious && chunk.length < minLen) {
      merged[last] += `\n\n${chunk}`;
    } else {
      merged.push(chunk);
    }
  }

  // Pass 3: add overlap; when the result is too long keep its tail (the chunk's own text).
  return merged.map((chunk, i) => {
    if (i === 0) return chunk;

    const prefix = buildOverlapPrefix(merged[i - 1], overlap);
    const joined = prefix ? `${prefix}\n\n${chunk}` : chunk;

    return joined.length <= maxLen ? joined : joined.slice(joined.length - maxLen);
  });
}
187
+
188
+ // ─── Vector helpers ───────────────────────────────────────────────────────────
189
+
190
/**
 * Component-wise arithmetic mean of a list of vectors.
 * The dimension is taken from the first vector.
 *
 * @param {number[][]} vectors
 * @returns {number[]} [] when no vectors are given.
 */
function averageVectors(vectors) {
  const count = vectors.length;
  if (count === 0) return [];

  const dims = vectors[0].length;
  const mean = [];

  for (let d = 0; d < dims; d += 1) {
    let total = 0;
    for (const vector of vectors) {
      total += vector[d];
    }
    mean.push(total / count);
  }

  return mean;
}
203
+
204
/**
 * Caps a string at `maxChars` UTF-16 code units.
 *
 * When the cut would fall between the halves of a surrogate pair, the dangling high
 * surrogate is dropped as well, so the result is always well-formed.
 *
 * Callers should invoke this with an explicit arrow (`x => truncate(x)`) when mapping,
 * so the array index is not taken as `maxChars`.
 *
 * @param {string} text
 * @param {number} [maxChars=MAX_CHARS]
 * @returns {string}
 */
function truncate(text, maxChars = MAX_CHARS) {
  if (text.length <= maxChars) return text;

  let end = maxChars;
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }

  return text.slice(0, end);
}
207
+
208
/**
 * Rough token estimate for logging: about 4 chars per token for ASCII and about
 * 1.5 chars per token for everything else.
 *
 * The two rates are applied per character class, so one accented letter in an
 * otherwise ASCII text no longer reprices the whole string at the non-ASCII rate.
 * Pure-ASCII and pure-non-ASCII strings give the same result as before.
 *
 * @param {string} str
 * @returns {number} Estimated token count (0 for the empty string).
 */
function estimateTokens(str) {
  let asciiCount = 0;
  for (let i = 0; i < str.length; i += 1) {
    if (str.charCodeAt(i) <= 0x7f) asciiCount += 1;
  }

  const otherCount = str.length - asciiCount;
  return Math.ceil(asciiCount / 4 + otherCount / 1.5);
}
18
212
 
19
213
  // ─── Embedding cache ──────────────────────────────────────────────────────────
20
- // Avoids calling the embeddings API for the same text within a session.
21
- // HyDE output varies, so the biggest wins come from repeated identical queries.
214
+
22
215
  const _embedCache = new Map();
23
216
  const EMBED_CACHE_MAX = 500;
24
217
 
25
- function truncate(text) {
26
- return text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) : text;
218
/**
 * Looks up an embedding in the in-memory cache.
 * A hit is re-inserted so the entry moves to the newest position in the Map's
 * insertion order, which is what eviction walks from the oldest end.
 *
 * @param {string} key
 * @returns {*} The cached value, or null when the key is absent.
 */
function getCachedEmbedding(key) {
  if (_embedCache.has(key)) {
    const stored = _embedCache.get(key);

    // Refresh LRU-ish order
    _embedCache.delete(key);
    _embedCache.set(key, stored);

    return stored;
  }

  return null;
}
28
229
 
230
/**
 * Stores an embedding under `key` as the newest cache entry, evicting the oldest
 * entries first while the cache is at or above EMBED_CACHE_MAX.
 *
 * @param {string} key
 * @param {*} value
 */
function setCachedEmbedding(key, value) {
  // Drop any previous entry so the key is re-inserted at the newest position
  // (Map#delete is a no-op for an absent key).
  _embedCache.delete(key);

  for (; _embedCache.size >= EMBED_CACHE_MAX;) {
    const oldestKey = _embedCache.keys().next().value;
    _embedCache.delete(oldestKey);
  }

  _embedCache.set(key, value);
}
241
+
242
+ // ─── OpenAI request helpers ───────────────────────────────────────────────────
243
+
29
244
// Upper bound on socket inactivity for one embeddings request. Without it a stalled
// connection would leave the returned promise pending forever.
const OPENAI_REQUEST_TIMEOUT_MS = 30000;

/**
 * Performs an HTTPS POST request to the OpenAI embeddings endpoint.
 *
 * @param {string} apiKey - Sent as a Bearer token.
 * @param {string|string[]} input - Text(s) to embed; forwarded as the `input` field.
 * @returns {Promise<object>} Parsed JSON response body.
 * Rejects with an Error when the request times out, the network or response stream
 * fails, the status is >= 400, or a success body is not valid JSON.
 */
function requestEmbeddings(apiKey, input) {
  return new Promise((resolve, reject) => {
    const body = JSON.stringify({ model: EMBEDDING_MODEL, input });

    const options = {
      hostname: OPENAI_EMBEDDINGS_HOSTNAME,
      path: OPENAI_EMBEDDINGS_PATH,
      method: 'POST',
      timeout: OPENAI_REQUEST_TIMEOUT_MS,
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        'Content-Length': Buffer.byteLength(body),
      },
    };

    const req = https.request(options, res => {
      const chunks = [];

      res.on('data', chunk => chunks.push(chunk));

      // A response stream that breaks mid-body never emits 'end'.
      res.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));

      res.on('end', () => {
        const failed = res.statusCode >= 400;

        let parsed;
        try {
          parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
        } catch (e) {
          // Error responses may carry a non-JSON body; the status is the useful part then.
          if (failed) {
            return reject(new Error(`OpenAI API error: HTTP ${res.statusCode}`));
          }
          return reject(new Error(`Failed to parse OpenAI response: ${e.message}`));
        }

        if (failed) {
          const message = (parsed && parsed.error && parsed.error.message) || `HTTP ${res.statusCode}`;
          return reject(new Error(`OpenAI API error: ${message}`));
        }

        resolve(parsed);
      });
    });

    // 'timeout' only signals inactivity; the request has to be torn down explicitly.
    // destroy(err) surfaces through the 'error' handler below.
    req.on('timeout', () => {
      req.destroy(new Error(`request timed out after ${OPENAI_REQUEST_TIMEOUT_MS} ms`));
    });

    req.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
    req.write(body);
    req.end();
  });
}
74
292
 
75
293
/**
 * Runs an async operation, retrying on failure with a linearly growing delay
 * (RETRY_DELAY_MS × attempt number) between attempts.
 *
 * The operation is always attempted at least once, even when `retries` is zero,
 * negative or not a number.
 *
 * @param {Function} fn - Async function to run; called with no arguments.
 * @param {number} [retries=MAX_RETRIES] - Total number of attempts.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * Rejects with the error from the last failed attempt.
 */
async function withRetry(fn, retries = MAX_RETRIES) {
  const attempts = Math.max(1, Math.floor(Number(retries)) || 1);
  let lastError;

  for (let attempt = 1; attempt <= attempts; attempt += 1) {
    try {
      return await fn();
    } catch (err) {
      lastError = err;

      // No delay or log after the final attempt.
      if (attempt === attempts) break;

      const delayMs = RETRY_DELAY_MS * attempt;
      const reason = err && err.message ? err.message : String(err);
      winston().warn(
        `[search-agent] embeddingService: attempt ${attempt} failed (${reason}), retrying in ${delayMs} ms...`
      );
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }

  throw lastError;
}
96
319
 
320
+ // ─── Public API ───────────────────────────────────────────────────────────────
321
+
97
322
/**
 * Converts a single text string into an embedding vector.
 *
 * The text is cleaned with extractPureText(). Text that fits in one chunk is embedded
 * directly; longer text is chunked, embedded via embedBatch() and the chunk vectors
 * are averaged.
 *
 * The cache is consulted once, up front, for both paths and is keyed on the full
 * cleaned text, so two long texts sharing a prefix cannot receive each other's vector.
 *
 * @param {string} text
 * @returns {Promise<number[]>}
 * Rejects when the input is empty or not a string, OPENAI_API_KEY is unset, nothing
 * usable remains after cleaning, or the API call fails or returns no embedding.
 */
async function embed(text) {
  if (typeof text !== 'string' || text.trim() === '') {
    throw new Error('embed() requires a non-empty string');
  }

  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error('OPENAI_API_KEY environment variable is not set');
  }

  const pureText = extractPureText(text);
  if (!pureText) {
    throw new Error('embed() received no usable text after filtering');
  }

  const cached = getCachedEmbedding(pureText);
  if (cached) {
    winston().verbose('[search-agent] embeddingService: embedding cache hit');
    return cached;
  }

  const chunks = splitIntoSemanticChunks(pureText, {
    maxLen: MAX_CHARS,
    targetLen: TARGET_CHUNK_CHARS,
    minLen: MIN_CHUNK_CHARS,
    overlap: CHUNK_OVERLAP,
  });

  let embedding;

  if (chunks.length === 1) {
    const safe = truncate(chunks[0]);
    winston().info(
      `[search-agent] embeddingService: generating embedding for text (${safe.length} chars, ~${estimateTokens(safe)} tokens)`
    );

    const response = await withRetry(() => requestEmbeddings(apiKey, safe));
    const first = response && Array.isArray(response.data) ? response.data[0] : undefined;
    if (!first || !first.embedding) {
      throw new Error('OpenAI API returned no embedding');
    }
    embedding = first.embedding;

    winston().verbose('[search-agent] embeddingService: embedding generated successfully');
  } else {
    winston().info(
      `[search-agent] embeddingService: splitting long text into ${chunks.length} semantic chunks for embedding`
    );

    chunks.forEach((chunk, i) => {
      winston().info(
        `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${estimateTokens(chunk)} tokens`
      );
    });

    // embedBatch() yields [] for a chunk that ends up empty after cleaning; averaging
    // such an entry with real vectors would produce NaN components.
    const vectors = (await embedBatch(chunks)).filter(v => v.length > 0);
    embedding = averageVectors(vectors);
  }

  setCachedEmbedding(pureText, embedding);

  return embedding;
}
131
390
 
132
391
// Per-request ceilings used when grouping chunks into API calls.
// NOTE(review): chosen from the OpenAI embeddings API reference (at most 2048 inputs and
// 300,000 tokens summed over all inputs per request) — confirm against the current docs.
const EMBED_MAX_INPUTS_PER_REQUEST = 2048;
const EMBED_MAX_EST_TOKENS_PER_REQUEST = 250000;

// Groups chunks into request-sized batches, preserving order. A batch always holds at
// least one chunk, even if that chunk alone exceeds the estimated token budget.
function groupChunksForRequests(chunks) {
  const batches = [];
  let batch = [];
  let budget = 0;

  for (const chunk of chunks) {
    const cost = estimateTokens(chunk);
    const full =
      batch.length >= EMBED_MAX_INPUTS_PER_REQUEST ||
      budget + cost > EMBED_MAX_EST_TOKENS_PER_REQUEST;

    if (batch.length > 0 && full) {
      batches.push(batch);
      batch = [];
      budget = 0;
    }

    batch.push(chunk);
    budget += cost;
  }

  if (batch.length > 0) batches.push(batch);
  return batches;
}

/**
 * Converts an array of text strings into an array of embedding vectors.
 *
 * Each text is cleaned and semantically chunked. All chunks are embedded in one or
 * more sequential API requests, and each text's chunk vectors are averaged back
 * into a single vector.
 *
 * @param {string[]} texts
 * @returns {Promise<number[][]>} One entry per input, in input order. A text with no
 *   usable content after cleaning yields an empty array.
 * Rejects when `texts` is not a non-empty array of non-empty strings, OPENAI_API_KEY
 * is unset, a request fails, or the API returns a different number of embeddings
 * than inputs sent.
 */
async function embedBatch(texts) {
  if (!Array.isArray(texts) || texts.length === 0) {
    throw new Error('embedBatch() requires a non-empty array of strings');
  }

  const invalid = texts.findIndex(t => typeof t !== 'string' || t.trim() === '');
  if (invalid !== -1) {
    throw new Error(`embedBatch() received an empty or non-string value at index ${invalid}`);
  }

  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error('OPENAI_API_KEY environment variable is not set');
  }

  const allChunks = [];
  const chunkCounts = []; // chunkCounts[i] = number of chunks produced by texts[i]

  for (const [textIdx, text] of texts.entries()) {
    const pureText = extractPureText(text);

    if (!pureText) {
      chunkCounts.push(0);
      continue;
    }

    const chunks = splitIntoSemanticChunks(pureText, {
      maxLen: MAX_CHARS,
      targetLen: TARGET_CHUNK_CHARS,
      minLen: MIN_CHUNK_CHARS,
      overlap: CHUNK_OVERLAP,
    });

    chunkCounts.push(chunks.length);
    allChunks.push(...chunks);

    if (chunks.length === 1) {
      winston().info(
        `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - 1 chunk, ${chunks[0].length} chars, ~${estimateTokens(chunks[0])} tokens`
      );
    } else {
      winston().info(
        `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - ${chunks.length} chunks`
      );
      chunks.forEach((chunk, i) => {
        winston().info(
          `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${estimateTokens(chunk)} tokens`
        );
      });
    }
  }

  winston().verbose(
    `[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`
  );

  // Nothing survived cleaning: every input maps to an empty vector.
  if (allChunks.length === 0) {
    return texts.map(() => []);
  }

  const safeChunks = allChunks.map(chunk => truncate(chunk));
  const vectors = [];

  // Requests run one after another on purpose, to keep ordering simple.
  for (const batch of groupChunksForRequests(safeChunks)) {
    const response = await withRetry(() => requestEmbeddings(apiKey, batch));
    const items = response && Array.isArray(response.data) ? [...response.data] : [];

    // A short or long response would silently shift every later vector onto the wrong text.
    if (items.length !== batch.length) {
      throw new Error(
        `OpenAI API returned ${items.length} embedding(s) for ${batch.length} input(s)`
      );
    }

    items.sort((a, b) => a.index - b.index);
    for (const item of items) {
      vectors.push(item.embedding);
    }
  }

  winston().verbose(
    `[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`
  );

  const result = [];
  let idx = 0;

  for (const count of chunkCounts) {
    if (count === 0) {
      result.push([]);
    } else if (count === 1) {
      result.push(vectors[idx]);
    } else {
      result.push(averageVectors(vectors.slice(idx, idx + count)));
    }
    idx += count;
  }

  return result;
}
163
487
 
164
- module.exports = { embed, embedBatch };
488
+ module.exports = {
489
+ embed,
490
+ embedBatch,
491
+ extractPureText,
492
+ splitIntoSemanticChunks,
493
+ };
@@ -5,123 +5,389 @@ const { embed } = require('./embeddingService');
5
5
  const { getAllEmbeddings } = require('./vectorStore');
6
6
 
7
7
/**
 * Resolves the `winston` module through the main module's `require`.
 * The lookup is performed on every call rather than once at load time.
 * @returns {object} Whatever `require.main.require('winston')` yields.
 */
function winston() {
  const logger = require.main.require('winston');
  return logger;
}
10
10
 
11
11
// Fetch this many candidates from Orama — cast a wide net so the AI has enough to choose from
const TOP_K = 50;

// Noise floor: hits scoring below this are dropped.
// Keep this low; the later ranking layers should handle precision.
const MIN_SCORE = 0.15;

// Age (ms) after which the in-memory Orama index is rebuilt: five minutes.
const INDEX_TTL_MS = 5 * 60 * 1000;

// Hybrid search configuration
const VECTOR_SIMILARITY = 0.1;
const SEARCH_PROPERTIES = ['title', 'category', 'tags', 'content', 'parent_content'];
const FIELD_BOOSTS = {
  title: 3.5,
  tags: 2.8,
  category: 2.2,
  content: 1.0,
  parent_content: 0.8,
};

// Index state: the built database, when it was built, and the build in flight (if any).
let _db = null;
let _dbTs = 0;
let _buildPromise = null;
35
 
36
+ // Finance-heavy Hebrew forum query expansion.
37
+ // These are intentionally conservative: good recall lift without flooding the query.
38
+ const QUERY_EXPANSIONS = {
39
+ // General finance
40
+ 'מניה': ['מניות', 'נייר ערך', 'שוק ההון', 'בורסה'],
41
+ 'מניות': ['מניה', 'ניירות ערך', 'שוק ההון', 'בורסה'],
42
+ 'אגח': ['אג"ח', 'איגרת חוב', 'איגרות חוב', 'חוב'],
43
+ 'אג"ח': ['אגח', 'איגרת חוב', 'איגרות חוב', 'חוב'],
44
+ 'קרן': ['קרנות', 'קרן נאמנות', 'קרן סל'],
45
+ 'קרנות': ['קרן', 'קרן נאמנות', 'קרן סל'],
46
+ 'קרן סל': ['etf', 'תעודת סל', 'קרן מחקה'],
47
+ 'תעודת סל': ['etf', 'קרן סל', 'קרן מחקה'],
48
+ 'etf': ['קרן סל', 'תעודת סל', 'קרן מחקה'],
49
+ 'מדד': ['מדדים', 'מדד מניות', 'תשואת מדד'],
50
+ 'מדדים': ['מדד', 'מדד מניות', 'תשואת מדד'],
51
+ 'תיק': ['תיק השקעות', 'פיזור', 'החזקות'],
52
+ 'השקעה': ['השקעות', 'להשקיע', 'תיק השקעות'],
53
+ 'השקעות': ['השקעה', 'להשקיע', 'תיק השקעות'],
54
+ 'להשקיע': ['השקעה', 'השקעות', 'תיק השקעות'],
55
+ 'תשואה': ['רווח', 'תשואות', 'רווחיות'],
56
+ 'רווח': ['רווחים', 'תשואה', 'רווחיות'],
57
+ 'הפסד': ['הפסדים', 'ירידה', 'מינוס'],
58
+ 'דיבידנד': ['דיבידנדים', 'חלוקת רווחים'],
59
+ 'מכפיל': ['מכפיל רווח', 'pe', 'p/e'],
60
+ 'pe': ['מכפיל', 'מכפיל רווח', 'p/e'],
61
+ 'p/e': ['מכפיל', 'מכפיל רווח', 'pe'],
62
+ 'מינוף': ['ממונף', 'הלוואה', 'מרגין', 'margin'],
63
+ 'מרגין': ['margin', 'מינוף'],
64
+ 'margin': ['מרגין', 'מינוף'],
65
+ 'סיכון': ['סיכונים', 'תנודתיות', 'חשיפה'],
66
+ 'נזילות': ['נזיל', 'מזומן', 'סחירות'],
67
+ 'סחירות': ['נזילות', 'נזיל'],
68
+
69
+ // Tax / regulation
70
+ 'מס': ['מיסוי', 'מסים', 'רשות המסים'],
71
+ 'מיסוי': ['מס', 'מסים', 'רשות המסים'],
72
+ 'מסים': ['מס', 'מיסוי', 'רשות המסים'],
73
+ 'קיזוז': ['קיזוז הפסדים', 'מגן מס'],
74
+ 'דוח': ['דו"ח', 'דיווח', 'טופס'],
75
+ 'דו"ח': ['דוח', 'דיווח', 'טופס'],
76
+
77
+ // Savings / pension
78
+ 'פנסיה': ['קרן פנסיה', 'חיסכון פנסיוני', 'קצבה'],
79
+ 'גמל': ['קופת גמל', 'קופ"ג'],
80
+ 'קופג': ['קופת גמל', 'קופ"ג', 'גמל'],
81
+ 'קופ"ג': ['קופת גמל', 'קופג', 'גמל'],
82
+ 'השתלמות': ['קרן השתלמות'],
83
+ 'משכנתא': ['משכנתאות', 'ריבית', 'הלוואת דיור'],
84
+ 'הלוואה': ['הלוואות', 'אשראי', 'מימון'],
85
+ 'אשראי': ['הלוואה', 'הלוואות', 'מימון'],
86
+
87
+ // Trading / technical
88
+ 'מסחר': ['טריידינג', 'קניה', 'מכירה', 'פקודה'],
89
+ 'טריידינג': ['מסחר', 'מסחר יומי', 'קניה', 'מכירה'],
90
+ 'שורט': ['short', 'מכירה בחסר'],
91
+ 'short': ['שורט', 'מכירה בחסר'],
92
+ 'לונג': ['long', 'החזקה'],
93
+ 'long': ['לונג', 'החזקה'],
94
+ 'פקודה': ['פקודות', 'לימיט', 'מרקט'],
95
+ 'לימיט': ['limit', 'פקודת לימיט'],
96
+ 'limit': ['לימיט', 'פקודת לימיט'],
97
+ 'מרקט': ['market', 'פקודת שוק'],
98
+ 'market': ['מרקט', 'פקודת שוק'],
99
+
100
+ // Crypto
101
+ 'ביטקוין': ['btc', 'קריפטו', 'מטבע דיגיטלי'],
102
+ 'btc': ['ביטקוין', 'קריפטו', 'מטבע דיגיטלי'],
103
+ 'אתריום': ['eth', 'קריפטו', 'מטבע דיגיטלי'],
104
+ 'eth': ['אתריום', 'קריפטו', 'מטבע דיגיטלי'],
105
+ 'קריפטו': ['מטבע דיגיטלי', 'ביטקוין', 'אתריום', 'בלוקציין'],
106
+ 'בלוקציין': ['קריפטו', 'מטבע דיגיטלי'],
107
+
108
+ // Hebrew forum / advice intent
109
+ 'מומלץ': ['כדאי', 'המלצה', 'עדיף'],
110
+ 'כדאי': ['מומלץ', 'המלצה', 'עדיף'],
111
+ 'המלצה': ['מומלץ', 'כדאי', 'עדיף'],
112
+ 'בעיה': ['תקלה', 'קושי', 'לא עובד'],
113
+ 'תקלה': ['בעיה', 'לא עובד', 'שגיאה'],
114
+ 'שגיאה': ['תקלה', 'בעיה', 'לא עובד'],
115
+ };
116
+
117
+ // Common phrase-level expansions that are better handled before token expansion.
118
// `\b` only recognises ASCII word characters, so it never fires between a space and a
// Hebrew letter. Phrase edges are checked with Unicode-aware look-arounds instead.
const PHRASE_EDGE_CHAR = String.raw`[\p{L}\p{N}_]`;

// Builds a case-insensitive, non-global (and therefore stateless under .test()) pattern
// matching any of the alternatives as a whole phrase.
function buildPhrasePattern(alternatives) {
  return new RegExp(
    `(?<!${PHRASE_EDGE_CHAR})(?:${alternatives.join('|')})(?!${PHRASE_EDGE_CHAR})`,
    'iu'
  );
}

// Optional abbreviation mark: ASCII quote or apostrophe, Hebrew gershayim or geresh.
const ABBREVIATION_MARK = String.raw`["'\u05F4\u05F3]?`;

const PHRASE_EXPANSIONS = [
  {
    pattern: buildPhrasePattern([
      String.raw`קרן\s+סל`,
      String.raw`תעודת\s+סל`,
      String.raw`קרן\s+מחקה`,
    ]),
    terms: ['etf', 'קרן סל', 'תעודת סל', 'קרן מחקה'],
  },
  {
    pattern: buildPhrasePattern([
      String.raw`איגרת\s+חוב`,
      String.raw`איגרות\s+חוב`,
      String.raw`אג${ABBREVIATION_MARK}\s?ח`,
    ]),
    terms: ['אגח', 'אג"ח', 'איגרת חוב', 'איגרות חוב'],
  },
  {
    pattern: buildPhrasePattern([
      String.raw`קופת\s+גמל`,
      String.raw`קופ${ABBREVIATION_MARK}\s?ג`,
    ]),
    terms: ['קופת גמל', 'קופג', 'קופ"ג', 'גמל'],
  },
  {
    pattern: buildPhrasePattern([
      String.raw`מכפיל\s+רווח`,
      'p/e',
      'pe',
    ]),
    terms: ['מכפיל', 'מכפיל רווח', 'pe', 'p/e'],
  },
  {
    pattern: buildPhrasePattern([
      String.raw`מכירה\s+בחסר`,
      'short',
    ]),
    terms: ['שורט', 'short', 'מכירה בחסר'],
  },
];
140
+
141
+ // Generic filler words to ignore for lexical expansion
142
+ const STOP_WORDS = new Set([
143
+ 'של', 'על', 'עם', 'בלי', 'גם', 'או', 'אם', 'אבל', 'כי', 'זה', 'זאת', 'זו',
144
+ 'יש', 'אין', 'אני', 'אתה', 'את', 'הוא', 'היא', 'הם', 'הן', 'אנחנו', 'מה',
145
+ 'איך', 'למה', 'מתי', 'איפה', 'האם', 'כל', 'עוד', 'כמו', 'רק', 'מאוד', 'פחות',
146
+ 'יותר', 'אחרי', 'לפני', 'תוך', 'דרך', 'לגבי', 'בנוגע', 'בשביל', 'מול',
147
+ ]);
148
+
149
/**
 * Normalises Hebrew text for matching.
 *
 * - Maqaf (U+05BE, the Hebrew hyphen) becomes an ASCII hyphen rather than being
 *   deleted, so hyphenated compounds are not glued into one word.
 * - Paseq, sof pasuq and nun hafukha (punctuation inside the niqqud range) become spaces.
 * - Cantillation marks and niqqud are removed.
 * - Geresh, gershayim and typographic quotes are folded to ASCII ' and ".
 * - Whitespace is collapsed and trimmed.
 *
 * @param {*} text - Falsy values are treated as the empty string.
 * @returns {string}
 */
function normalizeHebrew(text) {
  return String(text || '')
    // punctuation that lives inside the U+0591–U+05C7 block
    .replace(/\u05BE/g, '-')
    .replace(/[\u05C0\u05C3\u05C6]/g, ' ')
    // remove niqqud / cantillation (combining marks only)
    .replace(/[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]/g, '')
    // normalize Hebrew and typographic punctuation variants
    .replace(/[\u05F3\u2018\u2019']/g, '\'')
    .replace(/[\u05F4\u201C\u201D"]/g, '"')
    // collapse whitespace
    .replace(/\s+/g, ' ')
    .trim();
}
160
+
161
/**
 * Normalises a single query token: Hebrew normalisation, lower-casing, then removal
 * of any leading or trailing characters that are neither letters nor digits.
 *
 * @param {*} token
 * @returns {string} Possibly empty.
 */
function normalizeToken(token) {
  const lowered = normalizeHebrew(token).toLowerCase();
  const edgeNoise = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;
  return lowered.replace(edgeNoise, '');
}
166
+
167
/**
 * De-duplicates terms case-insensitively, keeping the first spelling seen and the
 * original order, and stops once `maxTerms` terms have been collected.
 *
 * @param {Iterable<*>} terms
 * @param {number} [maxTerms=24]
 * @returns {string[]} Normalised, non-empty, unique terms.
 */
function uniqueTerms(terms, maxTerms = 24) {
  // Map keeps insertion order: lower-cased key -> first normalised spelling.
  const firstSpelling = new Map();

  for (const raw of terms) {
    const term = normalizeHebrew(raw).trim();
    if (term === '') continue;

    const key = term.toLowerCase();
    if (firstSpelling.has(key)) continue;

    firstSpelling.set(key, term);

    if (firstSpelling.size >= maxTerms) break;
  }

  return [...firstSpelling.values()];
}
186
+
187
/**
 * Expands a user query with phrase-level and token-level synonyms plus a little
 * Hebrew suffix stripping.
 *
 * Synonyms are looked up as OWN properties of QUERY_EXPANSIONS only. A plain
 * `QUERY_EXPANSIONS[token]` also returns inherited members for tokens such as
 * "constructor" or "__proto__", and spreading those values throws.
 *
 * @param {string} query
 * @returns {{ original: string, normalized: string, terms: string[], term: string }}
 *   `terms` holds at most 24 unique terms; `term` is those terms joined by spaces.
 */
function expandQuery(query) {
  const normalized = normalizeHebrew(query);
  const expanded = [normalized];

  for (const phraseRule of PHRASE_EXPANSIONS) {
    if (phraseRule.pattern.test(normalized)) {
      expanded.push(...phraseRule.terms);
    }
    // .test() on a global pattern is stateful; always start the next call from 0.
    phraseRule.pattern.lastIndex = 0;
  }

  const tokens = normalized
    .split(/[\s,/|()[\]{}:;!?]+/)
    .map(normalizeToken)
    .filter(Boolean)
    .filter(token => !STOP_WORDS.has(token));

  for (const token of tokens) {
    expanded.push(token);

    const hasEntry = Object.prototype.hasOwnProperty.call(QUERY_EXPANSIONS, token);
    const synonyms = hasEntry ? QUERY_EXPANSIONS[token] : undefined;
    if (Array.isArray(synonyms)) {
      expanded.push(...synonyms);
    }

    // A little morphology help for Hebrew singular/plural and abbreviations
    if (token.endsWith('ים') && token.length > 3) {
      expanded.push(token.slice(0, -2));
    }
    if (token.endsWith('ות') && token.length > 3) {
      expanded.push(token.slice(0, -2));
    }
    if (token.endsWith('ה') && token.length > 2) {
      expanded.push(token.slice(0, -1));
    }
  }

  const terms = uniqueTerms(expanded, 24);

  return {
    original: query,
    normalized,
    terms,
    // Orama lexical search receives one expanded term string
    term: terms.join(' '),
  };
}
234
+
235
/**
 * Turns an arbitrary field value into a trimmed string.
 *
 * null/undefined become ''. Arrays are flattened recursively, empty members are
 * dropped and the rest are joined with ', '. Anything else goes through String().
 *
 * @param {*} value
 * @returns {string}
 */
function coerceString(value) {
  if (value === null || value === undefined) return '';

  if (!Array.isArray(value)) {
    return String(value).trim();
  }

  const parts = [];
  for (const item of value) {
    const text = coerceString(item);
    if (text) parts.push(text);
  }

  return parts.join(', ');
}
247
+
248
/**
 * Maps one stored embedding row onto the document shape inserted into the index.
 * Text fields are passed through coerceString(); ids and the embedding are copied
 * as-is, and `id` is the post id rendered as a string.
 *
 * @param {object} row - Reads post_id, topic_id, title, category, tags,
 *   parent_content, content and embedding.
 * @returns {object}
 */
function buildDocument(row) {
  const { post_id: postId, topic_id: topicId, embedding } = row;
  const textOf = field => coerceString(row[field]);

  return {
    id: String(postId),
    post_id: postId,
    topic_id: topicId,
    title: textOf('title'),
    category: textOf('category'),
    tags: textOf('tags'),
    parent_content: textOf('parent_content'),
    content: textOf('content'),
    embedding,
  };
}
261
+
24
262
/**
 * Builds a fresh Orama index from every stored embedding.
 *
 * The vector dimension comes from the first row with a non-empty embedding,
 * falling back to 1536 (the text-embedding-3-small default) when there is none.
 * Rows whose embedding is missing, empty or of a different dimension are skipped and
 * counted in a warning. The batch embedding routine can return an empty vector for
 * text with no usable content.
 * NOTE(review): assumes such empty vectors can reach the store — confirm against the
 * code that saves embeddings.
 *
 * @returns {Promise<object>} The populated Orama database.
 */
async function buildIndex() {
  const storedEmbeddings = await getAllEmbeddings();
  const rows = Array.isArray(storedEmbeddings) ? storedEmbeddings : [];

  const hasVector = row => Boolean(row && row.embedding && row.embedding.length > 0);

  const firstUsable = rows.find(hasVector);
  const dimensions = firstUsable ? firstUsable.embedding.length : 1536;

  const usable = rows.filter(row => hasVector(row) && row.embedding.length === dimensions);
  const skipped = rows.length - usable.length;

  const db = await create({
    schema: {
      post_id: 'number',
      topic_id: 'number',
      title: 'string',
      category: 'string',
      tags: 'string',
      parent_content: 'string',
      content: 'string',
      embedding: `vector[${dimensions}]`,
    },
  });

  if (usable.length > 0) {
    await insertMultiple(db, usable.map(row => buildDocument(row)));
  }

  if (skipped > 0) {
    winston().warn(
      `[search-agent] vectorSearchService: skipped ${skipped} row(s) with a missing or mismatched embedding`
    );
  }

  winston().info(
    `[search-agent] vectorSearchService: Orama index built with ${usable.length} document(s)`
  );

  return db;
}
54
293
 
55
294
// Incremented on every invalidation. A build only publishes its result if the value
// is unchanged since the build started.
let _indexGeneration = 0;

/**
 * Returns the in-memory Orama index, rebuilding it when missing or older than
 * INDEX_TTL_MS. Concurrent callers share one build.
 *
 * A build that was already running when invalidateIndex() was called still resolves
 * for the callers awaiting it, but its result is not stored as the current index.
 *
 * @returns {Promise<object>}
 */
async function getDb() {
  const now = Date.now();
  if (_db && (now - _dbTs) < INDEX_TTL_MS) {
    return _db;
  }

  if (_buildPromise) {
    return _buildPromise;
  }

  const generation = _indexGeneration;

  const pending = buildIndex()
    .then((db) => {
      if (generation === _indexGeneration) {
        _db = db;
        _dbTs = Date.now();
      }
      return db;
    })
    .finally(() => {
      // Only clear the slot if it still holds this build; an invalidation may have
      // let a newer build take it over.
      if (_buildPromise === pending) {
        _buildPromise = null;
      }
    });

  _buildPromise = pending;
  return pending;
}

/**
 * Invalidate the in-memory Orama index (e.g. after new embeddings are saved).
 * Also detaches any build in flight, so the next getDb() call starts a fresh build
 * instead of reusing data read before the invalidation.
 */
function invalidateIndex() {
  _db = null;
  _dbTs = 0;
  _indexGeneration += 1;
  _buildPromise = null;
  winston().info('[search-agent] vectorSearchService: Orama index invalidated');
}
84
325
 
85
326
  /**
86
- * Performs semantic search against stored post embeddings using Orama vector search.
327
+ * Performs hybrid search against stored post embeddings using:
328
+ * 1. vector similarity on the original query embedding
329
+ * 2. lexical search on an expanded Hebrew query
330
+ * 3. field boosts to favor title/tags/category matches
87
331
  *
88
332
  * @param {string} query - The search query string.
89
- * @returns {Promise<Array<{ topic_id: number, post_id: number, content: string, score: number }>>}
90
- * Top results sorted by cosine similarity descending.
333
+ * @param {number} limit - Max results to return.
334
+ * @returns {Promise<Array<{ topic_id: number, post_id: number, title: string, category: string, tags: string, content: string, score: number }>>}
91
335
  */
92
336
  async function search(query, limit = TOP_K) {
93
- if (typeof query !== 'string' || query.trim() === '') {
94
- throw new Error('search() requires a non-empty query string');
95
- }
96
-
97
- winston().verbose(`[search-agent] vectorSearchService: running Orama vector search for "${query.trim()}"`);
98
-
99
- const [queryEmbedding, db] = await Promise.all([
100
- embed(query),
101
- getDb(),
102
- ]);
103
-
104
- const results = await oramaSearch(db, {
105
- mode: 'vector',
106
- vector: { value: queryEmbedding, property: 'embedding' },
107
- limit,
108
- similarity: 0.1,
109
- includeVectors: false,
110
- });
111
-
112
- winston().verbose(`[search-agent] vectorSearchService: Orama returned ${results.hits.length} hit(s)`);
113
-
114
- const filtered = results.hits.filter(hit => hit.score >= MIN_SCORE);
115
- winston().verbose(
116
- `[search-agent] vectorSearchService: ${filtered.length}/${results.hits.length} hit(s) passed noise floor (MIN_SCORE=${MIN_SCORE})`
117
- );
118
-
119
- return filtered.map(hit => ({
120
- topic_id: hit.document.topic_id,
121
- post_id: hit.document.post_id,
122
- content: hit.document.content,
123
- score: hit.score,
124
- }));
337
+ if (typeof query !== 'string' || query.trim() === '') {
338
+ throw new Error('search() requires a non-empty query string');
339
+ }
340
+
341
+ const trimmed = query.trim();
342
+ const expanded = expandQuery(trimmed);
343
+
344
+ winston().verbose(
345
+ `[search-agent] vectorSearchService: running Orama hybrid search for "${trimmed}" (expanded="${expanded.term}")`
346
+ );
347
+
348
+ const [queryEmbedding, db] = await Promise.all([
349
+ // Keep the embedding on the original query only.
350
+ // Expansion is mainly for lexical recall, especially in Hebrew forum language.
351
+ embed(trimmed),
352
+ getDb(),
353
+ ]);
354
+
355
+ const results = await oramaSearch(db, {
356
+ mode: 'hybrid',
357
+ term: expanded.term,
358
+ properties: SEARCH_PROPERTIES,
359
+ boost: FIELD_BOOSTS,
360
+ vector: {
361
+ value: queryEmbedding,
362
+ property: 'embedding',
363
+ },
364
+ limit,
365
+ similarity: VECTOR_SIMILARITY,
366
+ includeVectors: false,
367
+ });
368
+
369
+ const hits = Array.isArray(results && results.hits) ? results.hits : [];
370
+
371
+ winston().verbose(`[search-agent] vectorSearchService: Orama returned ${hits.length} hit(s)`);
372
+
373
+ const filtered = hits.filter(hit => typeof hit.score === 'number' && hit.score >= MIN_SCORE);
374
+
375
+ winston().verbose(
376
+ `[search-agent] vectorSearchService: ${filtered.length}/${hits.length} hit(s) passed noise floor (MIN_SCORE=${MIN_SCORE})`
377
+ );
378
+
379
+ return filtered.map(hit => ({
380
+ topic_id: hit.document.topic_id,
381
+ post_id: hit.document.post_id,
382
+ title: hit.document.title || '',
383
+ category: hit.document.category || '',
384
+ tags: hit.document.tags || '',
385
+ content: hit.document.content,
386
+ score: hit.score,
387
+ }));
125
388
  }
126
389
 
127
- module.exports = { search, invalidateIndex };
390
+ module.exports = {
391
+ search,
392
+ invalidateIndex
393
+ };