nodebb-plugin-search-agent 0.0.938 → 0.0.941
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cosineSimilarity.js +42 -0
- package/lib/searchHandler.js +41 -73
- package/library.js +2 -2
- package/package.json +1 -1
- package/services/embeddingService.js +102 -431
- package/services/vectorSearchService.js +93 -379
- package/test/testCosine.js +15 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
 * Computes the cosine similarity between two numeric vectors.
 *
 * Mismatched lengths are handled by comparing only the leading dimensions
 * shared by both vectors (the shorter vector's length).
 *
 * Returns 0 — "no similarity" — instead of throwing or returning NaN when:
 *   - either argument is missing or has no numeric `length`,
 *   - either vector has zero magnitude,
 *   - the computation is not finite (e.g. a NaN component).
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} similarity in [-1, 1]
 */
function cosineSimilarity(a, b) {
  // A missing vector (for example an item whose embedding was never stored)
  // is treated as unrelated rather than crashing the whole ranking.
  if (a == null || b == null || typeof a.length !== 'number' || typeof b.length !== 'number') {
    return 0;
  }

  const len = Math.min(a.length, b.length);
  let dot = 0;
  let magA = 0;
  let magB = 0;

  for (let i = 0; i < len; i++) {
    dot += a[i] * b[i];
    magA += a[i] * a[i];
    magB += b[i] * b[i];
  }

  const denom = Math.sqrt(magA) * Math.sqrt(magB);
  if (denom === 0) {
    return 0;
  }

  const similarity = dot / denom;
  // NaN would poison any sort comparator that consumes this score.
  if (!Number.isFinite(similarity)) {
    return 0;
  }

  // Floating-point rounding can yield values such as 1.0000000000000002;
  // clamp so the documented [-1, 1] range always holds.
  return Math.max(-1, Math.min(1, similarity));
}
|
|
27
|
+
|
|
28
|
+
/**
 * Ranks items by cosine similarity to a query embedding.
 *
 * Each item is expected to carry an `embedding` property (number[]). Items
 * without one — and items whose score is not a finite number — are kept in
 * the output with a score of 0 instead of aborting the whole ranking.
 * A non-array `items` argument yields an empty result.
 *
 * The input array is not mutated; a new array of `{ item, score }` pairs is
 * returned.
 *
 * @param {number[]} queryEmbedding
 * @param {Array<{embedding: number[], [key: string]: any}>} items
 * @returns {Array<{item: object, score: number}>} sorted descending by score
 */
function rankBySimilarity(queryEmbedding, items) {
  if (!Array.isArray(items)) {
    return [];
  }

  return items
    .map((item) => {
      const embedding = item == null ? undefined : item.embedding;
      const raw = embedding == null ? 0 : cosineSimilarity(queryEmbedding, embedding);
      // A NaN score would make the comparator below inconsistent, which
      // leaves the sort order unspecified.
      return { item, score: Number.isFinite(raw) ? raw : 0 };
    })
    .sort((a, b) => b.score - a.score);
}
|
|
41
|
+
|
|
42
|
+
module.exports = { cosineSimilarity, rankBySimilarity };
|
package/lib/searchHandler.js
CHANGED
|
@@ -1,9 +1,3 @@
|
|
|
1
|
-
// ─── Token estimation helper ───────────────────────────────────────────────
|
|
2
|
-
function estimateTokens(str) {
|
|
3
|
-
// Roughly 4 chars/token for English, 2 for Hebrew/UTF-8, but 4 is safe for cost estimation
|
|
4
|
-
return Math.ceil(str.length / 4);
|
|
5
|
-
}
|
|
6
|
-
|
|
7
1
|
'use strict';
|
|
8
2
|
|
|
9
3
|
const https = require('https');
|
|
@@ -181,18 +175,18 @@ function callOpenAI(apiKey, model, messages) {
|
|
|
181
175
|
* @param {string} model
|
|
182
176
|
* @returns {Promise<string>}
|
|
183
177
|
*/
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
178
|
+
async function expandQueryWithHyDE(queryText, apiKey, model) {
  // HyDE: ask the model to write a short hypothetical forum reply to the
  // query; the caller embeds that reply in place of the raw query text.
  const response = await callOpenAI(apiKey, model, [
    {
      role: 'system',
      content:
        'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
        'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
    },
    { role: 'user', content: queryText },
  ]);

  // Walk the response defensively: an empty `choices` array or a missing or
  // non-string `content` falls back to the raw query instead of throwing.
  const choices = response && Array.isArray(response.choices) ? response.choices : [];
  const message = choices.length > 0 && choices[0] ? choices[0].message : null;
  const content = message && typeof message.content === 'string' ? message.content.trim() : '';

  return content || queryText;
}
|
|
196
190
|
|
|
197
191
|
/**
|
|
198
192
|
* Send candidates to OpenAI for independent per-topic relevance scoring.
|
|
@@ -204,38 +198,25 @@ function callOpenAI(apiKey, model, messages) {
|
|
|
204
198
|
async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxResults, snippetByTid = {}) {
|
|
205
199
|
console.log('Re-ranking with AI:', { queryText, candidates: candidates.map(c => ({ tid: c.tid, title: (topicMap[String(c.tid)] || {}).title })) });
|
|
206
200
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
});
|
|
216
|
-
const postEmbeddings = await embedBatch(postSnippets);
|
|
217
|
-
|
|
218
|
-
// Format: [tid:..., embedding: [v1, v2, ...]]
|
|
219
|
-
const candidateList = candidates.map((c, i) => {
|
|
220
|
-
return `[tid:${c.tid}]\nembedding: [${postEmbeddings[i].slice(0, 8).map(x => x.toFixed(4)).join(', ')} ...]`;
|
|
221
|
-
}).join('\n\n');
|
|
201
|
+
// Only send the embedded query and the matched post snippet for each candidate
|
|
202
|
+
const candidateList = candidates
|
|
203
|
+
.map((c) => {
|
|
204
|
+
const raw = (snippetByTid[String(c.tid)] || '').replace(/<[^>]*>/g, ' ').replace(/[ \t]+/g, ' ').trim();
|
|
205
|
+
// Only send the snippet, not the title
|
|
206
|
+
return `[tid:${c.tid}]\n${raw.slice(0, 1500)}`;
|
|
207
|
+
})
|
|
208
|
+
.join('\n\n');
|
|
222
209
|
|
|
223
210
|
const systemPrompt =
|
|
224
211
|
'אתה מסנן חיפוש פורום מחמיר. ' +
|
|
225
|
-
'לכל מועמד ברשימה, דרג את הרלוונטיות של
|
|
212
|
+
'לכל מועמד ברשימה, דרג את הרלוונטיות של קטע הפוסט לשאלת המשתמש בסקלה 0-10: ' +
|
|
226
213
|
'10 = עונה ישירות ובאופן מלא. 7-9 = עונה על חלק משמעותי. 0-6 = לא רלוונטי. ' +
|
|
227
214
|
'החזר אך ורק JSON תקני במבנה: {"tid": ציון, ...} — לדוגמה: {"42": 9, "15": 3}. ' +
|
|
228
|
-
'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.'
|
|
229
|
-
|
|
230
|
-
const userMessage =
|
|
231
|
-
`embedding של שאלת המשתמש: [${queryEmbedding.slice(0, 8).map(x => x.toFixed(4)).join(', ')} ...]\n\nפוסטים:\n${candidateList}`;
|
|
215
|
+
'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.'+
|
|
216
|
+
'הוסף שדה נוסף "scoreExplanation" עם משפט קצר שמסביר למה קטע עם ציון נמוך לא רלוונטי.';
|
|
232
217
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
const embeddingTokens = estimateTokens(queryText) + postSnippets.reduce((sum, s) => sum + estimateTokens(s), 0);
|
|
236
|
-
const llmPromptTokens = estimateTokens(systemPrompt) + estimateTokens(userMessage);
|
|
237
|
-
const winston = require.main.require('winston');
|
|
238
|
-
winston.info(`[search-agent] Token usage: embedding API ≈ ${embeddingTokens} tokens, LLM prompt ≈ ${llmPromptTokens} tokens (for this search)`);
|
|
218
|
+
const userMessage =
|
|
219
|
+
`שאלת המשתמש (מוטמעת): "${queryText}"\n\nפוסטים:\n${candidateList}`;
|
|
239
220
|
|
|
240
221
|
const response = await callOpenAI(apiKey, model, [
|
|
241
222
|
{ role: 'system', content: systemPrompt },
|
|
@@ -255,25 +236,12 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
|
|
|
255
236
|
const scores = JSON.parse(match[0]);
|
|
256
237
|
const candidateByTid = Object.fromEntries(candidates.map(c => [String(c.tid), c]));
|
|
257
238
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
// If nothing passed the threshold, return the top scoring candidate (if any)
|
|
266
|
-
if (filtered.length === 0 && candidates.length > 0) {
|
|
267
|
-
// Find the tid with the highest score
|
|
268
|
-
const sortedAll = Object.entries(scores)
|
|
269
|
-
.sort(([, a], [, b]) => Number(b) - Number(a));
|
|
270
|
-
if (sortedAll.length > 0) {
|
|
271
|
-
const [topTid] = sortedAll[0];
|
|
272
|
-
const topCandidate = candidateByTid[topTid];
|
|
273
|
-
if (topCandidate) filtered = [topCandidate];
|
|
274
|
-
}
|
|
275
|
-
}
|
|
276
|
-
return filtered;
|
|
239
|
+
return Object.entries(scores)
|
|
240
|
+
.filter(([, score]) => Number(score) >= 7)
|
|
241
|
+
.sort(([, a], [, b]) => Number(b) - Number(a))
|
|
242
|
+
.slice(0, maxResults)
|
|
243
|
+
.map(([tid]) => candidateByTid[tid])
|
|
244
|
+
.filter(Boolean);
|
|
277
245
|
}
|
|
278
246
|
|
|
279
247
|
// ─── Public API ───────────────────────────────────────────────────────────────
|
|
@@ -305,16 +273,16 @@ async function searchTopics(queryText) {
|
|
|
305
273
|
// HyDE: replace the short raw query with a hypothetical answer so the
|
|
306
274
|
// embedding matches post content more closely.
|
|
307
275
|
let embeddingQuery = queryText;
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
276
|
+
if (useAI && settings.hydeEnabled) {
|
|
277
|
+
try {
|
|
278
|
+
embeddingQuery = await expandQueryWithHyDE(
|
|
279
|
+
queryText, settings.openaiApiKey, settings.openaiModel
|
|
280
|
+
);
|
|
281
|
+
winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
|
|
282
|
+
} catch (hydeErr) {
|
|
283
|
+
winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
|
|
284
|
+
}
|
|
285
|
+
}
|
|
318
286
|
|
|
319
287
|
// Request more candidates when AI will re-rank them.
|
|
320
288
|
const vectorLimit = useAI ? settings.aiCandidates : settings.maxResults;
|
package/library.js
CHANGED
|
@@ -30,13 +30,13 @@ plugin.init = async (params) => {
|
|
|
30
30
|
|
|
31
31
|
// Start initial embedding sync in the background — does not block NodeBB startup.
|
|
32
32
|
winston.info('[search-agent] Starting initial embedding sync…');
|
|
33
|
-
|
|
33
|
+
startSync().catch(err => winston.warn(`[search-agent] Initial sync failed: ${err.message}`));
|
|
34
34
|
|
|
35
35
|
// Re-sync every 10 minutes to pick up new posts.
|
|
36
36
|
const RESYNC_INTERVAL_MS = 10 * 60 * 1000;
|
|
37
37
|
setInterval(() => {
|
|
38
38
|
winston.info('[search-agent] Running scheduled embedding re-sync…');
|
|
39
|
-
|
|
39
|
+
startSync().catch(err => winston.warn(`[search-agent] Scheduled re-sync failed: ${err.message}`));
|
|
40
40
|
}, RESYNC_INTERVAL_MS).unref();
|
|
41
41
|
|
|
42
42
|
winston.info('[plugins/search-agent] Initialised.');
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nodebb-plugin-search-agent",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.941",
|
|
4
4
|
"description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
|
|
5
5
|
"main": "library.js",
|
|
6
6
|
"author": "Racheli Bayfus",
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
const https = require('https');
|
|
4
4
|
|
|
5
5
|
function winston() {
|
|
6
|
-
|
|
6
|
+
return require.main.require('winston');
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
const OPENAI_EMBEDDINGS_HOSTNAME = 'api.openai.com';
|
|
@@ -11,236 +11,21 @@ const OPENAI_EMBEDDINGS_PATH = '/v1/embeddings';
|
|
|
11
11
|
const EMBEDDING_MODEL = 'text-embedding-3-small';
|
|
12
12
|
const MAX_RETRIES = 3;
|
|
13
13
|
const RETRY_DELAY_MS = 500;
|
|
14
|
-
|
|
15
|
-
//
|
|
16
|
-
//
|
|
17
|
-
const MAX_CHARS =
|
|
18
|
-
const CHUNK_OVERLAP = 300;
|
|
19
|
-
const MIN_CHUNK_CHARS = 500;
|
|
20
|
-
const TARGET_CHUNK_CHARS = 2200;
|
|
21
|
-
|
|
22
|
-
// ─── Text cleanup ─────────────────────────────────────────────────────────────
|
|
23
|
-
|
|
24
|
-
function extractPureText(text) {
|
|
25
|
-
if (typeof text !== 'string') return '';
|
|
26
|
-
|
|
27
|
-
// Remove Markdown images: 
|
|
28
|
-
let cleaned = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
|
|
29
|
-
|
|
30
|
-
// Remove HTML <img> tags
|
|
31
|
-
cleaned = cleaned.replace(/<img\b[^>]*>/gi, '');
|
|
32
|
-
|
|
33
|
-
// Remove links to files/images (common extensions)
|
|
34
|
-
cleaned = cleaned.replace(
|
|
35
|
-
/https?:\/\/(\S+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?))(\?\S*)?/gi,
|
|
36
|
-
''
|
|
37
|
-
);
|
|
38
|
-
|
|
39
|
-
// Remove Markdown file links: [desc](url.ext)
|
|
40
|
-
cleaned = cleaned.replace(
|
|
41
|
-
/\[[^\]]*\]\([^)]*\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)\)/gi,
|
|
42
|
-
''
|
|
43
|
-
);
|
|
44
|
-
|
|
45
|
-
// Remove any remaining <a href=...> tags to files
|
|
46
|
-
cleaned = cleaned.replace(
|
|
47
|
-
/<a\b[^>]*href=["']?[^"'>]+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)[^>]*>.*?<\/a>/gi,
|
|
48
|
-
''
|
|
49
|
-
);
|
|
50
|
-
|
|
51
|
-
// Remove any remaining HTML tags
|
|
52
|
-
cleaned = cleaned.replace(/<[^>]+>/g, ' ');
|
|
53
|
-
|
|
54
|
-
// Remove extra whitespace
|
|
55
|
-
cleaned = cleaned
|
|
56
|
-
.replace(/[ \t]+/g, ' ')
|
|
57
|
-
.replace(/\n{2,}/g, '\n')
|
|
58
|
-
.trim();
|
|
59
|
-
|
|
60
|
-
return cleaned;
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
function normalizeWhitespace(text) {
|
|
64
|
-
return text
|
|
65
|
-
.replace(/\r\n/g, '\n')
|
|
66
|
-
.replace(/[ \t]+/g, ' ')
|
|
67
|
-
.replace(/\n{3,}/g, '\n\n')
|
|
68
|
-
.trim();
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
function splitIntoBlocks(text) {
|
|
72
|
-
const normalized = normalizeWhitespace(text);
|
|
73
|
-
|
|
74
|
-
const rawBlocks = normalized
|
|
75
|
-
.split(/\n{2,}|(?=^#{1,6}\s)|(?=^\s*[-*+]\s)|(?=^\s*\d+\.\s)|(?=^\s*>\s)|(?=^```)/gm)
|
|
76
|
-
.map(block => block.trim())
|
|
77
|
-
.filter(Boolean);
|
|
78
|
-
|
|
79
|
-
return rawBlocks;
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
function splitLargeBlock(block, maxLen = MAX_CHARS) {
|
|
83
|
-
if (block.length <= maxLen) return [block];
|
|
84
|
-
|
|
85
|
-
const sentences = block.match(/[^.!?\n]+[.!?\n]+|[^.!?\n]+$/g) || [block];
|
|
86
|
-
const chunks = [];
|
|
87
|
-
let current = '';
|
|
88
|
-
|
|
89
|
-
for (const sentence of sentences) {
|
|
90
|
-
const s = sentence.trim();
|
|
91
|
-
if (!s) continue;
|
|
92
|
-
|
|
93
|
-
if ((current + ' ' + s).trim().length <= maxLen) {
|
|
94
|
-
current = current ? `${current} ${s}` : s;
|
|
95
|
-
continue;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
if (current) {
|
|
99
|
-
chunks.push(current);
|
|
100
|
-
current = '';
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
// Fallback for very long sentence
|
|
104
|
-
if (s.length > maxLen) {
|
|
105
|
-
let i = 0;
|
|
106
|
-
while (i < s.length) {
|
|
107
|
-
chunks.push(s.slice(i, i + maxLen).trim());
|
|
108
|
-
i += maxLen;
|
|
109
|
-
}
|
|
110
|
-
} else {
|
|
111
|
-
current = s;
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
if (current) chunks.push(current);
|
|
116
|
-
|
|
117
|
-
return chunks;
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
function buildOverlapPrefix(prevChunk, overlapChars = CHUNK_OVERLAP) {
|
|
121
|
-
if (!prevChunk) return '';
|
|
122
|
-
return prevChunk.slice(Math.max(0, prevChunk.length - overlapChars)).trim();
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
function splitIntoSemanticChunks(
|
|
126
|
-
text,
|
|
127
|
-
{
|
|
128
|
-
maxLen = MAX_CHARS,
|
|
129
|
-
targetLen = TARGET_CHUNK_CHARS,
|
|
130
|
-
minLen = MIN_CHUNK_CHARS,
|
|
131
|
-
overlap = CHUNK_OVERLAP,
|
|
132
|
-
} = {}
|
|
133
|
-
) {
|
|
134
|
-
if (!text) return [];
|
|
135
|
-
if (text.length <= maxLen) return [text];
|
|
136
|
-
|
|
137
|
-
const blocks = splitIntoBlocks(text).flatMap(block => splitLargeBlock(block, maxLen));
|
|
138
|
-
|
|
139
|
-
const chunks = [];
|
|
140
|
-
let current = '';
|
|
141
|
-
|
|
142
|
-
for (const block of blocks) {
|
|
143
|
-
const next = current ? `${current}\n\n${block}` : block;
|
|
144
|
-
|
|
145
|
-
if (next.length <= targetLen || current.length < minLen) {
|
|
146
|
-
if (next.length <= maxLen) {
|
|
147
|
-
current = next;
|
|
148
|
-
continue;
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
if (current) {
|
|
153
|
-
chunks.push(current.trim());
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
current = block;
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
if (current) {
|
|
160
|
-
chunks.push(current.trim());
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
const withOverlap = chunks.map((chunk, i) => {
|
|
164
|
-
if (i === 0) return chunk;
|
|
165
|
-
|
|
166
|
-
const prefix = buildOverlapPrefix(chunks[i - 1], overlap);
|
|
167
|
-
const merged = prefix ? `${prefix}\n\n${chunk}` : chunk;
|
|
168
|
-
|
|
169
|
-
return merged.length <= maxLen ? merged : merged.slice(merged.length - maxLen);
|
|
170
|
-
});
|
|
171
|
-
|
|
172
|
-
const finalChunks = [];
|
|
173
|
-
for (const chunk of withOverlap) {
|
|
174
|
-
if (
|
|
175
|
-
finalChunks.length > 0 &&
|
|
176
|
-
chunk.length < minLen &&
|
|
177
|
-
finalChunks[finalChunks.length - 1].length + 2 + chunk.length <= maxLen
|
|
178
|
-
) {
|
|
179
|
-
finalChunks[finalChunks.length - 1] += `\n\n${chunk}`;
|
|
180
|
-
} else {
|
|
181
|
-
finalChunks.push(chunk);
|
|
182
|
-
}
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
return finalChunks;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
// ─── Vector helpers ───────────────────────────────────────────────────────────
|
|
189
|
-
|
|
190
|
-
function averageVectors(vectors) {
|
|
191
|
-
if (!vectors.length) return [];
|
|
192
|
-
const len = vectors[0].length;
|
|
193
|
-
const sum = new Array(len).fill(0);
|
|
194
|
-
|
|
195
|
-
for (const v of vectors) {
|
|
196
|
-
for (let i = 0; i < len; i++) {
|
|
197
|
-
sum[i] += v[i];
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
return sum.map(x => x / vectors.length);
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
function truncate(text) {
|
|
205
|
-
return text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) : text;
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
function estimateTokens(str) {
|
|
209
|
-
const ascii = /^[\x00-\x7F]*$/.test(str);
|
|
210
|
-
return ascii ? Math.ceil(str.length / 4) : Math.ceil(str.length / 1.5);
|
|
211
|
-
}
|
|
14
|
+
// text-embedding-3-small supports 8 192 tokens.
|
|
15
|
+
// Hebrew/non-ASCII text tokenizes at ~1.5–2 chars/token (UTF-8 multibyte).
|
|
16
|
+
// Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
|
|
17
|
+
const MAX_CHARS = 12000;
|
|
212
18
|
|
|
213
19
|
// ─── Embedding cache ──────────────────────────────────────────────────────────
|
|
214
|
-
|
|
20
|
+
// Avoids calling the embeddings API for the same text within a session.
|
|
21
|
+
// HyDE output varies, so the biggest wins come from repeated identical queries.
|
|
215
22
|
const _embedCache = new Map();
|
|
216
23
|
const EMBED_CACHE_MAX = 500;
|
|
217
24
|
|
|
218
|
-
function
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
const value = _embedCache.get(key);
|
|
222
|
-
|
|
223
|
-
// Refresh LRU-ish order
|
|
224
|
-
_embedCache.delete(key);
|
|
225
|
-
_embedCache.set(key, value);
|
|
226
|
-
|
|
227
|
-
return value;
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
function setCachedEmbedding(key, value) {
|
|
231
|
-
if (_embedCache.has(key)) {
|
|
232
|
-
_embedCache.delete(key);
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
while (_embedCache.size >= EMBED_CACHE_MAX) {
|
|
236
|
-
_embedCache.delete(_embedCache.keys().next().value);
|
|
237
|
-
}
|
|
238
|
-
|
|
239
|
-
_embedCache.set(key, value);
|
|
25
|
+
/**
 * Caps a text at MAX_CHARS UTF-16 code units so it fits the embedding input budget.
 *
 * If the cut would fall between the two halves of a surrogate pair (an emoji
 * or other astral character), the dangling high surrogate is dropped too, so
 * the result never ends in a lone surrogate.
 *
 * Deliberately takes a single argument: it is passed directly to
 * Array.prototype.map, which also supplies an index and the array.
 *
 * @param {string} text
 * @returns {string} `text` unchanged when short enough, otherwise its leading part
 */
function truncate(text) {
  if (text.length <= MAX_CHARS) {
    return text;
  }

  let end = MAX_CHARS;
  const lastUnit = text.charCodeAt(end - 1);
  // 0xD800–0xDBFF is the high-surrogate range: the first half of a pair.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.slice(0, end);
}
|
|
241
28
|
|
|
242
|
-
// ─── OpenAI request helpers ───────────────────────────────────────────────────
|
|
243
|
-
|
|
244
29
|
/**
|
|
245
30
|
* Performs an HTTPS POST request to the OpenAI embeddings endpoint.
|
|
246
31
|
* @param {string} apiKey
|
|
@@ -248,246 +33,132 @@ function setCachedEmbedding(key, value) {
|
|
|
248
33
|
* @returns {Promise<object>} Parsed JSON response body
|
|
249
34
|
*/
|
|
250
35
|
function requestEmbeddings(apiKey, input) {
  // Abort when the socket stays idle this long, so a stalled connection
  // surfaces as an error the caller can retry instead of hanging forever.
  const REQUEST_TIMEOUT_MS = 60 * 1000;

  return new Promise((resolve, reject) => {
    const body = JSON.stringify({ model: EMBEDDING_MODEL, input });
    const options = {
      hostname: OPENAI_EMBEDDINGS_HOSTNAME,
      path: OPENAI_EMBEDDINGS_PATH,
      method: 'POST',
      timeout: REQUEST_TIMEOUT_MS,
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        // Byte length, not string length: the body may contain multi-byte UTF-8.
        'Content-Length': Buffer.byteLength(body),
      },
    };

    const req = https.request(options, (res) => {
      const chunks = [];
      res.on('data', chunk => chunks.push(chunk));
      res.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
      res.on('end', () => {
        let parsed;
        try {
          parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
        } catch (e) {
          // Include the status so a non-JSON gateway page (e.g. a 502) is diagnosable.
          return reject(new Error(`Failed to parse OpenAI response (HTTP ${res.statusCode}): ${e.message}`));
        }

        if (res.statusCode >= 400) {
          // `parsed` may legally be null or a primitive; guard before reading `.error`.
          const message = (parsed && parsed.error && parsed.error.message) || `HTTP ${res.statusCode}`;
          return reject(new Error(`OpenAI API error: ${message}`));
        }

        resolve(parsed);
      });
    });

    // The 'timeout' event only signals idleness; the request must be destroyed
    // explicitly, which then emits 'error' and rejects the promise below.
    req.on('timeout', () => req.destroy(new Error(`request timed out after ${REQUEST_TIMEOUT_MS} ms`)));
    req.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
    req.write(body);
    req.end();
  });
}
|
|
292
74
|
|
|
293
75
|
/**
 * Retries an async operation with exponential back-off between attempts.
 *
 * The wait before retry k (1-based) is RETRY_DELAY_MS * 2^(k-1). With the
 * default of three attempts this is the same 1× then 2× RETRY_DELAY_MS
 * schedule as a linear back-off.
 *
 * At least one attempt is always made, even if `retries` is 0, negative or
 * not a number, so the function never rejects with `undefined`.
 *
 * @param {Function} fn - Async function to retry
 * @param {number} retries - Maximum number of attempts (including the first)
 * @returns {Promise<*>} Resolves with the first successful result of `fn`
 * @throws The error from the final failed attempt
 */
async function withRetry(fn, retries = MAX_RETRIES) {
  const attempts = Math.max(1, Math.floor(Number(retries)) || 1);
  let lastError;

  for (let attempt = 1; attempt <= attempts; attempt++) {
    try {
      return await fn();
    } catch (err) {
      lastError = err;
      if (attempt < attempts) {
        const delayMs = RETRY_DELAY_MS * 2 ** (attempt - 1);
        winston().warn(`[search-agent] embeddingService: attempt ${attempt} failed (${err.message}), retrying in ${delayMs} ms…`);
        await new Promise(resolve => setTimeout(resolve, delayMs));
      }
    }
  }

  throw lastError;
}
|
|
319
96
|
|
|
320
|
-
// ─── Public API ───────────────────────────────────────────────────────────────
|
|
321
|
-
|
|
322
97
|
/**
 * Converts a single text string into an embedding vector.
 *
 * The text is truncated to the embedding input budget and looked up in the
 * in-memory cache first. A hit is moved to the most-recently-used position,
 * so when the cache is full the least recently used entry is evicted rather
 * than simply the oldest.
 *
 * @param {string} text
 * @returns {Promise<number[]>}
 * @throws {Error} If `text` is empty or not a string, OPENAI_API_KEY is not
 *   set, the request fails after retries, or the response has no embedding.
 */
async function embed(text) {
  if (typeof text !== 'string' || text.trim() === '') {
    throw new Error('embed() requires a non-empty string');
  }

  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error('OPENAI_API_KEY environment variable is not set');
  }

  const safe = truncate(text);

  const cached = _embedCache.get(safe);
  if (cached !== undefined) {
    // A Map iterates in insertion order and eviction removes the first key,
    // so re-inserting marks this entry as the most recently used.
    _embedCache.delete(safe);
    _embedCache.set(safe, cached);
    winston().verbose('[search-agent] embeddingService: embedding cache hit');
    return cached;
  }

  winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
  const response = await withRetry(() => requestEmbeddings(apiKey, safe));

  const first = response && Array.isArray(response.data) ? response.data[0] : undefined;
  const embedding = first ? first.embedding : undefined;
  if (!Array.isArray(embedding)) {
    // Fail with a clear message and never cache a bogus value.
    throw new Error('OpenAI embeddings response contained no embedding');
  }
  winston().verbose('[search-agent] embeddingService: embedding generated successfully');

  if (_embedCache.size >= EMBED_CACHE_MAX) {
    _embedCache.delete(_embedCache.keys().next().value);
  }
  _embedCache.set(safe, embedding);

  return embedding;
}
|
|
390
131
|
|
|
391
132
|
/**
 * Converts an array of text strings into an array of embedding vectors.
 * Texts are truncated individually and sent in a single batched API request.
 *
 * The result has exactly one vector per input, in input order. A response
 * with a different number of items is rejected, so vectors can never be
 * silently paired with the wrong text.
 *
 * @param {string[]} texts
 * @returns {Promise<number[][]>}
 * @throws {Error} If `texts` is not a non-empty array of non-empty strings,
 *   OPENAI_API_KEY is not set, the request fails after retries, or the
 *   response does not contain one embedding per input.
 */
async function embedBatch(texts) {
  if (!Array.isArray(texts) || texts.length === 0) {
    throw new Error('embedBatch() requires a non-empty array of strings');
  }

  const invalid = texts.findIndex(t => typeof t !== 'string' || t.trim() === '');
  if (invalid !== -1) {
    throw new Error(`embedBatch() received an empty or non-string value at index ${invalid}`);
  }

  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error('OPENAI_API_KEY environment variable is not set');
  }

  const safeTexts = texts.map(text => truncate(text));
  winston().verbose(`[search-agent] embeddingService: generating batch embeddings for ${safeTexts.length} text(s)`);
  const response = await withRetry(() => requestEmbeddings(apiKey, safeTexts));

  const data = response && Array.isArray(response.data) ? response.data : [];
  if (data.length !== safeTexts.length) {
    throw new Error(`embedBatch() expected ${safeTexts.length} embedding(s) but received ${data.length}`);
  }
  winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${texts.length} vector(s))`);

  // Each item carries the index of the input it belongs to. Order by it
  // explicitly, on a copy so the response object is left untouched.
  return [...data]
    .sort((a, b) => a.index - b.index)
    .map(item => item.embedding);
}
|
|
487
163
|
|
|
488
|
-
module.exports = {
|
|
489
|
-
embed,
|
|
490
|
-
embedBatch,
|
|
491
|
-
extractPureText,
|
|
492
|
-
splitIntoSemanticChunks,
|
|
493
|
-
};
|
|
164
|
+
module.exports = { embed, embedBatch };
|
|
@@ -5,409 +5,123 @@ const { embed } = require('./embeddingService');
|
|
|
5
5
|
const { getAllEmbeddings } = require('./vectorStore');
|
|
6
6
|
|
|
7
7
|
function winston() {
|
|
8
|
-
|
|
8
|
+
return require.main.require('winston');
|
|
9
9
|
}
|
|
10
10
|
|
|
11
11
|
// Fetch this many candidates from Orama — cast a wide net so the AI has enough to choose from
|
|
12
12
|
const TOP_K = 50;
|
|
13
|
-
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
const MIN_SCORE = 0.
|
|
17
|
-
|
|
18
|
-
// Rebuild the Orama index after this interval
|
|
13
|
+
// Absolute minimum cosine similarity — only filters pure noise (near-zero similarity).
|
|
14
|
+
// Do NOT raise this: the relevant result often scores lower than irrelevant ones.
|
|
15
|
+
// The AI re-ranker (which reads content) is the precision gate, not this floor.
|
|
16
|
+
const MIN_SCORE = 0.10;
|
|
17
|
+
// Rebuild the Orama index after this interval (mirrors TF-IDF cache TTL)
|
|
19
18
|
const INDEX_TTL_MS = 5 * 60 * 1000;
|
|
20
19
|
|
|
21
|
-
// Hybrid search configuration
|
|
22
|
-
const VECTOR_SIMILARITY = 0.1;
|
|
23
|
-
const SEARCH_PROPERTIES = ['title', 'category', 'tags', 'content', 'parent_content'];
|
|
24
|
-
const FIELD_BOOSTS = {
|
|
25
|
-
title: 3.5,
|
|
26
|
-
tags: 2.8,
|
|
27
|
-
category: 2.2,
|
|
28
|
-
content: 1.0,
|
|
29
|
-
parent_content: 0.8,
|
|
30
|
-
};
|
|
31
|
-
|
|
32
20
|
let _db = null;
|
|
33
21
|
let _dbTs = 0;
|
|
34
22
|
let _buildPromise = null;
|
|
35
23
|
|
|
36
|
-
// Finance-heavy Hebrew forum query expansion.
|
|
37
|
-
// These are intentionally conservative: good recall lift without flooding the query.
|
|
38
|
-
const QUERY_EXPANSIONS = {
|
|
39
|
-
// General finance
|
|
40
|
-
'מניה': ['מניות', 'נייר ערך', 'שוק ההון', 'בורסה'],
|
|
41
|
-
'מניות': ['מניה', 'ניירות ערך', 'שוק ההון', 'בורסה'],
|
|
42
|
-
'אגח': ['אג"ח', 'איגרת חוב', 'איגרות חוב', 'חוב'],
|
|
43
|
-
'אג"ח': ['אגח', 'איגרת חוב', 'איגרות חוב', 'חוב'],
|
|
44
|
-
'קרן': ['קרנות', 'קרן נאמנות', 'קרן סל'],
|
|
45
|
-
'קרנות': ['קרן', 'קרן נאמנות', 'קרן סל'],
|
|
46
|
-
'קרן סל': ['etf', 'תעודת סל', 'קרן מחקה'],
|
|
47
|
-
'תעודת סל': ['etf', 'קרן סל', 'קרן מחקה'],
|
|
48
|
-
'etf': ['קרן סל', 'תעודת סל', 'קרן מחקה'],
|
|
49
|
-
'מדד': ['מדדים', 'מדד מניות', 'תשואת מדד'],
|
|
50
|
-
'מדדים': ['מדד', 'מדד מניות', 'תשואת מדד'],
|
|
51
|
-
'תיק': ['תיק השקעות', 'פיזור', 'החזקות'],
|
|
52
|
-
'השקעה': ['השקעות', 'להשקיע', 'תיק השקעות'],
|
|
53
|
-
'השקעות': ['השקעה', 'להשקיע', 'תיק השקעות'],
|
|
54
|
-
'להשקיע': ['השקעה', 'השקעות', 'תיק השקעות'],
|
|
55
|
-
'תשואה': ['רווח', 'תשואות', 'רווחיות'],
|
|
56
|
-
'רווח': ['רווחים', 'תשואה', 'רווחיות'],
|
|
57
|
-
'הפסד': ['הפסדים', 'ירידה', 'מינוס'],
|
|
58
|
-
'דיבידנד': ['דיבידנדים', 'חלוקת רווחים'],
|
|
59
|
-
'מכפיל': ['מכפיל רווח', 'pe', 'p/e'],
|
|
60
|
-
'pe': ['מכפיל', 'מכפיל רווח', 'p/e'],
|
|
61
|
-
'p/e': ['מכפיל', 'מכפיל רווח', 'pe'],
|
|
62
|
-
'מינוף': ['ממונף', 'הלוואה', 'מרגין', 'margin'],
|
|
63
|
-
'מרגין': ['margin', 'מינוף'],
|
|
64
|
-
'margin': ['מרגין', 'מינוף'],
|
|
65
|
-
'סיכון': ['סיכונים', 'תנודתיות', 'חשיפה'],
|
|
66
|
-
'נזילות': ['נזיל', 'מזומן', 'סחירות'],
|
|
67
|
-
'סחירות': ['נזילות', 'נזיל'],
|
|
68
|
-
|
|
69
|
-
// Tax / regulation
|
|
70
|
-
'מס': ['מיסוי', 'מסים', 'רשות המסים'],
|
|
71
|
-
'מיסוי': ['מס', 'מסים', 'רשות המסים'],
|
|
72
|
-
'מסים': ['מס', 'מיסוי', 'רשות המסים'],
|
|
73
|
-
'קיזוז': ['קיזוז הפסדים', 'מגן מס'],
|
|
74
|
-
'דוח': ['דו"ח', 'דיווח', 'טופס'],
|
|
75
|
-
'דו"ח': ['דוח', 'דיווח', 'טופס'],
|
|
76
|
-
|
|
77
|
-
// Savings / pension
|
|
78
|
-
'פנסיה': ['קרן פנסיה', 'חיסכון פנסיוני', 'קצבה'],
|
|
79
|
-
'גמל': ['קופת גמל', 'קופ"ג'],
|
|
80
|
-
'קופג': ['קופת גמל', 'קופ"ג', 'גמל'],
|
|
81
|
-
'קופ"ג': ['קופת גמל', 'קופג', 'גמל'],
|
|
82
|
-
'השתלמות': ['קרן השתלמות'],
|
|
83
|
-
'משכנתא': ['משכנתאות', 'ריבית', 'הלוואת דיור'],
|
|
84
|
-
'הלוואה': ['הלוואות', 'אשראי', 'מימון'],
|
|
85
|
-
'אשראי': ['הלוואה', 'הלוואות', 'מימון'],
|
|
86
|
-
|
|
87
|
-
// Trading / technical
|
|
88
|
-
'מסחר': ['טריידינג', 'קניה', 'מכירה', 'פקודה'],
|
|
89
|
-
'טריידינג': ['מסחר', 'מסחר יומי', 'קניה', 'מכירה'],
|
|
90
|
-
'שורט': ['short', 'מכירה בחסר'],
|
|
91
|
-
'short': ['שורט', 'מכירה בחסר'],
|
|
92
|
-
'לונג': ['long', 'החזקה'],
|
|
93
|
-
'long': ['לונג', 'החזקה'],
|
|
94
|
-
'פקודה': ['פקודות', 'לימיט', 'מרקט'],
|
|
95
|
-
'לימיט': ['limit', 'פקודת לימיט'],
|
|
96
|
-
'limit': ['לימיט', 'פקודת לימיט'],
|
|
97
|
-
'מרקט': ['market', 'פקודת שוק'],
|
|
98
|
-
'market': ['מרקט', 'פקודת שוק'],
|
|
99
|
-
|
|
100
|
-
// Crypto
|
|
101
|
-
'ביטקוין': ['btc', 'קריפטו', 'מטבע דיגיטלי'],
|
|
102
|
-
'btc': ['ביטקוין', 'קריפטו', 'מטבע דיגיטלי'],
|
|
103
|
-
'אתריום': ['eth', 'קריפטו', 'מטבע דיגיטלי'],
|
|
104
|
-
'eth': ['אתריום', 'קריפטו', 'מטבע דיגיטלי'],
|
|
105
|
-
'קריפטו': ['מטבע דיגיטלי', 'ביטקוין', 'אתריום', 'בלוקציין'],
|
|
106
|
-
'בלוקציין': ['קריפטו', 'מטבע דיגיטלי'],
|
|
107
|
-
|
|
108
|
-
// Hebrew forum / advice intent
|
|
109
|
-
'מומלץ': ['כדאי', 'המלצה', 'עדיף'],
|
|
110
|
-
'כדאי': ['מומלץ', 'המלצה', 'עדיף'],
|
|
111
|
-
'המלצה': ['מומלץ', 'כדאי', 'עדיף'],
|
|
112
|
-
'בעיה': ['תקלה', 'קושי', 'לא עובד'],
|
|
113
|
-
'תקלה': ['בעיה', 'לא עובד', 'שגיאה'],
|
|
114
|
-
'שגיאה': ['תקלה', 'בעיה', 'לא עובד'],
|
|
115
|
-
};
|
|
116
|
-
|
|
117
|
-
// Common phrase-level expansions that are better handled before token expansion.
|
|
118
|
-
const PHRASE_EXPANSIONS = [
|
|
119
|
-
{
|
|
120
|
-
pattern: /\b(?:קרן\s+סל|תעודת\s+סל|קרן\s+מחקה)\b/gi,
|
|
121
|
-
terms: ['etf', 'קרן סל', 'תעודת סל', 'קרן מחקה'],
|
|
122
|
-
},
|
|
123
|
-
{
|
|
124
|
-
pattern: /\b(?:איגרת\s+חוב|איגרות\s+חוב|אג["׳׳]?\s?ח)\b/gi,
|
|
125
|
-
terms: ['אגח', 'אג"ח', 'איגרת חוב', 'איגרות חוב'],
|
|
126
|
-
},
|
|
127
|
-
{
|
|
128
|
-
pattern: /\b(?:קופת\s+גמל|קופ["׳׳]?\s?ג)\b/gi,
|
|
129
|
-
terms: ['קופת גמל', 'קופג', 'קופ"ג', 'גמל'],
|
|
130
|
-
},
|
|
131
|
-
{
|
|
132
|
-
pattern: /\b(?:מכפיל\s+רווח|p\/e|pe)\b/gi,
|
|
133
|
-
terms: ['מכפיל', 'מכפיל רווח', 'pe', 'p/e'],
|
|
134
|
-
},
|
|
135
|
-
{
|
|
136
|
-
pattern: /\b(?:מכירה\s+בחסר|short)\b/gi,
|
|
137
|
-
terms: ['שורט', 'short', 'מכירה בחסר'],
|
|
138
|
-
},
|
|
139
|
-
];
|
|
140
|
-
|
|
141
|
-
// Generic filler words to ignore for lexical expansion
|
|
142
|
-
const STOP_WORDS = new Set([
|
|
143
|
-
'של', 'על', 'עם', 'בלי', 'גם', 'או', 'אם', 'אבל', 'כי', 'זה', 'זאת', 'זו',
|
|
144
|
-
'יש', 'אין', 'אני', 'אתה', 'את', 'הוא', 'היא', 'הם', 'הן', 'אנחנו', 'מה',
|
|
145
|
-
'איך', 'למה', 'מתי', 'איפה', 'האם', 'כל', 'עוד', 'כמו', 'רק', 'מאוד', 'פחות',
|
|
146
|
-
'יותר', 'אחרי', 'לפני', 'תוך', 'דרך', 'לגבי', 'בנוגע', 'בשביל', 'מול',
|
|
147
|
-
]);
|
|
148
|
-
|
|
149
|
-
function normalizeHebrew(text) {
|
|
150
|
-
if(!text) return text;
|
|
151
|
-
|
|
152
|
-
// Remove common prefixes
|
|
153
|
-
const prefixes = ['ה', 'ו', 'ב', 'ל', 'מ', 'ש', 'כ'];
|
|
154
|
-
for (const prefix of prefixes) {
|
|
155
|
-
if (text.startsWith(prefix) && text.length > 3) {
|
|
156
|
-
text = text.slice(1);
|
|
157
|
-
break;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
// Remove common plural suffixes
|
|
162
|
-
const pluralSuffixes = ['ים', 'ות'];
|
|
163
|
-
for (const suffix of pluralSuffixes) {
|
|
164
|
-
if (text.endsWith(suffix) && text.length > 3) {
|
|
165
|
-
text = text.slice(0, -suffix.length);
|
|
166
|
-
break;
|
|
167
|
-
}
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
return String(text || '')
|
|
171
|
-
// remove niqqud / cantillation
|
|
172
|
-
.replace(/[\u0591-\u05C7]/g, '')
|
|
173
|
-
// normalize Hebrew punctuation variants
|
|
174
|
-
.replace(/[׳']/g, '\'')
|
|
175
|
-
.replace(/[״"]/g, '"')
|
|
176
|
-
// collapse whitespace
|
|
177
|
-
.replace(/\s+/g, ' ')
|
|
178
|
-
.trim();
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
function normalizeToken(token) {
|
|
182
|
-
return normalizeHebrew(token)
|
|
183
|
-
.toLowerCase()
|
|
184
|
-
.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, '');
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
function uniqueTerms(terms, maxTerms = 24) {
|
|
188
|
-
const seen = new Set();
|
|
189
|
-
const out = [];
|
|
190
|
-
|
|
191
|
-
for (const raw of terms) {
|
|
192
|
-
const term = normalizeHebrew(raw).trim();
|
|
193
|
-
if (!term) continue;
|
|
194
|
-
|
|
195
|
-
const key = term.toLowerCase();
|
|
196
|
-
if (seen.has(key)) continue;
|
|
197
|
-
|
|
198
|
-
seen.add(key);
|
|
199
|
-
out.push(term);
|
|
200
|
-
|
|
201
|
-
if (out.length >= maxTerms) break;
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
return out;
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
function expandQuery(query) {
|
|
208
|
-
const normalized = normalizeHebrew(query);
|
|
209
|
-
const expanded = [normalized];
|
|
210
|
-
|
|
211
|
-
for (const phraseRule of PHRASE_EXPANSIONS) {
|
|
212
|
-
if (phraseRule.pattern.test(normalized)) {
|
|
213
|
-
expanded.push(...phraseRule.terms);
|
|
214
|
-
}
|
|
215
|
-
phraseRule.pattern.lastIndex = 0;
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
const tokens = normalized
|
|
219
|
-
.split(/[\s,/|()[\]{}:;!?]+/)
|
|
220
|
-
.map(normalizeToken)
|
|
221
|
-
.filter(Boolean)
|
|
222
|
-
.filter(token => !STOP_WORDS.has(token));
|
|
223
|
-
|
|
224
|
-
for (const token of tokens) {
|
|
225
|
-
expanded.push(token);
|
|
226
|
-
|
|
227
|
-
const synonyms = QUERY_EXPANSIONS[token];
|
|
228
|
-
if (synonyms) {
|
|
229
|
-
expanded.push(...synonyms);
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
// A little morphology help for Hebrew singular/plural and abbreviations
|
|
233
|
-
if (token.endsWith('ים') && token.length > 3) {
|
|
234
|
-
expanded.push(token.slice(0, -2));
|
|
235
|
-
}
|
|
236
|
-
if (token.endsWith('ות') && token.length > 3) {
|
|
237
|
-
expanded.push(token.slice(0, -2));
|
|
238
|
-
}
|
|
239
|
-
if (token.endsWith('ה') && token.length > 2) {
|
|
240
|
-
expanded.push(token.slice(0, -1));
|
|
241
|
-
}
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
const terms = uniqueTerms(expanded, 24);
|
|
245
|
-
|
|
246
|
-
return {
|
|
247
|
-
original: query,
|
|
248
|
-
normalized,
|
|
249
|
-
terms,
|
|
250
|
-
// Orama lexical search receives one expanded term string
|
|
251
|
-
term: terms.join(' '),
|
|
252
|
-
};
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
function coerceString(value) {
|
|
256
|
-
if (value == null) return '';
|
|
257
|
-
|
|
258
|
-
if (Array.isArray(value)) {
|
|
259
|
-
return value
|
|
260
|
-
.map(v => coerceString(v))
|
|
261
|
-
.filter(Boolean)
|
|
262
|
-
.join(', ');
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
return String(value).trim();
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
function buildDocument(row) {
|
|
269
|
-
return {
|
|
270
|
-
id: String(row.post_id),
|
|
271
|
-
post_id: row.post_id,
|
|
272
|
-
topic_id: row.topic_id,
|
|
273
|
-
title: coerceString(row.title),
|
|
274
|
-
category: coerceString(row.category),
|
|
275
|
-
tags: coerceString(row.tags),
|
|
276
|
-
parent_content: coerceString(row.parent_content),
|
|
277
|
-
content: coerceString(row.content),
|
|
278
|
-
embedding: row.embedding,
|
|
279
|
-
};
|
|
280
|
-
}
|
|
281
|
-
|
|
282
24
|
/**
 * Builds a fresh Orama vector index from the stored post embeddings.
 *
 * The vector dimension is taken from the first stored row that carries a
 * non-empty embedding; when there is none it falls back to 1536
 * (text-embedding-3-small default). Rows whose embedding is missing, empty or
 * of a different length are skipped and reported, so that a single malformed
 * row cannot make the whole index build (and therefore every search) fail.
 *
 * @returns {Promise<object>} the populated Orama database instance
 */
async function buildIndex() {
  const storedEmbeddings = await getAllEmbeddings();
  const rows = Array.isArray(storedEmbeddings) ? storedEmbeddings : [];

  // Accept plain arrays as well as typed arrays (e.g. Float32Array).
  const hasVector = row => Boolean(row) &&
    (Array.isArray(row.embedding) || ArrayBuffer.isView(row.embedding)) &&
    row.embedding.length > 0;

  const firstUsable = rows.find(hasVector);
  const dimensions = firstUsable ? firstUsable.embedding.length : 1536;

  // Only vectors matching the schema dimension can be inserted.
  const usable = rows.filter(row => hasVector(row) && row.embedding.length === dimensions);
  const skipped = rows.length - usable.length;

  const db = await create({
    schema: {
      post_id: 'number',
      topic_id: 'number',
      content: 'string',
      embedding: `vector[${dimensions}]`,
    },
  });

  if (usable.length > 0) {
    await insertMultiple(db, usable.map(e => ({
      id: String(e.post_id),
      post_id: e.post_id,
      topic_id: e.topic_id,
      content: e.content,
      embedding: e.embedding,
    })));
  }

  if (skipped > 0) {
    winston().warn(`[search-agent] vectorSearchService: skipped ${skipped} stored embedding(s) with a missing or non-${dimensions}-dimensional vector`);
  }

  winston().info(`[search-agent] vectorSearchService: Orama index built with ${usable.length} document(s)`);
  return db;
}
|
|
313
54
|
|
|
314
55
|
/**
 * Returns the in-memory Orama index, (re)building it when it is missing or
 * older than INDEX_TTL_MS.
 *
 * Concurrent callers share a single in-flight build. A build only publishes
 * its result into the cache while it is still the current build: if
 * invalidateIndex() ran in the meantime, the result is handed to the callers
 * that were already waiting for it but is not cached, so the next call
 * rebuilds from the up-to-date store.
 *
 * @returns {Promise<object>} the Orama database instance
 */
async function getDb() {
  if (_db && (Date.now() - _dbTs) < INDEX_TTL_MS) {
    return _db;
  }

  if (_buildPromise) {
    return _buildPromise;
  }

  // The callbacks below run asynchronously, i.e. after `pending` is assigned.
  const pending = buildIndex().then((db) => {
    if (_buildPromise === pending) {
      _db = db;
      _dbTs = Date.now();
      _buildPromise = null;
    }
    return db;
  }).catch((err) => {
    // Clear the slot so that the next call retries, but never clobber a newer build.
    if (_buildPromise === pending) {
      _buildPromise = null;
    }
    throw err;
  });

  _buildPromise = pending;
  return pending;
}

/**
 * Invalidate the in-memory Orama index (e.g. after new embeddings are saved).
 *
 * Also detaches any build that is currently in flight: that build may have
 * read the store before the new embeddings were written, so its result must
 * not be cached as if it were fresh.
 */
function invalidateIndex() {
  _db = null;
  _dbTs = 0;
  _buildPromise = null;
  winston().info('[search-agent] vectorSearchService: Orama index invalidated');
}
|
|
345
84
|
|
|
346
85
|
/**
 * Performs semantic search against stored post embeddings using Orama vector search.
 *
 * @param {string} query - The search query string.
 * @param {number} [limit=TOP_K] - Maximum number of hits to request from Orama.
 *   Anything that is not a positive integer falls back to TOP_K.
 * @returns {Promise<Array<{ topic_id: number, post_id: number, content: string, score: number }>>}
 *   Hits scoring at least MIN_SCORE, in the order Orama returned them.
 * @throws {Error} When `query` is not a non-empty string.
 */
async function search(query, limit = TOP_K) {
  if (typeof query !== 'string' || query.trim() === '') {
    throw new Error('search() requires a non-empty query string');
  }

  const trimmedQuery = query.trim();
  // The default parameter only covers `undefined`; guard against 0, null, NaN, negatives.
  const effectiveLimit = Number.isInteger(limit) && limit > 0 ? limit : TOP_K;

  winston().verbose(`[search-agent] vectorSearchService: running Orama vector search for "${trimmedQuery}"`);

  // Embedding the query and (re)building the index are independent, so run them in parallel.
  const [queryEmbedding, db] = await Promise.all([
    embed(trimmedQuery),
    getDb(),
  ]);

  const results = await oramaSearch(db, {
    mode: 'vector',
    vector: { value: queryEmbedding, property: 'embedding' },
    limit: effectiveLimit,
    // Same noise floor as the post-filter below, kept in a single constant.
    similarity: MIN_SCORE,
    includeVectors: false,
  });

  const hits = Array.isArray(results && results.hits) ? results.hits : [];
  winston().verbose(`[search-agent] vectorSearchService: Orama returned ${hits.length} hit(s)`);

  const filtered = hits.filter(hit => typeof hit.score === 'number' && hit.score >= MIN_SCORE);
  winston().verbose(
    `[search-agent] vectorSearchService: ${filtered.length}/${hits.length} hit(s) passed noise floor (MIN_SCORE=${MIN_SCORE})`
  );

  return filtered.map(hit => ({
    topic_id: hit.document.topic_id,
    post_id: hit.document.post_id,
    content: hit.document.content,
    score: hit.score,
  }));
}
|
|
409
126
|
|
|
410
|
-
module.exports = {
|
|
411
|
-
search,
|
|
412
|
-
invalidateIndex,
|
|
413
|
-
};
|
|
127
|
+
module.exports = { search, invalidateIndex };
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
const { cosineSimilarity } = require('../lib/cosineSimilarity');
|
|
2
|
+
|
|
3
|
+
/**
 * Smoke test for `cosineSimilarity`.
 *
 * Each case is compared against its expected value with a small tolerance.
 * The comparison is written as `!(diff <= EPSILON)` so that a NaN or
 * undefined result fails the test instead of slipping through (every
 * ordering comparison against NaN is false).
 *
 * @throws {Error} When any case is not within tolerance of its expected value.
 */
function testCosine() {
  console.log("Testing cosine similarity...");

  const EPSILON = 1e-9;
  const cases = [
    { a: [1, 0], b: [1, 0], expected: 1, message: "Expected high similarity" },
    { a: [1, 0], b: [0, 1], expected: 0, message: "Expected low similarity" },
    { a: [1, 0], b: [-1, 0], expected: -1, message: "Expected negative similarity for opposite vectors" },
    { a: [0, 0], b: [1, 0], expected: 0, message: "Expected zero similarity for a zero-magnitude vector" },
  ];

  for (const { a, b, expected, message } of cases) {
    const actual = cosineSimilarity(a, b);
    if (!(Math.abs(actual - expected) <= EPSILON)) {
      throw new Error(message);
    }
  }

  console.log("✅ Cosine OK");
}
|
|
14
|
+
|
|
15
|
+
module.exports = testCosine;
|