nodebb-plugin-search-agent 0.0.931 → 0.0.932

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -181,18 +181,18 @@ function callOpenAI(apiKey, model, messages) {
181
181
  * @param {string} model
182
182
  * @returns {Promise<string>}
183
183
  */
184
async function expandQueryWithHyDE(queryText, apiKey, model) {
  // System prompt (Hebrew): asks the model to answer the search question as a
  // short, realistic forum reply containing only the post body.
  const systemPrompt =
    'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
    'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.';

  const messages = [
    { role: 'system', content: systemPrompt },
    { role: 'user', content: queryText },
  ];

  const response = await callOpenAI(apiKey, model, messages);

  // An empty or whitespace-only completion falls back to the raw query text.
  const hypotheticalPost = (response.choices[0].message.content || '').trim();
  return hypotheticalPost || queryText;
}
184
+ // async function expandQueryWithHyDE(queryText, apiKey, model) {
185
+ // const response = await callOpenAI(apiKey, model, [
186
+ // {
187
+ // role: 'system',
188
+ // content:
189
+ // 'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
190
+ // 'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
191
+ // },
192
+ // { role: 'user', content: queryText },
193
+ // ]);
194
+ // return (response.choices[0].message.content || '').trim() || queryText;
195
+ // }
196
196
 
197
197
  /**
198
198
  * Send candidates to OpenAI for independent per-topic relevance scoring.
@@ -225,9 +225,8 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
225
225
  'לכל מועמד ברשימה, דרג את הרלוונטיות של embedding הפוסט לembedding של השאלה בסקלה 0-10: ' +
226
226
  '10 = עונה ישירות ובאופן מלא. 7-9 = עונה על חלק משמעותי. 0-6 = לא רלוונטי. ' +
227
227
  'החזר אך ורק JSON תקני במבנה: {"tid": ציון, ...} — לדוגמה: {"42": 9, "15": 3}. ' +
228
- 'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.'+
229
- 'הוסף שדה נוסף "scoreExplanation" עם משפט קצר שמסביר לפי מה נעשה הדירוג.';
230
-
228
+ 'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.';
229
+
231
230
  const userMessage =
232
231
  `embedding של שאלת המשתמש: [${queryEmbedding.slice(0, 8).map(x => x.toFixed(4)).join(', ')} ...]\n\nפוסטים:\n${candidateList}`;
233
232
 
@@ -293,16 +292,16 @@ async function searchTopics(queryText) {
293
292
  // HyDE: replace the short raw query with a hypothetical answer so the
294
293
  // embedding matches post content more closely.
295
294
  let embeddingQuery = queryText;
296
- if (useAI && settings.hydeEnabled) {
297
- try {
298
- embeddingQuery = await expandQueryWithHyDE(
299
- queryText, settings.openaiApiKey, settings.openaiModel
300
- );
301
- winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
302
- } catch (hydeErr) {
303
- winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
304
- }
305
- }
295
+ // if (useAI && settings.hydeEnabled) {
296
+ // try {
297
+ // embeddingQuery = await expandQueryWithHyDE(
298
+ // queryText, settings.openaiApiKey, settings.openaiModel
299
+ // );
300
+ // winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
301
+ // } catch (hydeErr) {
302
+ // winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
303
+ // }
304
+ // }
306
305
 
307
306
  // Request more candidates when AI will re-rank them.
308
307
  const vectorLimit = useAI ? settings.aiCandidates : settings.maxResults;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nodebb-plugin-search-agent",
3
- "version": "0.0.931",
3
+ "version": "0.0.932",
4
4
  "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
5
5
  "main": "library.js",
6
6
  "author": "Racheli Bayfus",
@@ -14,7 +14,34 @@ const RETRY_DELAY_MS = 500;
14
14
  // text-embedding-3-small supports 8 192 tokens.
15
15
  // Hebrew/non-ASCII text tokenizes at ~1.5–2 chars/token (UTF-8 multibyte).
16
16
  // Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
17
+
17
18
  const MAX_CHARS = 12000;
19
+ const CHUNK_OVERLAP = 2000; // chars to overlap between chunks for context
20
+
21
/**
 * Split a string into consecutive chunks of at most `maxLen` characters,
 * where each chunk after the first repeats the last `overlap` characters of
 * the previous one so context is not lost at chunk boundaries.
 *
 * Lengths are measured in UTF-16 code units (the unit used by
 * `String.prototype.slice`), not in tokens or code points.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxLen=MAX_CHARS] - Maximum chunk length; must be a positive integer.
 * @param {number} [overlap=CHUNK_OVERLAP] - Characters shared between neighbouring
 *   chunks; must be an integer in the range [0, maxLen).
 * @returns {string[]} `[text]` when the text already fits, otherwise the ordered chunks.
 * @throws {RangeError} When the text needs splitting and `maxLen` / `overlap`
 *   would not let the window move forward.
 */
function splitIntoChunks(text, maxLen = MAX_CHARS, overlap = CHUNK_OVERLAP) {
  // Text that already fits is returned untouched, whatever the overlap is.
  if (text.length <= maxLen) return [text];

  if (!Number.isInteger(maxLen) || maxLen <= 0) {
    throw new RangeError(`splitIntoChunks: maxLen must be a positive integer (got ${maxLen})`);
  }
  // overlap >= maxLen gives a step of zero or less, so the loop below would
  // never terminate; a negative overlap would skip text between chunks.
  if (!Number.isInteger(overlap) || overlap < 0 || overlap >= maxLen) {
    throw new RangeError(
      `splitIntoChunks: overlap must be an integer in [0, ${maxLen}) (got ${overlap})`
    );
  }

  const step = maxLen - overlap;
  const chunks = [];
  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, start + maxLen));
    // Stop as soon as a chunk reaches the end of the text, so no trailing
    // chunk made only of already-covered overlap is produced.
    if (start + maxLen >= text.length) break;
  }
  return chunks;
}
34
+
35
/**
 * Compute the element-wise arithmetic mean of a list of equal-length vectors.
 *
 * @param {number[][]} vectors - Vectors to average; all must share one length.
 * @returns {number[]} The mean vector, or `[]` when `vectors` is empty.
 * @throws {RangeError} When a vector's length differs from the first one's.
 *   Without this check a shorter vector would turn components into NaN and a
 *   longer one would have its extra dimensions silently ignored.
 */
function averageVectors(vectors) {
  if (!vectors.length) return [];

  const dim = vectors[0].length;
  const sum = new Array(dim).fill(0);

  vectors.forEach((vector, position) => {
    if (vector.length !== dim) {
      throw new RangeError(
        `averageVectors: vector ${position} has length ${vector.length}, expected ${dim}`
      );
    }
    for (let i = 0; i < dim; i++) sum[i] += vector[i];
  });

  return sum.map((total) => total / vectors.length);
}
18
45
 
19
46
  // ─── Embedding cache ──────────────────────────────────────────────────────────
20
47
  // Avoids calling the embeddings API for the same text within a session.
@@ -109,24 +136,36 @@ async function embed(text) {
109
136
  throw new Error('OPENAI_API_KEY environment variable is not set');
110
137
  }
111
138
 
112
- const safe = truncate(text);
113
-
114
- if (_embedCache.has(safe)) {
115
- winston().verbose('[search-agent] embeddingService: embedding cache hit');
116
- return _embedCache.get(safe);
117
- }
118
-
119
- winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
120
- const response = await withRetry(() => requestEmbeddings(apiKey, safe));
121
- winston().verbose('[search-agent] embeddingService: embedding generated successfully');
122
- const embedding = response.data[0].embedding;
123
-
124
- if (_embedCache.size >= EMBED_CACHE_MAX) {
125
- _embedCache.delete(_embedCache.keys().next().value);
139
+ // Split into chunks if too long
140
+ const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
141
+ if (chunks.length === 1) {
142
+ const safe = truncate(text);
143
+ if (_embedCache.has(safe)) {
144
+ winston().verbose('[search-agent] embeddingService: embedding cache hit');
145
+ return _embedCache.get(safe);
146
+ }
147
+ winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
148
+ const response = await withRetry(() => requestEmbeddings(apiKey, safe));
149
+ winston().verbose('[search-agent] embeddingService: embedding generated successfully');
150
+ const embedding = response.data[0].embedding;
151
+ if (_embedCache.size >= EMBED_CACHE_MAX) {
152
+ _embedCache.delete(_embedCache.keys().next().value);
153
+ }
154
+ _embedCache.set(safe, embedding);
155
+ return embedding;
156
+ } else {
157
+ // For multi-chunk, embed all and average
158
+ winston().verbose(`[search-agent] embeddingService: splitting long text into ${chunks.length} chunks for embedding`);
159
+ const vectors = await embedBatch(chunks);
160
+ const avg = averageVectors(vectors);
161
+ // Optionally cache the average for the full text
162
+ const safe = truncate(text);
163
+ if (_embedCache.size >= EMBED_CACHE_MAX) {
164
+ _embedCache.delete(_embedCache.keys().next().value);
165
+ }
166
+ _embedCache.set(safe, avg);
167
+ return avg;
126
168
  }
127
- _embedCache.set(safe, embedding);
128
-
129
- return embedding;
130
169
  }
131
170
 
132
171
  /**
@@ -150,15 +189,32 @@ async function embedBatch(texts) {
150
189
  throw new Error('OPENAI_API_KEY environment variable is not set');
151
190
  }
152
191
 
153
- const safeTexts = texts.map(truncate);
154
- winston().verbose(`[search-agent] embeddingService: generating batch embeddings for ${safeTexts.length} text(s)`);
155
- const response = await withRetry(() => requestEmbeddings(apiKey, safeTexts));
156
- winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${texts.length} vector(s))`);
157
-
158
- // OpenAI returns items sorted by index field, but sort explicitly to be safe
159
- return response.data
160
- .sort((a, b) => a.index - b.index)
161
- .map(item => item.embedding);
192
+ // For each text, if too long, split and average embeddings
193
+ const allChunks = [];
194
+ const chunkMap = [];
195
+ for (const text of texts) {
196
+ const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
197
+ chunkMap.push({ count: chunks.length });
198
+ allChunks.push(...chunks);
199
+ }
200
+ winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
201
+ const safeChunks = allChunks.map(truncate);
202
+ const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
203
+ winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`);
204
+ const vectors = response.data.sort((a, b) => a.index - b.index).map(item => item.embedding);
205
+ // Recombine chunk embeddings for each original text
206
+ const result = [];
207
+ let idx = 0;
208
+ for (const { count } of chunkMap) {
209
+ if (count === 1) {
210
+ result.push(vectors[idx]);
211
+ idx += 1;
212
+ } else {
213
+ result.push(averageVectors(vectors.slice(idx, idx + count)));
214
+ idx += count;
215
+ }
216
+ }
217
+ return result;
162
218
  }
163
219
 
164
220
  module.exports = { embed, embedBatch };
@@ -1,42 +0,0 @@
1
- 'use strict';
2
-
3
/**
 * Cosine similarity of two numeric vectors.
 *
 * When the vectors differ in length only the leading components they both
 * have are compared. A vector with zero magnitude yields a similarity of 0.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} similarity in [-1, 1]
 */
function cosineSimilarity(a, b) {
  const dims = Math.min(a.length, b.length);

  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;

  let i = 0;
  while (i < dims) {
    const x = a[i];
    const y = b[i];
    dotProduct += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    i += 1;
  }

  const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}
27
-
28
/**
 * Score every item against a query embedding and order them best-first.
 * Each item is expected to carry its own vector in an `embedding` property.
 *
 * @param {number[]} queryEmbedding
 * @param {Array<{embedding: number[], [key: string]: any}>} items
 * @returns {Array<{item: object, score: number}>} sorted descending by score
 */
function rankBySimilarity(queryEmbedding, items) {
  const scored = items.map((candidate) => {
    const score = cosineSimilarity(queryEmbedding, candidate.embedding);
    return { item: candidate, score };
  });

  scored.sort((left, right) => right.score - left.score);
  return scored;
}
41
-
42
- module.exports = { cosineSimilarity, rankBySimilarity };
package/lib/similarity.js DELETED
@@ -1,125 +0,0 @@
1
- 'use strict';
2
-
3
// Common stop-words (English + Hebrew) that tokenize() drops before TF-IDF
// vectors are built. Each word is listed once; lookups go through Set#has.
const STOP_WORDS = new Set([
  // English
  'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
  'of', 'with', 'by', 'from', 'is', 'it', 'its', 'be', 'as', 'was',
  'are', 'were', 'been', 'have', 'has', 'had', 'do', 'does', 'did',
  'will', 'would', 'could', 'should', 'may', 'might', 'can', 'shall',
  'not', 'no', 'so', 'if', 'this', 'that', 'these', 'those', 'i', 'we',
  'you', 'he', 'she', 'they', 'my', 'your', 'his', 'her', 'our', 'their',
  'how', 'what', 'when', 'where', 'why', 'who', 'which', 'am', 'up',
  'out', 'about', 'into', 'than', 'more', 'also', 'me', 'him', 'us', 'them',
  // Hebrew
  'של', 'את', 'אל', 'על', 'עם', 'הם', 'הן', 'זה', 'זו', 'זאת',
  'כי', 'לא', 'כן', 'יש', 'אם', 'רק', 'גם', 'אבל', 'אנחנו', 'אני',
  'אתה', 'הוא', 'היא', 'אנו', 'אתם', 'אתן',
  'אלה', 'אלו', 'כל', 'כך', 'כבר', 'עוד',
  'אחד', 'יותר', 'פה', 'שם', 'מה', 'מי', 'איך', 'מתי', 'איפה',
  'היה', 'הייתה', 'יהיה', 'תהיה',
  'אסור', 'מותר', 'צריך', 'רוצה', 'יכול', 'יכולה', 'לו', 'לה',
  'בו', 'בה', 'עליו', 'עליה', 'בין', 'עכשיו', 'היום',
  'לכן', 'כדי', 'כדאי', 'שלי', 'שלך', 'שלו', 'שלה',
  'שלנו', 'שלכם', 'שלהם', 'שלהן', 'להם', 'להן', 'לנו', 'לכם',
]);
26
-
27
/**
 * Turn raw (possibly HTML) text into a list of index terms: tags are removed,
 * the text is lower-cased, every character that is not a Unicode letter,
 * digit or whitespace becomes a space, and the result is split on whitespace.
 * Tokens shorter than two characters and stop-words are discarded.
 *
 * @param {string|null|undefined} text
 * @returns {string[]}
 */
function tokenize(text) {
  if (!text) return [];

  const withoutTags = text.replace(/<[^>]*>/g, ' ');
  // \p{L} / \p{N} keep letters and digits of every script (Hebrew, Latin, ...).
  const normalized = withoutTags.toLowerCase().replace(/[^\p{L}\p{N}\s]/gu, ' ');

  const tokens = [];
  for (const candidate of normalized.split(/\s+/)) {
    // Two characters is the minimum so short Hebrew words survive.
    if (candidate.length < 2) continue;
    if (STOP_WORDS.has(candidate)) continue;
    tokens.push(candidate);
  }
  return tokens;
}
41
-
42
/**
 * Build a TF-IDF index over a list of topics.
 * The text of a topic is its title followed by its main post content.
 *
 * @param {{ tid: number|string, slug: string, title: string, mainPostContent?: string }[]} topics
 * @returns {{ tid: number|string, slug: string, vector: Map<string, number> }[]}
 *   One entry per topic, in input order; empty when there are no topics.
 */
function buildIndex(topics) {
  if (!topics || topics.length === 0) return [];

  // Pass 1: raw term counts for each document.
  const documents = topics.map((topic) => {
    const terms = tokenize(`${topic.title || ''} ${topic.mainPostContent || ''}`);
    const counts = new Map();
    terms.forEach((term) => {
      counts.set(term, (counts.get(term) || 0) + 1);
    });
    return { tid: topic.tid, slug: topic.slug, tf: counts, len: terms.length };
  });

  // Pass 2: in how many documents does each term occur?
  const documentFrequency = new Map();
  documents.forEach(({ tf }) => {
    for (const term of tf.keys()) {
      documentFrequency.set(term, (documentFrequency.get(term) || 0) + 1);
    }
  });

  const totalDocuments = documents.length;

  // Pass 3: weight each term by length-normalised TF times smoothed IDF.
  return documents.map(({ tid, slug, tf, len }) => {
    const vector = new Map();
    tf.forEach((count, term) => {
      const termFrequency = len > 0 ? count / len : 0;
      // The +1 terms smooth the IDF and avoid a division by zero.
      const inverseDocFrequency =
        Math.log((totalDocuments + 1) / (documentFrequency.get(term) + 1)) + 1;
      vector.set(term, termFrequency * inverseDocFrequency);
    });
    return { tid, slug, vector };
  });
}
83
-
84
/**
 * Rank indexed documents against a free-text query using cosine similarity.
 *
 * The query is tokenized the same way as the documents and represented by its
 * raw term counts; each document is represented by its TF-IDF vector from the
 * index. Only documents with a strictly positive score are returned.
 *
 * @param {string} queryText
 * @param {{ tid: number|string, slug: string, vector: Map<string, number> }[]} index
 * @param {number} [topN=10] - Maximum number of results to return.
 * @returns {{ tid: number|string, slug: string, score: number }[]}
 *   Matches sorted by descending score; empty when the index is empty or the
 *   query has no usable tokens.
 */
function query(queryText, index, topN = 10) {
  if (!index || index.length === 0) return [];

  const qTokens = tokenize(queryText);
  if (qTokens.length === 0) return [];

  // Raw term-count vector for the query.
  const qVec = new Map();
  for (const token of qTokens) {
    qVec.set(token, (qVec.get(token) || 0) + 1);
  }

  // The query norm does not depend on the document, so compute it once
  // instead of once per indexed document.
  let qNormSq = 0;
  for (const val of qVec.values()) qNormSq += val * val;
  const qNorm = Math.sqrt(qNormSq);

  const results = [];
  for (const doc of index) {
    let dot = 0;
    for (const [term, qVal] of qVec) {
      dot += qVal * (doc.vector.get(term) || 0);
    }
    // A non-positive (or NaN) dot product can never give a score > 0, so the
    // document norm is not worth computing for it.
    if (!(dot > 0)) continue;

    let docNormSq = 0;
    for (const val of doc.vector.values()) docNormSq += val * val;

    const norm = Math.sqrt(docNormSq) * qNorm;
    const score = norm > 0 ? dot / norm : 0;
    if (score > 0) {
      results.push({ tid: doc.tid, slug: doc.slug, score });
    }
  }

  return results.sort((a, b) => b.score - a.score).slice(0, topN);
}
124
-
125
- module.exports = { tokenize, buildIndex, query };
@@ -1,15 +0,0 @@
1
- const { cosineSimilarity } = require('../lib/cosineSimilarity');
2
-
3
/**
 * Smoke test for cosineSimilarity: identical unit vectors must score high and
 * orthogonal unit vectors must score low. Throws on the first failed check.
 */
function testCosine() {
  console.log("Testing cosine similarity...");

  const identicalScore = cosineSimilarity([1, 0], [1, 0]);
  const orthogonalScore = cosineSimilarity([1, 0], [0, 1]);

  const checks = [
    { failed: identicalScore < 0.9, message: "Expected high similarity" },
    { failed: orthogonalScore > 0.1, message: "Expected low similarity" },
  ];
  for (const { failed, message } of checks) {
    if (failed) throw new Error(message);
  }

  console.log("✅ Cosine OK");
}
14
-
15
- module.exports = testCosine;