nodebb-plugin-search-agent 0.0.93 → 0.0.932

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,9 @@
1
+ // ─── Token estimation helper ───────────────────────────────────────────────
2
+ function estimateTokens(str) {
3
+ // Roughly 4 chars/token for English, 2 for Hebrew/UTF-8, but 4 is safe for cost estimation
4
+ return Math.ceil(str.length / 4);
5
+ }
6
+
1
7
  'use strict';
2
8
 
3
9
  const https = require('https');
@@ -175,18 +181,18 @@ function callOpenAI(apiKey, model, messages) {
175
181
  * @param {string} model
176
182
  * @returns {Promise<string>}
177
183
  */
178
- async function expandQueryWithHyDE(queryText, apiKey, model) {
179
- const response = await callOpenAI(apiKey, model, [
180
- {
181
- role: 'system',
182
- content:
183
- 'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
184
- 'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
185
- },
186
- { role: 'user', content: queryText },
187
- ]);
188
- return (response.choices[0].message.content || '').trim() || queryText;
189
- }
184
+ // async function expandQueryWithHyDE(queryText, apiKey, model) {
185
+ // const response = await callOpenAI(apiKey, model, [
186
+ // {
187
+ // role: 'system',
188
+ // content:
189
+ // 'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
190
+ // 'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
191
+ // },
192
+ // { role: 'user', content: queryText },
193
+ // ]);
194
+ // return (response.choices[0].message.content || '').trim() || queryText;
195
+ // }
190
196
 
191
197
  /**
192
198
  * Send candidates to OpenAI for independent per-topic relevance scoring.
@@ -200,6 +206,7 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
200
206
 
201
207
 
202
208
  // Embed the query and all candidate post snippets
209
+
203
210
  const { embed, embedBatch } = require('../services/embeddingService');
204
211
  const queryEmbedding = await embed(queryText);
205
212
  const postSnippets = candidates.map((c) => {
@@ -218,12 +225,18 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
218
225
  'לכל מועמד ברשימה, דרג את הרלוונטיות של embedding הפוסט לembedding של השאלה בסקלה 0-10: ' +
219
226
  '10 = עונה ישירות ובאופן מלא. 7-9 = עונה על חלק משמעותי. 0-6 = לא רלוונטי. ' +
220
227
  'החזר אך ורק JSON תקני במבנה: {"tid": ציון, ...} — לדוגמה: {"42": 9, "15": 3}. ' +
221
- 'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.'+
222
- 'הוסף שדה נוסף "scoreExplanation" עם משפט קצר שמסביר לפי מה נעשה הדירוג.';
223
-
228
+ 'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.';
229
+
224
230
  const userMessage =
225
231
  `embedding של שאלת המשתמש: [${queryEmbedding.slice(0, 8).map(x => x.toFixed(4)).join(', ')} ...]\n\nפוסטים:\n${candidateList}`;
226
232
 
233
+ // --- Token count logging ---
234
+ const totalEmbeddingChars = queryText.length + postSnippets.reduce((sum, s) => sum + s.length, 0);
235
+ const embeddingTokens = estimateTokens(queryText) + postSnippets.reduce((sum, s) => sum + estimateTokens(s), 0);
236
+ const llmPromptTokens = estimateTokens(systemPrompt) + estimateTokens(userMessage);
237
+ const winston = require.main.require('winston');
238
+ winston.info(`[search-agent] Token usage: embedding API ≈ ${embeddingTokens} tokens, LLM prompt ≈ ${llmPromptTokens} tokens (for this search)`);
239
+
227
240
  const response = await callOpenAI(apiKey, model, [
228
241
  { role: 'system', content: systemPrompt },
229
242
  { role: 'user', content: userMessage },
@@ -279,16 +292,16 @@ async function searchTopics(queryText) {
279
292
  // HyDE: replace the short raw query with a hypothetical answer so the
280
293
  // embedding matches post content more closely.
281
294
  let embeddingQuery = queryText;
282
- if (useAI && settings.hydeEnabled) {
283
- try {
284
- embeddingQuery = await expandQueryWithHyDE(
285
- queryText, settings.openaiApiKey, settings.openaiModel
286
- );
287
- winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
288
- } catch (hydeErr) {
289
- winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
290
- }
291
- }
295
+ // if (useAI && settings.hydeEnabled) {
296
+ // try {
297
+ // embeddingQuery = await expandQueryWithHyDE(
298
+ // queryText, settings.openaiApiKey, settings.openaiModel
299
+ // );
300
+ // winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
301
+ // } catch (hydeErr) {
302
+ // winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
303
+ // }
304
+ // }
292
305
 
293
306
  // Request more candidates when AI will re-rank them.
294
307
  const vectorLimit = useAI ? settings.aiCandidates : settings.maxResults;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nodebb-plugin-search-agent",
3
- "version": "0.0.93",
3
+ "version": "0.0.932",
4
4
  "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
5
5
  "main": "library.js",
6
6
  "author": "Racheli Bayfus",
@@ -14,7 +14,34 @@ const RETRY_DELAY_MS = 500;
14
14
  // text-embedding-3-small supports 8 192 tokens.
15
15
  // Hebrew/non-ASCII text tokenizes at ~1.5–2 chars/token (UTF-8 multibyte).
16
16
  // Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
17
+
17
18
  const MAX_CHARS = 12000;
19
+ const CHUNK_OVERLAP = 2000; // chars to overlap between chunks for context
20
+
21
+ // Split a long string into chunks of maxLen, with optional overlap
22
+ function splitIntoChunks(text, maxLen = MAX_CHARS, overlap = CHUNK_OVERLAP) {
23
+ if (text.length <= maxLen) return [text];
24
+ const chunks = [];
25
+ let i = 0;
26
+ while (i < text.length) {
27
+ const chunk = text.slice(i, i + maxLen);
28
+ chunks.push(chunk);
29
+ if (i + maxLen >= text.length) break;
30
+ i += maxLen - overlap;
31
+ }
32
+ return chunks;
33
+ }
34
+
35
+ // Average a list of vectors (arrays of numbers)
36
+ function averageVectors(vectors) {
37
+ if (!vectors.length) return [];
38
+ const len = vectors[0].length;
39
+ const sum = new Array(len).fill(0);
40
+ for (const v of vectors) {
41
+ for (let i = 0; i < len; i++) sum[i] += v[i];
42
+ }
43
+ return sum.map(x => x / vectors.length);
44
+ }
18
45
 
19
46
  // ─── Embedding cache ──────────────────────────────────────────────────────────
20
47
  // Avoids calling the embeddings API for the same text within a session.
@@ -109,24 +136,36 @@ async function embed(text) {
109
136
  throw new Error('OPENAI_API_KEY environment variable is not set');
110
137
  }
111
138
 
112
- const safe = truncate(text);
113
-
114
- if (_embedCache.has(safe)) {
115
- winston().verbose('[search-agent] embeddingService: embedding cache hit');
116
- return _embedCache.get(safe);
117
- }
118
-
119
- winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
120
- const response = await withRetry(() => requestEmbeddings(apiKey, safe));
121
- winston().verbose('[search-agent] embeddingService: embedding generated successfully');
122
- const embedding = response.data[0].embedding;
123
-
124
- if (_embedCache.size >= EMBED_CACHE_MAX) {
125
- _embedCache.delete(_embedCache.keys().next().value);
139
+ // Split into chunks if too long
140
+ const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
141
+ if (chunks.length === 1) {
142
+ const safe = truncate(text);
143
+ if (_embedCache.has(safe)) {
144
+ winston().verbose('[search-agent] embeddingService: embedding cache hit');
145
+ return _embedCache.get(safe);
146
+ }
147
+ winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
148
+ const response = await withRetry(() => requestEmbeddings(apiKey, safe));
149
+ winston().verbose('[search-agent] embeddingService: embedding generated successfully');
150
+ const embedding = response.data[0].embedding;
151
+ if (_embedCache.size >= EMBED_CACHE_MAX) {
152
+ _embedCache.delete(_embedCache.keys().next().value);
153
+ }
154
+ _embedCache.set(safe, embedding);
155
+ return embedding;
156
+ } else {
157
+ // For multi-chunk, embed all and average
158
+ winston().verbose(`[search-agent] embeddingService: splitting long text into ${chunks.length} chunks for embedding`);
159
+ const vectors = await embedBatch(chunks);
160
+ const avg = averageVectors(vectors);
161
+ // Optionally cache the average for the full text
162
+ const safe = truncate(text);
163
+ if (_embedCache.size >= EMBED_CACHE_MAX) {
164
+ _embedCache.delete(_embedCache.keys().next().value);
165
+ }
166
+ _embedCache.set(safe, avg);
167
+ return avg;
126
168
  }
127
- _embedCache.set(safe, embedding);
128
-
129
- return embedding;
130
169
  }
131
170
 
132
171
  /**
@@ -150,15 +189,32 @@ async function embedBatch(texts) {
150
189
  throw new Error('OPENAI_API_KEY environment variable is not set');
151
190
  }
152
191
 
153
- const safeTexts = texts.map(truncate);
154
- winston().verbose(`[search-agent] embeddingService: generating batch embeddings for ${safeTexts.length} text(s)`);
155
- const response = await withRetry(() => requestEmbeddings(apiKey, safeTexts));
156
- winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${texts.length} vector(s))`);
157
-
158
- // OpenAI returns items sorted by index field, but sort explicitly to be safe
159
- return response.data
160
- .sort((a, b) => a.index - b.index)
161
- .map(item => item.embedding);
192
+ // For each text, if too long, split and average embeddings
193
+ const allChunks = [];
194
+ const chunkMap = [];
195
+ for (const text of texts) {
196
+ const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
197
+ chunkMap.push({ count: chunks.length });
198
+ allChunks.push(...chunks);
199
+ }
200
+ winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
201
+ const safeChunks = allChunks.map(truncate);
202
+ const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
203
+ winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`);
204
+ const vectors = response.data.sort((a, b) => a.index - b.index).map(item => item.embedding);
205
+ // Recombine chunk embeddings for each original text
206
+ const result = [];
207
+ let idx = 0;
208
+ for (const { count } of chunkMap) {
209
+ if (count === 1) {
210
+ result.push(vectors[idx]);
211
+ idx += 1;
212
+ } else {
213
+ result.push(averageVectors(vectors.slice(idx, idx + count)));
214
+ idx += count;
215
+ }
216
+ }
217
+ return result;
162
218
  }
163
219
 
164
220
  module.exports = { embed, embedBatch };
@@ -1,42 +0,0 @@
1
- 'use strict';
2
-
3
- /**
4
- * Computes the cosine similarity between two numeric vectors.
5
- * Handles mismatched lengths by using the shorter vector's dimension.
6
- * Returns 0 if either vector has zero magnitude.
7
- *
8
- * @param {number[]} a
9
- * @param {number[]} b
10
- * @returns {number} similarity in [-1, 1]
11
- */
12
- function cosineSimilarity(a, b) {
13
- const len = Math.min(a.length, b.length);
14
- let dot = 0;
15
- let magA = 0;
16
- let magB = 0;
17
-
18
- for (let i = 0; i < len; i++) {
19
- dot += a[i] * b[i];
20
- magA += a[i] * a[i];
21
- magB += b[i] * b[i];
22
- }
23
-
24
- const denom = Math.sqrt(magA) * Math.sqrt(magB);
25
- return denom === 0 ? 0 : dot / denom;
26
- }
27
-
28
- /**
29
- * Ranks items by cosine similarity to a query embedding.
30
- * Each item must have an `embedding` property (number[]).
31
- *
32
- * @param {number[]} queryEmbedding
33
- * @param {Array<{embedding: number[], [key: string]: any}>} items
34
- * @returns {Array<{item: object, score: number}>} sorted descending by score
35
- */
36
- function rankBySimilarity(queryEmbedding, items) {
37
- return items
38
- .map(item => ({ item, score: cosineSimilarity(queryEmbedding, item.embedding) }))
39
- .sort((a, b) => b.score - a.score);
40
- }
41
-
42
- module.exports = { cosineSimilarity, rankBySimilarity };
package/lib/similarity.js DELETED
@@ -1,125 +0,0 @@
1
- 'use strict';
2
-
3
- // Common stop-words (English + Hebrew) to exclude from TF-IDF vectors
4
- const STOP_WORDS = new Set([
5
- // English
6
- 'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
7
- 'of', 'with', 'by', 'from', 'is', 'it', 'its', 'be', 'as', 'was',
8
- 'are', 'were', 'been', 'have', 'has', 'had', 'do', 'does', 'did',
9
- 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'shall',
10
- 'not', 'no', 'so', 'if', 'this', 'that', 'these', 'those', 'i', 'we',
11
- 'you', 'he', 'she', 'they', 'my', 'your', 'his', 'her', 'our', 'their',
12
- 'how', 'what', 'when', 'where', 'why', 'who', 'which', 'am', 'up',
13
- 'out', 'about', 'into', 'than', 'more', 'also', 'me', 'him', 'us', 'them',
14
- // Hebrew
15
- 'של', 'את', 'אל', 'על', 'עם', 'הם', 'הן', 'זה', 'זו', 'זאת',
16
- 'כי', 'לא', 'כן', 'יש', 'אם', 'רק', 'גם', 'אבל', 'אנחנו', 'אני',
17
- 'אתה', 'את', 'הוא', 'היא', 'אנו', 'אתם', 'אתן', 'הם', 'הן',
18
- 'זה', 'זו', 'אלה', 'אלו', 'כל', 'כך', 'כבר', 'עוד', 'רק', 'כן',
19
- 'אחד', 'יותר', 'פה', 'שם', 'מה', 'מי', 'איך', 'מתי', 'איפה',
20
- 'היה', 'הייתה', 'יהיה', 'תהיה', 'הוא', 'היא', 'הם', 'הן',
21
- 'אסור', 'מותר', 'צריך', 'רוצה', 'יכול', 'יכולה', 'לו', 'לה',
22
- 'בו', 'בה', 'עליו', 'עליה', 'בין', 'כבר', 'עכשיו', 'היום',
23
- 'כן', 'לכן', 'כדי', 'כדאי', 'שלי', 'שלך', 'שלו', 'שלה',
24
- 'שלנו', 'שלכם', 'שלהם', 'שלהן', 'להם', 'להן', 'לנו', 'לכם',
25
- ]);
26
-
27
- /**
28
- * Strip HTML, lowercase, remove punctuation, drop stop-words and short tokens.
29
- * @param {string|null|undefined} text
30
- * @returns {string[]}
31
- */
32
- function tokenize(text) {
33
- if (!text) return [];
34
- return text
35
- .replace(/<[^>]*>/g, ' ') // strip HTML tags
36
- .toLowerCase()
37
- .replace(/[^\p{L}\p{N}\s]/gu, ' ') // keep all Unicode letters & digits (Hebrew, Latin, etc.)
38
- .split(/\s+/)
39
- .filter(t => t.length >= 2 && !STOP_WORDS.has(t)); // min 2 chars to keep short Hebrew words
40
- }
41
-
42
- /**
43
- * Build a TF-IDF index from an array of topic objects.
44
- * Each topic must have: tid, slug, title, mainPostContent (optional).
45
- * @param {{ tid: number|string, slug: string, title: string, mainPostContent?: string }[]} topics
46
- * @returns {{ tid: number|string, slug: string, vector: Map<string, number> }[]}
47
- */
48
- function buildIndex(topics) {
49
- if (!topics || topics.length === 0) return [];
50
-
51
- // Step 1: term frequencies per document
52
- const docs = topics.map((t) => {
53
- const tokens = tokenize(`${t.title || ''} ${t.mainPostContent || ''}`);
54
- const tf = new Map();
55
- for (const token of tokens) {
56
- tf.set(token, (tf.get(token) || 0) + 1);
57
- }
58
- return { tid: t.tid, slug: t.slug, tf, len: tokens.length };
59
- });
60
-
61
- // Step 2: document frequency (how many docs contain each term)
62
- const df = new Map();
63
- for (const doc of docs) {
64
- for (const term of doc.tf.keys()) {
65
- df.set(term, (df.get(term) || 0) + 1);
66
- }
67
- }
68
-
69
- const N = docs.length;
70
-
71
- // Step 3: compute TF-IDF vector per document
72
- return docs.map((doc) => {
73
- const vector = new Map();
74
- for (const [term, freq] of doc.tf) {
75
- const tf = doc.len > 0 ? freq / doc.len : 0;
76
- // Smoothed IDF to avoid division by zero
77
- const idf = Math.log((N + 1) / (df.get(term) + 1)) + 1;
78
- vector.set(term, tf * idf);
79
- }
80
- return { tid: doc.tid, slug: doc.slug, vector };
81
- });
82
- }
83
-
84
- /**
85
- * Rank indexed documents against a query using cosine similarity.
86
- * @param {string} queryText
87
- * @param {{ tid: number|string, slug: string, vector: Map<string, number> }[]} index
88
- * @param {number} [topN=10]
89
- * @returns {{ tid: number|string, slug: string, score: number }[]}
90
- */
91
- function query(queryText, index, topN = 10) {
92
- if (!index || index.length === 0) return [];
93
-
94
- const qTokens = tokenize(queryText);
95
- if (qTokens.length === 0) return [];
96
-
97
- // Build raw term-count vector for the query
98
- const qVec = new Map();
99
- for (const token of qTokens) {
100
- qVec.set(token, (qVec.get(token) || 0) + 1);
101
- }
102
-
103
- const results = [];
104
- for (const doc of index) {
105
- let dot = 0;
106
- let docNormSq = 0;
107
- let qNormSq = 0;
108
-
109
- for (const [term, qVal] of qVec) {
110
- dot += qVal * (doc.vector.get(term) || 0);
111
- }
112
- for (const val of doc.vector.values()) docNormSq += val * val;
113
- for (const val of qVec.values()) qNormSq += val * val;
114
-
115
- const norm = Math.sqrt(docNormSq) * Math.sqrt(qNormSq);
116
- const score = norm > 0 ? dot / norm : 0;
117
- if (score > 0) {
118
- results.push({ tid: doc.tid, slug: doc.slug, score });
119
- }
120
- }
121
-
122
- return results.sort((a, b) => b.score - a.score).slice(0, topN);
123
- }
124
-
125
- module.exports = { tokenize, buildIndex, query };
@@ -1,15 +0,0 @@
1
- const { cosineSimilarity } = require('../lib/cosineSimilarity');
2
-
3
- function testCosine() {
4
- console.log("Testing cosine similarity...");
5
-
6
- const a = cosineSimilarity([1, 0], [1, 0]);
7
- const b = cosineSimilarity([1, 0], [0, 1]);
8
-
9
- if (a < 0.9) throw new Error("Expected high similarity");
10
- if (b > 0.1) throw new Error("Expected low similarity");
11
-
12
- console.log("✅ Cosine OK");
13
- }
14
-
15
- module.exports = testCosine;