nodebb-plugin-search-agent 0.0.939 → 0.0.942

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ 'use strict';
2
+
3
/**
 * Computes the cosine similarity between two numeric vectors.
 * When the vectors have different lengths, only the first
 * min(a.length, b.length) dimensions are compared.
 * A zero-magnitude vector yields a similarity of 0.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} similarity in [-1, 1]
 */
function cosineSimilarity(a, b) {
  const dims = Math.min(a.length, b.length);

  let dotProduct = 0;
  let normASquared = 0;
  let normBSquared = 0;

  for (let i = 0; i < dims; i++) {
    const x = a[i];
    const y = b[i];
    dotProduct += x * y;
    normASquared += x * x;
    normBSquared += y * y;
  }

  const denominator = Math.sqrt(normASquared) * Math.sqrt(normBSquared);
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}
27
+
28
/**
 * Orders items by how similar their embeddings are to a query embedding.
 * Every item must expose an `embedding` property (number[]).
 *
 * @param {number[]} queryEmbedding
 * @param {Array<{embedding: number[], [key: string]: any}>} items
 * @returns {Array<{item: object, score: number}>} sorted descending by score
 */
function rankBySimilarity(queryEmbedding, items) {
  const scored = items.map((candidate) => ({
    item: candidate,
    score: cosineSimilarity(queryEmbedding, candidate.embedding),
  }));
  scored.sort((left, right) => right.score - left.score);
  return scored;
}

module.exports = { cosineSimilarity, rankBySimilarity };
@@ -1,9 +1,3 @@
1
- // ─── Token estimation helper ───────────────────────────────────────────────
2
- function estimateTokens(str) {
3
- // Roughly 4 chars/token for English, 2 for Hebrew/UTF-8, but 4 is safe for cost estimation
4
- return Math.ceil(str.length / 4);
5
- }
6
-
7
1
  'use strict';
8
2
 
9
3
  const https = require('https');
@@ -181,18 +175,18 @@ function callOpenAI(apiKey, model, messages) {
181
175
  * @param {string} model
182
176
  * @returns {Promise<string>}
183
177
  */
184
- // async function expandQueryWithHyDE(queryText, apiKey, model) {
185
- // const response = await callOpenAI(apiKey, model, [
186
- // {
187
- // role: 'system',
188
- // content:
189
- // 'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
190
- // 'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
191
- // },
192
- // { role: 'user', content: queryText },
193
- // ]);
194
- // return (response.choices[0].message.content || '').trim() || queryText;
195
- // }
178
/**
 * HyDE query expansion: asks the LLM to write a short, realistic forum
 * reply that answers the search question, so the embedding of that
 * hypothetical answer matches real post content more closely than the
 * raw (often very short) query would.
 *
 * @param {string} queryText - raw user search query
 * @param {string} apiKey - OpenAI API key
 * @param {string} model - OpenAI model name
 * @returns {Promise<string>} the expanded query, or the original query
 *   when the model returns an empty or malformed response
 */
async function expandQueryWithHyDE(queryText, apiKey, model) {
  const response = await callOpenAI(apiKey, model, [
    {
      role: 'system',
      content:
        'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
        'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
    },
    { role: 'user', content: queryText },
  ]);
  // Guard against a missing/empty `choices` array: a malformed completion
  // should degrade to the raw query instead of throwing and failing the search.
  const content = response?.choices?.[0]?.message?.content;
  return (content || '').trim() || queryText;
}
196
190
 
197
191
  /**
198
192
  * Send candidates to OpenAI for independent per-topic relevance scoring.
@@ -204,38 +198,25 @@ function callOpenAI(apiKey, model, messages) {
204
198
  async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxResults, snippetByTid = {}) {
205
199
  console.log('Re-ranking with AI:', { queryText, candidates: candidates.map(c => ({ tid: c.tid, title: (topicMap[String(c.tid)] || {}).title })) });
206
200
 
207
-
208
- // Embed the query and all candidate post snippets
209
-
210
- const { embed, embedBatch } = require('../services/embeddingService');
211
- const queryEmbedding = await embed(queryText);
212
- const postSnippets = candidates.map((c) => {
213
- const raw = (snippetByTid[String(c.tid)] || '').replace(/<[^>]*>/g, ' ').replace(/[ \t]+/g, ' ').trim();
214
- return raw.slice(0, 1500);
215
- });
216
- const postEmbeddings = await embedBatch(postSnippets);
217
-
218
- // Format: [tid:..., embedding: [v1, v2, ...]]
219
- const candidateList = candidates.map((c, i) => {
220
- return `[tid:${c.tid}]\nembedding: [${postEmbeddings[i].slice(0, 8).map(x => x.toFixed(4)).join(', ')} ...]`;
221
- }).join('\n\n');
201
+ // Only send the embedded query and the matched post snippet for each candidate
202
+ const candidateList = candidates
203
+ .map((c) => {
204
+ const raw = (snippetByTid[String(c.tid)] || '').replace(/<[^>]*>/g, ' ').replace(/[ \t]+/g, ' ').trim();
205
+ // Only send the snippet, not the title
206
+ return `[tid:${c.tid}]\n${raw.slice(0, 1500)}`;
207
+ })
208
+ .join('\n\n');
222
209
 
223
210
  const systemPrompt =
224
211
  'אתה מסנן חיפוש פורום מחמיר. ' +
225
- 'לכל מועמד ברשימה, דרג את הרלוונטיות של embedding הפוסט לembedding של השאלה בסקלה 0-10: ' +
212
+ 'לכל מועמד ברשימה, דרג את הרלוונטיות של קטע הפוסט לשאלת המשתמש בסקלה 0-10: ' +
226
213
  '10 = עונה ישירות ובאופן מלא. 7-9 = עונה על חלק משמעותי. 0-6 = לא רלוונטי. ' +
227
214
  'החזר אך ורק JSON תקני במבנה: {"tid": ציון, ...} — לדוגמה: {"42": 9, "15": 3}. ' +
228
- 'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.';
229
-
230
- const userMessage =
231
- `embedding של שאלת המשתמש: [${queryEmbedding.slice(0, 8).map(x => x.toFixed(4)).join(', ')} ...]\n\nפוסטים:\n${candidateList}`;
215
+ 'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.'+
216
+ 'הוסף שדה נוסף "scoreExplanation" עם משפט קצר שמסביר למה קטע עם ציון נמוך לא רלוונטי.';
232
217
 
233
- // --- Token count logging ---
234
- const totalEmbeddingChars = queryText.length + postSnippets.reduce((sum, s) => sum + s.length, 0);
235
- const embeddingTokens = estimateTokens(queryText) + postSnippets.reduce((sum, s) => sum + estimateTokens(s), 0);
236
- const llmPromptTokens = estimateTokens(systemPrompt) + estimateTokens(userMessage);
237
- const winston = require.main.require('winston');
238
- winston.info(`[search-agent] Token usage: embedding API ≈ ${embeddingTokens} tokens, LLM prompt ≈ ${llmPromptTokens} tokens (for this search)`);
218
+ const userMessage =
219
+ `שאלת המשתמש (מוטמעת): "${queryText}"\n\nפוסטים:\n${candidateList}`;
239
220
 
240
221
  const response = await callOpenAI(apiKey, model, [
241
222
  { role: 'system', content: systemPrompt },
@@ -255,25 +236,12 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
255
236
  const scores = JSON.parse(match[0]);
256
237
  const candidateByTid = Object.fromEntries(candidates.map(c => [String(c.tid), c]));
257
238
 
258
- let filtered = Object.entries(scores)
259
- .filter(([, score]) => Number(score) >= 7)
260
- .sort(([, a], [, b]) => Number(b) - Number(a))
261
- .slice(0, maxResults)
262
- .map(([tid]) => candidateByTid[tid])
263
- .filter(Boolean);
264
-
265
- // If nothing passed the threshold, return the top scoring candidate (if any)
266
- if (filtered.length === 0 && candidates.length > 0) {
267
- // Find the tid with the highest score
268
- const sortedAll = Object.entries(scores)
269
- .sort(([, a], [, b]) => Number(b) - Number(a));
270
- if (sortedAll.length > 0) {
271
- const [topTid] = sortedAll[0];
272
- const topCandidate = candidateByTid[topTid];
273
- if (topCandidate) filtered = [topCandidate];
274
- }
275
- }
276
- return filtered;
239
+ return Object.entries(scores)
240
+ .filter(([, score]) => Number(score) >= 7)
241
+ .sort(([, a], [, b]) => Number(b) - Number(a))
242
+ .slice(0, maxResults)
243
+ .map(([tid]) => candidateByTid[tid])
244
+ .filter(Boolean);
277
245
  }
278
246
 
279
247
  // ─── Public API ───────────────────────────────────────────────────────────────
@@ -305,16 +273,16 @@ async function searchTopics(queryText) {
305
273
  // HyDE: replace the short raw query with a hypothetical answer so the
306
274
  // embedding matches post content more closely.
307
275
  let embeddingQuery = queryText;
308
- // if (useAI && settings.hydeEnabled) {
309
- // try {
310
- // embeddingQuery = await expandQueryWithHyDE(
311
- // queryText, settings.openaiApiKey, settings.openaiModel
312
- // );
313
- // winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
314
- // } catch (hydeErr) {
315
- // winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
316
- // }
317
- // }
276
+ if (useAI && settings.hydeEnabled) {
277
+ try {
278
+ embeddingQuery = await expandQueryWithHyDE(
279
+ queryText, settings.openaiApiKey, settings.openaiModel
280
+ );
281
+ winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
282
+ } catch (hydeErr) {
283
+ winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
284
+ }
285
+ }
318
286
 
319
287
  // Request more candidates when AI will re-rank them.
320
288
  const vectorLimit = useAI ? settings.aiCandidates : settings.maxResults;
package/library.js CHANGED
@@ -30,13 +30,13 @@ plugin.init = async (params) => {
30
30
 
31
31
  // Start initial embedding sync in the background — does not block NodeBB startup.
32
32
  winston.info('[search-agent] Starting initial embedding sync…');
33
- startSync();
33
+ startSync().catch(err => winston.warn(`[search-agent] Initial sync failed: ${err.message}`));
34
34
 
35
35
  // Re-sync every 10 minutes to pick up new posts.
36
36
  const RESYNC_INTERVAL_MS = 10 * 60 * 1000;
37
37
  setInterval(() => {
38
38
  winston.info('[search-agent] Running scheduled embedding re-sync…');
39
- startSync();
39
+ startSync().catch(err => winston.warn(`[search-agent] Scheduled re-sync failed: ${err.message}`));
40
40
  }, RESYNC_INTERVAL_MS).unref();
41
41
 
42
42
  winston.info('[plugins/search-agent] Initialised.');
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nodebb-plugin-search-agent",
3
- "version": "0.0.939",
3
+ "version": "0.0.942",
4
4
  "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
5
5
  "main": "library.js",
6
6
  "author": "Racheli Bayfus",
@@ -490,4 +490,4 @@ module.exports = {
490
490
  embedBatch,
491
491
  extractPureText,
492
492
  splitIntoSemanticChunks,
493
- };
493
+ };
@@ -147,26 +147,6 @@ const STOP_WORDS = new Set([
147
147
  ]);
148
148
 
149
149
  function normalizeHebrew(text) {
150
- if(!text) return text;
151
-
152
- // Remove common prefixes
153
- const prefixes = ['ה', 'ו', 'ב', 'ל', 'מ', 'ש', 'כ'];
154
- for (const prefix of prefixes) {
155
- if (text.startsWith(prefix) && text.length > 3) {
156
- text = text.slice(1);
157
- break;
158
- }
159
- }
160
-
161
- // Remove common plural suffixes
162
- const pluralSuffixes = ['ים', 'ות'];
163
- for (const suffix of pluralSuffixes) {
164
- if (text.endsWith(suffix) && text.length > 3) {
165
- text = text.slice(0, -suffix.length);
166
- break;
167
- }
168
- }
169
-
170
150
  return String(text || '')
171
151
  // remove niqqud / cantillation
172
152
  .replace(/[\u0591-\u05C7]/g, '')
@@ -373,8 +353,8 @@ async function search(query, limit = TOP_K) {
373
353
  ]);
374
354
 
375
355
  const results = await oramaSearch(db, {
376
- // mode: 'hybrid',
377
- query: expanded.term,
356
+ mode: 'hybrid',
357
+ term: expanded.term,
378
358
  properties: SEARCH_PROPERTIES,
379
359
  boost: FIELD_BOOSTS,
380
360
  vector: {
@@ -409,5 +389,5 @@ async function search(query, limit = TOP_K) {
409
389
 
410
390
// Public API of the search module.
module.exports = { search, invalidateIndex };
@@ -0,0 +1,15 @@
1
const { cosineSimilarity } = require('../lib/cosineSimilarity');

/**
 * Smoke-tests cosineSimilarity: identical vectors score ~1, orthogonal
 * vectors score ~0, and a zero-magnitude vector yields exactly 0.
 * Throws on the first failing check; logs progress to the console.
 */
function testCosine() {
  console.log("Testing cosine similarity...");

  const identical = cosineSimilarity([1, 0], [1, 0]);
  const orthogonal = cosineSimilarity([1, 0], [0, 1]);
  const degenerate = cosineSimilarity([0, 0], [3, 4]);

  // Tight tolerances: [1,0]·[1,0] is exactly 1 and [1,0]·[0,1] is exactly 0,
  // so anything beyond float noise indicates a real regression. The previous
  // thresholds (a < 0.9, b > 0.1) would have let badly wrong values pass.
  if (Math.abs(identical - 1) > 1e-12) throw new Error("Expected high similarity");
  if (Math.abs(orthogonal) > 1e-12) throw new Error("Expected low similarity");
  if (degenerate !== 0) throw new Error("Expected 0 for zero-magnitude vector");

  console.log("✅ Cosine OK");
}

module.exports = testCosine;