npm - nodebb-plugin-search-agent - Versions diffs - 0.0.93 → 0.0.932 - Mend

nodebb-plugin-search-agent 0.0.93 → 0.0.932

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/lib/searchHandler.js +38 -25
package/package.json +1 -1
package/services/embeddingService.js +82 -26
package/lib/cosineSimilarity.js +0 -42
package/lib/similarity.js +0 -125
package/test/testCosine.js +0 -15

package/lib/searchHandler.js CHANGED Viewed

@@ -1,3 +1,9 @@
+// ─── Token estimation helper ───────────────────────────────────────────────
+function estimateTokens(str) {
+	// Roughly 4 chars/token for English, 2 for Hebrew/UTF-8, but 4 is safe for cost estimation
+	return Math.ceil(str.length / 4);
+}
 'use strict';
 const https = require('https');
@@ -175,18 +181,18 @@ function callOpenAI(apiKey, model, messages) {
  * @param {string} model
  * @returns {Promise<string>}
  */
-async function expandQueryWithHyDE(queryText, apiKey, model) {
-	const response = await callOpenAI(apiKey, model, [
-		{
-			role: 'system',
-			content:
-				'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
-				'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
-		},
-		{ role: 'user', content: queryText },
-	]);
-	return (response.choices[0].message.content || '').trim() || queryText;
-}
+// async function expandQueryWithHyDE(queryText, apiKey, model) {
+// 	const response = await callOpenAI(apiKey, model, [
+// 		{
+// 			role: 'system',
+// 			content:
+// 				'אתה חבר בפורום. בהינתן שאלת חיפוש, כתוב פוסט תגובה קצר וריאליסטי בפורום שעונה ישירות על השאלה. ' +
+// 				'כתוב רק את תוכן הפוסט — ללא ברכות, הערות מטא, או שורת נושא.',
+// 		},
+// 		{ role: 'user', content: queryText },
+// 	]);
+// 	return (response.choices[0].message.content || '').trim() || queryText;
+// }
 /**
  * Send candidates to OpenAI for independent per-topic relevance scoring.
@@ -200,6 +206,7 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
 	// Embed the query and all candidate post snippets
 	const { embed, embedBatch } = require('../services/embeddingService');
 	const queryEmbedding = await embed(queryText);
 	const postSnippets = candidates.map((c) => {
@@ -218,12 +225,18 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
 		'לכל מועמד ברשימה, דרג את הרלוונטיות של embedding הפוסט לembedding של השאלה בסקלה 0-10: ' +
 		'10 = עונה ישירות ובאופן מלא. 7-9 = עונה על חלק משמעותי. 0-6 = לא רלוונטי. ' +
 		'החזר אך ורק JSON תקני במבנה: {"tid": ציון, ...} — לדוגמה: {"42": 9, "15": 3}. ' +
-		'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.'+
-		'הוסף שדה נוסף "scoreExplanation" עם משפט קצר שמסביר לפי מה נעשה הדירוג.';
+		'אין להוסיף הסברים, טקסט נוסף, או עיצוב מחוץ ל-JSON.';
 	const userMessage =
 		`embedding של שאלת המשתמש: [${queryEmbedding.slice(0, 8).map(x => x.toFixed(4)).join(', ')} ...]\n\nפוסטים:\n${candidateList}`;
+	// --- Token count logging ---
+	const totalEmbeddingChars = queryText.length + postSnippets.reduce((sum, s) => sum + s.length, 0);
+	const embeddingTokens = estimateTokens(queryText) + postSnippets.reduce((sum, s) => sum + estimateTokens(s), 0);
+	const llmPromptTokens = estimateTokens(systemPrompt) + estimateTokens(userMessage);
+	const winston = require.main.require('winston');
+	winston.info(`[search-agent] Token usage: embedding API ≈ ${embeddingTokens} tokens, LLM prompt ≈ ${llmPromptTokens} tokens (for this search)`);
 	const response = await callOpenAI(apiKey, model, [
 		{ role: 'system', content: systemPrompt },
 		{ role: 'user', content: userMessage },
@@ -279,16 +292,16 @@ async function searchTopics(queryText) {
 		// HyDE: replace the short raw query with a hypothetical answer so the
 		// embedding matches post content more closely.
 		let embeddingQuery = queryText;
-		if (useAI && settings.hydeEnabled) {
-			try {
-				embeddingQuery = await expandQueryWithHyDE(
-					queryText, settings.openaiApiKey, settings.openaiModel
-				);
-				winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
-			} catch (hydeErr) {
-				winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
-			}
-		}
+		// if (useAI && settings.hydeEnabled) {
+		// 	try {
+		// 		embeddingQuery = await expandQueryWithHyDE(
+		// 			queryText, settings.openaiApiKey, settings.openaiModel
+		// 		);
+		// 		winston.verbose(`[search-agent] HyDE expanded query (${embeddingQuery.length} chars)`);
+		// 	} catch (hydeErr) {
+		// 		winston.warn(`[search-agent] HyDE expansion failed, using raw query: ${hydeErr.message}`);
+		// 	}
+		// }
 		// Request more candidates when AI will re-rank them.
 		const vectorLimit = useAI ? settings.aiCandidates : settings.maxResults;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "nodebb-plugin-search-agent",
-  "version": "0.0.93",
+  "version": "0.0.932",
   "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
   "main": "library.js",
   "author": "Racheli Bayfus",

package/services/embeddingService.js CHANGED Viewed

@@ -14,7 +14,34 @@ const RETRY_DELAY_MS = 500;
 // text-embedding-3-small supports 8 192 tokens.
 // Hebrew/non-ASCII text tokenizes at ~1.5–2 chars/token (UTF-8 multibyte).
 // Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
 const MAX_CHARS = 12000;
+const CHUNK_OVERLAP = 2000; // chars to overlap between chunks for context
+// Split a long string into chunks of maxLen, with optional overlap
+function splitIntoChunks(text, maxLen = MAX_CHARS, overlap = CHUNK_OVERLAP) {
+	if (text.length <= maxLen) return [text];
+	const chunks = [];
+	let i = 0;
+	while (i < text.length) {
+		const chunk = text.slice(i, i + maxLen);
+		chunks.push(chunk);
+		if (i + maxLen >= text.length) break;
+		i += maxLen - overlap;
+	}
+	return chunks;
+}
+// Average a list of vectors (arrays of numbers)
+function averageVectors(vectors) {
+	if (!vectors.length) return [];
+	const len = vectors[0].length;
+	const sum = new Array(len).fill(0);
+	for (const v of vectors) {
+		for (let i = 0; i < len; i++) sum[i] += v[i];
+	}
+	return sum.map(x => x / vectors.length);
+}
 // ─── Embedding cache ──────────────────────────────────────────────────────────
 // Avoids calling the embeddings API for the same text within a session.
@@ -109,24 +136,36 @@ async function embed(text) {
 		throw new Error('OPENAI_API_KEY environment variable is not set');
 	}
-	const safe = truncate(text);
-	if (_embedCache.has(safe)) {
-		winston().verbose('[search-agent] embeddingService: embedding cache hit');
-		return _embedCache.get(safe);
-	}
-	winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
-	const response = await withRetry(() => requestEmbeddings(apiKey, safe));
-	winston().verbose('[search-agent] embeddingService: embedding generated successfully');
-	const embedding = response.data[0].embedding;
-	if (_embedCache.size >= EMBED_CACHE_MAX) {
-		_embedCache.delete(_embedCache.keys().next().value);
+	// Split into chunks if too long
+	const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
+	if (chunks.length === 1) {
+		const safe = truncate(text);
+		if (_embedCache.has(safe)) {
+			winston().verbose('[search-agent] embeddingService: embedding cache hit');
+			return _embedCache.get(safe);
+		}
+		winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
+		const response = await withRetry(() => requestEmbeddings(apiKey, safe));
+		winston().verbose('[search-agent] embeddingService: embedding generated successfully');
+		const embedding = response.data[0].embedding;
+		if (_embedCache.size >= EMBED_CACHE_MAX) {
+			_embedCache.delete(_embedCache.keys().next().value);
+		}
+		_embedCache.set(safe, embedding);
+		return embedding;
+	} else {
+		// For multi-chunk, embed all and average
+		winston().verbose(`[search-agent] embeddingService: splitting long text into ${chunks.length} chunks for embedding`);
+		const vectors = await embedBatch(chunks);
+		const avg = averageVectors(vectors);
+		// Optionally cache the average for the full text
+		const safe = truncate(text);
+		if (_embedCache.size >= EMBED_CACHE_MAX) {
+			_embedCache.delete(_embedCache.keys().next().value);
+		}
+		_embedCache.set(safe, avg);
+		return avg;
 	}
-	_embedCache.set(safe, embedding);
-	return embedding;
 }
 /**
@@ -150,15 +189,32 @@ async function embedBatch(texts) {
 		throw new Error('OPENAI_API_KEY environment variable is not set');
 	}
-	const safeTexts = texts.map(truncate);
-	winston().verbose(`[search-agent] embeddingService: generating batch embeddings for ${safeTexts.length} text(s)`);
-	const response = await withRetry(() => requestEmbeddings(apiKey, safeTexts));
-	winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${texts.length} vector(s))`);
-	// OpenAI returns items sorted by index field, but sort explicitly to be safe
-	return response.data
-		.sort((a, b) => a.index - b.index)
-		.map(item => item.embedding);
+	// For each text, if too long, split and average embeddings
+	const allChunks = [];
+	const chunkMap = [];
+	for (const text of texts) {
+		const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
+		chunkMap.push({ count: chunks.length });
+		allChunks.push(...chunks);
+	}
+	winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
+	const safeChunks = allChunks.map(truncate);
+	const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
+	winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`);
+	const vectors = response.data.sort((a, b) => a.index - b.index).map(item => item.embedding);
+	// Recombine chunk embeddings for each original text
+	const result = [];
+	let idx = 0;
+	for (const { count } of chunkMap) {
+		if (count === 1) {
+			result.push(vectors[idx]);
+			idx += 1;
+		} else {
+			result.push(averageVectors(vectors.slice(idx, idx + count)));
+			idx += count;
+		}
+	}
+	return result;
 }
 module.exports = { embed, embedBatch };

package/lib/cosineSimilarity.js DELETED Viewed

@@ -1,42 +0,0 @@
-'use strict';
-/**
- * Computes the cosine similarity between two numeric vectors.
- * Handles mismatched lengths by using the shorter vector's dimension.
- * Returns 0 if either vector has zero magnitude.
- *
- * @param {number[]} a
- * @param {number[]} b
- * @returns {number} similarity in [-1, 1]
- */
-function cosineSimilarity(a, b) {
-	const len = Math.min(a.length, b.length);
-	let dot = 0;
-	let magA = 0;
-	let magB = 0;
-	for (let i = 0; i < len; i++) {
-		dot += a[i] * b[i];
-		magA += a[i] * a[i];
-		magB += b[i] * b[i];
-	}
-	const denom = Math.sqrt(magA) * Math.sqrt(magB);
-	return denom === 0 ? 0 : dot / denom;
-}
-/**
- * Ranks items by cosine similarity to a query embedding.
- * Each item must have an `embedding` property (number[]).
- *
- * @param {number[]} queryEmbedding
- * @param {Array<{embedding: number[], [key: string]: any}>} items
- * @returns {Array<{item: object, score: number}>} sorted descending by score
- */
-function rankBySimilarity(queryEmbedding, items) {
-	return items
-		.map(item => ({ item, score: cosineSimilarity(queryEmbedding, item.embedding) }))
-		.sort((a, b) => b.score - a.score);
-}
-module.exports = { cosineSimilarity, rankBySimilarity };

package/lib/similarity.js DELETED Viewed

@@ -1,125 +0,0 @@
-'use strict';
-// Common stop-words (English + Hebrew) to exclude from TF-IDF vectors
-const STOP_WORDS = new Set([
-	// English
-	'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
-	'of', 'with', 'by', 'from', 'is', 'it', 'its', 'be', 'as', 'was',
-	'are', 'were', 'been', 'have', 'has', 'had', 'do', 'does', 'did',
-	'will', 'would', 'could', 'should', 'may', 'might', 'can', 'shall',
-	'not', 'no', 'so', 'if', 'this', 'that', 'these', 'those', 'i', 'we',
-	'you', 'he', 'she', 'they', 'my', 'your', 'his', 'her', 'our', 'their',
-	'how', 'what', 'when', 'where', 'why', 'who', 'which', 'am', 'up',
-	'out', 'about', 'into', 'than', 'more', 'also', 'me', 'him', 'us', 'them',
-	// Hebrew
-	'של', 'את', 'אל', 'על', 'עם', 'הם', 'הן', 'זה', 'זו', 'זאת',
-	'כי', 'לא', 'כן', 'יש', 'אם', 'רק', 'גם', 'אבל', 'אנחנו', 'אני',
-	'אתה', 'את', 'הוא', 'היא', 'אנו', 'אתם', 'אתן', 'הם', 'הן',
-	'זה', 'זו', 'אלה', 'אלו', 'כל', 'כך', 'כבר', 'עוד', 'רק', 'כן',
-	'אחד', 'יותר', 'פה', 'שם', 'מה', 'מי', 'איך', 'מתי', 'איפה',
-	'היה', 'הייתה', 'יהיה', 'תהיה', 'הוא', 'היא', 'הם', 'הן',
-	'אסור', 'מותר', 'צריך', 'רוצה', 'יכול', 'יכולה', 'לו', 'לה',
-	'בו', 'בה', 'עליו', 'עליה', 'בין', 'כבר', 'עכשיו', 'היום',
-	'כן', 'לכן', 'כדי', 'כדאי', 'שלי', 'שלך', 'שלו', 'שלה',
-	'שלנו', 'שלכם', 'שלהם', 'שלהן', 'להם', 'להן', 'לנו', 'לכם',
-]);
-/**
- * Strip HTML, lowercase, remove punctuation, drop stop-words and short tokens.
- * @param {string|null|undefined} text
- * @returns {string[]}
- */
-function tokenize(text) {
-	if (!text) return [];
-	return text
-		.replace(/<[^>]*>/g, ' ')           // strip HTML tags
-		.toLowerCase()
-		.replace(/[^\p{L}\p{N}\s]/gu, ' ') // keep all Unicode letters & digits (Hebrew, Latin, etc.)
-		.split(/\s+/)
-		.filter(t => t.length >= 2 && !STOP_WORDS.has(t)); // min 2 chars to keep short Hebrew words
-}
-/**
- * Build a TF-IDF index from an array of topic objects.
- * Each topic must have: tid, slug, title, mainPostContent (optional).
- * @param {{ tid: number|string, slug: string, title: string, mainPostContent?: string }[]} topics
- * @returns {{ tid: number|string, slug: string, vector: Map<string, number> }[]}
- */
-function buildIndex(topics) {
-	if (!topics || topics.length === 0) return [];
-	// Step 1: term frequencies per document
-	const docs = topics.map((t) => {
-		const tokens = tokenize(`${t.title || ''} ${t.mainPostContent || ''}`);
-		const tf = new Map();
-		for (const token of tokens) {
-			tf.set(token, (tf.get(token) || 0) + 1);
-		}
-		return { tid: t.tid, slug: t.slug, tf, len: tokens.length };
-	});
-	// Step 2: document frequency (how many docs contain each term)
-	const df = new Map();
-	for (const doc of docs) {
-		for (const term of doc.tf.keys()) {
-			df.set(term, (df.get(term) || 0) + 1);
-		}
-	}
-	const N = docs.length;
-	// Step 3: compute TF-IDF vector per document
-	return docs.map((doc) => {
-		const vector = new Map();
-		for (const [term, freq] of doc.tf) {
-			const tf = doc.len > 0 ? freq / doc.len : 0;
-			// Smoothed IDF to avoid division by zero
-			const idf = Math.log((N + 1) / (df.get(term) + 1)) + 1;
-			vector.set(term, tf * idf);
-		}
-		return { tid: doc.tid, slug: doc.slug, vector };
-	});
-}
-/**
- * Rank indexed documents against a query using cosine similarity.
- * @param {string} queryText
- * @param {{ tid: number|string, slug: string, vector: Map<string, number> }[]} index
- * @param {number} [topN=10]
- * @returns {{ tid: number|string, slug: string, score: number }[]}
- */
-function query(queryText, index, topN = 10) {
-	if (!index || index.length === 0) return [];
-	const qTokens = tokenize(queryText);
-	if (qTokens.length === 0) return [];
-	// Build raw term-count vector for the query
-	const qVec = new Map();
-	for (const token of qTokens) {
-		qVec.set(token, (qVec.get(token) || 0) + 1);
-	}
-	const results = [];
-	for (const doc of index) {
-		let dot = 0;
-		let docNormSq = 0;
-		let qNormSq = 0;
-		for (const [term, qVal] of qVec) {
-			dot += qVal * (doc.vector.get(term) || 0);
-		}
-		for (const val of doc.vector.values()) docNormSq += val * val;
-		for (const val of qVec.values()) qNormSq += val * val;
-		const norm = Math.sqrt(docNormSq) * Math.sqrt(qNormSq);
-		const score = norm > 0 ? dot / norm : 0;
-		if (score > 0) {
-			results.push({ tid: doc.tid, slug: doc.slug, score });
-		}
-	}
-	return results.sort((a, b) => b.score - a.score).slice(0, topN);
-}
-module.exports = { tokenize, buildIndex, query };

package/test/testCosine.js DELETED Viewed

@@ -1,15 +0,0 @@
-const { cosineSimilarity } = require('../lib/cosineSimilarity');
-function testCosine() {
-  console.log("Testing cosine similarity...");
-  const a = cosineSimilarity([1, 0], [1, 0]);
-  const b = cosineSimilarity([1, 0], [0, 1]);
-  if (a < 0.9) throw new Error("Expected high similarity");
-  if (b > 0.1) throw new Error("Expected low similarity");
-  console.log("✅ Cosine OK");
-}
-module.exports = testCosine;