npm - nodebb-plugin-search-agent - Versions diffs - 0.0.933 → 0.0.935 - Mend

nodebb-plugin-search-agent 0.0.933 → 0.0.935

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/package.json +1 -1
package/services/embeddingService.js +40 -5

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "nodebb-plugin-search-agent",
-  "version": "0.0.933",
+  "version": "0.0.935",
   "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
   "main": "library.js",
   "author": "Racheli Bayfus",

package/services/embeddingService.js CHANGED Viewed

@@ -1,3 +1,22 @@
// The directive has to be the first statement of the file; placed after a
// function declaration it is just an unused string and strict mode is off.
'use strict';

/**
 * Reduce a post body to plain prose before it is embedded.
 *
 * Removes, in this order:
 *   1. Markdown images            ![alt](url)
 *   2. HTML <img> tags
 *   3. Markdown links to files    [desc](url.ext)   (description is dropped too)
 *   4. HTML anchors to files      <a href="url.ext">desc</a>
 *   5. Bare http(s) URLs to files
 *   6. Every remaining HTML tag (replaced by a space)
 * and finally collapses runs of spaces/tabs and blank lines.
 *
 * The whole-link rules (3, 4) run before the bare-URL rule (5); otherwise the
 * URL would be cut out of the link first and leave residue such as "[desc]()".
 *
 * @param {*} text - Raw post content; anything that is not a string yields ''.
 * @returns {string} Cleaned, trimmed text (possibly empty).
 */
function extractPureText(text) {
	if (typeof text !== 'string') return '';

	// File extensions treated as attachments rather than readable content.
	const FILE_EXT = 'jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?';
	// The extension must END the file name. Without the lookahead,
	// "https://a.binance.com" matches ".bin" and "https://claude.ai/chat"
	// matches ".ai", so ordinary links get cut in half.
	// NOTE(review): a bare host on a TLD that is also an extension
	// (e.g. "https://example.zip") is still treated as a file link.
	const EXT_END = String.raw`\.(?:${FILE_EXT})(?![\w/-])`;

	const markdownFileLink = new RegExp(String.raw`\[[^\]]*\]\([^)]*${EXT_END}(?:[?#][^)]*)?\)`, 'gi');
	// [\s\S]*? instead of .*? so an anchor whose text spans lines is matched.
	const anchorToFile = new RegExp(String.raw`<a\b[^>]*\bhref=["']?[^"'>]+${EXT_END}[^>]*>[\s\S]*?</a>`, 'gi');
	const bareFileUrl = new RegExp(String.raw`https?://\S+${EXT_END}(?:[?#]\S*)?`, 'gi');

	return text
		.replace(/!\[[^\]]*\]\([^)]*\)/g, '') // Markdown images
		.replace(/<img\b[^>]*>/gi, '') // HTML images
		.replace(markdownFileLink, '')
		.replace(anchorToFile, '')
		.replace(bareFileUrl, '')
		.replace(/<[^>]+>/g, ' ') // any other tag becomes a separator
		.replace(/[ \t]+/g, ' ')
		.replace(/\n{2,}/g, '\n')
		.trim();
}
 const https = require('https');
@@ -16,7 +35,7 @@ const RETRY_DELAY_MS = 500;
 // Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
 const MAX_CHARS = 12000;
-const CHUNK_OVERLAP = 2000; // chars to overlap between chunks for context
+const CHUNK_OVERLAP = 200; // chars to overlap between chunks for context
 // Split a long string into chunks of maxLen, with optional overlap
 function splitIntoChunks(text, maxLen = MAX_CHARS, overlap = CHUNK_OVERLAP) {
@@ -136,8 +155,13 @@ async function embed(text) {
 		throw new Error('OPENAI_API_KEY environment variable is not set');
 	}
+	// Remove non-text content
+	const pureText = extractPureText(text);
+	if (!pureText) {
+		throw new Error('embed() received no usable text after filtering');
+	}
 	// Split into chunks if too long
-	const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
+	const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
 	if (chunks.length === 1) {
 		const safe = truncate(text);
 		if (_embedCache.has(safe)) {
@@ -189,15 +213,24 @@ async function embedBatch(texts) {
 		throw new Error('OPENAI_API_KEY environment variable is not set');
 	}
-	// For each text, if too long, split and average embeddings
+	// For each text, filter to pure text, then split and average embeddings
 	const allChunks = [];
 	const chunkMap = [];
 	for (const text of texts) {
-		const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
+		const pureText = extractPureText(text);
+		if (!pureText) {
+			chunkMap.push({ count: 0 });
+			continue;
+		}
+		const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
 		chunkMap.push({ count: chunks.length });
 		allChunks.push(...chunks);
 	}
 	winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
+	if (allChunks.length === 0) {
+		// All texts were filtered out
+		return chunkMap.map(({ count }) => count === 0 ? [] : null);
+	}
 	const safeChunks = allChunks.map(truncate);
 	const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
 	winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`);
@@ -206,7 +239,9 @@ async function embedBatch(texts) {
 	const result = [];
 	let idx = 0;
 	for (const { count } of chunkMap) {
-		if (count === 1) {
+		if (count === 0) {
+			result.push([]); // No usable text
+		} else if (count === 1) {
 			result.push(vectors[idx]);
 			idx += 1;
 		} else {