nodebb-plugin-search-agent 0.0.92 → 0.0.94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/searchHandler.js +118 -41
- package/library.js +2 -2
- package/package.json +1 -1
- package/services/embeddingService.js +438 -90
- package/services/vectorSearchService.js +379 -93
- package/templates/admin/plugins/search-agent.tpl +12 -0
- package/lib/cosineSimilarity.js +0 -42
- package/test/testCosine.js +0 -15
--- package/services/embeddingService.js (0.0.92)
+++ package/services/embeddingService.js (0.0.94)
@@ -3,7 +3,7 @@
 const https = require('https');

 function winston() {
-
+  return require.main.require('winston');
 }

 const OPENAI_EMBEDDINGS_HOSTNAME = 'api.openai.com';
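The only change in this hunk fills in the lazy `winston()` getter. Resolving the logger through `require.main.require` at call time, rather than a top-level `require`, reuses NodeBB's own winston instance instead of bundling a second one. A minimal sketch of the pattern (illustrative, not plugin code):

```js
// Lazy host-app dependency resolution: nothing is resolved at module load.
function winston() {
  // require.main is the host app's entry module, so this returns NodeBB's winston.
  return require.main.require('winston');
}

// Resolved only on first use:
winston().info('[search-agent] plugin loaded');
```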
@@ -11,15 +11,236 @@ const OPENAI_EMBEDDINGS_PATH = '/v1/embeddings';
 const EMBEDDING_MODEL = 'text-embedding-3-small';
 const MAX_RETRIES = 3;
 const RETRY_DELAY_MS = 500;
-
-//
-//
-const MAX_CHARS =
+
+// text-embedding-3-small supports 8,192 tokens.
+// Conservative char limits help avoid token overflows, especially for non-ASCII text.
+const MAX_CHARS = 10000;
+const CHUNK_OVERLAP = 300;
+const MIN_CHUNK_CHARS = 500;
+const TARGET_CHUNK_CHARS = 2200;
+
+// ─── Text cleanup ─────────────────────────────────────────────────────────────
+
+function extractPureText(text) {
+  if (typeof text !== 'string') return '';
+
+  // Remove Markdown images: ![alt](url)
+  let cleaned = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
+
+  // Remove HTML <img> tags
+  cleaned = cleaned.replace(/<img\b[^>]*>/gi, '');
+
+  // Remove links to files/images (common extensions)
+  cleaned = cleaned.replace(
+    /https?:\/\/(\S+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?))(\?\S*)?/gi,
+    ''
+  );
+
+  // Remove Markdown file links: [desc](url.ext)
+  cleaned = cleaned.replace(
+    /\[[^\]]*\]\([^)]*\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)\)/gi,
+    ''
+  );
+
+  // Remove any remaining <a href=...> tags to files
+  cleaned = cleaned.replace(
+    /<a\b[^>]*href=["']?[^"'>]+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)[^>]*>.*?<\/a>/gi,
+    ''
+  );
+
+  // Remove any remaining HTML tags
+  cleaned = cleaned.replace(/<[^>]+>/g, ' ');
+
+  // Remove extra whitespace
+  cleaned = cleaned
+    .replace(/[ \t]+/g, ' ')
+    .replace(/\n{2,}/g, '\n')
+    .trim();
+
+  return cleaned;
+}
+
+function normalizeWhitespace(text) {
+  return text
+    .replace(/\r\n/g, '\n')
+    .replace(/[ \t]+/g, ' ')
+    .replace(/\n{3,}/g, '\n\n')
+    .trim();
+}
+
+function splitIntoBlocks(text) {
+  const normalized = normalizeWhitespace(text);
+
+  const rawBlocks = normalized
+    .split(/\n{2,}|(?=^#{1,6}\s)|(?=^\s*[-*+]\s)|(?=^\s*\d+\.\s)|(?=^\s*>\s)|(?=^```)/gm)
+    .map(block => block.trim())
+    .filter(Boolean);
+
+  return rawBlocks;
+}
+
+function splitLargeBlock(block, maxLen = MAX_CHARS) {
+  if (block.length <= maxLen) return [block];
+
+  const sentences = block.match(/[^.!?\n]+[.!?\n]+|[^.!?\n]+$/g) || [block];
+  const chunks = [];
+  let current = '';
+
+  for (const sentence of sentences) {
+    const s = sentence.trim();
+    if (!s) continue;
+
+    if ((current + ' ' + s).trim().length <= maxLen) {
+      current = current ? `${current} ${s}` : s;
+      continue;
+    }
+
+    if (current) {
+      chunks.push(current);
+      current = '';
+    }
+
+    // Fallback for very long sentence
+    if (s.length > maxLen) {
+      let i = 0;
+      while (i < s.length) {
+        chunks.push(s.slice(i, i + maxLen).trim());
+        i += maxLen;
+      }
+    } else {
+      current = s;
+    }
+  }
+
+  if (current) chunks.push(current);
+
+  return chunks;
+}
+
+function buildOverlapPrefix(prevChunk, overlapChars = CHUNK_OVERLAP) {
+  if (!prevChunk) return '';
+  return prevChunk.slice(Math.max(0, prevChunk.length - overlapChars)).trim();
+}
+
+function splitIntoSemanticChunks(
+  text,
+  {
+    maxLen = MAX_CHARS,
+    targetLen = TARGET_CHUNK_CHARS,
+    minLen = MIN_CHUNK_CHARS,
+    overlap = CHUNK_OVERLAP,
+  } = {}
+) {
+  if (!text) return [];
+  if (text.length <= maxLen) return [text];
+
+  const blocks = splitIntoBlocks(text).flatMap(block => splitLargeBlock(block, maxLen));
+
+  const chunks = [];
+  let current = '';
+
+  for (const block of blocks) {
+    const next = current ? `${current}\n\n${block}` : block;
+
+    if (next.length <= targetLen || current.length < minLen) {
+      if (next.length <= maxLen) {
+        current = next;
+        continue;
+      }
+    }
+
+    if (current) {
+      chunks.push(current.trim());
+    }
+
+    current = block;
+  }
+
+  if (current) {
+    chunks.push(current.trim());
+  }
+
+  const withOverlap = chunks.map((chunk, i) => {
+    if (i === 0) return chunk;
+
+    const prefix = buildOverlapPrefix(chunks[i - 1], overlap);
+    const merged = prefix ? `${prefix}\n\n${chunk}` : chunk;
+
+    return merged.length <= maxLen ? merged : merged.slice(merged.length - maxLen);
+  });
+
+  const finalChunks = [];
+  for (const chunk of withOverlap) {
+    if (
+      finalChunks.length > 0 &&
+      chunk.length < minLen &&
+      finalChunks[finalChunks.length - 1].length + 2 + chunk.length <= maxLen
+    ) {
+      finalChunks[finalChunks.length - 1] += `\n\n${chunk}`;
+    } else {
+      finalChunks.push(chunk);
+    }
+  }
+
+  return finalChunks;
+}
+
+// ─── Vector helpers ───────────────────────────────────────────────────────────
+
+function averageVectors(vectors) {
+  if (!vectors.length) return [];
+  const len = vectors[0].length;
+  const sum = new Array(len).fill(0);
+
+  for (const v of vectors) {
+    for (let i = 0; i < len; i++) {
+      sum[i] += v[i];
+    }
+  }
+
+  return sum.map(x => x / vectors.length);
+}

 function truncate(text) {
-
+  return text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) : text;
 }

+function estimateTokens(str) {
+  const ascii = /^[\x00-\x7F]*$/.test(str);
+  return ascii ? Math.ceil(str.length / 4) : Math.ceil(str.length / 1.5);
+}
+
+// ─── Embedding cache ──────────────────────────────────────────────────────────
+
+const _embedCache = new Map();
+const EMBED_CACHE_MAX = 500;
+
+function getCachedEmbedding(key) {
+  if (!_embedCache.has(key)) return null;
+
+  const value = _embedCache.get(key);
+
+  // Refresh LRU-ish order
+  _embedCache.delete(key);
+  _embedCache.set(key, value);
+
+  return value;
+}
+
+function setCachedEmbedding(key, value) {
+  if (_embedCache.has(key)) {
+    _embedCache.delete(key);
+  }
+
+  while (_embedCache.size >= EMBED_CACHE_MAX) {
+    _embedCache.delete(_embedCache.keys().next().value);
+  }
+
+  _embedCache.set(key, value);
+}
+
+// ─── OpenAI request helpers ───────────────────────────────────────────────────
+
 /**
  * Performs an HTTPS POST request to the OpenAI embeddings endpoint.
  * @param {string} apiKey
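Taken together, the helpers added above form a fixed pipeline: `extractPureText` strips images, file links, and leftover HTML; `splitIntoSemanticChunks` splits on Markdown block boundaries, packs blocks toward `TARGET_CHUNK_CHARS` (never past `MAX_CHARS`), prefixes each later chunk with roughly `CHUNK_OVERLAP` characters of its predecessor, and folds sub-`MIN_CHUNK_CHARS` leftovers into the previous chunk. Both functions are exported at the end of the diff, so the behaviour can be exercised directly. A rough usage sketch, assuming the package's own module path:

```js
const {
  extractPureText,
  splitIntoSemanticChunks,
} = require('nodebb-plugin-search-agent/services/embeddingService');

const post = [
  '# Release notes',
  '',
  '![screenshot](https://example.org/shot.png)',
  '',
  'A long body of prose. '.repeat(700), // ~15,000 chars, over MAX_CHARS
].join('\n');

const pure = extractPureText(post);           // image stripped, whitespace collapsed
const chunks = splitIntoSemanticChunks(pure); // defaults: 10000/2200/500/300

// Every chunk fits within MAX_CHARS, and each chunk after the first repeats
// roughly the last 300 characters of its predecessor, so sentences that
// straddle a boundary remain searchable from both sides.
```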
@@ -27,119 +248,246 @@ function truncate(text) {
  * @returns {Promise<object>} Parsed JSON response body
  */
 function requestEmbeddings(apiKey, input) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  return new Promise((resolve, reject) => {
+    const body = JSON.stringify({ model: EMBEDDING_MODEL, input });
+
+    const options = {
+      hostname: OPENAI_EMBEDDINGS_HOSTNAME,
+      path: OPENAI_EMBEDDINGS_PATH,
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'Authorization': `Bearer ${apiKey}`,
+        'Content-Length': Buffer.byteLength(body),
+      },
+    };
+
+    const req = https.request(options, res => {
+      const chunks = [];
+
+      res.on('data', chunk => chunks.push(chunk));
+
+      res.on('end', () => {
+        let parsed;
+        try {
+          parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
+        } catch (e) {
+          return reject(new Error(`Failed to parse OpenAI response: ${e.message}`));
+        }
+
+        if (res.statusCode >= 400) {
+          const message = (parsed.error && parsed.error.message) || `HTTP ${res.statusCode}`;
+          return reject(new Error(`OpenAI API error: ${message}`));
+        }
+
+        resolve(parsed);
+      });
+    });
+
+    req.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
+    req.write(body);
+    req.end();
+  });
 }

 /**
  * Retries an async operation up to maxRetries times with exponential back-off.
- * @param {Function} fn
+ * @param {Function} fn
  * @param {number} retries
  * @returns {Promise<*>}
  */
 async function withRetry(fn, retries = MAX_RETRIES) {
-
-
-
-
-
-
-
-
-
-
-
-
-
+  let lastError;
+
+  for (let attempt = 1; attempt <= retries; attempt++) {
+    try {
+      return await fn();
+    } catch (err) {
+      lastError = err;
+
+      if (attempt < retries) {
+        winston().warn(
+          `[search-agent] embeddingService: attempt ${attempt} failed (${err.message}), retrying in ${RETRY_DELAY_MS * attempt} ms...`
+        );
+        await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS * attempt));
+      }
+    }
+  }
+
+  throw lastError;
 }

+// ─── Public API ───────────────────────────────────────────────────────────────
+
 /**
  * Converts a single text string into an embedding vector.
  * @param {string} text
  * @returns {Promise<number[]>}
  */
 async function embed(text) {
-
-
-
+  if (typeof text !== 'string' || text.trim() === '') {
+    throw new Error('embed() requires a non-empty string');
+  }
+
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    throw new Error('OPENAI_API_KEY environment variable is not set');
+  }
+
+  const pureText = extractPureText(text);
+  if (!pureText) {
+    throw new Error('embed() received no usable text after filtering');
+  }
+
+  const chunks = splitIntoSemanticChunks(pureText, {
+    maxLen: MAX_CHARS,
+    targetLen: TARGET_CHUNK_CHARS,
+    minLen: MIN_CHUNK_CHARS,
+    overlap: CHUNK_OVERLAP,
+  });
+
+  if (chunks.length === 1) {
+    const safe = truncate(pureText);
+    const cached = getCachedEmbedding(safe);
+    if (cached) {
+      winston().verbose('[search-agent] embeddingService: embedding cache hit');
+      return cached;
+    }
+
+    const tokenCount = estimateTokens(safe);
+    winston().info(
+      `[search-agent] embeddingService: generating embedding for text (${safe.length} chars, ~${tokenCount} tokens)`
+    );
+
+    const response = await withRetry(() => requestEmbeddings(apiKey, safe));
+    const embedding = response.data[0].embedding;
+
+    winston().verbose('[search-agent] embeddingService: embedding generated successfully');
+    setCachedEmbedding(safe, embedding);
+
+    return embedding;
+  }
+
+  winston().info(
+    `[search-agent] embeddingService: splitting long text into ${chunks.length} semantic chunks for embedding`
+  );
+
+  chunks.forEach((chunk, i) => {
+    const tokenCount = estimateTokens(chunk);
+    winston().info(
+      `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${tokenCount} tokens`
+    );
+  });
+
+  const vectors = await embedBatch(chunks);
+  const avg = averageVectors(vectors);
+  const safe = truncate(pureText);

-
-  if (!apiKey) {
-    throw new Error('OPENAI_API_KEY environment variable is not set');
-  }
+  setCachedEmbedding(safe, avg);

-
-  winston().verbose(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars)`);
-  const response = await withRetry(() => requestEmbeddings(apiKey, safe));
-  winston().verbose('[search-agent] embeddingService: embedding generated successfully');
-  return response.data[0].embedding;
+  return avg;
 }

 /**
  * Converts an array of text strings into an array of embedding vectors.
- * Texts are sent in
+ * Texts are sent in batched API requests after semantic chunking.
  * @param {string[]} texts
  * @returns {Promise<number[][]>}
  */
 async function embedBatch(texts) {
-
-
-
+  if (!Array.isArray(texts) || texts.length === 0) {
+    throw new Error('embedBatch() requires a non-empty array of strings');
+  }
+
+  const invalid = texts.findIndex(t => typeof t !== 'string' || t.trim() === '');
+  if (invalid !== -1) {
+    throw new Error(`embedBatch() received an empty or non-string value at index ${invalid}`);
+  }
+
+  const apiKey = process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    throw new Error('OPENAI_API_KEY environment variable is not set');
+  }
+
+  const allChunks = [];
+  const chunkMap = [];
+
+  for (const [textIdx, text] of texts.entries()) {
+    const pureText = extractPureText(text);
+
+    if (!pureText) {
+      chunkMap.push({ count: 0 });
+      continue;
+    }
+
+    const chunks = splitIntoSemanticChunks(pureText, {
+      maxLen: MAX_CHARS,
+      targetLen: TARGET_CHUNK_CHARS,
+      minLen: MIN_CHUNK_CHARS,
+      overlap: CHUNK_OVERLAP,
+    });
+
+    chunkMap.push({ count: chunks.length });
+    allChunks.push(...chunks);
+
+    if (chunks.length === 1) {
+      const tokenCount = estimateTokens(chunks[0]);
+      winston().info(
+        `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - 1 chunk, ${chunks[0].length} chars, ~${tokenCount} tokens`
+      );
+    } else {
+      winston().info(
+        `[search-agent] embeddingService: batch input ${textIdx + 1}/${texts.length} - ${chunks.length} chunks`
+      );
+      chunks.forEach((chunk, i) => {
+        const tokenCount = estimateTokens(chunk);
+        winston().info(
+          `[search-agent] embeddingService: chunk ${i + 1}/${chunks.length} - ${chunk.length} chars, ~${tokenCount} tokens`
+        );
+      });
+    }
+  }
+
+  winston().verbose(
+    `[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`
+  );
+
+  if (allChunks.length === 0) {
+    return chunkMap.map(({ count }) => (count === 0 ? [] : null));
+  }
+
+  const safeChunks = allChunks.map(chunk => truncate(chunk));
+  const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
+
+  winston().verbose(
+    `[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`
+  );

-
-
-
-}
+  const vectors = response.data
+    .sort((a, b) => a.index - b.index)
+    .map(item => item.embedding);

-
-
-    throw new Error('OPENAI_API_KEY environment variable is not set');
-  }
+  const result = [];
+  let idx = 0;

-
-
-
-
+  for (const { count } of chunkMap) {
+    if (count === 0) {
+      result.push([]);
+    } else if (count === 1) {
+      result.push(vectors[idx]);
+      idx += 1;
+    } else {
+      result.push(averageVectors(vectors.slice(idx, idx + count)));
+      idx += count;
+    }
+  }

-
-  return response.data
-    .sort((a, b) => a.index - b.index)
-    .map(item => item.embedding);
+  return result;
 }

-module.exports = {
+module.exports = {
+  embed,
+  embedBatch,
+  extractPureText,
+  splitIntoSemanticChunks,
+};