nodebb-plugin-search-agent 0.0.933 → 0.0.935
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/services/embeddingService.js +40 -5
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nodebb-plugin-search-agent",
|
|
3
|
-
"version": "0.0.933",
|
|
3
|
+
"version": "0.0.935",
|
|
4
4
|
"description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
|
|
5
5
|
"main": "library.js",
|
|
6
6
|
"author": "Racheli Bayfus",
|
|
@@ -1,3 +1,22 @@
|
|
|
1
|
+
// Remove images, files, and non-text content from input

// Alternation of file extensions whose links are treated as attachments rather than text.
// Kept in one place so the Markdown, HTML and bare-URL patterns cannot drift apart.
const FILE_EXT_PATTERN = 'jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?';

// Markdown images: ![alt](url)
const MARKDOWN_IMAGE_RE = /!\[[^\]]*\]\([^)]*\)/g;

// HTML <img> tags
const HTML_IMG_RE = /<img\b[^>]*>/gi;

// Markdown file links: [desc](url.ext), optionally followed by ?query or #fragment
const MARKDOWN_FILE_LINK_RE = new RegExp(
  `\\[[^\\]]*\\]\\([^)]*\\.(?:${FILE_EXT_PATTERN})(?:[?#][^)]*)?\\)`,
  'gi'
);

// <a href="...ext">...</a>. The lookahead requires the extension to END the path,
// so "www.bing.com" is not mistaken for a ".bin" file.
const HTML_FILE_ANCHOR_RE = new RegExp(
  `<a\\b[^>]*href=["']?[^"'>]+\\.(?:${FILE_EXT_PATTERN})(?=["'\\s>?#])[^>]*>[\\s\\S]*?<\\/a>`,
  'gi'
);

// Bare http(s) URLs pointing at a file. The extension must be followed by a query/fragment,
// or by optional closing punctuation and then whitespace/end of input.
const BARE_FILE_URL_RE = new RegExp(
  `https?:\\/\\/\\S+\\.(?:${FILE_EXT_PATTERN})(?:[?#]\\S*|(?=[.,;:!)\\]>"']*(?:\\s|$)))`,
  'gi'
);

/**
 * Strip images, file attachments and markup from a post so only prose remains.
 *
 * Structured links (Markdown links, <a> tags) are removed BEFORE bare URLs.
 * Otherwise the bare-URL pass would eat the URL out of the middle of a link
 * and leave residue such as "[desc]()" behind.
 *
 * @param {*} text - Raw input; anything that is not a string yields ''.
 * @returns {string} The cleaned text with whitespace collapsed and trimmed.
 */
function extractPureText(text) {
  if (typeof text !== 'string') return '';

  const cleaned = text
    .replace(MARKDOWN_IMAGE_RE, '')
    .replace(HTML_IMG_RE, '')
    .replace(MARKDOWN_FILE_LINK_RE, '')
    .replace(HTML_FILE_ANCHOR_RE, '')
    .replace(BARE_FILE_URL_RE, '')
    // Replace any remaining HTML tags with a space so adjacent words do not fuse
    .replace(/<[^>]+>/g, ' ');

  // Collapse runs of spaces/tabs and blank lines
  return cleaned.replace(/[ \t]+/g, ' ').replace(/\n{2,}/g, '\n').trim();
}
|
|
1
20
|
'use strict';
|
|
2
21
|
|
|
3
22
|
const https = require('https');
|
|
@@ -16,7 +35,7 @@ const RETRY_DELAY_MS = 500;
|
|
|
16
35
|
// Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
|
|
17
36
|
|
|
18
37
|
const MAX_CHARS = 12000;
|
|
19
|
-
const CHUNK_OVERLAP =
|
|
38
|
+
const CHUNK_OVERLAP = 200; // chars to overlap between chunks for context
|
|
20
39
|
|
|
21
40
|
// Split a long string into chunks of maxLen, with optional overlap
|
|
22
41
|
function splitIntoChunks(text, maxLen = MAX_CHARS, overlap = CHUNK_OVERLAP) {
|
|
@@ -136,8 +155,13 @@ async function embed(text) {
|
|
|
136
155
|
throw new Error('OPENAI_API_KEY environment variable is not set');
|
|
137
156
|
}
|
|
138
157
|
|
|
158
|
+
// Remove non-text content
|
|
159
|
+
const pureText = extractPureText(text);
|
|
160
|
+
if (!pureText) {
|
|
161
|
+
throw new Error('embed() received no usable text after filtering');
|
|
162
|
+
}
|
|
139
163
|
// Split into chunks if too long
|
|
140
|
-
const chunks = splitIntoChunks(
|
|
164
|
+
const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
|
|
141
165
|
if (chunks.length === 1) {
|
|
142
166
|
const safe = truncate(text);
|
|
143
167
|
if (_embedCache.has(safe)) {
|
|
@@ -189,15 +213,24 @@ async function embedBatch(texts) {
|
|
|
189
213
|
throw new Error('OPENAI_API_KEY environment variable is not set');
|
|
190
214
|
}
|
|
191
215
|
|
|
192
|
-
// For each text,
|
|
216
|
+
// For each text, filter to pure text, then split and average embeddings
|
|
193
217
|
const allChunks = [];
|
|
194
218
|
const chunkMap = [];
|
|
195
219
|
for (const text of texts) {
|
|
196
|
-
const
|
|
220
|
+
const pureText = extractPureText(text);
|
|
221
|
+
if (!pureText) {
|
|
222
|
+
chunkMap.push({ count: 0 });
|
|
223
|
+
continue;
|
|
224
|
+
}
|
|
225
|
+
const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
|
|
197
226
|
chunkMap.push({ count: chunks.length });
|
|
198
227
|
allChunks.push(...chunks);
|
|
199
228
|
}
|
|
200
229
|
winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
|
|
230
|
+
if (allChunks.length === 0) {
|
|
231
|
+
// All texts were filtered out
|
|
232
|
+
return chunkMap.map(({ count }) => count === 0 ? [] : null);
|
|
233
|
+
}
|
|
201
234
|
const safeChunks = allChunks.map(truncate);
|
|
202
235
|
const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
|
|
203
236
|
winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`);
|
|
@@ -206,7 +239,9 @@ async function embedBatch(texts) {
|
|
|
206
239
|
const result = [];
|
|
207
240
|
let idx = 0;
|
|
208
241
|
for (const { count } of chunkMap) {
|
|
209
|
-
if (count ===
|
|
242
|
+
if (count === 0) {
|
|
243
|
+
result.push([]); // No usable text
|
|
244
|
+
} else if (count === 1) {
|
|
210
245
|
result.push(vectors[idx]);
|
|
211
246
|
idx += 1;
|
|
212
247
|
} else {
|