nodebb-plugin-search-agent 0.0.934 → 0.0.936
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/library.js +2 -2
- package/package.json +1 -1
- package/services/embeddingService.js +101 -40
- package/services/vectorSearchService.js +1 -1
package/library.js
CHANGED
|
@@ -30,13 +30,13 @@ plugin.init = async (params) => {
|
|
|
30
30
|
|
|
31
31
|
// Start initial embedding sync in the background — does not block NodeBB startup.
|
|
32
32
|
winston.info('[search-agent] Starting initial embedding sync…');
|
|
33
|
-
|
|
33
|
+
startSync();
|
|
34
34
|
|
|
35
35
|
// Re-sync every 10 minutes to pick up new posts.
|
|
36
36
|
const RESYNC_INTERVAL_MS = 10 * 60 * 1000;
|
|
37
37
|
setInterval(() => {
|
|
38
38
|
winston.info('[search-agent] Running scheduled embedding re-sync…');
|
|
39
|
-
|
|
39
|
+
startSync();
|
|
40
40
|
}, RESYNC_INTERVAL_MS).unref();
|
|
41
41
|
|
|
42
42
|
winston.info('[plugins/search-agent] Initialised.');
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nodebb-plugin-search-agent",
|
|
3
|
-
"version": "0.0.934",
|
|
3
|
+
"version": "0.0.936",
|
|
4
4
|
"description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
|
|
5
5
|
"main": "library.js",
|
|
6
6
|
"author": "Racheli Bayfus",
|
|
package/services/embeddingService.js
CHANGED
|
@@ -1,3 +1,22 @@
|
|
|
1
|
+
// Remove images, files, and non-text content from input
|
|
2
|
+
function extractPureText(text) {
|
|
3
|
+
if (typeof text !== 'string') return '';
|
|
4
|
+
// Remove Markdown images: 
|
|
5
|
+
let cleaned = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
|
|
6
|
+
// Remove HTML <img> tags
|
|
7
|
+
cleaned = cleaned.replace(/<img\b[^>]*>/gi, '');
|
|
8
|
+
// Remove links to files/images (common extensions)
|
|
9
|
+
cleaned = cleaned.replace(/https?:\/\/(\S+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?))(\?\S*)?/gi, '');
|
|
10
|
+
// Remove Markdown file links: [desc](url.ext)
|
|
11
|
+
cleaned = cleaned.replace(/\[[^\]]*\]\([^)]*\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)\)/gi, '');
|
|
12
|
+
// Remove any remaining <a href=...> tags to files
|
|
13
|
+
cleaned = cleaned.replace(/<a\b[^>]*href=["']?[^"'>]+\.(jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)[^>]*>.*?<\/a>/gi, '');
|
|
14
|
+
// Remove any remaining HTML tags
|
|
15
|
+
cleaned = cleaned.replace(/<[^>]+>/g, ' ');
|
|
16
|
+
// Remove extra whitespace
|
|
17
|
+
cleaned = cleaned.replace(/[ \t]+/g, ' ').replace(/\n{2,}/g, '\n').trim();
|
|
18
|
+
return cleaned;
|
|
19
|
+
}
|
|
1
20
|
'use strict';
|
|
2
21
|
|
|
3
22
|
const https = require('https');
|
|
@@ -16,7 +35,7 @@ const RETRY_DELAY_MS = 500;
|
|
|
16
35
|
// Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
|
|
17
36
|
|
|
18
37
|
const MAX_CHARS = 12000;
|
|
19
|
-
const CHUNK_OVERLAP =
|
|
38
|
+
const CHUNK_OVERLAP = 200; // chars to overlap between chunks for context
|
|
20
39
|
|
|
21
40
|
// Split a long string into chunks of maxLen, with optional overlap
|
|
22
41
|
function splitIntoChunks(text, maxLen = MAX_CHARS, overlap = CHUNK_OVERLAP) {
|
|
@@ -136,36 +155,52 @@ async function embed(text) {
|
|
|
136
155
|
throw new Error('OPENAI_API_KEY environment variable is not set');
|
|
137
156
|
}
|
|
138
157
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
158
|
+
// Remove non-text content
|
|
159
|
+
const pureText = extractPureText(text);
|
|
160
|
+
if (!pureText) {
|
|
161
|
+
throw new Error('embed() received no usable text after filtering');
|
|
162
|
+
}
|
|
163
|
+
// Split into chunks if too long
|
|
164
|
+
const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
|
|
165
|
+
// Estimate tokens (roughly 1.5 chars/token for non-ASCII, 4 chars/token for ASCII)
|
|
166
|
+
const estimateTokens = (str) => {
|
|
167
|
+
// If mostly ASCII, use 4 chars/token, else 1.5
|
|
168
|
+
const ascii = /^[\x00-\x7F]*$/.test(str);
|
|
169
|
+
return ascii ? Math.ceil(str.length / 4) : Math.ceil(str.length / 1.5);
|
|
170
|
+
};
|
|
171
|
+
if (chunks.length === 1) {
|
|
172
|
+
const safe = truncate(text);
|
|
173
|
+
if (_embedCache.has(safe)) {
|
|
174
|
+
winston().verbose('[search-agent] embeddingService: embedding cache hit');
|
|
175
|
+
return _embedCache.get(safe);
|
|
176
|
+
}
|
|
177
|
+
const tokenCount = estimateTokens(safe);
|
|
178
|
+
winston().info(`[search-agent] embeddingService: generating embedding for text (${safe.length} chars, ~${tokenCount} tokens)`);
|
|
179
|
+
const response = await withRetry(() => requestEmbeddings(apiKey, safe));
|
|
180
|
+
winston().verbose('[search-agent] embeddingService: embedding generated successfully');
|
|
181
|
+
const embedding = response.data[0].embedding;
|
|
182
|
+
if (_embedCache.size >= EMBED_CACHE_MAX) {
|
|
183
|
+
_embedCache.delete(_embedCache.keys().next().value);
|
|
184
|
+
}
|
|
185
|
+
_embedCache.set(safe, embedding);
|
|
186
|
+
return embedding;
|
|
187
|
+
} else {
|
|
188
|
+
// For multi-chunk, embed all and average
|
|
189
|
+
winston().info(`[search-agent] embeddingService: splitting long text into ${chunks.length} chunks for embedding`);
|
|
190
|
+
chunks.forEach((chunk, i) => {
|
|
191
|
+
const tokenCount = estimateTokens(chunk);
|
|
192
|
+
winston().info(`[search-agent] embeddingService: chunk ${i+1}/${chunks.length} — ${chunk.length} chars, ~${tokenCount} tokens`);
|
|
193
|
+
});
|
|
194
|
+
const vectors = await embedBatch(chunks);
|
|
195
|
+
const avg = averageVectors(vectors);
|
|
196
|
+
// Optionally cache the average for the full text
|
|
197
|
+
const safe = truncate(text);
|
|
198
|
+
if (_embedCache.size >= EMBED_CACHE_MAX) {
|
|
199
|
+
_embedCache.delete(_embedCache.keys().next().value);
|
|
200
|
+
}
|
|
201
|
+
_embedCache.set(safe, avg);
|
|
202
|
+
return avg;
|
|
203
|
+
}
|
|
169
204
|
}
|
|
170
205
|
|
|
171
206
|
/**
|
|
@@ -189,15 +224,39 @@ async function embedBatch(texts) {
|
|
|
189
224
|
throw new Error('OPENAI_API_KEY environment variable is not set');
|
|
190
225
|
}
|
|
191
226
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
227
|
+
// For each text, filter to pure text, then split and average embeddings
|
|
228
|
+
const allChunks = [];
|
|
229
|
+
const chunkMap = [];
|
|
230
|
+
// Estimate tokens (roughly 1.5 chars/token for non-ASCII, 4 chars/token for ASCII)
|
|
231
|
+
const estimateTokens = (str) => {
|
|
232
|
+
const ascii = /^[\x00-\x7F]*$/.test(str);
|
|
233
|
+
return ascii ? Math.ceil(str.length / 4) : Math.ceil(str.length / 1.5);
|
|
234
|
+
};
|
|
235
|
+
for (const [textIdx, text] of texts.entries()) {
|
|
236
|
+
const pureText = extractPureText(text);
|
|
237
|
+
if (!pureText) {
|
|
238
|
+
chunkMap.push({ count: 0 });
|
|
239
|
+
continue;
|
|
240
|
+
}
|
|
241
|
+
const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
|
|
242
|
+
chunkMap.push({ count: chunks.length });
|
|
243
|
+
allChunks.push(...chunks);
|
|
244
|
+
if (chunks.length === 1) {
|
|
245
|
+
const tokenCount = estimateTokens(chunks[0]);
|
|
246
|
+
winston().info(`[search-agent] embeddingService: batch input ${textIdx+1}/${texts.length} — 1 chunk, ${chunks[0].length} chars, ~${tokenCount} tokens`);
|
|
247
|
+
} else {
|
|
248
|
+
winston().info(`[search-agent] embeddingService: batch input ${textIdx+1}/${texts.length} — ${chunks.length} chunks`);
|
|
249
|
+
chunks.forEach((chunk, i) => {
|
|
250
|
+
const tokenCount = estimateTokens(chunk);
|
|
251
|
+
winston().info(`[search-agent] embeddingService: chunk ${i+1}/${chunks.length} — ${chunk.length} chars, ~${tokenCount} tokens`);
|
|
252
|
+
});
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
|
|
256
|
+
if (allChunks.length === 0) {
|
|
257
|
+
// All texts were filtered out
|
|
258
|
+
return chunkMap.map(({ count }) => count === 0 ? [] : null);
|
|
199
259
|
}
|
|
200
|
-
winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
|
|
201
260
|
const safeChunks = allChunks.map(truncate);
|
|
202
261
|
const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
|
|
203
262
|
winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`);
|
|
@@ -206,7 +265,9 @@ async function embedBatch(texts) {
|
|
|
206
265
|
const result = [];
|
|
207
266
|
let idx = 0;
|
|
208
267
|
for (const { count } of chunkMap) {
|
|
209
|
-
if (count === 1) {
|
|
268
|
+
if (count === 0) {
|
|
269
|
+
result.push([]); // No usable text
|
|
270
|
+
} else if (count === 1) {
|
|
210
271
|
result.push(vectors[idx]);
|
|
211
272
|
idx += 1;
|
|
212
273
|
} else {
|
|
package/services/vectorSearchService.js
CHANGED
|
@@ -13,7 +13,7 @@ const TOP_K = 50;
|
|
|
13
13
|
// Absolute minimum cosine similarity — only filters pure noise (near-zero similarity).
|
|
14
14
|
// Do NOT raise this: the relevant result often scores lower than irrelevant ones.
|
|
15
15
|
// The AI re-ranker (which reads content) is the precision gate, not this floor.
|
|
16
|
-
const MIN_SCORE = 0.
|
|
16
|
+
const MIN_SCORE = 0.15;
|
|
17
17
|
// Rebuild the Orama index after this interval (mirrors TF-IDF cache TTL)
|
|
18
18
|
const INDEX_TTL_MS = 5 * 60 * 1000;
|
|
19
19
|
|