nodebb-plugin-search-agent 0.0.933 → 0.0.935

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nodebb-plugin-search-agent",
3
- "version": "0.0.933",
3
+ "version": "0.0.935",
4
4
  "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
5
5
  "main": "library.js",
6
6
  "author": "Racheli Bayfus",
@@ -1,3 +1,22 @@
1
// Extensions of files/images whose links carry no searchable text. Kept as a
// single regex source so every pattern below agrees on the same list.
const PURE_TEXT_FILE_EXT = '(?:jpg|jpeg|png|gif|bmp|svg|webp|pdf|docx?|xlsx?|pptx?|zip|rar|7z|tar|gz|mp3|mp4|avi|mov|wmv|flv|mkv|ogg|wav|exe|bin|apk|ipa|dmg|iso|csv|json|xml|yml|yaml|psd|ai|eps|ttf|otf|woff2?)';

// Markdown image: ![alt](url)
const PURE_TEXT_MD_IMAGE_RE = /!\[[^\]]*\]\([^)]*\)/g;

// HTML <img ...> tag
const PURE_TEXT_HTML_IMG_RE = /<img\b[^>]*>/gi;

// Markdown link whose target ends in a file extension (optionally followed by
// a query string or fragment): [desc](url.ext), [desc](url.ext?dl=1)
const PURE_TEXT_MD_FILE_LINK_RE = new RegExp(
  `\\[[^\\]]*\\]\\([^)]*\\.${PURE_TEXT_FILE_EXT}(?:[?#][^)]*)?\\)`,
  'gi'
);

// <a href="...ext">...</a>. The lookahead requires the extension to END the
// path (quote, whitespace, '>', '?' or '#' follows), so a host such as
// "character.ai/chat" is not mistaken for a file. [\s\S] lets the anchor body
// span several lines.
const PURE_TEXT_FILE_ANCHOR_RE = new RegExp(
  `<a\\b[^>]*href=["']?[^"'>]+\\.${PURE_TEXT_FILE_EXT}(?=["'\\s>?#])[^>]*>[\\s\\S]*?</a>`,
  'gi'
);

// Bare http(s) URL pointing at a file. The extension must be followed either
// by a query/fragment (which is consumed) or by the end of the URL (optional
// trailing punctuation, then whitespace, '<', '>' or end of input). Without
// this boundary an ordinary URL whose host or path merely contains ".ai",
// ".doc", ".bin", ... would be cut in half.
const PURE_TEXT_FILE_URL_RE = new RegExp(
  `https?:\\/\\/\\S+\\.${PURE_TEXT_FILE_EXT}(?:[?#]\\S*|(?=[.,;:!)\\]'"]*(?:[\\s<>]|$)))`,
  'gi'
);

/**
 * Strip images, file links and HTML markup from a post so that only its
 * plain text remains.
 *
 * Removal order matters: Markdown links and <a> elements that point at files
 * are removed as whole units BEFORE bare file URLs are stripped. Otherwise the
 * URL would be pulled out of the markup first and leave residue such as
 * "[desc]()" or the anchor's link text behind.
 *
 * @param {*} text - Raw post content; anything that is not a string yields ''.
 * @returns {string} The cleaned text with runs of spaces/tabs collapsed to one
 *   space, consecutive newlines collapsed to one, and both ends trimmed.
 */
function extractPureText(text) {
  if (typeof text !== 'string') {
    return '';
  }
  return text
    .replace(PURE_TEXT_MD_IMAGE_RE, '')
    .replace(PURE_TEXT_HTML_IMG_RE, '')
    .replace(PURE_TEXT_MD_FILE_LINK_RE, '')
    .replace(PURE_TEXT_FILE_ANCHOR_RE, '')
    .replace(PURE_TEXT_FILE_URL_RE, '')
    // Any remaining tag becomes a space so adjacent words do not fuse.
    .replace(/<[^>]+>/g, ' ')
    .replace(/[ \t]+/g, ' ')
    .replace(/\n{2,}/g, '\n')
    .trim();
}
1
20
  'use strict';
2
21
 
3
22
  const https = require('https');
@@ -16,7 +35,7 @@ const RETRY_DELAY_MS = 500;
16
35
  // Using 1.5 chars/token worst-case: 8000 tokens × 1.5 = 12 000 chars — gives a safe margin.
17
36
 
18
37
  const MAX_CHARS = 12000;
19
- const CHUNK_OVERLAP = 2000; // chars to overlap between chunks for context
38
+ const CHUNK_OVERLAP = 200; // chars to overlap between chunks for context
20
39
 
21
40
  // Split a long string into chunks of maxLen, with optional overlap
22
41
  function splitIntoChunks(text, maxLen = MAX_CHARS, overlap = CHUNK_OVERLAP) {
@@ -136,8 +155,13 @@ async function embed(text) {
136
155
  throw new Error('OPENAI_API_KEY environment variable is not set');
137
156
  }
138
157
 
158
+ // Remove non-text content
159
+ const pureText = extractPureText(text);
160
+ if (!pureText) {
161
+ throw new Error('embed() received no usable text after filtering');
162
+ }
139
163
  // Split into chunks if too long
140
- const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
164
+ const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
141
165
  if (chunks.length === 1) {
142
166
  const safe = truncate(text);
143
167
  if (_embedCache.has(safe)) {
@@ -189,15 +213,24 @@ async function embedBatch(texts) {
189
213
  throw new Error('OPENAI_API_KEY environment variable is not set');
190
214
  }
191
215
 
192
- // For each text, if too long, split and average embeddings
216
+ // For each text, filter to pure text, then split and average embeddings
193
217
  const allChunks = [];
194
218
  const chunkMap = [];
195
219
  for (const text of texts) {
196
- const chunks = splitIntoChunks(text, MAX_CHARS, CHUNK_OVERLAP);
220
+ const pureText = extractPureText(text);
221
+ if (!pureText) {
222
+ chunkMap.push({ count: 0 });
223
+ continue;
224
+ }
225
+ const chunks = splitIntoChunks(pureText, MAX_CHARS, CHUNK_OVERLAP);
197
226
  chunkMap.push({ count: chunks.length });
198
227
  allChunks.push(...chunks);
199
228
  }
200
229
  winston().verbose(`[search-agent] embeddingService: batch embedding ${allChunks.length} chunk(s) from ${texts.length} input(s)`);
230
+ if (allChunks.length === 0) {
231
+ // All texts were filtered out
232
+ return chunkMap.map(({ count }) => count === 0 ? [] : null);
233
+ }
201
234
  const safeChunks = allChunks.map(truncate);
202
235
  const response = await withRetry(() => requestEmbeddings(apiKey, safeChunks));
203
236
  winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${safeChunks.length} vector(s))`);
@@ -206,7 +239,9 @@ async function embedBatch(texts) {
206
239
  const result = [];
207
240
  let idx = 0;
208
241
  for (const { count } of chunkMap) {
209
- if (count === 1) {
242
+ if (count === 0) {
243
+ result.push([]); // No usable text
244
+ } else if (count === 1) {
210
245
  result.push(vectors[idx]);
211
246
  idx += 1;
212
247
  } else {