nodebb-plugin-search-agent 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Computes the cosine similarity between two numeric vectors.
5
+ * Handles mismatched lengths by using the shorter vector's dimension.
6
+ * Returns 0 if either vector has zero magnitude.
7
+ *
8
+ * @param {number[]} a
9
+ * @param {number[]} b
10
+ * @returns {number} similarity in [-1, 1]
11
+ */
12
+ function cosineSimilarity(a, b) {
13
+ const len = Math.min(a.length, b.length);
14
+ let dot = 0;
15
+ let magA = 0;
16
+ let magB = 0;
17
+
18
+ for (let i = 0; i < len; i++) {
19
+ dot += a[i] * b[i];
20
+ magA += a[i] * a[i];
21
+ magB += b[i] * b[i];
22
+ }
23
+
24
+ const denom = Math.sqrt(magA) * Math.sqrt(magB);
25
+ return denom === 0 ? 0 : dot / denom;
26
+ }
27
+
28
+ /**
29
+ * Ranks items by cosine similarity to a query embedding.
30
+ * Each item must have an `embedding` property (number[]).
31
+ *
32
+ * @param {number[]} queryEmbedding
33
+ * @param {Array<{embedding: number[], [key: string]: any}>} items
34
+ * @returns {Array<{item: object, score: number}>} sorted descending by score
35
+ */
36
+ function rankBySimilarity(queryEmbedding, items) {
37
+ return items
38
+ .map(item => ({ item, score: cosineSimilarity(queryEmbedding, item.embedding) }))
39
+ .sort((a, b) => b.score - a.score);
40
+ }
41
+
42
+ module.exports = { cosineSimilarity, rankBySimilarity };
@@ -166,7 +166,7 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
166
166
 
167
167
  /**
168
168
  * Search forum topics for the given query text.
169
- * Uses TF-IDF for candidate selection and optionally OpenAI for re-ranking.
169
+ * Tries semantic (vector) search first; falls back to TF-IDF on failure or empty results.
170
170
  *
171
171
  * @param {string} queryText
172
172
  * @returns {Promise<{ tid: number|string, title: string, url: string }[]>}
@@ -174,6 +174,41 @@ async function reRankWithAI(queryText, candidates, topicMap, apiKey, model, maxR
174
174
  async function searchTopics(queryText) {
175
175
  const winston = require.main.require('winston');
176
176
  const settings = await getSettings();
177
+
178
+ // ── Semantic search (primary) ────────────────────────────────────────────
179
+ try {
180
+ const { search: vectorSearch } = require('../services/vectorSearchService');
181
+ const vectorResults = await vectorSearch(queryText);
182
+
183
+ if (vectorResults.length > 0) {
184
+ const Topics = require.main.require('./src/topics');
185
+ const tids = [...new Set(vectorResults.map(r => r.topic_id))];
186
+ const topics = await Topics.getTopicsFields(tids, ['tid', 'title', 'slug', 'deleted']);
187
+ const topicByTid = Object.fromEntries(
188
+ topics.filter(t => t && t.tid && !t.deleted).map(t => [String(t.tid), t])
189
+ );
190
+
191
+ const results = vectorResults
192
+ .filter(r => topicByTid[String(r.topic_id)])
193
+ .map(r => {
194
+ const t = topicByTid[String(r.topic_id)];
195
+ return { tid: t.tid, title: t.title, url: `/topic/${t.slug || t.tid}` };
196
+ })
197
+ .filter((r, i, arr) => arr.findIndex(x => x.tid === r.tid) === i) // dedupe by tid
198
+ .slice(0, settings.maxResults);
199
+
200
+ if (results.length > 0) {
201
+ winston.info(`[search-agent] Semantic search returned ${results.length} results for "${queryText}".`);
202
+ return results;
203
+ }
204
+ }
205
+
206
+ winston.info(`[search-agent] Semantic search returned no results for "${queryText}", falling back to TF-IDF.`);
207
+ } catch (err) {
208
+ winston.warn(`[search-agent] Semantic search failed, falling back to TF-IDF: ${err.message}`);
209
+ }
210
+
211
+ // ── TF-IDF fallback (original logic) ─────────────────────────────────────
177
212
  const { index, topicMap } = await getIndex(settings.topicLimit);
178
213
 
179
214
  winston.info(`[search-agent] Query: "${queryText}" | index size: ${index.length} topics | aiEnabled: ${settings.aiEnabled && !!settings.openaiApiKey}`);
package/library.js CHANGED
@@ -4,6 +4,8 @@ const winston = require.main.require('winston');
4
4
 
5
5
  const controllers = require('./lib/controllers');
6
6
  const { invalidateCache } = require('./lib/searchHandler');
7
+ const { startSync } = require('./services/syncService');
8
+ const { indexPost } = require('./services/realtimeService');
7
9
 
8
10
  const routeHelpers = require.main.require('./src/routes/helpers');
9
11
 
@@ -14,7 +16,7 @@ const plugin = {};
14
16
  * Register admin page route and set up cache-invalidation on new topic creation.
15
17
  */
16
18
  plugin.init = async (params) => {
17
- console.log('[search-agent] plugin.init called');
19
+ winston.info('[search-agent] plugin.init called');
18
20
  const { router } = params;
19
21
 
20
22
  // Admin settings page
@@ -24,7 +26,19 @@ plugin.init = async (params) => {
24
26
  controllers.renderAdminPage
25
27
  );
26
28
 
27
- console.log('[search-agent] Admin route registered: /admin/plugins/search-agent');
29
+ winston.info('[search-agent] Admin route registered: /admin/plugins/search-agent');
30
+
31
+ // Start initial embedding sync in the background — does not block NodeBB startup.
32
+ winston.info('[search-agent] Starting initial embedding sync…');
33
+ startSync().catch(err => winston.warn(`[search-agent] Initial sync failed: ${err.message}`));
34
+
35
+ // Re-sync every 10 minutes to pick up new posts.
36
+ const RESYNC_INTERVAL_MS = 10 * 60 * 1000;
37
+ setInterval(() => {
38
+ winston.info('[search-agent] Running scheduled embedding re-sync…');
39
+ startSync().catch(err => winston.warn(`[search-agent] Scheduled re-sync failed: ${err.message}`));
40
+ }, RESYNC_INTERVAL_MS).unref();
41
+
28
42
  winston.info('[plugins/search-agent] Initialised.');
29
43
  };
30
44
 
@@ -40,7 +54,7 @@ plugin.init = async (params) => {
40
54
  * To allow guests as well, remove the middleware array below.
41
55
  */
42
56
  plugin.addRoutes = async ({ router, middleware, helpers }) => {
43
- console.log('[search-agent] plugin.addRoutes called — registering API routes');
57
+ winston.info('[search-agent] plugin.addRoutes called — registering API routes');
44
58
  const middlewares = [
45
59
  middleware.ensureLoggedIn,
46
60
  ];
@@ -52,7 +66,7 @@ plugin.addRoutes = async ({ router, middleware, helpers }) => {
52
66
  middlewares,
53
67
  (req, res) => controllers.handleQuery(req, res, helpers)
54
68
  );
55
- console.log('[search-agent] API route registered: POST /api/v3/plugins/search-agent/query');
69
+ winston.info('[search-agent] API route registered: POST /api/v3/plugins/search-agent/query');
56
70
 
57
71
  // Public config endpoint — no auth required so guests can read the visibility setting
58
72
  routeHelpers.setupApiRoute(
@@ -62,7 +76,7 @@ plugin.addRoutes = async ({ router, middleware, helpers }) => {
62
76
  [],
63
77
  (req, res) => controllers.getConfig(req, res, helpers)
64
78
  );
65
- console.log('[search-agent] API route registered: GET /api/v3/plugins/search-agent/config');
79
+ winston.info('[search-agent] API route registered: GET /api/v3/plugins/search-agent/config');
66
80
 
67
81
  // Lightweight cache-bust endpoint (admin only)
68
82
  routeHelpers.setupApiRoute(
@@ -71,12 +85,34 @@ plugin.addRoutes = async ({ router, middleware, helpers }) => {
71
85
  '/search-agent/cache/invalidate',
72
86
  [middleware.ensureLoggedIn, middleware.admin.checkPrivileges],
73
87
  (req, res) => {
74
- console.log('[search-agent] Cache invalidation requested by uid:', req.uid);
88
+ winston.info(`[search-agent] Cache invalidation requested by uid: ${req.uid}`);
75
89
  invalidateCache();
76
90
  helpers.formatApiResponse(200, res, { message: 'Cache invalidated.' });
77
91
  }
78
92
  );
79
- console.log('[search-agent] API route registered: POST /api/v3/plugins/search-agent/cache/invalidate');
93
+ winston.info('[search-agent] API route registered: POST /api/v3/plugins/search-agent/cache/invalidate');
94
+ };
95
+
96
+ /**
97
+ * action:post.save
98
+ * Index a newly created post immediately after it is persisted.
99
+ * Non-blocking — errors are caught and logged without affecting the caller.
100
+ */
101
+ plugin.onPostSave = ({ post }) => {
102
+ indexPost(post).catch(err =>
103
+ winston.warn(`[search-agent] realtimeService: failed to index new post pid ${post && post.pid}: ${err.message}`)
104
+ );
105
+ };
106
+
107
+ /**
108
+ * action:post.edit
109
+ * Re-index a post whenever its content is edited.
110
+ * Non-blocking — errors are caught and logged without affecting the caller.
111
+ */
112
+ plugin.onPostEdit = ({ post }) => {
113
+ indexPost(post).catch(err =>
114
+ winston.warn(`[search-agent] realtimeService: failed to re-index edited post pid ${post && post.pid}: ${err.message}`)
115
+ );
80
116
  };
81
117
 
82
118
  /**
@@ -84,7 +120,7 @@ plugin.addRoutes = async ({ router, middleware, helpers }) => {
84
120
  * Add the plugin entry to the ACP sidebar.
85
121
  */
86
122
  plugin.addAdminNavigation = (header) => {
87
- console.log('[search-agent] plugin.addAdminNavigation called — adding ACP sidebar entry');
123
+ winston.info('[search-agent] plugin.addAdminNavigation called — adding ACP sidebar entry');
88
124
  header.plugins.push({
89
125
  route: '/plugins/search-agent',
90
126
  icon: 'fa-comments',
package/package.json CHANGED
@@ -1,15 +1,17 @@
1
1
  {
2
2
  "name": "nodebb-plugin-search-agent",
3
- "version": "0.0.2",
3
+ "version": "0.0.4",
4
4
  "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
5
5
  "main": "library.js",
6
6
  "author": "Racheli Bayfus",
7
+ "private": false,
7
8
  "repository": {
8
9
  "type": "git",
9
10
  "url": "git+https://github.com/racheliK9201/nodebb-plugin-search-agent.git"
10
11
  },
11
12
  "scripts": {
12
- "lint": "eslint ."
13
+ "lint": "eslint .",
14
+ "test:ai": "node test/runTests.js"
13
15
  },
14
16
  "keywords": [
15
17
  "nodebb",
@@ -48,5 +50,9 @@
48
50
  "eslint-config-nodebb": "2.0.1",
49
51
  "husky": "9.1.7",
50
52
  "lint-staged": "16.4.0"
53
+ },
54
+ "dependencies": {
55
+ "@orama/orama": "^3.1.18",
56
+ "winston": "^3.19.0"
51
57
  }
52
58
  }
package/plugin.json CHANGED
@@ -7,7 +7,9 @@
7
7
  "hooks": [
8
8
  { "hook": "static:app.load", "method": "init" },
9
9
  { "hook": "static:api.routes", "method": "addRoutes" },
10
- { "hook": "filter:admin.header.build", "method": "addAdminNavigation" }
10
+ { "hook": "filter:admin.header.build", "method": "addAdminNavigation" },
11
+ { "hook": "action:post.save", "method": "onPostSave" },
12
+ { "hook": "action:post.edit", "method": "onPostEdit" }
11
13
  ],
12
14
  "staticDirs": {
13
15
  "static": "./static"
@@ -0,0 +1,135 @@
1
+ 'use strict';
2
+
3
+ const https = require('https');
4
+
5
+ function winston() {
6
+ return require.main.require('winston');
7
+ }
8
+
9
+ const OPENAI_EMBEDDINGS_HOSTNAME = 'api.openai.com';
10
+ const OPENAI_EMBEDDINGS_PATH = '/v1/embeddings';
11
+ const EMBEDDING_MODEL = 'text-embedding-3-small';
12
+ const MAX_RETRIES = 3;
13
+ const RETRY_DELAY_MS = 500;
14
+
15
+ /**
16
+ * Performs an HTTPS POST request to the OpenAI embeddings endpoint.
17
+ * @param {string} apiKey
18
+ * @param {string|string[]} input - Single text or array of texts
19
+ * @returns {Promise<object>} Parsed JSON response body
20
+ */
21
+ function requestEmbeddings(apiKey, input) {
22
+ return new Promise((resolve, reject) => {
23
+ const body = JSON.stringify({ model: EMBEDDING_MODEL, input });
24
+ const options = {
25
+ hostname: OPENAI_EMBEDDINGS_HOSTNAME,
26
+ path: OPENAI_EMBEDDINGS_PATH,
27
+ method: 'POST',
28
+ headers: {
29
+ 'Content-Type': 'application/json',
30
+ 'Authorization': `Bearer ${apiKey}`,
31
+ 'Content-Length': Buffer.byteLength(body),
32
+ },
33
+ };
34
+
35
+ const req = https.request(options, (res) => {
36
+ const chunks = [];
37
+ res.on('data', chunk => chunks.push(chunk));
38
+ res.on('end', () => {
39
+ let parsed;
40
+ try {
41
+ parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
42
+ } catch (e) {
43
+ return reject(new Error(`Failed to parse OpenAI response: ${e.message}`));
44
+ }
45
+
46
+ if (res.statusCode >= 400) {
47
+ const message = (parsed.error && parsed.error.message) || `HTTP ${res.statusCode}`;
48
+ return reject(new Error(`OpenAI API error: ${message}`));
49
+ }
50
+
51
+ resolve(parsed);
52
+ });
53
+ });
54
+
55
+ req.on('error', err => reject(new Error(`Network error calling OpenAI: ${err.message}`)));
56
+ req.write(body);
57
+ req.end();
58
+ });
59
+ }
60
+
61
+ /**
62
+ * Retries an async operation up to maxRetries times with exponential back-off.
63
+ * @param {Function} fn - Async function to retry
64
+ * @param {number} retries
65
+ * @returns {Promise<*>}
66
+ */
67
+ async function withRetry(fn, retries = MAX_RETRIES) {
68
+ let lastError;
69
+ for (let attempt = 1; attempt <= retries; attempt++) {
70
+ try {
71
+ return await fn();
72
+ } catch (err) {
73
+ lastError = err;
74
+ if (attempt < retries) {
75
+ winston().warn(`[search-agent] embeddingService: attempt ${attempt} failed (${err.message}), retrying in ${RETRY_DELAY_MS * attempt} ms…`);
76
+ await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS * attempt));
77
+ }
78
+ }
79
+ }
80
+ throw lastError;
81
+ }
82
+
83
+ /**
84
+ * Converts a single text string into an embedding vector.
85
+ * @param {string} text
86
+ * @returns {Promise<number[]>}
87
+ */
88
+ async function embed(text) {
89
+ if (typeof text !== 'string' || text.trim() === '') {
90
+ throw new Error('embed() requires a non-empty string');
91
+ }
92
+
93
+ const apiKey = process.env.OPENAI_API_KEY;
94
+ if (!apiKey) {
95
+ throw new Error('OPENAI_API_KEY environment variable is not set');
96
+ }
97
+
98
+ winston().verbose(`[search-agent] embeddingService: generating embedding for text (${text.length} chars)`);
99
+ const response = await withRetry(() => requestEmbeddings(apiKey, text));
100
+ winston().verbose('[search-agent] embeddingService: embedding generated successfully');
101
+ return response.data[0].embedding;
102
+ }
103
+
104
+ /**
105
+ * Converts an array of text strings into an array of embedding vectors.
106
+ * Texts are sent in a single batched API request.
107
+ * @param {string[]} texts
108
+ * @returns {Promise<number[][]>}
109
+ */
110
+ async function embedBatch(texts) {
111
+ if (!Array.isArray(texts) || texts.length === 0) {
112
+ throw new Error('embedBatch() requires a non-empty array of strings');
113
+ }
114
+
115
+ const invalid = texts.findIndex(t => typeof t !== 'string' || t.trim() === '');
116
+ if (invalid !== -1) {
117
+ throw new Error(`embedBatch() received an empty or non-string value at index ${invalid}`);
118
+ }
119
+
120
+ const apiKey = process.env.OPENAI_API_KEY;
121
+ if (!apiKey) {
122
+ throw new Error('OPENAI_API_KEY environment variable is not set');
123
+ }
124
+
125
+ winston().verbose(`[search-agent] embeddingService: generating batch embeddings for ${texts.length} text(s)`);
126
+ const response = await withRetry(() => requestEmbeddings(apiKey, texts));
127
+ winston().verbose(`[search-agent] embeddingService: batch embeddings generated successfully (${texts.length} vector(s))`);
128
+
129
+ // OpenAI returns items sorted by index field, but sort explicitly to be safe
130
+ return response.data
131
+ .sort((a, b) => a.index - b.index)
132
+ .map(item => item.embedding);
133
+ }
134
+
135
+ module.exports = { embed, embedBatch };
@@ -0,0 +1,42 @@
1
+ 'use strict';
2
+
3
+ const { embed } = require('./embeddingService');
4
+ const { saveEmbedding } = require('./vectorStore');
5
+
6
+ function winston() {
7
+ return require.main.require('winston');
8
+ }
9
+
10
+ /**
11
+ * Generate and persist an embedding for a single post.
12
+ * Safe to call fire-and-forget — all errors are caught and logged.
13
+ *
14
+ * @param {{ pid: number|string, tid: number|string, content: string, deleted?: boolean }} postData
15
+ * @returns {Promise<void>}
16
+ */
17
+ async function indexPost(postData) {
18
+ if (!postData || postData.deleted) {
19
+ return;
20
+ }
21
+
22
+ const pid = parseInt(postData.pid, 10);
23
+ const tid = parseInt(postData.tid, 10);
24
+ const content = postData.content;
25
+
26
+ if (Number.isNaN(pid) || Number.isNaN(tid)) {
27
+ winston().warn('[search-agent] realtimeService: skipping post with invalid pid/tid');
28
+ return;
29
+ }
30
+
31
+ if (typeof content !== 'string' || content.trim() === '') {
32
+ return;
33
+ }
34
+
35
+ const vector = await embed(content);
36
+
37
+ await saveEmbedding({ post_id: pid, topic_id: tid, content, embedding: vector });
38
+
39
+ winston().verbose(`[search-agent] realtimeService: indexed pid ${pid}`);
40
+ }
41
+
42
+ module.exports = { indexPost };
@@ -0,0 +1,175 @@
1
+ 'use strict';
2
+
3
+ const { embedBatch } = require('./embeddingService');
4
+ const { saveEmbedding, getMissingEmbeddings } = require('./vectorStore');
5
+
6
+ const BATCH_SIZE = 25;
7
+ // Brief pause between batches to yield the event loop and avoid I/O starvation
8
+ const INTER_BATCH_DELAY_MS = 200;
9
+
10
+ let _syncRunning = false;
11
+
12
+ function sleep(ms) {
13
+ return new Promise(resolve => setTimeout(resolve, ms));
14
+ }
15
+
16
+ function winston() {
17
+ return require.main.require('winston');
18
+ }
19
+
20
+ /**
21
+ * Fetch every post ID from the NodeBB sorted set, ordered newest-first.
22
+ * @returns {Promise<number[]>}
23
+ */
24
+ async function fetchAllPostIds() {
25
+ const db = require.main.require('./src/database');
26
+ const raw = await db.getSortedSetRevRange('posts:pid', 0, -1);
27
+ return (raw || []).map(id => parseInt(id, 10)).filter(n => !Number.isNaN(n));
28
+ }
29
+
30
+ /**
31
+ * Fetch selected fields for a list of post IDs.
32
+ * @param {number[]} pids
33
+ * @returns {Promise<Array<{pid: number, tid: number, content: string, deleted: boolean}>>}
34
+ */
35
+ async function fetchPostFields(pids) {
36
+ const Posts = require.main.require('./src/posts');
37
+ return Posts.getPostsFields(pids, ['pid', 'tid', 'content', 'deleted']);
38
+ }
39
+
40
+ /**
41
+ * Core sync loop. Iterates all posts in batches, skips already-embedded ones,
42
+ * generates embeddings for the rest, and persists them.
43
+ */
44
+ async function runSync() {
45
+ if (_syncRunning) {
46
+ winston().warn('[search-agent] syncService: sync already in progress, skipping duplicate start.');
47
+ return;
48
+ }
49
+ _syncRunning = true;
50
+ winston().info('[search-agent] syncService: starting embedding sync…');
51
+
52
+ try {
53
+ const allPids = await fetchAllPostIds();
54
+
55
+ if (allPids.length === 0) {
56
+ winston().info('[search-agent] syncService: no posts found — sync complete.');
57
+ return;
58
+ }
59
+
60
+ winston().info(`[search-agent] syncService: ${allPids.length} post(s) to scan.`);
61
+
62
+ let totalChecked = 0;
63
+ let totalEmbedded = 0;
64
+ let totalErrors = 0;
65
+
66
+ for (let offset = 0; offset < allPids.length; offset += BATCH_SIZE) {
67
+ const batchPids = allPids.slice(offset, offset + BATCH_SIZE);
68
+
69
+ // Yield between batches so other async work can proceed
70
+ await sleep(INTER_BATCH_DELAY_MS);
71
+
72
+ // ------------------------------------------------------------------
73
+ // 1. Find which PIDs in this batch still lack an embedding
74
+ // ------------------------------------------------------------------
75
+ let missingPids;
76
+ try {
77
+ missingPids = await getMissingEmbeddings(batchPids);
78
+ } catch (err) {
79
+ winston().error(`[search-agent] syncService: failed to query missing embeddings (offset ${offset}): ${err.message}`);
80
+ totalErrors++;
81
+ continue;
82
+ }
83
+
84
+ totalChecked += batchPids.length;
85
+
86
+ if (missingPids.length === 0) {
87
+ continue; // all already embedded
88
+ }
89
+
90
+ // ------------------------------------------------------------------
91
+ // 2. Fetch post content for the missing PIDs
92
+ // ------------------------------------------------------------------
93
+ let rawPosts;
94
+ try {
95
+ rawPosts = await fetchPostFields(missingPids);
96
+ } catch (err) {
97
+ winston().error(`[search-agent] syncService: failed to fetch post fields (offset ${offset}): ${err.message}`);
98
+ totalErrors++;
99
+ continue;
100
+ }
101
+
102
+ // Drop deleted posts and posts with no usable content
103
+ const posts = rawPosts.filter(
104
+ p => p && !p.deleted && typeof p.content === 'string' && p.content.trim() !== ''
105
+ );
106
+
107
+ if (posts.length === 0) {
108
+ continue;
109
+ }
110
+
111
+ // ------------------------------------------------------------------
112
+ // 3. Generate embeddings for this sub-batch
113
+ // ------------------------------------------------------------------
114
+ let vectors;
115
+ try {
116
+ vectors = await embedBatch(posts.map(p => p.content));
117
+ } catch (err) {
118
+ winston().error(`[search-agent] syncService: failed to generate embeddings (offset ${offset}): ${err.message}`);
119
+ totalErrors++;
120
+ continue;
121
+ }
122
+
123
+ // ------------------------------------------------------------------
124
+ // 4. Persist each embedding — failures are isolated per document
125
+ // ------------------------------------------------------------------
126
+ for (let i = 0; i < posts.length; i++) {
127
+ const post = posts[i];
128
+ try {
129
+ await saveEmbedding({
130
+ post_id: parseInt(post.pid, 10),
131
+ topic_id: parseInt(post.tid, 10),
132
+ content: post.content,
133
+ embedding: vectors[i],
134
+ });
135
+ totalEmbedded++;
136
+ } catch (err) {
137
+ winston().error(`[search-agent] syncService: failed to save embedding for pid ${post.pid}: ${err.message}`);
138
+ totalErrors++;
139
+ }
140
+ }
141
+
142
+ winston().info(
143
+ `[search-agent] syncService: progress — ${totalChecked}/${allPids.length} checked, ` +
144
+ `${totalEmbedded} newly embedded, ${totalErrors} error(s).`
145
+ );
146
+ }
147
+
148
+ winston().info(
149
+ `[search-agent] syncService: sync complete — ` +
150
+ `${totalEmbedded} embedding(s) created, ${totalErrors} error(s).`
151
+ );
152
+ } catch (err) {
153
+ winston().error(`[search-agent] syncService: unexpected error — ${err.message}`);
154
+ } finally {
155
+ _syncRunning = false;
156
+ }
157
+ }
158
+
159
+ /**
160
+ * Kick off the embedding sync as a non-blocking background job.
161
+ *
162
+ * Safe to call multiple times — if a sync is already running the new call is
163
+ * silently dropped. Returns immediately so it never delays server startup.
164
+ */
165
+ function startSync() {
166
+ setImmediate(() => {
167
+ runSync().catch(err => {
168
+ require.main.require('winston').error(
169
+ `[search-agent] syncService: unhandled rejection in runSync — ${err.message}`
170
+ );
171
+ });
172
+ });
173
+ }
174
+
175
+ module.exports = { startSync };
@@ -0,0 +1,116 @@
1
+ 'use strict';
2
+
3
+ const { create, insertMultiple, search: oramaSearch } = require('@orama/orama');
4
+ const { embed } = require('./embeddingService');
5
+ const { getAllEmbeddings } = require('./vectorStore');
6
+
7
+ function winston() {
8
+ return require.main.require('winston');
9
+ }
10
+
11
+ const TOP_K = 10;
12
+ // Rebuild the Orama index after this interval (mirrors TF-IDF cache TTL)
13
+ const INDEX_TTL_MS = 5 * 60 * 1000;
14
+
15
+ let _db = null;
16
+ let _dbTs = 0;
17
+ let _buildPromise = null;
18
+
19
+ async function buildIndex() {
20
+ const storedEmbeddings = await getAllEmbeddings();
21
+
22
+ // Detect dimension from data; fall back to 1536 (text-embedding-3-small default)
23
+ const dimensions = storedEmbeddings.length > 0
24
+ ? storedEmbeddings[0].embedding.length
25
+ : 1536;
26
+
27
+ const db = await create({
28
+ schema: {
29
+ post_id: 'number',
30
+ topic_id: 'number',
31
+ content: 'string',
32
+ embedding: `vector[${dimensions}]`,
33
+ },
34
+ });
35
+
36
+ if (storedEmbeddings.length > 0) {
37
+ await insertMultiple(db, storedEmbeddings.map(e => ({
38
+ id: String(e.post_id),
39
+ post_id: e.post_id,
40
+ topic_id: e.topic_id,
41
+ content: e.content,
42
+ embedding: e.embedding,
43
+ })));
44
+ }
45
+
46
+ winston().info(`[search-agent] vectorSearchService: Orama index built with ${storedEmbeddings.length} document(s)`);
47
+ return db;
48
+ }
49
+
50
+ async function getDb() {
51
+ const now = Date.now();
52
+ if (_db && (now - _dbTs) < INDEX_TTL_MS) {
53
+ return _db;
54
+ }
55
+
56
+ if (_buildPromise) {
57
+ return _buildPromise;
58
+ }
59
+
60
+ _buildPromise = buildIndex().then((db) => {
61
+ _db = db;
62
+ _dbTs = Date.now();
63
+ _buildPromise = null;
64
+ return db;
65
+ }).catch((err) => {
66
+ _buildPromise = null;
67
+ throw err;
68
+ });
69
+
70
+ return _buildPromise;
71
+ }
72
+
73
+ /** Invalidate the in-memory Orama index (e.g. after new embeddings are saved). */
74
+ function invalidateIndex() {
75
+ _db = null;
76
+ _dbTs = 0;
77
+ winston().info('[search-agent] vectorSearchService: Orama index invalidated');
78
+ }
79
+
80
+ /**
81
+ * Performs semantic search against stored post embeddings using Orama vector search.
82
+ *
83
+ * @param {string} query - The search query string.
84
+ * @returns {Promise<Array<{ topic_id: number, post_id: number, content: string, score: number }>>}
85
+ * Top results sorted by cosine similarity descending.
86
+ */
87
+ async function search(query) {
88
+ if (typeof query !== 'string' || query.trim() === '') {
89
+ throw new Error('search() requires a non-empty query string');
90
+ }
91
+
92
+ winston().verbose(`[search-agent] vectorSearchService: running Orama vector search for "${query.trim()}"`);
93
+
94
+ const [queryEmbedding, db] = await Promise.all([
95
+ embed(query),
96
+ getDb(),
97
+ ]);
98
+
99
+ const results = await oramaSearch(db, {
100
+ mode: 'vector',
101
+ vector: { value: queryEmbedding, property: 'embedding' },
102
+ limit: TOP_K,
103
+ includeVectors: false,
104
+ });
105
+
106
+ winston().verbose(`[search-agent] vectorSearchService: Orama returned ${results.hits.length} hit(s)`);
107
+
108
+ return results.hits.map(hit => ({
109
+ topic_id: hit.document.topic_id,
110
+ post_id: hit.document.post_id,
111
+ content: hit.document.content,
112
+ score: hit.score,
113
+ }));
114
+ }
115
+
116
+ module.exports = { search, invalidateIndex };
@@ -0,0 +1,137 @@
1
+ 'use strict';
2
+
3
+ const COLLECTION = 'plugin_ai_embeddings';
4
+ // Maximum number of embeddings held in memory. Older posts (lower post_id) are
5
+ // dropped first when this ceiling is reached on initial load.
6
+ const MAX_DATASET_SIZE = 50_000;
7
+
8
+ function getCollection() {
9
+ const db = require.main.require('./src/database');
10
+ return db.client.collection(COLLECTION);
11
+ }
12
+
13
+ function winston() {
14
+ return require.main.require('winston');
15
+ }
16
+
17
+ // Promise-based singleton so concurrent callers share one init operation
18
+ let _ensureIndexes = null;
19
+
20
+ // In-memory cache of loaded embeddings (null = not yet populated)
21
+ let _cache = null;
22
+ // Inflight load promise shared by concurrent first-callers
23
+ let _cachePromise = null;
24
+
25
+ function ensureIndexes() {
26
+ if (!_ensureIndexes) {
27
+ _ensureIndexes = (async () => {
28
+ const col = getCollection();
29
+ await col.createIndex({ post_id: 1 }, { unique: true });
30
+ await col.createIndex({ topic_id: 1 });
31
+ })();
32
+ }
33
+ return _ensureIndexes;
34
+ }
35
+
36
+ /**
37
+ * Upsert an embedding document for a post. Safe to call repeatedly — no duplicates.
38
+ * Also updates the in-memory cache if it is already loaded.
39
+ *
40
+ * @param {{ post_id: number, topic_id: number, content: string, embedding: number[] }} post
41
+ * @returns {Promise<void>}
42
+ */
43
+ async function saveEmbedding(post) {
44
+ const { post_id, topic_id, content, embedding } = post;
45
+ winston().verbose(`[search-agent] vectorStore: upserting embedding for post_id ${post_id}`);
46
+ await ensureIndexes();
47
+ const col = getCollection();
48
+ await col.updateOne(
49
+ { post_id },
50
+ { $set: { post_id, topic_id, content, embedding } },
51
+ { upsert: true }
52
+ );
53
+
54
+ // Keep in-memory cache consistent without requiring a full reload
55
+ if (_cache !== null) {
56
+ const entry = { post_id, topic_id, content, embedding };
57
+ const idx = _cache.findIndex(e => e.post_id === post_id);
58
+ if (idx !== -1) {
59
+ _cache[idx] = entry;
60
+ } else {
61
+ // Prepend so newest posts stay at the front (mirrors load order)
62
+ _cache.unshift(entry);
63
+ if (_cache.length > MAX_DATASET_SIZE) {
64
+ _cache.length = MAX_DATASET_SIZE;
65
+ }
66
+ }
67
+ }
68
+ }
69
+
70
+ /**
71
+ * Retrieve all stored embeddings from the in-memory cache.
72
+ * On first call the cache is populated from MongoDB (up to MAX_DATASET_SIZE
73
+ * documents, newest posts first). Subsequent calls are served from memory.
74
+ *
75
+ * @returns {Promise<Array<{ post_id: number, topic_id: number, content: string, embedding: number[] }>>}
76
+ */
77
+ async function getAllEmbeddings() {
78
+ if (_cache !== null) {
79
+ winston().verbose(`[search-agent] vectorStore: cache hit — returning ${_cache.length} embedding(s) from memory`);
80
+ return _cache;
81
+ }
82
+
83
+ if (!_cachePromise) {
84
+ _cachePromise = (async () => {
85
+ winston().info('[search-agent] vectorStore: loading embeddings from database…');
86
+ await ensureIndexes();
87
+ const col = getCollection();
88
+ _cache = await col
89
+ .find({}, { projection: { _id: 0 } })
90
+ .sort({ post_id: -1 })
91
+ .limit(MAX_DATASET_SIZE)
92
+ .toArray();
93
+ _cachePromise = null;
94
+ winston().info(`[search-agent] vectorStore: loaded ${_cache.length} embedding(s) from database`);
95
+ return _cache;
96
+ })();
97
+ }
98
+
99
+ return _cachePromise;
100
+ }
101
+
102
+ /**
103
+ * Find a single embedding by post ID.
104
+ *
105
+ * @param {number} post_id
106
+ * @returns {Promise<{ post_id: number, topic_id: number, content: string, embedding: number[] } | null>}
107
+ */
108
+ async function findByPostId(post_id) {
109
+ winston().verbose(`[search-agent] vectorStore: looking up embedding for post_id ${post_id}`);
110
+ await ensureIndexes();
111
+ const col = getCollection();
112
+ return col.findOne({ post_id }, { projection: { _id: 0 } });
113
+ }
114
+
115
+ /**
116
+ * Given an array of post IDs, return the subset that have no stored embedding yet.
117
+ *
118
+ * @param {number[]} postIds
119
+ * @returns {Promise<number[]>}
120
+ */
121
+ async function getMissingEmbeddings(postIds) {
122
+ if (!Array.isArray(postIds) || postIds.length === 0) {
123
+ return [];
124
+ }
125
+ winston().verbose(`[search-agent] vectorStore: checking ${postIds.length} post ID(s) for missing embeddings`);
126
+ await ensureIndexes();
127
+ const col = getCollection();
128
+ const existing = await col
129
+ .find({ post_id: { $in: postIds } }, { projection: { _id: 0, post_id: 1 } })
130
+ .toArray();
131
+ const found = new Set(existing.map(doc => doc.post_id));
132
+ const missing = postIds.filter(id => !found.has(id));
133
+ winston().verbose(`[search-agent] vectorStore: ${missing.length} of ${postIds.length} post(s) missing embeddings`);
134
+ return missing;
135
+ }
136
+
137
+ module.exports = { saveEmbedding, getAllEmbeddings, findByPostId, getMissingEmbeddings };
@@ -0,0 +1,71 @@
1
'use strict';

// ─── Stub NodeBB internals ───────────────────────────────────────────────────
// Services call require.main.require('./src/database') etc. which resolves
// relative to this file when running standalone. Intercept those calls and
// return lightweight in-memory fakes so no NodeBB installation is needed.

// Backing storage for the fake Mongo collection, keyed by post_id.
const inMemoryStore = new Map();

/**
 * Build a minimal in-memory stand-in for a MongoDB collection, supporting
 * only the calls the vector store makes: createIndex, updateOne (keyed by
 * the filter's post_id), find (with sort/limit/toArray chaining) and findOne.
 *
 * @returns {object} fake collection object
 */
function makeCollection() {
  // Apply the only filter shape the services use: { post_id: { $in: [...] } }.
  // Any other filter (including {}) matches every stored document.
  const applyFilter = (filter) => {
    const docs = [...inMemoryStore.values()];
    if (filter && filter.post_id && filter.post_id.$in) {
      return docs.filter(doc => filter.post_id.$in.includes(doc.post_id));
    }
    return docs;
  };

  return {
    // Index creation is a no-op for the in-memory fake.
    createIndex: async () => {},
    // Upsert-style write: store the $set payload under the filter's post_id.
    updateOne: async (filter, update) => {
      inMemoryStore.set(filter.post_id, update.$set || {});
    },
    // Cursor-like chain: sort() is ignored, limit(n) caps the result count.
    find: (filter) => {
      const matched = applyFilter(filter);
      const chain = {
        _n: null,
        sort: () => chain,
        limit: (n) => {
          chain._n = n;
          return chain;
        },
        toArray: async () => (chain._n != null ? matched.slice(0, chain._n) : matched),
      };
      return chain;
    },
    findOne: async (filter) => inMemoryStore.get(filter.post_id) || null,
  };
}
33
+
34
// Minimal winston replacement: route each log level to the console with a
// matching prefix so standalone test output stays readable.
const winstonStub = {
  info(...args) {
    console.log('[info]', ...args);
  },
  warn(...args) {
    console.warn('[warn]', ...args);
  },
  error(...args) {
    console.error('[error]', ...args);
  },
  verbose(...args) {
    console.log('[verbose]', ...args);
  },
};
40
+
41
// Map of module IDs (as passed to require.main.require) to in-memory fakes.
const stubs = {
  './src/database': {
    client: { collection: makeCollection },
    getSortedSetRevRange: async () => [],
  },
  './src/posts': { getPostsFields: async () => [] },
  './src/topics': { getTopicsFields: async () => [], getTopicField: async () => '' },
  './src/meta': { settings: { get: async () => ({}) } },
  './src/user': { isAdministrator: async () => false },
  'winston': winstonStub,
};

// Patch require.main.require so stubbed IDs resolve to the fakes above and
// everything else falls through to the real resolver.
// Use Object.hasOwn rather than a truthy `stubs[id]` check: inherited
// Object.prototype keys ('constructor', 'toString', …) are truthy and would
// otherwise be returned as bogus stubs instead of being resolved normally.
const _origRequire = require.main.require.bind(require.main);
require.main.require = (id) => (Object.hasOwn(stubs, id) ? stubs[id] : _origRequire(id));
55
+
56
// ─── Run tests ───────────────────────────────────────────────────────────────

// Run every suite sequentially; testCosine is synchronous, the rest are async.
// NOTE(review): the ordering looks intentional (store populated before
// sync/search run) — confirm before reordering.
(async () => {
  try {
    await require('./testEmbedding')();
    await require('./testVectorStore')();
    require('./testCosine')();
    await require('./testSync')();
    await require('./testSearch')();

    console.log("\n🎉 ALL TESTS PASSED");
  } catch (err) {
    // Log the whole error object, not just err.message, so the stack trace
    // (and any non-Error throw) is visible when diagnosing a failure.
    console.error("\n❌ TEST FAILED:", err);
    process.exit(1);
  }
})();
@@ -0,0 +1,15 @@
1
const { cosineSimilarity } = require('../lib/cosineSimilarity');

/**
 * Unit test for cosineSimilarity. Verifies the exact expected scores for
 * parallel, orthogonal and opposite vectors (within a tolerance), plus the
 * documented contract that a zero-magnitude vector yields exactly 0.
 * Throws on any failure.
 */
function testCosine() {
  console.log("Testing cosine similarity...");

  // Tolerance for floating-point comparisons.
  const EPS = 1e-9;

  const a = cosineSimilarity([1, 0], [1, 0]);
  const b = cosineSimilarity([1, 0], [0, 1]);
  const c = cosineSimilarity([1, 0], [-1, 0]);

  // Identical vectors must score ~1 (the original loose `a < 0.9` check
  // would accept any near-parallel result; pin it to 1 within EPS).
  if (Math.abs(a - 1) > EPS) throw new Error("Expected high similarity");
  // Orthogonal vectors must score ~0.
  if (Math.abs(b) > EPS) throw new Error("Expected low similarity");
  // Opposite vectors must score ~-1.
  if (Math.abs(c + 1) > EPS) throw new Error("Expected opposite similarity of -1");

  // Documented contract: zero-magnitude input returns exactly 0, not NaN.
  if (cosineSimilarity([0, 0], [1, 2]) !== 0) {
    throw new Error("Expected 0 for zero-magnitude vector");
  }

  console.log("✅ Cosine OK");
}

module.exports = testCosine;
@@ -0,0 +1,14 @@
1
const { embed } = require('../services/embeddingService');

/**
 * Smoke test for the embedding service: embedding a sample query must
 * produce a non-empty array. Throws on failure.
 */
async function testEmbedding() {
  console.log("Testing embedding...");

  const result = await embed("How to install NodeBB plugin?");

  if (Array.isArray(result) === false) {
    throw new Error("Not an array");
  }
  if (result.length === 0) {
    throw new Error("Empty embedding");
  }

  console.log("✅ Embedding OK. Length:", result.length);
}

module.exports = testEmbedding;
@@ -0,0 +1,16 @@
1
const { search } = require('../services/vectorSearchService');

/**
 * Smoke test for semantic search: the result must be an array. An empty
 * result is tolerated (logged as a warning) because the database may
 * legitimately contain no embeddings yet. Throws only on a non-array result.
 */
async function testSearch() {
  console.log("Testing semantic search...");

  const results = await search("How to install plugin?");

  if (!Array.isArray(results)) {
    throw new Error("Invalid result");
  }
  if (results.length === 0) {
    console.warn("⚠️ No results (may be expected if DB empty)");
  }

  console.log("✅ Search OK", results.length);
}

module.exports = testSearch;
@@ -0,0 +1,11 @@
1
const { startSync } = require('../services/syncService');

/**
 * Smoke test for the sync service: startSync must complete without throwing.
 * Actual indexing results are only observable in the log output.
 */
async function testSync() {
  console.log("Testing sync...");
  await startSync();
  console.log("✅ Sync executed (check logs for actual indexing)");
}

module.exports = testSync;
@@ -0,0 +1,20 @@
1
const store = require('../services/vectorStore');

/**
 * Smoke test for the vector store: persist one sample embedding, then
 * confirm getAllEmbeddings returns at least one document. Throws on failure.
 */
async function testVectorStore() {
  console.log("Testing vector store...");

  const sample = {
    post_id: 999,
    topic_id: 1,
    content: "test content",
    embedding: [0.1, 0.2, 0.3],
  };
  await store.saveEmbedding(sample);

  const results = await store.getAllEmbeddings(5);
  if (results.length === 0) {
    throw new Error("No data returned");
  }

  console.log("✅ Vector store OK");
}

module.exports = testVectorStore;