nodebb-plugin-search-agent 0.0.932 → 0.0.934

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/lib/similarity.js +125 -0
  2. package/package.json +1 -1
@@ -0,0 +1,125 @@
1
+ 'use strict';
2
+
3
// Stop-word list for the two languages covered here (English and Hebrew).
// tokenize() discards every token found in this collection, so these words
// never become TF-IDF terms. Each word is listed once: the collection is a
// Set, so repeating an entry would not change membership.
const STOP_WORDS = new Set([
  // ---- English ----
  'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
  'of', 'with', 'by', 'from', 'is', 'it', 'its', 'be', 'as', 'was',
  'are', 'were', 'been', 'have', 'has', 'had', 'do', 'does', 'did',
  'will', 'would', 'could', 'should', 'may', 'might', 'can', 'shall',
  'not', 'no', 'so', 'if', 'this', 'that', 'these', 'those', 'i', 'we',
  'you', 'he', 'she', 'they', 'my', 'your', 'his', 'her', 'our', 'their',
  'how', 'what', 'when', 'where', 'why', 'who', 'which', 'am', 'up',
  'out', 'about', 'into', 'than', 'more', 'also', 'me', 'him', 'us', 'them',

  // ---- Hebrew ----
  'של', 'את', 'אל', 'על', 'עם', 'הם', 'הן', 'זה', 'זו', 'זאת',
  'כי', 'לא', 'כן', 'יש', 'אם', 'רק', 'גם', 'אבל', 'אנחנו', 'אני',
  'אתה', 'הוא', 'היא', 'אנו', 'אתם', 'אתן',
  'אלה', 'אלו', 'כל', 'כך', 'כבר', 'עוד',
  'אחד', 'יותר', 'פה', 'שם', 'מה', 'מי', 'איך', 'מתי', 'איפה',
  'היה', 'הייתה', 'יהיה', 'תהיה',
  'אסור', 'מותר', 'צריך', 'רוצה', 'יכול', 'יכולה', 'לו', 'לה',
  'בו', 'בה', 'עליו', 'עליה', 'בין', 'עכשיו', 'היום',
  'לכן', 'כדי', 'כדאי', 'שלי', 'שלך', 'שלו', 'שלה',
  'שלנו', 'שלכם', 'שלהם', 'שלהן', 'להם', 'להן', 'לנו', 'לכם',
]);
26
+
27
/**
 * Turn raw (possibly HTML) text into normalized search tokens.
 *
 * Pipeline: strip HTML tags and character references, fold compatibility
 * forms and diacritics, lowercase, replace everything that is not a letter,
 * digit or whitespace with a space, split on whitespace, then drop tokens
 * shorter than two UTF-16 code units and tokens listed in STOP_WORDS.
 *
 * Diacritic folding is deliberately lossy: combining marks (Hebrew niqqud,
 * accents, ...) are removed so that pointed/accented and plain spellings of a
 * word yield the same token.
 *
 * @param {string|null|undefined} text Raw text; any falsy value yields [].
 *   Other non-string values are converted with String().
 * @returns {string[]} Tokens in order of appearance (duplicates kept).
 */
function tokenize(text) {
  if (!text) return [];
  return String(text)
    .replace(/<[^>]*>/g, ' ') // strip HTML tags
    // Blank out character references (&nbsp; &amp; &#39; &#x27;) so they do
    // not turn into junk tokens such as "nbsp" or "x27".
    .replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, ' ')
    // Decompose so that diacritics become separate combining marks.
    .normalize('NFKD')
    .toLowerCase()
    // Remove the marks outright. Replacing them with a space (as the
    // punctuation step below would) splits a word at every mark.
    .replace(/\p{M}/gu, '')
    // Recompose what is left (e.g. Hangul syllables split by NFKD).
    .normalize('NFC')
    .replace(/[^\p{L}\p{N}\s]/gu, ' ') // keep all Unicode letters & digits
    .split(/\s+/)
    // Minimum of 2 characters so that short Hebrew words are kept.
    .filter((token) => token.length >= 2 && !STOP_WORDS.has(token));
}
41
+
42
/**
 * Create TF-IDF vectors for a list of topics.
 *
 * The text of a topic is its title followed by its (optional) main post
 * content, tokenized with tokenize(). A term's weight is
 *   (occurrences / tokens in the topic) * (ln((N + 1) / (df + 1)) + 1)
 * where N is the number of topics and df the number of topics containing the
 * term.
 *
 * @param {{ tid: number|string, slug: string, title: string, mainPostContent?: string }[]} topics
 * @returns {{ tid: number|string, slug: string, vector: Map<string, number> }[]}
 *   One entry per topic, in input order; [] for a missing or empty list.
 */
function buildIndex(topics) {
  if (!topics || topics.length === 0) {
    return [];
  }

  // Pass 1: count terms per topic and, in the same sweep, how many topics
  // contain each term.
  const docFreq = new Map();
  const entries = [];
  for (const topic of topics) {
    const words = tokenize(`${topic.title || ''} ${topic.mainPostContent || ''}`);
    const counts = new Map();
    for (const word of words) {
      const seen = counts.get(word);
      if (seen === undefined) {
        // First occurrence in this topic: it also counts towards df.
        counts.set(word, 1);
        docFreq.set(word, (docFreq.get(word) || 0) + 1);
      } else {
        counts.set(word, seen + 1);
      }
    }
    entries.push({ tid: topic.tid, slug: topic.slug, counts, total: words.length });
  }

  const docCount = entries.length;

  // Pass 2: weight every term. `total` is at least 1 whenever `counts` has an
  // entry, so the division below is always defined.
  return entries.map(({ tid, slug, counts, total }) => {
    const vector = new Map();
    counts.forEach((count, term) => {
      const termFreq = count / total;
      // The +1 smoothing keeps the ratio finite and the weight positive.
      const inverseDocFreq = Math.log((docCount + 1) / (docFreq.get(term) + 1)) + 1;
      vector.set(term, termFreq * inverseDocFreq);
    });
    return { tid, slug, vector };
  });
}
83
+
84
/**
 * Rank indexed documents against a query using cosine similarity.
 *
 * The query is tokenized with tokenize() and represented by its raw term
 * counts; each document is represented by the weight vector stored in the
 * index. Only documents with a strictly positive score are returned.
 *
 * @param {string} queryText Free-text query.
 * @param {{ tid: number|string, slug: string, vector: Map<string, number> }[]} index
 *   Index entries, e.g. as returned by buildIndex().
 * @param {number} [topN=10] Maximum number of results. Fractions are
 *   truncated; negative or non-numeric values are treated as 0.
 * @returns {{ tid: number|string, slug: string, score: number }[]}
 *   Matches sorted by descending score; [] when the index is empty or the
 *   query has no usable tokens.
 */
function query(queryText, index, topN = 10) {
  if (!index || index.length === 0) return [];

  const qTokens = tokenize(queryText);
  if (qTokens.length === 0) return [];

  // Build raw term-count vector for the query
  const qVec = new Map();
  for (const token of qTokens) {
    qVec.set(token, (qVec.get(token) || 0) + 1);
  }

  // The query norm does not depend on the document: compute it once.
  let qNormSq = 0;
  for (const count of qVec.values()) qNormSq += count * count;
  const qNorm = Math.sqrt(qNormSq);

  // A negative limit passed to slice() would cut results from the END of the
  // ranking; clamp it so "no more than topN" always holds.
  const limit = Math.max(0, Math.trunc(Number(topN)) || 0);

  const results = [];
  for (const doc of index) {
    let dot = 0;
    for (const [term, qVal] of qVec) {
      dot += qVal * (doc.vector.get(term) || 0);
    }
    // No shared weight means the score cannot be positive, so skip the
    // (comparatively expensive) pass over the whole document vector.
    if (!(dot > 0)) continue;

    let docNormSq = 0;
    for (const weight of doc.vector.values()) docNormSq += weight * weight;

    const norm = Math.sqrt(docNormSq) * qNorm;
    const score = norm > 0 ? dot / norm : 0;
    if (score > 0) {
      results.push({ tid: doc.tid, slug: doc.slug, score });
    }
  }

  return results.sort((a, b) => b.score - a.score).slice(0, limit);
}
124
+
125
+ module.exports = { tokenize, buildIndex, query };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nodebb-plugin-search-agent",
3
- "version": "0.0.932",
3
+ "version": "0.0.934",
4
4
  "description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
5
5
  "main": "library.js",
6
6
  "author": "Racheli Bayfus",