nodebb-plugin-search-agent 0.0.932 → 0.0.934
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/similarity.js +125 -0
- package/package.json +1 -1
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Common stop-words (English + Hebrew) to exclude from TF-IDF vectors.
// Fix: the original list repeated several Hebrew entries ('את', 'הם', 'הן',
// 'זה', 'זו', 'רק', 'כן', 'כבר', 'הוא', 'היא'); a Set dedupes silently, so
// the duplicates were dead weight — removed. Membership is unchanged.
const STOP_WORDS = new Set([
  // English
  'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
  'of', 'with', 'by', 'from', 'is', 'it', 'its', 'be', 'as', 'was',
  'are', 'were', 'been', 'have', 'has', 'had', 'do', 'does', 'did',
  'will', 'would', 'could', 'should', 'may', 'might', 'can', 'shall',
  'not', 'no', 'so', 'if', 'this', 'that', 'these', 'those', 'i', 'we',
  'you', 'he', 'she', 'they', 'my', 'your', 'his', 'her', 'our', 'their',
  'how', 'what', 'when', 'where', 'why', 'who', 'which', 'am', 'up',
  'out', 'about', 'into', 'than', 'more', 'also', 'me', 'him', 'us', 'them',
  // Hebrew
  'של', 'את', 'אל', 'על', 'עם', 'הם', 'הן', 'זה', 'זו', 'זאת',
  'כי', 'לא', 'כן', 'יש', 'אם', 'רק', 'גם', 'אבל', 'אנחנו', 'אני',
  'אתה', 'הוא', 'היא', 'אנו', 'אתם', 'אתן',
  'אלה', 'אלו', 'כל', 'כך', 'כבר', 'עוד',
  'אחד', 'יותר', 'פה', 'שם', 'מה', 'מי', 'איך', 'מתי', 'איפה',
  'היה', 'הייתה', 'יהיה', 'תהיה',
  'אסור', 'מותר', 'צריך', 'רוצה', 'יכול', 'יכולה', 'לו', 'לה',
  'בו', 'בה', 'עליו', 'עליה', 'בין', 'עכשיו', 'היום',
  'לכן', 'כדי', 'כדאי', 'שלי', 'שלך', 'שלו', 'שלה',
  'שלנו', 'שלכם', 'שלהם', 'שלהן', 'להם', 'להן', 'לנו', 'לכם',
]);
|
|
26
|
+
|
|
27
|
+
/**
 * Normalise free text into a list of searchable tokens: strip HTML tags,
 * lowercase, replace punctuation with spaces, then drop stop-words and
 * tokens shorter than two characters.
 * @param {string|null|undefined} text - Raw text (may contain HTML).
 * @returns {string[]} Filtered lowercase tokens; empty array for falsy input.
 */
function tokenize(text) {
  if (!text) return [];

  const cleaned = text
    .replace(/<[^>]*>/g, ' ') // strip HTML tags
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s]/gu, ' '); // keep all Unicode letters & digits (Hebrew, Latin, etc.)

  const tokens = [];
  for (const candidate of cleaned.split(/\s+/)) {
    // Min 2 chars keeps short Hebrew words while discarding noise.
    if (candidate.length >= 2 && !STOP_WORDS.has(candidate)) {
      tokens.push(candidate);
    }
  }
  return tokens;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Build a TF-IDF index from an array of topic objects.
 * Each topic must have: tid, slug, title, mainPostContent (optional).
 * @param {{ tid: number|string, slug: string, title: string, mainPostContent?: string }[]} topics
 * @returns {{ tid: number|string, slug: string, vector: Map<string, number> }[]}
 */
function buildIndex(topics) {
  if (!topics || topics.length === 0) return [];

  // Pass 1: raw term counts per topic (title + main post body combined).
  const docs = [];
  for (const topic of topics) {
    const tokens = tokenize(`${topic.title || ''} ${topic.mainPostContent || ''}`);
    const counts = new Map();
    tokens.forEach((token) => counts.set(token, (counts.get(token) || 0) + 1));
    docs.push({ tid: topic.tid, slug: topic.slug, tf: counts, len: tokens.length });
  }

  // Pass 2: document frequency — how many docs contain each term.
  const docFreq = new Map();
  docs.forEach((doc) => {
    for (const term of doc.tf.keys()) {
      docFreq.set(term, (docFreq.get(term) || 0) + 1);
    }
  });

  const total = docs.length;

  // Pass 3: weight each document's terms by TF × smoothed IDF.
  return docs.map(({ tid, slug, tf, len }) => {
    const vector = new Map();
    for (const [term, count] of tf) {
      const termFreq = len > 0 ? count / len : 0;
      // Smoothed IDF (add-one in numerator and denominator) avoids
      // division by zero and never goes negative.
      const inverse = Math.log((total + 1) / (docFreq.get(term) + 1)) + 1;
      vector.set(term, termFreq * inverse);
    }
    return { tid, slug, vector };
  });
}
|
|
83
|
+
|
|
84
|
+
/**
 * Rank indexed documents against a query using cosine similarity.
 * Improvement over the original: the query norm is loop-invariant but was
 * recomputed inside the per-document loop — it is now computed once. Docs
 * sharing no terms with the query (dot === 0) skip the norm pass entirely,
 * since they could never produce a positive score. Results are identical.
 * @param {string} queryText - Free-text search query.
 * @param {{ tid: number|string, slug: string, vector: Map<string, number> }[]} index
 *        Output of buildIndex().
 * @param {number} [topN=10] - Maximum number of results to return.
 * @returns {{ tid: number|string, slug: string, score: number }[]} Sorted
 *          descending by score; only strictly positive scores are included.
 */
function query(queryText, index, topN = 10) {
  if (!index || index.length === 0) return [];

  const qTokens = tokenize(queryText);
  if (qTokens.length === 0) return [];

  // Build raw term-count vector for the query.
  const qVec = new Map();
  for (const token of qTokens) {
    qVec.set(token, (qVec.get(token) || 0) + 1);
  }

  // Query norm does not depend on the document — compute it once.
  let qNormSq = 0;
  for (const val of qVec.values()) qNormSq += val * val;
  const qNorm = Math.sqrt(qNormSq);

  const results = [];
  for (const doc of index) {
    let dot = 0;
    for (const [term, qVal] of qVec) {
      dot += qVal * (doc.vector.get(term) || 0);
    }
    // All vector components are non-negative, so dot === 0 means the
    // cosine score is 0 and the doc would never be pushed — skip early.
    if (dot === 0) continue;

    let docNormSq = 0;
    for (const val of doc.vector.values()) docNormSq += val * val;

    const norm = Math.sqrt(docNormSq) * qNorm;
    const score = norm > 0 ? dot / norm : 0;
    if (score > 0) {
      results.push({ tid: doc.tid, slug: doc.slug, score });
    }
  }

  return results.sort((a, b) => b.score - a.score).slice(0, topN);
}
|
|
124
|
+
|
|
125
|
+
// Public API: tokenizer plus the two-phase search (build index, then query).
module.exports = { tokenize, buildIndex, query };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nodebb-plugin-search-agent",
|
|
3
|
-
"version": "0.0.932",
|
|
3
|
+
"version": "0.0.934",
|
|
4
4
|
"description": "NodeBB plugin that adds a floating chat assistant to help users find relevant forum topics using TF-IDF text similarity",
|
|
5
5
|
"main": "library.js",
|
|
6
6
|
"author": "Racheli Bayfus",
|