bekindprofanityfilter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTORS.md +106 -0
- package/LICENSE +22 -0
- package/README.md +1015 -0
- package/allprofanity.config.example.json +35 -0
- package/bin/init.js +49 -0
- package/config.schema.json +163 -0
- package/dist/algos/aho-corasick.d.ts +75 -0
- package/dist/algos/aho-corasick.js +238 -0
- package/dist/algos/aho-corasick.js.map +1 -0
- package/dist/algos/bloom-filter.d.ts +103 -0
- package/dist/algos/bloom-filter.js +208 -0
- package/dist/algos/bloom-filter.js.map +1 -0
- package/dist/algos/context-patterns.d.ts +102 -0
- package/dist/algos/context-patterns.js +484 -0
- package/dist/algos/context-patterns.js.map +1 -0
- package/dist/index.d.ts +1332 -0
- package/dist/index.js +2631 -0
- package/dist/index.js.map +1 -0
- package/dist/innocence-scoring.d.ts +23 -0
- package/dist/innocence-scoring.js +118 -0
- package/dist/innocence-scoring.js.map +1 -0
- package/dist/language-detector.d.ts +162 -0
- package/dist/language-detector.js +952 -0
- package/dist/language-detector.js.map +1 -0
- package/dist/language-dicts.d.ts +60 -0
- package/dist/language-dicts.js +2718 -0
- package/dist/language-dicts.js.map +1 -0
- package/dist/languages/arabic-words.d.ts +10 -0
- package/dist/languages/arabic-words.js +1649 -0
- package/dist/languages/arabic-words.js.map +1 -0
- package/dist/languages/bengali-words.d.ts +10 -0
- package/dist/languages/bengali-words.js +1696 -0
- package/dist/languages/bengali-words.js.map +1 -0
- package/dist/languages/brazilian-words.d.ts +10 -0
- package/dist/languages/brazilian-words.js +2122 -0
- package/dist/languages/brazilian-words.js.map +1 -0
- package/dist/languages/chinese-words.d.ts +10 -0
- package/dist/languages/chinese-words.js +2728 -0
- package/dist/languages/chinese-words.js.map +1 -0
- package/dist/languages/english-primary-all-languages.d.ts +23 -0
- package/dist/languages/english-primary-all-languages.js +36894 -0
- package/dist/languages/english-primary-all-languages.js.map +1 -0
- package/dist/languages/english-words.d.ts +5 -0
- package/dist/languages/english-words.js +5156 -0
- package/dist/languages/english-words.js.map +1 -0
- package/dist/languages/french-words.d.ts +10 -0
- package/dist/languages/french-words.js +2326 -0
- package/dist/languages/french-words.js.map +1 -0
- package/dist/languages/german-words.d.ts +10 -0
- package/dist/languages/german-words.js +2633 -0
- package/dist/languages/german-words.js.map +1 -0
- package/dist/languages/hindi-words.d.ts +10 -0
- package/dist/languages/hindi-words.js +2341 -0
- package/dist/languages/hindi-words.js.map +1 -0
- package/dist/languages/innocent-words.d.ts +41 -0
- package/dist/languages/innocent-words.js +109 -0
- package/dist/languages/innocent-words.js.map +1 -0
- package/dist/languages/italian-words.d.ts +10 -0
- package/dist/languages/italian-words.js +2287 -0
- package/dist/languages/italian-words.js.map +1 -0
- package/dist/languages/japanese-words.d.ts +11 -0
- package/dist/languages/japanese-words.js +2557 -0
- package/dist/languages/japanese-words.js.map +1 -0
- package/dist/languages/korean-words.d.ts +10 -0
- package/dist/languages/korean-words.js +2509 -0
- package/dist/languages/korean-words.js.map +1 -0
- package/dist/languages/russian-words.d.ts +10 -0
- package/dist/languages/russian-words.js +2175 -0
- package/dist/languages/russian-words.js.map +1 -0
- package/dist/languages/spanish-words.d.ts +11 -0
- package/dist/languages/spanish-words.js +2536 -0
- package/dist/languages/spanish-words.js.map +1 -0
- package/dist/languages/tamil-words.d.ts +10 -0
- package/dist/languages/tamil-words.js +1722 -0
- package/dist/languages/tamil-words.js.map +1 -0
- package/dist/languages/telugu-words.d.ts +10 -0
- package/dist/languages/telugu-words.js +1739 -0
- package/dist/languages/telugu-words.js.map +1 -0
- package/dist/romanization-detector.d.ts +50 -0
- package/dist/romanization-detector.js +779 -0
- package/dist/romanization-detector.js.map +1 -0
- package/package.json +79 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,2631 @@
|
|
|
1
|
+
// Consolidated all-languages dictionary import
|
|
2
|
+
import allLanguagesBadWords from "./languages/english-primary-all-languages.js";
|
|
3
|
+
// Advanced algorithm imports
|
|
4
|
+
import { AhoCorasick } from "./algos/aho-corasick.js";
|
|
5
|
+
import { BloomFilter } from "./algos/bloom-filter.js";
|
|
6
|
+
import { ContextAnalyzer } from "./algos/context-patterns.js";
|
|
7
|
+
// Cross-language innocence scoring
|
|
8
|
+
import { detectLanguages, scoreWord } from "./language-detector.js";
|
|
9
|
+
import innocentWords from "./languages/innocent-words.js";
|
|
10
|
+
import { adjustCertaintyForLanguage } from "./innocence-scoring.js";
|
|
11
|
+
// Export consolidated dictionary for direct access
|
|
12
|
+
export { default as allLanguagesBadWords } from "./languages/english-primary-all-languages.js";
|
|
13
|
+
/**
|
|
14
|
+
* Default console logger implementation for BeKind.
|
|
15
|
+
*
|
|
16
|
+
* @class ConsoleLogger
|
|
17
|
+
* @implements {Logger}
|
|
18
|
+
* @description Logs messages to the browser or Node.js console with an "[BeKind]" prefix.
|
|
19
|
+
* This is the default logger used when no custom logger is provided.
|
|
20
|
+
*
|
|
21
|
+
* @internal
|
|
22
|
+
*/
|
|
23
|
+
class ConsoleLogger {
|
|
24
|
+
/**
|
|
25
|
+
* Log informational messages to console.log with [BeKind] prefix.
|
|
26
|
+
*
|
|
27
|
+
* @param message - The message to log
|
|
28
|
+
* @returns void
|
|
29
|
+
*/
|
|
30
|
+
info(message) {
|
|
31
|
+
console.log(`[BeKind] ${message}`);
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Log warning messages to console.warn with [BeKind] prefix.
|
|
35
|
+
*
|
|
36
|
+
* @param message - The warning message to log
|
|
37
|
+
* @returns void
|
|
38
|
+
*/
|
|
39
|
+
warn(message) {
|
|
40
|
+
console.warn(`[BeKind] ${message}`);
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Log error messages to console.error with [BeKind] prefix.
|
|
44
|
+
*
|
|
45
|
+
* @param message - The error message to log
|
|
46
|
+
* @returns void
|
|
47
|
+
*/
|
|
48
|
+
error(message) {
|
|
49
|
+
console.error(`[BeKind] ${message}`);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Silent logger implementation that suppresses all log output.
|
|
54
|
+
*
|
|
55
|
+
* @class SilentLogger
|
|
56
|
+
* @implements {Logger}
|
|
57
|
+
* @description A no-op logger that discards all log messages. Used when `silent: true` is set
|
|
58
|
+
* in BeKindOptions, or when you want to completely disable logging.
|
|
59
|
+
*
|
|
60
|
+
* @internal
|
|
61
|
+
*/
|
|
62
|
+
class SilentLogger {
|
|
63
|
+
/**
|
|
64
|
+
* No-op implementation - messages are discarded.
|
|
65
|
+
*
|
|
66
|
+
* @param _message - The message (unused)
|
|
67
|
+
* @returns void
|
|
68
|
+
*/
|
|
69
|
+
info(_message) {
|
|
70
|
+
// Silent mode - no logging
|
|
71
|
+
}
|
|
72
|
+
/**
|
|
73
|
+
* No-op implementation - warnings are discarded.
|
|
74
|
+
*
|
|
75
|
+
* @param _message - The warning message (unused)
|
|
76
|
+
* @returns void
|
|
77
|
+
*/
|
|
78
|
+
warn(_message) {
|
|
79
|
+
// Silent mode - no logging
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* No-op implementation - errors are discarded.
|
|
83
|
+
*
|
|
84
|
+
* @param _message - The error message (unused)
|
|
85
|
+
* @returns void
|
|
86
|
+
*/
|
|
87
|
+
error(_message) {
|
|
88
|
+
// Silent mode - no logging
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* Severity levels for profanity detection results.
|
|
93
|
+
*
|
|
94
|
+
* @enum {number}
|
|
95
|
+
* @description Categorizes the severity of detected profanity based on the number
|
|
96
|
+
* of unique words and total matches found in the text.
|
|
97
|
+
*
|
|
98
|
+
* @readonly
|
|
99
|
+
* @example
|
|
100
|
+
* ```typescript
|
|
101
|
+
* const result = filter.detect("some text");
|
|
102
|
+
* if (result.severity === ProfanitySeverity.EXTREME) {
|
|
103
|
+
* // Handle extreme profanity
|
|
104
|
+
* }
|
|
105
|
+
* ```
|
|
106
|
+
*/
|
|
107
|
+
export var ProfanitySeverity;
|
|
108
|
+
(function (ProfanitySeverity) {
|
|
109
|
+
/** Mild profanity: 1 unique word or 1 total match */
|
|
110
|
+
ProfanitySeverity[ProfanitySeverity["MILD"] = 1] = "MILD";
|
|
111
|
+
/** Moderate profanity: 2 unique words or 2 total matches */
|
|
112
|
+
ProfanitySeverity[ProfanitySeverity["MODERATE"] = 2] = "MODERATE";
|
|
113
|
+
/** Severe profanity: 3 unique words or 3 total matches */
|
|
114
|
+
ProfanitySeverity[ProfanitySeverity["SEVERE"] = 3] = "SEVERE";
|
|
115
|
+
/** Extreme profanity: 4+ unique words or 5+ total matches */
|
|
116
|
+
ProfanitySeverity[ProfanitySeverity["EXTREME"] = 4] = "EXTREME";
|
|
117
|
+
})(ProfanitySeverity = ProfanitySeverity || (ProfanitySeverity = {}));
|
|
118
|
+
/**
|
|
119
|
+
* Per-word severity classification for individual detected words.
|
|
120
|
+
*
|
|
121
|
+
* @enum {number}
|
|
122
|
+
*/
|
|
123
|
+
export var WordSeverity;
|
|
124
|
+
(function (WordSeverity) {
|
|
125
|
+
/** Ambivalent: mild/contextual profanity that may be acceptable (damn, hell, crap, suck) */
|
|
126
|
+
WordSeverity[WordSeverity["AMBIVALENT"] = 1] = "AMBIVALENT";
|
|
127
|
+
/** Profane: should be flagged — strong profanity, slurs, explicit content */
|
|
128
|
+
WordSeverity[WordSeverity["PROFANE"] = 2] = "PROFANE";
|
|
129
|
+
})(WordSeverity = WordSeverity || (WordSeverity = {}));
|
|
130
|
+
/**
|
|
131
|
+
* Validates that an input is a non-empty string.
|
|
132
|
+
*
|
|
133
|
+
* @function validateString
|
|
134
|
+
* @param {unknown} input - The value to validate
|
|
135
|
+
* @param {string} paramName - Name of the parameter being validated (used in error messages)
|
|
136
|
+
* @returns {string} The validated string
|
|
137
|
+
* @throws {TypeError} If input is not a string
|
|
138
|
+
*
|
|
139
|
+
* @internal
|
|
140
|
+
*
|
|
141
|
+
* @example
|
|
142
|
+
* ```typescript
|
|
143
|
+
* const text = validateString(userInput, 'text');
|
|
144
|
+
* // Returns userInput if it's a string, throws TypeError otherwise
|
|
145
|
+
* ```
|
|
146
|
+
*/
|
|
147
|
+
function validateString(input, paramName) {
|
|
148
|
+
if (typeof input !== "string") {
|
|
149
|
+
throw new TypeError(`${paramName} must be a string, got ${typeof input}`);
|
|
150
|
+
}
|
|
151
|
+
return input;
|
|
152
|
+
}
|
|
153
|
+
/**
 * Validates and filters a string array, removing non-string and empty items.
 *
 * @function validateStringArray
 * @param {unknown} input - The value to validate (expected to be an array)
 * @param {string} paramName - Name of the parameter being validated (used in error/warning messages)
 * @param {{ warn(message: string): void }} [logger=console] - Logger used for skip warnings.
 *   Defaults to the global console for backward compatibility; pass the instance's
 *   Logger (e.g. a SilentLogger) so warnings respect the `silent` option.
 * @returns {string[]} Array of valid, non-empty strings
 * @throws {TypeError} If input is not an array
 *
 * @internal
 *
 * @example
 * ```typescript
 * const words = validateStringArray(['word1', '', 123, 'word2'], 'words');
 * // Returns: ['word1', 'word2']
 * // Logs warning: "Skipping non-string item in words: 123"
 * ```
 */
function validateStringArray(input, paramName, logger = console) {
    if (!Array.isArray(input)) {
        throw new TypeError(`${paramName} must be an array`);
    }
    return input.filter((item) => {
        if (typeof item !== "string") {
            // Warn (rather than throw) so one bad item doesn't reject the whole list.
            logger.warn(`Skipping non-string item in ${paramName}: ${item}`);
            return false;
        }
        // Drop empty and whitespace-only strings.
        return item.trim().length > 0;
    });
}
|
|
183
|
+
/**
 * Trie (prefix tree) node for efficient pattern matching and word storage.
 *
 * @class TrieNode
 * @description Implements a trie data structure for O(m) time complexity word
 * matching, where m is the length of the word being searched. Each node
 * represents a character, and paths from the root to nodes with
 * isEndOfWord === true represent complete stored words.
 *
 * @internal
 *
 * @example
 * ```typescript
 * const trie = new TrieNode();
 * trie.addWord('bad');
 * trie.addWord('badword');
 * const matches = trie.findMatches('badwords here', 0, false);
 * // Returns matches for 'bad' and 'badword'
 * ```
 */
class TrieNode {
    constructor() {
        /** Map of characters to child nodes for fast lookups */
        this.children = new Map();
        /** Flag indicating if this node represents the end of a complete word */
        this.isEndOfWord = false;
        /** The complete word ending at this node (only set when isEndOfWord is true) */
        this.word = "";
    }
    /**
     * Get the child node for a given character.
     *
     * @param {string} char - Character to look up
     * @returns {TrieNode | undefined} The child node, or undefined if absent
     */
    getChild(char) {
        return this.children.get(char);
    }
    /**
     * Adds a word to the trie structure.
     *
     * @param {string} word - The word to add to the trie
     * @returns {void}
     *
     * @remarks
     * - Time Complexity: O(m) where m is the length of the word
     * - Space Complexity: O(m) in worst case when all characters are new
     * - Iterates with for...of, so supplementary-plane Unicode characters are
     *   handled as whole code points
     */
    addWord(word) {
        let current = this;
        for (const char of word) {
            let next = current.children.get(char);
            if (!next) {
                next = new TrieNode();
                current.children.set(char, next);
            }
            current = next;
        }
        current.isEndOfWord = true;
        current.word = word;
    }
    /**
     * Removes a word from the trie structure.
     *
     * @param {string} word - The word to remove from the trie
     * @returns {boolean} True if the word existed and was removed, false if word was not found
     *
     * @remarks
     * - Time Complexity: O(m) where m is the length of the word
     * - Prunes nodes that are no longer needed, keeping the trie compact
     * - Shared prefixes with other stored words are preserved
     *
     * FIX: previously this returned the internal "delete this node" cascade
     * flag, so removing a word that is a prefix of another stored word
     * (e.g. removing "bad" while "badge" remains) incorrectly returned false
     * even though the word was removed. The removal status is now tracked
     * separately, matching the documented contract.
     *
     * @example
     * ```typescript
     * const trie = new TrieNode();
     * trie.addWord('hello');
     * trie.removeWord('hello'); // Returns: true
     * trie.removeWord('world'); // Returns: false (word not in trie)
     * ```
     */
    removeWord(word) {
        const status = { removed: false };
        this.removeHelper(word, 0, status);
        return status.removed;
    }
    /**
     * Recursive helper method for removing a word from the trie.
     *
     * @param {string} word - The word being removed
     * @param {number} index - Current character index in the word
     * @param {{ removed: boolean }} [status] - Out-parameter set to true when the word's
     *   end-of-word marker is actually cleared (i.e. the word existed)
     * @returns {boolean} True if this node should be deleted by its parent
     *   (it has no children and does not terminate another word)
     *
     * @internal
     */
    removeHelper(word, index, status = { removed: false }) {
        if (index === word.length) {
            if (!this.isEndOfWord)
                return false;
            this.isEndOfWord = false;
            // Keep the invariant that `word` is only populated on end-of-word
            // nodes (clear() maintains the same invariant).
            this.word = "";
            status.removed = true;
            return this.children.size === 0;
        }
        const char = word[index];
        const node = this.children.get(char);
        if (!node)
            return false;
        const shouldDeleteChild = node.removeHelper(word, index + 1, status);
        if (shouldDeleteChild) {
            this.children.delete(char);
            // This node is prunable only if nothing else hangs off it.
            return this.children.size === 0 && !this.isEndOfWord;
        }
        return false;
    }
    /**
     * Finds all stored words that appear in text starting at a specific position.
     *
     * @param {string} text - The text to search for stored words
     * @param {number} startPos - The starting position (0-based index) in the text
     * @param {boolean} _allowPartial - Retained for interface compatibility; the
     *   two branches of the previous implementation produced identical match
     *   objects (start was always 0 and end was always pos - startPos), so the
     *   parameter has no effect here. Word-boundary filtering is the caller's
     *   responsibility.
     * @returns {Array<{ word: string; start: number; end: number }>} Matches with
     *   positions relative to startPos (start is always 0; end is the match length)
     *
     * @remarks
     * - Time Complexity: O(k) where k is the length of the longest match from startPos
     * - Returns every stored word that is a prefix of text.slice(startPos)
     */
    findMatches(text, startPos, _allowPartial) {
        const matches = [];
        let current = this;
        let pos = startPos;
        while (pos < text.length) {
            const nextNode = current.children.get(text[pos]);
            if (!nextNode)
                break;
            current = nextNode;
            pos++;
            if (current.isEndOfWord) {
                matches.push({
                    word: current.word,
                    start: 0,
                    end: pos - startPos,
                });
            }
        }
        return matches;
    }
    /**
     * Clears all words from the trie, resetting it to empty state.
     *
     * @returns {void}
     *
     * @remarks
     * - Time Complexity: O(1) for the root reset; detached children are
     *   reclaimed by the garbage collector
     */
    clear() {
        this.children.clear();
        this.isEndOfWord = false;
        this.word = "";
    }
}
|
|
374
|
+
/**
|
|
375
|
+
* BeKind - Professional-grade multilingual profanity detection and filtering library.
|
|
376
|
+
*
|
|
377
|
+
* @class BeKind
|
|
378
|
+
* @description A comprehensive, high-performance profanity filtering system supporting 9+ languages
|
|
379
|
+
* with advanced features including leet speak detection, context analysis, multiple matching algorithms,
|
|
380
|
+
* and customizable filtering options.
|
|
381
|
+
*
|
|
382
|
+
* @remarks
|
|
383
|
+
* ### Features:
|
|
384
|
+
* - **Multi-language Support**: English, Hindi, French, German, Spanish, Bengali, Tamil, Telugu, Brazilian Portuguese
|
|
385
|
+
* - **Advanced Algorithms**: Trie, Aho-Corasick, Bloom Filter, and hybrid approaches
|
|
386
|
+
* - **Leet Speak Detection**: Automatically normalizes and detects variations like "h3ll0"
|
|
387
|
+
* - **Context Analysis**: Reduces false positives using surrounding word context
|
|
388
|
+
* - **Performance**: Built-in caching and optimized data structures
|
|
389
|
+
* - **Flexible**: Custom dictionaries, whitelisting, severity levels
|
|
390
|
+
*
|
|
391
|
+
* ### Default Behavior:
|
|
392
|
+
* - Loads English and Hindi dictionaries by default
|
|
393
|
+
* - Case-insensitive matching
|
|
394
|
+
* - Leet speak detection enabled
|
|
395
|
+
* - Uses Trie algorithm (fastest for most cases)
|
|
396
|
+
*
|
|
397
|
+
* @example
|
|
398
|
+
* ```typescript
|
|
399
|
+
* // Basic usage with default instance
|
|
400
|
+
* import allProfanity from 'allprofanity';
|
|
401
|
+
*
|
|
402
|
+
* const result = allProfanity.detect("This is some bad text");
|
|
403
|
+
* console.log(result.hasProfanity); // true
|
|
404
|
+
* console.log(result.cleanedText); // "This is some *** text"
|
|
405
|
+
* console.log(result.severity); // ProfanitySeverity.MILD
|
|
406
|
+
* ```
|
|
407
|
+
*
|
|
408
|
+
* @example
|
|
409
|
+
* ```typescript
|
|
410
|
+
* // Advanced usage with custom configuration
|
|
411
|
+
* import { BeKind, ProfanitySeverity } from 'allprofanity';
|
|
412
|
+
*
|
|
413
|
+
* const filter = new BeKind({
|
|
414
|
+
* languages: ['english', 'french', 'spanish'],
|
|
415
|
+
* enableLeetSpeak: true,
|
|
416
|
+
* strictMode: true,
|
|
417
|
+
* algorithm: {
|
|
418
|
+
* matching: 'hybrid',
|
|
419
|
+
* useBloomFilter: true
|
|
420
|
+
* },
|
|
421
|
+
* performance: {
|
|
422
|
+
* enableCaching: true,
|
|
423
|
+
* cacheSize: 500
|
|
424
|
+
* },
|
|
425
|
+
* whitelistWords: ['class', 'assignment']
|
|
426
|
+
* });
|
|
427
|
+
*
|
|
428
|
+
* const text = "This text has some b@d w0rds";
|
|
429
|
+
* const result = filter.detect(text);
|
|
430
|
+
*
|
|
431
|
+
* if (result.hasProfanity) {
|
|
432
|
+
* console.log(`Found ${result.detectedWords.length} profane words`);
|
|
433
|
+
* console.log(`Severity: ${ProfanitySeverity[result.severity]}`);
|
|
434
|
+
* console.log(`Cleaned: ${result.cleanedText}`);
|
|
435
|
+
* }
|
|
436
|
+
* ```
|
|
437
|
+
*
|
|
438
|
+
* @example
|
|
439
|
+
* ```typescript
|
|
440
|
+
* // Using individual methods
|
|
441
|
+
* const filter = new BeKind();
|
|
442
|
+
*
|
|
443
|
+
* // Simple check
|
|
444
|
+
* if (filter.check("some text")) {
|
|
445
|
+
* console.log("Contains profanity!");
|
|
446
|
+
* }
|
|
447
|
+
*
|
|
448
|
+
* // Clean with custom placeholder
|
|
449
|
+
* const cleaned = filter.clean("bad words here", "#");
|
|
450
|
+
*
|
|
451
|
+
* // Load additional languages
|
|
452
|
+
* filter.loadLanguage('german');
|
|
453
|
+
* filter.loadIndianLanguages(); // Loads hindi, bengali, tamil, telugu
|
|
454
|
+
*
|
|
455
|
+
* // Add custom words
|
|
456
|
+
* filter.add(['customword1', 'customword2']);
|
|
457
|
+
*
|
|
458
|
+
* // Remove words
|
|
459
|
+
* filter.remove(['someword']);
|
|
460
|
+
*
|
|
461
|
+
* // Whitelist words
|
|
462
|
+
* filter.addToWhitelist(['class', 'assignment']);
|
|
463
|
+
* ```
|
|
464
|
+
*
|
|
465
|
+
* @see {@link BeKindOptions} for all configuration options
|
|
466
|
+
* @see {@link ProfanityDetectionResult} for detection result format
|
|
467
|
+
* @see {@link ProfanitySeverity} for severity levels
|
|
468
|
+
*/
|
|
469
|
+
export class BeKind {
|
|
470
|
+
/**
|
|
471
|
+
* Creates a new BeKind instance with the specified configuration.
|
|
472
|
+
*
|
|
473
|
+
* @constructor
|
|
474
|
+
* @param {BeKindOptions} [options] - Configuration options for profanity detection behavior
|
|
475
|
+
*
|
|
476
|
+
* @remarks
|
|
477
|
+
* ### Default Initialization:
|
|
478
|
+
* - Loads English and Hindi dictionaries automatically
|
|
479
|
+
* - Enables leet speak detection
|
|
480
|
+
* - Case-insensitive matching
|
|
481
|
+
* - Uses Trie algorithm for pattern matching
|
|
482
|
+
*
|
|
483
|
+
* ### Performance Considerations:
|
|
484
|
+
* - Initial load time depends on number of languages loaded
|
|
485
|
+
* - Aho-Corasick automaton (if enabled) is built during construction
|
|
486
|
+
* - Bloom Filter (if enabled) is populated during construction
|
|
487
|
+
*
|
|
488
|
+
* @throws {TypeError} If invalid options are provided
|
|
489
|
+
*
|
|
490
|
+
* @example
|
|
491
|
+
* ```typescript
|
|
492
|
+
* // Default instance
|
|
493
|
+
* const filter = new BeKind();
|
|
494
|
+
*
|
|
495
|
+
* // Custom configuration
|
|
496
|
+
* const filter = new BeKind({
|
|
497
|
+
* languages: ['english', 'french'],
|
|
498
|
+
* strictMode: true,
|
|
499
|
+
* defaultPlaceholder: '#',
|
|
500
|
+
* algorithm: { matching: 'hybrid' }
|
|
501
|
+
* });
|
|
502
|
+
*
|
|
503
|
+
* // Silent mode (no logging)
|
|
504
|
+
* const filter = new BeKind({ silent: true });
|
|
505
|
+
* ```
|
|
506
|
+
*
|
|
507
|
+
* @see {@link BeKindOptions} for all available configuration options
|
|
508
|
+
*/
|
|
509
|
+
constructor(options) {
|
|
510
|
+
var _a, _b, _c, _d, _e, _f, _g;
|
|
511
|
+
this.profanityTrie = new TrieNode();
|
|
512
|
+
this.whitelistSet = new Set();
|
|
513
|
+
this.loadedLanguages = new Set();
|
|
514
|
+
this.defaultPlaceholder = "*";
|
|
515
|
+
this.enableLeetSpeak = true;
|
|
516
|
+
this.caseSensitive = false;
|
|
517
|
+
this.strictMode = false;
|
|
518
|
+
this.detectPartialWords = false;
|
|
519
|
+
this.embeddedProfanityDetection = false;
|
|
520
|
+
this.separatorTolerance = 5;
|
|
521
|
+
this.sensitiveMode = false;
|
|
522
|
+
/**
|
|
523
|
+
* Temporary storage for suspicious matches found during separator-tolerant detection.
|
|
524
|
+
* Populated by findSeparatorTolerantMatches() and consumed by detect().
|
|
525
|
+
*/
|
|
526
|
+
this._suspiciousMatches = null;
|
|
527
|
+
this.availableLanguages = {
|
|
528
|
+
all: Object.keys(allLanguagesBadWords || {}),
|
|
529
|
+
};
|
|
530
|
+
/**
|
|
531
|
+
* Word score lookup map. Maps lowercase words to their severity and certainty scores.
|
|
532
|
+
* Populated from the scored word list on construction.
|
|
533
|
+
*/
|
|
534
|
+
this.wordScores = (() => {
|
|
535
|
+
// Normalize dictionary keys to lowercase so getWordScore() lookups work
|
|
536
|
+
// regardless of how words are cased in the dictionary files.
|
|
537
|
+
const raw = allLanguagesBadWords || {};
|
|
538
|
+
const normalized = {};
|
|
539
|
+
for (const [key, value] of Object.entries(raw)) {
|
|
540
|
+
const lk = key.toLowerCase();
|
|
541
|
+
// If duplicate after lowercasing, keep the higher severity entry
|
|
542
|
+
if (!normalized[lk] || value.severity > normalized[lk].severity) {
|
|
543
|
+
normalized[lk] = value;
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
return normalized;
|
|
547
|
+
})();
|
|
548
|
+
/**
|
|
549
|
+
* Set of abhorrent words/phrases that trigger needsManualReview.
|
|
550
|
+
* Includes hate groups, slurs, extremist terminology, and Nazi references.
|
|
551
|
+
* Stored in lowercase for case-insensitive matching.
|
|
552
|
+
*/
|
|
553
|
+
this.abhorrentWords = new Set([
|
|
554
|
+
// Nazi / Third Reich
|
|
555
|
+
"nazi", "nazis", "neonazi", "neo nazi", "neo-nazi", "hitler",
|
|
556
|
+
"heil hitler", "heilhitler", "hitler did nothing wrong",
|
|
557
|
+
"sieg heil", "siegheil", "third reich", "thirdreich",
|
|
558
|
+
"final solution", "finalsolution", "master race", "masterrace",
|
|
559
|
+
"gas the jews", "gasthejews", "holocaust denier", "holocaustdenier",
|
|
560
|
+
"holocaust denial", "holocaustdenial", "holohoax",
|
|
561
|
+
"lebensraum", "herrenvolk", "volkisch", "völkisch",
|
|
562
|
+
"judenfrei", "judenrein", "untermensch", "untermenschen",
|
|
563
|
+
"rassenschande", "übermensch",
|
|
564
|
+
// KKK and white supremacist orgs
|
|
565
|
+
"klan", "klansman", "klansmen", "ku klux klan", "kukluxklan", "kkk",
|
|
566
|
+
"united klans of america", "imperial klans of america",
|
|
567
|
+
"knights of the ku klux klan", "loyal white knights",
|
|
568
|
+
"white camelia knights", "brotherhood of klans",
|
|
569
|
+
"white knights of the kkk",
|
|
570
|
+
// White supremacy / white nationalism
|
|
571
|
+
"white power", "whitepower", "white pride", "whitepride",
|
|
572
|
+
"white supremacy", "whitesupremacy", "white supremacist", "whitesupremacist",
|
|
573
|
+
"white nationalist", "whitenationalist", "white nationalism", "whitenationalism",
|
|
574
|
+
"white ethnostate", "whiteethnostate", "ethnostate",
|
|
575
|
+
"white genocide", "whitegenocide", "racial purity", "racialpurity",
|
|
576
|
+
"race purification", "racepurification", "racial purification", "racialpurification",
|
|
577
|
+
"racial hygiene", "racialhygiene", "ethnic cleansing", "ethniccleansing",
|
|
578
|
+
"aryan nation", "aryan nations", "aryan brotherhood", "aryan circle",
|
|
579
|
+
"aryan guard", "aryan resistance", "aryan strikeforce",
|
|
580
|
+
"white aryan resistance",
|
|
581
|
+
// Extremist groups
|
|
582
|
+
"proud boys", "proudboys", "oath keepers", "oathkeepers",
|
|
583
|
+
"atomwaffen", "atomwaffen division", "patriot front", "patriotfront",
|
|
584
|
+
"vanguard america", "identity evropa", "american identity movement",
|
|
585
|
+
"national socialist", "national socialism", "national socialist movement",
|
|
586
|
+
"american nazi party", "nordic resistance movement",
|
|
587
|
+
"golden dawn", "casa pound", "casapound",
|
|
588
|
+
"generation identity", "identitarian", "identitarian movement",
|
|
589
|
+
"hammerskins", "hammerskin nation", "combat 18", "combat18",
|
|
590
|
+
"blood honour", "blood honor", "volksfront",
|
|
591
|
+
"stormfront", "iron march", "daily stormer", "dailystormer",
|
|
592
|
+
"order of nine angles", "o9a",
|
|
593
|
+
"rise above movement", "vinlanders social club",
|
|
594
|
+
"nazi low riders",
|
|
595
|
+
// Extremist slogans and coded language
|
|
596
|
+
"fourteen words", "fourteenwords", "1488", "14 88",
|
|
597
|
+
"rahowa", "racial holy war", "racialholywar",
|
|
598
|
+
"blood and soil", "bloodandsoil",
|
|
599
|
+
"day of the rope", "dayoftherope",
|
|
600
|
+
"great replacement", "greatreplacement",
|
|
601
|
+
"race war", "racewar",
|
|
602
|
+
"turner diaries", "turnerdiaries",
|
|
603
|
+
"right wing death squad", "rwds",
|
|
604
|
+
"physical removal", "physicalremoval",
|
|
605
|
+
"free helicopter ride", "helicopter ride",
|
|
606
|
+
"race realism", "racerealism", "race realist", "racerealist",
|
|
607
|
+
// Antisemitic
|
|
608
|
+
"jewish question", "jewishquestion", "jq",
|
|
609
|
+
"zionist occupied government", "zog",
|
|
610
|
+
"jewish conspiracy", "jewishconspiracy",
|
|
611
|
+
"protocols of the elders of zion",
|
|
612
|
+
"international jewry", "internationaljewry", "world jewry", "worldjewry",
|
|
613
|
+
"blood libel", "bloodlibel", "jewish problem", "jewishproblem",
|
|
614
|
+
"six million lie", "sixmillionlie",
|
|
615
|
+
"happy merchant", "happymerchant", "le happy merchant",
|
|
616
|
+
"(((them)))", "(((they)))", "(((who)))",
|
|
617
|
+
"oy vey shut it down",
|
|
618
|
+
"death to jews", "kill all jews",
|
|
619
|
+
// Racial slurs — anti-Black
|
|
620
|
+
"lynching", "lynch mob", "lynchmob",
|
|
621
|
+
"jungle bunny", "junglebunny", "jungle bunnies", "junglebunnies",
|
|
622
|
+
"porch monkey", "porchmonkey", "porch monkeys", "porchmonkeys",
|
|
623
|
+
"spear chucker", "spearchucker", "spear chuckers", "spearchuckers",
|
|
624
|
+
"moon cricket", "mooncricket", "moon crickets", "mooncrickets",
|
|
625
|
+
"cotton picker", "cottonpicker", "cotton pickers", "cottonpickers",
|
|
626
|
+
"tar baby", "tarbaby",
|
|
627
|
+
"race soldiers", "racesoldiers",
|
|
628
|
+
"mud people",
|
|
629
|
+
// Racial slurs — anti-Asian
|
|
630
|
+
"gook", "gooks", "chink", "chinks", "chinaman", "chinamen",
|
|
631
|
+
"zipperhead", "zipperheads", "slant eye", "slanteye",
|
|
632
|
+
"ching chong", "chingchong", "yellow peril", "yellowperil",
|
|
633
|
+
"kung flu", "kungflu",
|
|
634
|
+
// Racial slurs — anti-Latino
|
|
635
|
+
"wetback", "wetbacks", "beaner", "beaners",
|
|
636
|
+
"spic", "spics", "spick", "spicks",
|
|
637
|
+
// Racial slurs — anti-Muslim/Arab
|
|
638
|
+
"sand nigger", "sandnigger", "sand niggers", "sandniggers",
|
|
639
|
+
"towel head", "towelhead", "towel heads", "towelheads",
|
|
640
|
+
"raghead", "ragheads", "rag head", "rag heads",
|
|
641
|
+
"camel jockey", "cameljockey", "camel jockeys", "cameljockeys",
|
|
642
|
+
"goat fucker", "goatfucker", "goat fuckers", "goatfuckers",
|
|
643
|
+
"muzzie", "muzzies", "muzrat", "muzrats",
|
|
644
|
+
// Racial slurs — anti-Indigenous
|
|
645
|
+
"prairie nigger", "prairienigger", "timber nigger", "timbernigger",
|
|
646
|
+
"wagon burner", "wagonburner", "wagon burners", "wagonburners",
|
|
647
|
+
"injun", "injuns",
|
|
648
|
+
// Anti-LGBTQ+ hate
|
|
649
|
+
"death to fags", "god hates fags", "godhatesfags",
|
|
650
|
+
"death to gays", "kill all gays",
|
|
651
|
+
// Genocidal language
|
|
652
|
+
"death to muslims", "death to blacks", "death to whites",
|
|
653
|
+
"death to immigrants",
|
|
654
|
+
"kill all muslims", "kill all blacks", "kill all whites",
|
|
655
|
+
"kill all immigrants",
|
|
656
|
+
// Coded hate
|
|
657
|
+
"dindu nuffin", "dindunuffin", "dindu",
|
|
658
|
+
"we wuz kangz", "wewuzkangz",
|
|
659
|
+
"ooga booga", "oogabooga",
|
|
660
|
+
"remove kebab", "removekebab",
|
|
661
|
+
"race traitor", "race traitors", "racetraitor", "racetraitors",
|
|
662
|
+
"sonnenrad", "black sun", "totenkopf", "wolfsangel",
|
|
663
|
+
// ── Additional terms (sourced from ADL, SPLC, GLAAD, Moonshot CVE, ISD Global) ──
|
|
664
|
+
// Anti-Asian slurs — additional
|
|
665
|
+
"jap", "japs", "nip", "nips",
|
|
666
|
+
"coolie", "coolies",
|
|
667
|
+
"paki", "pakis",
|
|
668
|
+
"slope", "slopes", "slopehead", "slopeheads",
|
|
669
|
+
"wog", "wogs",
|
|
670
|
+
"dog eater", "dogeater", "dog eaters", "dogeaters",
|
|
671
|
+
"bat eater", "bateater",
|
|
672
|
+
"china virus", "chinavirus", "wuhan virus", "wuhanvirus",
|
|
673
|
+
"yellow monkey", "yellowmonkey",
|
|
674
|
+
"rice picker", "ricepicker", "rice pickers", "ricepickers",
|
|
675
|
+
// Anti-Latino slurs — additional
|
|
676
|
+
"greaser", "greasers",
|
|
677
|
+
"taco bender", "tacobender",
|
|
678
|
+
"border bunny", "borderbunny", "border bunnies", "borderbunnies",
|
|
679
|
+
"border hopper", "borderhopper", "border hoppers", "borderhoppers",
|
|
680
|
+
"fence hopper", "fencehopper",
|
|
681
|
+
"anchor baby", "anchorbaby", "anchor babies", "anchorbabies",
|
|
682
|
+
"pepper belly", "pepperbelly",
|
|
683
|
+
// Anti-Indigenous slurs — additional
|
|
684
|
+
"redskin", "redskins",
|
|
685
|
+
"squaw", "squaws",
|
|
686
|
+
"half breed", "halfbreed", "half breeds", "halfbreeds",
|
|
687
|
+
"blanket ass", "blanketass",
|
|
688
|
+
"timber monkey", "timbermonkey",
|
|
689
|
+
"red nigger", "rednigger", "bush nigger", "bushnigger",
|
|
690
|
+
// Antisemitic — additional
|
|
691
|
+
"hollowcost", "hollow cost",
|
|
692
|
+
"jewish bankers", "jewishbankers",
|
|
693
|
+
"jewish media", "jewishmedia", "jewish lobby", "jewishlobby",
|
|
694
|
+
"jewed", "jew down",
|
|
695
|
+
"nose check", "nosecheck",
|
|
696
|
+
"early life check", "earlylifecheck", "early life section", "earlylifesection",
|
|
697
|
+
"every single time", "everysingletime",
|
|
698
|
+
"the goyim know", "thegoyimknow",
|
|
699
|
+
"goyim know shut it down", "goyimknowshutitdown",
|
|
700
|
+
"six gorillion", "sixgorillion",
|
|
701
|
+
"oven dodger", "ovendodger", "oven dodgers", "ovendodgers",
|
|
702
|
+
"wooden doors", "woodendoors",
|
|
703
|
+
"holocaust industry", "holocaustindustry",
|
|
704
|
+
"jews will not replace us", "jewswillnotreplaceus",
|
|
705
|
+
"you will not replace us",
|
|
706
|
+
"synagogue of satan", "synagogueofsatan",
|
|
707
|
+
"jewish supremacy", "jewishsupremacy",
|
|
708
|
+
"jewish bolshevism", "jewishbolshevism", "judeo bolshevism", "judeobolshevism",
|
|
709
|
+
"rootless cosmopolitan", "rootlesscosmopolitan",
|
|
710
|
+
"christ killer", "christkiller", "christ killers", "christkillers",
|
|
711
|
+
"greedy jew", "greedyjew", "dirty jew", "dirtyjew",
|
|
712
|
+
"jew rat", "jewrat",
|
|
713
|
+
"sheeny", "sheenies",
|
|
714
|
+
"khazar milkers", "khazarmilkers",
|
|
715
|
+
"small hat", "small hats", "smallhat",
|
|
716
|
+
// Anti-Muslim/Arab — additional
|
|
717
|
+
"deus vult", "deusvult",
|
|
718
|
+
"kebab remover", "kebabremover",
|
|
719
|
+
"mohammedan", "mohammedans",
|
|
720
|
+
"death to islam", "deathtoislam",
|
|
721
|
+
"kill all arabs", "killallarabs",
|
|
722
|
+
"durka durka", "durkadurka",
|
|
723
|
+
"goat lover", "goatlover",
|
|
724
|
+
"cave dweller", "cavedweller", "cave dwellers", "cavedwellers",
|
|
725
|
+
"abeed",
|
|
726
|
+
"islamo fascist", "islamofascist", "islamo fascism", "islamofascism",
|
|
727
|
+
// Anti-Hindu
|
|
728
|
+
"pajeet", "pajeets",
|
|
729
|
+
"poo in loo", "pooinloo", "poo in the loo", "poointheloo",
|
|
730
|
+
"designated shitting street", "designatedshittingstreet",
|
|
731
|
+
"street shitter", "streetshitter", "street shitters", "streetshitters",
|
|
732
|
+
"cow worshipper", "cowworshipper",
|
|
733
|
+
"dot head", "dothead", "dot heads", "dotheads",
|
|
734
|
+
"curry muncher", "currymuncher", "curry munchers", "currymunchers",
|
|
735
|
+
"curry nigger", "currynigger",
|
|
736
|
+
"death to hindus", "kill all hindus",
|
|
737
|
+
// Anti-Sikh
|
|
738
|
+
"diaper head", "diaperhead", "diaper heads", "diaperheads",
|
|
739
|
+
"death to sikhs", "kill all sikhs",
|
|
740
|
+
// Anti-LGBTQ+ hate — eliminationist phrases
|
|
741
|
+
"death to trannies", "death to queers", "death to lesbians",
|
|
742
|
+
"death to transgenders", "death to bisexuals",
|
|
743
|
+
"kill all trannies", "kill all queers", "kill all lesbians",
|
|
744
|
+
"kill all transgenders",
|
|
745
|
+
"hang all fags", "hang all gays", "hang all trannies",
|
|
746
|
+
"burn all fags", "burn all gays",
|
|
747
|
+
"stone the gays", "stone the fags",
|
|
748
|
+
"gas the gays", "gas the fags", "gas the trannies",
|
|
749
|
+
// Anti-LGBTQ+ hate — religious extremist slogans
|
|
750
|
+
"god hates gays", "godhatesgays",
|
|
751
|
+
"god hates queers", "godhatesqueers",
|
|
752
|
+
"god hates trannies", "godhatestrannies",
|
|
753
|
+
"fags deserve death", "fagsdeservedeath",
|
|
754
|
+
"fags burn in hell", "fagsburninhell",
|
|
755
|
+
"gays burn in hell", "gaysburninhell",
|
|
756
|
+
// Anti-trans specific hate
|
|
757
|
+
"troon", "troons",
|
|
758
|
+
"troid", "troids",
|
|
759
|
+
"trannoid", "trannoids",
|
|
760
|
+
"transtrender", "transtrenders",
|
|
761
|
+
"trans are groomers", "transaregroomers",
|
|
762
|
+
"tranny groomers", "trannygroomers",
|
|
763
|
+
"transgender groomers", "transgendergroomers",
|
|
764
|
+
"trans predator", "transpredator", "trans predators", "transpredators",
|
|
765
|
+
"trans are pedophiles", "transarepedophiles",
|
|
766
|
+
"trans are degenerates", "transaredegenerates",
|
|
767
|
+
// Anti-trans suicide baiting
|
|
768
|
+
"join the 41", "jointhe41", "41 percent", "41percent",
|
|
769
|
+
"dilate and cope", "dilateandcope",
|
|
770
|
+
"you will never be a woman", "youwillneverbeawoman",
|
|
771
|
+
"you will never be a real woman", "youwillneverbeareawoman",
|
|
772
|
+
"you will never pass", "youwillneverpass",
|
|
773
|
+
// Anti-LGBTQ+ groomer rhetoric
|
|
774
|
+
"gay groomers", "gaygroomers",
|
|
775
|
+
"lgbtq groomers", "lgbtqgroomers", "lgbt groomers", "lgbtgroomers",
|
|
776
|
+
"drag queen groomers", "dragqueengroomers",
|
|
777
|
+
"ok groomer", "okgroomer",
|
|
778
|
+
"homosexual agenda", "homosexualagenda",
|
|
779
|
+
"gay agenda", "gayagenda", "trans agenda", "transagenda",
|
|
780
|
+
"coming for your children", "comingforyourchildren",
|
|
781
|
+
// Anti-LGBTQ+ dehumanizing slurs — additional
|
|
782
|
+
"carpet muncher", "carpetmuncher", "carpet munchers", "carpetmunchers",
|
|
783
|
+
"pillow biter", "pillowbiter", "fudge packer", "fudgepacker",
|
|
784
|
+
"batty boy", "battyboy", "batty man", "battyman",
|
|
785
|
+
"chi chi man", "chichiman",
|
|
786
|
+
"poof", "poofs", "poofter", "poofters",
|
|
787
|
+
// Anti-LGBTQ+ conversion/cure rhetoric
|
|
788
|
+
"pray the gay away", "praythegayaway",
|
|
789
|
+
"homosexuality is a disease", "homosexualityisadisease",
|
|
790
|
+
// Anti-LGBTQ+ coded mockery
|
|
791
|
+
"attack helicopter", "attackhelicopter",
|
|
792
|
+
"i identify as an attack helicopter",
|
|
793
|
+
"superstraight", "super straight",
|
|
794
|
+
// Modern extremist groups (post-2020, ADL/SPLC documented)
|
|
795
|
+
"active club", "active clubs", "activeclub",
|
|
796
|
+
"white lives matter", "whitelivesmatter",
|
|
797
|
+
"patriot prayer", "patriotprayer",
|
|
798
|
+
"the base", "thebase",
|
|
799
|
+
"feuerkrieg division", "feuerkrieg",
|
|
800
|
+
"terrorgram", "terrorgram collective",
|
|
801
|
+
"goyim defense league", "goyimdefenseleague",
|
|
802
|
+
"national socialist order",
|
|
803
|
+
"aryan freedom network",
|
|
804
|
+
"nationalist social club", "nsc 131", "nsc131",
|
|
805
|
+
"groyper", "groypers", "groyper army",
|
|
806
|
+
"rapewaffen", "rapewaffen division",
|
|
807
|
+
// Boogaloo movement (ADL documented)
|
|
808
|
+
"boogaloo boi", "boogaloo bois", "boogaloo boys",
|
|
809
|
+
"big igloo", "bigigloo",
|
|
810
|
+
"boojahideen",
|
|
811
|
+
// Accelerationist terminology (Moonshot CVE / ISD)
|
|
812
|
+
"siege culture", "siegeculture",
|
|
813
|
+
"siege pill", "siegepill", "siegepilled",
|
|
814
|
+
"read siege", "readsiege",
|
|
815
|
+
"saint tarrant", "sainttarrant",
|
|
816
|
+
"saint breivik", "saintbreivik",
|
|
817
|
+
"saint roof", "saintroof",
|
|
818
|
+
"saint bowers", "saintbowers",
|
|
819
|
+
"dotr",
|
|
820
|
+
// Incel extremist hate speech (ADL/academic research)
|
|
821
|
+
"incel rebellion", "incelrebellion",
|
|
822
|
+
"beta uprising", "betauprising",
|
|
823
|
+
"supreme gentleman", "supremegentleman",
|
|
824
|
+
"foid", "foids", "femoid", "femoids",
|
|
825
|
+
"roastie", "roasties",
|
|
826
|
+
// Eco-fascist terminology (ISD)
|
|
827
|
+
"eco fascism", "ecofascism", "eco fascist", "ecofascist",
|
|
828
|
+
"pine tree gang", "pinetreegang",
|
|
829
|
+
// Internet-era coded hate — additional
|
|
830
|
+
"clown world", "clownworld", "honk honk", "honkhonk", "honkler",
|
|
831
|
+
"despite being 13 percent", "despite 13",
|
|
832
|
+
"6 million wasn't enough", "6mwe",
|
|
833
|
+
"it's okay to be white", "iotbw",
|
|
834
|
+
"skull mask", "skullmask",
|
|
835
|
+
"white boy summer", "whiteboysummer",
|
|
836
|
+
"wpww", "white pride world wide",
|
|
837
|
+
// Coded numbers (ADL Hate Symbols Database)
|
|
838
|
+
"1312",
|
|
839
|
+
// Genocide denial — additional
|
|
840
|
+
"armenian genocide denial",
|
|
841
|
+
"rwandan genocide denial",
|
|
842
|
+
// Anti-immigrant hate — additional
|
|
843
|
+
"remigration",
|
|
844
|
+
"camp of the saints", "campofthesaints",
|
|
845
|
+
"migrant invasion",
|
|
846
|
+
]);
|
|
847
|
+
this.leetMappings = new Map([
|
|
848
|
+
["@", "a"],
|
|
849
|
+
["^", "a"],
|
|
850
|
+
["4", "a"],
|
|
851
|
+
["8", "b"],
|
|
852
|
+
["6", "b"],
|
|
853
|
+
["|3", "b"],
|
|
854
|
+
["(", "c"],
|
|
855
|
+
["<", "c"],
|
|
856
|
+
["©", "c"],
|
|
857
|
+
["|)", "d"],
|
|
858
|
+
["0", "o"],
|
|
859
|
+
["3", "e"],
|
|
860
|
+
["€", "e"],
|
|
861
|
+
["|=", "f"],
|
|
862
|
+
["ph", "f"],
|
|
863
|
+
["9", "g"],
|
|
864
|
+
["#", "h"],
|
|
865
|
+
["|-|", "h"],
|
|
866
|
+
["1", "i"],
|
|
867
|
+
["!", "i"],
|
|
868
|
+
["|", "i"],
|
|
869
|
+
["_|", "j"],
|
|
870
|
+
["¿", "j"],
|
|
871
|
+
["|<", "k"],
|
|
872
|
+
["1<", "k"],
|
|
873
|
+
["7", "l"],
|
|
874
|
+
["|\\/|", "m"],
|
|
875
|
+
["/\\/\\", "m"],
|
|
876
|
+
["|\\|", "n"],
|
|
877
|
+
["//", "n"],
|
|
878
|
+
["()", "o"],
|
|
879
|
+
["|*", "p"],
|
|
880
|
+
["|o", "p"],
|
|
881
|
+
["(_,)", "q"],
|
|
882
|
+
["()_", "q"],
|
|
883
|
+
["|2", "r"],
|
|
884
|
+
["12", "r"],
|
|
885
|
+
["5", "s"],
|
|
886
|
+
["$", "s"],
|
|
887
|
+
["z", "s"],
|
|
888
|
+
["7", "t"],
|
|
889
|
+
["+", "t"],
|
|
890
|
+
["†", "t"],
|
|
891
|
+
["|_|", "u"],
|
|
892
|
+
["(_)", "u"],
|
|
893
|
+
["v", "u"],
|
|
894
|
+
["\\/", "v"],
|
|
895
|
+
["|/", "v"],
|
|
896
|
+
["\\/\\/", "w"],
|
|
897
|
+
["vv", "w"],
|
|
898
|
+
["><", "x"],
|
|
899
|
+
["}{", "x"],
|
|
900
|
+
["`/", "y"],
|
|
901
|
+
["j", "y"],
|
|
902
|
+
["2", "z"],
|
|
903
|
+
["7_", "z"],
|
|
904
|
+
]);
|
|
905
|
+
this.dynamicWords = new Set();
|
|
906
|
+
// Advanced algorithms
|
|
907
|
+
this.ahoCorasickAutomaton = null;
|
|
908
|
+
this.bloomFilter = null;
|
|
909
|
+
this.contextAnalyzer = null;
|
|
910
|
+
this.matchingAlgorithm = "trie";
|
|
911
|
+
this.resultCache = null;
|
|
912
|
+
/**
|
|
913
|
+
* Leet mappings where the source is a regular letter (e.g. z→s, v→u, j→y).
|
|
914
|
+
* These are ambiguous because they can destroy legitimate words during
|
|
915
|
+
* normalization (e.g. "nazi" → "nasi"). Separated so that layered
|
|
916
|
+
* normalization can try symbol-only mappings first.
|
|
917
|
+
*/
|
|
918
|
+
this.letterToLetterLeetKeys = new Set([...this.leetMappings.keys()].filter((k) => /^[a-zA-Z]+$/.test(k)));
|
|
919
|
+
// Use silent logger if silent mode is enabled, otherwise use provided logger or console logger
|
|
920
|
+
this.logger = (options === null || options === void 0 ? void 0 : options.logger) || ((options === null || options === void 0 ? void 0 : options.silent) ? new SilentLogger() : new ConsoleLogger());
|
|
921
|
+
if ((options === null || options === void 0 ? void 0 : options.defaultPlaceholder) !== undefined) {
|
|
922
|
+
this.setPlaceholder(options.defaultPlaceholder);
|
|
923
|
+
}
|
|
924
|
+
this.enableLeetSpeak = (_a = options === null || options === void 0 ? void 0 : options.enableLeetSpeak) !== null && _a !== void 0 ? _a : true;
|
|
925
|
+
this.caseSensitive = (_b = options === null || options === void 0 ? void 0 : options.caseSensitive) !== null && _b !== void 0 ? _b : false;
|
|
926
|
+
this.strictMode = (_c = options === null || options === void 0 ? void 0 : options.strictMode) !== null && _c !== void 0 ? _c : false;
|
|
927
|
+
this.detectPartialWords = (_d = options === null || options === void 0 ? void 0 : options.detectPartialWords) !== null && _d !== void 0 ? _d : false;
|
|
928
|
+
this.embeddedProfanityDetection = (_e = options === null || options === void 0 ? void 0 : options.embeddedProfanityDetection) !== null && _e !== void 0 ? _e : false;
|
|
929
|
+
this.sensitiveMode = (_f = options === null || options === void 0 ? void 0 : options.sensitiveMode) !== null && _f !== void 0 ? _f : false;
|
|
930
|
+
const sepTol = options === null || options === void 0 ? void 0 : options.separatorTolerance;
|
|
931
|
+
if (sepTol === false) {
|
|
932
|
+
this.separatorTolerance = 0;
|
|
933
|
+
}
|
|
934
|
+
else if (typeof sepTol === "number") {
|
|
935
|
+
this.separatorTolerance = Math.max(0, sepTol);
|
|
936
|
+
}
|
|
937
|
+
else {
|
|
938
|
+
// true or undefined → default 5
|
|
939
|
+
this.separatorTolerance = 5;
|
|
940
|
+
}
|
|
941
|
+
if (options === null || options === void 0 ? void 0 : options.whitelistWords) {
|
|
942
|
+
this.addToWhitelist(options.whitelistWords);
|
|
943
|
+
}
|
|
944
|
+
// Initialize advanced algorithms BEFORE loading dictionaries
|
|
945
|
+
// so that words can be added to all data structures
|
|
946
|
+
this.initializeAdvancedAlgorithms(options);
|
|
947
|
+
this.loadLanguage("all");
|
|
948
|
+
if ((_g = options === null || options === void 0 ? void 0 : options.languages) === null || _g === void 0 ? void 0 : _g.length) {
|
|
949
|
+
options.languages.forEach((lang) => this.loadLanguage(lang));
|
|
950
|
+
}
|
|
951
|
+
if (options === null || options === void 0 ? void 0 : options.customDictionaries) {
|
|
952
|
+
Object.entries(options.customDictionaries).forEach(([name, words]) => {
|
|
953
|
+
this.loadCustomDictionary(name, words);
|
|
954
|
+
});
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
    /**
     * Initialize advanced algorithms based on configuration.
     *
     * Reads `options.algorithm`, `options.bloomFilter`, `options.ahoCorasick`,
     * `options.contextAnalysis` and `options.performance` and conditionally
     * constructs the Bloom filter, the Aho-Corasick automaton, the context
     * analyzer and the result cache. The constructor calls this BEFORE loading
     * dictionaries so that words can be inserted into every data structure.
     *
     * @param options - Constructor options object (may be undefined).
     */
    initializeAdvancedAlgorithms(options) {
        var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m;
        // Set matching algorithm; an explicit option overrides the "trie" default.
        if ((_a = options === null || options === void 0 ? void 0 : options.algorithm) === null || _a === void 0 ? void 0 : _a.matching) {
            this.matchingAlgorithm = options.algorithm.matching;
        }
        // Initialize Bloom Filter if enabled — explicitly via either option
        // surface, or implicitly because "hybrid" matching requires it.
        const bloomEnabled = ((_b = options === null || options === void 0 ? void 0 : options.algorithm) === null || _b === void 0 ? void 0 : _b.useBloomFilter) ||
            ((_c = options === null || options === void 0 ? void 0 : options.bloomFilter) === null || _c === void 0 ? void 0 : _c.enabled) ||
            this.matchingAlgorithm === "hybrid";
        if (bloomEnabled) {
            // NOTE: `||` fallbacks mean explicit 0 values also fall back to the
            // defaults (10000 items, 1% FP rate) — 0 would be invalid anyway.
            const expectedItems = ((_d = options === null || options === void 0 ? void 0 : options.bloomFilter) === null || _d === void 0 ? void 0 : _d.expectedItems) || 10000;
            const falsePositiveRate = ((_e = options === null || options === void 0 ? void 0 : options.bloomFilter) === null || _e === void 0 ? void 0 : _e.falsePositiveRate) || 0.01;
            this.bloomFilter = new BloomFilter(expectedItems, falsePositiveRate);
            this.logger.info(`Bloom Filter initialized with ${expectedItems} expected items and ${(falsePositiveRate * 100).toFixed(2)}% false positive rate`);
        }
        // Initialize Aho-Corasick if enabled — explicitly, or implied by the
        // "aho-corasick" / "hybrid" matching algorithms. Starts empty; patterns
        // are added as dictionaries load.
        const ahoEnabled = ((_f = options === null || options === void 0 ? void 0 : options.algorithm) === null || _f === void 0 ? void 0 : _f.useAhoCorasick) ||
            ((_g = options === null || options === void 0 ? void 0 : options.ahoCorasick) === null || _g === void 0 ? void 0 : _g.enabled) ||
            this.matchingAlgorithm === "aho-corasick" ||
            this.matchingAlgorithm === "hybrid";
        if (ahoEnabled) {
            this.ahoCorasickAutomaton = new AhoCorasick([]);
            this.logger.info("Aho-Corasick automaton initialized");
        }
        // Initialize Context Analyzer if enabled (never implied by an algorithm).
        const contextEnabled = ((_h = options === null || options === void 0 ? void 0 : options.algorithm) === null || _h === void 0 ? void 0 : _h.useContextAnalysis) ||
            ((_j = options === null || options === void 0 ? void 0 : options.contextAnalysis) === null || _j === void 0 ? void 0 : _j.enabled);
        if (contextEnabled) {
            const contextLanguages = ((_k = options === null || options === void 0 ? void 0 : options.contextAnalysis) === null || _k === void 0 ? void 0 : _k.languages) || ["en"];
            this.contextAnalyzer = new ContextAnalyzer(contextLanguages);
            if ((_l = options === null || options === void 0 ? void 0 : options.contextAnalysis) === null || _l === void 0 ? void 0 : _l.contextWindow) {
                this.contextAnalyzer.setContextWindow(options.contextAnalysis.contextWindow);
            }
            this.logger.info(`Context Analyzer initialized for languages: ${contextLanguages.join(", ")}`);
        }
        // Initialize result cache if enabled.
        // NOTE(review): `cacheSize` is only used in the log message here — the
        // Map itself is created unbounded. Presumably eviction is enforced
        // wherever resultCache is written; confirm against those call sites.
        if ((_m = options === null || options === void 0 ? void 0 : options.performance) === null || _m === void 0 ? void 0 : _m.enableCaching) {
            const cacheSize = options.performance.cacheSize || 1000;
            this.resultCache = new Map();
            this.logger.info(`Result caching enabled with size limit: ${cacheSize}`);
        }
    }
|
|
1003
|
+
/**
|
|
1004
|
+
* Normalize leet speak to regular characters (full pass — all mappings).
|
|
1005
|
+
* @param text - The input text.
|
|
1006
|
+
* @returns Normalized text.
|
|
1007
|
+
*/
|
|
1008
|
+
normalizeLeetSpeak(text) {
|
|
1009
|
+
if (!this.enableLeetSpeak)
|
|
1010
|
+
return text;
|
|
1011
|
+
let normalized = text.toLowerCase();
|
|
1012
|
+
const sortedMappings = Array.from(this.leetMappings.entries()).sort(([leetA], [leetB]) => leetB.length - leetA.length);
|
|
1013
|
+
for (const [leet, normal] of sortedMappings) {
|
|
1014
|
+
const regex = new RegExp(this.escapeRegex(leet), "g");
|
|
1015
|
+
normalized = normalized.replace(regex, normal);
|
|
1016
|
+
}
|
|
1017
|
+
return normalized;
|
|
1018
|
+
}
|
|
1019
|
+
/**
|
|
1020
|
+
* Conservative leet normalization — only replaces non-letter characters
|
|
1021
|
+
* (digits, symbols, punctuation) with their letter equivalents.
|
|
1022
|
+
* Letter-to-letter mappings (z→s, v→u, j→y, ph→f) are skipped so that
|
|
1023
|
+
* real letters are preserved, avoiding collisions like "nazi" → "nasi".
|
|
1024
|
+
*/
|
|
1025
|
+
normalizeLeetSpeakSymbolsOnly(text) {
|
|
1026
|
+
if (!this.enableLeetSpeak)
|
|
1027
|
+
return text;
|
|
1028
|
+
let normalized = text.toLowerCase();
|
|
1029
|
+
const sortedMappings = Array.from(this.leetMappings.entries()).sort(([leetA], [leetB]) => leetB.length - leetA.length);
|
|
1030
|
+
for (const [leet, normal] of sortedMappings) {
|
|
1031
|
+
if (this.letterToLetterLeetKeys.has(leet))
|
|
1032
|
+
continue;
|
|
1033
|
+
const regex = new RegExp(this.escapeRegex(leet), "g");
|
|
1034
|
+
normalized = normalized.replace(regex, normal);
|
|
1035
|
+
}
|
|
1036
|
+
return normalized;
|
|
1037
|
+
}
|
|
1038
|
+
/**
|
|
1039
|
+
* Returns all unique leet-normalized variants of the text that differ
|
|
1040
|
+
* from the base normalizedText. Runs two layers:
|
|
1041
|
+
* 1. Symbol-only normalization (digits/special → letters, preserves real letters)
|
|
1042
|
+
* 2. Full normalization (all mappings including letter→letter)
|
|
1043
|
+
*
|
|
1044
|
+
* This layered approach catches both "n4zi" (symbol-only → "nazi") and
|
|
1045
|
+
* "a55" (full → "ass") without one breaking the other.
|
|
1046
|
+
*/
|
|
1047
|
+
getLeetVariants(normalizedText) {
|
|
1048
|
+
if (!this.enableLeetSpeak)
|
|
1049
|
+
return [];
|
|
1050
|
+
const variants = [];
|
|
1051
|
+
const seen = new Set([normalizedText]);
|
|
1052
|
+
// Layer 1: symbol-only (conservative) — catches n4zi→nazi, wh1te→white
|
|
1053
|
+
const symbolOnly = this.normalizeLeetSpeakSymbolsOnly(normalizedText);
|
|
1054
|
+
if (!seen.has(symbolOnly)) {
|
|
1055
|
+
seen.add(symbolOnly);
|
|
1056
|
+
variants.push(symbolOnly);
|
|
1057
|
+
}
|
|
1058
|
+
// Layer 2: full normalization — catches z→s substitutions like a55→ass
|
|
1059
|
+
const full = this.normalizeLeetSpeak(normalizedText);
|
|
1060
|
+
if (!seen.has(full)) {
|
|
1061
|
+
seen.add(full);
|
|
1062
|
+
variants.push(full);
|
|
1063
|
+
}
|
|
1064
|
+
return variants;
|
|
1065
|
+
}
|
|
1066
|
+
/**
|
|
1067
|
+
* Check if a character is a non-space separator (skipped freely).
|
|
1068
|
+
*/
|
|
1069
|
+
static isSymbolSeparator(char) {
|
|
1070
|
+
return BeKind.SYMBOL_SEPARATOR_SET.has(char);
|
|
1071
|
+
}
|
|
1072
|
+
/**
|
|
1073
|
+
* Check if a character is whitespace (skipped with certainty penalty).
|
|
1074
|
+
*/
|
|
1075
|
+
static isWhitespaceSeparator(char) {
|
|
1076
|
+
return char === " " || char === "\t" || char === "\n" || char === "\r";
|
|
1077
|
+
}
|
|
1078
|
+
/**
|
|
1079
|
+
* Check if a character is any kind of separator.
|
|
1080
|
+
*/
|
|
1081
|
+
static isSeparator(char) {
|
|
1082
|
+
return BeKind.isSymbolSeparator(char) || BeKind.isWhitespaceSeparator(char);
|
|
1083
|
+
}
|
|
1084
|
+
    /**
     * Extract surrounding context (±wordCount words) around a match span.
     *
     * Splits on whitespace, then walks the words forward with indexOf to map
     * the character span [start, end) back onto word indices, and finally
     * returns the slice widened by wordCount words on each side.
     *
     * NOTE(review): text.split(/\s+/) yields a leading "" token when the text
     * starts with whitespace; indexOf("", charPos) returns charPos, so the
     * scan still advances — presumably harmless, but verify with
     * whitespace-prefixed input.
     *
     * @param text - The full original text.
     * @param start - Match start index (inclusive, character offset).
     * @param end - Match end index (exclusive, character offset).
     * @param wordCount - Number of context words to include on each side.
     * @returns The context words joined with single spaces.
     */
    extractSurroundingContext(text, start, end, wordCount) {
        const words = text.split(/\s+/);
        let charPos = 0;
        let startWordIdx = 0;
        let endWordIdx = words.length - 1;
        for (let i = 0; i < words.length; i++) {
            // Locate word i at or after the previous word's end; charPos keeps
            // the search monotonic so repeated words resolve correctly.
            const wordStart = text.indexOf(words[i], charPos);
            const wordEnd = wordStart + words[i].length;
            // Last word that ends at/before the match start.
            if (wordEnd <= start)
                startWordIdx = i;
            // Last word that starts before the match end (i.e. overlaps it).
            if (wordStart < end)
                endWordIdx = i;
            charPos = wordEnd;
        }
        // Widen by wordCount on each side, clamped to the word list bounds.
        const contextStart = Math.max(0, startWordIdx - wordCount);
        const contextEnd = Math.min(words.length - 1, endWordIdx + wordCount);
        return words.slice(contextStart, contextEnd + 1).join(" ");
    }
|
|
1105
|
+
/**
|
|
1106
|
+
* Escape regex special characters in a string.
|
|
1107
|
+
* @param str - The string to escape.
|
|
1108
|
+
* @returns The escaped string.
|
|
1109
|
+
*/
|
|
1110
|
+
escapeRegex(str) {
|
|
1111
|
+
return str.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&");
|
|
1112
|
+
}
|
|
1113
|
+
/**
|
|
1114
|
+
* Check if a match is bounded by word boundaries (strict mode).
|
|
1115
|
+
* @param text - The text.
|
|
1116
|
+
* @param start - Start index.
|
|
1117
|
+
* @param end - End index.
|
|
1118
|
+
* @returns True if match is at word boundaries, false otherwise.
|
|
1119
|
+
*/
|
|
1120
|
+
hasWordBoundaries(text, start, end) {
|
|
1121
|
+
if (!this.strictMode)
|
|
1122
|
+
return true;
|
|
1123
|
+
const beforeChar = start > 0 ? text[start - 1] : " ";
|
|
1124
|
+
const afterChar = end < text.length ? text[end] : " ";
|
|
1125
|
+
const wordBoundaryRegex = /[\s\p{P}\p{S}]/u;
|
|
1126
|
+
return (wordBoundaryRegex.test(beforeChar) && wordBoundaryRegex.test(afterChar));
|
|
1127
|
+
}
|
|
1128
|
+
isWholeWord(text, start, end) {
|
|
1129
|
+
// CJK scripts (Chinese, Japanese, Korean) have no spaces between words.
|
|
1130
|
+
// If the matched word is CJK, treat it as a whole-word hit unconditionally —
|
|
1131
|
+
// the language-signal guard in isHighCoverageEmbed handles cross-script FPs.
|
|
1132
|
+
if (BeKind.CJK_RE.test(text.slice(start, end)))
|
|
1133
|
+
return true;
|
|
1134
|
+
// Use \p{L} (Unicode letter) not \w (ASCII-only) so that accented and
|
|
1135
|
+
// non-Latin characters (ü, ş, ğ, é, ñ, …) are correctly recognised as
|
|
1136
|
+
// word characters and do not act as false word-boundaries.
|
|
1137
|
+
if (start !== 0 && /\p{L}/u.test(text[start - 1]))
|
|
1138
|
+
return false;
|
|
1139
|
+
if (end !== text.length && /\p{L}/u.test(text[end]))
|
|
1140
|
+
return false;
|
|
1141
|
+
return true;
|
|
1142
|
+
}
|
|
1143
|
+
/**
|
|
1144
|
+
* Returns the char-index bounds of the host word containing [start, end).
|
|
1145
|
+
* Scans outward using the same Unicode-letter definition as isWholeWord.
|
|
1146
|
+
*/
|
|
1147
|
+
getHostWordBounds(text, start, end) {
|
|
1148
|
+
let hostStart = start;
|
|
1149
|
+
while (hostStart > 0 && /\p{L}/u.test(text[hostStart - 1]))
|
|
1150
|
+
hostStart--;
|
|
1151
|
+
let hostEnd = end;
|
|
1152
|
+
while (hostEnd < text.length && /\p{L}/u.test(text[hostEnd]))
|
|
1153
|
+
hostEnd++;
|
|
1154
|
+
return { hostStart, hostEnd };
|
|
1155
|
+
}
|
|
1156
|
+
    /**
     * Decide whether a partial (non-whole-word) match should still count as
     * profanity because it covers most of its host word — e.g. "assh0le" in
     * "xassh0lex" — while filtering cross-language and innocent-word FPs.
     *
     * Guards applied, in order (any failure → false):
     *   1. Minimum match length (HIGH_COVERAGE_MIN_MATCH_LEN).
     *   2. Graduated coverage ratio of match over host word — short matches
     *      (≤6 chars) need the higher threshold.
     *   3. Word-level language signal: the host word must plausibly be in the
     *      profane word's language (scoreWord).
     *   4. Document-level mismatch: if the doc is dominated (>50%) by another
     *      language and the profane word's language has <10% signal, skip.
     *   5. Innocent-embed allowlist: known hostWords or a strong
     *      partialDampeningFactor (≥0.5) suppress the match.
     *
     * @param text - Original text being scanned.
     * @param matchStart - Match start index (inclusive).
     * @param matchEnd - Match end index (exclusive).
     * @param matchWord - Dictionary word that matched (may be undefined;
     *   guards 3–5 are skipped without it).
     * @param docLangSignal - Optional map of language → proportion for the
     *   whole document (guard 4 is skipped without it).
     * @returns True when the embedded match should be treated as profanity.
     */
    isHighCoverageEmbed(text, matchStart, matchEnd, matchWord, docLangSignal) {
        var _a, _b;
        const matchLen = matchEnd - matchStart;
        if (matchLen < BeKind.HIGH_COVERAGE_MIN_MATCH_LEN)
            return false;
        const { hostStart, hostEnd } = this.getHostWordBounds(text, matchStart, matchEnd);
        const hostLen = hostEnd - hostStart;
        if (hostLen === 0)
            return false;
        // Graduated coverage: shorter matches need higher coverage to reduce FPs
        const coverageThreshold = matchLen <= 6
            ? BeKind.HIGH_COVERAGE_THRESHOLD_SHORT
            : BeKind.HIGH_COVERAGE_THRESHOLD_LONG;
        if (matchLen / hostLen < coverageThreshold)
            return false;
        if (matchWord) {
            // wordScores entry carries the language the profane word belongs to.
            const wordScore = this.wordScores[matchWord.toLowerCase()];
            if (wordScore) {
                const profaneLang = wordScore.language;
                const hostWord = text.slice(hostStart, hostEnd);
                // Word-level language signal guard
                const hostSignal = scoreWord(hostWord);
                const wordLangSignal = (_a = hostSignal[profaneLang]) !== null && _a !== void 0 ? _a : 0;
                if (wordLangSignal < BeKind.HIGH_COVERAGE_LANG_SIGNAL_MIN)
                    return false;
                // Document-level language mismatch guard: if the doc is strongly one
                // language and the profane word is from a DIFFERENT language, skip.
                // e.g. English doc + French "engin" in "engine" → skip
                if (docLangSignal) {
                    const docProfaneLangSignal = (_b = docLangSignal[profaneLang]) !== null && _b !== void 0 ? _b : 0;
                    const docTopSignal = Math.max(...Object.values(docLangSignal), 0);
                    // If profane word's language has < 10% doc signal AND another language
                    // dominates the doc (> 50%), this is almost certainly a cross-language FP
                    if (docProfaneLangSignal < 0.1 && docTopSignal > 0.5)
                        return false;
                }
                // Innocent embed guard: check hostWords allowlist and partialDampeningFactor
                const innocentEntries = innocentWords[matchWord.toLowerCase()];
                if (innocentEntries) {
                    const lowerHost = hostWord.toLowerCase();
                    if (innocentEntries.some(e => { var _a; return (_a = e.hostWords) === null || _a === void 0 ? void 0 : _a.includes(lowerHost); }))
                        return false;
                    if (innocentEntries.some(e => { var _a; return ((_a = e.partialDampeningFactor) !== null && _a !== void 0 ? _a : 0) >= 0.5; }))
                        return false;
                }
            }
        }
        return true;
    }
|
|
1205
|
+
/**
|
|
1206
|
+
* Check if a match is whitelisted.
|
|
1207
|
+
* @param word - Word from dictionary.
|
|
1208
|
+
* @param matchedText - Actual matched text.
|
|
1209
|
+
* @returns True if whitelisted, false otherwise.
|
|
1210
|
+
*/
|
|
1211
|
+
isWhitelistedMatch(word, matchedText) {
|
|
1212
|
+
if (this.caseSensitive) {
|
|
1213
|
+
return this.whitelistSet.has(word) || this.whitelistSet.has(matchedText);
|
|
1214
|
+
}
|
|
1215
|
+
else {
|
|
1216
|
+
return (this.whitelistSet.has(word.toLowerCase()) ||
|
|
1217
|
+
this.whitelistSet.has(matchedText.toLowerCase()));
|
|
1218
|
+
}
|
|
1219
|
+
}
|
|
1220
|
+
/**
|
|
1221
|
+
* Remove overlapping matches, keeping only the longest at each start position.
|
|
1222
|
+
* @param matches - Array of match results.
|
|
1223
|
+
* @returns Deduplicated matches.
|
|
1224
|
+
*/
|
|
1225
|
+
deduplicateMatches(matches) {
|
|
1226
|
+
const sorted = [...matches].sort((a, b) => {
|
|
1227
|
+
if (a.start !== b.start)
|
|
1228
|
+
return a.start - b.start;
|
|
1229
|
+
return b.end - a.end;
|
|
1230
|
+
});
|
|
1231
|
+
const result = [];
|
|
1232
|
+
let lastEnd = -1;
|
|
1233
|
+
for (const match of sorted) {
|
|
1234
|
+
if (match.start >= lastEnd) {
|
|
1235
|
+
result.push(match);
|
|
1236
|
+
lastEnd = match.end;
|
|
1237
|
+
}
|
|
1238
|
+
}
|
|
1239
|
+
return result;
|
|
1240
|
+
}
|
|
1241
|
+
/**
|
|
1242
|
+
* Use Aho-Corasick algorithm for pattern matching
|
|
1243
|
+
*/
|
|
1244
|
+
findMatchesWithAhoCorasick(searchText, originalText) {
|
|
1245
|
+
if (!this.ahoCorasickAutomaton) {
|
|
1246
|
+
return [];
|
|
1247
|
+
}
|
|
1248
|
+
const ahoMatches = this.ahoCorasickAutomaton.findAll(searchText);
|
|
1249
|
+
const results = [];
|
|
1250
|
+
// Compute doc-level language signal once for all embed checks
|
|
1251
|
+
let docLangSignal;
|
|
1252
|
+
const getDocLang = () => {
|
|
1253
|
+
if (!docLangSignal) {
|
|
1254
|
+
const detected = detectLanguages(originalText, { maxLanguages: 3 });
|
|
1255
|
+
docLangSignal = {};
|
|
1256
|
+
for (const lang of detected.languages) {
|
|
1257
|
+
docLangSignal[lang.language] = lang.proportion;
|
|
1258
|
+
}
|
|
1259
|
+
}
|
|
1260
|
+
return docLangSignal;
|
|
1261
|
+
};
|
|
1262
|
+
for (const match of ahoMatches) {
|
|
1263
|
+
const isWhole = this.isWholeWord(originalText, match.start, match.end);
|
|
1264
|
+
if (!this.detectPartialWords && !isWhole) {
|
|
1265
|
+
if (!this.isHighCoverageEmbed(originalText, match.start, match.end, match.pattern, getDocLang())) {
|
|
1266
|
+
continue;
|
|
1267
|
+
}
|
|
1268
|
+
}
|
|
1269
|
+
const matchedText = originalText.substring(match.start, match.end);
|
|
1270
|
+
if (this.isWhitelistedMatch(match.pattern, matchedText)) {
|
|
1271
|
+
continue;
|
|
1272
|
+
}
|
|
1273
|
+
if (this.hasWordBoundaries(originalText, match.start, match.end)) {
|
|
1274
|
+
results.push({
|
|
1275
|
+
word: match.pattern,
|
|
1276
|
+
start: match.start,
|
|
1277
|
+
end: match.end,
|
|
1278
|
+
originalWord: matchedText,
|
|
1279
|
+
});
|
|
1280
|
+
}
|
|
1281
|
+
}
|
|
1282
|
+
return results;
|
|
1283
|
+
}
|
|
1284
|
+
/**
|
|
1285
|
+
* Hybrid approach: Aho-Corasick for fast matching, Bloom Filter for validation
|
|
1286
|
+
*/
|
|
1287
|
+
findMatchesHybrid(searchText, originalText) {
|
|
1288
|
+
// Use Aho-Corasick for primary matching if available
|
|
1289
|
+
if (this.ahoCorasickAutomaton) {
|
|
1290
|
+
const matches = this.findMatchesWithAhoCorasick(searchText, originalText);
|
|
1291
|
+
// If Bloom Filter is enabled, validate matches
|
|
1292
|
+
if (this.bloomFilter) {
|
|
1293
|
+
return matches.filter((match) => this.bloomFilter.mightContain(match.word));
|
|
1294
|
+
}
|
|
1295
|
+
return matches;
|
|
1296
|
+
}
|
|
1297
|
+
// Fallback to Trie if Aho-Corasick not available
|
|
1298
|
+
const matches = [];
|
|
1299
|
+
this.findMatches(searchText, originalText, matches);
|
|
1300
|
+
// Validate with Bloom Filter if enabled
|
|
1301
|
+
if (this.bloomFilter) {
|
|
1302
|
+
return matches.filter((match) => this.bloomFilter.mightContain(match.word));
|
|
1303
|
+
}
|
|
1304
|
+
return matches;
|
|
1305
|
+
}
|
|
1306
|
+
/**
|
|
1307
|
+
* Apply context analysis to filter false positives
|
|
1308
|
+
*/
|
|
1309
|
+
applyContextAnalysis(text, matches, scoreThreshold = 0.5) {
|
|
1310
|
+
if (!this.contextAnalyzer) {
|
|
1311
|
+
return matches;
|
|
1312
|
+
}
|
|
1313
|
+
return matches.filter((match) => {
|
|
1314
|
+
const analysis = this.contextAnalyzer.analyzeContext(text, match.start, match.end, match.word);
|
|
1315
|
+
// If score is above threshold, it's likely profanity
|
|
1316
|
+
return analysis.score >= scoreThreshold;
|
|
1317
|
+
});
|
|
1318
|
+
}
|
|
1319
|
+
/**
|
|
1320
|
+
* Detects profanity in the provided text and returns comprehensive analysis.
|
|
1321
|
+
*
|
|
1322
|
+
* @param {string} text - The text to analyze for profanity
|
|
1323
|
+
* @returns {ProfanityDetectionResult} Detailed detection result including matches, positions, severity, and cleaned text
|
|
1324
|
+
*
|
|
1325
|
+
* @throws {TypeError} If text is not a string
|
|
1326
|
+
*
|
|
1327
|
+
* @remarks
|
|
1328
|
+
* ### Performance:
|
|
1329
|
+
* - Time Complexity: O(n*m) where n is text length, m is average word length in dictionary
|
|
1330
|
+
* - With Bloom Filter: O(n) average case (faster early rejection)
|
|
1331
|
+
* - With Caching: O(1) for repeated identical text
|
|
1332
|
+
*
|
|
1333
|
+
* ### Features:
|
|
1334
|
+
* - Detects leet speak variations (if enabled): "h3ll0" → "hello"
|
|
1335
|
+
* - Respects word boundaries (strict mode) or detects partial matches
|
|
1336
|
+
* - Returns exact positions for highlighting/masking
|
|
1337
|
+
* - Calculates severity based on match count and uniqueness
|
|
1338
|
+
*
|
|
1339
|
+
* ### Caching:
|
|
1340
|
+
* - Results are cached if `performance.enableCaching` is true
|
|
1341
|
+
* - Cache uses LRU eviction when size limit is reached
|
|
1342
|
+
*
|
|
1343
|
+
* @example
|
|
1344
|
+
* ```typescript
|
|
1345
|
+
* const filter = new BeKind();
|
|
1346
|
+
* const result = filter.detect("This has bad words");
|
|
1347
|
+
*
|
|
1348
|
+
* console.log(result.hasProfanity); // true
|
|
1349
|
+
* console.log(result.detectedWords); // ['bad']
|
|
1350
|
+
* console.log(result.cleanedText); // 'This has *** words'
|
|
1351
|
+
* console.log(result.severity); // ProfanitySeverity.MILD
|
|
1352
|
+
* console.log(result.positions); // [{ word: 'bad', start: 9, end: 12 }]
|
|
1353
|
+
* ```
|
|
1354
|
+
*
|
|
1355
|
+
* @example
|
|
1356
|
+
* ```typescript
|
|
1357
|
+
* // With leet speak detection
|
|
1358
|
+
* const filter = new BeKind({ enableLeetSpeak: true });
|
|
1359
|
+
* const result = filter.detect("st0p b3ing b@d");
|
|
1360
|
+
*
|
|
1361
|
+
* if (result.hasProfanity) {
|
|
1362
|
+
* result.positions.forEach(pos => {
|
|
1363
|
+
* console.log(`Found "${pos.word}" at position ${pos.start}-${pos.end}`);
|
|
1364
|
+
* });
|
|
1365
|
+
* }
|
|
1366
|
+
* ```
|
|
1367
|
+
*
|
|
1368
|
+
* @see {@link ProfanityDetectionResult} for result structure
|
|
1369
|
+
* @see {@link ProfanitySeverity} for severity levels
|
|
1370
|
+
*/
|
|
1371
|
+
detect(text) {
    var _a, _b, _c;
    const validatedText = validateString(text, "text");
    // Fast path: empty input returns an all-clear result without caching.
    if (validatedText.length === 0) {
        return {
            hasProfanity: false,
            detectedWords: [],
            cleanedText: validatedText,
            severity: ProfanitySeverity.MILD,
            positions: [],
            needsManualReview: false,
            flaggedAbhorrentWords: [],
            scoredWords: [],
            maxSeverity: null,
            suspiciousPhrases: [],
        };
    }
    // Check cache first if enabled
    if ((_a = this.resultCache) === null || _a === void 0 ? void 0 : _a.has(validatedText)) {
        return this.resultCache.get(validatedText);
    }
    // Reset temporary suspicious match storage
    // (findSeparatorTolerantMatches appends to this._suspiciousMatches as a side channel).
    this._suspiciousMatches = null;
    let matches = [];
    const normalizedText = this.caseSensitive
        ? validatedText
        : validatedText.toLowerCase();
    // Choose matching algorithm based on configuration
    // Leet-speak uses layered normalization: symbol-only first, then full,
    // so that letter→letter mappings (z→s) don't clobber legitimate letters.
    const leetVariants = this.getLeetVariants(normalizedText);
    switch (this.matchingAlgorithm) {
        case "aho-corasick":
            matches = this.findMatchesWithAhoCorasick(normalizedText, validatedText);
            for (const variant of leetVariants) {
                matches.push(...this.findMatchesWithAhoCorasick(variant, validatedText));
            }
            break;
        case "hybrid":
            matches = this.findMatchesHybrid(normalizedText, validatedText);
            for (const variant of leetVariants) {
                matches.push(...this.findMatchesHybrid(variant, validatedText));
            }
            break;
        case "trie":
        default:
            this.findMatches(normalizedText, validatedText, matches);
            for (const variant of leetVariants) {
                this.findMatches(variant, validatedText, matches);
            }
            break;
    }
    // Separator-tolerant matching: re-walk the trie but skip over separators
    if (this.separatorTolerance > 0) {
        this.findSeparatorTolerantMatches(normalizedText, validatedText, matches);
    }
    // Context analysis is handled via certainty-delta in shouldFlagWithContext()
    const allUniqueMatches = this.deduplicateMatches(matches);
    // Partition: certainty:0 matches become suspicious phrases, not profanity
    const uniqueMatches = allUniqueMatches.filter((m) => {
        const score = this.getWordScore(m.word);
        return !score || score.certainty !== 0;
    });
    const suspiciousFromCertaintyZero = allUniqueMatches.filter((m) => {
        const score = this.getWordScore(m.word);
        return score && score.certainty === 0;
    });
    const detectedWords = uniqueMatches.map((m) => m.originalWord);
    const severity = this.calculateSeverity(uniqueMatches);
    const cleanedText = this.generateCleanedText(validatedText, uniqueMatches);
    // Check for abhorrent words that need manual review
    const flaggedAbhorrentWords = uniqueMatches
        .filter((m) => this.abhorrentWords.has(m.word.toLowerCase()))
        .map((m) => m.originalWord);
    const uniqueAbhorrent = [...new Set(flaggedAbhorrentWords)];
    // Lazy document-level language detection — only computed if a collision word is matched
    // NOTE(review): this uses the raw `text` argument rather than `validatedText`;
    // they should be identical after validateString — confirm that assumption.
    let docSignal = null;
    function getDocSignal() {
        if (docSignal === null) {
            docSignal = {};
            const docResult = detectLanguages(text);
            for (const lang of docResult.languages) {
                docSignal[lang.language] = lang.proportion;
            }
        }
        return docSignal;
    }
    // Build scoredWords: PROFANE if shouldFlag(), AMBIVALENT otherwise
    // For embedded/substring matches, use the decayed scores for flag determination
    const scoredWords = uniqueMatches.map((m) => {
        var _a, _b;
        let wordSev;
        if (m.isSubstringMatch && m.decayedScore) {
            // Embedded matches carry a pre-decayed score from findEmbeddedMatches.
            const { severity, certainty } = m.decayedScore;
            const shouldFlagEmbedded = BeKind.shouldFlagWithCertainty(severity, certainty);
            wordSev = shouldFlagEmbedded ? WordSeverity.PROFANE : WordSeverity.AMBIVALENT;
        }
        else {
            // Check for cross-language innocence before standard shouldFlag
            const normalizedWord = m.word.toLowerCase();
            const innocentEntries = innocentWords[normalizedWord];
            if (innocentEntries && innocentEntries.length > 0) {
                const wordScore = this.getWordScore(m.word);
                if (wordScore) {
                    // Blend per-word and per-document language signals, weighting
                    // the document signal more heavily (1.5 vs 1.0).
                    const ds = getDocSignal();
                    const wordSignal = scoreWord(normalizedWord);
                    const DOC_WEIGHT = 1.5;
                    const WORD_WEIGHT = 1.0;
                    const TOTAL_WEIGHT = DOC_WEIGHT + WORD_WEIGHT;
                    const amplified = {};
                    for (const lang of new Set([...Object.keys(wordSignal), ...Object.keys(ds)])) {
                        const lk = lang;
                        amplified[lang] = (((_a = wordSignal[lk]) !== null && _a !== void 0 ? _a : 0) * WORD_WEIGHT + ((_b = ds[lk]) !== null && _b !== void 0 ? _b : 0) * DOC_WEIGHT) / TOTAL_WEIGHT;
                    }
                    let adjustedCertainty = adjustCertaintyForLanguage(wordScore.certainty, wordScore.language, innocentEntries, amplified);
                    // Apply context-based certainty delta on top of language adjustment
                    if (this.contextAnalyzer) {
                        const delta = this.contextAnalyzer.getCertaintyDelta(validatedText, m.start, m.end, m.word);
                        // Clamp the adjusted certainty to the valid [0, 5] range.
                        adjustedCertainty = Math.max(0, Math.min(5, adjustedCertainty + delta));
                    }
                    const adjustedShouldFlag = BeKind.shouldFlagWithCertainty(wordScore.severity, adjustedCertainty);
                    wordSev = adjustedShouldFlag ? WordSeverity.PROFANE : WordSeverity.AMBIVALENT;
                }
                else {
                    wordSev = this.shouldFlagWithContext(m.word, validatedText, m.start, m.end) ? WordSeverity.PROFANE : WordSeverity.AMBIVALENT;
                }
            }
            else {
                wordSev = this.shouldFlagWithContext(m.word, validatedText, m.start, m.end) ? WordSeverity.PROFANE : WordSeverity.AMBIVALENT;
            }
        }
        return { word: m.originalWord, severity: wordSev };
    });
    const maxSeverity = scoredWords.length > 0
        ? Math.max(...scoredWords.map((sw) => sw.severity))
        : null;
    const rawSuspicious = (_b = this._suspiciousMatches) !== null && _b !== void 0 ? _b : [];
    const suspiciousPhrases = rawSuspicious.map((sm) => {
        const score = this.getWordScore(sm.word);
        // Unknown words default to the mildest non-zero base score.
        const baseScore = score
            ? { severity: score.severity, certainty: score.certainty }
            : { severity: 1, certainty: 1 };
        const context = this.extractSurroundingContext(validatedText, sm.start, sm.end, 5);
        return {
            word: sm.word,
            originalText: sm.originalWord,
            context,
            start: sm.start,
            end: sm.end,
            baseScore,
            spaceBoundaries: sm.spaceBoundaries,
        };
    });
    this._suspiciousMatches = null;
    // Append certainty:0 matches as suspicious phrases
    for (const m of suspiciousFromCertaintyZero) {
        const score = this.getWordScore(m.word);
        const context = this.extractSurroundingContext(validatedText, m.start, m.end, 5);
        suspiciousPhrases.push({
            word: m.word,
            originalText: m.originalWord,
            context,
            start: m.start,
            end: m.end,
            baseScore: { severity: (_c = score === null || score === void 0 ? void 0 : score.severity) !== null && _c !== void 0 ? _c : 1, certainty: 0 },
            spaceBoundaries: 0,
        });
    }
    // sensitiveMode (default: false) controls whether AMBIVALENT words trigger hasProfanity.
    // When sensitiveMode is true, any match (including AMBIVALENT cross-language collisions
    // like "bitte" in German text) counts as profanity.
    // When false (default), only PROFANE-scored words count.
    const hasProfane = this.sensitiveMode
        ? uniqueMatches.length > 0
        : scoredWords.some((sw) => sw.severity === WordSeverity.PROFANE);
    const result = {
        hasProfanity: hasProfane,
        detectedWords,
        cleanedText,
        severity,
        positions: uniqueMatches.map((m) => ({
            word: m.originalWord,
            start: m.start,
            end: m.end,
        })),
        needsManualReview: uniqueAbhorrent.length > 0,
        flaggedAbhorrentWords: uniqueAbhorrent,
        scoredWords,
        maxSeverity,
        suspiciousPhrases,
    };
    // Cache result if caching is enabled
    if (this.resultCache) {
        this.resultCache.set(validatedText, result);
        // Implement simple LRU by clearing cache when it gets too large
        // NOTE(review): Map insertion order makes this FIFO eviction, not true
        // LRU (reads don't refresh recency) — acceptable, but the comment overstates it.
        if (this.resultCache.size > 1000) {
            const firstKey = this.resultCache.keys().next().value;
            if (firstKey !== undefined) {
                this.resultCache.delete(firstKey);
            }
        }
    }
    return result;
}
|
|
1575
|
+
/**
|
|
1576
|
+
* Main matching function, with whole-word logic.
|
|
1577
|
+
* @param searchText - The normalized text to search.
|
|
1578
|
+
* @param originalText - The original text.
|
|
1579
|
+
* @param matches - Array to collect matches.
|
|
1580
|
+
*/
|
|
1581
|
+
findMatches(searchText, originalText, matches) {
|
|
1582
|
+
const boundaryMatchedRanges = [];
|
|
1583
|
+
// Compute doc-level language signal once for all embed checks
|
|
1584
|
+
let docLangSignal;
|
|
1585
|
+
const getDocLang = () => {
|
|
1586
|
+
if (!docLangSignal) {
|
|
1587
|
+
const detected = detectLanguages(originalText, { maxLanguages: 3 });
|
|
1588
|
+
docLangSignal = {};
|
|
1589
|
+
for (const lang of detected.languages) {
|
|
1590
|
+
docLangSignal[lang.language] = lang.proportion;
|
|
1591
|
+
}
|
|
1592
|
+
}
|
|
1593
|
+
return docLangSignal;
|
|
1594
|
+
};
|
|
1595
|
+
for (let i = 0; i < searchText.length; i++) {
|
|
1596
|
+
const matchResults = this.profanityTrie.findMatches(searchText, i, this.detectPartialWords);
|
|
1597
|
+
for (const match of matchResults) {
|
|
1598
|
+
const start = i + match.start;
|
|
1599
|
+
const end = i + match.end;
|
|
1600
|
+
const isWhole = this.isWholeWord(originalText, start, end);
|
|
1601
|
+
if (!this.detectPartialWords && !isWhole) {
|
|
1602
|
+
if (!this.isHighCoverageEmbed(originalText, start, end, match.word, getDocLang())) {
|
|
1603
|
+
continue;
|
|
1604
|
+
}
|
|
1605
|
+
}
|
|
1606
|
+
const matchedText = originalText.substring(start, end);
|
|
1607
|
+
if (this.isWhitelistedMatch(match.word, matchedText)) {
|
|
1608
|
+
continue;
|
|
1609
|
+
}
|
|
1610
|
+
if (this.hasWordBoundaries(originalText, start, end)) {
|
|
1611
|
+
matches.push({
|
|
1612
|
+
word: match.word,
|
|
1613
|
+
start,
|
|
1614
|
+
end,
|
|
1615
|
+
originalWord: matchedText,
|
|
1616
|
+
});
|
|
1617
|
+
boundaryMatchedRanges.push({ start, end });
|
|
1618
|
+
}
|
|
1619
|
+
}
|
|
1620
|
+
}
|
|
1621
|
+
// Embedded profanity detection: find profane substrings inside words
|
|
1622
|
+
// that weren't caught by word-boundary matching
|
|
1623
|
+
if (this.embeddedProfanityDetection) {
|
|
1624
|
+
this.findEmbeddedMatches(searchText, originalText, matches, boundaryMatchedRanges);
|
|
1625
|
+
}
|
|
1626
|
+
}
|
|
1627
|
+
/**
|
|
1628
|
+
* Walk the trie while tolerating separator characters between letters.
|
|
1629
|
+
* Catches evasion patterns: "fu ck", "c.u.n.t", "fu@ck@cu@nt@bi@tch"
|
|
1630
|
+
*
|
|
1631
|
+
* Symbol separators (@, ., -, etc.) are skipped freely.
|
|
1632
|
+
* Space separators reduce certainty by SPACE_CERTAINTY_PENALTY per gap.
|
|
1633
|
+
* Matches that drop below the flagging threshold become "suspicious" instead.
|
|
1634
|
+
*/
|
|
1635
|
+
findSeparatorTolerantMatches(searchText, originalText, matches) {
|
|
1636
|
+
const alreadyFound = new Set(matches.map((m) => m.word.toLowerCase()));
|
|
1637
|
+
const maxSkip = this.separatorTolerance;
|
|
1638
|
+
for (let i = 0; i < searchText.length; i++) {
|
|
1639
|
+
// Only start walks from non-separator characters at word-boundary positions
|
|
1640
|
+
if (BeKind.isSeparator(searchText[i]))
|
|
1641
|
+
continue;
|
|
1642
|
+
if (i > 0 && /\w/.test(searchText[i - 1]))
|
|
1643
|
+
continue;
|
|
1644
|
+
const found = this.walkTrieWithSeparators(this.profanityTrie, searchText, i, maxSkip, 0);
|
|
1645
|
+
for (const { word, endPos, anySeparatorSkipped, spaceBoundaries } of found) {
|
|
1646
|
+
// Only report if separators were actually skipped (normal matching handles the rest)
|
|
1647
|
+
if (!anySeparatorSkipped)
|
|
1648
|
+
continue;
|
|
1649
|
+
// Require minimum word length of 3 to avoid short false positives
|
|
1650
|
+
if (word.length < 3)
|
|
1651
|
+
continue;
|
|
1652
|
+
if (alreadyFound.has(word.toLowerCase()))
|
|
1653
|
+
continue;
|
|
1654
|
+
if (this.isWhitelistedMatch(word, originalText.substring(i, endPos)))
|
|
1655
|
+
continue;
|
|
1656
|
+
alreadyFound.add(word.toLowerCase());
|
|
1657
|
+
// All separator-tolerant matches are suspicious only for now.
|
|
1658
|
+
// They're captured with context for review but don't flag as profanity.
|
|
1659
|
+
if (!this._suspiciousMatches)
|
|
1660
|
+
this._suspiciousMatches = [];
|
|
1661
|
+
this._suspiciousMatches.push({
|
|
1662
|
+
word,
|
|
1663
|
+
start: i,
|
|
1664
|
+
end: endPos,
|
|
1665
|
+
originalWord: originalText.substring(i, endPos),
|
|
1666
|
+
spaceBoundaries,
|
|
1667
|
+
});
|
|
1668
|
+
}
|
|
1669
|
+
}
|
|
1670
|
+
}
|
|
1671
|
+
/**
 * Recursively walk the trie from a given node, skipping separator chars.
 * Tracks space boundaries crossed (for certainty penalty) separately from
 * symbol separators (which are free to skip).
 *
 * @param node - Current trie node to continue the walk from.
 * @param text - The text being scanned.
 * @param pos - Current character index in `text`.
 * @param maxSkip - Maximum separators skippable in one consecutive run.
 * @param spaceBoundaries - Whitespace gaps crossed so far on this walk.
 * @param totalSkips - Total separator characters skipped so far (default 0).
 * @returns All dictionary words completed along this walk, each with its end
 *   position, whether any separator was skipped, and the space-gap count.
 */
walkTrieWithSeparators(node, text, pos, maxSkip, spaceBoundaries, totalSkips = 0) {
    const results = [];
    if (pos >= text.length) {
        // End of text: record a word only if the walk is at a terminal node.
        if (node.isEndOfWord) {
            results.push({ word: node.word, endPos: pos, anySeparatorSkipped: totalSkips > 0, spaceBoundaries });
        }
        return results;
    }
    const char = text[pos];
    // Try matching the character directly in the trie
    const nextNode = node.getChild(char);
    if (nextNode) {
        if (nextNode.isEndOfWord) {
            results.push({ word: nextNode.word, endPos: pos + 1, anySeparatorSkipped: totalSkips > 0, spaceBoundaries });
        }
        // Continue the walk past this character; longer words may still match.
        results.push(...this.walkTrieWithSeparators(nextNode, text, pos + 1, maxSkip, spaceBoundaries, totalSkips));
    }
    // If current char is a separator, skip over consecutive separators
    if (BeKind.isSeparator(char)) {
        let skipCount = 0;
        let skipPos = pos;
        let hasSpace = false;
        // Consume up to maxSkip consecutive separators, noting whitespace.
        while (skipPos < text.length && BeKind.isSeparator(text[skipPos]) && skipCount < maxSkip) {
            if (BeKind.isWhitespaceSeparator(text[skipPos]))
                hasSpace = true;
            skipPos++;
            skipCount++;
        }
        // Resume the walk at the same trie node after the separator run; a run
        // containing any whitespace counts as one crossed space boundary.
        if (skipPos < text.length && skipCount > 0) {
            const newSpaceBoundaries = spaceBoundaries + (hasSpace ? 1 : 0);
            results.push(...this.walkTrieWithSeparators(node, text, skipPos, maxSkip, newSpaceBoundaries, totalSkips + skipCount));
        }
    }
    return results;
}
|
|
1711
|
+
/**
 * Find profane substrings embedded inside larger words with certainty decay.
 *
 * Formula: decayed_c = base_c * (DECAY_RATE ^ extra_chars) * (profane_len / host_word_len)
 *
 * Multi-profanity bonus: if a host word contains multiple profane substrings,
 * certainty is boosted (sum of base severities used as multiplier, capped at c:5).
 *
 * Unusually long words (12+ chars) containing profanity get a certainty bonus
 * since legitimate words rarely exceed this length.
 *
 * @param searchText - Normalized text to scan for host words.
 * @param originalText - Original text for surface-form extraction.
 * @param matches - Output array, mutated in place with substring matches.
 * @param alreadyMatched - Ranges already found by boundary matching (skipped).
 */
findEmbeddedMatches(searchText, originalText, matches, alreadyMatched) {
    // Extract individual words from text with their positions
    // (Latin + extended Latin, Cyrillic, Arabic, CJK, and Hangul ranges).
    const wordPattern = /[a-zA-Z\u00C0-\u024F\u0400-\u04FF\u0600-\u06FF\u3000-\u9FFF\uAC00-\uD7AF]+/g;
    let wordMatch;
    while ((wordMatch = wordPattern.exec(searchText)) !== null) {
        const hostWord = wordMatch[0];
        const hostStart = wordMatch.index;
        const hostEnd = hostStart + hostWord.length;
        // Skip if this word was already fully matched by boundary detection
        const fullyMatched = alreadyMatched.some((r) => r.start <= hostStart && r.end >= hostEnd);
        if (fullyMatched)
            continue;
        // Find all profane substrings within this word
        const embeddedFinds = [];
        for (let i = 0; i < hostWord.length; i++) {
            // Partial-word matching is forced on (third argument true) since we
            // are explicitly looking for substrings inside the host word.
            const subMatches = this.profanityTrie.findMatches(hostWord.toLowerCase(), i, true);
            for (const sub of subMatches) {
                const subStart = hostStart + i + sub.start;
                const subEnd = hostStart + i + sub.end;
                // Skip if this exact range was already boundary-matched
                const alreadyCovered = alreadyMatched.some((r) => r.start === subStart && r.end === subEnd);
                if (alreadyCovered)
                    continue;
                // Only scored dictionary words participate in decay scoring.
                const score = this.wordScores[sub.word];
                if (!score)
                    continue;
                embeddedFinds.push({
                    word: sub.word,
                    start: subStart,
                    end: subEnd,
                    baseSeverity: score.severity,
                    baseCertainty: score.certainty,
                });
            }
        }
        if (embeddedFinds.length === 0)
            continue;
        // Deduplicate: keep longest match at each position
        const dedupedFinds = this.deduplicateEmbeddedFinds(embeddedFinds);
        // Multi-profanity bonus: if multiple distinct profane roots found, boost certainty
        const multiBonus = dedupedFinds.length >= 2
            ? Math.min(dedupedFinds.length * 0.5, 2.0) // +0.5 per extra root, cap +2
            : 0;
        // Unusually long word bonus (12+ chars with profanity = likely evasion)
        const lengthBonus = hostWord.length >= 12 ? 1.0 : 0;
        for (const find of dedupedFinds) {
            const profaneLen = find.word.length;
            const extraChars = hostWord.length - profaneLen;
            // Exponential decay per extra (non-profane) character in the host word.
            const decayFactor = Math.pow(BeKind.EMBEDDED_DECAY_RATE, extraChars);
            const lengthRatio = profaneLen / hostWord.length;
            let decayedCertainty = find.baseCertainty * decayFactor * lengthRatio + multiBonus + lengthBonus;
            // Round and clamp to the valid certainty range [1, 5].
            decayedCertainty = Math.round(Math.max(1, Math.min(5, decayedCertainty)));
            // Below the floor the match is too weak to report at all.
            if (decayedCertainty < BeKind.EMBEDDED_MIN_CERTAINTY)
                continue;
            const matchedText = originalText.substring(find.start, find.end);
            matches.push({
                word: find.word,
                start: find.start,
                end: find.end,
                originalWord: matchedText,
                isSubstringMatch: true,
                decayedScore: { severity: find.baseSeverity, certainty: decayedCertainty },
            });
        }
    }
}
|
|
1788
|
+
/**
|
|
1789
|
+
* Deduplicate embedded finds: at overlapping positions, keep the longest match.
|
|
1790
|
+
*/
|
|
1791
|
+
deduplicateEmbeddedFinds(finds) {
|
|
1792
|
+
// Sort by start, then by length descending
|
|
1793
|
+
const sorted = [...finds].sort((a, b) => a.start - b.start || (b.end - b.start) - (a.end - a.start));
|
|
1794
|
+
const result = [];
|
|
1795
|
+
let lastEnd = -1;
|
|
1796
|
+
for (const find of sorted) {
|
|
1797
|
+
// Skip if fully contained within a previous match
|
|
1798
|
+
if (find.start >= lastEnd || find.end > lastEnd) {
|
|
1799
|
+
result.push(find);
|
|
1800
|
+
lastEnd = Math.max(lastEnd, find.end);
|
|
1801
|
+
}
|
|
1802
|
+
}
|
|
1803
|
+
return result;
|
|
1804
|
+
}
|
|
1805
|
+
/**
|
|
1806
|
+
* Generate cleaned text by replacing profane words.
|
|
1807
|
+
* @param originalText - The original text.
|
|
1808
|
+
* @param matches - Array of matches.
|
|
1809
|
+
* @returns Cleaned text.
|
|
1810
|
+
*/
|
|
1811
|
+
generateCleanedText(originalText, matches) {
|
|
1812
|
+
if (matches.length === 0)
|
|
1813
|
+
return originalText;
|
|
1814
|
+
let result = originalText;
|
|
1815
|
+
const sortedMatches = [...this.deduplicateMatches(matches)].sort((a, b) => b.start - a.start);
|
|
1816
|
+
for (const match of sortedMatches) {
|
|
1817
|
+
const replacement = this.defaultPlaceholder.repeat(match.originalWord.length);
|
|
1818
|
+
result =
|
|
1819
|
+
result.substring(0, match.start) +
|
|
1820
|
+
replacement +
|
|
1821
|
+
result.substring(match.end);
|
|
1822
|
+
}
|
|
1823
|
+
return result;
|
|
1824
|
+
}
|
|
1825
|
+
/**
|
|
1826
|
+
* Quick boolean check for profanity presence in text.
|
|
1827
|
+
*
|
|
1828
|
+
* @param {string} text - The text to check for profanity
|
|
1829
|
+
* @returns {boolean} True if profanity is detected, false otherwise
|
|
1830
|
+
*
|
|
1831
|
+
* @throws {TypeError} If text is not a string
|
|
1832
|
+
*
|
|
1833
|
+
* @remarks
|
|
1834
|
+
* - Convenience method that internally calls `detect()` and returns only the boolean result
|
|
1835
|
+
* - For detailed information about matches, use `detect()` instead
|
|
1836
|
+
* - Results are cached if caching is enabled (same cache as `detect()`)
|
|
1837
|
+
*
|
|
1838
|
+
* @example
|
|
1839
|
+
* ```typescript
|
|
1840
|
+
* const filter = new BeKind();
|
|
1841
|
+
*
|
|
1842
|
+
* if (filter.check("This has bad words")) {
|
|
1843
|
+
* console.log("Profanity detected!");
|
|
1844
|
+
* }
|
|
1845
|
+
*
|
|
1846
|
+
* // Quick validation
|
|
1847
|
+
* const isClean = !filter.check(userInput);
|
|
1848
|
+
* ```
|
|
1849
|
+
*
|
|
1850
|
+
* @see {@link detect} for detailed profanity analysis
|
|
1851
|
+
*/
|
|
1852
|
+
check(text) {
|
|
1853
|
+
return this.detect(text).hasProfanity;
|
|
1854
|
+
}
|
|
1855
|
+
/**
|
|
1856
|
+
* Cleans text by replacing profanity with a placeholder character.
|
|
1857
|
+
*
|
|
1858
|
+
* @param {string} text - The text to clean
|
|
1859
|
+
* @param {string} [placeholder] - Optional custom placeholder character (uses default if not provided)
|
|
1860
|
+
* @returns {string} The cleaned text with profanity replaced
|
|
1861
|
+
*
|
|
1862
|
+
* @throws {TypeError} If text is not a string
|
|
1863
|
+
*
|
|
1864
|
+
* @remarks
|
|
1865
|
+
* ### Character-level Replacement:
|
|
1866
|
+
* - Each profane character is replaced individually
|
|
1867
|
+
* - "bad" with placeholder "*" becomes "***"
|
|
1868
|
+
* - Preserves text length and structure
|
|
1869
|
+
*
|
|
1870
|
+
* ### Placeholder Behavior:
|
|
1871
|
+
* - If no placeholder provided, uses the instance's default placeholder
|
|
1872
|
+
* - If placeholder provided, uses only the first character
|
|
1873
|
+
* - Empty placeholder throws error
|
|
1874
|
+
*
|
|
1875
|
+
* @example
|
|
1876
|
+
* ```typescript
|
|
1877
|
+
* const filter = new BeKind();
|
|
1878
|
+
*
|
|
1879
|
+
* // Using default placeholder (*)
|
|
1880
|
+
* const cleaned = filter.clean("This has bad words");
|
|
1881
|
+
* console.log(cleaned); // "This has *** *****"
|
|
1882
|
+
*
|
|
1883
|
+
* // Using custom placeholder
|
|
1884
|
+
* const cleaned = filter.clean("This has bad words", "#");
|
|
1885
|
+
* console.log(cleaned); // "This has ### #####"
|
|
1886
|
+
* ```
|
|
1887
|
+
*
|
|
1888
|
+
* @example
|
|
1889
|
+
* ```typescript
|
|
1890
|
+
* // Clean user-generated content for display
|
|
1891
|
+
* const userComment = "Some inappropriate words here";
|
|
1892
|
+
* const safeComment = filter.clean(userComment);
|
|
1893
|
+
* displayComment(safeComment);
|
|
1894
|
+
* ```
|
|
1895
|
+
*
|
|
1896
|
+
* @see {@link cleanWithPlaceholder} for word-level replacement
|
|
1897
|
+
* @see {@link setPlaceholder} to change default placeholder
|
|
1898
|
+
*/
|
|
1899
|
+
clean(text, placeholder) {
|
|
1900
|
+
const detection = this.detect(text);
|
|
1901
|
+
if (!placeholder || placeholder === this.defaultPlaceholder) {
|
|
1902
|
+
return detection.cleanedText;
|
|
1903
|
+
}
|
|
1904
|
+
let result = text;
|
|
1905
|
+
const sortedPositions = [
|
|
1906
|
+
...this.deduplicateMatches(detection.positions.map((p) => ({
|
|
1907
|
+
word: p.word,
|
|
1908
|
+
start: p.start,
|
|
1909
|
+
end: p.end,
|
|
1910
|
+
originalWord: text.substring(p.start, p.end),
|
|
1911
|
+
}))),
|
|
1912
|
+
].sort((a, b) => b.start - a.start);
|
|
1913
|
+
for (const pos of sortedPositions) {
|
|
1914
|
+
const originalWord = text.substring(pos.start, pos.end);
|
|
1915
|
+
const replacement = placeholder.repeat(originalWord.length);
|
|
1916
|
+
result =
|
|
1917
|
+
result.substring(0, pos.start) +
|
|
1918
|
+
replacement +
|
|
1919
|
+
result.substring(pos.end);
|
|
1920
|
+
}
|
|
1921
|
+
return result;
|
|
1922
|
+
}
|
|
1923
|
+
/**
|
|
1924
|
+
* Cleans text by replacing each profane word with a single placeholder string (word-level replacement).
|
|
1925
|
+
*
|
|
1926
|
+
* @param {string} text - The text to clean
|
|
1927
|
+
* @param {string} [placeholder="***"] - The placeholder string to use for each profane word
|
|
1928
|
+
* @returns {string} The cleaned text with each profane word replaced by the placeholder
|
|
1929
|
+
*
|
|
1930
|
+
* @throws {TypeError} If text is not a string
|
|
1931
|
+
*
|
|
1932
|
+
* @remarks
|
|
1933
|
+
* ### Word-level Replacement:
|
|
1934
|
+
* - Each profane word is replaced with the entire placeholder string (not character-by-character)
|
|
1935
|
+
* - "bad words" with placeholder "***" becomes "*** ***"
|
|
1936
|
+
* - Does NOT preserve original text length
|
|
1937
|
+
*
|
|
1938
|
+
* ### Difference from `clean()`:
|
|
1939
|
+
* - `clean()`: Character-level replacement - "bad" becomes "***" (preserves length)
|
|
1940
|
+
* - `cleanWithPlaceholder()`: Word-level replacement - "bad" becomes "***" (fixed placeholder)
|
|
1941
|
+
*
|
|
1942
|
+
* @example
|
|
1943
|
+
* ```typescript
|
|
1944
|
+
* const filter = new BeKind();
|
|
1945
|
+
*
|
|
1946
|
+
* // Default placeholder (***) const text = "This has bad words";
|
|
1947
|
+
* const cleaned = filter.cleanWithPlaceholder(text);
|
|
1948
|
+
* console.log(cleaned); // "This has *** ***"
|
|
1949
|
+
*
|
|
1950
|
+
* // Custom placeholder
|
|
1951
|
+
* const cleaned2 = filter.cleanWithPlaceholder(text, "[CENSORED]");
|
|
1952
|
+
* console.log(cleaned2); // "This has [CENSORED] [CENSORED]"
|
|
1953
|
+
* ```
|
|
1954
|
+
*
|
|
1955
|
+
* @example
|
|
1956
|
+
* ```typescript
|
|
1957
|
+
* // Censoring chat messages
|
|
1958
|
+
* const message = "You are a badword and stupid";
|
|
1959
|
+
* const censored = filter.cleanWithPlaceholder(message, "[***]");
|
|
1960
|
+
* // Result: "You are a [***] and [***]"
|
|
1961
|
+
* ```
|
|
1962
|
+
*
|
|
1963
|
+
* @see {@link clean} for character-level replacement
|
|
1964
|
+
*/
|
|
1965
|
+
cleanWithPlaceholder(text, placeholder = "***") {
|
|
1966
|
+
const detection = this.detect(text);
|
|
1967
|
+
if (detection.positions.length === 0)
|
|
1968
|
+
return text;
|
|
1969
|
+
let result = text;
|
|
1970
|
+
const sortedPositions = [
|
|
1971
|
+
...this.deduplicateMatches(detection.positions.map((p) => ({
|
|
1972
|
+
word: p.word,
|
|
1973
|
+
start: p.start,
|
|
1974
|
+
end: p.end,
|
|
1975
|
+
originalWord: text.substring(p.start, p.end),
|
|
1976
|
+
}))),
|
|
1977
|
+
].sort((a, b) => b.start - a.start);
|
|
1978
|
+
for (const pos of sortedPositions) {
|
|
1979
|
+
if (!this.isWholeWord(result, pos.start, pos.end))
|
|
1980
|
+
continue;
|
|
1981
|
+
result =
|
|
1982
|
+
result.substring(0, pos.start) +
|
|
1983
|
+
placeholder +
|
|
1984
|
+
result.substring(pos.end);
|
|
1985
|
+
}
|
|
1986
|
+
return result;
|
|
1987
|
+
}
|
|
1988
|
+
/**
|
|
1989
|
+
* Dynamically adds one or more words to the profanity filter at runtime.
|
|
1990
|
+
*
|
|
1991
|
+
* @param {string | string[]} word - A single word or array of words to add to the filter
|
|
1992
|
+
* @returns {void}
|
|
1993
|
+
*
|
|
1994
|
+
* @remarks
|
|
1995
|
+
* ### Behavior:
|
|
1996
|
+
* - Words are added to all active data structures (Trie, Aho-Corasick, Bloom Filter)
|
|
1997
|
+
* - Automatically normalizes words based on caseSensitive setting
|
|
1998
|
+
* - Skips whitelisted words
|
|
1999
|
+
* - Validates and filters out non-string or empty values
|
|
2000
|
+
* - Changes take effect immediately for subsequent detect/check/clean calls
|
|
2001
|
+
*
|
|
2002
|
+
* ### Use Cases:
|
|
2003
|
+
* - Adding context-specific profanity
|
|
2004
|
+
* - Building dynamic word lists from user reports
|
|
2005
|
+
* - Customizing filters for specific communities/applications
|
|
2006
|
+
*
|
|
2007
|
+
* @example
|
|
2008
|
+
* ```typescript
|
|
2009
|
+
* const filter = new BeKind();
|
|
2010
|
+
*
|
|
2011
|
+
* // Add single word
|
|
2012
|
+
* filter.add('newbadword');
|
|
2013
|
+
*
|
|
2014
|
+
* // Add multiple words
|
|
2015
|
+
* filter.add(['word1', 'word2', 'word3']);
|
|
2016
|
+
*
|
|
2017
|
+
* // Now these words will be detected
|
|
2018
|
+
* filter.check('newbadword'); // true
|
|
2019
|
+
* ```
|
|
2020
|
+
*
|
|
2021
|
+
* @example
|
|
2022
|
+
* ```typescript
|
|
2023
|
+
* // Add game-specific slang dynamically
|
|
2024
|
+
* const filter = new BeKind();
|
|
2025
|
+
* const gamingSlang = ['noob', 'trash', 'tryhard'];
|
|
2026
|
+
* filter.add(gamingSlang);
|
|
2027
|
+
*
|
|
2028
|
+
* const message = "You're such a noob";
|
|
2029
|
+
* console.log(filter.check(message)); // true
|
|
2030
|
+
* ```
|
|
2031
|
+
*
|
|
2032
|
+
* @see {@link remove} to remove words
|
|
2033
|
+
* @see {@link loadCustomDictionary} for loading named dictionaries
|
|
2034
|
+
*/
|
|
2035
|
+
add(word) {
|
|
2036
|
+
const words = Array.isArray(word) ? word : [word];
|
|
2037
|
+
const validatedWords = validateStringArray(words, "words to add");
|
|
2038
|
+
for (const w of validatedWords) {
|
|
2039
|
+
this.dynamicWords.add(w);
|
|
2040
|
+
this.addWordToTrie(w);
|
|
2041
|
+
}
|
|
2042
|
+
}
|
|
2043
|
+
/**
|
|
2044
|
+
* Dynamically removes one or more words from the profanity filter at runtime.
|
|
2045
|
+
*
|
|
2046
|
+
* @param {string | string[]} word - A single word or array of words to remove from the filter
|
|
2047
|
+
* @returns {void}
|
|
2048
|
+
*
|
|
2049
|
+
* @remarks
|
|
2050
|
+
* ### Behavior:
|
|
2051
|
+
* - Removes words from all active data structures (Trie, dynamic words set)
|
|
2052
|
+
* - Normalizes words based on caseSensitive setting before removal
|
|
2053
|
+
* - Only removes dynamically added words, not words from loaded language dictionaries
|
|
2054
|
+
* - Changes take effect immediately for subsequent detect/check/clean calls
|
|
2055
|
+
*
|
|
2056
|
+
* ### Important Notes:
|
|
2057
|
+
* - Cannot remove words from built-in language dictionaries
|
|
2058
|
+
* - To exclude dictionary words, use `addToWhitelist()` instead
|
|
2059
|
+
* - Validates and filters out non-string or empty values
|
|
2060
|
+
*
|
|
2061
|
+
* @example
|
|
2062
|
+
* ```typescript
|
|
2063
|
+
* const filter = new BeKind();
|
|
2064
|
+
*
|
|
2065
|
+
* // Add then remove a word
|
|
2066
|
+
* filter.add('tempword');
|
|
2067
|
+
* filter.check('tempword'); // true
|
|
2068
|
+
*
|
|
2069
|
+
* filter.remove('tempword');
|
|
2070
|
+
* filter.check('tempword'); // false
|
|
2071
|
+
*
|
|
2072
|
+
* // Remove multiple words
|
|
2073
|
+
* filter.remove(['word1', 'word2']);
|
|
2074
|
+
* ```
|
|
2075
|
+
*
|
|
2076
|
+
* @example
|
|
2077
|
+
* ```typescript
|
|
2078
|
+
* // Managing custom word list
|
|
2079
|
+
* const filter = new BeKind();
|
|
2080
|
+
* filter.add(['custom1', 'custom2', 'custom3']);
|
|
2081
|
+
*
|
|
2082
|
+
* // Later, remove one that's no longer needed
|
|
2083
|
+
* filter.remove('custom2');
|
|
2084
|
+
* ```
|
|
2085
|
+
*
|
|
2086
|
+
* @see {@link add} to add words
|
|
2087
|
+
* @see {@link addToWhitelist} to exclude dictionary words without removing them
|
|
2088
|
+
*/
|
|
2089
|
+
remove(word) {
|
|
2090
|
+
const words = Array.isArray(word) ? word : [word];
|
|
2091
|
+
const validatedWords = validateStringArray(words, "words to remove");
|
|
2092
|
+
for (const w of validatedWords) {
|
|
2093
|
+
const normalizedWord = this.caseSensitive ? w : w.toLowerCase();
|
|
2094
|
+
this.profanityTrie.removeWord(normalizedWord);
|
|
2095
|
+
this.dynamicWords.delete(w);
|
|
2096
|
+
}
|
|
2097
|
+
}
|
|
2098
|
+
/**
|
|
2099
|
+
* Add words to the whitelist.
|
|
2100
|
+
* @param words - Words to whitelist.
|
|
2101
|
+
*/
|
|
2102
|
+
addToWhitelist(words) {
|
|
2103
|
+
const validatedWords = validateStringArray(words, "whitelist words");
|
|
2104
|
+
for (const word of validatedWords) {
|
|
2105
|
+
const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
|
|
2106
|
+
this.whitelistSet.add(normalizedWord);
|
|
2107
|
+
}
|
|
2108
|
+
}
|
|
2109
|
+
/**
|
|
2110
|
+
* Remove words from the whitelist.
|
|
2111
|
+
* @param words - Words to remove from whitelist.
|
|
2112
|
+
*/
|
|
2113
|
+
removeFromWhitelist(words) {
|
|
2114
|
+
const validatedWords = validateStringArray(words, "whitelist words");
|
|
2115
|
+
for (const word of validatedWords) {
|
|
2116
|
+
const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
|
|
2117
|
+
this.whitelistSet.delete(normalizedWord);
|
|
2118
|
+
}
|
|
2119
|
+
}
|
|
2120
|
+
/**
|
|
2121
|
+
* Check if a word is whitelisted.
|
|
2122
|
+
* @param word - The word to check.
|
|
2123
|
+
* @returns True if whitelisted, false otherwise.
|
|
2124
|
+
*/
|
|
2125
|
+
isWhitelisted(word) {
|
|
2126
|
+
const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
|
|
2127
|
+
return this.whitelistSet.has(normalizedWord);
|
|
2128
|
+
}
|
|
2129
|
+
/**
|
|
2130
|
+
* Loads a built-in language dictionary into the profanity filter.
|
|
2131
|
+
*
|
|
2132
|
+
* @param {string} language - The language key to load (case-insensitive)
|
|
2133
|
+
* @returns {boolean} True if language was loaded successfully, false if not found or already loaded
|
|
2134
|
+
*
|
|
2135
|
+
* @remarks
|
|
2136
|
+
* ### Available Languages:
|
|
2137
|
+
* - `'english'` - English profanity words
|
|
2138
|
+
* - `'hindi'` - Hindi profanity words
|
|
2139
|
+
* - `'french'` - French profanity words
|
|
2140
|
+
* - `'german'` - German profanity words
|
|
2141
|
+
* - `'spanish'` - Spanish profanity words
|
|
2142
|
+
* - `'bengali'` - Bengali profanity words
|
|
2143
|
+
* - `'tamil'` - Tamil profanity words
|
|
2144
|
+
* - `'telugu'` - Telugu profanity words
|
|
2145
|
+
* - `'brazilian'` - Brazilian Portuguese profanity words
|
|
2146
|
+
*
|
|
2147
|
+
* ### Behavior:
|
|
2148
|
+
* - Language keys are case-insensitive
|
|
2149
|
+
* - Loading is idempotent - calling multiple times for same language is safe
|
|
2150
|
+
* - Returns true if language loaded successfully or was already loaded
|
|
2151
|
+
* - Returns false if language not found
|
|
2152
|
+
* - Logs success/failure messages (unless silent mode enabled)
|
|
2153
|
+
* - Words are added to all active data structures
|
|
2154
|
+
*
|
|
2155
|
+
* ### Default Languages:
|
|
2156
|
+
* English and Hindi are loaded automatically in the constructor
|
|
2157
|
+
*
|
|
2158
|
+
* @example
|
|
2159
|
+
* ```typescript
|
|
2160
|
+
* const filter = new BeKind();
|
|
2161
|
+
*
|
|
2162
|
+
* // Load additional languages
|
|
2163
|
+
* filter.loadLanguage('french');
|
|
2164
|
+
* filter.loadLanguage('spanish');
|
|
2165
|
+
*
|
|
2166
|
+
* // Case-insensitive
|
|
2167
|
+
* filter.loadLanguage('GERMAN'); // Works
|
|
2168
|
+
*
|
|
2169
|
+
* // Check if loaded
|
|
2170
|
+
* console.log(filter.getLoadedLanguages()); // ['english', 'hindi', 'french', 'spanish', 'german']
|
|
2171
|
+
* ```
|
|
2172
|
+
*
|
|
2173
|
+
* @example
|
|
2174
|
+
* ```typescript
|
|
2175
|
+
* // Load all Indian languages at once
|
|
2176
|
+
* const filter = new BeKind();
|
|
2177
|
+
* filter.loadIndianLanguages();
|
|
2178
|
+
* ```
|
|
2179
|
+
*
|
|
2180
|
+
* @see {@link loadLanguages} to load multiple languages at once
|
|
2181
|
+
* @see {@link loadIndianLanguages} for convenience method
|
|
2182
|
+
* @see {@link getAvailableLanguages} to see all available languages
|
|
2183
|
+
* @see {@link getLoadedLanguages} to see currently loaded languages
|
|
2184
|
+
*/
|
|
2185
|
+
loadLanguage(language) {
|
|
2186
|
+
if (!language || typeof language !== "string") {
|
|
2187
|
+
this.logger.warn(`Invalid language parameter: ${language}`);
|
|
2188
|
+
return false;
|
|
2189
|
+
}
|
|
2190
|
+
const langKey = language.toLowerCase().trim();
|
|
2191
|
+
if (this.loadedLanguages.has(langKey)) {
|
|
2192
|
+
return true;
|
|
2193
|
+
}
|
|
2194
|
+
const words = this.availableLanguages[langKey];
|
|
2195
|
+
if (!words || words.length === 0) {
|
|
2196
|
+
this.logger.warn(`Language '${language}' not found or empty`);
|
|
2197
|
+
return false;
|
|
2198
|
+
}
|
|
2199
|
+
try {
|
|
2200
|
+
let addedCount = 0;
|
|
2201
|
+
for (const word of words) {
|
|
2202
|
+
if (this.addWordToTrie(word)) {
|
|
2203
|
+
addedCount++;
|
|
2204
|
+
}
|
|
2205
|
+
}
|
|
2206
|
+
this.loadedLanguages.add(langKey);
|
|
2207
|
+
this.logger.info(`Loaded ${addedCount} words from ${language} dictionary`);
|
|
2208
|
+
return true;
|
|
2209
|
+
}
|
|
2210
|
+
catch (error) {
|
|
2211
|
+
this.logger.error(`Failed to load language ${language}: ${error}`);
|
|
2212
|
+
return false;
|
|
2213
|
+
}
|
|
2214
|
+
}
|
|
2215
|
+
/**
|
|
2216
|
+
* Load multiple language dictionaries.
|
|
2217
|
+
* @param languages - Array of languages to load.
|
|
2218
|
+
* @returns Number of successfully loaded languages.
|
|
2219
|
+
*/
|
|
2220
|
+
loadLanguages(languages) {
|
|
2221
|
+
const validatedLanguages = validateStringArray(languages, "languages");
|
|
2222
|
+
return validatedLanguages.reduce((count, lang) => {
|
|
2223
|
+
return this.loadLanguage(lang) ? count + 1 : count;
|
|
2224
|
+
}, 0);
|
|
2225
|
+
}
|
|
2226
|
+
/**
|
|
2227
|
+
* Load all supported Indian languages.
|
|
2228
|
+
* @returns Number of loaded Indian languages.
|
|
2229
|
+
*/
|
|
2230
|
+
loadIndianLanguages() {
|
|
2231
|
+
const indianLanguages = ["hindi", "bengali", "tamil", "telugu"];
|
|
2232
|
+
return this.loadLanguages(indianLanguages);
|
|
2233
|
+
}
|
|
2234
|
+
/**
|
|
2235
|
+
* Loads a custom dictionary of profane words with a specific name.
|
|
2236
|
+
*
|
|
2237
|
+
* @param {string} name - Unique name/identifier for this custom dictionary
|
|
2238
|
+
* @param {string[]} words - Array of profane words to add to the dictionary
|
|
2239
|
+
* @returns {void}
|
|
2240
|
+
*
|
|
2241
|
+
* @throws {TypeError} If name is not a string or words is not an array
|
|
2242
|
+
*
|
|
2243
|
+
* @remarks
|
|
2244
|
+
* ### Behavior:
|
|
2245
|
+
* - Creates a new named dictionary or overwrites existing one with same name
|
|
2246
|
+
* - Validates and filters out non-string and empty values from words array
|
|
2247
|
+
* - Words are added to all active data structures (Trie, Aho-Corasick, Bloom Filter)
|
|
2248
|
+
* - Dictionary name is converted to lowercase for storage
|
|
2249
|
+
* - Logs count of loaded words (unless silent mode enabled)
|
|
2250
|
+
*
|
|
2251
|
+
* ### Use Cases:
|
|
2252
|
+
* - Domain-specific profanity (gaming, medical, legal, etc.)
|
|
2253
|
+
* - Organization-specific word lists
|
|
2254
|
+
* - Temporary or context-dependent filters
|
|
2255
|
+
* - Testing and development
|
|
2256
|
+
*
|
|
2257
|
+
* @example
|
|
2258
|
+
* ```typescript
|
|
2259
|
+
* const filter = new BeKind();
|
|
2260
|
+
*
|
|
2261
|
+
* // Load gaming-specific slang
|
|
2262
|
+
* filter.loadCustomDictionary('gaming', [
|
|
2263
|
+
* 'noob',
|
|
2264
|
+
* 'scrub',
|
|
2265
|
+
* 'tryhard',
|
|
2266
|
+
* 'trash'
|
|
2267
|
+
* ]);
|
|
2268
|
+
*
|
|
2269
|
+
* // Load company-specific terms
|
|
2270
|
+
* filter.loadCustomDictionary('company', [
|
|
2271
|
+
* 'competitor1',
|
|
2272
|
+
* 'bannedTerm1',
|
|
2273
|
+
* 'inappropriateJargon'
|
|
2274
|
+
* ]);
|
|
2275
|
+
*
|
|
2276
|
+
* console.log(filter.check('You are such a noob')); // true
|
|
2277
|
+
* ```
|
|
2278
|
+
*
|
|
2279
|
+
* @example
|
|
2280
|
+
* ```typescript
|
|
2281
|
+
* // Load from external source
|
|
2282
|
+
* const filter = new BeKind();
|
|
2283
|
+
*
|
|
2284
|
+
* async function loadExternalDictionary() {
|
|
2285
|
+
* const response = await fetch('https://example.com/custom-words.json');
|
|
2286
|
+
* const customWords = await response.json();
|
|
2287
|
+
* filter.loadCustomDictionary('external', customWords);
|
|
2288
|
+
* }
|
|
2289
|
+
* ```
|
|
2290
|
+
*
|
|
2291
|
+
* @see {@link add} for adding individual words dynamically
|
|
2292
|
+
* @see {@link loadLanguage} for loading built-in language dictionaries
|
|
2293
|
+
*/
|
|
2294
|
+
loadCustomDictionary(name, words) {
|
|
2295
|
+
validateString(name, "dictionary name");
|
|
2296
|
+
const validatedWords = validateStringArray(words, "custom dictionary words");
|
|
2297
|
+
if (validatedWords.length === 0) {
|
|
2298
|
+
this.logger.warn(`Custom dictionary '${name}' contains no valid words`);
|
|
2299
|
+
return;
|
|
2300
|
+
}
|
|
2301
|
+
try {
|
|
2302
|
+
let addedCount = 0;
|
|
2303
|
+
for (const word of validatedWords) {
|
|
2304
|
+
if (this.addWordToTrie(word)) {
|
|
2305
|
+
addedCount++;
|
|
2306
|
+
}
|
|
2307
|
+
}
|
|
2308
|
+
this.availableLanguages[name.toLowerCase()] = validatedWords;
|
|
2309
|
+
this.loadedLanguages.add(name.toLowerCase());
|
|
2310
|
+
this.logger.info(`Loaded ${addedCount} words from custom dictionary '${name}'`);
|
|
2311
|
+
}
|
|
2312
|
+
catch (error) {
|
|
2313
|
+
this.logger.error(`Failed to load custom dictionary ${name}: ${error}`);
|
|
2314
|
+
}
|
|
2315
|
+
}
|
|
2316
|
+
/**
|
|
2317
|
+
* Add a single word to the trie.
|
|
2318
|
+
* @param word - The word to add.
|
|
2319
|
+
* @returns True if added, false otherwise.
|
|
2320
|
+
*/
|
|
2321
|
+
addWordToTrie(word) {
|
|
2322
|
+
if (!word || typeof word !== "string" || word.trim().length === 0) {
|
|
2323
|
+
return false;
|
|
2324
|
+
}
|
|
2325
|
+
const normalizedWord = this.caseSensitive
|
|
2326
|
+
? word.trim()
|
|
2327
|
+
: word.trim().toLowerCase();
|
|
2328
|
+
if (this.isWhitelisted(normalizedWord)) {
|
|
2329
|
+
return false;
|
|
2330
|
+
}
|
|
2331
|
+
// Add to Trie (always used as fallback)
|
|
2332
|
+
this.profanityTrie.addWord(normalizedWord);
|
|
2333
|
+
// Add to Bloom Filter if enabled
|
|
2334
|
+
if (this.bloomFilter) {
|
|
2335
|
+
this.bloomFilter.add(normalizedWord);
|
|
2336
|
+
}
|
|
2337
|
+
// Add to Aho-Corasick automaton if enabled
|
|
2338
|
+
if (this.ahoCorasickAutomaton) {
|
|
2339
|
+
this.ahoCorasickAutomaton.addPattern(normalizedWord);
|
|
2340
|
+
}
|
|
2341
|
+
return true;
|
|
2342
|
+
}
|
|
2343
|
+
/**
|
|
2344
|
+
* Calculate severity from matches.
|
|
2345
|
+
* @param matches - Array of matches.
|
|
2346
|
+
* @returns Severity level.
|
|
2347
|
+
*/
|
|
2348
|
+
calculateSeverity(matches) {
|
|
2349
|
+
if (matches.length === 0)
|
|
2350
|
+
return ProfanitySeverity.MILD;
|
|
2351
|
+
const uniqueWords = new Set(matches.map((m) => m.word)).size;
|
|
2352
|
+
const totalMatches = matches.length;
|
|
2353
|
+
if (totalMatches >= 5 || uniqueWords >= 4)
|
|
2354
|
+
return ProfanitySeverity.EXTREME;
|
|
2355
|
+
if (totalMatches >= 3 || uniqueWords >= 3)
|
|
2356
|
+
return ProfanitySeverity.SEVERE;
|
|
2357
|
+
if (totalMatches >= 2 || uniqueWords >= 2)
|
|
2358
|
+
return ProfanitySeverity.MODERATE;
|
|
2359
|
+
return ProfanitySeverity.MILD;
|
|
2360
|
+
}
|
|
2361
|
+
/**
|
|
2362
|
+
* Get the severity (s) and certainty (c) scores for a word.
|
|
2363
|
+
* Returns null if the word has no score entry.
|
|
2364
|
+
*
|
|
2365
|
+
* @param word - The word to look up
|
|
2366
|
+
* @returns The score object or null
|
|
2367
|
+
*/
|
|
2368
|
+
getWordScore(word) {
|
|
2369
|
+
var _a;
|
|
2370
|
+
const normalized = word.toLowerCase().trim();
|
|
2371
|
+
return (_a = this.wordScores[normalized]) !== null && _a !== void 0 ? _a : null;
|
|
2372
|
+
}
|
|
2373
|
+
/**
|
|
2374
|
+
* Check whether a word should be flagged based on its severity/certainty scores.
|
|
2375
|
+
*
|
|
2376
|
+
* Threshold rules:
|
|
2377
|
+
* - Flag if s:5 (any certainty)
|
|
2378
|
+
* - Flag if s:4+ AND c:2+
|
|
2379
|
+
* - Flag if s:3 AND c:3+
|
|
2380
|
+
* - Allow everything else
|
|
2381
|
+
*
|
|
2382
|
+
* @param word - The word to check
|
|
2383
|
+
* @returns true if the word should be flagged
|
|
2384
|
+
*/
|
|
2385
|
+
/**
|
|
2386
|
+
* Shared threshold logic: determines whether a severity/certainty pair
|
|
2387
|
+
* crosses the flag threshold. Used by shouldFlag, shouldFlagWithContext,
|
|
2388
|
+
* and inline threshold checks.
|
|
2389
|
+
*/
|
|
2390
|
+
static shouldFlagWithCertainty(severity, certainty) {
|
|
2391
|
+
return severity === 5 || (severity >= 4 && certainty >= 2) || (severity === 3 && certainty >= 3);
|
|
2392
|
+
}
|
|
2393
|
+
shouldFlag(word) {
|
|
2394
|
+
const score = this.getWordScore(word);
|
|
2395
|
+
if (!score)
|
|
2396
|
+
return false;
|
|
2397
|
+
return BeKind.shouldFlagWithCertainty(score.severity, score.certainty);
|
|
2398
|
+
}
|
|
2399
|
+
/**
|
|
2400
|
+
* Context-aware shouldFlag: for words with certainty ≤ 3, applies
|
|
2401
|
+
* certainty-delta adjustments from surrounding context before evaluating
|
|
2402
|
+
* the shouldFlag threshold. Words with certainty > 3 skip context analysis.
|
|
2403
|
+
*/
|
|
2404
|
+
shouldFlagWithContext(word, text, matchStart, matchEnd) {
|
|
2405
|
+
const wordScore = this.getWordScore(word);
|
|
2406
|
+
if (!wordScore)
|
|
2407
|
+
return false;
|
|
2408
|
+
if (this.contextAnalyzer) {
|
|
2409
|
+
const delta = this.contextAnalyzer.getCertaintyDelta(text, matchStart, matchEnd, word);
|
|
2410
|
+
const adjustedCertainty = Math.max(0, Math.min(5, wordScore.certainty + delta));
|
|
2411
|
+
return BeKind.shouldFlagWithCertainty(wordScore.severity, adjustedCertainty);
|
|
2412
|
+
}
|
|
2413
|
+
return this.shouldFlag(word);
|
|
2414
|
+
}
|
|
2415
|
+
/**
|
|
2416
|
+
* Clear all loaded dictionaries and dynamic words.
|
|
2417
|
+
*/
|
|
2418
|
+
clearList() {
|
|
2419
|
+
this.profanityTrie.clear();
|
|
2420
|
+
this.loadedLanguages.clear();
|
|
2421
|
+
this.dynamicWords.clear();
|
|
2422
|
+
}
|
|
2423
|
+
/**
|
|
2424
|
+
* Set the placeholder character for filtered words.
|
|
2425
|
+
* @param placeholder - The placeholder character.
|
|
2426
|
+
*/
|
|
2427
|
+
setPlaceholder(placeholder) {
|
|
2428
|
+
validateString(placeholder, "placeholder");
|
|
2429
|
+
if (placeholder.length === 0) {
|
|
2430
|
+
throw new Error("Placeholder cannot be empty");
|
|
2431
|
+
}
|
|
2432
|
+
this.defaultPlaceholder = placeholder.charAt(0);
|
|
2433
|
+
}
|
|
2434
|
+
/**
|
|
2435
|
+
* Get the list of loaded languages.
|
|
2436
|
+
* @returns Array of loaded language keys.
|
|
2437
|
+
*/
|
|
2438
|
+
getLoadedLanguages() {
|
|
2439
|
+
return Array.from(this.loadedLanguages);
|
|
2440
|
+
}
|
|
2441
|
+
/**
|
|
2442
|
+
* Get the list of available built-in languages.
|
|
2443
|
+
* @returns Array of available language keys.
|
|
2444
|
+
*/
|
|
2445
|
+
getAvailableLanguages() {
|
|
2446
|
+
return Object.keys(this.availableLanguages);
|
|
2447
|
+
}
|
|
2448
|
+
/**
|
|
2449
|
+
* Get the current configuration of the profanity filter.
|
|
2450
|
+
* @returns Partial configuration object.
|
|
2451
|
+
*/
|
|
2452
|
+
getConfig() {
|
|
2453
|
+
return {
|
|
2454
|
+
defaultPlaceholder: this.defaultPlaceholder,
|
|
2455
|
+
enableLeetSpeak: this.enableLeetSpeak,
|
|
2456
|
+
caseSensitive: this.caseSensitive,
|
|
2457
|
+
strictMode: this.strictMode,
|
|
2458
|
+
detectPartialWords: this.detectPartialWords,
|
|
2459
|
+
languages: this.getLoadedLanguages(),
|
|
2460
|
+
whitelistWords: Array.from(this.whitelistSet),
|
|
2461
|
+
};
|
|
2462
|
+
}
|
|
2463
|
+
/**
|
|
2464
|
+
* Rebuild the profanity trie from loaded dictionaries and dynamic words.
|
|
2465
|
+
*/
|
|
2466
|
+
rebuildTrie() {
|
|
2467
|
+
this.profanityTrie.clear();
|
|
2468
|
+
for (const lang of this.loadedLanguages) {
|
|
2469
|
+
const words = this.availableLanguages[lang] || [];
|
|
2470
|
+
for (const word of words) {
|
|
2471
|
+
this.addWordToTrie(word);
|
|
2472
|
+
}
|
|
2473
|
+
}
|
|
2474
|
+
for (const word of this.dynamicWords) {
|
|
2475
|
+
this.addWordToTrie(word);
|
|
2476
|
+
}
|
|
2477
|
+
}
|
|
2478
|
+
/**
|
|
2479
|
+
* Update configuration options for the profanity filter.
|
|
2480
|
+
* @param options - Partial configuration object.
|
|
2481
|
+
*/
|
|
2482
|
+
updateConfig(options) {
|
|
2483
|
+
let rebuildNeeded = false;
|
|
2484
|
+
if (options.defaultPlaceholder !== undefined) {
|
|
2485
|
+
this.setPlaceholder(options.defaultPlaceholder);
|
|
2486
|
+
}
|
|
2487
|
+
if (options.enableLeetSpeak !== undefined) {
|
|
2488
|
+
this.enableLeetSpeak = options.enableLeetSpeak;
|
|
2489
|
+
}
|
|
2490
|
+
if (options.caseSensitive !== undefined &&
|
|
2491
|
+
options.caseSensitive !== this.caseSensitive) {
|
|
2492
|
+
this.caseSensitive = options.caseSensitive;
|
|
2493
|
+
rebuildNeeded = true;
|
|
2494
|
+
}
|
|
2495
|
+
if (options.strictMode !== undefined) {
|
|
2496
|
+
this.strictMode = options.strictMode;
|
|
2497
|
+
}
|
|
2498
|
+
if (options.detectPartialWords !== undefined) {
|
|
2499
|
+
this.detectPartialWords = options.detectPartialWords;
|
|
2500
|
+
}
|
|
2501
|
+
if (options.embeddedProfanityDetection !== undefined) {
|
|
2502
|
+
this.embeddedProfanityDetection = options.embeddedProfanityDetection;
|
|
2503
|
+
}
|
|
2504
|
+
if (options.separatorTolerance !== undefined) {
|
|
2505
|
+
const sepTol = options.separatorTolerance;
|
|
2506
|
+
if (sepTol === false) {
|
|
2507
|
+
this.separatorTolerance = 0;
|
|
2508
|
+
}
|
|
2509
|
+
else if (typeof sepTol === "number") {
|
|
2510
|
+
this.separatorTolerance = Math.max(0, sepTol);
|
|
2511
|
+
}
|
|
2512
|
+
else {
|
|
2513
|
+
this.separatorTolerance = 5;
|
|
2514
|
+
}
|
|
2515
|
+
}
|
|
2516
|
+
if (options.whitelistWords) {
|
|
2517
|
+
this.addToWhitelist(options.whitelistWords);
|
|
2518
|
+
}
|
|
2519
|
+
if (rebuildNeeded) {
|
|
2520
|
+
this.rebuildTrie();
|
|
2521
|
+
}
|
|
2522
|
+
}
|
|
2523
|
+
/**
|
|
2524
|
+
* Create an BeKind instance from a configuration object.
|
|
2525
|
+
* @param config - Configuration object
|
|
2526
|
+
* @returns A new BeKind instance
|
|
2527
|
+
*/
|
|
2528
|
+
static fromConfig(config) {
|
|
2529
|
+
const options = {};
|
|
2530
|
+
if (config.algorithm)
|
|
2531
|
+
options.algorithm = config.algorithm;
|
|
2532
|
+
if (config.bloomFilter)
|
|
2533
|
+
options.bloomFilter = config.bloomFilter;
|
|
2534
|
+
if (config.ahoCorasick)
|
|
2535
|
+
options.ahoCorasick = config.ahoCorasick;
|
|
2536
|
+
if (config.contextAnalysis)
|
|
2537
|
+
options.contextAnalysis = config.contextAnalysis;
|
|
2538
|
+
if (config.performance)
|
|
2539
|
+
options.performance = config.performance;
|
|
2540
|
+
if (config.profanityDetection) {
|
|
2541
|
+
options.enableLeetSpeak = config.profanityDetection.enableLeetSpeak;
|
|
2542
|
+
options.caseSensitive = config.profanityDetection.caseSensitive;
|
|
2543
|
+
options.strictMode = config.profanityDetection.strictMode;
|
|
2544
|
+
options.detectPartialWords = config.profanityDetection.detectPartialWords;
|
|
2545
|
+
options.defaultPlaceholder = config.profanityDetection.defaultPlaceholder;
|
|
2546
|
+
}
|
|
2547
|
+
if (config.enableLeetSpeak !== undefined)
|
|
2548
|
+
options.enableLeetSpeak = config.enableLeetSpeak;
|
|
2549
|
+
if (config.caseSensitive !== undefined)
|
|
2550
|
+
options.caseSensitive = config.caseSensitive;
|
|
2551
|
+
if (config.strictMode !== undefined)
|
|
2552
|
+
options.strictMode = config.strictMode;
|
|
2553
|
+
if (config.detectPartialWords !== undefined)
|
|
2554
|
+
options.detectPartialWords = config.detectPartialWords;
|
|
2555
|
+
if (config.defaultPlaceholder !== undefined)
|
|
2556
|
+
options.defaultPlaceholder = config.defaultPlaceholder;
|
|
2557
|
+
if (config.languages)
|
|
2558
|
+
options.languages = config.languages;
|
|
2559
|
+
if (config.whitelistWords)
|
|
2560
|
+
options.whitelistWords = config.whitelistWords;
|
|
2561
|
+
if (config.customDictionaries)
|
|
2562
|
+
options.customDictionaries = config.customDictionaries;
|
|
2563
|
+
if (config.logger)
|
|
2564
|
+
options.logger = config.logger;
|
|
2565
|
+
return new BeKind(options);
|
|
2566
|
+
}
|
|
2567
|
+
}
|
|
2568
|
+
/**
 * Non-space separator characters (evasion symbols like @, ., -, etc.)
 * These are skipped freely with no certainty penalty.
 */
BeKind.SYMBOL_SEPARATOR_SET = new Set("@._-*#~`|\\\/+^=:;,!?'\"(){}[]<>".split(""));
/**
 * Certainty penalty per space boundary crossed during separator-tolerant matching.
 * Each distinct whitespace gap reduces the matched word's certainty by this amount.
 * e.g., "fu ck" → fuck (c:5) → c:5-2 = c:3 → still flags at s:3
 * e.g., "No m" → nom (c:3) → c:3-2 = c:1 → drops below threshold
 */
BeKind.SPACE_CERTAINTY_PENALTY = 2;
/**
 * Matches characters from the Han, Hiragana, Katakana, and Hangul scripts.
 * Presumably used by whole-word boundary detection, where space-delimited
 * word boundaries do not apply to CJK text — TODO(review): confirm against
 * the isWholeWord implementation. (The comment originally emitted here,
 * "Determine if a match is a whole word", appears to be a transpiler
 * artifact describing that method rather than this regex.)
 */
BeKind.CJK_RE = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
/**
 * When a match is embedded (not a whole word), check whether the profane
 * substring covers a large enough fraction of its host word to be flagged
 * anyway. This catches deliberate obfuscation like "urASSHOLEbro" where
 * "asshole" (7 chars) = 58 % of the 12-char host word.
 *
 * Guards (all must pass):
 * 1. Match length ≥ 6 chars — short words (ass/shit/anal/semen) are too common.
 * 2. Graduated coverage threshold — shorter matches need higher coverage:
 *    - 6-char matches: ≥ 85% (only catches near-exact wraps like "ufucker")
 *    - 7+ char matches: ≥ 55% (catches obfuscation like "urASSHOLEbro")
 * 3. Language signal — scoreWord() on the host word must show signal for
 *    the profane word's language. If the host word has no signal for that
 *    language it's a cross-language collision (e.g. "singe" = French slur
 *    inside "singer" which scores as English → skip).
 *
 * Examples:
 *   "asshole" (7, en) in "urASSHOLEbro" (en signal) = 58 % → flagged ✓
 *   "fucker" (6, en) in "ufucker" (en signal) = 86 % → flagged ✓
 *   "raging" (6, en) in "foraging" = 75 % → below 85% for 6-char → safe ✓
 *   "semen" (5) in "basement" → too short → safe ✓
 *   "anal" (4) in "canal" → too short → safe ✓
 *   "singe" (5, fr) in "singer" → too short → safe ✓
 *   "negro" (5, en) in "negroni" → too short → safe ✓
 */
BeKind.HIGH_COVERAGE_THRESHOLD_SHORT = 0.85; // 6-char matches
BeKind.HIGH_COVERAGE_THRESHOLD_LONG = 0.55; // 7+ char matches
BeKind.HIGH_COVERAGE_MIN_MATCH_LEN = 6;
BeKind.HIGH_COVERAGE_LANG_SIGNAL_MIN = 0.05;
/**
 * Decay constant for embedded profanity detection.
 * Each extra character beyond the profane root reduces certainty by this factor.
 */
BeKind.EMBEDDED_DECAY_RATE = 0.9;
/**
 * Minimum decayed certainty to report an embedded match.
 */
BeKind.EMBEDDED_MIN_CERTAINTY = 2;
/**
 * Singleton instance of BeKind with default configuration.
 */
const allProfanity = new BeKind();
export default allProfanity;
//# sourceMappingURL=index.js.map
|