allprofanity 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -25
- package/allprofanity.config.example.json +6 -0
- package/bin/init.js +1 -1
- package/bin/mcp.js +6 -0
- package/config.schema.json +44 -0
- package/dist/algos/aho-corasick.d.ts +11 -1
- package/dist/algos/aho-corasick.js +31 -6
- package/dist/algos/aho-corasick.js.map +1 -1
- package/dist/algos/bloom-filter.d.ts +2 -2
- package/dist/algos/bloom-filter.js +6 -6
- package/dist/algos/bloom-filter.js.map +1 -1
- package/dist/index.d.ts +896 -48
- package/dist/index.js +1438 -177
- package/dist/index.js.map +1 -1
- package/dist/languages/hindi-words.js +2 -2
- package/dist/languages/hindi-words.js.map +1 -1
- package/dist/mcp/server.d.ts +30 -0
- package/dist/mcp/server.js +364 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/mcp/stdio.d.ts +1 -0
- package/dist/mcp/stdio.js +72 -0
- package/dist/mcp/stdio.js.map +1 -0
- package/examples-config/README.md +113 -0
- package/examples-config/chat-app.json +24 -0
- package/examples-config/content-moderation.json +42 -0
- package/examples-config/family-friendly-max.json +33 -0
- package/examples-config/high-throughput-api.json +29 -0
- package/examples-config/low-latency-minimal.json +24 -0
- package/examples-config/medical-professional.json +42 -0
- package/examples-config/multilingual-global.json +33 -0
- package/package.json +17 -7
package/dist/index.js
CHANGED
|
@@ -23,35 +23,221 @@ export { default as tamilBadWords } from "./languages/tamil-words.js";
|
|
|
23
23
|
export { default as teluguBadWords } from "./languages/telugu-words.js";
|
|
24
24
|
export { default as brazilianBadWords } from "./languages/brazilian-words.js";
|
|
25
25
|
/**
|
|
26
|
-
* Default console logger implementation.
|
|
26
|
+
* Default console logger implementation for AllProfanity.
|
|
27
|
+
*
|
|
28
|
+
* @class ConsoleLogger
|
|
29
|
+
* @implements {Logger}
|
|
30
|
+
* @description Logs messages to the browser or Node.js console with an "[AllProfanity]" prefix.
|
|
31
|
+
* This is the default logger used when no custom logger is provided.
|
|
32
|
+
*
|
|
33
|
+
* @internal
|
|
27
34
|
*/
|
|
28
35
|
class ConsoleLogger {
|
|
36
|
+
/**
|
|
37
|
+
* Log informational messages to console.log with [AllProfanity] prefix.
|
|
38
|
+
*
|
|
39
|
+
* @param message - The message to log
|
|
40
|
+
* @returns void
|
|
41
|
+
*/
|
|
29
42
|
info(message) {
|
|
30
43
|
console.log(`[AllProfanity] ${message}`);
|
|
31
44
|
}
|
|
45
|
+
/**
|
|
46
|
+
* Log warning messages to console.warn with [AllProfanity] prefix.
|
|
47
|
+
*
|
|
48
|
+
* @param message - The warning message to log
|
|
49
|
+
* @returns void
|
|
50
|
+
*/
|
|
32
51
|
warn(message) {
|
|
33
52
|
console.warn(`[AllProfanity] ${message}`);
|
|
34
53
|
}
|
|
54
|
+
/**
|
|
55
|
+
* Log error messages to console.error with [AllProfanity] prefix.
|
|
56
|
+
*
|
|
57
|
+
* @param message - The error message to log
|
|
58
|
+
* @returns void
|
|
59
|
+
*/
|
|
35
60
|
error(message) {
|
|
36
61
|
console.error(`[AllProfanity] ${message}`);
|
|
37
62
|
}
|
|
38
63
|
}
|
|
39
64
|
/**
|
|
40
|
-
*
|
|
65
|
+
* Silent logger implementation that suppresses all log output.
|
|
66
|
+
*
|
|
67
|
+
* @class SilentLogger
|
|
68
|
+
* @implements {Logger}
|
|
69
|
+
* @description A no-op logger that discards all log messages. Used when `silent: true` is set
|
|
70
|
+
* in AllProfanityOptions, or when you want to completely disable logging.
|
|
71
|
+
*
|
|
72
|
+
* @internal
|
|
73
|
+
*/
|
|
74
|
+
class SilentLogger {
|
|
75
|
+
/**
|
|
76
|
+
* No-op implementation - messages are discarded.
|
|
77
|
+
*
|
|
78
|
+
* @param _message - The message (unused)
|
|
79
|
+
* @returns void
|
|
80
|
+
*/
|
|
81
|
+
info(_message) {
|
|
82
|
+
// Silent mode - no logging
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* No-op implementation - warnings are discarded.
|
|
86
|
+
*
|
|
87
|
+
* @param _message - The warning message (unused)
|
|
88
|
+
* @returns void
|
|
89
|
+
*/
|
|
90
|
+
warn(_message) {
|
|
91
|
+
// Silent mode - no logging
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* No-op implementation - errors are discarded.
|
|
95
|
+
*
|
|
96
|
+
* @param _message - The error message (unused)
|
|
97
|
+
* @returns void
|
|
98
|
+
*/
|
|
99
|
+
error(_message) {
|
|
100
|
+
// Silent mode - no logging
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Severity levels for profanity detection results.
|
|
105
|
+
*
|
|
106
|
+
* @enum {number}
|
|
107
|
+
* @description Categorizes the severity of detected profanity based on the number
|
|
108
|
+
* of unique words and total matches found in the text.
|
|
109
|
+
*
|
|
110
|
+
* @readonly
|
|
111
|
+
* @example
|
|
112
|
+
* ```typescript
|
|
113
|
+
* const result = filter.detect("some text");
|
|
114
|
+
* if (result.severity === ProfanitySeverity.EXTREME) {
|
|
115
|
+
* // Handle extreme profanity
|
|
116
|
+
* }
|
|
117
|
+
* ```
|
|
41
118
|
*/
|
|
42
119
|
export var ProfanitySeverity;
|
|
43
120
|
(function (ProfanitySeverity) {
|
|
121
|
+
/** No profanity detected */
|
|
122
|
+
ProfanitySeverity[ProfanitySeverity["NONE"] = 0] = "NONE";
|
|
123
|
+
/** Mild profanity: 1 unique word or 1 total match */
|
|
44
124
|
ProfanitySeverity[ProfanitySeverity["MILD"] = 1] = "MILD";
|
|
125
|
+
/** Moderate profanity: 2 unique words or 2 total matches */
|
|
45
126
|
ProfanitySeverity[ProfanitySeverity["MODERATE"] = 2] = "MODERATE";
|
|
127
|
+
/** Severe profanity: 3 unique words or 3 total matches */
|
|
46
128
|
ProfanitySeverity[ProfanitySeverity["SEVERE"] = 3] = "SEVERE";
|
|
129
|
+
/** Extreme profanity: 4+ unique words or 5+ total matches */
|
|
47
130
|
ProfanitySeverity[ProfanitySeverity["EXTREME"] = 4] = "EXTREME";
|
|
48
131
|
})(ProfanitySeverity = ProfanitySeverity || (ProfanitySeverity = {}));
|
|
49
132
|
/**
|
|
50
|
-
*
|
|
51
|
-
*
|
|
52
|
-
*
|
|
53
|
-
*
|
|
54
|
-
* @
|
|
133
|
+
* Compose two position maps: `inner` maps its normalized text back to the
|
|
134
|
+
* text that `outer` normalized, and the result maps `inner.normalized`
|
|
135
|
+
* directly back to the original input.
|
|
136
|
+
*
|
|
137
|
+
* @internal
|
|
138
|
+
*/
|
|
139
|
+
function composeMaps(outer, inner) {
|
|
140
|
+
const starts = new Array(inner.starts.length);
|
|
141
|
+
const ends = new Array(inner.ends.length);
|
|
142
|
+
for (let i = 0; i < inner.starts.length; i++) {
|
|
143
|
+
starts[i] = outer.starts[inner.starts[i]];
|
|
144
|
+
ends[i] = outer.ends[inner.ends[i] - 1];
|
|
145
|
+
}
|
|
146
|
+
return { normalized: inner.normalized, starts, ends };
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Common homoglyphs (visually identical/near-identical non-Latin characters)
|
|
150
|
+
* folded to their ASCII look-alikes for evasion-resistant matching.
|
|
151
|
+
*
|
|
152
|
+
* @internal
|
|
153
|
+
*/
|
|
154
|
+
const CONFUSABLES = new Map([
|
|
155
|
+
// Cyrillic
|
|
156
|
+
["а", "a"], ["в", "b"], ["е", "e"], ["к", "k"], ["м", "m"], ["н", "h"],
|
|
157
|
+
["о", "o"], ["р", "p"], ["с", "c"], ["т", "t"], ["у", "y"], ["х", "x"],
|
|
158
|
+
["і", "i"], ["ј", "j"], ["ѕ", "s"], ["ԁ", "d"], ["ԛ", "q"], ["ԝ", "w"],
|
|
159
|
+
// Greek
|
|
160
|
+
["α", "a"], ["β", "b"], ["γ", "y"], ["ε", "e"], ["η", "n"], ["ι", "i"],
|
|
161
|
+
["κ", "k"], ["μ", "m"], ["ν", "v"], ["ο", "o"], ["ρ", "p"], ["σ", "s"],
|
|
162
|
+
["τ", "t"], ["υ", "u"], ["χ", "x"], ["ω", "w"],
|
|
163
|
+
]);
|
|
164
|
+
/**
|
|
165
|
+
* Invisible characters commonly injected to break up profane words.
|
|
166
|
+
*
|
|
167
|
+
* @internal
|
|
168
|
+
*/
|
|
169
|
+
const INVISIBLE_CHARS = new Set([
|
|
170
|
+
"\u200B",
|
|
171
|
+
"\u200C",
|
|
172
|
+
"\u200D",
|
|
173
|
+
"\uFEFF",
|
|
174
|
+
"\u00AD",
|
|
175
|
+
"\u2060",
|
|
176
|
+
"\u180E", // Mongolian vowel separator
|
|
177
|
+
]);
|
|
178
|
+
/** Symbols treated as single-character wildcards in masked words like "f*ck". @internal */
|
|
179
|
+
const MASK_CHARS = new Set(["*", "#", "@", "$", "%"]);
|
|
180
|
+
/**
|
|
181
|
+
* Unambiguous profanity stems that are flagged even when embedded inside a
|
|
182
|
+
* larger token ("sisfuck", "totalshitshow"). Only words that essentially
|
|
183
|
+
* never occur inside legitimate vocabulary belong here — ambiguous stems
|
|
184
|
+
* like "ass" or "cock" (class, bass, Hitchcock, peacock) must stay
|
|
185
|
+
* whole-word matched.
|
|
186
|
+
*
|
|
187
|
+
* @internal
|
|
188
|
+
*/
|
|
189
|
+
const EMBEDDED_STRONG_STEMS = [
|
|
190
|
+
"fuck",
|
|
191
|
+
"shit",
|
|
192
|
+
"bitch",
|
|
193
|
+
"cunt",
|
|
194
|
+
"whore",
|
|
195
|
+
"nigger",
|
|
196
|
+
"nigga",
|
|
197
|
+
"faggot",
|
|
198
|
+
"wanker",
|
|
199
|
+
"chutiya",
|
|
200
|
+
"bhenchod",
|
|
201
|
+
"behenchod",
|
|
202
|
+
"madarchod",
|
|
203
|
+
"bhosdi",
|
|
204
|
+
];
|
|
205
|
+
/**
|
|
206
|
+
* Legitimate words that contain a strong stem and must never be flagged by
|
|
207
|
+
* the embedded pass (the user whitelist extends this set).
|
|
208
|
+
*
|
|
209
|
+
* @internal
|
|
210
|
+
*/
|
|
211
|
+
const EMBEDDED_SAFE_WORDS = new Set([
|
|
212
|
+
"scunthorpe",
|
|
213
|
+
"mishit",
|
|
214
|
+
"mishits",
|
|
215
|
+
"mishitting",
|
|
216
|
+
"shitake",
|
|
217
|
+
"shitakes",
|
|
218
|
+
"matsushita",
|
|
219
|
+
"takeshita",
|
|
220
|
+
"snigger",
|
|
221
|
+
"sniggers",
|
|
222
|
+
"sniggered",
|
|
223
|
+
"sniggering",
|
|
224
|
+
]);
|
|
225
|
+
/**
|
|
226
|
+
* Validates that an input is a string.
|
|
227
|
+
*
|
|
228
|
+
* @function validateString
|
|
229
|
+
* @param {unknown} input - The value to validate
|
|
230
|
+
* @param {string} paramName - Name of the parameter being validated (used in error messages)
|
|
231
|
+
* @returns {string} The validated string
|
|
232
|
+
* @throws {TypeError} If input is not a string
|
|
233
|
+
*
|
|
234
|
+
* @internal
|
|
235
|
+
*
|
|
236
|
+
* @example
|
|
237
|
+
* ```typescript
|
|
238
|
+
* const text = validateString(userInput, 'text');
|
|
239
|
+
* // Returns userInput if it's a string, throws TypeError otherwise
|
|
240
|
+
* ```
|
|
55
241
|
*/
|
|
56
242
|
function validateString(input, paramName) {
|
|
57
243
|
if (typeof input !== "string") {
|
|
@@ -60,36 +246,86 @@ function validateString(input, paramName) {
|
|
|
60
246
|
return input;
|
|
61
247
|
}
|
|
62
248
|
/**
|
|
63
|
-
*
|
|
64
|
-
*
|
|
65
|
-
* @
|
|
66
|
-
* @
|
|
67
|
-
* @
|
|
249
|
+
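* Validates and filters a string array, removing non-string, empty, and whitespace-only items. Each skipped non-string item is reported via `logger.warn` when a logger is passed, otherwise via `console.warn`.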
|
|
250
|
+
*
|
|
251
|
+
* @function validateStringArray
|
|
252
|
+
* @param {unknown} input - The value to validate (expected to be an array)
|
|
253
|
+
* @param {string} paramName - Name of the parameter being validated (used in error/warning messages)
|
|
254
|
+
* @returns {string[]} Array of valid, non-empty strings
|
|
255
|
+
* @throws {TypeError} If input is not an array
|
|
256
|
+
*
|
|
257
|
+
* @internal
|
|
258
|
+
*
|
|
259
|
+
* @example
|
|
260
|
+
* ```typescript
|
|
261
|
+
* const words = validateStringArray(['word1', '', 123, 'word2'], 'words');
|
|
262
|
+
* // Returns: ['word1', 'word2']
|
|
263
|
+
* // Logs warning: "Skipping non-string item in words: 123"
|
|
264
|
+
* ```
|
|
68
265
|
*/
|
|
69
|
-
function validateStringArray(input, paramName) {
|
|
266
|
+
function validateStringArray(input, paramName, logger) {
|
|
70
267
|
if (!Array.isArray(input)) {
|
|
71
268
|
throw new TypeError(`${paramName} must be an array`);
|
|
72
269
|
}
|
|
73
270
|
return input.filter((item) => {
|
|
74
271
|
if (typeof item !== "string") {
|
|
75
|
-
|
|
272
|
+
const message = `Skipping non-string item in ${paramName}: ${item}`;
|
|
273
|
+
if (logger) {
|
|
274
|
+
logger.warn(message);
|
|
275
|
+
}
|
|
276
|
+
else {
|
|
277
|
+
console.warn(message);
|
|
278
|
+
}
|
|
76
279
|
return false;
|
|
77
280
|
}
|
|
78
281
|
return item.trim().length > 0;
|
|
79
282
|
});
|
|
80
283
|
}
|
|
81
284
|
/**
|
|
82
|
-
* Trie node for efficient
|
|
285
|
+
* Trie (prefix tree) node for efficient pattern matching and word storage.
|
|
286
|
+
*
|
|
287
|
+
* @class TrieNode
|
|
288
|
+
* @description Implements a trie data structure for O(m) time complexity word matching,
|
|
289
|
+
* where m is the length of the word being searched. Each node represents a character
|
|
290
|
+
* in the word, and paths from root to nodes with isEndOfWord=true represent complete words.
|
|
291
|
+
*
|
|
292
|
+
* @internal
|
|
293
|
+
*
|
|
294
|
+
* @example
|
|
295
|
+
* ```typescript
|
|
296
|
+
* const trie = new TrieNode();
|
|
297
|
+
* trie.addWord('bad');
|
|
298
|
+
* trie.addWord('badword');
|
|
299
|
+
* const matches = trie.findMatches('badwords here', 0, false);
|
|
300
|
+
* // Returns matches for 'bad' and 'badword'
|
|
301
|
+
* ```
|
|
83
302
|
*/
|
|
84
303
|
class TrieNode {
|
|
85
304
|
constructor() {
|
|
305
|
+
/** Map of characters to child nodes for fast lookups */
|
|
86
306
|
this.children = new Map();
|
|
307
|
+
/** Flag indicating if this node represents the end of a complete word */
|
|
87
308
|
this.isEndOfWord = false;
|
|
309
|
+
/** The complete word ending at this node (only set when isEndOfWord is true) */
|
|
88
310
|
this.word = "";
|
|
89
311
|
}
|
|
90
312
|
/**
|
|
91
|
-
*
|
|
92
|
-
*
|
|
313
|
+
* Adds a word to the trie structure.
|
|
314
|
+
*
|
|
315
|
+
* @param {string} word - The word to add to the trie
|
|
316
|
+
* @returns {void}
|
|
317
|
+
*
|
|
318
|
+
* @remarks
|
|
319
|
+
* - Time Complexity: O(m) where m is the length of the word
|
|
320
|
+
* - Space Complexity: O(m) in worst case when all characters are new
|
|
321
|
+
* - Supports any Unicode characters
|
|
322
|
+
*
|
|
323
|
+
* @example
|
|
324
|
+
* ```typescript
|
|
325
|
+
* const trie = new TrieNode();
|
|
326
|
+
* trie.addWord('hello');
|
|
327
|
+
* trie.addWord('world');
|
|
328
|
+
* ```
|
|
93
329
|
*/
|
|
94
330
|
addWord(word) {
|
|
95
331
|
let current = this;
|
|
@@ -106,13 +342,36 @@ class TrieNode {
|
|
|
106
342
|
current.word = word;
|
|
107
343
|
}
|
|
108
344
|
/**
|
|
109
|
-
*
|
|
110
|
-
*
|
|
111
|
-
* @
|
|
345
|
+
* Removes a word from the trie structure.
|
|
346
|
+
*
|
|
347
|
+
* @param {string} word - The word to remove from the trie
|
|
348
|
+
* @returns {boolean} True if the word existed and was removed, false if word was not found
|
|
349
|
+
*
|
|
350
|
+
* @remarks
|
|
351
|
+
* - Time Complexity: O(m) where m is the length of the word
|
|
352
|
+
* - Also removes unnecessary nodes to keep the trie optimized
|
|
353
|
+
* - Only removes the word marking; shared prefixes with other words are preserved
|
|
354
|
+
*
|
|
355
|
+
* @example
|
|
356
|
+
* ```typescript
|
|
357
|
+
* const trie = new TrieNode();
|
|
358
|
+
* trie.addWord('hello');
|
|
359
|
+
* trie.removeWord('hello'); // Returns: true
|
|
360
|
+
* trie.removeWord('world'); // Returns: false (word not in trie)
|
|
361
|
+
* ```
|
|
112
362
|
*/
|
|
113
363
|
removeWord(word) {
|
|
114
364
|
return this.removeHelper(word, 0);
|
|
115
365
|
}
|
|
366
|
+
/**
|
|
367
|
+
* Recursive helper method for removing a word from the trie.
|
|
368
|
+
*
|
|
369
|
+
* @param {string} word - The word being removed
|
|
370
|
+
* @param {number} index - Current character index in the word
|
|
371
|
+
* @returns {boolean} True if this node should be deleted (has no children and is not end of another word)
|
|
372
|
+
*
|
|
373
|
+
* @internal
|
|
374
|
+
*/
|
|
116
375
|
removeHelper(word, index) {
|
|
117
376
|
if (index === word.length) {
|
|
118
377
|
if (!this.isEndOfWord)
|
|
@@ -132,11 +391,25 @@ class TrieNode {
|
|
|
132
391
|
return false;
|
|
133
392
|
}
|
|
134
393
|
/**
|
|
135
|
-
*
|
|
136
|
-
*
|
|
137
|
-
* @param
|
|
138
|
-
* @param
|
|
139
|
-
* @
|
|
394
|
+
* Finds all word matches in text starting at a specific position.
|
|
395
|
+
*
|
|
396
|
+
* @param {string} text - The text to search for profanity
|
|
397
|
+
* @param {number} startPos - The starting position (0-based index) in the text
|
|
398
|
+
* @param {boolean} allowPartial - If true, finds partial matches within larger words
|
|
399
|
+
* @returns {Array<{ word: string; start: number; end: number }>} Array of match objects with word and position info
|
|
400
|
+
*
|
|
401
|
+
* @remarks
|
|
402
|
+
* - Time Complexity: O(k) where k is the length of the longest match from startPos
|
|
403
|
+
* - Returns all valid words that can be formed starting from startPos
|
|
404
|
+
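* - Returns every stored word that is a prefix of the text at startPos; as the example below shows, this method does not itself filter on word boundaries, even when allowPartial is false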
|
|
405
|
+
*
|
|
406
|
+
* @example
|
|
407
|
+
* ```typescript
|
|
408
|
+
* const trie = new TrieNode();
|
|
409
|
+
* trie.addWord('bad');
|
|
410
|
+
* const matches = trie.findMatches('badword', 0, false);
|
|
411
|
+
* // Returns: [{ word: 'bad', start: 0, end: 3 }]
|
|
412
|
+
* ```
|
|
140
413
|
*/
|
|
141
414
|
findMatches(text, startPos, allowPartial) {
|
|
142
415
|
const matches = [];
|
|
@@ -149,28 +422,59 @@ class TrieNode {
|
|
|
149
422
|
current = nextNode;
|
|
150
423
|
pos++;
|
|
151
424
|
if (current.isEndOfWord) {
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
start: wordStart - startPos,
|
|
158
|
-
end: wordEnd - startPos,
|
|
159
|
-
});
|
|
160
|
-
}
|
|
161
|
-
else {
|
|
162
|
-
matches.push({
|
|
163
|
-
word: current.word,
|
|
164
|
-
start: 0,
|
|
165
|
-
end: pos - startPos,
|
|
166
|
-
});
|
|
167
|
-
}
|
|
425
|
+
matches.push({
|
|
426
|
+
word: current.word,
|
|
427
|
+
start: 0,
|
|
428
|
+
end: pos - startPos,
|
|
429
|
+
});
|
|
168
430
|
}
|
|
169
431
|
}
|
|
170
432
|
return matches;
|
|
171
433
|
}
|
|
172
434
|
/**
|
|
173
|
-
*
|
|
435
|
+
* Find a stored word matching the token, where mask characters match any
|
|
436
|
+
* single character. The token must align with a complete word exactly.
|
|
437
|
+
*
|
|
438
|
+
* @param token - The token to resolve (e.g. "f*ck")
|
|
439
|
+
* @param maskChars - Characters that act as single-character wildcards
|
|
440
|
+
* @returns The first matching dictionary word, or null
|
|
441
|
+
*/
|
|
442
|
+
findWildcardMatch(token, maskChars) {
|
|
443
|
+
return this.wildcardHelper(token, 0, maskChars);
|
|
444
|
+
}
|
|
445
|
+
wildcardHelper(token, index, maskChars) {
|
|
446
|
+
if (index === token.length) {
|
|
447
|
+
return this.isEndOfWord ? this.word : null;
|
|
448
|
+
}
|
|
449
|
+
const char = token[index];
|
|
450
|
+
if (maskChars.has(char)) {
|
|
451
|
+
for (const child of this.children.values()) {
|
|
452
|
+
const result = child.wildcardHelper(token, index + 1, maskChars);
|
|
453
|
+
if (result)
|
|
454
|
+
return result;
|
|
455
|
+
}
|
|
456
|
+
return null;
|
|
457
|
+
}
|
|
458
|
+
const child = this.children.get(char);
|
|
459
|
+
return child ? child.wildcardHelper(token, index + 1, maskChars) : null;
|
|
460
|
+
}
|
|
461
|
+
/**
|
|
462
|
+
* Clears all words from the trie, resetting it to empty state.
|
|
463
|
+
*
|
|
464
|
+
* @returns {void}
|
|
465
|
+
*
|
|
466
|
+
* @remarks
|
|
467
|
+
* - Time Complexity: O(1) - clears the root node only (JavaScript GC handles children)
|
|
468
|
+
* - Removes all stored words and resets the trie to initial state
|
|
469
|
+
*
|
|
470
|
+
* @example
|
|
471
|
+
* ```typescript
|
|
472
|
+
* const trie = new TrieNode();
|
|
473
|
+
* trie.addWord('hello');
|
|
474
|
+
* trie.addWord('world');
|
|
475
|
+
* trie.clear();
|
|
476
|
+
* // Trie is now empty
|
|
477
|
+
* ```
|
|
174
478
|
*/
|
|
175
479
|
clear() {
|
|
176
480
|
this.children.clear();
|
|
@@ -179,15 +483,142 @@ class TrieNode {
|
|
|
179
483
|
}
|
|
180
484
|
}
|
|
181
485
|
/**
|
|
182
|
-
*
|
|
486
|
+
* AllProfanity - Professional-grade multilingual profanity detection and filtering library.
|
|
487
|
+
*
|
|
488
|
+
* @class AllProfanity
|
|
489
|
+
* @description A comprehensive, high-performance profanity filtering system supporting 9+ languages
|
|
490
|
+
* with advanced features including leet speak detection, context analysis, multiple matching algorithms,
|
|
491
|
+
* and customizable filtering options.
|
|
492
|
+
*
|
|
493
|
+
* @remarks
|
|
494
|
+
* ### Features:
|
|
495
|
+
* - **Multi-language Support**: English, Hindi, French, German, Spanish, Bengali, Tamil, Telugu, Brazilian Portuguese
|
|
496
|
+
* - **Advanced Algorithms**: Trie, Aho-Corasick, Bloom Filter, and hybrid approaches
|
|
497
|
+
* - **Leet Speak Detection**: Automatically normalizes and detects variations like "h3ll0"
|
|
498
|
+
* - **Context Analysis**: Reduces false positives using surrounding word context
|
|
499
|
+
* - **Performance**: Built-in caching and optimized data structures
|
|
500
|
+
* - **Flexible**: Custom dictionaries, whitelisting, severity levels
|
|
501
|
+
*
|
|
502
|
+
* ### Default Behavior:
|
|
503
|
+
* - Loads English and Hindi dictionaries by default
|
|
504
|
+
* - Case-insensitive matching
|
|
505
|
+
* - Leet speak detection enabled
|
|
506
|
+
* - Uses Trie algorithm (fastest for most cases)
|
|
507
|
+
*
|
|
508
|
+
* @example
|
|
509
|
+
* ```typescript
|
|
510
|
+
* // Basic usage with default instance
|
|
511
|
+
* import allProfanity from 'allprofanity';
|
|
512
|
+
*
|
|
513
|
+
* const result = allProfanity.detect("This is some bad text");
|
|
514
|
+
* console.log(result.hasProfanity); // true
|
|
515
|
+
* console.log(result.cleanedText); // "This is some *** text"
|
|
516
|
+
* console.log(result.severity); // ProfanitySeverity.MILD
|
|
517
|
+
* ```
|
|
518
|
+
*
|
|
519
|
+
* @example
|
|
520
|
+
* ```typescript
|
|
521
|
+
* // Advanced usage with custom configuration
|
|
522
|
+
* import { AllProfanity, ProfanitySeverity } from 'allprofanity';
|
|
523
|
+
*
|
|
524
|
+
* const filter = new AllProfanity({
|
|
525
|
+
* languages: ['english', 'french', 'spanish'],
|
|
526
|
+
* enableLeetSpeak: true,
|
|
527
|
+
* strictMode: true,
|
|
528
|
+
* algorithm: {
|
|
529
|
+
* matching: 'hybrid',
|
|
530
|
+
* useBloomFilter: true
|
|
531
|
+
* },
|
|
532
|
+
* performance: {
|
|
533
|
+
* enableCaching: true,
|
|
534
|
+
* cacheSize: 500
|
|
535
|
+
* },
|
|
536
|
+
* whitelistWords: ['class', 'assignment']
|
|
537
|
+
* });
|
|
538
|
+
*
|
|
539
|
+
* const text = "This text has some b@d w0rds";
|
|
540
|
+
* const result = filter.detect(text);
|
|
541
|
+
*
|
|
542
|
+
* if (result.hasProfanity) {
|
|
543
|
+
* console.log(`Found ${result.detectedWords.length} profane words`);
|
|
544
|
+
* console.log(`Severity: ${ProfanitySeverity[result.severity]}`);
|
|
545
|
+
* console.log(`Cleaned: ${result.cleanedText}`);
|
|
546
|
+
* }
|
|
547
|
+
* ```
|
|
548
|
+
*
|
|
549
|
+
* @example
|
|
550
|
+
* ```typescript
|
|
551
|
+
* // Using individual methods
|
|
552
|
+
* const filter = new AllProfanity();
|
|
553
|
+
*
|
|
554
|
+
* // Simple check
|
|
555
|
+
* if (filter.check("some text")) {
|
|
556
|
+
* console.log("Contains profanity!");
|
|
557
|
+
* }
|
|
558
|
+
*
|
|
559
|
+
* // Clean with custom placeholder
|
|
560
|
+
* const cleaned = filter.clean("bad words here", "#");
|
|
561
|
+
*
|
|
562
|
+
* // Load additional languages
|
|
563
|
+
* filter.loadLanguage('german');
|
|
564
|
+
* filter.loadIndianLanguages(); // Loads hindi, bengali, tamil, telugu
|
|
565
|
+
*
|
|
566
|
+
* // Add custom words
|
|
567
|
+
* filter.add(['customword1', 'customword2']);
|
|
568
|
+
*
|
|
569
|
+
* // Remove words
|
|
570
|
+
* filter.remove(['someword']);
|
|
571
|
+
*
|
|
572
|
+
* // Whitelist words
|
|
573
|
+
* filter.addToWhitelist(['class', 'assignment']);
|
|
574
|
+
* ```
|
|
575
|
+
*
|
|
576
|
+
* @see {@link AllProfanityOptions} for all configuration options
|
|
577
|
+
* @see {@link ProfanityDetectionResult} for detection result format
|
|
578
|
+
* @see {@link ProfanitySeverity} for severity levels
|
|
183
579
|
*/
|
|
184
580
|
export class AllProfanity {
|
|
185
581
|
/**
|
|
186
|
-
*
|
|
187
|
-
*
|
|
582
|
+
* Creates a new AllProfanity instance with the specified configuration.
|
|
583
|
+
*
|
|
584
|
+
* @constructor
|
|
585
|
+
* @param {AllProfanityOptions} [options] - Configuration options for profanity detection behavior
|
|
586
|
+
*
|
|
587
|
+
* @remarks
|
|
588
|
+
* ### Default Initialization:
|
|
589
|
+
* - Always loads English and Hindi dictionaries; any `languages` given in options are loaded in addition to them
|
|
590
|
+
* - Enables leet speak detection
|
|
591
|
+
* - Case-insensitive matching
|
|
592
|
+
* - Uses Trie algorithm for pattern matching
|
|
593
|
+
*
|
|
594
|
+
* ### Performance Considerations:
|
|
595
|
+
* - Initial load time depends on number of languages loaded
|
|
596
|
+
* - Aho-Corasick automaton (if enabled) is built during construction
|
|
597
|
+
* - Bloom Filter (if enabled) is populated during construction
|
|
598
|
+
*
|
|
599
|
+
* @throws {TypeError} If invalid options are provided
|
|
600
|
+
*
|
|
601
|
+
* @example
|
|
602
|
+
* ```typescript
|
|
603
|
+
* // Default instance
|
|
604
|
+
* const filter = new AllProfanity();
|
|
605
|
+
*
|
|
606
|
+
* // Custom configuration
|
|
607
|
+
* const filter = new AllProfanity({
|
|
608
|
+
* languages: ['english', 'french'],
|
|
609
|
+
* strictMode: true,
|
|
610
|
+
* defaultPlaceholder: '#',
|
|
611
|
+
* algorithm: { matching: 'hybrid' }
|
|
612
|
+
* });
|
|
613
|
+
*
|
|
614
|
+
* // Silent mode (no logging)
|
|
615
|
+
* const filter = new AllProfanity({ silent: true });
|
|
616
|
+
* ```
|
|
617
|
+
*
|
|
618
|
+
* @see {@link AllProfanityOptions} for all available configuration options
|
|
188
619
|
*/
|
|
189
620
|
constructor(options) {
|
|
190
|
-
var _a, _b, _c, _d, _e;
|
|
621
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o, _p, _q, _r;
|
|
191
622
|
this.profanityTrie = new TrieNode();
|
|
192
623
|
this.whitelistSet = new Set();
|
|
193
624
|
this.loadedLanguages = new Set();
|
|
@@ -196,6 +627,11 @@ export class AllProfanity {
|
|
|
196
627
|
this.caseSensitive = false;
|
|
197
628
|
this.strictMode = false;
|
|
198
629
|
this.detectPartialWords = false;
|
|
630
|
+
this.evasionUnicode = true;
|
|
631
|
+
this.evasionRepeatedChars = true;
|
|
632
|
+
this.evasionMaskedChars = true;
|
|
633
|
+
this.evasionSeparatedLetters = true;
|
|
634
|
+
this.evasionEmbeddedWords = true;
|
|
199
635
|
this.availableLanguages = {
|
|
200
636
|
english: englishBadWords || [],
|
|
201
637
|
hindi: hindiBadWords || [],
|
|
@@ -233,7 +669,6 @@ export class AllProfanity {
|
|
|
233
669
|
["¿", "j"],
|
|
234
670
|
["|<", "k"],
|
|
235
671
|
["1<", "k"],
|
|
236
|
-
["7", "l"],
|
|
237
672
|
["|\\/|", "m"],
|
|
238
673
|
["/\\/\\", "m"],
|
|
239
674
|
["|\\|", "n"],
|
|
@@ -247,13 +682,11 @@ export class AllProfanity {
|
|
|
247
682
|
["12", "r"],
|
|
248
683
|
["5", "s"],
|
|
249
684
|
["$", "s"],
|
|
250
|
-
["z", "s"],
|
|
251
685
|
["7", "t"],
|
|
252
686
|
["+", "t"],
|
|
253
687
|
["†", "t"],
|
|
254
688
|
["|_|", "u"],
|
|
255
689
|
["(_)", "u"],
|
|
256
|
-
["v", "u"],
|
|
257
690
|
["\\/", "v"],
|
|
258
691
|
["|/", "v"],
|
|
259
692
|
["\\/\\/", "w"],
|
|
@@ -261,7 +694,6 @@ export class AllProfanity {
|
|
|
261
694
|
["><", "x"],
|
|
262
695
|
["}{", "x"],
|
|
263
696
|
["`/", "y"],
|
|
264
|
-
["j", "y"],
|
|
265
697
|
["2", "z"],
|
|
266
698
|
["7_", "z"],
|
|
267
699
|
]);
|
|
@@ -270,9 +702,13 @@ export class AllProfanity {
|
|
|
270
702
|
this.ahoCorasickAutomaton = null;
|
|
271
703
|
this.bloomFilter = null;
|
|
272
704
|
this.contextAnalyzer = null;
|
|
705
|
+
this.contextScoreThreshold = 0.5;
|
|
273
706
|
this.matchingAlgorithm = "trie";
|
|
274
707
|
this.resultCache = null;
|
|
275
|
-
this.
|
|
708
|
+
this.cacheMaxSize = 1000;
|
|
709
|
+
this.leetTokensByFirstChar = null;
|
|
710
|
+
// A provided logger takes precedence; otherwise use the silent logger when silent mode is enabled, else the console logger
|
|
711
|
+
this.logger = (options === null || options === void 0 ? void 0 : options.logger) || ((options === null || options === void 0 ? void 0 : options.silent) ? new SilentLogger() : new ConsoleLogger());
|
|
276
712
|
if ((options === null || options === void 0 ? void 0 : options.defaultPlaceholder) !== undefined) {
|
|
277
713
|
this.setPlaceholder(options.defaultPlaceholder);
|
|
278
714
|
}
|
|
@@ -280,6 +716,15 @@ export class AllProfanity {
|
|
|
280
716
|
this.caseSensitive = (_b = options === null || options === void 0 ? void 0 : options.caseSensitive) !== null && _b !== void 0 ? _b : false;
|
|
281
717
|
this.strictMode = (_c = options === null || options === void 0 ? void 0 : options.strictMode) !== null && _c !== void 0 ? _c : false;
|
|
282
718
|
this.detectPartialWords = (_d = options === null || options === void 0 ? void 0 : options.detectPartialWords) !== null && _d !== void 0 ? _d : false;
|
|
719
|
+
this.evasionUnicode = (_f = (_e = options === null || options === void 0 ? void 0 : options.evasionProtection) === null || _e === void 0 ? void 0 : _e.unicode) !== null && _f !== void 0 ? _f : true;
|
|
720
|
+
this.evasionRepeatedChars =
|
|
721
|
+
(_h = (_g = options === null || options === void 0 ? void 0 : options.evasionProtection) === null || _g === void 0 ? void 0 : _g.repeatedCharacters) !== null && _h !== void 0 ? _h : true;
|
|
722
|
+
this.evasionMaskedChars =
|
|
723
|
+
(_k = (_j = options === null || options === void 0 ? void 0 : options.evasionProtection) === null || _j === void 0 ? void 0 : _j.maskedCharacters) !== null && _k !== void 0 ? _k : true;
|
|
724
|
+
this.evasionSeparatedLetters =
|
|
725
|
+
(_m = (_l = options === null || options === void 0 ? void 0 : options.evasionProtection) === null || _l === void 0 ? void 0 : _l.separatedLetters) !== null && _m !== void 0 ? _m : true;
|
|
726
|
+
this.evasionEmbeddedWords =
|
|
727
|
+
(_p = (_o = options === null || options === void 0 ? void 0 : options.evasionProtection) === null || _o === void 0 ? void 0 : _o.embeddedWords) !== null && _p !== void 0 ? _p : true;
|
|
283
728
|
if (options === null || options === void 0 ? void 0 : options.whitelistWords) {
|
|
284
729
|
this.addToWhitelist(options.whitelistWords);
|
|
285
730
|
}
|
|
@@ -288,7 +733,7 @@ export class AllProfanity {
|
|
|
288
733
|
this.initializeAdvancedAlgorithms(options);
|
|
289
734
|
this.loadLanguage("english");
|
|
290
735
|
this.loadLanguage("hindi");
|
|
291
|
-
if ((
|
|
736
|
+
if ((_q = options === null || options === void 0 ? void 0 : options.languages) === null || _q === void 0 ? void 0 : _q.length) {
|
|
292
737
|
options.languages.forEach((lang) => this.loadLanguage(lang));
|
|
293
738
|
}
|
|
294
739
|
if (options === null || options === void 0 ? void 0 : options.customDictionaries) {
|
|
@@ -296,12 +741,15 @@ export class AllProfanity {
|
|
|
296
741
|
this.loadCustomDictionary(name, words);
|
|
297
742
|
});
|
|
298
743
|
}
|
|
744
|
+
if (((_r = options === null || options === void 0 ? void 0 : options.ahoCorasick) === null || _r === void 0 ? void 0 : _r.prebuild) && this.ahoCorasickAutomaton) {
|
|
745
|
+
this.ahoCorasickAutomaton.build();
|
|
746
|
+
}
|
|
299
747
|
}
|
|
300
748
|
/**
|
|
301
749
|
* Initialize advanced algorithms based on configuration
|
|
302
750
|
*/
|
|
303
751
|
initializeAdvancedAlgorithms(options) {
|
|
304
|
-
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m;
|
|
752
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o;
|
|
305
753
|
// Set matching algorithm
|
|
306
754
|
if ((_a = options === null || options === void 0 ? void 0 : options.algorithm) === null || _a === void 0 ? void 0 : _a.matching) {
|
|
307
755
|
this.matchingAlgorithm = options.algorithm.matching;
|
|
@@ -334,38 +782,362 @@ export class AllProfanity {
|
|
|
334
782
|
if ((_l = options === null || options === void 0 ? void 0 : options.contextAnalysis) === null || _l === void 0 ? void 0 : _l.contextWindow) {
|
|
335
783
|
this.contextAnalyzer.setContextWindow(options.contextAnalysis.contextWindow);
|
|
336
784
|
}
|
|
785
|
+
if (((_m = options === null || options === void 0 ? void 0 : options.contextAnalysis) === null || _m === void 0 ? void 0 : _m.scoreThreshold) !== undefined) {
|
|
786
|
+
this.contextScoreThreshold = options.contextAnalysis.scoreThreshold;
|
|
787
|
+
}
|
|
337
788
|
this.logger.info(`Context Analyzer initialized for languages: ${contextLanguages.join(", ")}`);
|
|
338
789
|
}
|
|
339
790
|
// Initialize result cache if enabled
|
|
340
|
-
if ((
|
|
341
|
-
|
|
791
|
+
if ((_o = options === null || options === void 0 ? void 0 : options.performance) === null || _o === void 0 ? void 0 : _o.enableCaching) {
|
|
792
|
+
this.cacheMaxSize = options.performance.cacheSize || 1000;
|
|
342
793
|
this.resultCache = new Map();
|
|
343
|
-
this.logger.info(`Result caching enabled with size limit: ${
|
|
794
|
+
this.logger.info(`Result caching enabled with size limit: ${this.cacheMaxSize}`);
|
|
344
795
|
}
|
|
345
796
|
}
|
|
346
797
|
/**
|
|
347
|
-
* Normalize leet speak to regular characters
|
|
348
|
-
*
|
|
349
|
-
*
|
|
798
|
+
* Normalize leet speak to regular characters, keeping a map from each
|
|
799
|
+
* normalized character back to its source range in the input text.
|
|
800
|
+
*
|
|
801
|
+
* For normalized index i, starts[i]/ends[i] give the [start, end) range in
|
|
802
|
+
* the input that produced that character. A match [s, e) in the normalized
|
|
803
|
+
* string therefore spans [starts[s], ends[e - 1]) in the input. This is what
|
|
804
|
+
* keeps positions correct when length-changing mappings like "ph" -> "f"
|
|
805
|
+
* apply.
|
|
350
806
|
*/
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
807
|
+
normalizeLeetSpeakWithMap(text) {
|
|
808
|
+
// Bucket tokens by first character so each position costs one Map lookup
|
|
809
|
+
// instead of a scan over every mapping (longest token first per bucket).
|
|
810
|
+
if (!this.leetTokensByFirstChar) {
|
|
811
|
+
this.leetTokensByFirstChar = new Map();
|
|
812
|
+
for (const entry of this.leetMappings.entries()) {
|
|
813
|
+
const bucket = this.leetTokensByFirstChar.get(entry[0][0]);
|
|
814
|
+
if (bucket) {
|
|
815
|
+
bucket.push(entry);
|
|
816
|
+
}
|
|
817
|
+
else {
|
|
818
|
+
this.leetTokensByFirstChar.set(entry[0][0], [entry]);
|
|
819
|
+
}
|
|
820
|
+
}
|
|
821
|
+
for (const bucket of this.leetTokensByFirstChar.values()) {
|
|
822
|
+
bucket.sort(([leetA], [leetB]) => leetB.length - leetA.length);
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
// Fast path: most text contains no leet characters at all. Scan for the
|
|
826
|
+
// first applicable mapping before allocating the position-map arrays.
|
|
827
|
+
let hasLeet = false;
|
|
828
|
+
for (let j = 0; j < text.length && !hasLeet; j++) {
|
|
829
|
+
const bucket = this.leetTokensByFirstChar.get(text[j]);
|
|
830
|
+
if (bucket) {
|
|
831
|
+
for (const [leet] of bucket) {
|
|
832
|
+
if (leet.length === 1 || text.startsWith(leet, j)) {
|
|
833
|
+
hasLeet = true;
|
|
834
|
+
break;
|
|
835
|
+
}
|
|
836
|
+
}
|
|
837
|
+
}
|
|
838
|
+
}
|
|
839
|
+
if (!hasLeet) {
|
|
840
|
+
return { normalized: text, starts: [], ends: [] };
|
|
841
|
+
}
|
|
842
|
+
const parts = [];
|
|
843
|
+
const starts = [];
|
|
844
|
+
const ends = [];
|
|
845
|
+
let i = 0;
|
|
846
|
+
while (i < text.length) {
|
|
847
|
+
let consumed = 0;
|
|
848
|
+
let replacement = "";
|
|
849
|
+
const bucket = this.leetTokensByFirstChar.get(text[i]);
|
|
850
|
+
if (bucket) {
|
|
851
|
+
for (const [leet, normal] of bucket) {
|
|
852
|
+
if (leet.length === 1 || text.startsWith(leet, i)) {
|
|
853
|
+
consumed = leet.length;
|
|
854
|
+
replacement = normal;
|
|
855
|
+
break;
|
|
856
|
+
}
|
|
857
|
+
}
|
|
858
|
+
}
|
|
859
|
+
if (consumed === 0) {
|
|
860
|
+
consumed = 1;
|
|
861
|
+
replacement = text[i];
|
|
862
|
+
}
|
|
863
|
+
for (const char of replacement) {
|
|
864
|
+
parts.push(char);
|
|
865
|
+
starts.push(i);
|
|
866
|
+
ends.push(i + consumed);
|
|
867
|
+
}
|
|
868
|
+
i += consumed;
|
|
869
|
+
}
|
|
870
|
+
return { normalized: parts.join(""), starts, ends };
|
|
871
|
+
}
|
|
872
|
+
/**
|
|
873
|
+
* Fold unicode evasion tactics into ASCII with a position map: fullwidth
|
|
874
|
+
* forms, Cyrillic/Greek homoglyphs, Latin diacritics, and invisible
|
|
875
|
+
* characters injected inside words. Non-Latin scripts (Devanagari, Tamil,
|
|
876
|
+
* etc.) pass through untouched. Returns null when nothing changed.
|
|
877
|
+
*/
|
|
878
|
+
unicodeNormalizeWithMap(text) {
|
|
879
|
+
// Fast path: pure ASCII text needs no folding
|
|
880
|
+
let needsScan = false;
|
|
881
|
+
for (let j = 0; j < text.length; j++) {
|
|
882
|
+
if (text.charCodeAt(j) > 127) {
|
|
883
|
+
needsScan = true;
|
|
884
|
+
break;
|
|
885
|
+
}
|
|
886
|
+
}
|
|
887
|
+
if (!needsScan)
|
|
888
|
+
return null;
|
|
889
|
+
const parts = [];
|
|
890
|
+
const starts = [];
|
|
891
|
+
const ends = [];
|
|
892
|
+
let changed = false;
|
|
893
|
+
for (let i = 0; i < text.length; i++) {
|
|
894
|
+
const char = text[i];
|
|
895
|
+
const code = text.charCodeAt(i);
|
|
896
|
+
if (code < 128) {
|
|
897
|
+
parts.push(char);
|
|
898
|
+
starts.push(i);
|
|
899
|
+
ends.push(i + 1);
|
|
900
|
+
continue;
|
|
901
|
+
}
|
|
902
|
+
if (INVISIBLE_CHARS.has(char)) {
|
|
903
|
+
changed = true;
|
|
904
|
+
continue;
|
|
905
|
+
}
|
|
906
|
+
// Fullwidth ASCII block (! U+FF01 .. ~ U+FF5E)
|
|
907
|
+
if (code >= 0xff01 && code <= 0xff5e) {
|
|
908
|
+
parts.push(String.fromCharCode(code - 0xfee0));
|
|
909
|
+
starts.push(i);
|
|
910
|
+
ends.push(i + 1);
|
|
911
|
+
changed = true;
|
|
912
|
+
continue;
|
|
913
|
+
}
|
|
914
|
+
const confusable = CONFUSABLES.get(char);
|
|
915
|
+
if (confusable) {
|
|
916
|
+
parts.push(confusable);
|
|
917
|
+
starts.push(i);
|
|
918
|
+
ends.push(i + 1);
|
|
919
|
+
changed = true;
|
|
920
|
+
continue;
|
|
921
|
+
}
|
|
922
|
+
// Bare combining marks (covers decomposed input like "u" + U+0308)
|
|
923
|
+
if (code >= 0x0300 && code <= 0x036f) {
|
|
924
|
+
changed = true;
|
|
925
|
+
continue;
|
|
926
|
+
}
|
|
927
|
+
// Latin letters with diacritics: decompose and strip the marks.
|
|
928
|
+
// Limited to the Latin blocks so other scripts keep their composed forms.
|
|
929
|
+
if (code >= 0x00c0 && code < 0x0250) {
|
|
930
|
+
for (const piece of char.normalize("NFD")) {
|
|
931
|
+
const pieceCode = piece.charCodeAt(0);
|
|
932
|
+
if (pieceCode >= 0x0300 && pieceCode <= 0x036f) {
|
|
933
|
+
changed = true;
|
|
934
|
+
continue;
|
|
935
|
+
}
|
|
936
|
+
const folded = this.caseSensitive ? piece : piece.toLowerCase();
|
|
937
|
+
parts.push(folded);
|
|
938
|
+
starts.push(i);
|
|
939
|
+
ends.push(i + 1);
|
|
940
|
+
if (folded !== char)
|
|
941
|
+
changed = true;
|
|
942
|
+
}
|
|
943
|
+
continue;
|
|
944
|
+
}
|
|
945
|
+
parts.push(char);
|
|
946
|
+
starts.push(i);
|
|
947
|
+
ends.push(i + 1);
|
|
948
|
+
}
|
|
949
|
+
if (!changed)
|
|
950
|
+
return null;
|
|
951
|
+
return { normalized: parts.join(""), starts, ends };
|
|
952
|
+
}
|
|
953
|
+
/**
|
|
954
|
+
* Collapse runs of repeated characters ("fuuuuck" -> "fuck") with a
|
|
955
|
+
* position map. Only triggers when a run of 3+ identical characters
|
|
956
|
+
* exists, so ordinary doubled letters never pay for this pass.
|
|
957
|
+
* Returns null when not triggered.
|
|
958
|
+
*/
|
|
959
|
+
collapseRepeatsWithMap(text) {
|
|
960
|
+
let triggered = false;
|
|
961
|
+
for (let j = 2; j < text.length; j++) {
|
|
962
|
+
if (text[j] === text[j - 1] && text[j] === text[j - 2]) {
|
|
963
|
+
triggered = true;
|
|
964
|
+
break;
|
|
965
|
+
}
|
|
966
|
+
}
|
|
967
|
+
if (!triggered)
|
|
968
|
+
return null;
|
|
969
|
+
const parts = [];
|
|
970
|
+
const starts = [];
|
|
971
|
+
const ends = [];
|
|
972
|
+
let i = 0;
|
|
973
|
+
while (i < text.length) {
|
|
974
|
+
let runEnd = i + 1;
|
|
975
|
+
while (runEnd < text.length && text[runEnd] === text[i]) {
|
|
976
|
+
runEnd++;
|
|
977
|
+
}
|
|
978
|
+
parts.push(text[i]);
|
|
979
|
+
starts.push(i);
|
|
980
|
+
ends.push(runEnd);
|
|
981
|
+
i = runEnd;
|
|
982
|
+
}
|
|
983
|
+
return { normalized: parts.join(""), starts, ends };
|
|
984
|
+
}
|
|
985
|
+
/**
|
|
986
|
+
* Build the list of (text, position-map) variants to scan: the base text
|
|
987
|
+
* plus unicode-folded, leet-normalized and repeat-collapsed variants, each
|
|
988
|
+
* included only when its normalization actually changed something.
|
|
989
|
+
*/
|
|
990
|
+
buildScanPasses(normalizedText) {
|
|
991
|
+
const passes = [
|
|
992
|
+
{ text: normalizedText },
|
|
993
|
+
];
|
|
994
|
+
let workText = normalizedText;
|
|
995
|
+
let workMap;
|
|
996
|
+
if (this.evasionUnicode) {
|
|
997
|
+
const uni = this.unicodeNormalizeWithMap(normalizedText);
|
|
998
|
+
if (uni) {
|
|
999
|
+
passes.push({ text: uni.normalized, posMap: uni });
|
|
1000
|
+
workText = uni.normalized;
|
|
1001
|
+
workMap = uni;
|
|
1002
|
+
}
|
|
1003
|
+
}
|
|
1004
|
+
if (this.enableLeetSpeak) {
|
|
1005
|
+
const leet = this.normalizeLeetSpeakWithMap(workText);
|
|
1006
|
+
if (leet.normalized !== workText) {
|
|
1007
|
+
passes.push({
|
|
1008
|
+
text: leet.normalized,
|
|
1009
|
+
posMap: workMap ? composeMaps(workMap, leet) : leet,
|
|
1010
|
+
});
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
if (this.evasionRepeatedChars) {
|
|
1014
|
+
const collapsed = this.collapseRepeatsWithMap(workText);
|
|
1015
|
+
if (collapsed) {
|
|
1016
|
+
passes.push({
|
|
1017
|
+
text: collapsed.normalized,
|
|
1018
|
+
posMap: workMap ? composeMaps(workMap, collapsed) : collapsed,
|
|
1019
|
+
});
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
return passes;
|
|
1023
|
+
}
|
|
1024
|
+
/**
|
|
1025
|
+
* Find dictionary words hidden behind masked characters ("f*ck", "f#ck").
|
|
1026
|
+
* Each mask matches exactly one character and the token's visible letters
|
|
1027
|
+
* must align with a dictionary word, so "c#" or "5% off" never flag.
|
|
1028
|
+
*/
|
|
1029
|
+
findMaskedMatches(searchText, originalText) {
|
|
1030
|
+
const results = [];
|
|
1031
|
+
if (!/[*#@$%]/.test(searchText))
|
|
1032
|
+
return results;
|
|
1033
|
+
const tokenRegex = /[\p{L}*#@$%]+/gu;
|
|
1034
|
+
let tokenMatch;
|
|
1035
|
+
while ((tokenMatch = tokenRegex.exec(searchText)) !== null) {
|
|
1036
|
+
const token = tokenMatch[0];
|
|
1037
|
+
let maskCount = 0;
|
|
1038
|
+
for (const char of token) {
|
|
1039
|
+
if (MASK_CHARS.has(char))
|
|
1040
|
+
maskCount++;
|
|
1041
|
+
}
|
|
1042
|
+
if (maskCount === 0 || maskCount > 2)
|
|
1043
|
+
continue;
|
|
1044
|
+
if (MASK_CHARS.has(token[0]) ||
|
|
1045
|
+
MASK_CHARS.has(token[token.length - 1])) {
|
|
1046
|
+
continue;
|
|
1047
|
+
}
|
|
1048
|
+
const word = this.profanityTrie.findWildcardMatch(token, MASK_CHARS);
|
|
1049
|
+
if (!word)
|
|
1050
|
+
continue;
|
|
1051
|
+
const start = tokenMatch.index;
|
|
1052
|
+
const end = start + token.length;
|
|
1053
|
+
if (!this.detectPartialWords &&
|
|
1054
|
+
!this.isWholeWord(originalText, start, end)) {
|
|
1055
|
+
continue;
|
|
1056
|
+
}
|
|
1057
|
+
const matchedText = originalText.substring(start, end);
|
|
1058
|
+
if (this.isWhitelistedMatch(word, matchedText))
|
|
1059
|
+
continue;
|
|
1060
|
+
if (!this.hasWordBoundaries(originalText, start, end))
|
|
1061
|
+
continue;
|
|
1062
|
+
results.push({ word, start, end, originalWord: matchedText });
|
|
359
1063
|
}
|
|
360
|
-
return
|
|
1064
|
+
return results;
|
|
1065
|
+
}
|
|
1066
|
+
/**
|
|
1067
|
+
* Find words spelled out with a uniform single separator ("f u c k",
|
|
1068
|
+
* "f.u.c.k"). The joined letters must equal a dictionary word exactly:
|
|
1069
|
+
* runs like "U S A" or letters inside spelled-out sentences never flag.
|
|
1070
|
+
*/
|
|
1071
|
+
findSeparatedMatches(searchText, originalText) {
|
|
1072
|
+
const results = [];
|
|
1073
|
+
// Single letters joined by one consistent separator, at least 3 letters,
|
|
1074
|
+
// not touching letters/digits on either side.
|
|
1075
|
+
const runRegex = /(?<![\p{L}\p{N}])\p{L}(?:([ ._\-/])\p{L})(?:\1\p{L})+(?![\p{L}\p{N}])/gu;
|
|
1076
|
+
let runMatch;
|
|
1077
|
+
while ((runMatch = runRegex.exec(searchText)) !== null) {
|
|
1078
|
+
const run = runMatch[0];
|
|
1079
|
+
const separator = runMatch[1];
|
|
1080
|
+
const joined = run.split(separator).join("");
|
|
1081
|
+
const trieMatches = this.profanityTrie.findMatches(joined, 0, false);
|
|
1082
|
+
const exact = trieMatches.find((m) => m.end === joined.length);
|
|
1083
|
+
if (!exact)
|
|
1084
|
+
continue;
|
|
1085
|
+
const start = runMatch.index;
|
|
1086
|
+
const end = start + run.length;
|
|
1087
|
+
const matchedText = originalText.substring(start, end);
|
|
1088
|
+
if (this.isWhitelistedMatch(exact.word, joined) ||
|
|
1089
|
+
this.isWhitelistedMatch(exact.word, matchedText)) {
|
|
1090
|
+
continue;
|
|
1091
|
+
}
|
|
1092
|
+
results.push({ word: exact.word, start, end, originalWord: matchedText });
|
|
1093
|
+
}
|
|
1094
|
+
return results;
|
|
361
1095
|
}
|
|
362
1096
|
/**
|
|
363
|
-
*
|
|
364
|
-
*
|
|
365
|
-
*
|
|
1097
|
+
* Find unambiguous profanity stems embedded inside larger tokens
|
|
1098
|
+
* ("sisfuck", "totalshitshow"). Only stems from EMBEDDED_STRONG_STEMS that
|
|
1099
|
+
* are currently in the dictionary are considered, and tokens listed in
|
|
1100
|
+
* EMBEDDED_SAFE_WORDS or the whitelist never flag. The whole containing
|
|
1101
|
+
* token is reported so cleaning masks all of it.
|
|
366
1102
|
*/
|
|
367
|
-
|
|
368
|
-
|
|
1103
|
+
findEmbeddedMatches(searchText, originalText) {
|
|
1104
|
+
const results = [];
|
|
1105
|
+
for (const stem of EMBEDDED_STRONG_STEMS) {
|
|
1106
|
+
// Respect remove()/clearList(): only flag stems still in the dictionary
|
|
1107
|
+
const exact = this.profanityTrie
|
|
1108
|
+
.findMatches(stem, 0, false)
|
|
1109
|
+
.some((m) => m.end === stem.length);
|
|
1110
|
+
if (!exact)
|
|
1111
|
+
continue;
|
|
1112
|
+
let index = searchText.indexOf(stem);
|
|
1113
|
+
while (index !== -1) {
|
|
1114
|
+
// Expand to the containing token
|
|
1115
|
+
let tokenStart = index;
|
|
1116
|
+
let tokenEnd = index + stem.length;
|
|
1117
|
+
while (tokenStart > 0 && /\w/.test(searchText[tokenStart - 1])) {
|
|
1118
|
+
tokenStart--;
|
|
1119
|
+
}
|
|
1120
|
+
while (tokenEnd < searchText.length &&
|
|
1121
|
+
/\w/.test(searchText[tokenEnd])) {
|
|
1122
|
+
tokenEnd++;
|
|
1123
|
+
}
|
|
1124
|
+
const token = searchText.substring(tokenStart, tokenEnd);
|
|
1125
|
+
const isEmbedded = token !== stem; // exact tokens are the base pass's job
|
|
1126
|
+
if (isEmbedded &&
|
|
1127
|
+
!EMBEDDED_SAFE_WORDS.has(token.toLowerCase()) &&
|
|
1128
|
+
!this.isWhitelisted(token) &&
|
|
1129
|
+
!this.isWhitelistedMatch(stem, token)) {
|
|
1130
|
+
results.push({
|
|
1131
|
+
word: stem,
|
|
1132
|
+
start: tokenStart,
|
|
1133
|
+
end: tokenEnd,
|
|
1134
|
+
originalWord: originalText.substring(tokenStart, tokenEnd),
|
|
1135
|
+
});
|
|
1136
|
+
}
|
|
1137
|
+
index = searchText.indexOf(stem, tokenEnd);
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
return results;
|
|
369
1141
|
}
|
|
370
1142
|
/**
|
|
371
1143
|
* Check if a match is bounded by word boundaries (strict mode).
|
|
@@ -411,6 +1183,27 @@ export class AllProfanity {
|
|
|
411
1183
|
this.whitelistSet.has(matchedText.toLowerCase()));
|
|
412
1184
|
}
|
|
413
1185
|
}
|
|
1186
|
+
/**
|
|
1187
|
+
* In partial-word mode, check whether the word CONTAINING the match is
|
|
1188
|
+
* whitelisted: with "classic" whitelisted, the embedded "ass" must not flag.
|
|
1189
|
+
*/
|
|
1190
|
+
isWhitelistedContainingWord(originalText, start, end) {
|
|
1191
|
+
if (!this.detectPartialWords || this.whitelistSet.size === 0) {
|
|
1192
|
+
return false;
|
|
1193
|
+
}
|
|
1194
|
+
let tokenStart = start;
|
|
1195
|
+
let tokenEnd = end;
|
|
1196
|
+
while (tokenStart > 0 && /\w/.test(originalText[tokenStart - 1])) {
|
|
1197
|
+
tokenStart--;
|
|
1198
|
+
}
|
|
1199
|
+
while (tokenEnd < originalText.length && /\w/.test(originalText[tokenEnd])) {
|
|
1200
|
+
tokenEnd++;
|
|
1201
|
+
}
|
|
1202
|
+
if (tokenStart === start && tokenEnd === end) {
|
|
1203
|
+
return false; // match is the whole token; already covered by isWhitelistedMatch
|
|
1204
|
+
}
|
|
1205
|
+
return this.isWhitelisted(originalText.substring(tokenStart, tokenEnd));
|
|
1206
|
+
}
|
|
414
1207
|
/**
|
|
415
1208
|
* Remove overlapping matches, keeping only the longest at each start position.
|
|
416
1209
|
* @param matches - Array of match results.
|
|
@@ -435,26 +1228,31 @@ export class AllProfanity {
|
|
|
435
1228
|
/**
|
|
436
1229
|
* Use Aho-Corasick algorithm for pattern matching
|
|
437
1230
|
*/
|
|
438
|
-
findMatchesWithAhoCorasick(searchText, originalText) {
|
|
1231
|
+
findMatchesWithAhoCorasick(searchText, originalText, posMap) {
|
|
439
1232
|
if (!this.ahoCorasickAutomaton) {
|
|
440
1233
|
return [];
|
|
441
1234
|
}
|
|
442
1235
|
const ahoMatches = this.ahoCorasickAutomaton.findAll(searchText);
|
|
443
1236
|
const results = [];
|
|
444
1237
|
for (const match of ahoMatches) {
|
|
1238
|
+
const start = posMap ? posMap.starts[match.start] : match.start;
|
|
1239
|
+
const end = posMap ? posMap.ends[match.end - 1] : match.end;
|
|
445
1240
|
if (!this.detectPartialWords &&
|
|
446
|
-
!this.isWholeWord(originalText,
|
|
1241
|
+
!this.isWholeWord(originalText, start, end)) {
|
|
447
1242
|
continue;
|
|
448
1243
|
}
|
|
449
|
-
const matchedText = originalText.substring(
|
|
1244
|
+
const matchedText = originalText.substring(start, end);
|
|
450
1245
|
if (this.isWhitelistedMatch(match.pattern, matchedText)) {
|
|
451
1246
|
continue;
|
|
452
1247
|
}
|
|
453
|
-
if (this.
|
|
1248
|
+
if (this.isWhitelistedContainingWord(originalText, start, end)) {
|
|
1249
|
+
continue;
|
|
1250
|
+
}
|
|
1251
|
+
if (this.hasWordBoundaries(originalText, start, end)) {
|
|
454
1252
|
results.push({
|
|
455
1253
|
word: match.pattern,
|
|
456
|
-
start
|
|
457
|
-
end
|
|
1254
|
+
start,
|
|
1255
|
+
end,
|
|
458
1256
|
originalWord: matchedText,
|
|
459
1257
|
});
|
|
460
1258
|
}
|
|
@@ -462,25 +1260,38 @@ export class AllProfanity {
|
|
|
462
1260
|
return results;
|
|
463
1261
|
}
|
|
464
1262
|
/**
|
|
465
|
-
*
|
|
1263
|
+
* Check whether the Bloom Filter can quickly rule out any profanity in the
|
|
1264
|
+
* text. Only safe for ASCII whole-word matching: partial matches and
|
|
1265
|
+
* non-ASCII scripts can match inside tokens, so they bypass the prefilter.
|
|
466
1266
|
*/
|
|
467
|
-
|
|
1267
|
+
bloomQuickReject(searchText) {
|
|
1268
|
+
if (!this.bloomFilter || this.detectPartialWords)
|
|
1269
|
+
return false;
|
|
1270
|
+
// eslint-disable-next-line no-control-regex
|
|
1271
|
+
if (!/^[\x00-\x7F]*$/.test(searchText))
|
|
1272
|
+
return false;
|
|
1273
|
+
const tokens = searchText.split(/[^\p{L}\p{N}]+/u);
|
|
1274
|
+
for (const token of tokens) {
|
|
1275
|
+
if (token.length > 0 && this.bloomFilter.mightContain(token)) {
|
|
1276
|
+
return false;
|
|
1277
|
+
}
|
|
1278
|
+
}
|
|
1279
|
+
return true;
|
|
1280
|
+
}
|
|
1281
|
+
/**
|
|
1282
|
+
* Hybrid approach: Bloom Filter for quick rejection, Aho-Corasick for matching
|
|
1283
|
+
*/
|
|
1284
|
+
findMatchesHybrid(searchText, originalText, posMap) {
|
|
1285
|
+
if (this.bloomQuickReject(searchText)) {
|
|
1286
|
+
return [];
|
|
1287
|
+
}
|
|
468
1288
|
// Use Aho-Corasick for primary matching if available
|
|
469
1289
|
if (this.ahoCorasickAutomaton) {
|
|
470
|
-
|
|
471
|
-
// If Bloom Filter is enabled, validate matches
|
|
472
|
-
if (this.bloomFilter) {
|
|
473
|
-
return matches.filter((match) => this.bloomFilter.mightContain(match.word));
|
|
474
|
-
}
|
|
475
|
-
return matches;
|
|
1290
|
+
return this.findMatchesWithAhoCorasick(searchText, originalText, posMap);
|
|
476
1291
|
}
|
|
477
1292
|
// Fallback to Trie if Aho-Corasick not available
|
|
478
1293
|
const matches = [];
|
|
479
|
-
this.findMatches(searchText, originalText, matches);
|
|
480
|
-
// Validate with Bloom Filter if enabled
|
|
481
|
-
if (this.bloomFilter) {
|
|
482
|
-
return matches.filter((match) => this.bloomFilter.mightContain(match.word));
|
|
483
|
-
}
|
|
1294
|
+
this.findMatches(searchText, originalText, matches, posMap);
|
|
484
1295
|
return matches;
|
|
485
1296
|
}
|
|
486
1297
|
/**
|
|
@@ -497,66 +1308,117 @@ export class AllProfanity {
|
|
|
497
1308
|
});
|
|
498
1309
|
}
|
|
499
1310
|
/**
|
|
500
|
-
*
|
|
501
|
-
*
|
|
502
|
-
* @returns Profanity detection result.
|
|
1311
|
+
* Drop all cached detection results. Must be called whenever the word lists
|
|
1312
|
+
* or any option that affects detection output changes.
|
|
503
1313
|
*/
|
|
504
|
-
|
|
1314
|
+
invalidateCache() {
|
|
505
1315
|
var _a;
|
|
1316
|
+
(_a = this.resultCache) === null || _a === void 0 ? void 0 : _a.clear();
|
|
1317
|
+
}
|
|
1318
|
+
/**
|
|
1319
|
+
* Detects profanity in the provided text and returns comprehensive analysis.
|
|
1320
|
+
*
|
|
1321
|
+
* @param {string} text - The text to analyze for profanity
|
|
1322
|
+
* @returns {ProfanityDetectionResult} Detailed detection result including matches, positions, severity, and cleaned text
|
|
1323
|
+
*
|
|
1324
|
+
* @throws {TypeError} If text is not a string
|
|
1325
|
+
*
|
|
1326
|
+
* @remarks
|
|
1327
|
+
* ### Performance:
|
|
1328
|
+
* - Time Complexity: O(n*m) where n is text length, m is average word length in dictionary
|
|
1329
|
+
* - With Bloom Filter: O(n) average case (faster early rejection)
|
|
1330
|
+
* - With Caching: O(1) for repeated identical text
|
|
1331
|
+
*
|
|
1332
|
+
* ### Features:
|
|
1333
|
+
* - Detects leet speak variations (if enabled): "h3ll0" → "hello"
|
|
1334
|
+
* - Respects word boundaries (strict mode) or detects partial matches
|
|
1335
|
+
* - Returns exact positions for highlighting/masking
|
|
1336
|
+
* - Calculates severity based on match count and uniqueness
|
|
1337
|
+
*
|
|
1338
|
+
* ### Caching:
|
|
1339
|
+
* - Results are cached if `performance.enableCaching` is true
|
|
1340
|
+
* - Cache uses LRU eviction when size limit is reached
|
|
1341
|
+
*
|
|
1342
|
+
* @example
|
|
1343
|
+
* ```typescript
|
|
1344
|
+
* const filter = new AllProfanity();
|
|
1345
|
+
* const result = filter.detect("This has bad words");
|
|
1346
|
+
*
|
|
1347
|
+
* console.log(result.hasProfanity); // true
|
|
1348
|
+
* console.log(result.detectedWords); // ['bad']
|
|
1349
|
+
* console.log(result.cleanedText); // 'This has *** words'
|
|
1350
|
+
* console.log(result.severity); // ProfanitySeverity.MILD
|
|
1351
|
+
* console.log(result.positions); // [{ word: 'bad', start: 9, end: 12 }]
|
|
1352
|
+
* ```
|
|
1353
|
+
*
|
|
1354
|
+
* @example
|
|
1355
|
+
* ```typescript
|
|
1356
|
+
* // With leet speak detection
|
|
1357
|
+
* const filter = new AllProfanity({ enableLeetSpeak: true });
|
|
1358
|
+
* const result = filter.detect("st0p b3ing b@d");
|
|
1359
|
+
*
|
|
1360
|
+
* if (result.hasProfanity) {
|
|
1361
|
+
* result.positions.forEach(pos => {
|
|
1362
|
+
* console.log(`Found "${pos.word}" at position ${pos.start}-${pos.end}`);
|
|
1363
|
+
* });
|
|
1364
|
+
* }
|
|
1365
|
+
* ```
|
|
1366
|
+
*
|
|
1367
|
+
* @see {@link ProfanityDetectionResult} for result structure
|
|
1368
|
+
* @see {@link ProfanitySeverity} for severity levels
|
|
1369
|
+
*/
|
|
1370
|
+
detect(text) {
|
|
506
1371
|
const validatedText = validateString(text, "text");
|
|
507
1372
|
if (validatedText.length === 0) {
|
|
508
1373
|
return {
|
|
509
1374
|
hasProfanity: false,
|
|
510
1375
|
detectedWords: [],
|
|
511
1376
|
cleanedText: validatedText,
|
|
512
|
-
severity: ProfanitySeverity.
|
|
1377
|
+
severity: ProfanitySeverity.NONE,
|
|
513
1378
|
positions: [],
|
|
514
1379
|
};
|
|
515
1380
|
}
|
|
516
|
-
// Check cache first if enabled
|
|
517
|
-
if (
|
|
518
|
-
|
|
1381
|
+
// Check cache first if enabled (refresh recency for LRU eviction)
|
|
1382
|
+
if (this.resultCache) {
|
|
1383
|
+
const cached = this.resultCache.get(validatedText);
|
|
1384
|
+
if (cached) {
|
|
1385
|
+
this.resultCache.delete(validatedText);
|
|
1386
|
+
this.resultCache.set(validatedText, cached);
|
|
1387
|
+
return cached;
|
|
1388
|
+
}
|
|
519
1389
|
}
|
|
520
1390
|
let matches = [];
|
|
521
1391
|
const normalizedText = this.caseSensitive
|
|
522
1392
|
? validatedText
|
|
523
1393
|
: validatedText.toLowerCase();
|
|
524
|
-
//
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
this.findMatches(normalizedText, validatedText, matches);
|
|
549
|
-
if (this.enableLeetSpeak) {
|
|
550
|
-
const leetNormalized = this.normalizeLeetSpeak(normalizedText);
|
|
551
|
-
if (leetNormalized !== normalizedText) {
|
|
552
|
-
this.findMatches(leetNormalized, validatedText, matches);
|
|
553
|
-
}
|
|
554
|
-
}
|
|
555
|
-
break;
|
|
1394
|
+
// Scan the base text plus every triggered normalization variant
|
|
1395
|
+
// (unicode folding, leet speak, repeated-character collapse)
|
|
1396
|
+
for (const pass of this.buildScanPasses(normalizedText)) {
|
|
1397
|
+
switch (this.matchingAlgorithm) {
|
|
1398
|
+
case "aho-corasick":
|
|
1399
|
+
matches.push(...this.findMatchesWithAhoCorasick(pass.text, validatedText, pass.posMap));
|
|
1400
|
+
break;
|
|
1401
|
+
case "hybrid":
|
|
1402
|
+
matches.push(...this.findMatchesHybrid(pass.text, validatedText, pass.posMap));
|
|
1403
|
+
break;
|
|
1404
|
+
case "trie":
|
|
1405
|
+
default:
|
|
1406
|
+
this.findMatches(pass.text, validatedText, matches, pass.posMap);
|
|
1407
|
+
break;
|
|
1408
|
+
}
|
|
1409
|
+
}
|
|
1410
|
+
if (this.evasionMaskedChars) {
|
|
1411
|
+
matches.push(...this.findMaskedMatches(normalizedText, validatedText));
|
|
1412
|
+
}
|
|
1413
|
+
if (this.evasionSeparatedLetters) {
|
|
1414
|
+
matches.push(...this.findSeparatedMatches(normalizedText, validatedText));
|
|
1415
|
+
}
|
|
1416
|
+
if (this.evasionEmbeddedWords) {
|
|
1417
|
+
matches.push(...this.findEmbeddedMatches(normalizedText, validatedText));
|
|
556
1418
|
}
|
|
557
1419
|
// Apply context analysis if enabled
|
|
558
1420
|
if (this.contextAnalyzer) {
|
|
559
|
-
matches = this.applyContextAnalysis(validatedText, matches);
|
|
1421
|
+
matches = this.applyContextAnalysis(validatedText, matches, this.contextScoreThreshold);
|
|
560
1422
|
}
|
|
561
1423
|
const uniqueMatches = this.deduplicateMatches(matches);
|
|
562
1424
|
const detectedWords = uniqueMatches.map((m) => m.originalWord);
|
|
@@ -573,14 +1435,15 @@ export class AllProfanity {
|
|
|
573
1435
|
end: m.end,
|
|
574
1436
|
})),
|
|
575
1437
|
};
|
|
576
|
-
// Cache result if caching is enabled
|
|
1438
|
+
// Cache result if caching is enabled (evict least recently used entry)
|
|
577
1439
|
if (this.resultCache) {
|
|
578
|
-
this.resultCache.
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
1440
|
+
if (this.resultCache.size >= this.cacheMaxSize) {
|
|
1441
|
+
const oldestKey = this.resultCache.keys().next().value;
|
|
1442
|
+
if (oldestKey !== undefined) {
|
|
1443
|
+
this.resultCache.delete(oldestKey);
|
|
1444
|
+
}
|
|
583
1445
|
}
|
|
1446
|
+
this.resultCache.set(validatedText, result);
|
|
584
1447
|
}
|
|
585
1448
|
return result;
|
|
586
1449
|
}
|
|
@@ -590,12 +1453,14 @@ export class AllProfanity {
|
|
|
590
1453
|
* @param originalText - The original text.
|
|
591
1454
|
* @param matches - Array to collect matches.
|
|
592
1455
|
*/
|
|
593
|
-
findMatches(searchText, originalText, matches) {
|
|
1456
|
+
findMatches(searchText, originalText, matches, posMap) {
|
|
594
1457
|
for (let i = 0; i < searchText.length; i++) {
|
|
595
1458
|
const matchResults = this.profanityTrie.findMatches(searchText, i, this.detectPartialWords);
|
|
596
1459
|
for (const match of matchResults) {
|
|
597
|
-
const
|
|
598
|
-
const
|
|
1460
|
+
const searchStart = i + match.start;
|
|
1461
|
+
const searchEnd = i + match.end;
|
|
1462
|
+
const start = posMap ? posMap.starts[searchStart] : searchStart;
|
|
1463
|
+
const end = posMap ? posMap.ends[searchEnd - 1] : searchEnd;
|
|
599
1464
|
if (!this.detectPartialWords &&
|
|
600
1465
|
!this.isWholeWord(originalText, start, end)) {
|
|
601
1466
|
continue;
|
|
@@ -604,6 +1469,9 @@ export class AllProfanity {
|
|
|
604
1469
|
if (this.isWhitelistedMatch(match.word, matchedText)) {
|
|
605
1470
|
continue;
|
|
606
1471
|
}
|
|
1472
|
+
if (this.isWhitelistedContainingWord(originalText, start, end)) {
|
|
1473
|
+
continue;
|
|
1474
|
+
}
|
|
607
1475
|
if (this.hasWordBoundaries(originalText, start, end)) {
|
|
608
1476
|
matches.push({
|
|
609
1477
|
word: match.word,
|
|
@@ -636,18 +1504,149 @@ export class AllProfanity {
|
|
|
636
1504
|
return result;
|
|
637
1505
|
}
|
|
638
1506
|
/**
|
|
639
|
-
*
|
|
640
|
-
*
|
|
641
|
-
* @
|
|
1507
|
+
* Quick boolean check for profanity presence in text.
|
|
1508
|
+
*
|
|
1509
|
+
* @param {string} text - The text to check for profanity
|
|
1510
|
+
* @returns {boolean} True if profanity is detected, false otherwise
|
|
1511
|
+
*
|
|
1512
|
+
* @throws {TypeError} If text is not a string
|
|
1513
|
+
*
|
|
1514
|
+
* @remarks
|
|
1515
|
+
* - Convenience method that internally calls `detect()` and returns only the boolean result
|
|
1516
|
+
* - For detailed information about matches, use `detect()` instead
|
|
1517
|
+
* - Results are cached if caching is enabled (same cache as `detect()`)
|
|
1518
|
+
*
|
|
1519
|
+
* @example
|
|
1520
|
+
* ```typescript
|
|
1521
|
+
* const filter = new AllProfanity();
|
|
1522
|
+
*
|
|
1523
|
+
* if (filter.check("This has bad words")) {
|
|
1524
|
+
* console.log("Profanity detected!");
|
|
1525
|
+
* }
|
|
1526
|
+
*
|
|
1527
|
+
* // Quick validation
|
|
1528
|
+
* const isClean = !filter.check(userInput);
|
|
1529
|
+
* ```
|
|
1530
|
+
*
|
|
1531
|
+
* @see {@link detect} for detailed profanity analysis
|
|
642
1532
|
*/
|
|
643
1533
|
check(text) {
|
|
644
|
-
|
|
1534
|
+
const validatedText = validateString(text, "text");
|
|
1535
|
+
if (validatedText.length === 0)
|
|
1536
|
+
return false;
|
|
1537
|
+
// Reuse a cached full result when available
|
|
1538
|
+
if (this.resultCache) {
|
|
1539
|
+
const cached = this.resultCache.get(validatedText);
|
|
1540
|
+
if (cached)
|
|
1541
|
+
return cached.hasProfanity;
|
|
1542
|
+
}
|
|
1543
|
+
// Context analysis scores matches against their surroundings; reuse the
|
|
1544
|
+
// full pipeline so check() and detect() can never disagree.
|
|
1545
|
+
if (this.contextAnalyzer) {
|
|
1546
|
+
return this.detect(validatedText).hasProfanity;
|
|
1547
|
+
}
|
|
1548
|
+
const normalizedText = this.caseSensitive
|
|
1549
|
+
? validatedText
|
|
1550
|
+
: validatedText.toLowerCase();
|
|
1551
|
+
// Early exit on the first accepted match — unlike detect(), no positions,
|
|
1552
|
+
// severity or cleaned text are computed. The base text is scanned before
|
|
1553
|
+
// any normalization variants are built, so plainly profane text returns
|
|
1554
|
+
// without paying for normalization at all.
|
|
1555
|
+
if (this.hasMatchInPass(normalizedText, validatedText)) {
|
|
1556
|
+
return true;
|
|
1557
|
+
}
|
|
1558
|
+
const passes = this.buildScanPasses(normalizedText);
|
|
1559
|
+
for (let p = 1; p < passes.length; p++) {
|
|
1560
|
+
if (this.hasMatchInPass(passes[p].text, validatedText, passes[p].posMap)) {
|
|
1561
|
+
return true;
|
|
1562
|
+
}
|
|
1563
|
+
}
|
|
1564
|
+
if (this.evasionMaskedChars &&
|
|
1565
|
+
this.findMaskedMatches(normalizedText, validatedText).length > 0) {
|
|
1566
|
+
return true;
|
|
1567
|
+
}
|
|
1568
|
+
if (this.evasionSeparatedLetters &&
|
|
1569
|
+
this.findSeparatedMatches(normalizedText, validatedText).length > 0) {
|
|
1570
|
+
return true;
|
|
1571
|
+
}
|
|
1572
|
+
if (this.evasionEmbeddedWords &&
|
|
1573
|
+
this.findEmbeddedMatches(normalizedText, validatedText).length > 0) {
|
|
1574
|
+
return true;
|
|
1575
|
+
}
|
|
1576
|
+
return false;
|
|
645
1577
|
}
|
|
646
1578
|
/**
|
|
647
|
-
*
|
|
648
|
-
*
|
|
649
|
-
|
|
650
|
-
|
|
1579
|
+
* Trie scan that stops at the first match surviving the whole-word,
|
|
1580
|
+
* whitelist and boundary checks. Powers the fast path in check().
|
|
1581
|
+
*/
|
|
1582
|
+
hasMatchInPass(searchText, originalText, posMap) {
|
|
1583
|
+
for (let i = 0; i < searchText.length; i++) {
|
|
1584
|
+
const matchResults = this.profanityTrie.findMatches(searchText, i, this.detectPartialWords);
|
|
1585
|
+
for (const match of matchResults) {
|
|
1586
|
+
const searchEnd = i + match.end;
|
|
1587
|
+
const start = posMap ? posMap.starts[i] : i;
|
|
1588
|
+
const end = posMap ? posMap.ends[searchEnd - 1] : searchEnd;
|
|
1589
|
+
if (!this.detectPartialWords &&
|
|
1590
|
+
!this.isWholeWord(originalText, start, end)) {
|
|
1591
|
+
continue;
|
|
1592
|
+
}
|
|
1593
|
+
const matchedText = originalText.substring(start, end);
|
|
1594
|
+
if (this.isWhitelistedMatch(match.word, matchedText)) {
|
|
1595
|
+
continue;
|
|
1596
|
+
}
|
|
1597
|
+
if (this.isWhitelistedContainingWord(originalText, start, end)) {
|
|
1598
|
+
continue;
|
|
1599
|
+
}
|
|
1600
|
+
if (this.hasWordBoundaries(originalText, start, end)) {
|
|
1601
|
+
return true;
|
|
1602
|
+
}
|
|
1603
|
+
}
|
|
1604
|
+
}
|
|
1605
|
+
return false;
|
|
1606
|
+
}
|
|
1607
|
+
/**
|
|
1608
|
+
* Cleans text by replacing profanity with a placeholder character.
|
|
1609
|
+
*
|
|
1610
|
+
* @param {string} text - The text to clean
|
|
1611
|
+
* @param {string} [placeholder] - Optional custom placeholder character (uses default if not provided)
|
|
1612
|
+
* @returns {string} The cleaned text with profanity replaced
|
|
1613
|
+
*
|
|
1614
|
+
* @throws {TypeError} If text is not a string
|
|
1615
|
+
*
|
|
1616
|
+
* @remarks
|
|
1617
|
+
* ### Character-level Replacement:
|
|
1618
|
+
* - Each profane character is replaced individually
|
|
1619
|
+
* - "bad" with placeholder "*" becomes "***"
|
|
1620
|
+
* - Preserves text length and structure
|
|
1621
|
+
*
|
|
1622
|
+
* ### Placeholder Behavior:
|
|
1623
|
+
* - If no placeholder provided, uses the instance's default placeholder
|
|
1624
|
+
* - If placeholder provided, uses only the first character
|
|
1625
|
+
* - Empty placeholder throws error
|
|
1626
|
+
*
|
|
1627
|
+
* @example
|
|
1628
|
+
* ```typescript
|
|
1629
|
+
* const filter = new AllProfanity();
|
|
1630
|
+
*
|
|
1631
|
+
* // Using default placeholder (*)
|
|
1632
|
+
* const cleaned = filter.clean("This has bad words");
|
|
1633
|
+
* console.log(cleaned); // "This has *** *****"
|
|
1634
|
+
*
|
|
1635
|
+
* // Using custom placeholder
|
|
1636
|
+
* const cleanedCustom = filter.clean("This has bad words", "#");
|
|
1637
|
+
* console.log(cleanedCustom); // "This has ### #####"
|
|
1638
|
+
* ```
|
|
1639
|
+
*
|
|
1640
|
+
* @example
|
|
1641
|
+
* ```typescript
|
|
1642
|
+
* // Clean user-generated content for display
|
|
1643
|
+
* const userComment = "Some inappropriate words here";
|
|
1644
|
+
* const safeComment = filter.clean(userComment);
|
|
1645
|
+
* displayComment(safeComment);
|
|
1646
|
+
* ```
|
|
1647
|
+
*
|
|
1648
|
+
* @see {@link cleanWithPlaceholder} for word-level replacement
|
|
1649
|
+
* @see {@link setPlaceholder} to change default placeholder
|
|
651
1650
|
*/
|
|
652
1651
|
clean(text, placeholder) {
|
|
653
1652
|
const detection = this.detect(text);
|
|
@@ -663,9 +1662,10 @@ export class AllProfanity {
|
|
|
663
1662
|
originalWord: text.substring(p.start, p.end),
|
|
664
1663
|
}))),
|
|
665
1664
|
].sort((a, b) => b.start - a.start);
|
|
1665
|
+
const placeholderChar = placeholder.charAt(0);
|
|
666
1666
|
for (const pos of sortedPositions) {
|
|
667
1667
|
const originalWord = text.substring(pos.start, pos.end);
|
|
668
|
-
const replacement =
|
|
1668
|
+
const replacement = placeholderChar.repeat(originalWord.length);
|
|
669
1669
|
result =
|
|
670
1670
|
result.substring(0, pos.start) +
|
|
671
1671
|
replacement +
|
|
@@ -674,10 +1674,46 @@ export class AllProfanity {
|
|
|
674
1674
|
return result;
|
|
675
1675
|
}
|
|
676
1676
|
/**
|
|
677
|
-
*
|
|
678
|
-
*
|
|
679
|
-
* @param
|
|
680
|
-
* @
|
|
1677
|
+
* Cleans text by replacing each profane word with a single placeholder string (word-level replacement).
|
|
1678
|
+
*
|
|
1679
|
+
* @param {string} text - The text to clean
|
|
1680
|
+
* @param {string} [placeholder="***"] - The placeholder string to use for each profane word
|
|
1681
|
+
* @returns {string} The cleaned text with each profane word replaced by the placeholder
|
|
1682
|
+
*
|
|
1683
|
+
* @throws {TypeError} If text is not a string
|
|
1684
|
+
*
|
|
1685
|
+
* @remarks
|
|
1686
|
+
* ### Word-level Replacement:
|
|
1687
|
+
* - Each profane word is replaced with the entire placeholder string (not character-by-character)
|
|
1688
|
+
* - "bad words" with placeholder "***" becomes "*** ***"
|
|
1689
|
+
* - Does NOT preserve original text length
|
|
1690
|
+
*
|
|
1691
|
+
* ### Difference from `clean()`:
|
|
1692
|
+
* - `clean()`: Character-level replacement - "bad" becomes "***" (preserves length)
|
|
1693
|
+
* - `cleanWithPlaceholder()`: Word-level replacement - "bad" becomes "***" (fixed placeholder)
|
|
1694
|
+
*
|
|
1695
|
+
* @example
|
|
1696
|
+
* ```typescript
|
|
1697
|
+
* const filter = new AllProfanity();
|
|
1698
|
+
*
|
|
1699
|
+
* // Default placeholder (***)
* const text = "This has bad words";
|
|
1700
|
+
* const cleaned = filter.cleanWithPlaceholder(text);
|
|
1701
|
+
* console.log(cleaned); // "This has *** ***"
|
|
1702
|
+
*
|
|
1703
|
+
* // Custom placeholder
|
|
1704
|
+
* const cleaned2 = filter.cleanWithPlaceholder(text, "[CENSORED]");
|
|
1705
|
+
* console.log(cleaned2); // "This has [CENSORED] [CENSORED]"
|
|
1706
|
+
* ```
|
|
1707
|
+
*
|
|
1708
|
+
* @example
|
|
1709
|
+
* ```typescript
|
|
1710
|
+
* // Censoring chat messages
|
|
1711
|
+
* const message = "You are a badword and stupid";
|
|
1712
|
+
* const censored = filter.cleanWithPlaceholder(message, "[***]");
|
|
1713
|
+
* // Result: "You are a [***] and [***]"
|
|
1714
|
+
* ```
|
|
1715
|
+
*
|
|
1716
|
+
* @see {@link clean} for character-level replacement
|
|
681
1717
|
*/
|
|
682
1718
|
cleanWithPlaceholder(text, placeholder = "***") {
|
|
683
1719
|
const detection = this.detect(text);
|
|
@@ -703,51 +1739,144 @@ export class AllProfanity {
|
|
|
703
1739
|
return result;
|
|
704
1740
|
}
|
|
705
1741
|
/**
|
|
706
|
-
*
|
|
707
|
-
*
|
|
1742
|
+
* Dynamically adds one or more words to the profanity filter at runtime.
|
|
1743
|
+
*
|
|
1744
|
+
* @param {string | string[]} word - A single word or array of words to add to the filter
|
|
1745
|
+
* @returns {void}
|
|
1746
|
+
*
|
|
1747
|
+
* @remarks
|
|
1748
|
+
* ### Behavior:
|
|
1749
|
+
* - Words are added to all active data structures (Trie, Aho-Corasick, Bloom Filter)
|
|
1750
|
+
* - Automatically normalizes words based on caseSensitive setting
|
|
1751
|
+
* - Skips whitelisted words
|
|
1752
|
+
* - Validates and filters out non-string or empty values
|
|
1753
|
+
* - Changes take effect immediately for subsequent detect/check/clean calls
|
|
1754
|
+
*
|
|
1755
|
+
* ### Use Cases:
|
|
1756
|
+
* - Adding context-specific profanity
|
|
1757
|
+
* - Building dynamic word lists from user reports
|
|
1758
|
+
* - Customizing filters for specific communities/applications
|
|
1759
|
+
*
|
|
1760
|
+
* @example
|
|
1761
|
+
* ```typescript
|
|
1762
|
+
* const filter = new AllProfanity();
|
|
1763
|
+
*
|
|
1764
|
+
* // Add single word
|
|
1765
|
+
* filter.add('newbadword');
|
|
1766
|
+
*
|
|
1767
|
+
* // Add multiple words
|
|
1768
|
+
* filter.add(['word1', 'word2', 'word3']);
|
|
1769
|
+
*
|
|
1770
|
+
* // Now these words will be detected
|
|
1771
|
+
* filter.check('newbadword'); // true
|
|
1772
|
+
* ```
|
|
1773
|
+
*
|
|
1774
|
+
* @example
|
|
1775
|
+
* ```typescript
|
|
1776
|
+
* // Add game-specific slang dynamically
|
|
1777
|
+
* const filter = new AllProfanity();
|
|
1778
|
+
* const gamingSlang = ['noob', 'trash', 'tryhard'];
|
|
1779
|
+
* filter.add(gamingSlang);
|
|
1780
|
+
*
|
|
1781
|
+
* const message = "You're such a noob";
|
|
1782
|
+
* console.log(filter.check(message)); // true
|
|
1783
|
+
* ```
|
|
1784
|
+
*
|
|
1785
|
+
* @see {@link remove} to remove words
|
|
1786
|
+
* @see {@link loadCustomDictionary} for loading named dictionaries
|
|
708
1787
|
*/
|
|
709
1788
|
add(word) {
|
|
710
1789
|
const words = Array.isArray(word) ? word : [word];
|
|
711
|
-
const validatedWords = validateStringArray(words, "words to add");
|
|
1790
|
+
const validatedWords = validateStringArray(words, "words to add", this.logger);
|
|
712
1791
|
for (const w of validatedWords) {
|
|
713
1792
|
this.dynamicWords.add(w);
|
|
714
1793
|
this.addWordToTrie(w);
|
|
715
1794
|
}
|
|
1795
|
+
this.invalidateCache();
|
|
716
1796
|
}
|
|
717
1797
|
/**
|
|
718
|
-
*
|
|
719
|
-
*
|
|
1798
|
+
* Dynamically removes one or more words from the profanity filter at runtime.
|
|
1799
|
+
*
|
|
1800
|
+
* @param {string | string[]} word - A single word or array of words to remove from the filter
|
|
1801
|
+
* @returns {void}
|
|
1802
|
+
*
|
|
1803
|
+
* @remarks
|
|
1804
|
+
* ### Behavior:
|
|
1805
|
+
* - Removes words from all active data structures (Trie, dynamic words set)
|
|
1806
|
+
* - Normalizes words based on caseSensitive setting before removal
|
|
1807
|
+
* - Only removes dynamically added words, not words from loaded language dictionaries
|
|
1808
|
+
* - Changes take effect immediately for subsequent detect/check/clean calls
|
|
1809
|
+
*
|
|
1810
|
+
* ### Important Notes:
|
|
1811
|
+
* - Cannot remove words from built-in language dictionaries
|
|
1812
|
+
* - To exclude dictionary words, use `addToWhitelist()` instead
|
|
1813
|
+
* - Validates and filters out non-string or empty values
|
|
1814
|
+
*
|
|
1815
|
+
* @example
|
|
1816
|
+
* ```typescript
|
|
1817
|
+
* const filter = new AllProfanity();
|
|
1818
|
+
*
|
|
1819
|
+
* // Add then remove a word
|
|
1820
|
+
* filter.add('tempword');
|
|
1821
|
+
* filter.check('tempword'); // true
|
|
1822
|
+
*
|
|
1823
|
+
* filter.remove('tempword');
|
|
1824
|
+
* filter.check('tempword'); // false
|
|
1825
|
+
*
|
|
1826
|
+
* // Remove multiple words
|
|
1827
|
+
* filter.remove(['word1', 'word2']);
|
|
1828
|
+
* ```
|
|
1829
|
+
*
|
|
1830
|
+
* @example
|
|
1831
|
+
* ```typescript
|
|
1832
|
+
* // Managing custom word list
|
|
1833
|
+
* const filter = new AllProfanity();
|
|
1834
|
+
* filter.add(['custom1', 'custom2', 'custom3']);
|
|
1835
|
+
*
|
|
1836
|
+
* // Later, remove one that's no longer needed
|
|
1837
|
+
* filter.remove('custom2');
|
|
1838
|
+
* ```
|
|
1839
|
+
*
|
|
1840
|
+
* @see {@link add} to add words
|
|
1841
|
+
* @see {@link addToWhitelist} to exclude dictionary words without removing them
|
|
720
1842
|
*/
|
|
721
1843
|
remove(word) {
|
|
1844
|
+
var _a;
|
|
722
1845
|
const words = Array.isArray(word) ? word : [word];
|
|
723
|
-
const validatedWords = validateStringArray(words, "words to remove");
|
|
1846
|
+
const validatedWords = validateStringArray(words, "words to remove", this.logger);
|
|
724
1847
|
for (const w of validatedWords) {
|
|
725
1848
|
const normalizedWord = this.caseSensitive ? w : w.toLowerCase();
|
|
726
1849
|
this.profanityTrie.removeWord(normalizedWord);
|
|
727
1850
|
this.dynamicWords.delete(w);
|
|
1851
|
+
// Bloom filter entries cannot be deleted, but stale entries only cost a
|
|
1852
|
+
// skipped quick-rejection — they can never produce a match by themselves.
|
|
1853
|
+
(_a = this.ahoCorasickAutomaton) === null || _a === void 0 ? void 0 : _a.removePattern(normalizedWord);
|
|
728
1854
|
}
|
|
1855
|
+
this.invalidateCache();
|
|
729
1856
|
}
|
|
730
1857
|
/**
|
|
731
1858
|
* Add words to the whitelist.
|
|
732
1859
|
* @param words - Words to whitelist.
|
|
733
1860
|
*/
|
|
734
1861
|
addToWhitelist(words) {
|
|
735
|
-
const validatedWords = validateStringArray(words, "whitelist words");
|
|
1862
|
+
const validatedWords = validateStringArray(words, "whitelist words", this.logger);
|
|
736
1863
|
for (const word of validatedWords) {
|
|
737
1864
|
const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
|
|
738
1865
|
this.whitelistSet.add(normalizedWord);
|
|
739
1866
|
}
|
|
1867
|
+
this.invalidateCache();
|
|
740
1868
|
}
|
|
741
1869
|
/**
|
|
742
1870
|
* Remove words from the whitelist.
|
|
743
1871
|
* @param words - Words to remove from whitelist.
|
|
744
1872
|
*/
|
|
745
1873
|
removeFromWhitelist(words) {
|
|
746
|
-
const validatedWords = validateStringArray(words, "whitelist words");
|
|
1874
|
+
const validatedWords = validateStringArray(words, "whitelist words", this.logger);
|
|
747
1875
|
for (const word of validatedWords) {
|
|
748
1876
|
const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
|
|
749
1877
|
this.whitelistSet.delete(normalizedWord);
|
|
750
1878
|
}
|
|
1879
|
+
this.invalidateCache();
|
|
751
1880
|
}
|
|
752
1881
|
/**
|
|
753
1882
|
* Check if a word is whitelisted.
|
|
@@ -759,9 +1888,60 @@ export class AllProfanity {
|
|
|
759
1888
|
return this.whitelistSet.has(normalizedWord);
|
|
760
1889
|
}
|
|
761
1890
|
/**
|
|
762
|
-
*
|
|
763
|
-
*
|
|
764
|
-
* @
|
|
1891
|
+
* Loads a built-in language dictionary into the profanity filter.
|
|
1892
|
+
*
|
|
1893
|
+
* @param {string} language - The language key to load (case-insensitive)
|
|
1894
|
+
* @returns {boolean} True if the language was loaded successfully or was already loaded, false if it was not found
|
|
1895
|
+
*
|
|
1896
|
+
* @remarks
|
|
1897
|
+
* ### Available Languages:
|
|
1898
|
+
* - `'english'` - English profanity words
|
|
1899
|
+
* - `'hindi'` - Hindi profanity words
|
|
1900
|
+
* - `'french'` - French profanity words
|
|
1901
|
+
* - `'german'` - German profanity words
|
|
1902
|
+
* - `'spanish'` - Spanish profanity words
|
|
1903
|
+
* - `'bengali'` - Bengali profanity words
|
|
1904
|
+
* - `'tamil'` - Tamil profanity words
|
|
1905
|
+
* - `'telugu'` - Telugu profanity words
|
|
1906
|
+
* - `'brazilian'` - Brazilian Portuguese profanity words
|
|
1907
|
+
*
|
|
1908
|
+
* ### Behavior:
|
|
1909
|
+
* - Language keys are case-insensitive
|
|
1910
|
+
* - Loading is idempotent - calling multiple times for same language is safe
|
|
1911
|
+
* - Returns true if language loaded successfully or was already loaded
|
|
1912
|
+
* - Returns false if language not found
|
|
1913
|
+
* - Logs success/failure messages (unless silent mode enabled)
|
|
1914
|
+
* - Words are added to all active data structures
|
|
1915
|
+
*
|
|
1916
|
+
* ### Default Languages:
|
|
1917
|
+
* English and Hindi are loaded automatically in the constructor
|
|
1918
|
+
*
|
|
1919
|
+
* @example
|
|
1920
|
+
* ```typescript
|
|
1921
|
+
* const filter = new AllProfanity();
|
|
1922
|
+
*
|
|
1923
|
+
* // Load additional languages
|
|
1924
|
+
* filter.loadLanguage('french');
|
|
1925
|
+
* filter.loadLanguage('spanish');
|
|
1926
|
+
*
|
|
1927
|
+
* // Case-insensitive
|
|
1928
|
+
* filter.loadLanguage('GERMAN'); // Works
|
|
1929
|
+
*
|
|
1930
|
+
* // Check if loaded
|
|
1931
|
+
* console.log(filter.getLoadedLanguages()); // ['english', 'hindi', 'french', 'spanish', 'german']
|
|
1932
|
+
* ```
|
|
1933
|
+
*
|
|
1934
|
+
* @example
|
|
1935
|
+
* ```typescript
|
|
1936
|
+
* // Load all Indian languages at once
|
|
1937
|
+
* const filter = new AllProfanity();
|
|
1938
|
+
* filter.loadIndianLanguages();
|
|
1939
|
+
* ```
|
|
1940
|
+
*
|
|
1941
|
+
* @see {@link loadLanguages} to load multiple languages at once
|
|
1942
|
+
* @see {@link loadIndianLanguages} for convenience method
|
|
1943
|
+
* @see {@link getAvailableLanguages} to see all available languages
|
|
1944
|
+
* @see {@link getLoadedLanguages} to see currently loaded languages
|
|
765
1945
|
*/
|
|
766
1946
|
loadLanguage(language) {
|
|
767
1947
|
if (!language || typeof language !== "string") {
|
|
@@ -785,6 +1965,7 @@ export class AllProfanity {
|
|
|
785
1965
|
}
|
|
786
1966
|
}
|
|
787
1967
|
this.loadedLanguages.add(langKey);
|
|
1968
|
+
this.invalidateCache();
|
|
788
1969
|
this.logger.info(`Loaded ${addedCount} words from ${language} dictionary`);
|
|
789
1970
|
return true;
|
|
790
1971
|
}
|
|
@@ -799,7 +1980,7 @@ export class AllProfanity {
|
|
|
799
1980
|
* @returns Number of successfully loaded languages.
|
|
800
1981
|
*/
|
|
801
1982
|
loadLanguages(languages) {
|
|
802
|
-
const validatedLanguages = validateStringArray(languages, "languages");
|
|
1983
|
+
const validatedLanguages = validateStringArray(languages, "languages", this.logger);
|
|
803
1984
|
return validatedLanguages.reduce((count, lang) => {
|
|
804
1985
|
return this.loadLanguage(lang) ? count + 1 : count;
|
|
805
1986
|
}, 0);
|
|
@@ -813,13 +1994,68 @@ export class AllProfanity {
|
|
|
813
1994
|
return this.loadLanguages(indianLanguages);
|
|
814
1995
|
}
|
|
815
1996
|
/**
|
|
816
|
-
*
|
|
817
|
-
*
|
|
818
|
-
* @param
|
|
1997
|
+
* Loads a custom dictionary of profane words with a specific name.
|
|
1998
|
+
*
|
|
1999
|
+
* @param {string} name - Unique name/identifier for this custom dictionary
|
|
2000
|
+
* @param {string[]} words - Array of profane words to add to the dictionary
|
|
2001
|
+
* @returns {void}
|
|
2002
|
+
*
|
|
2003
|
+
* @throws {TypeError} If name is not a string or words is not an array
|
|
2004
|
+
*
|
|
2005
|
+
* @remarks
|
|
2006
|
+
* ### Behavior:
|
|
2007
|
+
* - Creates a new named dictionary or overwrites existing one with same name
|
|
2008
|
+
* - Validates and filters out non-string and empty values from words array
|
|
2009
|
+
* - Words are added to all active data structures (Trie, Aho-Corasick, Bloom Filter)
|
|
2010
|
+
* - Dictionary name is converted to lowercase for storage
|
|
2011
|
+
* - Logs count of loaded words (unless silent mode enabled)
|
|
2012
|
+
*
|
|
2013
|
+
* ### Use Cases:
|
|
2014
|
+
* - Domain-specific profanity (gaming, medical, legal, etc.)
|
|
2015
|
+
* - Organization-specific word lists
|
|
2016
|
+
* - Temporary or context-dependent filters
|
|
2017
|
+
* - Testing and development
|
|
2018
|
+
*
|
|
2019
|
+
* @example
|
|
2020
|
+
* ```typescript
|
|
2021
|
+
* const filter = new AllProfanity();
|
|
2022
|
+
*
|
|
2023
|
+
* // Load gaming-specific slang
|
|
2024
|
+
* filter.loadCustomDictionary('gaming', [
|
|
2025
|
+
* 'noob',
|
|
2026
|
+
* 'scrub',
|
|
2027
|
+
* 'tryhard',
|
|
2028
|
+
* 'trash'
|
|
2029
|
+
* ]);
|
|
2030
|
+
*
|
|
2031
|
+
* // Load company-specific terms
|
|
2032
|
+
* filter.loadCustomDictionary('company', [
|
|
2033
|
+
* 'competitor1',
|
|
2034
|
+
* 'bannedTerm1',
|
|
2035
|
+
* 'inappropriateJargon'
|
|
2036
|
+
* ]);
|
|
2037
|
+
*
|
|
2038
|
+
* console.log(filter.check('You are such a noob')); // true
|
|
2039
|
+
* ```
|
|
2040
|
+
*
|
|
2041
|
+
* @example
|
|
2042
|
+
* ```typescript
|
|
2043
|
+
* // Load from external source
|
|
2044
|
+
* const filter = new AllProfanity();
|
|
2045
|
+
*
|
|
2046
|
+
* async function loadExternalDictionary() {
|
|
2047
|
+
* const response = await fetch('https://example.com/custom-words.json');
|
|
2048
|
+
* const customWords = await response.json();
|
|
2049
|
+
* filter.loadCustomDictionary('external', customWords);
|
|
2050
|
+
* }
|
|
2051
|
+
* ```
|
|
2052
|
+
*
|
|
2053
|
+
* @see {@link add} for adding individual words dynamically
|
|
2054
|
+
* @see {@link loadLanguage} for loading built-in language dictionaries
|
|
819
2055
|
*/
|
|
820
2056
|
loadCustomDictionary(name, words) {
|
|
821
2057
|
validateString(name, "dictionary name");
|
|
822
|
-
const validatedWords = validateStringArray(words, "custom dictionary words");
|
|
2058
|
+
const validatedWords = validateStringArray(words, "custom dictionary words", this.logger);
|
|
823
2059
|
if (validatedWords.length === 0) {
|
|
824
2060
|
this.logger.warn(`Custom dictionary '${name}' contains no valid words`);
|
|
825
2061
|
return;
|
|
@@ -833,6 +2069,7 @@ export class AllProfanity {
|
|
|
833
2069
|
}
|
|
834
2070
|
this.availableLanguages[name.toLowerCase()] = validatedWords;
|
|
835
2071
|
this.loadedLanguages.add(name.toLowerCase());
|
|
2072
|
+
this.invalidateCache();
|
|
836
2073
|
this.logger.info(`Loaded ${addedCount} words from custom dictionary '${name}'`);
|
|
837
2074
|
}
|
|
838
2075
|
catch (error) {
|
|
@@ -856,9 +2093,17 @@ export class AllProfanity {
|
|
|
856
2093
|
}
|
|
857
2094
|
// Add to Trie (always used as fallback)
|
|
858
2095
|
this.profanityTrie.addWord(normalizedWord);
|
|
859
|
-
// Add to Bloom Filter if enabled
|
|
2096
|
+
// Add to Bloom Filter if enabled. Constituent tokens of multi-word or
|
|
2097
|
+
// symbol-containing entries are added too, so the token-level quick
|
|
2098
|
+
// rejection in bloomQuickReject() can never miss a phrase.
|
|
860
2099
|
if (this.bloomFilter) {
|
|
861
2100
|
this.bloomFilter.add(normalizedWord);
|
|
2101
|
+
const tokens = normalizedWord.split(/[^\p{L}\p{N}]+/u);
|
|
2102
|
+
for (const token of tokens) {
|
|
2103
|
+
if (token.length > 0 && token !== normalizedWord) {
|
|
2104
|
+
this.bloomFilter.add(token);
|
|
2105
|
+
}
|
|
2106
|
+
}
|
|
862
2107
|
}
|
|
863
2108
|
// Add to Aho-Corasick automaton if enabled
|
|
864
2109
|
if (this.ahoCorasickAutomaton) {
|
|
@@ -873,7 +2118,7 @@ export class AllProfanity {
|
|
|
873
2118
|
*/
|
|
874
2119
|
calculateSeverity(matches) {
|
|
875
2120
|
if (matches.length === 0)
|
|
876
|
-
return ProfanitySeverity.
|
|
2121
|
+
return ProfanitySeverity.NONE;
|
|
877
2122
|
const uniqueWords = new Set(matches.map((m) => m.word)).size;
|
|
878
2123
|
const totalMatches = matches.length;
|
|
879
2124
|
if (totalMatches >= 5 || uniqueWords >= 4)
|
|
@@ -888,9 +2133,13 @@ export class AllProfanity {
|
|
|
888
2133
|
* Clear all loaded dictionaries and dynamic words.
|
|
889
2134
|
*/
|
|
890
2135
|
clearList() {
|
|
2136
|
+
var _a, _b;
|
|
891
2137
|
this.profanityTrie.clear();
|
|
892
2138
|
this.loadedLanguages.clear();
|
|
893
2139
|
this.dynamicWords.clear();
|
|
2140
|
+
(_a = this.ahoCorasickAutomaton) === null || _a === void 0 ? void 0 : _a.clear();
|
|
2141
|
+
(_b = this.bloomFilter) === null || _b === void 0 ? void 0 : _b.clear();
|
|
2142
|
+
this.invalidateCache();
|
|
894
2143
|
}
|
|
895
2144
|
/**
|
|
896
2145
|
* Set the placeholder character for filtered words.
|
|
@@ -902,6 +2151,7 @@ export class AllProfanity {
|
|
|
902
2151
|
throw new Error("Placeholder cannot be empty");
|
|
903
2152
|
}
|
|
904
2153
|
this.defaultPlaceholder = placeholder.charAt(0);
|
|
2154
|
+
this.invalidateCache();
|
|
905
2155
|
}
|
|
906
2156
|
/**
|
|
907
2157
|
* Get the list of loaded languages.
|
|
@@ -933,10 +2183,14 @@ export class AllProfanity {
|
|
|
933
2183
|
};
|
|
934
2184
|
}
|
|
935
2185
|
/**
|
|
936
|
-
* Rebuild
|
|
2186
|
+
* Rebuild all matching structures (trie, Aho-Corasick automaton, Bloom
|
|
2187
|
+
* Filter) from loaded dictionaries and dynamic words.
|
|
937
2188
|
*/
|
|
938
|
-
|
|
2189
|
+
rebuildIndexes() {
|
|
2190
|
+
var _a, _b;
|
|
939
2191
|
this.profanityTrie.clear();
|
|
2192
|
+
(_a = this.ahoCorasickAutomaton) === null || _a === void 0 ? void 0 : _a.clear();
|
|
2193
|
+
(_b = this.bloomFilter) === null || _b === void 0 ? void 0 : _b.clear();
|
|
940
2194
|
for (const lang of this.loadedLanguages) {
|
|
941
2195
|
const words = this.availableLanguages[lang] || [];
|
|
942
2196
|
for (const word of words) {
|
|
@@ -946,6 +2200,7 @@ export class AllProfanity {
|
|
|
946
2200
|
for (const word of this.dynamicWords) {
|
|
947
2201
|
this.addWordToTrie(word);
|
|
948
2202
|
}
|
|
2203
|
+
this.invalidateCache();
|
|
949
2204
|
}
|
|
950
2205
|
/**
|
|
951
2206
|
* Update configuration options for the profanity filter.
|
|
@@ -974,8 +2229,9 @@ export class AllProfanity {
|
|
|
974
2229
|
this.addToWhitelist(options.whitelistWords);
|
|
975
2230
|
}
|
|
976
2231
|
if (rebuildNeeded) {
|
|
977
|
-
this.
|
|
2232
|
+
this.rebuildIndexes();
|
|
978
2233
|
}
|
|
2234
|
+
this.invalidateCache();
|
|
979
2235
|
}
|
|
980
2236
|
/**
|
|
981
2237
|
* Create an AllProfanity instance from a configuration object.
|
|
@@ -992,8 +2248,12 @@ export class AllProfanity {
|
|
|
992
2248
|
options.ahoCorasick = config.ahoCorasick;
|
|
993
2249
|
if (config.contextAnalysis)
|
|
994
2250
|
options.contextAnalysis = config.contextAnalysis;
|
|
2251
|
+
if (config.evasionProtection)
|
|
2252
|
+
options.evasionProtection = config.evasionProtection;
|
|
995
2253
|
if (config.performance)
|
|
996
2254
|
options.performance = config.performance;
|
|
2255
|
+
if (config.silent !== undefined)
|
|
2256
|
+
options.silent = config.silent;
|
|
997
2257
|
if (config.profanityDetection) {
|
|
998
2258
|
options.enableLeetSpeak = config.profanityDetection.enableLeetSpeak;
|
|
999
2259
|
options.caseSensitive = config.profanityDetection.caseSensitive;
|
|
@@ -1024,7 +2284,8 @@ export class AllProfanity {
|
|
|
1024
2284
|
}
|
|
1025
2285
|
/**
|
|
1026
2286
|
* Singleton instance of AllProfanity with default configuration.
|
|
2287
|
+
* Created with `silent: true` so that importing the library never writes to the console.
|
|
1027
2288
|
*/
|
|
1028
|
-
const allProfanity = new AllProfanity();
|
|
2289
|
+
const allProfanity = new AllProfanity({ silent: true });
|
|
1029
2290
|
export default allProfanity;
|
|
1030
2291
|
//# sourceMappingURL=index.js.map
|