bekindprofanityfilter 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/index.js +6 -6
- package/dist/esm/languages/english-primary-all-languages.d.ts +0 -17
- package/dist/esm.min.js +8 -0
- package/package.json +5 -6
- package/dist/esm/algos/aho-corasick.js +0 -238
- package/dist/esm/algos/bloom-filter.js +0 -208
- package/dist/esm/algos/context-patterns.js +0 -415
- package/dist/esm/index.js +0 -2640
- package/dist/esm/innocence-scoring.js +0 -118
- package/dist/esm/language-detector.js +0 -952
- package/dist/esm/language-dicts.js +0 -2718
- package/dist/esm/languages/english-primary-all-languages.js +0 -36894
- package/dist/esm/romanization-detector.js +0 -779
package/package.json
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "bekindprofanityfilter",
|
|
3
|
-
"version": "0.0.5",
|
|
3
|
+
"version": "0.0.6",
|
|
4
4
|
"description": "A multi-language profanity filter with romanization detection, language-aware innocence scoring, leet-speak detection, and cross-language collision handling. Forked from AllProfanity.",
|
|
5
5
|
"main": "dist/cjs/index.js",
|
|
6
|
-
"module": "dist/esm/index.js",
|
|
6
|
+
"module": "dist/esm.min.js",
|
|
7
7
|
"types": "dist/esm/index.d.ts",
|
|
8
8
|
"type": "module",
|
|
9
9
|
"exports": {
|
|
10
10
|
".": {
|
|
11
11
|
"import": {
|
|
12
12
|
"types": "./dist/esm/index.d.ts",
|
|
13
|
-
"default": "./dist/esm/index.js"
|
|
13
|
+
"default": "./dist/esm.min.js"
|
|
14
14
|
},
|
|
15
15
|
"require": {
|
|
16
16
|
"types": "./dist/esm/index.d.ts",
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
}
|
|
20
20
|
},
|
|
21
21
|
"scripts": {
|
|
22
|
-
"build": "tsc && esbuild dist/esm/index.js --bundle --minify --platform=node --format=cjs --outfile=dist/cjs/index.js --packages=bundle && echo '{\"type\":\"commonjs\"}' > dist/cjs/package.json",
|
|
22
|
+
"build": "tsc && esbuild dist/esm/index.js --bundle --minify --platform=node --format=esm --outfile=dist/esm.min.js --packages=bundle && esbuild dist/esm/index.js --bundle --minify --platform=node --format=cjs --outfile=dist/cjs/index.js --packages=bundle && echo '{\"type\":\"commonjs\"}' > dist/cjs/package.json",
|
|
23
23
|
"test": "jest",
|
|
24
24
|
"test:watch": "jest --watch",
|
|
25
25
|
"test:coverage": "jest --coverage",
|
|
@@ -75,9 +75,8 @@
|
|
|
75
75
|
"typescript": "^4.0.0"
|
|
76
76
|
},
|
|
77
77
|
"files": [
|
|
78
|
-
"dist/esm/**/*.js",
|
|
78
|
+
"dist/esm.min.js",
|
|
79
79
|
"dist/esm/**/*.d.ts",
|
|
80
|
-
"!dist/esm/languages/*-words.js",
|
|
81
80
|
"!dist/esm/languages/*-words.d.ts",
|
|
82
81
|
"dist/cjs/index.js",
|
|
83
82
|
"dist/cjs/package.json",
|
|
@@ -1,238 +0,0 @@
|
|
|
1
|
-
/**
 * Aho-Corasick automaton for matching many patterns against a text in a
 * single left-to-right pass.
 *
 * Both patterns and text are walked one UTF-16 code unit at a time, so the
 * `start`/`end` offsets in reported matches are ordinary string indices that
 * can be passed straight to `String.prototype.slice`.
 */
export class AhoCorasick {
  /**
   * @param {string[]} [patterns=[]] - Initial patterns. When the list is
   *   non-empty the automaton is built immediately; otherwise it is built
   *   lazily on the first search.
   */
  constructor(patterns = []) {
    this.compiled = false;
    this.patterns = [...patterns];
    this.root = this.createNode();
    if (patterns.length > 0) {
      this.buildAutomaton();
    }
  }

  /**
   * Create a new, empty trie node.
   *
   * @returns {{children: Map, output: string[], outputIndices: number[], failure: (object|null), isEndOfPattern: boolean}}
   */
  createNode() {
    return {
      children: new Map(),
      output: [],
      outputIndices: [],
      failure: null,
      isEndOfPattern: false,
    };
  }

  /**
   * Append several patterns. The automaton is rebuilt on the next search.
   *
   * @param {string[]} patterns
   */
  addPatterns(patterns) {
    this.patterns.push(...patterns);
    this.compiled = false;
  }

  /**
   * Append a single pattern. Empty or missing patterns are ignored.
   *
   * @param {string} pattern
   */
  addPattern(pattern) {
    if (pattern && pattern.length > 0) {
      this.patterns.push(pattern);
      this.compiled = false;
    }
  }

  /**
   * (Re)build the trie, failure links and output links from `this.patterns`.
   */
  buildAutomaton() {
    this.buildTrie();
    this.buildFailureLinks();
    this.buildOutputLinks();
    this.compiled = true;
  }

  /**
   * Build the trie from scratch, one node per UTF-16 code unit.
   */
  buildTrie() {
    this.root = this.createNode();
    for (let i = 0; i < this.patterns.length; i++) {
      const pattern = this.patterns[i];
      // An empty pattern would mark the root as terminal and then leak into
      // every node via the output links, making every position a "match".
      // addPattern() already rejects these; skip them here as well so the
      // constructor and addPatterns() behave the same. The pattern keeps its
      // slot in this.patterns, so patternIndex values stay stable.
      if (!pattern || pattern.length === 0) {
        continue;
      }
      let current = this.root;
      // Index by code unit (not `for...of`, which yields code points) so the
      // trie keys line up with the `text[i]` lookups done while searching.
      for (let k = 0; k < pattern.length; k++) {
        const unit = pattern[k];
        let next = current.children.get(unit);
        if (next === undefined) {
          next = this.createNode();
          current.children.set(unit, next);
        }
        current = next;
      }
      current.isEndOfPattern = true;
      current.output.push(pattern);
      current.outputIndices.push(i);
    }
  }

  /**
   * Assign failure links breadth-first.
   */
  buildFailureLinks() {
    const queue = [];
    // Depth-1 nodes always fall back to the root.
    for (const child of this.root.children.values()) {
      child.failure = this.root;
      queue.push(child);
    }
    // A moving head index avoids the O(n) cost of Array.prototype.shift().
    for (let head = 0; head < queue.length; head++) {
      const current = queue[head];
      for (const [unit, child] of current.children) {
        queue.push(child);
        let fallback = current.failure;
        while (fallback !== null && !fallback.children.has(unit)) {
          fallback = fallback.failure;
        }
        child.failure = fallback ? fallback.children.get(unit) : this.root;
      }
    }
  }

  /**
   * Copy each failure target's outputs onto the node itself so a search only
   * has to inspect the current node. Breadth-first order guarantees that the
   * (shallower) failure target has already received its own merged outputs.
   */
  buildOutputLinks() {
    const queue = [...this.root.children.values()];
    for (let head = 0; head < queue.length; head++) {
      const current = queue[head];
      if (current.failure && current.failure.output.length > 0) {
        current.output.push(...current.failure.output);
        current.outputIndices.push(...current.failure.outputIndices);
      }
      for (const child of current.children.values()) {
        queue.push(child);
      }
    }
  }

  /**
   * Advance the automaton by one code unit.
   *
   * @param {object} state - Node the automaton is currently in.
   * @param {string} unit - Next code unit of the text.
   * @returns {object} The node reached after consuming `unit`.
   */
  nextState(state, unit) {
    let node = state;
    // Follow failure links until a transition exists or the root is reached.
    while (node !== this.root && !node.children.has(unit)) {
      node = node.failure;
    }
    const next = node.children.get(unit);
    return next === undefined ? node : next;
  }

  /**
   * Find every pattern occurrence in `text`, including overlapping ones.
   *
   * @param {string} text
   * @returns {{pattern: string, start: number, end: number, patternIndex: number}[]}
   *   Matches ordered by end position; `end` is exclusive and `patternIndex`
   *   is the pattern's index in the list returned by `getPatterns()`.
   */
  findAll(text) {
    if (!this.compiled) {
      this.buildAutomaton();
    }
    const matches = [];
    let current = this.root;
    for (let i = 0; i < text.length; i++) {
      current = this.nextState(current, text[i]);
      // Report all patterns that end at this position.
      for (let j = 0; j < current.output.length; j++) {
        const pattern = current.output[j];
        matches.push({
          pattern,
          start: i - pattern.length + 1,
          end: i + 1,
          patternIndex: current.outputIndices[j],
        });
      }
    }
    return matches;
  }

  /**
   * Check whether `text` contains at least one pattern.
   *
   * @param {string} text
   * @returns {boolean}
   */
  hasMatch(text) {
    if (!this.compiled) {
      this.buildAutomaton();
    }
    let current = this.root;
    for (let i = 0; i < text.length; i++) {
      current = this.nextState(current, text[i]);
      if (current.output.length > 0) {
        return true;
      }
    }
    return false;
  }

  /**
   * Find the match that finishes earliest in `text`.
   *
   * @param {string} text
   * @returns {{pattern: string, start: number, end: number, patternIndex: number}|null}
   *   The first output of the first node that has any, or `null` when nothing
   *   matches.
   */
  findFirst(text) {
    if (!this.compiled) {
      this.buildAutomaton();
    }
    let current = this.root;
    for (let i = 0; i < text.length; i++) {
      current = this.nextState(current, text[i]);
      if (current.output.length > 0) {
        const pattern = current.output[0];
        return {
          pattern,
          start: i - pattern.length + 1,
          end: i + 1,
          patternIndex: current.outputIndices[0],
        };
      }
    }
    return null;
  }

  /**
   * @returns {string[]} A copy of the stored pattern list.
   */
  getPatterns() {
    return [...this.patterns];
  }

  /**
   * Remove all patterns and reset the automaton to an empty, uncompiled state.
   */
  clear() {
    this.patterns = [];
    this.root = this.createNode();
    this.compiled = false;
  }

  /**
   * Summarize the automaton as currently built.
   *
   * @returns {{patternCount: number, nodeCount: number, averagePatternLength: number}}
   *   `nodeCount` includes the root; `averagePatternLength` is 0 when there
   *   are no patterns.
   */
  getStats() {
    const nodeCount = this.countNodes(this.root);
    const averagePatternLength =
      this.patterns.length > 0
        ? this.patterns.reduce((sum, p) => sum + p.length, 0) /
          this.patterns.length
        : 0;
    return {
      patternCount: this.patterns.length,
      nodeCount,
      averagePatternLength,
    };
  }

  /**
   * Count the nodes in the subtree rooted at `node`, including `node` itself.
   *
   * @param {object} node
   * @returns {number}
   */
  countNodes(node) {
    let count = 1;
    for (const child of node.children.values()) {
      count += this.countNodes(child);
    }
    return count;
  }
}
|
|
238
|
-
//# sourceMappingURL=aho-corasick.js.map
|
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
/**
 * Bloom filter: a fixed-size bit set supporting probabilistic membership
 * tests. `mightContain` never returns false for an item that was added, but
 * may return true for an item that was not.
 */
export class BloomFilter {
  /**
   * @param {number} expectedItems - Number of items the filter is sized for.
   *   Values below 1 (including 0 or a missing argument) are treated as 1 so
   *   that the filter always has at least one bit and one hash function.
   * @param {number} [falsePositiveRate=0.01] - Target false-positive rate.
   * @throws {RangeError} If the arguments do not yield a finite bit count
   *   (for example a `falsePositiveRate` of 0).
   */
  constructor(expectedItems, falsePositiveRate = 0.01) {
    this.itemCount = 0;
    this.size = this.calculateOptimalSize(expectedItems, falsePositiveRate);
    this.hashCount = this.calculateOptimalHashCount(this.size, expectedItems);
    this.bitArray = new Uint8Array(Math.ceil(this.size / 8));
  }

  /**
   * Compute the bit-array size as ceil(-n * ln(p) / ln(2)^2), never below 1.
   *
   * @param {number} n - Expected item count (values below 1 count as 1).
   * @param {number} p - Target false-positive rate.
   * @returns {number} Number of bits.
   * @throws {RangeError} If the result is not a finite number.
   */
  calculateOptimalSize(n, p) {
    const items = n >= 1 ? n : 1;
    const bits = Math.ceil((-items * Math.log(p)) / Math.log(2) ** 2);
    if (!Number.isFinite(bits)) {
      throw new RangeError(
        `Cannot size bloom filter for expectedItems=${n}, falsePositiveRate=${p}`,
      );
    }
    // A zero-bit filter has no hashes to test, which would make
    // mightContain() vacuously true for every item.
    return Math.max(1, bits);
  }

  /**
   * Compute the number of hash functions as ceil((m / n) * ln(2)), never
   * below 1.
   *
   * @param {number} m - Bit-array size.
   * @param {number} n - Expected item count (values below 1 count as 1).
   * @returns {number}
   */
  calculateOptimalHashCount(m, n) {
    const items = n >= 1 ? n : 1;
    return Math.max(1, Math.ceil((m / items) * Math.log(2)));
  }

  /**
   * Hash function 1 (FNV-1a variant).
   *
   * The multiplication is carried out in floating point rather than as a
   * wrapping 32-bit multiply, so the values differ from textbook FNV-1a. It
   * is deliberately left as-is: changing the produced values would make
   * filters previously saved with `toJSON()` unreadable.
   *
   * @param {string} item
   * @returns {number} Index in `[0, size)`.
   */
  hash1(item) {
    let hash = 2166136261;
    for (let i = 0; i < item.length; i++) {
      hash ^= item.charCodeAt(i);
      hash *= 16777619;
    }
    return Math.abs(hash) % this.size;
  }

  /**
   * Hash function 2 (djb2 variant).
   *
   * @param {string} item
   * @returns {number} Index in `[0, size)`.
   */
  hash2(item) {
    let hash = 5381;
    for (let i = 0; i < item.length; i++) {
      hash = (hash << 5) + hash + item.charCodeAt(i);
    }
    return Math.abs(hash) % this.size;
  }

  /**
   * Derive `hashCount` bit positions for an item by combining the two base
   * hashes as `(hash1 + i * hash2) % size`.
   *
   * @param {string} item
   * @returns {number[]} Bit indices, each in `[0, size)`.
   */
  getHashes(item) {
    const first = this.hash1(item);
    const second = this.hash2(item);
    const hashes = [];
    for (let i = 0; i < this.hashCount; i++) {
      hashes.push(Math.abs((first + i * second) % this.size));
    }
    return hashes;
  }

  /**
   * Set the bit at `index`.
   *
   * @param {number} index
   */
  setBit(index) {
    const byteIndex = Math.floor(index / 8);
    const bitIndex = index % 8;
    this.bitArray[byteIndex] |= 1 << bitIndex;
  }

  /**
   * Read the bit at `index`.
   *
   * @param {number} index
   * @returns {boolean}
   */
  getBit(index) {
    const byteIndex = Math.floor(index / 8);
    const bitIndex = index % 8;
    return (this.bitArray[byteIndex] & (1 << bitIndex)) !== 0;
  }

  /**
   * Add an item to the filter.
   *
   * @param {string} item
   */
  add(item) {
    for (const hash of this.getHashes(item)) {
      this.setBit(hash);
    }
    this.itemCount++;
  }

  /**
   * Add every item of an iterable to the filter.
   *
   * @param {Iterable<string>} items
   */
  addAll(items) {
    for (const item of items) {
      this.add(item);
    }
  }

  /**
   * Test whether an item might have been added.
   *
   * @param {string} item
   * @returns {boolean} `false` means definitely not added; `true` means
   *   possibly added.
   */
  mightContain(item) {
    return this.getHashes(item).every((hash) => this.getBit(hash));
  }

  /**
   * Test whether at least one of the items might have been added.
   *
   * @param {string[]} items
   * @returns {boolean}
   */
  mightContainAny(items) {
    return items.some((item) => this.mightContain(item));
  }

  /**
   * Keep only the items that might have been added.
   *
   * @param {string[]} items
   * @returns {string[]}
   */
  filter(items) {
    return items.filter((item) => this.mightContain(item));
  }

  /**
   * Reset every bit and the item counter.
   */
  clear() {
    this.bitArray.fill(0);
    this.itemCount = 0;
  }

  /**
   * Estimate the false-positive probability from the number of added items:
   * (1 - e^(-k * n / m))^k.
   *
   * @returns {number}
   */
  getCurrentFalsePositiveRate() {
    const ratio = this.itemCount / this.size;
    return Math.pow(1 - Math.exp(-this.hashCount * ratio), this.hashCount);
  }

  /**
   * Collect statistics by counting the bits that are actually set.
   *
   * @returns {{size: number, hashCount: number, itemCount: number, bitsSet: number, loadFactor: number, estimatedFalsePositiveRate: number}}
   */
  getStats() {
    let bitsSet = 0;
    for (let i = 0; i < this.size; i++) {
      if (this.getBit(i)) {
        bitsSet++;
      }
    }
    const loadFactor = bitsSet / this.size;
    const estimatedFalsePositiveRate = Math.pow(loadFactor, this.hashCount);
    return {
      size: this.size,
      hashCount: this.hashCount,
      itemCount: this.itemCount,
      bitsSet,
      loadFactor,
      estimatedFalsePositiveRate,
    };
  }

  /**
   * Serialize to a plain JSON-compatible object.
   *
   * @returns {{size: number, hashCount: number, itemCount: number, bitArray: number[]}}
   */
  toJSON() {
    return {
      size: this.size,
      hashCount: this.hashCount,
      itemCount: this.itemCount,
      bitArray: Array.from(this.bitArray),
    };
  }

  /**
   * Rebuild a filter from the object produced by `toJSON()`. The fields are
   * copied as given, without validation.
   *
   * @param {{size: number, hashCount: number, itemCount: number, bitArray: number[]}} data
   * @returns {BloomFilter}
   */
  static fromJSON(data) {
    // Construct a minimal instance, then overwrite its layout with the
    // serialized one.
    const filter = new BloomFilter(1, 0.01);
    filter.size = data.size;
    filter.hashCount = data.hashCount;
    filter.itemCount = data.itemCount;
    filter.bitArray = new Uint8Array(data.bitArray);
    return filter;
  }

  /**
   * Return a new filter whose bits are the bitwise OR of both filters.
   *
   * @param {BloomFilter} other
   * @returns {BloomFilter} New filter; `itemCount` is the sum of both counts.
   * @throws {Error} If the filters differ in size or hash count.
   */
  union(other) {
    if (this.size !== other.size || this.hashCount !== other.hashCount) {
      throw new Error("Bloom filters must have same size and hash count for union operation");
    }
    const result = new BloomFilter(1, 0.01);
    result.size = this.size;
    result.hashCount = this.hashCount;
    result.itemCount = this.itemCount + other.itemCount;
    result.bitArray = this.bitArray.map((byte, i) => byte | other.bitArray[i]);
    return result;
  }

  /**
   * Return a new filter whose bits are the bitwise AND of both filters.
   *
   * @param {BloomFilter} other
   * @returns {BloomFilter} New filter; `itemCount` is the smaller of the two
   *   counts.
   * @throws {Error} If the filters differ in size or hash count.
   */
  intersect(other) {
    if (this.size !== other.size || this.hashCount !== other.hashCount) {
      throw new Error("Bloom filters must have same size and hash count for intersection operation");
    }
    const result = new BloomFilter(1, 0.01);
    result.size = this.size;
    result.hashCount = this.hashCount;
    result.itemCount = Math.min(this.itemCount, other.itemCount);
    result.bitArray = this.bitArray.map((byte, i) => byte & other.bitArray[i]);
    return result;
  }
}
|
|
208
|
-
//# sourceMappingURL=bloom-filter.js.map
|