bekindprofanityfilter 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -123
- package/dist/cjs/index.js +55795 -0
- package/dist/cjs/package.json +1 -0
- package/dist/esm/algos/aho-corasick.js.map +1 -0
- package/dist/esm/algos/bloom-filter.js.map +1 -0
- package/dist/{algos → esm/algos}/context-patterns.js +15 -84
- package/dist/esm/algos/context-patterns.js.map +1 -0
- package/dist/{index.js → esm/index.js} +10 -1
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/innocence-scoring.js.map +1 -0
- package/dist/esm/language-detector.js.map +1 -0
- package/dist/esm/language-dicts.js.map +1 -0
- package/dist/esm/languages/arabic-words.js.map +1 -0
- package/dist/esm/languages/bengali-words.js.map +1 -0
- package/dist/esm/languages/brazilian-words.js.map +1 -0
- package/dist/{languages → esm/languages}/chinese-words.js.map +1 -1
- package/dist/{languages → esm/languages}/english-primary-all-languages.js.map +1 -1
- package/dist/esm/languages/english-words.js.map +1 -0
- package/dist/esm/languages/french-words.js.map +1 -0
- package/dist/esm/languages/german-words.js.map +1 -0
- package/dist/esm/languages/hindi-words.js.map +1 -0
- package/dist/{languages → esm/languages}/innocent-words.js +2 -0
- package/dist/esm/languages/innocent-words.js.map +1 -0
- package/dist/esm/languages/italian-words.js.map +1 -0
- package/dist/esm/languages/japanese-words.js.map +1 -0
- package/dist/{languages → esm/languages}/korean-words.js.map +1 -1
- package/dist/esm/languages/russian-words.js.map +1 -0
- package/dist/esm/languages/spanish-words.js.map +1 -0
- package/dist/esm/languages/tamil-words.js.map +1 -0
- package/dist/esm/languages/telugu-words.js.map +1 -0
- package/dist/esm/romanization-detector.js.map +1 -0
- package/package.json +32 -19
- package/dist/algos/aho-corasick.js.map +0 -1
- package/dist/algos/bloom-filter.js.map +0 -1
- package/dist/algos/context-patterns.js.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/innocence-scoring.js.map +0 -1
- package/dist/language-detector.js.map +0 -1
- package/dist/language-dicts.js.map +0 -1
- package/dist/languages/arabic-words.js.map +0 -1
- package/dist/languages/bengali-words.js.map +0 -1
- package/dist/languages/brazilian-words.js.map +0 -1
- package/dist/languages/english-words.js.map +0 -1
- package/dist/languages/french-words.js.map +0 -1
- package/dist/languages/german-words.js.map +0 -1
- package/dist/languages/hindi-words.js.map +0 -1
- package/dist/languages/innocent-words.js.map +0 -1
- package/dist/languages/italian-words.js.map +0 -1
- package/dist/languages/japanese-words.js.map +0 -1
- package/dist/languages/russian-words.js.map +0 -1
- package/dist/languages/spanish-words.js.map +0 -1
- package/dist/languages/tamil-words.js.map +0 -1
- package/dist/languages/telugu-words.js.map +0 -1
- package/dist/romanization-detector.js.map +0 -1
- /package/dist/{algos → esm/algos}/aho-corasick.d.ts +0 -0
- /package/dist/{algos → esm/algos}/aho-corasick.js +0 -0
- /package/dist/{algos → esm/algos}/bloom-filter.d.ts +0 -0
- /package/dist/{algos → esm/algos}/bloom-filter.js +0 -0
- /package/dist/{algos → esm/algos}/context-patterns.d.ts +0 -0
- /package/dist/{index.d.ts → esm/index.d.ts} +0 -0
- /package/dist/{innocence-scoring.d.ts → esm/innocence-scoring.d.ts} +0 -0
- /package/dist/{innocence-scoring.js → esm/innocence-scoring.js} +0 -0
- /package/dist/{language-detector.d.ts → esm/language-detector.d.ts} +0 -0
- /package/dist/{language-detector.js → esm/language-detector.js} +0 -0
- /package/dist/{language-dicts.d.ts → esm/language-dicts.d.ts} +0 -0
- /package/dist/{language-dicts.js → esm/language-dicts.js} +0 -0
- /package/dist/{languages → esm/languages}/arabic-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/arabic-words.js +0 -0
- /package/dist/{languages → esm/languages}/bengali-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/bengali-words.js +0 -0
- /package/dist/{languages → esm/languages}/brazilian-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/brazilian-words.js +0 -0
- /package/dist/{languages → esm/languages}/chinese-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/chinese-words.js +0 -0
- /package/dist/{languages → esm/languages}/english-primary-all-languages.d.ts +0 -0
- /package/dist/{languages → esm/languages}/english-primary-all-languages.js +0 -0
- /package/dist/{languages → esm/languages}/english-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/english-words.js +0 -0
- /package/dist/{languages → esm/languages}/french-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/french-words.js +0 -0
- /package/dist/{languages → esm/languages}/german-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/german-words.js +0 -0
- /package/dist/{languages → esm/languages}/hindi-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/hindi-words.js +0 -0
- /package/dist/{languages → esm/languages}/innocent-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/italian-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/italian-words.js +0 -0
- /package/dist/{languages → esm/languages}/japanese-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/japanese-words.js +0 -0
- /package/dist/{languages → esm/languages}/korean-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/korean-words.js +0 -0
- /package/dist/{languages → esm/languages}/russian-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/russian-words.js +0 -0
- /package/dist/{languages → esm/languages}/spanish-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/spanish-words.js +0 -0
- /package/dist/{languages → esm/languages}/tamil-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/tamil-words.js +0 -0
- /package/dist/{languages → esm/languages}/telugu-words.d.ts +0 -0
- /package/dist/{languages → esm/languages}/telugu-words.js +0 -0
- /package/dist/{romanization-detector.d.ts → esm/romanization-detector.d.ts} +0 -0
- /package/dist/{romanization-detector.js → esm/romanization-detector.js} +0 -0
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# BeKind Profanity Filter
|
|
2
2
|
|
|
3
|
-
> Forked from [AllProfanity](https://github.com/ayush-jadaun/allprofanity) by Ayush Jadaun. Extended with **romanization profanity detection** (catches Hinglish, transliterated text), **language-aware innocence scoring** (ELD + trie-based detection prevents false positives for cross-language collisions like "
|
|
3
|
+
> Forked from [AllProfanity](https://github.com/ayush-jadaun/allprofanity) by Ayush Jadaun. Extended with **romanization profanity detection** (catches Hinglish, transliterated text), **language-aware innocence scoring** (ELD + trie-based detection prevents false positives for cross-language collisions like "got" in Turkish), and additional language dictionaries. Licensed under MIT.
|
|
4
4
|
|
|
5
5
|
> ⚠️ **Early-stage package in progress.** Features available in the original AllProfanity are being actively deprecated, adjusted, or replaced. API surface may change without notice. Contributions and suggestions greatly appreciated.
|
|
6
6
|
|
|
@@ -19,7 +19,7 @@ A multi-language profanity filter with romanization detection, language-aware in
|
|
|
19
19
|
|
|
20
20
|
- **Multi-Language Profanity Detection:** 34K+ word dictionary across 16 languages with 18-language detection trie
|
|
21
21
|
- **Romanization Detection:** Catches Hinglish, transliterated Bengali, Tamil, Telugu, and Japanese
|
|
22
|
-
- **Cross-Language Innocence Scoring:** Handles words like "
|
|
22
|
+
- **Cross-Language Innocence Scoring:** Handles words like "got" (Turkish: "buttocks") and "fart" (Norwegian: "speed")
|
|
23
23
|
- **Context-Aware Analysis:** Booster/reducer patterns detect sexual context, negation, medical usage, and quoted speech
|
|
24
24
|
- **Leet-Speak Detection:** Catches obfuscated profanity (`f#ck`, `a55hole`, `sh1t`)
|
|
25
25
|
- **Word Boundary Detection:** Smart whole-word matching prevents flagging "assassin" or "assistance"
|
|
@@ -63,7 +63,7 @@ A multi-language profanity filter with romanization detection, language-aware in
|
|
|
63
63
|
|
|
64
64
|
---
|
|
65
65
|
|
|
66
|
-
> **Forked from [
|
|
66
|
+
> **Forked from [AllProfanity](https://github.com/ayush-jadaun/allprofanity)** by Ayush Jadaun. Extended with **romanization profanity detection** (catches Hinglish, transliterated text), **language-aware innocence scoring** (ELD + trie-based detection prevents false positives for cross-language collisions like "got" in Turkish), and additional language dictionaries. Licensed under MIT.
|
|
67
67
|
|
|
68
68
|
## Installation
|
|
69
69
|
|
|
@@ -193,85 +193,88 @@ const filter = new BeKind({
|
|
|
193
193
|
});
|
|
194
194
|
```
|
|
195
195
|
|
|
196
|
-
###
|
|
196
|
+
### Alternative Library Comparison
|
|
197
|
+
|
|
198
|
+
The main strength of be-kind comes from its dictionary and knowledge base. To give a fair comparison, **all benchmarks below inject be-kind's full 34K-word dictionary into every alternative library**, so the results compare **matching engines and detection features**, not dictionary coverage.
|
|
197
199
|
|
|
198
200
|
Benchmarked on a single CPU core (pinned via `taskset -c 0`). All numbers are **ops/second — higher is better**.
|
|
199
201
|
|
|
200
|
-
>
|
|
202
|
+
> [leo-profanity](https://github.com/jojoee/leo-profanity) ships with ~400 English words, [bad-words](https://github.com/web-mech/badwords) ships with ~400 English words, and [glin-profanity](https://www.glincker.com/tools/glin-profanity) loads its own 24-language dictionaries — all receive be-kind's 34K dictionary on top.
|
|
201
203
|
|
|
202
204
|
| Library | Languages (out-of-the-box) | Leet-speak | Repeat compression | Context-aware |
|
|
203
205
|
|---------|--------------------------|-----------|-------------------|--------------|
|
|
204
206
|
| **be-kind** | 16 profanity dicts + 18-lang detection trie | ✅ | 🚧 planned | ✅ (certainty-delta) |
|
|
205
207
|
| **be-kind (ctx)** | same as be-kind | ✅ | 🚧 planned | ✅ (boosters + reducers) |
|
|
206
208
|
| [leo-profanity](https://github.com/jojoee/leo-profanity) + dict | 16 (via be-kind dict injection) | ❌ | ❌ | ❌ |
|
|
207
|
-
| [bad-words](https://github.com/web-mech/badwords) |
|
|
208
|
-
| [glin-profanity](https://www.glincker.com/tools/glin-profanity) | 24 | ✅ (3 levels) | ✅ | ✅ (heuristic) |
|
|
209
|
-
|
|
210
|
-
**Speed benchmark** — ops/second on a single CPU core (`taskset -c 0`), higher is better:
|
|
211
|
-
|
|
212
|
-
| Test | be-kind | be-kind (ctx) | leo | bad-words | glin (basic) | glin (enhanced) |
|
|
213
|
-
|
|
214
|
-
| check — clean (short) | 2,
|
|
215
|
-
| check — profane (short) | 2,
|
|
216
|
-
| check — leet-speak | 1,
|
|
217
|
-
| clean — profane (short) | 2,
|
|
218
|
-
| check — 500-char clean |
|
|
219
|
-
| check — 500-char profane |
|
|
220
|
-
| check — 2,500-char clean |
|
|
221
|
-
| check — 2,500-char profane |
|
|
209
|
+
| [bad-words](https://github.com/web-mech/badwords) + dict | 16 (via be-kind dict injection) | ❌ | ❌ | ❌ |
|
|
210
|
+
| [glin-profanity](https://www.glincker.com/tools/glin-profanity) + dict | 24 + be-kind dict | ✅ (3 levels) | ✅ | ✅ (heuristic) |
|
|
211
|
+
|
|
212
|
+
**Speed benchmark** — ops/second on a single CPU core (`taskset -c 0`), higher is better. All competitors have be-kind's 34K dictionary injected:
|
|
213
|
+
|
|
214
|
+
| Test | be-kind | be-kind (ctx) | leo + dict | bad-words + dict | glin (basic) | glin (enhanced) |
|
|
215
|
+
|------|--------:|--------------:|-----------:|-----------------:|-------------:|----------------:|
|
|
216
|
+
| check — clean (short) | 2,625 | 3,007 | 932,597 | 29 | 68 | 68 |
|
|
217
|
+
| check — profane (short) | 2,556 | 2,251 | 1,424,984 | 27 | 3,602 | 3,333 |
|
|
218
|
+
| check — leet-speak | 1,407 | 1,324 | 1,540,700 | 26 | 2,791 | 4,350 |
|
|
219
|
+
| clean — profane (short) | 2,499 | 2,243 | 372,049 | 2 | N/A | N/A |
|
|
220
|
+
| check — 500-char clean | 409 | 427 | 110,318 | 17 | 21 | 22 |
|
|
221
|
+
| check — 500-char profane | 357 | 314 | 217,347 | 17 | 828 | 718 |
|
|
222
|
+
| check — 2,500-char clean | 88 | 90 | 21,727 | 10 | 6 | 6 |
|
|
223
|
+
| check — 2,500-char profane | 79 | 69 | 47,966 | 9 | 192 | 165 |
|
|
222
224
|
|
|
223
225
|
**Library versions tested:** `leo-profanity@1.9.0`, `bad-words@4.0.0`, `glin-profanity@3.3.0`
|
|
224
226
|
|
|
225
227
|
**Notes:**
|
|
226
|
-
- **
|
|
227
|
-
-
|
|
228
|
-
- `
|
|
229
|
-
- `
|
|
230
|
-
- `
|
|
228
|
+
- **All competitors have be-kind's 34K dictionary injected** to isolate matching-engine performance from dictionary coverage.
|
|
229
|
+
- **be-kind** is **~39x faster than glin** on clean short text (2,625 vs 68 ops/s) with the same vocabulary. be-kind uses a **trie** (O(input_length) matching), while glin uses **linear scanning** (`for (const word of this.words.keys())` — O(dict_size * input_length)).
|
|
230
|
+
- `be-kind (ctx)` adds at most ~10-15% overhead over default be-kind on profane inputs (and is occasionally slightly faster on clean inputs — see the clean-text rows above) — context analysis (certainty-delta pattern matching) is cheap.
|
|
231
|
+
- `leo + dict` is the fastest by a large margin but offers **no leet-speak, no context analysis, and no repeat compression** — it's a simple substring matcher. Its speed advantage comes from a flat array lookup with no normalization overhead.
|
|
232
|
+
- `bad-words + dict` demonstrates the regex bottleneck catastrophically: 29 ops/s on clean short text vs 2,625 for be-kind — a **~90x slowdown**. bad-words creates a new `RegExp` per word in a `.filter()` loop ([source](https://github.com/web-mech/badwords/blob/master/src/badwords.ts#L91-L103)) — no short-circuiting, so clean and profane text perform identically (~27 ops/s). `clean()` drops to 2 ops/s (vs 2,499 for be-kind). This makes bad-words unsuitable for large multilingual dictionaries.
|
|
233
|
+
- **glin with dict** collapses to 68 ops/s on clean short text (vs 2,625 for be-kind) — a **~39x slowdown** — demonstrating the linear-scan bottleneck at scale. glin short-circuits on first match, which explains the ~53x speedup on profane text (3,602 ops/s) vs clean text (68 ops/s).
|
|
231
234
|
- be-kind is the only library with cross-language innocence scoring, romanization support, and context-aware certainty adjustment.
|
|
232
235
|
|
|
233
|
-
Run the benchmark yourself:
|
|
236
|
+
Run the speed benchmark yourself:
|
|
234
237
|
```bash
|
|
235
238
|
taskset -c 0 bun run benchmark:competitors
|
|
236
239
|
```
|
|
237
240
|
|
|
238
241
|
### Accuracy Comparison
|
|
239
242
|
|
|
240
|
-
Measures TP rate (recall), FP rate, and F1 across eight test categories (225 labeled cases, dataset v6). All libraries are tested against all categories — no exemptions. **Higher F1 and lower FP rate are better.**
|
|
243
|
+
Measures TP rate (recall), FP rate, and F1 across eight test categories (225 labeled cases, dataset v6). All alternative libraries have be-kind's 34K dictionary injected. All libraries are tested against all categories — no exemptions. **Higher F1 and lower FP rate are better.**
|
|
241
244
|
|
|
242
245
|
> **Bias disclaimer:** This dataset was created by the be-kind team. Non-English cases were likely drawn from or verified against be-kind's own dictionary, which advantages be-kind on those categories. To partially offset this, the dataset includes independent test cases from [glin-profanity's upstream test suite](https://github.com/GLINCKER/glin-profanity/tree/release/tests) and adversarial false-positive cases specifically chosen to expose known be-kind failures. We strongly recommend running this benchmark against your own dataset before drawing conclusions.
|
|
243
246
|
|
|
244
|
-
> **Note:** `be-kind (sensitive)` = `sensitiveMode: true` (flags AMBIVALENT words too). `be-kind (ctx)` = `contextAnalysis.enabled: true`. `glin (collapsed)` = glin (basic) with `collapseRepeatedCharacters()` pre-processing.
|
|
247
|
+
> **Note:** `be-kind (sensitive)` = `sensitiveMode: true` (flags AMBIVALENT words too). `be-kind (ctx)` = `contextAnalysis.enabled: true`. `glin (collapsed) + dict` = glin (basic) + dict with `collapseRepeatedCharacters()` pre-processing. All alternative libraries have be-kind's 34K dictionary injected.
|
|
245
248
|
|
|
246
249
|
#### Single-language detection — 65 cases (English incl. leetspeak, French, German, Spanish, Hindi)
|
|
247
250
|
|
|
248
251
|
| Library | Recall | Precision | FP Rate | F1 |
|
|
249
252
|
|---|---|---|---|---|
|
|
250
253
|
| be-kind (sensitive) | 100% | 100% | 0% | **1.00** |
|
|
254
|
+
| bad-words + dict | 88% | 100% | 0% | 0.94 |
|
|
255
|
+
| glin (enhanced) + dict | 88% | 100% | 0% | 0.94 |
|
|
256
|
+
| glin (collapsed) + dict | 86% | 100% | 0% | 0.92 |
|
|
251
257
|
| leo + dict | 82% | 100% | 0% | 0.90 |
|
|
252
258
|
| be-kind | 80% | 100% | 0% | 0.89 |
|
|
253
259
|
| be-kind (ctx) | 80% | 100% | 0% | 0.89 |
|
|
254
|
-
| glin (enhanced) | 72% | 100% | 0% | 0.84 |
|
|
255
|
-
| glin (collapsed) | 72% | 100% | 0% | 0.84 |
|
|
256
|
-
| bad-words | 52% | 100% | 0% | 0.68 |
|
|
257
260
|
|
|
258
|
-
>
|
|
261
|
+
> With be-kind's 34K dictionary injected, all alternatives improve dramatically. `bad-words + dict` and `glin (enhanced) + dict` both reach 88% recall (up from 52% and 72% without dict). be-kind in default mode misses mild words (`damn`, `hell`); `sensitiveMode: true` catches these. All libraries achieve 100% precision — when they flag something, it's always correct.
|
|
259
262
|
|
|
260
263
|
#### False positives / innocent words — 48 cases (clean only, lower FP rate is better)
|
|
261
264
|
|
|
262
|
-
Includes adversarial cases (`cum laude`, `Dick Van Dyke`, culinary `faggots`,
|
|
265
|
+
Includes adversarial cases (`cum laude`, `Dick Van Dyke`, culinary `faggots`, Turkish `got`). Recall and F1 are undefined (no profane cases).
|
|
263
266
|
|
|
264
267
|
| Library | FP Rate |
|
|
265
268
|
|---|---|
|
|
266
|
-
|
|
|
267
|
-
|
|
|
268
|
-
| be-kind (ctx) | 21% |
|
|
269
|
-
| bad-words | 23% |
|
|
270
|
-
| leo + dict | 25% |
|
|
269
|
+
| leo + dict | **25%** |
|
|
270
|
+
| be-kind (ctx) | **25%** |
|
|
271
271
|
| be-kind | 27% |
|
|
272
272
|
| be-kind (sensitive) | 31% |
|
|
273
|
+
| glin (enhanced) + dict | 31% |
|
|
274
|
+
| glin (collapsed) + dict | 31% |
|
|
275
|
+
| bad-words + dict | 33% |
|
|
273
276
|
|
|
274
|
-
>
|
|
277
|
+
> With the full 34K dictionary injected, glin and bad-words now produce more false positives than before — their FP rates rise to 31-33% due to the larger vocabulary. `be-kind (ctx)` ties with `leo + dict` for the lowest FP rate (25%) thanks to context-aware certainty adjustment. be-kind's FP rate remains a significant weakness, but context analysis helps.
|
|
275
278
|
|
|
276
279
|
#### Multi-language detection — 26 cases (Hinglish, French, German, Spanish, mixed)
|
|
277
280
|
|
|
@@ -280,96 +283,98 @@ Includes adversarial cases (`cum laude`, `Dick Van Dyke`, culinary `faggots`, Sw
|
|
|
280
283
|
| be-kind | 100% | 100% | 0% | **1.00** |
|
|
281
284
|
| be-kind (sensitive) | 100% | 100% | 0% | **1.00** |
|
|
282
285
|
| leo + dict | 100% | 100% | 0% | **1.00** |
|
|
283
|
-
|
|
|
284
|
-
| glin (enhanced) |
|
|
285
|
-
|
|
|
286
|
-
|
|
|
286
|
+
| bad-words + dict | 100% | 100% | 0% | **1.00** |
|
|
287
|
+
| glin (enhanced) + dict | 100% | 100% | 0% | **1.00** |
|
|
288
|
+
| be-kind (ctx) | 100% | 100% | 0% | **1.00** |
|
|
289
|
+
| glin (collapsed) + dict | 100% | 100% | 0% | **1.00** |
|
|
287
290
|
|
|
288
|
-
> With be-kind's dictionary injected,
|
|
291
|
+
> With be-kind's 34K dictionary injected, **every library achieves 100% recall** — proving the dictionary is the sole differentiator for multi-language detection. The matching engine doesn't matter when the vocabulary is comprehensive enough.
|
|
289
292
|
|
|
290
293
|
#### Romanization — 30 cases (Hinglish, Bengali, Tamil, Telugu, Japanese)
|
|
291
294
|
|
|
292
295
|
| Library | Recall | Precision | FP Rate | F1 |
|
|
293
296
|
|---|---|---|---|---|
|
|
297
|
+
| glin (enhanced) + dict | 85% | 81% | 40% | **0.83** |
|
|
294
298
|
| leo + dict | 75% | 94% | 10% | **0.83** |
|
|
295
299
|
| be-kind | 80% | 84% | 30% | 0.82 |
|
|
296
300
|
| be-kind (sensitive) | 80% | 84% | 30% | 0.82 |
|
|
297
301
|
| be-kind (ctx) | 80% | 84% | 30% | 0.82 |
|
|
298
|
-
|
|
|
299
|
-
| glin (collapsed) |
|
|
300
|
-
| bad-words | 0% | 0% | 10% | — |
|
|
302
|
+
| bad-words + dict | 80% | 84% | 30% | 0.82 |
|
|
303
|
+
| glin (collapsed) + dict | 80% | 84% | 30% | 0.82 |
|
|
301
304
|
|
|
302
|
-
>
|
|
305
|
+
> With dict injection, `glin (enhanced) + dict` achieves the **highest recall** (85%) on romanization — glin's leet-speak detection catches additional transliterated variants. However, its FP rate (40%) is also the highest. `leo + dict` achieves the same F1 (0.83) with much better precision (94%) and lowest FP (10%). be-kind, bad-words + dict, and glin (collapsed) + dict all tie at 80% recall / 30% FP / F1=0.82, showing that the dictionary drives most romanization detection — not the matching engine.
|
|
303
306
|
|
|
304
307
|
#### Semantic context — 25 cases
|
|
305
308
|
|
|
306
309
|
| Library | Recall | Precision | FP Rate | F1 |
|
|
307
310
|
|---|---|---|---|---|
|
|
308
|
-
| be-kind (ctx) | 80% | 73% | 20% | **0.76** |
|
|
309
311
|
| leo + dict | 100% | 59% | 47% | 0.74 |
|
|
310
|
-
|
|
|
311
|
-
| glin (collapsed) | 90% | 53% | 53% | 0.67 |
|
|
312
|
+
| bad-words + dict | 100% | 48% | 73% | 0.65 |
|
|
312
313
|
| be-kind (sensitive) | 100% | 48% | 73% | 0.65 |
|
|
313
|
-
|
|
|
314
|
+
| glin (enhanced) + dict | 100% | 48% | 73% | 0.65 |
|
|
315
|
+
| glin (collapsed) + dict | 100% | 48% | 73% | 0.65 |
|
|
316
|
+
| be-kind (ctx) | 80% | 62% | 47% | 0.64 |
|
|
314
317
|
| be-kind | 80% | 47% | 60% | 0.59 |
|
|
315
318
|
|
|
316
|
-
> Semantic context is where all libraries struggle — precision drops below 50% for most. Cases include metalinguistic uses
|
|
319
|
+
> Semantic context is where all libraries struggle — precision drops below 50% for most. Cases include metalinguistic uses, negation, and medical context. With dict injection, bad-words + dict and glin now achieve 100% recall but at the cost of 73% FP rate. `be-kind (ctx)` trades lower recall (80%) for better precision (62%) and a lower FP rate (47%) via context-aware certainty adjustment — boosters confirm profane intent, reducers detect innocent contexts like proper nouns and medical terms.
|
|
317
320
|
|
|
318
|
-
#### Repeated character evasion — 5 cases (
|
|
321
|
+
#### Repeated character evasion — 5 cases (elongated profanity)
|
|
319
322
|
|
|
320
323
|
No clean cases in this category — FP rate is undefined.
|
|
321
324
|
|
|
322
325
|
| Library | Recall | Precision |
|
|
323
326
|
|---|---|---|
|
|
324
|
-
| glin (enhanced) | **100%** | 100% |
|
|
325
|
-
| glin (collapsed) | 40% | 100% |
|
|
327
|
+
| glin (enhanced) + dict | **100%** | 100% |
|
|
328
|
+
| glin (collapsed) + dict | 40% | 100% |
|
|
326
329
|
| be-kind | 0% | — |
|
|
327
330
|
| be-kind (sensitive) | 0% | — |
|
|
328
331
|
| be-kind (ctx) | 0% | — |
|
|
329
332
|
| leo + dict | 0% | — |
|
|
330
|
-
| bad-words | 0% | — |
|
|
333
|
+
| bad-words + dict | 0% | — |
|
|
331
334
|
|
|
332
|
-
#### Concatenated / no-space evasion — 7 cases (
|
|
335
|
+
#### Concatenated / no-space evasion — 7 cases (profanity embedded in concatenated strings)
|
|
333
336
|
|
|
334
337
|
| Library | Recall | Precision | FP Rate | F1 |
|
|
335
338
|
|---|---|---|---|---|
|
|
336
339
|
| be-kind | 20% | 100% | 0% | 0.33 |
|
|
337
340
|
| be-kind (sensitive) | 20% | 100% | 0% | 0.33 |
|
|
338
341
|
| be-kind (ctx) | 20% | 100% | 0% | 0.33 |
|
|
342
|
+
| bad-words + dict | 20% | 100% | 0% | 0.33 |
|
|
343
|
+
| glin (enhanced) + dict | 20% | 100% | 0% | 0.33 |
|
|
344
|
+
| glin (collapsed) + dict | 20% | 100% | 0% | 0.33 |
|
|
339
345
|
| leo + dict | 0% | — | 0% | — |
|
|
340
|
-
| bad-words | 0% | — | 0% | — |
|
|
341
|
-
| glin (enhanced) | 0% | — | 0% | — |
|
|
342
|
-
| glin (collapsed) | 0% | — | 0% | — |
|
|
343
346
|
|
|
344
347
|
#### Challenge cases — 19 cases (semantic disambiguation, embedded substrings, separator evasion)
|
|
345
348
|
|
|
346
|
-
Hard problems: `cock` as rooster, `ass` as donkey,
|
|
349
|
+
Hard problems: `cock` as rooster, `ass` as donkey, Turkish `got` = "buttocks" vs English "got", profanity in concatenated strings, and separator-spaced evasion (`f u c k`, `f_u*c k`, `a.s.s.h.o.l.e`).
|
|
347
350
|
|
|
348
351
|
| Library | Recall | Precision | FP Rate | F1 |
|
|
349
352
|
|---|---|---|---|---|
|
|
350
|
-
| be-kind (ctx) | 60% | 75% |
|
|
353
|
+
| be-kind (ctx) | 60% | 75% | 33% | **0.63** |
|
|
351
354
|
| be-kind | 60% | 60% | 44% | 0.60 |
|
|
352
355
|
| be-kind (sensitive) | 60% | 60% | 44% | 0.60 |
|
|
353
|
-
| glin (enhanced) |
|
|
356
|
+
| glin (enhanced) + dict | 60% | 60% | 44% | 0.60 |
|
|
357
|
+
| bad-words + dict | 50% | 56% | 44% | 0.53 |
|
|
358
|
+
| glin (collapsed) + dict | 50% | 56% | 44% | 0.53 |
|
|
354
359
|
| leo + dict | 20% | 50% | 22% | 0.29 |
|
|
355
|
-
| bad-words | 20% | 33% | 44% | 0.25 |
|
|
356
|
-
| glin (collapsed) | 0% | 0% | 44% | — |
|
|
357
360
|
|
|
358
|
-
> be-kind (ctx)
|
|
361
|
+
> be-kind (ctx) achieves the best F1 on challenge cases thanks to context-aware certainty adjustment — recognizing innocent contexts like "cock crowed at dawn" and "wild ass is an equine." With dict injection, glin (enhanced) + dict now matches be-kind's recall (60%) but at higher FP (44% vs 33%). Separator-spaced evasion cases (`f u c k`, `f_u*c k`, mixed separators) test features that no alternative library supports. These cases still require semantic understanding that no dictionary-based filter can fully solve — the strongest argument for LLM-assisted moderation as a second pass.
|
|
359
362
|
|
|
360
363
|
#### Overall summary — micro-averaged across all 225 cases
|
|
361
364
|
|
|
365
|
+
All alternative libraries have be-kind's 34K dictionary injected.
|
|
366
|
+
|
|
362
367
|
| Library | Recall | Precision | FP Rate | F1 | TP | FN | FP | TN |
|
|
363
368
|
|---|---|---|---|---|---|---|---|---|
|
|
364
|
-
| be-kind (sensitive) | **86%** | 76% | 32% | 0.81 | 104 | 17 | 33 | 71 |
|
|
365
|
-
|
|
|
369
|
+
| be-kind (sensitive) | **86%** | 76% | 32% | **0.81** | 104 | 17 | 33 | 71 |
|
|
370
|
+
| glin (enhanced) + dict | **86%** | 75% | 33% | 0.80 | 104 | 17 | 34 | 70 |
|
|
371
|
+
| glin (collapsed) + dict | 81% | 75% | 32% | 0.78 | 98 | 23 | 33 | 71 |
|
|
372
|
+
| bad-words + dict | 80% | 74% | 33% | 0.77 | 97 | 24 | 34 | 70 |
|
|
373
|
+
| leo + dict | 74% | 80% | 21% | 0.77 | 89 | 32 | 22 | 82 |
|
|
374
|
+
| be-kind (ctx) | 76% | **79%** | **24%** | 0.77 | 92 | 29 | 25 | 79 |
|
|
366
375
|
| be-kind | 76% | 76% | 28% | 0.76 | 92 | 29 | 29 | 75 |
|
|
367
|
-
| leo + dict | 74% | 80% | 21% | 0.76 | 89 | 32 | 22 | 82 |
|
|
368
|
-
| glin (enhanced) | 63% | 78% | 21% | 0.70 | 76 | 45 | 22 | 82 |
|
|
369
|
-
| glin (collapsed) | 58% | 77% | 20% | 0.66 | 70 | 51 | 21 | 83 |
|
|
370
|
-
| bad-words | 42% | 65% | 26% | 0.51 | 51 | 70 | 27 | 77 |
|
|
371
376
|
|
|
372
|
-
> Micro-averaged: all 225 cases (121 profane, 104 clean) aggregated into one confusion matrix per library, then recall/precision/F1 computed once. No category weighting artifacts.
|
|
377
|
+
> Micro-averaged: all 225 cases (121 profane, 104 clean) aggregated into one confusion matrix per library, then recall/precision/F1 computed once. No category weighting artifacts. With be-kind's dictionary injected, **glin (enhanced) + dict matches be-kind (sensitive) on recall (86%)** and nearly matches on F1 (0.80 vs 0.81) — proving the dictionary is the core differentiator, not the matching engine. `leo + dict` and `be-kind (ctx)` lead the field on precision (80% and 79%) and FP rate (21% and 24%). be-kind (ctx) achieves this through context-aware certainty adjustment; leo achieves it through simpler matching that avoids over-triggering.
|
|
373
378
|
|
|
374
379
|
Run the accuracy benchmark yourself:
|
|
375
380
|
```bash
|
|
@@ -386,7 +391,7 @@ Returns `true` if the text contains any profanity.
|
|
|
386
391
|
|
|
387
392
|
```typescript
|
|
388
393
|
profanity.check('This is a clean sentence.'); // false
|
|
389
|
-
profanity.check('This is a
|
|
394
|
+
profanity.check('This is a b*llsh*t sentence.'); // true
|
|
390
395
|
profanity.check('What the f#ck is this?'); // true (leet-speak)
|
|
391
396
|
profanity.check('यह एक चूतिया परीक्षण है।'); // true (Hindi)
|
|
392
397
|
```
|
|
@@ -404,9 +409,9 @@ Returns a detailed result:
|
|
|
404
409
|
- `positions: Array<{ word: string, start: number, end: number }>`
|
|
405
410
|
|
|
406
411
|
```typescript
|
|
407
|
-
const result = profanity.detect('This is
|
|
412
|
+
const result = profanity.detect('This is f**king b*llsh*t and chutiya.');
|
|
408
413
|
console.log(result.hasProfanity); // true
|
|
409
|
-
console.log(result.detectedWords); // ['
|
|
414
|
+
console.log(result.detectedWords); // ['f**king', 'b*llsh*t', 'chutiya']
|
|
410
415
|
console.log(result.severity); // 3 (SEVERE)
|
|
411
416
|
console.log(result.cleanedText); // "This is ******* ******** and ******."
|
|
412
417
|
console.log(result.positions); // e.g. [{word: 'fucking', start: 8, end: 15}, ...]
|
|
@@ -419,8 +424,8 @@ console.log(result.positions); // e.g. [{word: 'fucking', start: 8, end: 15}, ..
|
|
|
419
424
|
Replace each character of profane words with a placeholder (default: `*`).
|
|
420
425
|
|
|
421
426
|
```typescript
|
|
422
|
-
profanity.clean('This contains
|
|
423
|
-
profanity.clean('This contains
|
|
427
|
+
profanity.clean('This contains b*llsh*t.'); // "This contains ********."
|
|
428
|
+
profanity.clean('This contains b*llsh*t.', '#'); // "This contains ########."
|
|
424
429
|
profanity.clean('यह एक चूतिया परीक्षण है।'); // e.g. "यह एक ***** परीक्षण है।"
|
|
425
430
|
```
|
|
426
431
|
|
|
@@ -432,8 +437,8 @@ Replace each profane word with a single placeholder (default: `***`).
|
|
|
432
437
|
(If the placeholder is omitted, uses `***`.)
|
|
433
438
|
|
|
434
439
|
```typescript
|
|
435
|
-
profanity.cleanWithPlaceholder('This contains
|
|
436
|
-
profanity.cleanWithPlaceholder('This contains
|
|
440
|
+
profanity.cleanWithPlaceholder('This contains b*llsh*t.'); // "This contains ***."
|
|
441
|
+
profanity.cleanWithPlaceholder('This contains b*llsh*t.', '[CENSORED]'); // "This contains [CENSORED]."
|
|
437
442
|
profanity.cleanWithPlaceholder('यह एक चूतिया परीक्षण है।', '####'); // e.g. "यह एक #### परीक्षण है।"
|
|
438
443
|
```
|
|
439
444
|
|
|
@@ -459,8 +464,8 @@ profanity.check('Qué puta situación.'); // true
|
|
|
459
464
|
Remove a word or an array of words from the profanity filter.
|
|
460
465
|
|
|
461
466
|
```typescript
|
|
462
|
-
profanity.remove('
|
|
463
|
-
profanity.check('This is
|
|
467
|
+
profanity.remove('b*llsh*t');
|
|
468
|
+
profanity.check('This is b*llsh*t.'); // false
|
|
464
469
|
|
|
465
470
|
profanity.remove(['mierda', 'puta']);
|
|
466
471
|
profanity.check('Esto es mierda.'); // false
|
|
@@ -473,11 +478,11 @@ profanity.check('Esto es mierda.'); // false
|
|
|
473
478
|
Whitelist words so they are never flagged as profane.
|
|
474
479
|
|
|
475
480
|
```typescript
|
|
476
|
-
profanity.addToWhitelist(['
|
|
477
|
-
profanity.check('He is an
|
|
478
|
-
profanity.check('
|
|
481
|
+
profanity.addToWhitelist(['f**k', 'idiot','sh*t']);
|
|
482
|
+
profanity.check('He is a f**king idiot.'); // false
|
|
483
|
+
profanity.check('F**k this sh*t.'); // false
|
|
479
484
|
// Remove from whitelist to restore detection
|
|
480
|
-
profanity.removeFromWhitelist(['
|
|
485
|
+
profanity.removeFromWhitelist(['f**k', 'idiot','sh*t']);
|
|
481
486
|
```
|
|
482
487
|
|
|
483
488
|
---
|
|
@@ -498,7 +503,7 @@ Set the default placeholder character for `clean()`.
|
|
|
498
503
|
|
|
499
504
|
```typescript
|
|
500
505
|
profanity.setPlaceholder('#');
|
|
501
|
-
profanity.clean('This is
|
|
506
|
+
profanity.clean('This is b*llsh*t.'); // "This is ########."
|
|
502
507
|
profanity.setPlaceholder('*'); // Reset to default
|
|
503
508
|
```
|
|
504
509
|
|
|
@@ -511,7 +516,7 @@ Options include: `enableLeetSpeak`, `caseSensitive`, `strictMode`, `detectPartia
|
|
|
511
516
|
|
|
512
517
|
```typescript
|
|
513
518
|
profanity.updateConfig({ caseSensitive: true, enableLeetSpeak: false });
|
|
514
|
-
profanity.check('
|
|
519
|
+
profanity.check('F**K'); // false (if caseSensitive)
|
|
515
520
|
profanity.updateConfig({ caseSensitive: false, enableLeetSpeak: true });
|
|
516
521
|
profanity.check('f#ck'); // true
|
|
517
522
|
```
|
|
@@ -591,9 +596,9 @@ Remove all loaded languages and dynamic words (start with a clean filter).
|
|
|
591
596
|
|
|
592
597
|
```typescript
|
|
593
598
|
profanity.clearList();
|
|
594
|
-
profanity.check('
|
|
599
|
+
profanity.check('f**k'); // false
|
|
595
600
|
profanity.loadLanguage('english');
|
|
596
|
-
profanity.check('
|
|
601
|
+
profanity.check('f**k'); // true
|
|
597
602
|
```
|
|
598
603
|
|
|
599
604
|
---
|
|
@@ -723,7 +728,7 @@ Edit `bekindprofanityfilter.config.json` to enable/disable features. Your IDE wi
|
|
|
723
728
|
|
|
724
729
|
## Cross-Language Innocence Scoring
|
|
725
730
|
|
|
726
|
-
Many words are profane in one language but perfectly innocent in another. For example, "
|
|
731
|
+
Many words are profane in one language but perfectly innocent in another. For example, "got" means "buttocks" in Turkish but is an extremely common English word, "fart" means "speed" in Scandinavian languages, and "bite" is a common English word that's vulgar in French. BeKind handles these cross-language collisions automatically using a multi-layer language detection and scoring system.
|
|
727
732
|
|
|
728
733
|
### Language Detection Architecture
|
|
729
734
|
|
|
@@ -745,10 +750,10 @@ Unicode codepoint ranges map characters directly to language families (e.g., Cyr
|
|
|
745
750
|
For each word, `scoreWord()` combines all three layers into a single `Record<string, number>` mapping language codes to confidence scores:
|
|
746
751
|
|
|
747
752
|
```
|
|
748
|
-
scoreWord("
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
753
|
+
scoreWord("got") → { en: 0.9, tr: 0.7, de: 0.2, ... }
|
|
754
|
+
↑ English trie match (extremely common word)
|
|
755
|
+
↑ Turkish trie match (profane in Turkish)
|
|
756
|
+
↑ German ELD n-gram signal
|
|
752
757
|
```
|
|
753
758
|
|
|
754
759
|
Layer weights: Script (1.0) > Trie (0.8) > ELD (0.6) > Suffix (0.3+) > Prefix (0.3+)
|
|
@@ -758,8 +763,8 @@ Layer weights: Script (1.0) > Trie (0.8) > ELD (0.6) > Suffix (0.3+) > Prefix (0
|
|
|
758
763
|
For full text, `detectLanguages()` runs `scoreWord()` on every word and aggregates results into document-level proportions:
|
|
759
764
|
|
|
760
765
|
```typescript
|
|
761
|
-
detectLanguages("
|
|
762
|
-
// → { languages: [{ language: "
|
|
766
|
+
detectLanguages("We got the tickets and went to the show")
|
|
767
|
+
// → { languages: [{ language: "en", proportion: 0.9 }, { language: "tr", proportion: 0.1 }, ...] }
|
|
763
768
|
```
|
|
764
769
|
|
|
765
770
|
*Note:* ELD often classifies Swedish as German due to n-gram similarity. The confusion map (see below) compensates for this.
|
|
@@ -798,41 +803,37 @@ If profane language dominates (profaneAmp > innocentAmp):
|
|
|
798
803
|
Result clamped to [0, 5]
|
|
799
804
|
```
|
|
800
805
|
|
|
801
|
-
The `dampeningFactor` (0-1) controls how aggressively the adjustment works per collision word. Words that are genuinely innocent in another language (e.g., "
|
|
806
|
+
The `dampeningFactor` (0-1) controls how aggressively the adjustment works per collision word. Words that are genuinely innocent in another language (e.g., "got" in English, df=0.95) get heavy dampening, while dangerous dual-meaning words (e.g., "cock" as rooster, df=0.1) barely adjust.
|
|
802
807
|
|
|
803
808
|
### End-to-End Flow
|
|
804
809
|
|
|
805
810
|
```
|
|
806
|
-
Text: "
|
|
807
|
-
|
|
808
|
-
|
|
811
|
+
Text: "All proceeds go to the local food bank"
|
|
812
|
+
^^^^^^
|
|
813
|
+
"go t" bridged → "got" detected (tr: s:4 c:4)
|
|
809
814
|
|
|
810
815
|
1. Collision word matched → check innocent-words map
|
|
811
|
-
"
|
|
816
|
+
"got" → innocent in English (meaning: "past tense of get", dampeningFactor: 0.95)
|
|
812
817
|
|
|
813
818
|
2. Language detection triggered (lazy — only runs on collision matches)
|
|
814
|
-
Document signal: detectLanguages() → {
|
|
815
|
-
Word signal: scoreWord("
|
|
819
|
+
Document signal: detectLanguages() → { en: 0.9, tr: 0.05, ... }
|
|
820
|
+
Word signal: scoreWord("got") → { en: 0.9, tr: 0.7, ... }
|
|
816
821
|
|
|
817
822
|
3. Weighted average (1.5:1 doc:word ratio)
|
|
818
|
-
amplified["
|
|
819
|
-
amplified["
|
|
820
|
-
amplified["en"] = (0.6 × 1.0 + 0.2 × 1.5) / 2.5 = 0.36
|
|
821
|
-
|
|
822
|
-
4. Confusion map: German signal → partial Swedish evidence
|
|
823
|
-
effectiveAmp["sv"] = max(0.32, 0.42 × 0.8) = 0.336
|
|
823
|
+
amplified["en"] = (0.9 × 1.0 + 0.9 × 1.5) / 2.5 = 0.90
|
|
824
|
+
amplified["tr"] = (0.7 × 1.0 + 0.05 × 1.5) / 2.5 = 0.31
|
|
824
825
|
|
|
825
|
-
|
|
826
|
-
→
|
|
827
|
-
→ Certainty dampened: 4 × (1 - 0.
|
|
828
|
-
→ Below flag threshold (s:
|
|
826
|
+
4. Innocent language (en: 0.90) > Profane language (tr: 0.31)?
|
|
827
|
+
→ Yes, English signal dominates
|
|
828
|
+
→ Certainty dampened: 4 × (1 - 0.95 × 0.90) = 0.58
|
|
829
|
+
→ Below flag threshold (s:4 needs c:2+) → NOT FLAGGED ✓
|
|
829
830
|
```
|
|
830
831
|
|
|
831
832
|
### Key Features
|
|
832
833
|
|
|
833
834
|
- **29 collision words** mapped across 8 languages (English, Swedish, Norwegian, Danish, German, Dutch, French, Spanish)
|
|
834
835
|
- **Per-word dampening factors** control adjustment strength:
|
|
835
|
-
- `0.
|
|
836
|
+
- `0.95` = heavy dampening (genuinely innocent cross-language, e.g., "got" in English)
|
|
836
837
|
- `0.1` = barely dampens (almost always used as profanity, e.g., "cock" in English)
|
|
837
838
|
- **Lazy language detection** — `detectLanguages()` only runs when a collision word is matched (zero performance cost for non-collision text)
|
|
838
839
|
- **Confusion map** — handles ELD n-gram detector's known misclassifications (e.g., Swedish often classified as German)
|
|
@@ -842,6 +843,7 @@ Text: "Programmet börjar klockan åtta och tar slut vid tio"
|
|
|
842
843
|
|
|
843
844
|
| Word | Profane In | Innocent In | Meaning |
|
|
844
845
|
|------|-----------|-------------|---------|
|
|
846
|
+
| got | Turkish | English | past tense of "get" (df: 0.95) |
|
|
845
847
|
| slut | English | Swedish, Danish | end/finish |
|
|
846
848
|
| fart | English | Swedish, Norwegian, Danish | speed |
|
|
847
849
|
| hell | English | Swedish, Norwegian | luck |
|
|
@@ -903,7 +905,7 @@ Severity reflects the number and variety of detected profanities:
|
|
|
903
905
|
- **Mixed Content:** Handles mixed-language and code-switched sentences with language-aware scoring.
|
|
904
906
|
|
|
905
907
|
```typescript
|
|
906
|
-
profanity.check('This is
|
|
908
|
+
profanity.check('This is b*llsh*t and चूतिया.'); // true (mixed English/Hindi)
|
|
907
909
|
profanity.check('Ce mot est merde and पागल.'); // true (French/Hindi)
|
|
908
910
|
profanity.check('Isso é uma merda.'); // true (Brazilian Portuguese)
|
|
909
911
|
```
|
|
@@ -916,7 +918,7 @@ For sample words in a language (for UIs, admin, etc):
|
|
|
916
918
|
|
|
917
919
|
```typescript
|
|
918
920
|
import { englishBadWords, hindiBadWords } from 'bekindprofanityfilter';
|
|
919
|
-
console.log(englishBadWords.slice(0, 5)); // ["
|
|
921
|
+
console.log(englishBadWords.slice(0, 5)); // ["f**k", "sh*t", ...]
|
|
920
922
|
```
|
|
921
923
|
|
|
922
924
|
---
|
|
@@ -925,7 +927,7 @@ console.log(englishBadWords.slice(0, 5)); // ["fuck", "shit", ...]
|
|
|
925
927
|
|
|
926
928
|
- **No wordlist exposure:** There is no `.list()` function for security and encapsulation. Use exported word arrays for samples.
|
|
927
929
|
- **TRIE-based:** Scales easily to 50,000+ words.
|
|
928
|
-
- **Handles leet-speak:** Catches obfuscated variants like `f#ck`, `
|
|
930
|
+
- **Handles leet-speak:** Catches obfuscated variants like `f#ck`, `a55h*le`.
|
|
929
931
|
|
|
930
932
|
---
|
|
931
933
|
|
|
@@ -947,7 +949,7 @@ profanity.addToWhitelist(['anal', 'ass']);
|
|
|
947
949
|
console.log(profanity.check('He is an associate professor.')); // false
|
|
948
950
|
|
|
949
951
|
// Severity
|
|
950
|
-
const result = profanity.detect('This is
|
|
952
|
+
const result = profanity.detect('This is f**king b*llsh*t and chutiya.');
|
|
951
953
|
console.log(ProfanitySeverity[result.severity]); // "SEVERE"
|
|
952
954
|
|
|
953
955
|
// Custom dictionary
|
|
@@ -957,7 +959,7 @@ console.log(profanity.check('You barnacle-head!')); // true
|
|
|
957
959
|
|
|
958
960
|
// Placeholder configuration
|
|
959
961
|
profanity.setPlaceholder('#');
|
|
960
|
-
console.log(profanity.clean('This is
|
|
962
|
+
console.log(profanity.clean('This is b*llsh*t.')); // "This is ########."
|
|
961
963
|
profanity.setPlaceholder('*'); // Reset
|
|
962
964
|
```
|
|
963
965
|
|
|
@@ -991,7 +993,7 @@ A: Yes! BeKind is universal.
|
|
|
991
993
|
- ✅ Additional language packs (Arabic, Russian, Japanese, Korean, Chinese, Dutch)
|
|
992
994
|
- ✅ Romanization detection (Hinglish and other transliterated scripts)
|
|
993
995
|
- 🚧 Norwegian and Danish trie vocabularies (currently covered via confusion map)
|
|
994
|
-
- 🚧 Repeat character compression (normalize
|
|
996
|
+
- 🚧 Repeat character compression (normalize elongated words before matching, avoiding the need to enumerate elongations in the dictionary)
|
|
995
997
|
- 🚧 Phonetic matching (sounds-like detection)
|
|
996
998
|
- 🚧 Plugin system for custom detection algorithms
|
|
997
999
|
|
|
@@ -1001,7 +1003,7 @@ A: Yes! BeKind is universal.
|
|
|
1001
1003
|
|
|
1002
1004
|
MIT — See [LICENSE](https://github.com/grassroots-labs-org/be-kind-profanity-filter/blob/main/LICENSE)
|
|
1003
1005
|
|
|
1004
|
-
This project is a fork of [
|
|
1006
|
+
This project is a fork of [AllProfanity](https://github.com/ayush-jadaun/allprofanity) by Ayush Jadaun, also licensed under MIT.
|
|
1005
1007
|
|
|
1006
1008
|
---
|
|
1007
1009
|
|