terlik.js 2.2.1 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -11
- package/dist/index.d.mts +16 -14
- package/dist/index.d.ts +16 -14
- package/dist/index.js +528 -45
- package/dist/index.mjs +528 -45
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
# terlik.js
|
|
2
2
|
|
|
3
|
-

|
|
3
|
+

|
|
4
4
|
|
|
5
5
|
[CI](https://github.com/badursun/terlik.js/actions/workflows/ci.yml)
|
|
6
6
|
[npm](https://www.npmjs.com/package/terlik.js)
|
|
7
7
|
[bundle size](https://bundlephobia.com/package/terlik.js)
|
|
8
8
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
Turkish-first multi-language profanity detection and filtering. Not a naive blacklist — a multi-layered normalization and pattern engine that catches what simple string matching misses.
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
**Turkish** is the flagship language with full coverage. **English**, **Spanish**, and **German** are community-maintained and open for contributions. Adding a new language takes just one folder containing two files.
|
|
13
13
|
|
|
14
14
|
Zero runtime dependencies. Full TypeScript. ESM + CJS. **35 KB** gzipped. Works in Node.js, Bun, Deno, browsers, Cloudflare Workers, and Edge runtimes — no Node.js-specific APIs used.
|
|
15
15
|
|
|
@@ -119,6 +119,8 @@ For suffixable roots, the engine appends an optional suffix group (up to 2 chain
|
|
|
119
119
|
|
|
120
120
|
### Language Packs
|
|
121
121
|
|
|
122
|
+
Community contributions to existing language packs (new words, variants, whitelist entries) and entirely new language packs are welcome! See [CONTRIBUTING.md](./CONTRIBUTING.md) for step-by-step instructions.
|
|
123
|
+
|
|
122
124
|
Each language lives in its own folder under `src/lang/`:
|
|
123
125
|
|
|
124
126
|
```
|
|
@@ -161,12 +163,12 @@ terlik.js ships with a **deliberately narrow dictionary** — the goal is to **m
|
|
|
161
163
|
|
|
162
164
|
### Coverage
|
|
163
165
|
|
|
164
|
-
| Language | Roots | Explicit Variants | Suffixes | Whitelist | Effective Forms |
|
|
165
|
-
|
|
166
|
-
| Turkish | 25 | 88 | 83 | 52 | ~3,000+ |
|
|
167
|
-
| English | 23 | 106 | 8 | 42 | ~700+ |
|
|
168
|
-
| Spanish | 19 | 73 | 13 | 15 | ~500+ |
|
|
169
|
-
| German | 18 | 48 | 8 | 3 | ~300+ |
|
|
166
|
+
| Language | Status | Roots | Explicit Variants | Suffixes | Whitelist | Effective Forms |
|
|
167
|
+
|---|---|---|---|---|---|---|
|
|
168
|
+
| Turkish | Flagship | 25 | 88 | 83 | 52 | ~3,000+ |
|
|
169
|
+
| English | Community | 23 | 106 | 8 | 42 | ~700+ |
|
|
170
|
+
| Spanish | Community | 19 | 73 | 13 | 15 | ~500+ |
|
|
171
|
+
| German | Community | 18 | 48 | 8 | 3 | ~300+ |
|
|
170
172
|
|
|
171
173
|
"Effective forms" = roots × normalization variants × suffix combinations × evasion patterns. A root like `sik` with 83 possible suffixes, leet decoding, separator tolerance, and repeat collapse produces thousands of detectable surface forms.
|
|
172
174
|
|
|
@@ -317,6 +319,7 @@ const terlik = new Terlik({
|
|
|
317
319
|
fuzzyAlgorithm: "levenshtein", // "levenshtein" | "dice"
|
|
318
320
|
maxLength: 10000, // truncate input beyond this
|
|
319
321
|
backgroundWarmup: false, // compile patterns in background via setTimeout
|
|
322
|
+
extendDictionary: undefined, // DictionaryData object to merge with built-in dictionary
|
|
320
323
|
});
|
|
321
324
|
```
|
|
322
325
|
|
|
@@ -379,6 +382,30 @@ const cache = Terlik.warmup(["tr", "en", "es", "de"]);
|
|
|
379
382
|
cache.get("en")!.containsProfanity("fuck"); // true — no cold start
|
|
380
383
|
```
|
|
381
384
|
|
|
385
|
+
### `extendDictionary` Option
|
|
386
|
+
|
|
387
|
+
Merge an external dictionary with the built-in one. Useful for teams managing custom word lists without modifying the core package:
|
|
388
|
+
|
|
389
|
+
```ts
|
|
390
|
+
const terlik = new Terlik({
|
|
391
|
+
extendDictionary: {
|
|
392
|
+
version: 1,
|
|
393
|
+
suffixes: ["ci", "cu"],
|
|
394
|
+
entries: [
|
|
395
|
+
{ root: "customword", variants: ["cust0mword"], severity: "high", category: "general", suffixable: true },
|
|
396
|
+
],
|
|
397
|
+
whitelist: ["safeterm"],
|
|
398
|
+
},
|
|
399
|
+
});
|
|
400
|
+
|
|
401
|
+
terlik.containsProfanity("customword"); // true
|
|
402
|
+
terlik.containsProfanity("customwordci"); // true (suffix match)
|
|
403
|
+
terlik.containsProfanity("safeterm"); // false (whitelisted)
|
|
404
|
+
terlik.containsProfanity("siktir"); // true (built-in still works)
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
The extension dictionary must follow the same schema as the built-in dictionaries. Duplicate roots are skipped; suffixes and whitelist entries are merged. The pattern cache is disabled for instances created with `extendDictionary`.
|
|
408
|
+
|
|
382
409
|
### `terlik.language: string`
|
|
383
410
|
|
|
384
411
|
Read-only property. Returns the language code of the instance.
|
|
@@ -412,7 +439,7 @@ deNormalize("Scheiße"); // "scheisse"
|
|
|
412
439
|
|
|
413
440
|
## Testing
|
|
414
441
|
|
|
415
|
-
|
|
442
|
+
874 tests covering all 4 languages, 25 Turkish root words, suffix detection, lazy compilation, multi-language isolation, normalization, fuzzy matching, cleaning, integration, ReDoS hardening, attack surface coverage, external dictionary merging, and edge cases:
|
|
416
443
|
|
|
417
444
|
```bash
|
|
418
445
|
pnpm test # run once
|
|
@@ -427,7 +454,7 @@ An interactive browser-based test environment is included. Chat interface on the
|
|
|
427
454
|
pnpm dev:live # http://localhost:2026
|
|
428
455
|
```
|
|
429
456
|
|
|
430
|
-
See [`
|
|
457
|
+
See [`tools/README.md`](./tools/README.md) for details.
|
|
431
458
|
|
|
432
459
|
### Integration Guide
|
|
433
460
|
|
|
@@ -451,6 +478,25 @@ See [CONTRIBUTING.md](./CONTRIBUTING.md) for contribution guidelines.
|
|
|
451
478
|
|
|
452
479
|
## Changelog
|
|
453
480
|
|
|
481
|
+
### 2026-02-28 (v2.3.0) — 40x Faster Cold Start: V8 JIT Regex Optimization
|
|
482
|
+
|
|
483
|
+
**Replaces `\p{L}`/`\p{N}` Unicode property escapes with explicit Latin ranges, eliminating V8 JIT bottleneck.**
|
|
484
|
+
|
|
485
|
+
- **40x faster cold start** — First `containsProfanity()` call: 16,494ms → 404ms.
|
|
486
|
+
- **356x faster multi-language warmup** — 4-language warmup: 19,234ms → 54ms.
|
|
487
|
+
- **13x less memory** — Heap usage: 492MB → 38MB.
|
|
488
|
+
- **Static pattern cache** — Same-language instances share compiled patterns via `Detector.patternCache`.
|
|
489
|
+
- **Background warmup** — Dev server starts instantly, warms up in background.
|
|
490
|
+
|
|
491
|
+
| Change | File |
|
|
492
|
+
|---|---|
|
|
493
|
+
| Replace `\p{L}\p{N}` with `[a-zA-Z0-9À-ɏ]` | `src/patterns.ts` |
|
|
494
|
+
| Static pattern cache + explicit range in getSurroundingWord | `src/detector.ts` |
|
|
495
|
+
| Explicit range in number expander + punctuation removal | `src/normalizer.ts` |
|
|
496
|
+
| Pass cacheKey to Detector | `src/terlik.ts` |
|
|
497
|
+
| Background warmup, lazy instance cache | `tools/server.ts` |
|
|
498
|
+
| NODE_OPTIONS heap safety net | `.github/workflows/ci.yml` |
|
|
499
|
+
|
|
454
500
|
### 2026-02-28 (v2.2.1) — CI Fix: Timeout Race Condition + İ Platform Compatibility
|
|
455
501
|
|
|
456
502
|
**Fixes detection failures on slow runners and cross-platform İ (U+0130) handling.**
|
package/dist/index.d.mts
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
/** Raw dictionary data structure as loaded from JSON. */
|
|
2
|
+
interface DictionaryData {
|
|
3
|
+
version: number;
|
|
4
|
+
suffixes: string[];
|
|
5
|
+
entries: Array<{
|
|
6
|
+
root: string;
|
|
7
|
+
variants: string[];
|
|
8
|
+
severity: string;
|
|
9
|
+
category: string;
|
|
10
|
+
suffixable: boolean;
|
|
11
|
+
}>;
|
|
12
|
+
whitelist: string[];
|
|
13
|
+
}
|
|
14
|
+
|
|
1
15
|
/** Profanity severity level. */
|
|
2
16
|
type Severity = "high" | "medium" | "low";
|
|
3
17
|
/** Detection mode controlling the balance between precision and recall. */
|
|
@@ -45,6 +59,8 @@ interface TerlikOptions {
|
|
|
45
59
|
replaceMask?: string;
|
|
46
60
|
/** Compile regexes and run JIT warmup in the background. Default: false. Not recommended in serverless environments. */
|
|
47
61
|
backgroundWarmup?: boolean;
|
|
62
|
+
/** External dictionary data to merge with the built-in language dictionary. */
|
|
63
|
+
extendDictionary?: DictionaryData;
|
|
48
64
|
}
|
|
49
65
|
/** Per-call detection options that override instance defaults. */
|
|
50
66
|
interface DetectOptions {
|
|
@@ -226,20 +242,6 @@ declare function levenshteinSimilarity(a: string, b: string): number;
|
|
|
226
242
|
*/
|
|
227
243
|
declare function diceSimilarity(a: string, b: string): number;
|
|
228
244
|
|
|
229
|
-
/** Raw dictionary data structure as loaded from JSON. */
|
|
230
|
-
interface DictionaryData {
|
|
231
|
-
version: number;
|
|
232
|
-
suffixes: string[];
|
|
233
|
-
entries: Array<{
|
|
234
|
-
root: string;
|
|
235
|
-
variants: string[];
|
|
236
|
-
severity: string;
|
|
237
|
-
category: string;
|
|
238
|
-
suffixable: boolean;
|
|
239
|
-
}>;
|
|
240
|
-
whitelist: string[];
|
|
241
|
-
}
|
|
242
|
-
|
|
243
245
|
interface LanguageConfig {
|
|
244
246
|
/** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
|
|
245
247
|
locale: string;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
/** Raw dictionary data structure as loaded from JSON. */
|
|
2
|
+
interface DictionaryData {
|
|
3
|
+
version: number;
|
|
4
|
+
suffixes: string[];
|
|
5
|
+
entries: Array<{
|
|
6
|
+
root: string;
|
|
7
|
+
variants: string[];
|
|
8
|
+
severity: string;
|
|
9
|
+
category: string;
|
|
10
|
+
suffixable: boolean;
|
|
11
|
+
}>;
|
|
12
|
+
whitelist: string[];
|
|
13
|
+
}
|
|
14
|
+
|
|
1
15
|
/** Profanity severity level. */
|
|
2
16
|
type Severity = "high" | "medium" | "low";
|
|
3
17
|
/** Detection mode controlling the balance between precision and recall. */
|
|
@@ -45,6 +59,8 @@ interface TerlikOptions {
|
|
|
45
59
|
replaceMask?: string;
|
|
46
60
|
/** Compile regexes and run JIT warmup in the background. Default: false. Not recommended in serverless environments. */
|
|
47
61
|
backgroundWarmup?: boolean;
|
|
62
|
+
/** External dictionary data to merge with the built-in language dictionary. */
|
|
63
|
+
extendDictionary?: DictionaryData;
|
|
48
64
|
}
|
|
49
65
|
/** Per-call detection options that override instance defaults. */
|
|
50
66
|
interface DetectOptions {
|
|
@@ -226,20 +242,6 @@ declare function levenshteinSimilarity(a: string, b: string): number;
|
|
|
226
242
|
*/
|
|
227
243
|
declare function diceSimilarity(a: string, b: string): number;
|
|
228
244
|
|
|
229
|
-
/** Raw dictionary data structure as loaded from JSON. */
|
|
230
|
-
interface DictionaryData {
|
|
231
|
-
version: number;
|
|
232
|
-
suffixes: string[];
|
|
233
|
-
entries: Array<{
|
|
234
|
-
root: string;
|
|
235
|
-
variants: string[];
|
|
236
|
-
severity: string;
|
|
237
|
-
category: string;
|
|
238
|
-
suffixable: boolean;
|
|
239
|
-
}>;
|
|
240
|
-
whitelist: string[];
|
|
241
|
-
}
|
|
242
|
-
|
|
243
245
|
interface LanguageConfig {
|
|
244
246
|
/** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
|
|
245
247
|
locale: string;
|