terlik.js 2.3.0 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,17 +1,32 @@
1
1
  # terlik.js
2
2
 
3
- ![terlik.js](git-header.png)
3
+ ![terlik.js](assets/git-header.png)
4
4
 
5
5
  [![CI](https://github.com/badursun/terlik.js/actions/workflows/ci.yml/badge.svg)](https://github.com/badursun/terlik.js/actions/workflows/ci.yml)
6
6
  [![npm version](https://img.shields.io/npm/v/terlik.js.svg)](https://www.npmjs.com/package/terlik.js)
7
+ [![npm downloads](https://img.shields.io/npm/dm/terlik.js.svg)](https://www.npmjs.com/package/terlik.js)
7
8
  [![npm bundle size](https://img.shields.io/bundlephobia/minzip/terlik.js)](https://bundlephobia.com/package/terlik.js)
9
+ [![TypeScript](https://img.shields.io/badge/TypeScript-Ready-blue.svg)](https://www.typescriptlang.org/)
10
+ [![zero dependencies](https://img.shields.io/badge/dependencies-0-brightgreen.svg)](https://www.npmjs.com/package/terlik.js?activeTab=dependencies)
8
11
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
12
 
10
- Production-grade multi-language profanity detection and filtering. Not a naive blacklist — a multi-layered normalization and pattern engine that catches what simple string matching misses.
13
+ Multi-language profanity detection and filtering engine, designed Turkish-first and **extensible to any language**. Not a naive blacklist — a multi-layered normalization and pattern engine that catches what simple string matching misses.
11
14
 
12
- Built-in support for **Turkish**, **English**, **Spanish**, and **German**. Adding a new language is just a folder with two files.
15
+ Ships with **Turkish** (flagship, full coverage), **English**, **Spanish**, and **German** built-in. Add any language with a folder and two files, or extend at runtime via `extendDictionary`.
13
16
 
14
- Zero runtime dependencies. Full TypeScript. ESM + CJS. **35 KB** gzipped. Works in Node.js, Bun, Deno, browsers, Cloudflare Workers, and Edge runtimes — no Node.js-specific APIs used.
17
+ > **Türkçe:** Türkçe öncelikli, her dile genişletilebilir küfür tespit ve filtreleme motoru. Leet speak, karakter tekrarı, ayırıcı karakterler ve Türkçe ek sistemi desteği ile yaratıcı küfür denemelerini yakalar. Sıfır bağımlılık, TypeScript, 35 KB.
18
+
19
+ ## Features
20
+
21
+ - **Extensible to any language** — ships with TR/EN/ES/DE, add more via language packs or `extendDictionary`
22
+ - Catches leet speak, separators, char repetition, mixed case, zero-width chars
23
+ - Turkish suffix engine (83 suffixes, ~3,000+ detectable forms from 25 roots)
24
+ - Three detection modes: strict, balanced, loose (with fuzzy matching)
25
+ - Zero dependencies, **35 KB** gzipped
26
+ - ESM + CJS — works in Node.js, Bun, Deno, browsers, Cloudflare Workers, Edge runtimes
27
+ - Lazy compilation: ~1.5ms construction, <1ms per check after warmup
28
+ - ReDoS-safe regex patterns with timeout safety net
29
+ - Full TypeScript support with exported types
15
30
 
16
31
  ## Why terlik.js?
17
32
 
@@ -113,12 +128,14 @@ input
113
128
  → result
114
129
  ```
115
130
 
116
- Each language has its own char map, leet map, char classes, and optional number expansions. The engine is language-agnostic — only the data is language-specific.
131
+ Each language has its own char map, leet map, char classes, and optional number expansions. The engine is language-agnostic — only the data is language-specific. This means **any language can be added** without modifying the core engine.
117
132
 
118
133
  For suffixable roots, the engine appends an optional suffix group (up to 2 chained suffixes). Turkish has 83 suffixes (including question particles and adverbial forms), English has 8, Spanish has 13, German has 8.
119
134
 
120
135
  ### Language Packs
121
136
 
137
+ Community contributions to existing language packs (new words, variants, whitelist entries) and entirely new language packs are welcome! See [CONTRIBUTING.md](./CONTRIBUTING.md) for step-by-step instructions.
138
+
122
139
  Each language lives in its own folder under `src/lang/`:
123
140
 
124
141
  ```
@@ -161,15 +178,17 @@ terlik.js ships with a **deliberately narrow dictionary** — the goal is to **m
161
178
 
162
179
  ### Coverage
163
180
 
164
- | Language | Roots | Explicit Variants | Suffixes | Whitelist | Effective Forms |
165
- |---|---|---|---|---|---|
166
- | Turkish | 25 | 88 | 83 | 52 | ~3,000+ |
167
- | English | 23 | 106 | 8 | 42 | ~700+ |
168
- | Spanish | 19 | 73 | 13 | 15 | ~500+ |
169
- | German | 18 | 48 | 8 | 3 | ~300+ |
181
+ | Language | Status | Roots | Explicit Variants | Suffixes | Whitelist | Effective Forms |
182
+ |---|---|---|---|---|---|---|
183
+ | Turkish | Flagship | 25 | 88 | 83 | 52 | ~3,000+ |
184
+ | English | Community | 23 | 106 | 8 | 42 | ~700+ |
185
+ | Spanish | Community | 19 | 73 | 13 | 15 | ~500+ |
186
+ | German | Community | 18 | 48 | 8 | 3 | ~300+ |
170
187
 
171
188
  "Effective forms" = roots × normalization variants × suffix combinations × evasion patterns. A root like `sik` with 83 possible suffixes, leet decoding, separator tolerance, and repeat collapse produces thousands of detectable surface forms.
172
189
 
190
+ > **Add your language!** The engine is language-agnostic. See [Adding a New Language](#adding-a-new-language) or use [`extendDictionary`](#extenddictionary-option) for runtime extension.
191
+
173
192
  ### What IS Covered
174
193
 
175
194
  - **Core profanity roots** per language (high-severity sexual, insults, slurs)
@@ -306,7 +325,7 @@ Reproduce: `pnpm bench:accuracy` — outputs per-category breakdown, failure lis
306
325
 
307
326
  ```ts
308
327
  const terlik = new Terlik({
309
- language: "tr", // "tr" | "en" | "es" | "de" (default: "tr")
328
+ language: "tr", // built-in: "tr" | "en" | "es" | "de" (default: "tr")
310
329
  mode: "balanced", // "strict" | "balanced" | "loose"
311
330
  maskStyle: "stars", // "stars" | "partial" | "replace"
312
331
  replaceMask: "[***]", // mask text for "replace" style
@@ -317,6 +336,7 @@ const terlik = new Terlik({
317
336
  fuzzyAlgorithm: "levenshtein", // "levenshtein" | "dice"
318
337
  maxLength: 10000, // truncate input beyond this
319
338
  backgroundWarmup: false, // compile patterns in background via setTimeout
339
+ extendDictionary: undefined, // DictionaryData object to merge with built-in dictionary
320
340
  });
321
341
  ```
322
342
 
@@ -379,6 +399,30 @@ const cache = Terlik.warmup(["tr", "en", "es", "de"]);
379
399
  cache.get("en")!.containsProfanity("fuck"); // true — no cold start
380
400
  ```
381
401
 
402
+ ### `extendDictionary` Option
403
+
404
+ Merge an external dictionary with the built-in one. Useful for teams managing custom word lists without modifying the core package:
405
+
406
+ ```ts
407
+ const terlik = new Terlik({
408
+ extendDictionary: {
409
+ version: 1,
410
+ suffixes: ["ci", "cu"],
411
+ entries: [
412
+ { root: "customword", variants: ["cust0mword"], severity: "high", category: "general", suffixable: true },
413
+ ],
414
+ whitelist: ["safeterm"],
415
+ },
416
+ });
417
+
418
+ terlik.containsProfanity("customword"); // true
419
+ terlik.containsProfanity("customwordci"); // true (suffix match)
420
+ terlik.containsProfanity("safeterm"); // false (whitelisted)
421
+ terlik.containsProfanity("siktir"); // true (built-in still works)
422
+ ```
423
+
424
+ The extension dictionary must follow the same schema as built-in dictionaries. Duplicate roots are skipped; suffixes and whitelist entries are merged. Pattern cache is disabled for extended instances.
425
+
382
426
  ### `terlik.language: string`
383
427
 
384
428
  Read-only property. Returns the language code of the instance.
@@ -412,7 +456,7 @@ deNormalize("Scheiße"); // "scheisse"
412
456
 
413
457
  ## Testing
414
458
 
415
- 631 tests covering all 4 languages, 25 Turkish root words, suffix detection, lazy compilation, multi-language isolation, normalization, fuzzy matching, cleaning, integration, ReDoS hardening, attack surface coverage, and edge cases:
459
+ 874 tests covering all built-in languages, 25 Turkish root words, suffix detection, lazy compilation, multi-language isolation, normalization, fuzzy matching, cleaning, integration, ReDoS hardening, attack surface coverage, external dictionary merging, and edge cases:
416
460
 
417
461
  ```bash
418
462
  pnpm test # run once
@@ -427,7 +471,7 @@ An interactive browser-based test environment is included. Chat interface on the
427
471
  pnpm dev:live # http://localhost:2026
428
472
  ```
429
473
 
430
- See [`live_test_server/README.md`](./live_test_server/README.md) for details.
474
+ See [`tools/README.md`](./tools/README.md) for details.
431
475
 
432
476
  ### Integration Guide
433
477
 
@@ -451,99 +495,7 @@ See [CONTRIBUTING.md](./CONTRIBUTING.md) for contribution guidelines.
451
495
 
452
496
  ## Changelog
453
497
 
454
- ### 2026-02-28 (v2.3.0) — 40x Faster Cold Start: V8 JIT Regex Optimization
455
-
456
- **Replaces `\p{L}`/`\p{N}` Unicode property escapes with explicit Latin ranges, eliminating V8 JIT bottleneck.**
457
-
458
- - **40x faster cold start** — First `containsProfanity()` call: 16,494ms → 404ms.
459
- - **356x faster multi-language warmup** — 4-language warmup: 19,234ms → 54ms.
460
- - **13x less memory** — Heap usage: 492MB → 38MB.
461
- - **Static pattern cache** — Same-language instances share compiled patterns via `Detector.patternCache`.
462
- - **Background warmup** — Dev server starts instantly, warms up in background.
463
-
464
- | Change | File |
465
- |---|---|
466
- | Replace `\p{L}\p{N}` with `[a-zA-Z0-9À-ɏ]` | `src/patterns.ts` |
467
- | Static pattern cache + explicit range in getSurroundingWord | `src/detector.ts` |
468
- | Explicit range in number expander + punctuation removal | `src/normalizer.ts` |
469
- | Pass cacheKey to Detector | `src/terlik.ts` |
470
- | Background warmup, lazy instance cache | `live_test_server/server.ts` |
471
- | NODE_OPTIONS heap safety net | `.github/workflows/ci.yml` |
472
-
473
- ### 2026-02-28 (v2.2.1) — CI Fix: Timeout Race Condition + İ Platform Compatibility
474
-
475
- **Fixes detection failures on slow runners and cross-platform İ (U+0130) handling.**
476
-
477
- - **Timeout race condition fix** — `REGEX_TIMEOUT_MS` check moved from _before_ match processing to _after_. Previously, V8 JIT compilation on first `exec()` call (triggered by lazy compilation) could exceed 250ms, causing the timeout to discard a valid match before it was recorded. Now the current match is always processed; the timeout only prevents scanning for additional matches.
478
- - **İ (U+0130) cross-platform fix** — First regex pass now runs on `text.toLocaleLowerCase(locale)` instead of raw text. Turkish İ→i mapping is performed explicitly before regex matching, avoiding inconsistent V8/ICU case-folding behavior across platforms (Ubuntu vs macOS). The `mapNormalizedToOriginal()` mapper recovers original-cased words for result output.
479
-
480
- | Change | File |
481
- |---|---|
482
- | Timeout check moved after match processing | `src/detector.ts` (`runPatterns`) |
483
- | Locale-lower first pass for İ safety | `src/detector.ts` (`detectPattern`) |
484
-
485
- ### 2026-02-28 (v2.2) — Lazy Compilation + Linguistic Patch
486
-
487
- **Zero-cost construction. Background warmup. Turkish agglutination hardening.**
488
-
489
- - **Lazy compilation** — Pattern compilation deferred from constructor to first `detect()` call. `new Terlik()` drops from ~225ms to **~1.5ms**. Strict-mode users never pay regex cost (hash lookup only).
490
- - **`backgroundWarmup` option** — `new Terlik({ backgroundWarmup: true })` schedules compilation + JIT warmup via `setTimeout(fn, 0)`. Idempotent: if `detect()` is called before the timer fires, it compiles synchronously and the timer becomes a no-op.
491
- - **`detector.compile()` public method** — Allows manual precompilation for advanced use cases.
492
- - **Turkish suffix expansion** — Added question particles (`misin`, `misiniz`, `musun`, `musunuz`, `miyim`, `miyiz`) and adverbial forms (`cesine`, `casina`) to suffix engine (now 83 total). All suffixable entries (orospu, piç, yarrak, ibne, etc.) now catch question and adverbial inflections.
493
- - **Deep agglutination variants** — Added explicit variants for `siktiğimin`, `sikermisiniz`, `sikermisin`, `siktirmişcesine`. These forms require 3+ suffix chains or non-standard morpheme boundaries (ğ→g bridge) that the suffix engine can't generalize without false positives.
494
- - **`MAX_PATTERN_LENGTH` 6000 → 10000** — Accommodates the larger suffix group without fallback to non-suffix mode.
495
- - **Test count** — 619 → 631. New `tests/lazy-compilation.test.ts` covers construction timing, transparent lazy compile, strict-mode optimization, backgroundWarmup with fake timers, and idempotent early-detect.
496
-
497
- | Change | File |
498
- |---|---|
499
- | `backgroundWarmup` option | `src/types.ts` |
500
- | Lazy `_patterns`, `ensureCompiled()`, `compile()` | `src/detector.ts` |
501
- | backgroundWarmup setTimeout scheduling | `src/terlik.ts` |
502
- | Suffix + variant expansion, MAX_PATTERN_LENGTH | `src/patterns.ts`, `src/lang/tr/dictionary.json` |
503
- | Lazy compilation tests (new) | `tests/lazy-compilation.test.ts` |
504
-
505
- ### 2026-02-28 (v2.1) — ReDoS Security Hardening
506
-
507
- **Added Regex Denial-of-Service protection.**
508
-
509
- Identified vulnerability: overlap between `charClasses` and `separator` (`@`, `$`, `!`, `|`, `+`, `#`, `€`, `¢`, `©` could be matched by both char class and separator) enabled polynomial O(n^2) backtracking via adversarial input.
510
-
511
- - **Bounded separator** — `[^\p{L}\p{N}]*` (unbounded) replaced with `[^\p{L}\p{N}]{0,3}` (max 3 chars). Real-world evasions (`s.i.k.t.i.r`, `s_i_k`) use 1 separator char. This reduces backtracking from O(n^2) to O(1) per boundary.
512
- - **Regex timeout safety net** — Added 250ms timeout (`REGEX_TIMEOUT_MS`) to `runPatterns()` and `detectFuzzy()` loops. Never triggers on normal input (<1ms), but provides a hard cap on adversarial input.
513
- - **charClasses cleanup** — Removed separator-overlapping symbols from all 4 language configs (TR, EN, ES, DE). These symbols are already defined in `leetMap` and converted during the normalizer pass — removing them from pattern matching causes no false negatives.
514
- - **ReDoS test suite** — `tests/redos.test.ts`: 71 tests covering adversarial timing, attack surface (separator abuse, leet bypass, char repetition, Unicode tricks, whitelist integrity, boundary attacks, multi-match, input edge cases, suffix hardening).
515
- - **MAX_PATTERN_LENGTH** — 5000 → 6000 (later raised to 10000 in v2.2). The `{0,3}` separator adds ~3 chars per boundary; raised the limit so large suffix patterns (e.g. `orospu`) don't fall back to non-suffix mode.
516
- - **Test count** — 548 → 619.
517
-
518
- | Change | File |
519
- |---|---|
520
- | Separator `*` → `{0,3}`, timeout constant | `src/patterns.ts` |
521
- | Timeout loop guard | `src/detector.ts` |
522
- | charClasses cleanup | `src/lang/{tr,en,es,de}/config.ts` |
523
- | ReDoS + attack surface test suite (new) | `tests/redos.test.ts` |
524
-
525
- ### 2026-02-28 (v2)
526
-
527
- **Multi-Language Support**
528
-
529
- - **4 built-in languages** — Turkish (tr), English (en), Spanish (es), German (de). Each language is a self-contained folder (`src/lang/xx/`) with `config.ts` and `dictionary.json`.
530
- - **Folder-based language packs** — Adding a new language requires creating one folder with two files and one import line in the registry.
531
- - **`Terlik.warmup()`** — Static method to create and JIT-warm multiple language instances at once for server deployments.
532
- - **`language` option** — `new Terlik({ language: "en" })`. Default remains `"tr"` (backward compatible).
533
- - **Language-agnostic engine** — Normalizer, pattern compiler, detector, and cleaner are now fully parametric. Language-specific data (charMap, leetMap, charClasses, numberExpansions) comes from config files.
534
- - **New exports** — `createNormalizer`, `getLanguageConfig`, `getSupportedLanguages`, `LanguageConfig` type.
535
- - **Test coverage** — 346 → 418 tests. Added language-specific tests, cross-language isolation tests, and registry tests.
536
-
537
- ### 2026-02-28
538
-
539
- **Suffix Engine + JSON Dictionary Migration**
540
-
541
- - **JSON dictionary** — Migrated dictionary from `tr.ts` to community-friendly `tr.json` format. Added runtime schema validation (`validateDictionary`). Each entry now includes `category` and `suffixable` fields.
542
- - **Suffix engine** — Defined Turkish grammatical suffixes (later expanded to 83 in v2.2). Suffixable roots (`orospu`, `salak`, `aptal`, `kahpe`, etc.) automatically catch inflected forms like `orospuluk`, `salaksin`, `aptallarin`, `kahpeler`. Short roots (3-char: `sik`, `bok`, `göt`, `döl`) use explicit variants instead to prevent false positives.
543
- - **Critical bug fix: `\W` separator** — JavaScript's `\W` treats Turkish characters (`ı`, `ş`, `ğ`, `ö`, `ü`, `ç`) as non-word characters. The pattern engine separator `[\W_]*` was changed to `[^\p{L}\p{N}]*` (Unicode-aware). This fixed false positives on innocent words like `sıkma`, `sıkıntı`, `sıkıştı`.
544
- - **Live test server warmup fix** — Fixed cache key mismatch and added JIT warmup. First request latency reduced from 3318ms to 37ms.
545
- - **Test coverage** — 101 → 346 tests. All 25 root words are comprehensively tested.
546
- - **Expanded whitelist** — Added `ama`, `ami`, `amen`, `amir`, `amil`, `dolmen`.
498
+ See [CHANGELOG.md](./CHANGELOG.md) for the full version history.
547
499
 
548
500
  ## License
549
501
 
package/dist/index.d.mts CHANGED
@@ -1,3 +1,17 @@
1
+ /** Raw dictionary data structure as loaded from JSON. */
2
+ interface DictionaryData {
3
+ version: number;
4
+ suffixes: string[];
5
+ entries: Array<{
6
+ root: string;
7
+ variants: string[];
8
+ severity: string;
9
+ category: string;
10
+ suffixable: boolean;
11
+ }>;
12
+ whitelist: string[];
13
+ }
14
+
1
15
  /** Profanity severity level. */
2
16
  type Severity = "high" | "medium" | "low";
3
17
  /** Detection mode controlling the balance between precision and recall. */
@@ -45,6 +59,8 @@ interface TerlikOptions {
45
59
  replaceMask?: string;
46
60
  /** Background'da regex derleme + JIT warmup. Default: false. Serverless'da önerilmez. */
47
61
  backgroundWarmup?: boolean;
62
+ /** External dictionary data to merge with the built-in language dictionary. */
63
+ extendDictionary?: DictionaryData;
48
64
  }
49
65
  /** Per-call detection options that override instance defaults. */
50
66
  interface DetectOptions {
@@ -226,20 +242,6 @@ declare function levenshteinSimilarity(a: string, b: string): number;
226
242
  */
227
243
  declare function diceSimilarity(a: string, b: string): number;
228
244
 
229
- /** Raw dictionary data structure as loaded from JSON. */
230
- interface DictionaryData {
231
- version: number;
232
- suffixes: string[];
233
- entries: Array<{
234
- root: string;
235
- variants: string[];
236
- severity: string;
237
- category: string;
238
- suffixable: boolean;
239
- }>;
240
- whitelist: string[];
241
- }
242
-
243
245
  interface LanguageConfig {
244
246
  /** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
245
247
  locale: string;
package/dist/index.d.ts CHANGED
@@ -1,3 +1,17 @@
1
+ /** Raw dictionary data structure as loaded from JSON. */
2
+ interface DictionaryData {
3
+ version: number;
4
+ suffixes: string[];
5
+ entries: Array<{
6
+ root: string;
7
+ variants: string[];
8
+ severity: string;
9
+ category: string;
10
+ suffixable: boolean;
11
+ }>;
12
+ whitelist: string[];
13
+ }
14
+
1
15
  /** Profanity severity level. */
2
16
  type Severity = "high" | "medium" | "low";
3
17
  /** Detection mode controlling the balance between precision and recall. */
@@ -45,6 +59,8 @@ interface TerlikOptions {
45
59
  replaceMask?: string;
46
60
  /** Background'da regex derleme + JIT warmup. Default: false. Serverless'da önerilmez. */
47
61
  backgroundWarmup?: boolean;
62
+ /** External dictionary data to merge with the built-in language dictionary. */
63
+ extendDictionary?: DictionaryData;
48
64
  }
49
65
  /** Per-call detection options that override instance defaults. */
50
66
  interface DetectOptions {
@@ -226,20 +242,6 @@ declare function levenshteinSimilarity(a: string, b: string): number;
226
242
  */
227
243
  declare function diceSimilarity(a: string, b: string): number;
228
244
 
229
- /** Raw dictionary data structure as loaded from JSON. */
230
- interface DictionaryData {
231
- version: number;
232
- suffixes: string[];
233
- entries: Array<{
234
- root: string;
235
- variants: string[];
236
- severity: string;
237
- category: string;
238
- suffixable: boolean;
239
- }>;
240
- whitelist: string[];
241
- }
242
-
243
245
  interface LanguageConfig {
244
246
  /** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
245
247
  locale: string;