terlik.js 2.2.1 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,15 +1,15 @@
1
1
  # terlik.js
2
2
 
3
- ![terlik.js](git-header.png)
3
+ ![terlik.js](assets/git-header.png)
4
4
 
5
5
  [![CI](https://github.com/badursun/terlik.js/actions/workflows/ci.yml/badge.svg)](https://github.com/badursun/terlik.js/actions/workflows/ci.yml)
6
6
  [![npm version](https://img.shields.io/npm/v/terlik.js.svg)](https://www.npmjs.com/package/terlik.js)
7
7
  [![npm bundle size](https://img.shields.io/bundlephobia/minzip/terlik.js)](https://bundlephobia.com/package/terlik.js)
8
8
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
9
 
10
- Production-grade multi-language profanity detection and filtering. Not a naive blacklist — a multi-layered normalization and pattern engine that catches what simple string matching misses.
10
+ Turkish-first multi-language profanity detection and filtering. Not a naive blacklist — a multi-layered normalization and pattern engine that catches what simple string matching misses.
11
11
 
12
- Built-in support for **Turkish**, **English**, **Spanish**, and **German**. Adding a new language is just a folder with two files.
12
+ **Turkish** is the flagship language with full coverage. **English**, **Spanish**, and **German** are community-maintained and open for contributions. Adding a new language is just a folder with two files.
13
13
 
14
14
  Zero runtime dependencies. Full TypeScript. ESM + CJS. **35 KB** gzipped. Works in Node.js, Bun, Deno, browsers, Cloudflare Workers, and Edge runtimes — no Node.js-specific APIs used.
15
15
 
@@ -119,6 +119,8 @@ For suffixable roots, the engine appends an optional suffix group (up to 2 chain
119
119
 
120
120
  ### Language Packs
121
121
 
122
+ Community contributions to existing language packs (new words, variants, whitelist entries) and entirely new language packs are welcome! See [CONTRIBUTING.md](./CONTRIBUTING.md) for step-by-step instructions.
123
+
122
124
  Each language lives in its own folder under `src/lang/`:
123
125
 
124
126
  ```
@@ -161,12 +163,12 @@ terlik.js ships with a **deliberately narrow dictionary** — the goal is to **m
161
163
 
162
164
  ### Coverage
163
165
 
164
- | Language | Roots | Explicit Variants | Suffixes | Whitelist | Effective Forms |
165
- |---|---|---|---|---|---|
166
- | Turkish | 25 | 88 | 83 | 52 | ~3,000+ |
167
- | English | 23 | 106 | 8 | 42 | ~700+ |
168
- | Spanish | 19 | 73 | 13 | 15 | ~500+ |
169
- | German | 18 | 48 | 8 | 3 | ~300+ |
166
+ | Language | Status | Roots | Explicit Variants | Suffixes | Whitelist | Effective Forms |
167
+ |---|---|---|---|---|---|---|
168
+ | Turkish | Flagship | 25 | 88 | 83 | 52 | ~3,000+ |
169
+ | English | Community | 23 | 106 | 8 | 42 | ~700+ |
170
+ | Spanish | Community | 19 | 73 | 13 | 15 | ~500+ |
171
+ | German | Community | 18 | 48 | 8 | 3 | ~300+ |
170
172
 
171
173
  "Effective forms" = roots × normalization variants × suffix combinations × evasion patterns. A root like `sik` with 83 possible suffixes, leet decoding, separator tolerance, and repeat collapse produces thousands of detectable surface forms.
172
174
 
@@ -317,6 +319,7 @@ const terlik = new Terlik({
317
319
  fuzzyAlgorithm: "levenshtein", // "levenshtein" | "dice"
318
320
  maxLength: 10000, // truncate input beyond this
319
321
  backgroundWarmup: false, // compile patterns in background via setTimeout
322
+ extendDictionary: undefined, // DictionaryData object to merge with built-in dictionary
320
323
  });
321
324
  ```
322
325
 
@@ -379,6 +382,30 @@ const cache = Terlik.warmup(["tr", "en", "es", "de"]);
379
382
  cache.get("en")!.containsProfanity("fuck"); // true — no cold start
380
383
  ```
381
384
 
385
+ ### `extendDictionary` Option
386
+
387
+ Merge an external dictionary with the built-in one. Useful for teams managing custom word lists without modifying the core package:
388
+
389
+ ```ts
390
+ const terlik = new Terlik({
391
+ extendDictionary: {
392
+ version: 1,
393
+ suffixes: ["ci", "cu"],
394
+ entries: [
395
+ { root: "customword", variants: ["cust0mword"], severity: "high", category: "general", suffixable: true },
396
+ ],
397
+ whitelist: ["safeterm"],
398
+ },
399
+ });
400
+
401
+ terlik.containsProfanity("customword"); // true
402
+ terlik.containsProfanity("customwordci"); // true (suffix match)
403
+ terlik.containsProfanity("safeterm"); // false (whitelisted)
404
+ terlik.containsProfanity("siktir"); // true (built-in still works)
405
+ ```
406
+
407
+ The extension dictionary must follow the same schema as built-in dictionaries. Duplicate roots are skipped; suffixes and whitelist entries are merged. The pattern cache is disabled for extended instances.
408
+
382
409
  ### `terlik.language: string`
383
410
 
384
411
  Read-only property. Returns the language code of the instance.
@@ -412,7 +439,7 @@ deNormalize("Scheiße"); // "scheisse"
412
439
 
413
440
  ## Testing
414
441
 
415
- 631 tests covering all 4 languages, 25 Turkish root words, suffix detection, lazy compilation, multi-language isolation, normalization, fuzzy matching, cleaning, integration, ReDoS hardening, attack surface coverage, and edge cases:
442
+ 874 tests covering all 4 languages, 25 Turkish root words, suffix detection, lazy compilation, multi-language isolation, normalization, fuzzy matching, cleaning, integration, ReDoS hardening, attack surface coverage, external dictionary merging, and edge cases:
416
443
 
417
444
  ```bash
418
445
  pnpm test # run once
@@ -427,7 +454,7 @@ An interactive browser-based test environment is included. Chat interface on the
427
454
  pnpm dev:live # http://localhost:2026
428
455
  ```
429
456
 
430
- See [`live_test_server/README.md`](./live_test_server/README.md) for details.
457
+ See [`tools/README.md`](./tools/README.md) for details.
431
458
 
432
459
  ### Integration Guide
433
460
 
@@ -451,6 +478,25 @@ See [CONTRIBUTING.md](./CONTRIBUTING.md) for contribution guidelines.
451
478
 
452
479
  ## Changelog
453
480
 
481
+ ### 2026-02-28 (v2.3.0) — 40x Faster Cold Start: V8 JIT Regex Optimization
482
+
483
+ **Replaces `\p{L}`/`\p{N}` Unicode property escapes with explicit Latin ranges, eliminating a V8 JIT bottleneck.**
484
+
485
+ - **40x faster cold start** — First `containsProfanity()` call: 16,494ms → 404ms.
486
+ - **356x faster multi-language warmup** — 4-language warmup: 19,234ms → 54ms.
487
+ - **13x less memory** — Heap usage: 492MB → 38MB.
488
+ - **Static pattern cache** — Same-language instances share compiled patterns via `Detector.patternCache`.
489
+ - **Background warmup** — Dev server starts instantly and warms up in the background.
490
+
491
+ | Change | File |
492
+ |---|---|
493
+ | Replace `\p{L}\p{N}` with `[a-zA-Z0-9À-ɏ]` | `src/patterns.ts` |
494
+ | Static pattern cache + explicit range in getSurroundingWord | `src/detector.ts` |
495
+ | Explicit range in number expander + punctuation removal | `src/normalizer.ts` |
496
+ | Pass cacheKey to Detector | `src/terlik.ts` |
497
+ | Background warmup, lazy instance cache | `tools/server.ts` |
498
+ | NODE_OPTIONS heap safety net | `.github/workflows/ci.yml` |
499
+
454
500
  ### 2026-02-28 (v2.2.1) — CI Fix: Timeout Race Condition + İ Platform Compatibility
455
501
 
456
502
  **Fixes detection failures on slow runners and cross-platform İ (U+0130) handling.**
package/dist/index.d.mts CHANGED
@@ -1,3 +1,17 @@
1
+ /** Raw dictionary data structure as loaded from JSON. */
2
+ interface DictionaryData {
3
+ version: number;
4
+ suffixes: string[];
5
+ entries: Array<{
6
+ root: string;
7
+ variants: string[];
8
+ severity: string;
9
+ category: string;
10
+ suffixable: boolean;
11
+ }>;
12
+ whitelist: string[];
13
+ }
14
+
1
15
  /** Profanity severity level. */
2
16
  type Severity = "high" | "medium" | "low";
3
17
  /** Detection mode controlling the balance between precision and recall. */
@@ -45,6 +59,8 @@ interface TerlikOptions {
45
59
  replaceMask?: string;
46
60
  /** Compile regexes and warm up the JIT in the background. Default: false. Not recommended in serverless environments. */
47
61
  backgroundWarmup?: boolean;
62
+ /** External dictionary data to merge with the built-in language dictionary. */
63
+ extendDictionary?: DictionaryData;
48
64
  }
49
65
  /** Per-call detection options that override instance defaults. */
50
66
  interface DetectOptions {
@@ -226,20 +242,6 @@ declare function levenshteinSimilarity(a: string, b: string): number;
226
242
  */
227
243
  declare function diceSimilarity(a: string, b: string): number;
228
244
 
229
- /** Raw dictionary data structure as loaded from JSON. */
230
- interface DictionaryData {
231
- version: number;
232
- suffixes: string[];
233
- entries: Array<{
234
- root: string;
235
- variants: string[];
236
- severity: string;
237
- category: string;
238
- suffixable: boolean;
239
- }>;
240
- whitelist: string[];
241
- }
242
-
243
245
  interface LanguageConfig {
244
246
  /** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
245
247
  locale: string;
package/dist/index.d.ts CHANGED
@@ -1,3 +1,17 @@
1
+ /** Raw dictionary data structure as loaded from JSON. */
2
+ interface DictionaryData {
3
+ version: number;
4
+ suffixes: string[];
5
+ entries: Array<{
6
+ root: string;
7
+ variants: string[];
8
+ severity: string;
9
+ category: string;
10
+ suffixable: boolean;
11
+ }>;
12
+ whitelist: string[];
13
+ }
14
+
1
15
  /** Profanity severity level. */
2
16
  type Severity = "high" | "medium" | "low";
3
17
  /** Detection mode controlling the balance between precision and recall. */
@@ -45,6 +59,8 @@ interface TerlikOptions {
45
59
  replaceMask?: string;
46
60
  /** Compile regexes and warm up the JIT in the background. Default: false. Not recommended in serverless environments. */
47
61
  backgroundWarmup?: boolean;
62
+ /** External dictionary data to merge with the built-in language dictionary. */
63
+ extendDictionary?: DictionaryData;
48
64
  }
49
65
  /** Per-call detection options that override instance defaults. */
50
66
  interface DetectOptions {
@@ -226,20 +242,6 @@ declare function levenshteinSimilarity(a: string, b: string): number;
226
242
  */
227
243
  declare function diceSimilarity(a: string, b: string): number;
228
244
 
229
- /** Raw dictionary data structure as loaded from JSON. */
230
- interface DictionaryData {
231
- version: number;
232
- suffixes: string[];
233
- entries: Array<{
234
- root: string;
235
- variants: string[];
236
- severity: string;
237
- category: string;
238
- suffixable: boolean;
239
- }>;
240
- whitelist: string[];
241
- }
242
-
243
245
  interface LanguageConfig {
244
246
  /** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
245
247
  locale: string;