terlik.js 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,278 @@
1
+ /** Profanity severity level. */
2
+ type Severity = "high" | "medium" | "low";
3
+ /** Detection mode controlling the balance between precision and recall. */
4
+ type Mode = "strict" | "balanced" | "loose";
5
+ /** Masking style used when cleaning text. */
6
+ type MaskStyle = "stars" | "partial" | "replace";
7
+ /** Fuzzy matching algorithm. */
8
+ type FuzzyAlgorithm = "levenshtein" | "dice";
9
+ /** How a match was detected. */
10
+ type MatchMethod = "exact" | "pattern" | "fuzzy";
11
+ /** A single entry in the profanity dictionary. */
12
+ interface WordEntry {
13
+ /** The canonical root form of the word. */
14
+ root: string;
15
+ /** Alternative spellings or forms of the root. */
16
+ variants: string[];
17
+ /** Severity level of the word. */
18
+ severity: Severity;
19
+ /** Content category (e.g. "sexual", "insult", "slur", "general"). */
20
+ category?: string;
21
+ /** Whether the suffix engine should match grammatical suffixes on this root. */
22
+ suffixable?: boolean;
23
+ }
24
+ /** Configuration options for creating a Terlik instance. */
25
+ interface TerlikOptions {
26
+ /** Language code (default: `"tr"`). */
27
+ language?: string;
28
+ /** Detection mode (default: `"balanced"`). */
29
+ mode?: Mode;
30
+ /** Masking style (default: `"stars"`). */
31
+ maskStyle?: MaskStyle;
32
+ /** Additional words to detect. */
33
+ customList?: string[];
34
+ /** Additional words to exclude from detection. */
35
+ whitelist?: string[];
36
+ /** Enable fuzzy matching (default: `false`). */
37
+ enableFuzzy?: boolean;
38
+ /** Fuzzy similarity threshold between 0 and 1 (default: `0.8`). */
39
+ fuzzyThreshold?: number;
40
+ /** Fuzzy matching algorithm (default: `"levenshtein"`). */
41
+ fuzzyAlgorithm?: FuzzyAlgorithm;
42
+ /** Maximum input length before truncation (default: `10000`). */
43
+ maxLength?: number;
44
+ /** Custom mask text for "replace" mask style (default: `"[***]"`). */
45
+ replaceMask?: string;
46
+ /** Compile regexes and perform JIT warmup in the background (default: `false`). Not recommended in serverless environments. */
47
+ backgroundWarmup?: boolean;
48
+ }
49
+ /** Per-call detection options that override instance defaults. */
50
+ interface DetectOptions {
51
+ mode?: Mode;
52
+ enableFuzzy?: boolean;
53
+ fuzzyThreshold?: number;
54
+ fuzzyAlgorithm?: FuzzyAlgorithm;
55
+ }
56
+ /** Per-call clean options that override instance defaults. */
57
+ interface CleanOptions extends DetectOptions {
58
+ maskStyle?: MaskStyle;
59
+ replaceMask?: string;
60
+ }
61
+ /** A single profanity match found in the input text. */
62
+ interface MatchResult {
63
+ /** The matched text from the original input. */
64
+ word: string;
65
+ /** The dictionary root word. */
66
+ root: string;
67
+ /** Character index in the original input. */
68
+ index: number;
69
+ /** Severity of the matched word. */
70
+ severity: Severity;
71
+ /** How the match was detected. */
72
+ method: MatchMethod;
73
+ }
74
+
75
+ /**
76
+ * Multi-language profanity detection and filtering engine.
77
+ *
78
+ * @example
79
+ * ```ts
80
+ * const terlik = new Terlik();
81
+ * terlik.containsProfanity("siktir"); // true
82
+ * terlik.clean("siktir git"); // "****** git"
83
+ * ```
84
+ */
85
+ declare class Terlik {
86
+ private dictionary;
87
+ private detector;
88
+ private mode;
89
+ private maskStyle;
90
+ private enableFuzzy;
91
+ private fuzzyThreshold;
92
+ private fuzzyAlgorithm;
93
+ private maxLength;
94
+ private replaceMask;
95
+ /** The language code this instance was created with. */
96
+ readonly language: string;
97
+ /**
98
+ * Creates a new Terlik instance.
99
+ * @param options - Configuration options.
100
+ * @throws {Error} If the specified language is not supported.
101
+ */
102
+ constructor(options?: TerlikOptions);
103
+ /**
104
+ * Creates and JIT-warms instances for multiple languages at once.
105
+ * Useful for server deployments to eliminate cold-start latency.
106
+ *
107
+ * @param languages - Language codes to warm up (e.g. `["tr", "en"]`).
108
+ * @param baseOptions - Shared options applied to all instances.
109
+ * @returns A map of language code to warmed-up Terlik instance.
110
+ *
111
+ * @example
112
+ * ```ts
113
+ * const cache = Terlik.warmup(["tr", "en", "es"]);
114
+ * cache.get("en")!.containsProfanity("fuck"); // true, no cold start
115
+ * ```
116
+ */
117
+ static warmup(languages: string[], baseOptions?: Omit<TerlikOptions, "language">): Map<string, Terlik>;
118
+ /**
119
+ * Checks whether the text contains profanity.
120
+ * @param text - The text to check.
121
+ * @param options - Per-call detection options (overrides instance defaults).
122
+ * @returns `true` if profanity is detected, `false` otherwise.
123
+ */
124
+ containsProfanity(text: string, options?: DetectOptions): boolean;
125
+ /**
126
+ * Returns all profanity matches with details (word, root, index, severity, method).
127
+ * @param text - The text to analyze.
128
+ * @param options - Per-call detection options (overrides instance defaults).
129
+ * @returns Array of match results, sorted by index.
130
+ */
131
+ getMatches(text: string, options?: DetectOptions): MatchResult[];
132
+ /**
133
+ * Returns the text with detected profanity masked.
134
+ * @param text - The text to clean.
135
+ * @param options - Per-call clean options (overrides instance defaults).
136
+ * @returns The cleaned text with profanity replaced by mask characters.
137
+ */
138
+ clean(text: string, options?: CleanOptions): string;
139
+ /**
140
+ * Adds custom words to the detection dictionary at runtime.
141
+ * Triggers pattern recompilation.
142
+ * @param words - Words to add.
143
+ */
144
+ addWords(words: string[]): void;
145
+ /**
146
+ * Removes words from the detection dictionary at runtime.
147
+ * Triggers pattern recompilation.
148
+ * @param words - Words to remove.
149
+ */
150
+ removeWords(words: string[]): void;
151
+ /**
152
+ * Returns the compiled regex patterns keyed by root word.
153
+ * Useful for debugging or advanced usage.
154
+ * @returns Map of root word to compiled RegExp.
155
+ */
156
+ getPatterns(): Map<string, RegExp>;
157
+ private mergeDetectOptions;
158
+ }
159
+
160
+ /** Configuration for creating a language-specific normalizer. */
161
+ interface NormalizerConfig {
162
+ locale: string;
163
+ charMap: Record<string, string>;
164
+ leetMap: Record<string, string>;
165
+ numberExpansions?: [string, string][];
166
+ }
167
+ /**
168
+ * Creates a language-specific normalize function using the given config.
169
+ * The returned function applies a 6-stage pipeline: lowercase, char folding,
170
+ * number expansion, leet decode, punctuation removal, repeat collapse.
171
+ *
172
+ * @param config - Language-specific normalization settings.
173
+ * @returns A normalize function for the configured language.
174
+ *
175
+ * @example
176
+ * ```ts
177
+ * const normalize = createNormalizer({
178
+ * locale: "de",
179
+ * charMap: { ä: "a", ö: "o", ü: "u", ß: "ss" },
180
+ * leetMap: { "0": "o", "3": "e" },
181
+ * });
182
+ * normalize("Scheiße"); // "scheisse"
183
+ * ```
184
+ */
185
+ declare function createNormalizer(config: NormalizerConfig): (text: string) => string;
186
+ /**
187
+ * Normalizes text using the default Turkish locale pipeline.
188
+ * Shorthand for `createNormalizer()` with Turkish defaults.
189
+ *
190
+ * @param text - The text to normalize.
191
+ * @returns The normalized text.
192
+ *
193
+ * @example
194
+ * ```ts
195
+ * normalize("S.İ.K.T.İ.R"); // "siktir"
196
+ * normalize("$1kt1r"); // "siktir"
197
+ * ```
198
+ */
199
+ declare function normalize(text: string): string;
200
+
201
+ /**
202
+ * Computes the Levenshtein edit distance between two strings.
203
+ * Uses O(n) space optimization with two-row approach.
204
+ *
205
+ * @param a - First string.
206
+ * @param b - Second string.
207
+ * @returns The minimum number of single-character edits (insertions, deletions, substitutions).
208
+ */
209
+ declare function levenshteinDistance(a: string, b: string): number;
210
+ /**
211
+ * Computes the Levenshtein similarity ratio between two strings.
212
+ * Returns a value between 0 (completely different) and 1 (identical).
213
+ *
214
+ * @param a - First string.
215
+ * @param b - Second string.
216
+ * @returns Similarity ratio (0–1).
217
+ */
218
+ declare function levenshteinSimilarity(a: string, b: string): number;
219
+ /**
220
+ * Computes the Dice coefficient (bigram similarity) between two strings.
221
+ * Returns a value between 0 (no shared bigrams) and 1 (identical bigrams).
222
+ *
223
+ * @param a - First string (must be at least 2 characters for meaningful result).
224
+ * @param b - Second string (must be at least 2 characters for meaningful result).
225
+ * @returns Dice coefficient (0–1).
226
+ */
227
+ declare function diceSimilarity(a: string, b: string): number;
228
+
229
+ /** Raw dictionary data structure as loaded from JSON. */
230
+ interface DictionaryData {
231
+ version: number;
232
+ suffixes: string[];
233
+ entries: Array<{
234
+ root: string;
235
+ variants: string[];
236
+ severity: string;
237
+ category: string;
238
+ suffixable: boolean;
239
+ }>;
240
+ whitelist: string[];
241
+ }
242
+
243
+ interface LanguageConfig {
244
+ /** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
245
+ locale: string;
246
+ /** Diacritics normalization: language-specific characters to base Latin.
247
+ * e.g. Turkish: ç→c, ğ→g, ı→i; German: ä→a, ö→o, ü→u, ß→ss */
248
+ charMap: Record<string, string>;
249
+ /** Leet speak substitution map.
250
+ * e.g. "0"→"o", "1"→"i", "@"→"a", "$"→"s" */
251
+ leetMap: Record<string, string>;
252
+ /** Visual similarity regex character classes for the pattern engine.
253
+ * Each key is a base letter, value is a regex character class string.
254
+ * e.g. a: "[a4@àáâãäå]", s: "[s5$şŞß]" */
255
+ charClasses: Record<string, string>;
256
+ /** Optional number-to-word expansions applied between letters.
257
+ * e.g. Turkish: [["2", "iki"], ["10", "on"]]
258
+ * Most languages leave this undefined. */
259
+ numberExpansions?: [string, string][];
260
+ /** Validated dictionary data (entries, whitelist, suffixes, version). */
261
+ dictionary: DictionaryData;
262
+ }
263
+
264
+ /**
265
+ * Retrieves the configuration for a supported language.
266
+ *
267
+ * @param lang - Language code (e.g. "tr", "en", "es", "de").
268
+ * @returns The language configuration including dictionary, charMap, and leetMap.
269
+ * @throws {Error} If the language is not supported or the dictionary version is too old.
270
+ */
271
+ declare function getLanguageConfig(lang: string): LanguageConfig;
272
+ /**
273
+ * Returns all available language codes.
274
+ * @returns Array of supported language codes (e.g. `["tr", "en", "es", "de"]`).
275
+ */
276
+ declare function getSupportedLanguages(): string[];
277
+
278
+ export { type CleanOptions, type DetectOptions, type FuzzyAlgorithm, type LanguageConfig, type MaskStyle, type MatchMethod, type MatchResult, type Mode, type NormalizerConfig, type Severity, Terlik, type TerlikOptions, type WordEntry, createNormalizer, diceSimilarity, getLanguageConfig, getSupportedLanguages, levenshteinDistance, levenshteinSimilarity, normalize };
@@ -0,0 +1,278 @@
1
+ /** Profanity severity level. */
2
+ type Severity = "high" | "medium" | "low";
3
+ /** Detection mode controlling the balance between precision and recall. */
4
+ type Mode = "strict" | "balanced" | "loose";
5
+ /** Masking style used when cleaning text. */
6
+ type MaskStyle = "stars" | "partial" | "replace";
7
+ /** Fuzzy matching algorithm. */
8
+ type FuzzyAlgorithm = "levenshtein" | "dice";
9
+ /** How a match was detected. */
10
+ type MatchMethod = "exact" | "pattern" | "fuzzy";
11
+ /** A single entry in the profanity dictionary. */
12
+ interface WordEntry {
13
+ /** The canonical root form of the word. */
14
+ root: string;
15
+ /** Alternative spellings or forms of the root. */
16
+ variants: string[];
17
+ /** Severity level of the word. */
18
+ severity: Severity;
19
+ /** Content category (e.g. "sexual", "insult", "slur", "general"). */
20
+ category?: string;
21
+ /** Whether the suffix engine should match grammatical suffixes on this root. */
22
+ suffixable?: boolean;
23
+ }
24
+ /** Configuration options for creating a Terlik instance. */
25
+ interface TerlikOptions {
26
+ /** Language code (default: `"tr"`). */
27
+ language?: string;
28
+ /** Detection mode (default: `"balanced"`). */
29
+ mode?: Mode;
30
+ /** Masking style (default: `"stars"`). */
31
+ maskStyle?: MaskStyle;
32
+ /** Additional words to detect. */
33
+ customList?: string[];
34
+ /** Additional words to exclude from detection. */
35
+ whitelist?: string[];
36
+ /** Enable fuzzy matching (default: `false`). */
37
+ enableFuzzy?: boolean;
38
+ /** Fuzzy similarity threshold between 0 and 1 (default: `0.8`). */
39
+ fuzzyThreshold?: number;
40
+ /** Fuzzy matching algorithm (default: `"levenshtein"`). */
41
+ fuzzyAlgorithm?: FuzzyAlgorithm;
42
+ /** Maximum input length before truncation (default: `10000`). */
43
+ maxLength?: number;
44
+ /** Custom mask text for "replace" mask style (default: `"[***]"`). */
45
+ replaceMask?: string;
46
+ /** Compile regexes and perform JIT warmup in the background (default: `false`). Not recommended in serverless environments. */
47
+ backgroundWarmup?: boolean;
48
+ }
49
+ /** Per-call detection options that override instance defaults. */
50
+ interface DetectOptions {
51
+ mode?: Mode;
52
+ enableFuzzy?: boolean;
53
+ fuzzyThreshold?: number;
54
+ fuzzyAlgorithm?: FuzzyAlgorithm;
55
+ }
56
+ /** Per-call clean options that override instance defaults. */
57
+ interface CleanOptions extends DetectOptions {
58
+ maskStyle?: MaskStyle;
59
+ replaceMask?: string;
60
+ }
61
+ /** A single profanity match found in the input text. */
62
+ interface MatchResult {
63
+ /** The matched text from the original input. */
64
+ word: string;
65
+ /** The dictionary root word. */
66
+ root: string;
67
+ /** Character index in the original input. */
68
+ index: number;
69
+ /** Severity of the matched word. */
70
+ severity: Severity;
71
+ /** How the match was detected. */
72
+ method: MatchMethod;
73
+ }
74
+
75
+ /**
76
+ * Multi-language profanity detection and filtering engine.
77
+ *
78
+ * @example
79
+ * ```ts
80
+ * const terlik = new Terlik();
81
+ * terlik.containsProfanity("siktir"); // true
82
+ * terlik.clean("siktir git"); // "****** git"
83
+ * ```
84
+ */
85
+ declare class Terlik {
86
+ private dictionary;
87
+ private detector;
88
+ private mode;
89
+ private maskStyle;
90
+ private enableFuzzy;
91
+ private fuzzyThreshold;
92
+ private fuzzyAlgorithm;
93
+ private maxLength;
94
+ private replaceMask;
95
+ /** The language code this instance was created with. */
96
+ readonly language: string;
97
+ /**
98
+ * Creates a new Terlik instance.
99
+ * @param options - Configuration options.
100
+ * @throws {Error} If the specified language is not supported.
101
+ */
102
+ constructor(options?: TerlikOptions);
103
+ /**
104
+ * Creates and JIT-warms instances for multiple languages at once.
105
+ * Useful for server deployments to eliminate cold-start latency.
106
+ *
107
+ * @param languages - Language codes to warm up (e.g. `["tr", "en"]`).
108
+ * @param baseOptions - Shared options applied to all instances.
109
+ * @returns A map of language code to warmed-up Terlik instance.
110
+ *
111
+ * @example
112
+ * ```ts
113
+ * const cache = Terlik.warmup(["tr", "en", "es"]);
114
+ * cache.get("en")!.containsProfanity("fuck"); // true, no cold start
115
+ * ```
116
+ */
117
+ static warmup(languages: string[], baseOptions?: Omit<TerlikOptions, "language">): Map<string, Terlik>;
118
+ /**
119
+ * Checks whether the text contains profanity.
120
+ * @param text - The text to check.
121
+ * @param options - Per-call detection options (overrides instance defaults).
122
+ * @returns `true` if profanity is detected, `false` otherwise.
123
+ */
124
+ containsProfanity(text: string, options?: DetectOptions): boolean;
125
+ /**
126
+ * Returns all profanity matches with details (word, root, index, severity, method).
127
+ * @param text - The text to analyze.
128
+ * @param options - Per-call detection options (overrides instance defaults).
129
+ * @returns Array of match results, sorted by index.
130
+ */
131
+ getMatches(text: string, options?: DetectOptions): MatchResult[];
132
+ /**
133
+ * Returns the text with detected profanity masked.
134
+ * @param text - The text to clean.
135
+ * @param options - Per-call clean options (overrides instance defaults).
136
+ * @returns The cleaned text with profanity replaced by mask characters.
137
+ */
138
+ clean(text: string, options?: CleanOptions): string;
139
+ /**
140
+ * Adds custom words to the detection dictionary at runtime.
141
+ * Triggers pattern recompilation.
142
+ * @param words - Words to add.
143
+ */
144
+ addWords(words: string[]): void;
145
+ /**
146
+ * Removes words from the detection dictionary at runtime.
147
+ * Triggers pattern recompilation.
148
+ * @param words - Words to remove.
149
+ */
150
+ removeWords(words: string[]): void;
151
+ /**
152
+ * Returns the compiled regex patterns keyed by root word.
153
+ * Useful for debugging or advanced usage.
154
+ * @returns Map of root word to compiled RegExp.
155
+ */
156
+ getPatterns(): Map<string, RegExp>;
157
+ private mergeDetectOptions;
158
+ }
159
+
160
+ /** Configuration for creating a language-specific normalizer. */
161
+ interface NormalizerConfig {
162
+ locale: string;
163
+ charMap: Record<string, string>;
164
+ leetMap: Record<string, string>;
165
+ numberExpansions?: [string, string][];
166
+ }
167
+ /**
168
+ * Creates a language-specific normalize function using the given config.
169
+ * The returned function applies a 6-stage pipeline: lowercase, char folding,
170
+ * number expansion, leet decode, punctuation removal, repeat collapse.
171
+ *
172
+ * @param config - Language-specific normalization settings.
173
+ * @returns A normalize function for the configured language.
174
+ *
175
+ * @example
176
+ * ```ts
177
+ * const normalize = createNormalizer({
178
+ * locale: "de",
179
+ * charMap: { ä: "a", ö: "o", ü: "u", ß: "ss" },
180
+ * leetMap: { "0": "o", "3": "e" },
181
+ * });
182
+ * normalize("Scheiße"); // "scheisse"
183
+ * ```
184
+ */
185
+ declare function createNormalizer(config: NormalizerConfig): (text: string) => string;
186
+ /**
187
+ * Normalizes text using the default Turkish locale pipeline.
188
+ * Shorthand for `createNormalizer()` with Turkish defaults.
189
+ *
190
+ * @param text - The text to normalize.
191
+ * @returns The normalized text.
192
+ *
193
+ * @example
194
+ * ```ts
195
+ * normalize("S.İ.K.T.İ.R"); // "siktir"
196
+ * normalize("$1kt1r"); // "siktir"
197
+ * ```
198
+ */
199
+ declare function normalize(text: string): string;
200
+
201
+ /**
202
+ * Computes the Levenshtein edit distance between two strings.
203
+ * Uses O(n) space optimization with two-row approach.
204
+ *
205
+ * @param a - First string.
206
+ * @param b - Second string.
207
+ * @returns The minimum number of single-character edits (insertions, deletions, substitutions).
208
+ */
209
+ declare function levenshteinDistance(a: string, b: string): number;
210
+ /**
211
+ * Computes the Levenshtein similarity ratio between two strings.
212
+ * Returns a value between 0 (completely different) and 1 (identical).
213
+ *
214
+ * @param a - First string.
215
+ * @param b - Second string.
216
+ * @returns Similarity ratio (0–1).
217
+ */
218
+ declare function levenshteinSimilarity(a: string, b: string): number;
219
+ /**
220
+ * Computes the Dice coefficient (bigram similarity) between two strings.
221
+ * Returns a value between 0 (no shared bigrams) and 1 (identical bigrams).
222
+ *
223
+ * @param a - First string (must be at least 2 characters for meaningful result).
224
+ * @param b - Second string (must be at least 2 characters for meaningful result).
225
+ * @returns Dice coefficient (0–1).
226
+ */
227
+ declare function diceSimilarity(a: string, b: string): number;
228
+
229
+ /** Raw dictionary data structure as loaded from JSON. */
230
+ interface DictionaryData {
231
+ version: number;
232
+ suffixes: string[];
233
+ entries: Array<{
234
+ root: string;
235
+ variants: string[];
236
+ severity: string;
237
+ category: string;
238
+ suffixable: boolean;
239
+ }>;
240
+ whitelist: string[];
241
+ }
242
+
243
+ interface LanguageConfig {
244
+ /** BCP-47 locale tag for toLocaleLowerCase (e.g. "tr", "en", "es", "de") */
245
+ locale: string;
246
+ /** Diacritics normalization: language-specific characters to base Latin.
247
+ * e.g. Turkish: ç→c, ğ→g, ı→i; German: ä→a, ö→o, ü→u, ß→ss */
248
+ charMap: Record<string, string>;
249
+ /** Leet speak substitution map.
250
+ * e.g. "0"→"o", "1"→"i", "@"→"a", "$"→"s" */
251
+ leetMap: Record<string, string>;
252
+ /** Visual similarity regex character classes for the pattern engine.
253
+ * Each key is a base letter, value is a regex character class string.
254
+ * e.g. a: "[a4@àáâãäå]", s: "[s5$şŞß]" */
255
+ charClasses: Record<string, string>;
256
+ /** Optional number-to-word expansions applied between letters.
257
+ * e.g. Turkish: [["2", "iki"], ["10", "on"]]
258
+ * Most languages leave this undefined. */
259
+ numberExpansions?: [string, string][];
260
+ /** Validated dictionary data (entries, whitelist, suffixes, version). */
261
+ dictionary: DictionaryData;
262
+ }
263
+
264
+ /**
265
+ * Retrieves the configuration for a supported language.
266
+ *
267
+ * @param lang - Language code (e.g. "tr", "en", "es", "de").
268
+ * @returns The language configuration including dictionary, charMap, and leetMap.
269
+ * @throws {Error} If the language is not supported or the dictionary version is too old.
270
+ */
271
+ declare function getLanguageConfig(lang: string): LanguageConfig;
272
+ /**
273
+ * Returns all available language codes.
274
+ * @returns Array of supported language codes (e.g. `["tr", "en", "es", "de"]`).
275
+ */
276
+ declare function getSupportedLanguages(): string[];
277
+
278
+ export { type CleanOptions, type DetectOptions, type FuzzyAlgorithm, type LanguageConfig, type MaskStyle, type MatchMethod, type MatchResult, type Mode, type NormalizerConfig, type Severity, Terlik, type TerlikOptions, type WordEntry, createNormalizer, diceSimilarity, getLanguageConfig, getSupportedLanguages, levenshteinDistance, levenshteinSimilarity, normalize };