@fldx/sopan 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Feildrix Liemdra
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,130 @@
1
+ # sopan
2
+
3
+ Small TypeScript-first profanity filter for Indonesian text.
4
+
5
+ `sopan` is dependency-free at runtime and exposes a compact API for JavaScript and TypeScript projects.
6
+
7
+ ## Install
8
+
9
+ ```sh
10
+ npm install @fldx/sopan
11
+ ```
12
+
13
+ ```sh
14
+ pnpm add @fldx/sopan
15
+ ```
16
+
17
+ ```sh
18
+ yarn add @fldx/sopan
19
+ ```
20
+
21
+ ```sh
22
+ bun add @fldx/sopan
23
+ ```
24
+
25
+ ## Usage
26
+
27
+ ```ts
28
+ import { addWords, clean, containsProfanity, findProfanity } from "@fldx/sopan";
29
+
30
+ addWords(["kasar"]);
31
+
32
+ containsProfanity("dasar t41"); // true
33
+ findProfanity("A*N*J*I*N*G");
34
+ clean("dasar t41"); // "dasar ***"
35
+ clean("ka$aar!"); // "***!"
36
+ ```
37
+
38
+ CommonJS is supported too:
39
+
40
+ ```js
41
+ const { clean, containsProfanity } = require("@fldx/sopan");
42
+
43
+ containsProfanity("ta1"); // true
44
+ clean("ta1"); // "***"
45
+ ```
46
+
47
+ ## API
48
+
49
+ ### `addWords(words)`
50
+
51
+ Adds words to the shared default filter once, so future calls to `containsProfanity`, `findProfanity`, and `clean` detect them automatically.
52
+
53
+ ```ts
54
+ addWords(["kasar", "kata-baru"]);
55
+
56
+ containsProfanity("ka$aar"); // true
57
+ clean("kata-baru"); // "***"
58
+ ```
59
+
60
+ ### `clearWords()`
61
+
62
+ Removes words previously registered through `addWords`.
63
+
64
+ ```ts
65
+ clearWords();
66
+ ```
67
+
68
+ ### `containsProfanity(input, options)`
69
+
70
+ Returns `true` when the input contains a profane word.
71
+
72
+ ```ts
73
+ containsProfanity("ini santai"); // false
74
+ containsProfanity("ini tai"); // true
75
+ containsProfanity("ka$aar", { additionalWords: ["kasar"] }); // true
76
+ ```
77
+
78
+ ### `findProfanity(input, options)`
79
+
80
+ Returns match details with the configured dictionary word, raw token, normalized token, and index.
81
+
82
+ ```ts
83
+ findProfanity("halo t41");
84
+ // [{ word: "tai", raw: "t41", normalized: "tai", index: 5 }]
85
+
86
+ findProfanity("ka$aar", { additionalWords: ["kasar"] });
87
+ // [{ word: "kasar", raw: "ka$aar", normalized: "kasar", index: 0 }]
88
+ ```
89
+
90
+ ### `clean(input, options)`
91
+
92
+ Replaces profane words with `"***"` by default.
93
+
94
+ ```ts
95
+ clean("dasar t41"); // "dasar ***"
96
+ clean("dasar t41", { replacement: "[redacted]" });
97
+ clean("dasar t41", { replacement: (match) => `[${match.word}]` });
98
+ clean("ka$aar", { additionalWords: ["kasar"], replacement: "[custom]" });
99
+ ```
100
+
101
+ ### `createFilter(options)`
102
+
103
+ Creates a custom filter with your own word list.
104
+
105
+ ```ts
106
+ const filter = createFilter({
107
+ words: ["kasar", "contoh"] as const
108
+ });
109
+
110
+ filter.containsProfanity("ka$aar");
111
+ ```
112
+
113
+ ## Matching Behavior
114
+
115
+ The default filter focuses on Indonesian profanity and normalizes:
116
+
117
+ - mixed casing: `TAI`
118
+ - simple leetspeak: `t41`, `ta1`
119
+ - symbol separators: `a*n*j*i*n*g`
120
+ - repeated letters: `annjiiinggg`
121
+
122
+ Matching is token-based, so `tai` is detected while `santai` is not.
123
+
124
+ The default Indonesian word list was expanded from public Indonesian rude-word references including Wiktionary's Indonesian "kata kasar" category, a translator-maintained Indonesian profanity list, a public GitHub gist, and selected regional Indonesian references for Sundanese, Javanese, Batak, and Medan usage. It intentionally keeps only single-token words in the core package.
125
+
126
+ ## Roadmap
127
+
128
+ - Expand Indonesian dictionary coverage.
129
+ - Add language packs without increasing the default bundle size.
130
+ - Add optional phrase matching for multi-word profanity.
@@ -0,0 +1,158 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.INDONESIAN_WORDS = void 0;
4
+ exports.addWords = addWords;
5
+ exports.clearWords = clearWords;
6
+ exports.containsProfanity = containsProfanity;
7
+ exports.findProfanity = findProfanity;
8
+ exports.clean = clean;
9
+ exports.createFilter = createFilter;
10
+ const normalize_js_1 = require("./normalize.js");
11
+ const words_js_1 = require("./words.js");
12
+ Object.defineProperty(exports, "INDONESIAN_WORDS", { enumerable: true, get: function () { return words_js_1.INDONESIAN_WORDS; } });
// Text substituted for a match when the caller supplies no replacement.
const DEFAULT_REPLACEMENT = "***";
// Words registered at runtime via `addWords`, keyed by their normalized form.
const addedWords = new Map();
// Filter built once from the bundled Indonesian word list only.
const DEFAULT_FILTER = createFilter({ words: words_js_1.INDONESIAN_WORDS });
// Filter currently served by the module-level helpers; rebuilt lazily when
// `isDefaultFilterDirty` is set.
let cachedDefaultFilter = DEFAULT_FILTER;
let isDefaultFilterDirty = false;
/**
 * Registers extra words on the shared default filter behind
 * `containsProfanity`, `findProfanity`, and `clean`.
 *
 * Each word is stored under its normalized form; a word that normalizes to an
 * empty string is ignored, and a later word with the same normalized form
 * replaces the earlier one. The cached filter is flagged stale so it is
 * rebuilt the next time it is requested.
 *
 * @param words Words to register.
 */
function addWords(words) {
    for (const entry of words) {
        const key = (0, normalize_js_1.normalizeWord)(entry);
        if (key.length === 0) {
            continue;
        }
        addedWords.set(key, entry);
    }
    isDefaultFilterDirty = true;
}
/**
 * Forgets every word registered through `addWords` and points the shared
 * filter back at the built-in dictionary.
 */
function clearWords() {
    isDefaultFilterDirty = false;
    cachedDefaultFilter = DEFAULT_FILTER;
    addedWords.clear();
}
/**
 * Reports whether `input` contains at least one word known to the shared
 * default filter, optionally extended with `options.additionalWords` for this
 * call.
 *
 * @param input Text to inspect.
 * @param options Optional per-call settings.
 * @returns `true` when at least one match is found.
 */
function containsProfanity(input, options = {}) {
    const filter = getDefaultFilter(options.additionalWords);
    return filter.containsProfanity(input);
}
/**
 * Lists every match in `input`, each carrying the dictionary word, the raw
 * token, the normalized token, and the token's start index.
 *
 * @param input Text to inspect.
 * @param options Optional per-call settings (`additionalWords`).
 * @returns The matches in the order they were found.
 */
function findProfanity(input, options = {}) {
    const filter = getDefaultFilter(options.additionalWords);
    return filter.findProfanity(input);
}
/**
 * Returns `input` with every match replaced. Without `options.replacement`
 * the filter's own default (`"***"`) applies; otherwise the given string or
 * callback is forwarded to the filter.
 *
 * @param input Text to clean.
 * @param options Optional `additionalWords` and `replacement`.
 * @returns The cleaned text.
 */
function clean(input, options = {}) {
    const filter = getDefaultFilter(options.additionalWords);
    const { replacement } = options;
    return replacement === undefined
        ? filter.clean(input)
        : filter.clean(input, { replacement });
}
/**
 * Builds a standalone filter around its own copy of `options.words`.
 *
 * The returned object exposes the copied word list plus `containsProfanity`,
 * `findProfanity`, and `clean`, all of which look tokens up in a dictionary
 * created once from that list.
 *
 * @param options Object whose `words` are the terms to detect.
 * @returns The reusable filter.
 */
function createFilter(options) {
    const words = [...options.words];
    const dictionary = createDictionary(words);
    const findProfanity = (input) => findMatches(input, dictionary);
    return {
        words,
        containsProfanity: (input) => findProfanity(input).length > 0,
        findProfanity,
        clean: (input, cleanOptions = {}) => replaceMatches(input, findProfanity(input), cleanOptions.replacement)
    };
}
/**
 * Maps each word's normalized form to the word as supplied. Words that
 * normalize to an empty string are left out; when two words share a
 * normalized form the later one wins.
 */
function createDictionary(words) {
    const lookup = new Map();
    for (const candidate of words) {
        const key = (0, normalize_js_1.normalizeWord)(candidate);
        if (key.length === 0) {
            continue;
        }
        lookup.set(key, candidate);
    }
    return lookup;
}
/**
 * Picks the filter for a module-level call: the shared filter when no extra
 * words are given, otherwise a throwaway filter combining the shared words
 * with `additionalWords`.
 */
function getDefaultFilter(additionalWords) {
    const shared = getSharedDefaultFilter();
    if (additionalWords !== undefined && additionalWords.length !== 0) {
        return createFilter({ words: [...shared.words, ...additionalWords] });
    }
    return shared;
}
/**
 * Returns the cached shared filter, first rebuilding it from the built-in
 * list plus the `addWords` registrations if it has been flagged stale.
 */
function getSharedDefaultFilter() {
    if (isDefaultFilterDirty) {
        const words = [...words_js_1.INDONESIAN_WORDS, ...addedWords.values()];
        cachedDefaultFilter = createFilter({ words });
        isDefaultFilterDirty = false;
    }
    return cachedDefaultFilter;
}
/**
 * Tokenizes `input` and collects a match for every token whose normalized
 * value is a key of `dictionary`.
 */
function findMatches(input, dictionary) {
    const found = [];
    for (const token of (0, normalize_js_1.tokenize)(input)) {
        const word = dictionary.get(token.value);
        if (word === undefined) {
            continue;
        }
        found.push(toMatch(token, word));
    }
    return found;
}
/**
 * Combines a token with the dictionary word it matched into a match record
 * (`word`, `raw`, `normalized`, `index`).
 */
function toMatch(token, word) {
    const { raw, value: normalized, index } = token;
    return { word, raw, normalized, index };
}
/**
 * Rebuilds `input` with each match's span (`index` .. `index + raw.length`)
 * swapped for the replacement. `replacement` may be a string or a callback
 * that receives the match; matches are consumed in the order given.
 */
function replaceMatches(input, matches, replacement = DEFAULT_REPLACEMENT) {
    if (matches.length === 0) {
        return input;
    }
    const useCallback = typeof replacement === "function";
    let output = "";
    let cursor = 0;
    for (const match of matches) {
        // Copy the untouched text that sits before this match.
        output += input.slice(cursor, match.index);
        if (useCallback) {
            output += replacement(match);
        }
        else {
            output += replacement;
        }
        cursor = match.index + match.raw.length;
    }
    return output + input.slice(cursor);
}
@@ -0,0 +1,53 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.normalizeWord = normalizeWord;
4
+ exports.tokenize = tokenize;
/**
 * Leetspeak and symbol substitutions applied one character at a time before
 * dictionary lookup.
 */
const LEET_CHARS = {
    "0": "o",
    "1": "i",
    "!": "i",
    "|": "i",
    "3": "e",
    "4": "a",
    "@": "a",
    "5": "s",
    "$": "s",
    "7": "t",
    "+": "t",
    "8": "b"
};
/** Combining diacritical marks left over after NFD decomposition. */
const COMBINING_MARKS = /[\u0300-\u036f]/g;
/** A run of the same letter or digit; collapsed to a single character. */
const REPEATED_CHARS = /([a-z0-9])\1+/g;
/**
 * One token: word-like runs optionally joined by separator symbols, so
 * `a*n*j*i*n*g` is captured as a single token.
 */
const TOKEN_CHARS = /[a-z0-9@#$]+(?:[!|+*._~-]*[a-z0-9@#$]+)*/g;
/** Characters that survive token normalization. */
const WORD_CHAR = /[a-z0-9]/;
/**
 * Normalizes a dictionary word with the same rules used for input tokens.
 *
 * @param word Word to normalize.
 * @returns The normalized form (possibly empty).
 */
function normalizeWord(word) {
    return normalizeToken(word);
}
/**
 * Splits `input` into tokens.
 *
 * Matching runs on a normalized copy of the input (diacritics stripped,
 * lowercased), but `index` and `raw` are reported against the ORIGINAL
 * string: `raw` is the exact slice of `input` (original casing and accents)
 * and `index` is where that slice starts. Callers that splice the original
 * text by `index` and `raw.length` therefore stay aligned even when
 * normalization changes the string length (decomposed accents, Hangul, ...).
 *
 * @param input Text to tokenize.
 * @returns Tokens with `value` (normalized), `raw`, and `index`; tokens whose
 *   normalized value is empty are skipped.
 */
function tokenize(input) {
    const { text, starts, ends } = normalizeWithOffsets(input);
    const tokens = [];
    for (const match of text.matchAll(TOKEN_CHARS)) {
        const matched = match[0] ?? "";
        const value = normalizeToken(matched);
        if (value.length === 0) {
            continue;
        }
        const first = match.index ?? 0;
        const last = first + matched.length - 1;
        // Translate the normalized span back to original-string offsets.
        const index = starts[first];
        tokens.push({
            value,
            raw: input.slice(index, ends[last]),
            index
        });
    }
    return tokens;
}
/**
 * Normalizes `input` one code point at a time while recording, for every
 * UTF-16 unit of the normalized text, the start and end offsets of the
 * original code point it came from. A code point that normalizes to nothing
 * (a stripped combining mark) is folded into the span of the preceding unit
 * so it stays attached to its base character.
 */
function normalizeWithOffsets(input) {
    let text = "";
    const starts = [];
    const ends = [];
    let offset = 0;
    for (const char of input) {
        const next = offset + char.length;
        // ASCII is unaffected by NFD and mark stripping; skip the costly call.
        const piece = char.charCodeAt(0) < 0x80 ? char.toLowerCase() : baseNormalize(char);
        if (piece.length === 0) {
            if (ends.length > 0) {
                ends[ends.length - 1] = next;
            }
        }
        else {
            for (let i = 0; i < piece.length; i += 1) {
                starts.push(offset);
                ends.push(next);
            }
            text += piece;
        }
        offset = next;
    }
    return { text, starts, ends };
}
/** Decomposes to NFD, strips combining diacritical marks, and lowercases. */
function baseNormalize(input) {
    return input.normalize("NFD").replace(COMBINING_MARKS, "").toLowerCase();
}
/**
 * Reduces a token to its lookup key: base-normalize, map leetspeak symbols to
 * letters, drop anything that is not `a-z`/`0-9`, then collapse repeated
 * characters.
 */
function normalizeToken(token) {
    let normalized = "";
    for (const char of baseNormalize(token)) {
        const mapped = LEET_CHARS[char] ?? char;
        if (WORD_CHAR.test(mapped)) {
            normalized += mapped;
        }
    }
    return normalized.replace(REPEATED_CHARS, "$1");
}
@@ -0,0 +1 @@
1
+ {"type":"commonjs"}
@@ -0,0 +1,182 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.INDONESIAN_WORDS = void 0;
4
+ exports.INDONESIAN_WORDS = [
5
+ "anjay",
6
+ "anjing",
7
+ "anjink",
8
+ "anjir",
9
+ "anjrit",
10
+ "anying",
11
+ "asu",
12
+ "babi",
13
+ "babatok",
14
+ "bacot",
15
+ "bagudung",
16
+ "bagong",
17
+ "bagoy",
18
+ "bajilak",
19
+ "bajindul",
20
+ "bajing",
21
+ "bajingak",
22
+ "bajingan",
23
+ "bajingseng",
24
+ "banci",
25
+ "bangsat",
26
+ "bangkawarah",
27
+ "bebel",
28
+ "bejad",
29
+ "bego",
30
+ "begok",
31
+ "belegug",
32
+ "belekok",
33
+ "bencong",
34
+ "berak",
35
+ "beungeut",
36
+ "bispak",
37
+ "bloon",
38
+ "bodat",
39
+ "bodo",
40
+ "bodoh",
41
+ "boloho",
42
+ "bolokotondo",
43
+ "brengsek",
44
+ "budeg",
45
+ "budheg",
46
+ "bujang",
47
+ "bujanginam",
48
+ "burit",
49
+ "cangcut",
50
+ "cangkem",
51
+ "cangkeman",
52
+ "cangkeme",
53
+ "celeng",
54
+ "celsit",
55
+ "cocot",
56
+ "cocote",
57
+ "cocotmu",
58
+ "cok",
59
+ "congor",
60
+ "congore",
61
+ "congormu",
62
+ "dancuk",
63
+ "damput",
64
+ "entot",
65
+ "ewe",
66
+ "gendeng",
67
+ "gendheng",
68
+ "geblek",
69
+ "gembel",
70
+ "gila",
71
+ "goblog",
72
+ "goblok",
73
+ "gundulmu",
74
+ "heang",
75
+ "hencet",
76
+ "henceut",
77
+ "herek",
78
+ "heunceut",
79
+ "idiot",
80
+ "itil",
81
+ "jablay",
82
+ "jamban",
83
+ "jampurut",
84
+ "jamput",
85
+ "jancok",
86
+ "jancuk",
87
+ "jangkrik",
88
+ "jembut",
89
+ "kampret",
90
+ "kampang",
91
+ "kancut",
92
+ "kanjut",
93
+ "kehed",
94
+ "keparat",
95
+ "kenthir",
96
+ "kimak",
97
+ "kimbek",
98
+ "kirik",
99
+ "kontol",
100
+ "kopet",
101
+ "koplak",
102
+ "koplok",
103
+ "kunyuk",
104
+ "lonte",
105
+ "loak",
106
+ "maho",
107
+ "maling",
108
+ "mampus",
109
+ "mampos",
110
+ "matamu",
111
+ "matane",
112
+ "mbacot",
113
+ "mbahmu",
114
+ "mbathang",
115
+ "mbladhog",
116
+ "memek",
117
+ "memex",
118
+ "meki",
119
+ "micek",
120
+ "modar",
121
+ "moddar",
122
+ "monyet",
123
+ "mripatmu",
124
+ "munyuk",
125
+ "ndase",
126
+ "ndasmu",
127
+ "ndlogok",
128
+ "ngaceng",
129
+ "ngendog",
130
+ "ngentot",
131
+ "nggapleki",
132
+ "nggateli",
133
+ "nggilani",
134
+ "ngewe",
135
+ "ngocor",
136
+ "ngocok",
137
+ "nguntal",
138
+ "ngurek",
139
+ "njeplak",
140
+ "njir",
141
+ "nyocot",
142
+ "ontohod",
143
+ "palkon",
144
+ "pantat",
145
+ "pantek",
146
+ "pecun",
147
+ "peju",
148
+ "pelacur",
149
+ "peler",
150
+ "pekok",
151
+ "pentil",
152
+ "pepek",
153
+ "perek",
154
+ "pethuk",
155
+ "picek",
156
+ "puki",
157
+ "pukimak",
158
+ "raimu",
159
+ "sarap",
160
+ "sempak",
161
+ "semprul",
162
+ "sial",
163
+ "sialan",
164
+ "silit",
165
+ "sinting",
166
+ "setan",
167
+ "soblog",
168
+ "sundal",
169
+ "taek",
170
+ "taik",
171
+ "tahi",
172
+ "tai",
173
+ "telek",
174
+ "tetek",
175
+ "titit",
176
+ "tokai",
177
+ "toket",
178
+ "tolol",
179
+ "udelmu",
180
+ "wedhus",
181
+ "zakar"
182
+ ];
@@ -0,0 +1,149 @@
1
+ import { tokenize, normalizeWord } from "./normalize.js";
2
+ import { INDONESIAN_WORDS } from "./words.js";
// Text substituted for a match when the caller supplies no replacement.
const DEFAULT_REPLACEMENT = "***";
// Words registered at runtime via `addWords`, keyed by their normalized form.
const addedWords = new Map();
// Filter built once from the bundled Indonesian word list only.
const DEFAULT_FILTER = createFilter({ words: INDONESIAN_WORDS });
// Filter currently served by the module-level helpers; rebuilt lazily when
// `isDefaultFilterDirty` is set.
let cachedDefaultFilter = DEFAULT_FILTER;
let isDefaultFilterDirty = false;
/**
 * Registers extra words on the shared default filter behind
 * `containsProfanity`, `findProfanity`, and `clean`.
 *
 * Each word is stored under its normalized form; a word that normalizes to an
 * empty string is ignored, and a later word with the same normalized form
 * replaces the earlier one. The cached filter is flagged stale so it is
 * rebuilt the next time it is requested.
 *
 * @param words Words to register.
 */
export function addWords(words) {
    for (const entry of words) {
        const key = normalizeWord(entry);
        if (key.length === 0) {
            continue;
        }
        addedWords.set(key, entry);
    }
    isDefaultFilterDirty = true;
}
/**
 * Forgets every word registered through `addWords` and points the shared
 * filter back at the built-in dictionary.
 */
export function clearWords() {
    isDefaultFilterDirty = false;
    cachedDefaultFilter = DEFAULT_FILTER;
    addedWords.clear();
}
/**
 * Reports whether `input` contains at least one word known to the shared
 * default filter, optionally extended with `options.additionalWords` for this
 * call.
 *
 * @param input Text to inspect.
 * @param options Optional per-call settings.
 * @returns `true` when at least one match is found.
 */
export function containsProfanity(input, options = {}) {
    const filter = getDefaultFilter(options.additionalWords);
    return filter.containsProfanity(input);
}
/**
 * Lists every match in `input`, each carrying the dictionary word, the raw
 * token, the normalized token, and the token's start index.
 *
 * @param input Text to inspect.
 * @param options Optional per-call settings (`additionalWords`).
 * @returns The matches in the order they were found.
 */
export function findProfanity(input, options = {}) {
    const filter = getDefaultFilter(options.additionalWords);
    return filter.findProfanity(input);
}
/**
 * Returns `input` with every match replaced. Without `options.replacement`
 * the filter's own default (`"***"`) applies; otherwise the given string or
 * callback is forwarded to the filter.
 *
 * @param input Text to clean.
 * @param options Optional `additionalWords` and `replacement`.
 * @returns The cleaned text.
 */
export function clean(input, options = {}) {
    const filter = getDefaultFilter(options.additionalWords);
    const { replacement } = options;
    return replacement === undefined
        ? filter.clean(input)
        : filter.clean(input, { replacement });
}
/**
 * Builds a standalone filter around its own copy of `options.words`.
 *
 * The returned object exposes the copied word list plus `containsProfanity`,
 * `findProfanity`, and `clean`, all of which look tokens up in a dictionary
 * created once from that list.
 *
 * @param options Object whose `words` are the terms to detect.
 * @returns The reusable filter.
 */
export function createFilter(options) {
    const words = [...options.words];
    const dictionary = createDictionary(words);
    const findProfanity = (input) => findMatches(input, dictionary);
    return {
        words,
        containsProfanity: (input) => findProfanity(input).length > 0,
        findProfanity,
        clean: (input, cleanOptions = {}) => replaceMatches(input, findProfanity(input), cleanOptions.replacement)
    };
}
/**
 * Maps each word's normalized form to the word as supplied. Words that
 * normalize to an empty string are left out; when two words share a
 * normalized form the later one wins.
 */
function createDictionary(words) {
    const lookup = new Map();
    for (const candidate of words) {
        const key = normalizeWord(candidate);
        if (key.length === 0) {
            continue;
        }
        lookup.set(key, candidate);
    }
    return lookup;
}
/**
 * Picks the filter for a module-level call: the shared filter when no extra
 * words are given, otherwise a throwaway filter combining the shared words
 * with `additionalWords`.
 */
function getDefaultFilter(additionalWords) {
    const shared = getSharedDefaultFilter();
    if (additionalWords !== undefined && additionalWords.length !== 0) {
        return createFilter({ words: [...shared.words, ...additionalWords] });
    }
    return shared;
}
/**
 * Returns the cached shared filter, first rebuilding it from the built-in
 * list plus the `addWords` registrations if it has been flagged stale.
 */
function getSharedDefaultFilter() {
    if (isDefaultFilterDirty) {
        const words = [...INDONESIAN_WORDS, ...addedWords.values()];
        cachedDefaultFilter = createFilter({ words });
        isDefaultFilterDirty = false;
    }
    return cachedDefaultFilter;
}
/**
 * Tokenizes `input` and collects a match for every token whose normalized
 * value is a key of `dictionary`.
 */
function findMatches(input, dictionary) {
    const found = [];
    for (const token of tokenize(input)) {
        const word = dictionary.get(token.value);
        if (word === undefined) {
            continue;
        }
        found.push(toMatch(token, word));
    }
    return found;
}
/**
 * Combines a token with the dictionary word it matched into a match record
 * (`word`, `raw`, `normalized`, `index`).
 */
function toMatch(token, word) {
    const { raw, value: normalized, index } = token;
    return { word, raw, normalized, index };
}
/**
 * Rebuilds `input` with each match's span (`index` .. `index + raw.length`)
 * swapped for the replacement. `replacement` may be a string or a callback
 * that receives the match; matches are consumed in the order given.
 */
function replaceMatches(input, matches, replacement = DEFAULT_REPLACEMENT) {
    if (matches.length === 0) {
        return input;
    }
    const useCallback = typeof replacement === "function";
    let output = "";
    let cursor = 0;
    for (const match of matches) {
        // Copy the untouched text that sits before this match.
        output += input.slice(cursor, match.index);
        if (useCallback) {
            output += replacement(match);
        }
        else {
            output += replacement;
        }
        cursor = match.index + match.raw.length;
    }
    return output + input.slice(cursor);
}
149
+ export { INDONESIAN_WORDS };
@@ -0,0 +1,49 @@
/**
 * Leetspeak and symbol substitutions applied one character at a time before
 * dictionary lookup.
 */
const LEET_CHARS = {
    "0": "o",
    "1": "i",
    "!": "i",
    "|": "i",
    "3": "e",
    "4": "a",
    "@": "a",
    "5": "s",
    "$": "s",
    "7": "t",
    "+": "t",
    "8": "b"
};
/** Combining diacritical marks left over after NFD decomposition. */
const COMBINING_MARKS = /[\u0300-\u036f]/g;
/** A run of the same letter or digit; collapsed to a single character. */
const REPEATED_CHARS = /([a-z0-9])\1+/g;
/**
 * One token: word-like runs optionally joined by separator symbols, so
 * `a*n*j*i*n*g` is captured as a single token.
 */
const TOKEN_CHARS = /[a-z0-9@#$]+(?:[!|+*._~-]*[a-z0-9@#$]+)*/g;
/** Characters that survive token normalization. */
const WORD_CHAR = /[a-z0-9]/;
/**
 * Normalizes a dictionary word with the same rules used for input tokens.
 *
 * @param word Word to normalize.
 * @returns The normalized form (possibly empty).
 */
export function normalizeWord(word) {
    return normalizeToken(word);
}
/**
 * Splits `input` into tokens.
 *
 * Matching runs on a normalized copy of the input (diacritics stripped,
 * lowercased), but `index` and `raw` are reported against the ORIGINAL
 * string: `raw` is the exact slice of `input` (original casing and accents)
 * and `index` is where that slice starts. Callers that splice the original
 * text by `index` and `raw.length` therefore stay aligned even when
 * normalization changes the string length (decomposed accents, Hangul, ...).
 *
 * @param input Text to tokenize.
 * @returns Tokens with `value` (normalized), `raw`, and `index`; tokens whose
 *   normalized value is empty are skipped.
 */
export function tokenize(input) {
    const { text, starts, ends } = normalizeWithOffsets(input);
    const tokens = [];
    for (const match of text.matchAll(TOKEN_CHARS)) {
        const matched = match[0] ?? "";
        const value = normalizeToken(matched);
        if (value.length === 0) {
            continue;
        }
        const first = match.index ?? 0;
        const last = first + matched.length - 1;
        // Translate the normalized span back to original-string offsets.
        const index = starts[first];
        tokens.push({
            value,
            raw: input.slice(index, ends[last]),
            index
        });
    }
    return tokens;
}
/**
 * Normalizes `input` one code point at a time while recording, for every
 * UTF-16 unit of the normalized text, the start and end offsets of the
 * original code point it came from. A code point that normalizes to nothing
 * (a stripped combining mark) is folded into the span of the preceding unit
 * so it stays attached to its base character.
 */
function normalizeWithOffsets(input) {
    let text = "";
    const starts = [];
    const ends = [];
    let offset = 0;
    for (const char of input) {
        const next = offset + char.length;
        // ASCII is unaffected by NFD and mark stripping; skip the costly call.
        const piece = char.charCodeAt(0) < 0x80 ? char.toLowerCase() : baseNormalize(char);
        if (piece.length === 0) {
            if (ends.length > 0) {
                ends[ends.length - 1] = next;
            }
        }
        else {
            for (let i = 0; i < piece.length; i += 1) {
                starts.push(offset);
                ends.push(next);
            }
            text += piece;
        }
        offset = next;
    }
    return { text, starts, ends };
}
/** Decomposes to NFD, strips combining diacritical marks, and lowercases. */
function baseNormalize(input) {
    return input.normalize("NFD").replace(COMBINING_MARKS, "").toLowerCase();
}
/**
 * Reduces a token to its lookup key: base-normalize, map leetspeak symbols to
 * letters, drop anything that is not `a-z`/`0-9`, then collapse repeated
 * characters.
 */
function normalizeToken(token) {
    let normalized = "";
    for (const char of baseNormalize(token)) {
        const mapped = LEET_CHARS[char] ?? char;
        if (WORD_CHAR.test(mapped)) {
            normalized += mapped;
        }
    }
    return normalized.replace(REPEATED_CHARS, "$1");
}
@@ -0,0 +1,179 @@
1
+ export const INDONESIAN_WORDS = [
2
+ "anjay",
3
+ "anjing",
4
+ "anjink",
5
+ "anjir",
6
+ "anjrit",
7
+ "anying",
8
+ "asu",
9
+ "babi",
10
+ "babatok",
11
+ "bacot",
12
+ "bagudung",
13
+ "bagong",
14
+ "bagoy",
15
+ "bajilak",
16
+ "bajindul",
17
+ "bajing",
18
+ "bajingak",
19
+ "bajingan",
20
+ "bajingseng",
21
+ "banci",
22
+ "bangsat",
23
+ "bangkawarah",
24
+ "bebel",
25
+ "bejad",
26
+ "bego",
27
+ "begok",
28
+ "belegug",
29
+ "belekok",
30
+ "bencong",
31
+ "berak",
32
+ "beungeut",
33
+ "bispak",
34
+ "bloon",
35
+ "bodat",
36
+ "bodo",
37
+ "bodoh",
38
+ "boloho",
39
+ "bolokotondo",
40
+ "brengsek",
41
+ "budeg",
42
+ "budheg",
43
+ "bujang",
44
+ "bujanginam",
45
+ "burit",
46
+ "cangcut",
47
+ "cangkem",
48
+ "cangkeman",
49
+ "cangkeme",
50
+ "celeng",
51
+ "celsit",
52
+ "cocot",
53
+ "cocote",
54
+ "cocotmu",
55
+ "cok",
56
+ "congor",
57
+ "congore",
58
+ "congormu",
59
+ "dancuk",
60
+ "damput",
61
+ "entot",
62
+ "ewe",
63
+ "gendeng",
64
+ "gendheng",
65
+ "geblek",
66
+ "gembel",
67
+ "gila",
68
+ "goblog",
69
+ "goblok",
70
+ "gundulmu",
71
+ "heang",
72
+ "hencet",
73
+ "henceut",
74
+ "herek",
75
+ "heunceut",
76
+ "idiot",
77
+ "itil",
78
+ "jablay",
79
+ "jamban",
80
+ "jampurut",
81
+ "jamput",
82
+ "jancok",
83
+ "jancuk",
84
+ "jangkrik",
85
+ "jembut",
86
+ "kampret",
87
+ "kampang",
88
+ "kancut",
89
+ "kanjut",
90
+ "kehed",
91
+ "keparat",
92
+ "kenthir",
93
+ "kimak",
94
+ "kimbek",
95
+ "kirik",
96
+ "kontol",
97
+ "kopet",
98
+ "koplak",
99
+ "koplok",
100
+ "kunyuk",
101
+ "lonte",
102
+ "loak",
103
+ "maho",
104
+ "maling",
105
+ "mampus",
106
+ "mampos",
107
+ "matamu",
108
+ "matane",
109
+ "mbacot",
110
+ "mbahmu",
111
+ "mbathang",
112
+ "mbladhog",
113
+ "memek",
114
+ "memex",
115
+ "meki",
116
+ "micek",
117
+ "modar",
118
+ "moddar",
119
+ "monyet",
120
+ "mripatmu",
121
+ "munyuk",
122
+ "ndase",
123
+ "ndasmu",
124
+ "ndlogok",
125
+ "ngaceng",
126
+ "ngendog",
127
+ "ngentot",
128
+ "nggapleki",
129
+ "nggateli",
130
+ "nggilani",
131
+ "ngewe",
132
+ "ngocor",
133
+ "ngocok",
134
+ "nguntal",
135
+ "ngurek",
136
+ "njeplak",
137
+ "njir",
138
+ "nyocot",
139
+ "ontohod",
140
+ "palkon",
141
+ "pantat",
142
+ "pantek",
143
+ "pecun",
144
+ "peju",
145
+ "pelacur",
146
+ "peler",
147
+ "pekok",
148
+ "pentil",
149
+ "pepek",
150
+ "perek",
151
+ "pethuk",
152
+ "picek",
153
+ "puki",
154
+ "pukimak",
155
+ "raimu",
156
+ "sarap",
157
+ "sempak",
158
+ "semprul",
159
+ "sial",
160
+ "sialan",
161
+ "silit",
162
+ "sinting",
163
+ "setan",
164
+ "soblog",
165
+ "sundal",
166
+ "taek",
167
+ "taik",
168
+ "tahi",
169
+ "tai",
170
+ "telek",
171
+ "tetek",
172
+ "titit",
173
+ "tokai",
174
+ "toket",
175
+ "tolol",
176
+ "udelmu",
177
+ "wedhus",
178
+ "zakar"
179
+ ];
@@ -0,0 +1,93 @@
1
+ import { INDONESIAN_WORDS, type IndonesianWord } from "./words.js";
2
+ export type SupportedLanguage = "id";
3
+ /**
4
+ * A detected profanity token and its location in the original input.
5
+ */
6
+ export interface ProfanityMatch<Word extends string = string> {
7
+ /** The dictionary word that matched after normalization. */
8
+ readonly word: Word;
9
+ /** The original token from the input, before normalization. */
10
+ readonly raw: string;
11
+ /** The normalized token used for dictionary lookup. */
12
+ readonly normalized: string;
13
+ /** Zero-based character index where the raw token starts. */
14
+ readonly index: number;
15
+ }
16
+ /**
17
+ * Options shared by read-only detection helpers.
18
+ */
19
+ export interface ProfanityOptions<AdditionalWord extends string = never> {
20
+ /** Extra words to merge with the default dictionary for this call only. */
21
+ readonly additionalWords?: readonly AdditionalWord[];
22
+ }
23
+ /**
24
+ * Options for replacing detected profanity.
25
+ */
26
+ export interface CleanOptions<AdditionalWord extends string = never> extends ProfanityOptions<AdditionalWord> {
27
+ /** Replacement string or callback. Defaults to `"***"`. */
28
+ readonly replacement?: string | ((match: ProfanityMatch<DefaultWord<AdditionalWord>>) => string);
29
+ }
30
+ /**
31
+ * Options for creating an isolated reusable profanity filter.
32
+ */
33
+ export interface FilterOptions<Word extends string = IndonesianWord> {
34
+ /** Words that should be detected by this filter. */
35
+ readonly words: readonly Word[];
36
+ readonly replacement?: string | ((match: ProfanityMatch<Word>) => string);
37
+ }
38
+ /**
39
+ * A reusable profanity filter with its own dictionary.
40
+ */
41
+ export interface ProfanityFilter<Word extends string = IndonesianWord> {
42
+ /** The words configured for this filter. */
43
+ readonly words: readonly Word[];
44
+ /** Returns whether the input contains at least one configured word. */
45
+ containsProfanity(input: string): boolean;
46
+ /** Returns all configured words found in the input. */
47
+ findProfanity(input: string): readonly ProfanityMatch<Word>[];
48
+ /** Returns input with configured words replaced. */
49
+ clean(input: string, options?: Pick<FilterOptions<Word>, "replacement">): string;
50
+ }
51
+ type DefaultWord<AdditionalWord extends string = never> = IndonesianWord | string | AdditionalWord;
52
+ /**
53
+ * Add words to the shared default filter used by `containsProfanity`,
54
+ * `findProfanity`, and `clean`.
55
+ *
56
+ * Use this during application startup when your project has product-specific
57
+ * words that should be detected everywhere. Words are normalized and deduped
58
+ * with the same rules as the built-in Indonesian dictionary.
59
+ */
60
+ export declare function addWords(words: readonly string[]): void;
61
+ /**
62
+ * Remove all words previously registered through `addWords`.
63
+ *
64
+ * This is useful for tests, worker reuse, or apps that need to rebuild their
65
+ * moderation policy at runtime.
66
+ */
67
+ export declare function clearWords(): void;
68
+ /**
69
+ * Returns `true` when input contains a word from the default Indonesian
70
+ * dictionary, shared words registered with `addWords`, or per-call
71
+ * `additionalWords`.
72
+ */
73
+ export declare function containsProfanity<AdditionalWord extends string = never>(input: string, options?: ProfanityOptions<AdditionalWord>): boolean;
74
+ /**
75
+ * Finds profanity matches in input and returns match details, including the raw
76
+ * token, normalized token, matched dictionary word, and start index.
77
+ */
78
+ export declare function findProfanity<AdditionalWord extends string = never>(input: string, options?: ProfanityOptions<AdditionalWord>): readonly ProfanityMatch<DefaultWord<AdditionalWord>>[];
79
+ /**
80
+ * Replaces profanity in input. By default matches are replaced with `"***"`,
81
+ * but callers can provide a replacement string or callback.
82
+ */
83
+ export declare function clean<AdditionalWord extends string = never>(input: string, options?: CleanOptions<AdditionalWord>): string;
84
+ /**
85
+ * Creates an isolated reusable filter with its own word list.
86
+ *
87
+ * Prefer this when you need multiple independent dictionaries or literal
88
+ * TypeScript word types. Use `addWords` when you want to extend the package's
89
+ * shared default filter once for the whole app.
90
+ */
91
+ export declare function createFilter<Word extends string>(options: Pick<FilterOptions<Word>, "words">): ProfanityFilter<Word>;
92
+ export { INDONESIAN_WORDS };
93
+ export type { IndonesianWord };
@@ -0,0 +1,7 @@
1
+ export interface NormalizedToken {
2
+ readonly value: string;
3
+ readonly raw: string;
4
+ readonly index: number;
5
+ }
6
+ export declare function normalizeWord(word: string): string;
7
+ export declare function tokenize(input: string): readonly NormalizedToken[];
@@ -0,0 +1,2 @@
1
+ export declare const INDONESIAN_WORDS: readonly ["anjay", "anjing", "anjink", "anjir", "anjrit", "anying", "asu", "babi", "babatok", "bacot", "bagudung", "bagong", "bagoy", "bajilak", "bajindul", "bajing", "bajingak", "bajingan", "bajingseng", "banci", "bangsat", "bangkawarah", "bebel", "bejad", "bego", "begok", "belegug", "belekok", "bencong", "berak", "beungeut", "bispak", "bloon", "bodat", "bodo", "bodoh", "boloho", "bolokotondo", "brengsek", "budeg", "budheg", "bujang", "bujanginam", "burit", "cangcut", "cangkem", "cangkeman", "cangkeme", "celeng", "celsit", "cocot", "cocote", "cocotmu", "cok", "congor", "congore", "congormu", "dancuk", "damput", "entot", "ewe", "gendeng", "gendheng", "geblek", "gembel", "gila", "goblog", "goblok", "gundulmu", "heang", "hencet", "henceut", "herek", "heunceut", "idiot", "itil", "jablay", "jamban", "jampurut", "jamput", "jancok", "jancuk", "jangkrik", "jembut", "kampret", "kampang", "kancut", "kanjut", "kehed", "keparat", "kenthir", "kimak", "kimbek", "kirik", "kontol", "kopet", "koplak", "koplok", "kunyuk", "lonte", "loak", "maho", "maling", "mampus", "mampos", "matamu", "matane", "mbacot", "mbahmu", "mbathang", "mbladhog", "memek", "memex", "meki", "micek", "modar", "moddar", "monyet", "mripatmu", "munyuk", "ndase", "ndasmu", "ndlogok", "ngaceng", "ngendog", "ngentot", "nggapleki", "nggateli", "nggilani", "ngewe", "ngocor", "ngocok", "nguntal", "ngurek", "njeplak", "njir", "nyocot", "ontohod", "palkon", "pantat", "pantek", "pecun", "peju", "pelacur", "peler", "pekok", "pentil", "pepek", "perek", "pethuk", "picek", "puki", "pukimak", "raimu", "sarap", "sempak", "semprul", "sial", "sialan", "silit", "sinting", "setan", "soblog", "sundal", "taek", "taik", "tahi", "tai", "telek", "tetek", "titit", "tokai", "toket", "tolol", "udelmu", "wedhus", "zakar"];
2
+ export type IndonesianWord = (typeof INDONESIAN_WORDS)[number];
package/package.json ADDED
@@ -0,0 +1,64 @@
1
+ {
2
+ "name": "@fldx/sopan",
3
+ "version": "1.0.0",
4
+ "description": "Small TypeScript-first profanity filter for Indonesian text.",
5
+ "homepage": "https://github.com/feildrixliemdra/sopan",
6
+ "license": "MIT",
7
+ "author": "fldx",
8
+ "type": "module",
9
+ "sideEffects": false,
10
+ "files": [
11
+ "dist",
12
+ "README.md",
13
+ "LICENSE"
14
+ ],
15
+ "repository": {
16
+ "type": "git",
17
+ "url": "git+ssh://git@github.com:feildrixliemdra/sopan.git"
18
+ },
19
+ "bugs": {
20
+ "url": "https://github.com/feildrixliemdra/sopan/issues"
21
+ },
22
+ "main": "./dist/cjs/index.js",
23
+ "module": "./dist/esm/index.js",
24
+ "types": "./dist/types/index.d.ts",
25
+ "exports": {
26
+ ".": {
27
+ "types": "./dist/types/index.d.ts",
28
+ "import": "./dist/esm/index.js",
29
+ "require": "./dist/cjs/index.js"
30
+ }
31
+ },
32
+ "scripts": {
33
+ "build": "npm run clean && npm run build:esm && npm run build:cjs && npm run build:types && node scripts/rename-cjs.mjs",
34
+ "build:esm": "tsc -p tsconfig.esm.json",
35
+ "build:cjs": "tsc -p tsconfig.cjs.json",
36
+ "build:types": "tsc -p tsconfig.types.json",
37
+ "clean": "node scripts/clean.mjs",
38
+ "test": "npm run build && node --test test/*.test.js",
39
+ "typecheck": "tsc -p tsconfig.json --noEmit",
40
+ "lint": "eslint .",
41
+ "lint:fix": "eslint . --fix",
42
+ "prepublishOnly": "npm run lint && npm test"
43
+ },
44
+ "keywords": [
45
+ "profanity",
46
+ "filter",
47
+ "indonesian",
48
+ "words",
49
+ "censor",
50
+ "swearing",
51
+ "badwords",
52
+ "sensor"
53
+ ],
54
+ "devDependencies": {
55
+ "@eslint/js": "^10.0.1",
56
+ "eslint": "^10.4.1",
57
+ "globals": "^15.14.0",
58
+ "typescript": "^5.4.0",
59
+ "typescript-eslint": "^8.61.0"
60
+ },
61
+ "engines": {
62
+ "node": ">=18"
63
+ }
64
+ }