bekindprofanityfilter 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/CONTRIBUTORS.md +106 -0
  2. package/LICENSE +22 -0
  3. package/README.md +1015 -0
  4. package/allprofanity.config.example.json +35 -0
  5. package/bin/init.js +49 -0
  6. package/config.schema.json +163 -0
  7. package/dist/algos/aho-corasick.d.ts +75 -0
  8. package/dist/algos/aho-corasick.js +238 -0
  9. package/dist/algos/aho-corasick.js.map +1 -0
  10. package/dist/algos/bloom-filter.d.ts +103 -0
  11. package/dist/algos/bloom-filter.js +208 -0
  12. package/dist/algos/bloom-filter.js.map +1 -0
  13. package/dist/algos/context-patterns.d.ts +102 -0
  14. package/dist/algos/context-patterns.js +484 -0
  15. package/dist/algos/context-patterns.js.map +1 -0
  16. package/dist/index.d.ts +1332 -0
  17. package/dist/index.js +2631 -0
  18. package/dist/index.js.map +1 -0
  19. package/dist/innocence-scoring.d.ts +23 -0
  20. package/dist/innocence-scoring.js +118 -0
  21. package/dist/innocence-scoring.js.map +1 -0
  22. package/dist/language-detector.d.ts +162 -0
  23. package/dist/language-detector.js +952 -0
  24. package/dist/language-detector.js.map +1 -0
  25. package/dist/language-dicts.d.ts +60 -0
  26. package/dist/language-dicts.js +2718 -0
  27. package/dist/language-dicts.js.map +1 -0
  28. package/dist/languages/arabic-words.d.ts +10 -0
  29. package/dist/languages/arabic-words.js +1649 -0
  30. package/dist/languages/arabic-words.js.map +1 -0
  31. package/dist/languages/bengali-words.d.ts +10 -0
  32. package/dist/languages/bengali-words.js +1696 -0
  33. package/dist/languages/bengali-words.js.map +1 -0
  34. package/dist/languages/brazilian-words.d.ts +10 -0
  35. package/dist/languages/brazilian-words.js +2122 -0
  36. package/dist/languages/brazilian-words.js.map +1 -0
  37. package/dist/languages/chinese-words.d.ts +10 -0
  38. package/dist/languages/chinese-words.js +2728 -0
  39. package/dist/languages/chinese-words.js.map +1 -0
  40. package/dist/languages/english-primary-all-languages.d.ts +23 -0
  41. package/dist/languages/english-primary-all-languages.js +36894 -0
  42. package/dist/languages/english-primary-all-languages.js.map +1 -0
  43. package/dist/languages/english-words.d.ts +5 -0
  44. package/dist/languages/english-words.js +5156 -0
  45. package/dist/languages/english-words.js.map +1 -0
  46. package/dist/languages/french-words.d.ts +10 -0
  47. package/dist/languages/french-words.js +2326 -0
  48. package/dist/languages/french-words.js.map +1 -0
  49. package/dist/languages/german-words.d.ts +10 -0
  50. package/dist/languages/german-words.js +2633 -0
  51. package/dist/languages/german-words.js.map +1 -0
  52. package/dist/languages/hindi-words.d.ts +10 -0
  53. package/dist/languages/hindi-words.js +2341 -0
  54. package/dist/languages/hindi-words.js.map +1 -0
  55. package/dist/languages/innocent-words.d.ts +41 -0
  56. package/dist/languages/innocent-words.js +109 -0
  57. package/dist/languages/innocent-words.js.map +1 -0
  58. package/dist/languages/italian-words.d.ts +10 -0
  59. package/dist/languages/italian-words.js +2287 -0
  60. package/dist/languages/italian-words.js.map +1 -0
  61. package/dist/languages/japanese-words.d.ts +11 -0
  62. package/dist/languages/japanese-words.js +2557 -0
  63. package/dist/languages/japanese-words.js.map +1 -0
  64. package/dist/languages/korean-words.d.ts +10 -0
  65. package/dist/languages/korean-words.js +2509 -0
  66. package/dist/languages/korean-words.js.map +1 -0
  67. package/dist/languages/russian-words.d.ts +10 -0
  68. package/dist/languages/russian-words.js +2175 -0
  69. package/dist/languages/russian-words.js.map +1 -0
  70. package/dist/languages/spanish-words.d.ts +11 -0
  71. package/dist/languages/spanish-words.js +2536 -0
  72. package/dist/languages/spanish-words.js.map +1 -0
  73. package/dist/languages/tamil-words.d.ts +10 -0
  74. package/dist/languages/tamil-words.js +1722 -0
  75. package/dist/languages/tamil-words.js.map +1 -0
  76. package/dist/languages/telugu-words.d.ts +10 -0
  77. package/dist/languages/telugu-words.js +1739 -0
  78. package/dist/languages/telugu-words.js.map +1 -0
  79. package/dist/romanization-detector.d.ts +50 -0
  80. package/dist/romanization-detector.js +779 -0
  81. package/dist/romanization-detector.js.map +1 -0
  82. package/package.json +79 -0
@@ -0,0 +1,779 @@
1
+ /**
2
+ * Romanization Detector — identifies Latin-script transliterations of
3
+ * non-Latin-script languages (romanized Hindi, Pinyin, Romaji, etc.).
4
+ *
5
+ * Uses three layers:
6
+ * 1. ELD n-gram language detection + cluster analysis
7
+ * 2. Per-language romanization n-gram fingerprinting (trigrams + quadgrams)
8
+ * 3. Heuristic signals (scatter, ELD confidence, reliability)
9
+ */
10
+ // @ts-ignore — eld ships as JS with .d.ts but no proper ESM types
11
+ import { eld } from "eld/small";
12
+ // ---------------------------------------------------------------------------
13
+ // Language family mapping
14
+ // ---------------------------------------------------------------------------
15
/**
 * Language family → list of language codes belonging to that family.
 * Used to judge whether the detector's top candidates agree on a family
 * or are scattered across unrelated ones (see analyzeCluster).
 */
export const LANGUAGE_FAMILIES = {
    romance: ["fr", "ca", "it", "pt", "es", "ro", "gl", "oc"],
    germanic: ["de", "nl", "da", "no", "sv", "af", "is", "lb", "en"],
    slavic: ["ru", "uk", "bg", "pl", "cs", "sk", "hr", "sr", "sl", "bs", "mk"],
    uralic: ["fi", "hu", "et"],
    turkic: ["tr", "az", "uz", "kk", "ky", "tk"],
    celtic: ["cy", "ga", "gd", "br"],
    baltic: ["lt", "lv"],
    semitic: ["ar", "he", "mt"],
    indic: ["hi", "bn", "mr", "gu", "pa", "ne", "si", "ur"],
    dravidian: ["ta", "te", "kn", "ml"],
    sinitic: ["zh"],
    japonic: ["ja"],
    koreanic: ["ko"],
    austronesian: ["tl", "ms", "id", "mg", "ceb", "jv", "su"],
    tai: ["th", "lo"],
};
/**
 * Reverse index: language code → family name.
 *
 * Built on a null-prototype object so that lookups with arbitrary keys
 * (e.g. "constructor", "toString") return undefined instead of an inherited
 * Object.prototype member.
 */
const LANG_TO_FAMILY = Object.create(null);
for (const [family, langs] of Object.entries(LANGUAGE_FAMILIES)) {
    for (const lang of langs) {
        LANG_TO_FAMILY[lang] = family;
    }
}
37
/**
 * Summarises how the five highest-scoring languages distribute over
 * language families.
 *
 * @param {Object<string, number>} scores - language code → score.
 * @returns {{uniqueFamilies: number, dominantFamilyCount: number, isCoherent: boolean, isScattered: boolean}}
 *   uniqueFamilies: number of distinct families among the top five;
 *   dominantFamilyCount: size of the largest family group (0 for empty input);
 *   isCoherent: the largest group has at least 3 members;
 *   isScattered: at least 4 distinct families are present.
 *   Languages without a family mapping are all grouped under "unknown".
 */
export function analyzeCluster(scores) {
    // Only own entries of the lookup table count; inherited members such as
    // "constructor" must not be mistaken for a family name.
    const familyOf = (lang) => {
        const known = Object.prototype.hasOwnProperty.call(LANG_TO_FAMILY, lang)
            ? LANG_TO_FAMILY[lang]
            : undefined;
        return known || "unknown";
    };
    const topFive = Object.entries(scores)
        .sort(([, a], [, b]) => b - a)
        .slice(0, 5);
    const familyCounts = Object.create(null);
    for (const [lang] of topFive) {
        const family = familyOf(lang);
        familyCounts[family] = (familyCounts[family] || 0) + 1;
    }
    const uniqueFamilies = Object.keys(familyCounts).length;
    // The leading 0 keeps Math.max from returning -Infinity on empty input.
    const dominantFamilyCount = Math.max(0, ...Object.values(familyCounts));
    return {
        uniqueFamilies,
        dominantFamilyCount,
        isCoherent: dominantFamilyCount >= 3,
        isScattered: uniqueFamilies >= 4,
    };
}
52
+ // ---------------------------------------------------------------------------
53
+ // Per-language romanization n-gram sets
54
+ // ---------------------------------------------------------------------------
55
+ // weakParticles: space-padded 2-letter words that are common across languages
56
+ // — scored at 0.5 instead of 1.0/1.5 to avoid false positives
57
/**
 * Per-family romanization fingerprints.
 *
 * Each family provides three sets:
 *   - trigrams:      exactly 3-character fragments matched inside words.
 *   - quadgrams:     exactly 4-character fragments. Entries that contain
 *                    spaces are space-padded short words (" ke ", " hai")
 *                    and are matched against padded particles only.
 *   - weakParticles: space-padded 2-letter words shared with other
 *                    languages; they earn a reduced score.
 *
 * Invariants maintained in this table:
 *   - every trigram has length 3 and every quadgram length 4 (the matcher
 *     only ever produces n-grams of those sizes, so any other length is
 *     dead data and has been dropped);
 *   - a particle is never listed in both `quadgrams` and `weakParticles`
 *     of the same family, otherwise it would be scored as strong AND weak.
 */
export const ROMANIZATION_NGRAMS = {
    // Hindi/Urdu/Bengali/Nepali — aspirated clusters, retroflex, nasal combos
    indic: {
        trigrams: new Set([
            "bha", "bhe", "bhi", "bho", "bhu",
            "dha", "dhe", "dhi", "dho", "dhu",
            "gha", "ghe", "ghi", "gho", "ghu",
            "kha", "khe", "khi", "kho", "khu",
            "pha", "phe", "phi", "pho", "phu",
            "chh", "cch",
            "aal", "aam", "aan", "aap", "aar", "aas", "aat",
            "jhe", "jha", "jhi",
            "muj", "tuj", "yeh", "hai", "haa", "nah", "kya",
            "waa", "jaa", "bah", "saa", "tta",
            "ekh", "dek", "akh",
            "eek", "aaj", "aur",
            "ush", "shk", "hki",
            // Bengali-specific
            "cho", "ach", "oth", "pni", "mar",
            "kem", "oba", "bad",
            "she", "chi", "hec",
            // Telugu-specific
            "aru", "nna", "agu", "elu",
            "ava", "ulu",
            "ast", "sto",
            // Nepali-specific
            "hun", "hha",
            // Gujarati-specific
            "avn", "amy", "gam",
        ]),
        quadgrams: new Set([
            "bhai", "bhar", "bhut", "bhaa", "bhen",
            "dhar", "dhak", "dhoo", "dhan",
            "ghar", "ghoo", "ghum",
            "khan", "khaa", "khub", "khat", "khar",
            "thee", "thod", "thek",
            "chho", "chha", "chhe",
            "haal", "hain", "hame", "hama",
            "kaam", "jaan", "yaar", "raha", "rahe",
            "mush", "shki", "achh", "achc", "bahu",
            "chod", "saal", "kutt", "kami", "gaan", "baad", "baat", "naam", "mujh", "tujh",
            // Space-padded 2-letter particles (padded "xx" → " xx " = 4 chars = quadgram).
            // " na " is deliberately absent: it is a weak particle (see below).
            " ke ", " ka ", " ki ", " ko ", " se ", " pe ", " ho ",
            " ab ", " jo ", " ye ", " wo ", " tu ",
            // Space-padded 3-letter particles (padded "xxx" → " xxx " → quadgrams " xxx" and "xxx ")
            " sab", " yeh", " veh", " voh", " hai", " hum", " tum", " aur",
            " mat", " kab", " koi",
            "pahr", "parh",
            "sund", "unda", "ndar",
            "padh", "dhai",
            "insh", "nsha", "shal",
            "baar", "aari",
            "sadk",
            "bana", "anan",
            "seek", "eekh",
            "zaro", "aroo",
            // Bengali quadgrams
            "bhal", "halo",
            "koth", "otha", "thay",
            "jacc", "acch", "cche", "chen",
            "apni", "toma", "omar",
            "dhon", "honn", "nnob", "noba",
            "kemo", "emon",
            "eshe", "shec", "hech",
            "boud", "oudi",
            "dada",
            "gelo",
            "bair", "aire",
            "ghur", "hure",
            "lage", "agch", "gche",
            // Telugu quadgrams
            "nnar", "naru",
            "baag", "aagu", "gunn", "unna",
            "hany", "anya", "nyav", "yava",
            "vast", "asta", "star",
            "velt", "eltu", "ltun", "tunn",
            "sant", "anto", "ntos", "tosh",
            "eppu", "ppud",
            // Nepali quadgrams
            "nama", "amas", "mast",
            "hunu", "unuh", "nuhu", "uhun",
            "nyab", "yaba", "abaa",
            // Gujarati quadgrams
            "malv", "alva", "lvan", "vanu",
            "gamy",
        ]),
        // Weak particles: common across languages, scored at 0.5
        weakParticles: new Set([" na ", " to "]),
    },
    // Mandarin Pinyin
    pinyin: {
        trigrams: new Set([
            "zho", "zha", "zhe", "zhi", "zhu",
            "qia", "qie", "qin", "qiu",
            "xia", "xie", "xin", "xiu", "xue",
            "iao", "iou", "uai", "uei",
            "guo", "duo", "huo", "suo", "zuo",
            "jin", "jia", "jie", "jiu",
            "ngg", "ngr", "ngy",
            "gao", "hao", "bao", "dao", "lao",
        ]),
        quadgrams: new Set([
            "zhon", "hong", "nggu", "gguo",
            "zhen", "zhao", "zhua", "zhan",
            "qian", "qing", "qixi",
            "xian", "xing", "xiao", "xiex",
            "jint", "inti", "ntia",
            "tian", "huan", "yuan",
            "gong", "dong", "peng",
            "ming", "ting", "bing", "ling",
            "gaox", "aoxi", "oxin",
            "bang", "duos", "uosh", "shao",
            "guan", "dian", "nian",
            "ngyo",
            "zaij", "aiji", "ijia",
            "piao", "iaol", "aoli", "olia",
            "xihu",
            "feng", "engj", "ngji", "gjin",
            "difa", "ifan", "fang",
            "chif",
            "wans", "ansh", "nsha", "shan",
            "shen", "henm",
        ]),
        // Weak particles: de (的), le (了), bu (不) — common but shared with other languages
        weakParticles: new Set([" de ", " le ", " bu "]),
    },
    // Japanese Romaji — polite, casual, literary, and slang forms
    romaji: {
        trigrams: new Set([
            // Polite verb endings
            "asu", "esu", "osu",
            "mas", "des",
            // Unique consonant clusters
            "tsu",
            "sho", "shi", "chi",
            // Common fragments
            "ata", "ima",
            // Gemination (doubled consonants)
            "tte", "tta", "kke", "ssa", "kka",
            // Compound consonants
            "nky", "nbe", "nbu", "nde", "nda",
            // Long vowel patterns (-ou, -uu, -ei common in romaji)
            "kou", "mou", "tou", "dou", "sou", "rou",
            "iku", "oku", "uku", "aku", "eku",
            // Polite/keigo fragments
            "sai", "goz", "aim",
            // Common literary/poetic — only distinctive combos
            "omo", "iro",
            "yum", "ume",
            "osh", "yor",
            "ait", "tai",
            "hik",
            "sak",
            "kai", "sei",
            // Common particles as trigram context
            "wat", "nih",
            "mpo", "amp",
            // Profanity/slang fragments
            "kus", "uso", "aho",
            "kut", "tab",
            "tem", "mee",
        ]),
        quadgrams: new Set([
            // Space-padded 2-letter particles (avoid "no", "de" — English; "wa", "ni" — Swahili)
            " ga ", " wo ", " mo ", " ne ", " yo ",
            // Space-padded 3-letter particles
            " eki", " ima", " ano", " ore", " iku", " aru", " nai", " mae",
            // Polite forms
            "masu", "desu", "desh", "imas",
            "shim", "mash", "ashi",
            "kuda", "udas", "dasa",
            "goza", "ozai", "zaim",
            // Common words
            "wata", "atas",
            "niho", "ihon",
            "benk", "enky", "nkyo",
            "ganb", "anba", "nbat", "batt",
            "tomo", "omod", "moda", "odac",
            "omoi", "moir", "oiro",
            "tote", "otem",
            "sumi", "umim", "mima",
            "tano", "anos", "nosh",
            // Poetic/literary
            "yume", "naka", "anat", "nata",
            "hosh", "oshi",
            "kaga", "agay", "gaya", "ayak", "yaku", "yaki",
            "shit", "hite",
            "yoru", "saku", "akur", "kura",
            "kire", "irei",
            "hana", "haru", "kami",
            "tsuk", "suki",
            // Casual/informal
            "naru", "shir", "iren", "rena",
            "owar", "wari",
            "haji", "ajim", "jime",
            "yaro", "arou",
            // Profanity/slang
            "kuso", "utab", "taba", "abar", "bare",
            "ahou", "hond", "onda", "ndar",
            "kisa", "isam", "sama",
            "teme", "emee",
            "bokk", "okke",
        ]),
        // Weak particles: shared with Swahili/English, scored at 0.5
        // Excluded: " wa ", " ni " — too common in Swahili
        weakParticles: new Set([" no ", " de ", " ka ", " to "]),
    },
    // Korean Romanization — formal, casual, and slang forms
    korean: {
        trigrams: new Set([
            // Polite endings
            "eyo", "ayo",
            // Particles
            "sey", "eun", "eul",
            // Verb stems
            "hag", "seo", "geo",
            "ham", "gam", "kam",
            // -nida formal ending
            "nid", "ida",
            // Tense markers
            "eos", "iss", "sse",
            // annyeong fragments
            "nye", "ngh", "ngs",
            // Common vowel combos
            "yeo", "ung", "eon",
            // Compound vowels
            "hae", "hoe", "hwa",
            // Common words
            "jal", "joh", "gbu",
            "lkk", "eok", "eog",
            "ase", "oyo",
            "sip", "ipe",
            "nge", "ngb",
            // Casual/informal — only distinctive patterns
            "gat",
            "gae", "rae", "jae",
            "dul", "iga",
            "ssi",
            "bap", "meo",
            "gac",
            "jeo",
            "peo",
            "oeg",
            "alk",
            // Particles
            "gwa",
        ]),
        quadgrams: new Set([
            // Space-padded 3-letter particles
            " eun", " eul", " jal", " nae", " uri", " geu", " jeo",
            // Polite forms
            "hase", "seyo",
            "sseo",
            "amsa", "msah",
            "amni", "mnid", "nida",
            // Common words
            "hwai", "wait", "aiti",
            "haeb", "aebo", "bose",
            "meok",
            "ipeo",
            "alss", "lssi",
            // Casual/conversational
            "neul",
            "achi", "gach",
            "aeba", "ebak", "oppa",
            "joey", "oeyo",
            "nals", "ssig",
            "gayo",
            "ingu",
            "haru",
            "dosi", "osir",
            "nyeo", "yeog",
            "jaem", "aemi", "emii",
            "bang", "ngap",
            "mann", "anna", "nnas", "nase",
        ]),
        // Weak particles: do (도), na (나), je (제) — common but shared with other languages
        weakParticles: new Set([" do ", " na ", " je "]),
    },
    // Arabic romanization — MSA and dialectal (Levantine, Egyptian, Gulf)
    arabic: {
        trigrams: new Set([
            // Core patterns
            "hab", "abi", "bib",
            "all", "lla", "lah",
            "shk", "shu", "shm",
            "akh", "ukh", "khr",
            "eef", "eek", "eel",
            "ahm", "hmd",
            "bik", "bil",
            "jaz", "aze", "zee",
            "yaw", "awm",
            "ana", "ant", "nta",
            "ahl", "hla", "sah",
            "kha", "las", "yal",
            "ukr", "kra",
            // Dialectal fragments
            "bid", "idd",
            "ray", "aye", "yeh",
            "ked", "eer",
            "kti", "tir",
            "hel", "elw",
            "shi", "way",
            "mni", "nit",
            "nti", "tik",
            "khi",
            "zal", "ala", "lam",
            "bah", "ahe",
            "sad", "ade", "dee",
            "umm", // ummi
        ]),
        quadgrams: new Set([
            // Space-padded 2-letter particles (only distinctive ones — avoid "an", "la", "ma" which are common English)
            " ya ", " wa ", " fi ", " bi ",
            // Space-padded 3-letter particles
            " ana", " min", " lil", " ila", " ala", " law", " shu",
            // Core
            "habi", "abib", "bibi",
            "alla", "llah",
            "insh", "nsha", "shal",
            "mash", "asha",
            "shuk", "hukr", "ukra", "kran",
            "jaze", "azee", "zeel",
            "keef", "eefa", "efak",
            "ahla", "hlan",
            "sahl",
            "khal", "hala",
            "yall",
            "wall",
            "alha", "lham", "hamd",
            // Dialectal
            "bidd",
            "raye", "ayeh",
            "helw",
            "shwa", "hway",
            "akhi",
            "baha", "aheb", "heba",
            "sade", "adee", "deeq",
            "zala", "alam", "lame",
            "tihk",
            "mish", // mish fahim
        ]),
        // Weak particles: la (لا), ma (ما), an (أن) — common but shared with English/Spanish
        weakParticles: new Set([" la ", " ma ", " an "]),
    },
    // Russian/Slavic transliteration — formal, casual, and slang
    russian: {
        trigrams: new Set([
            // Standard transliteration clusters
            "kho", "khr", "kha",
            "zho", "zhe", "zhi",
            "shc", "hch",
            "tsy", "tsa",
            "vst", "dra",
            "pri", "iye",
            "vsy",
            "ych", "yat", "oya",
            "oho", "ros", "osh",
            "seg", "ego", "odn",
            "its",
            "poy", "oyt", "yti",
            "ozh",
            "gul", "uly",
            "khl", "hle", "leb",
            "mol", "olo", "lok",
            // Slang/profanity fragments
            "bly", "lya",
            "piz", "izd", "zde",
            "nah", "ahu",
            "mud", "uda", "dak",
            "deb", "ebi",
            "dol", "olb", "lbo",
            "suk", "uka",
            "tup", "upo",
            // Casual/informal
            "chy", "zna", "poz",
            "kru", "tol",
            "dav", "ava",
            "poe", "oek", "ekh",
        ]),
        quadgrams: new Set([
            // Space-padded 3-letter particles
            " kak", " eto", " ves", " mne", " vot", " uzh", " tam",
            // Standard transliteration
            "khor", "horo", "oros", "rosh",
            "zhno", "mozh",
            "priv", "rive", "ivet",
            "sego", "egod", "godn", "odny",
            "vsyo",
            "nrav", "ravi", "avit", "vits",
            "poyt", "oyti",
            "guly", "ulya", "lyat",
            "hleb",
            "molo", "olok", "loko",
            "spas", "pasi", "asib", "sibo",
            "bols", "olsh", "lsho", "shoy",
            "pomo", "omos", "mosh", "oshc",
            "priy", "riya", "iyat",
            "inte", "nter", "tere", "eres",
            "zdra", "drav",
            "drug", "ruzy", "uzya", "zyam",
            "posh", "shli",
            "poto", "otom", "tomu",
            "chto",
            "vzya", "zyal", "yali",
            "sobo", "oboi",
            "igra", "gral",
            "vrem", "remy", "emya",
            "prov", "rove", "ovel",
            "vech", "eche", "cher", "hero",
            "domo", "omoi",
            "usta", "stav", "tavs",
            "scha", "chas", "hast", "astl",
            "zame", "amec", "mech", "chat",
            "piko", "ikni",
            "futb", "utbo",
            // Slang/profanity
            "blya",
            "pizd", "izde", "zdet",
            "nahu",
            "muda", "udak",
            "debi",
            "dolb", "olbo", "lboy", "boyo",
            "suka",
            "tupo", "upoy",
            // Casual patterns
            "dava", "avai",
            "poka",
            "tolk", "olko",
        ]),
        // Weak particles: ya (я), ne (не), on (он), no (но), vy (вы) — excluded "da" (too common: Hausa, English)
        weakParticles: new Set([" ya ", " ne ", " no ", " vy ", " on "]),
    },
    // Thai romanization
    thai: {
        trigrams: new Set([
            "kra", "rap", "kha",
            "sab", "aba", "bai",
            "saw", "awa", "wad", "ade",
            "kho", "hob", "obu",
            "khu", "hun",
            "pho", "hom",
            "rai", "ara", "nee",
            "pai", "nai",
            "dee", "mai", "cha",
            "anu", "nuk", "san",
        ]),
        quadgrams: new Set([
            "krap", "sawa", "awad", "wade", "adee",
            "saba", "abai",
            "khob", "khun", "phom",
            "arai",
            "sanu", "anuk",
        ]),
        // Weak particles: na (นะ), di (ดี), ja (จ๊ะ), ka (ค่ะ/ครับ shorthand)
        weakParticles: new Set([" na ", " di ", " ja ", " ka "]),
    },
    // Tamil romanization
    tamil: {
        trigrams: new Set([
            "kka", "kki",
            "nga", "ngi",
            "ppa", "adi",
            "iru", "ruk", "ukk",
            "nak", "akk",
            "ndr", "dri",
            "enn", "nna",
            "inn", "nni", "nik",
            "ela", "aik",
            "ree", "eng",
        ]),
        quadgrams: new Set([
            "vana", "anak", "nakk", "akka", "kkam",
            "iruk", "rukk", "ukki", "kkin", "king",
            "eppa", "ppad", "padi",
            "nand", "andr", "ndri",
            "inni", "nnik", "nikk", "ikku",
            "vela", "elai", "laik", "aikk",
            "panr", "anre", "nree",
        ]),
        // Weak particles: la (லா), na (னா) — excluded "da" (too common cross-language)
        weakParticles: new Set([" la ", " na "]),
    },
    // Persian/Farsi romanization
    persian: {
        trigrams: new Set([
            "sal", "ala", "lam",
            "che", "het", "eto", "tor",
            "ale", "let",
            "khe", "hei", "eil",
            "khu", "hub", "ube",
            "doo", "oos", "ost",
            "iru", "run",
            "rim",
            "gha", "haz", "aza",
            "bok", "okh",
            "emr", "mru", "ruz",
        ]),
        quadgrams: new Set([
            "sala", "alam",
            "chet", "heto", "etor", "tori",
            "khei", "heil", "eili",
            "khub", "hube",
            "doos", "oost",
            "biru", "irun",
            "beri", "erim",
            "ghaz", "haza",
            "bokh", "okho", "khor",
            "emru", "mruz",
            "ikha",
            "laze", "azem",
        ]),
        // Weak particles: ra (را), az (از), be (به), ke (که)
        weakParticles: new Set([" ra ", " az ", " be ", " ke "]),
    },
    // Vietnamese without diacritics
    vietnamese: {
        trigrams: new Set([
            "uoc", "uoi", "uon", "uot",
            "ngu", "ngh", "nho", "nha",
            "tro", "anh", "inh",
            "hom", "nay", "dep", "qua",
            "vui", "duc", "gap", "ban",
            "rat", "toi", "hay", "roi",
            "xin", "cam", "gia", "dia",
            "chu", "dun", "hoc",
            "lam", "mot", "cho",
            "phu",
            "uye", "yen",
        ]),
        quadgrams: new Set([
            "nguo", "guoi",
            "nghi", "truo", "ruon", "uong",
            "duoc",
            "chun", "hung",
            "tron", "rong",
            "khon", "hong",
            "nhun",
            "hoan", "oanh",
        ]),
        // Weak particles: la (là), va (và), co (có)
        weakParticles: new Set([" la ", " va ", " co "]),
    },
};
607
/**
 * Scores `text` against every family in ROMANIZATION_NGRAMS.
 *
 * Scoring per family:
 *   - word trigram found in the family's trigram set:    +1
 *   - word quadgram found in the family's quadgram set:  +1.5
 *   - padded short word (<= 3 letters) found in the quadgram set:
 *       " xx " (2-letter word, two spaces)  +1.5
 *       " xxx" / "xxx " (3-letter word)     +0.25
 *   - padded short word listed as a weak particle:       +0.5 (and nothing
 *     else — a weak particle never also earns the strong bonus)
 * The hit rate is hits / (word trigrams + word quadgrams), capped at 1.0;
 * particle matches are a bonus and do not enlarge the denominator.
 *
 * @param {string} text - input text; only ASCII letters a-z (after
 *   lower-casing) contribute to n-grams.
 * @returns {{bestFamily: string, bestHitRate: number, perFamilyRates: Object<string, number>, totalTrigrams: number, decayedScore: number, triggered: boolean}}
 *   `totalTrigrams` is always the number of word trigrams extracted.
 *   `triggered` is true when the best hit rate reaches 0.18. Texts yielding
 *   fewer than 10 word n-grams return an empty, untriggered result.
 */
export function trigramFingerprint(text) {
    const textTrigrams = [];
    const textQuadgrams = [];
    const particleQuadgrams = [];
    for (const word of text.toLowerCase().split(/\s+/)) {
        const clean = word.replace(/[^a-z]/g, "");
        if (clean.length === 0) {
            continue;
        }
        // Per-word n-grams (words of 3+ letters) — these form the denominator.
        if (clean.length >= 3) {
            for (let i = 0; i + 3 <= clean.length; i++) {
                textTrigrams.push(clean.slice(i, i + 3));
            }
            for (let i = 0; i + 4 <= clean.length; i++) {
                textQuadgrams.push(clean.slice(i, i + 4));
            }
        }
        // Short words (<= 3 letters) are additionally space-padded so they can
        // match particle entries: "ke" → " ke "; "hai" → " hai", "hai ".
        if (clean.length <= 3) {
            const padded = " " + clean + " ";
            for (let i = 0; i + 4 <= padded.length; i++) {
                particleQuadgrams.push(padded.slice(i, i + 4));
            }
        }
    }
    const totalNgrams = textTrigrams.length + textQuadgrams.length;
    if (totalNgrams < 10) {
        // Too little material for a meaningful rate.
        return {
            bestFamily: "",
            bestHitRate: 0,
            perFamilyRates: {},
            totalTrigrams: textTrigrams.length,
            decayedScore: 0,
            triggered: false,
        };
    }
    // Share of accented Latin letters; used to skip the "Vietnamese without
    // diacritics" family when the text clearly does carry diacritics.
    const diacriticMatches = text.match(/[À-ÿĀ-žƠ-ơƯ-ưẠ-ỹ]/g);
    const latinMatches = text.match(/[a-zA-ZÀ-ÿĀ-žƠ-ơƯ-ưẠ-ỹ]/g);
    const diacriticCount = diacriticMatches ? diacriticMatches.length : 0;
    const latinCount = latinMatches ? latinMatches.length : 1;
    const diacriticDensity = diacriticCount / latinCount;
    const perFamilyRates = {};
    for (const [family, entry] of Object.entries(ROMANIZATION_NGRAMS)) {
        if (family === "vietnamese" && diacriticDensity > 0.05) {
            continue;
        }
        const { trigrams: trigramSet, quadgrams: quadgramSet, weakParticles: weakSet } = entry;
        let hits = 0;
        for (const tri of textTrigrams) {
            if (trigramSet.has(tri)) {
                hits += 1;
            }
        }
        for (const quad of textQuadgrams) {
            if (quadgramSet.has(quad)) {
                hits += 1.5;
            }
        }
        for (const quad of particleQuadgrams) {
            if (weakSet && weakSet.has(quad)) {
                // Cross-language particle: reduced score only, even if the
                // same particle is also present in the strong set.
                hits += 0.5;
            }
            else if (quadgramSet.has(quad)) {
                // Spaces only occur at the ends of a padded word, so a space
                // on both sides identifies a 2-letter particle.
                const isTwoLetterParticle = quad.startsWith(" ") && quad.endsWith(" ");
                hits += isTwoLetterParticle ? 1.5 : 0.25;
            }
        }
        if (hits > 0) {
            perFamilyRates[family] = Math.min(1.0, hits / totalNgrams);
        }
    }
    const sorted = Object.entries(perFamilyRates).sort(([, a], [, b]) => b - a);
    const bestFamily = sorted.length > 0 ? sorted[0][0] : "";
    const bestHitRate = sorted.length > 0 ? sorted[0][1] : 0;
    let decayedScore = 0;
    if (bestHitRate >= 0.18) {
        decayedScore = 1.0;
    }
    else if (bestHitRate >= 0.10) {
        // Moderate best match: start at 0.5 and let runner-up families add
        // support, each weighted by 0.5^rank and boosted x5.
        decayedScore = 0.5;
        for (let i = 1; i < sorted.length; i++) {
            decayedScore += sorted[i][1] * Math.pow(0.5, i) * 5;
        }
    }
    else {
        // Weak matches only: plain geometrically decayed sum over all families.
        for (let i = 0; i < sorted.length; i++) {
            decayedScore += sorted[i][1] * Math.pow(0.5, i);
        }
    }
    return {
        bestFamily,
        bestHitRate,
        perFamilyRates,
        totalTrigrams: textTrigrams.length,
        decayedScore,
        triggered: bestHitRate >= 0.18,
    };
}
710
// Language codes that earn an extra +0.20 when ELD reports them as the top
// language. NOTE(review): presumably codes ELD tends to fall back to on text
// it cannot really classify — confirm against the ELD model.
const ELD_GARBAGE_LANGS = new Set(["yo", "hmn"]);
/**
 * Combines language-detector signals with the romanization n-gram
 * fingerprint into a confidence that `text` is a Latin-script
 * transliteration of a non-Latin-script language.
 *
 * @param {string} text - text to classify.
 * @param {string} eldLang - top language code reported by ELD.
 * @param {number} eldTopScore - score of ELD's top language.
 * @param {boolean} eldReliable - whether ELD considers its result reliable.
 * @param {{uniqueFamilies: number, isCoherent: boolean}} cluster - family
 *   cluster summary as produced by analyzeCluster.
 * @returns {{isRomanized: boolean, confidence: number, tier: string, signals: string[]}}
 *   confidence is the summed signal score capped at 1.0; tier is "high"
 *   (>= 0.60), "mixed" (>= 0.30) or "none"; isRomanized is confidence >= 0.40.
 *   Text that is mostly non-Latin script, or has 3 words or fewer, is
 *   rejected up front with confidence 0.
 */
export function isRomanized(text, eldLang, eldTopScore, eldReliable, cluster) {
    const signals = [];
    let score = 0;
    const words = text.split(/\s+/).filter(w => w.length > 0);
    // Letters of any script. Beyond Latin this keeps only letter-bearing
    // ranges and leaves out General Punctuation..symbols (U+2000-2BFF),
    // surrogate halves (U+D800-DFFF, i.e. emoji), private use
    // (U+E000-F8FF), variation selectors and specials — otherwise Latin
    // text with a few emoji or curly quotes is mistaken for native script.
    const alphaText = text.replace(/[^a-zA-ZÀ-ÿ\u0100-\u024F\u0370-\u1FFF\u2C00-\uD7FF\uF900-\uFDFF\uFE70-\uFEFC\uFF00-\uFFEF]/g, "");
    const latinText = text.replace(/[^a-zA-ZÀ-ÿ\u0100-\u024F]/g, "");
    const isLatinScript = alphaText.length > 0 ? latinText.length / alphaText.length > 0.85 : true;
    if (!isLatinScript) {
        return { isRomanized: false, confidence: 0, tier: "none", signals: ["native-script"] };
    }
    if (words.length <= 3) {
        return { isRomanized: false, confidence: 0, tier: "none", signals: ["too-short"] };
    }
    // Graduated family scatter: the more unrelated families among the
    // detector's top candidates, the stronger the signal.
    if (cluster.uniqueFamilies >= 5) {
        score += 0.20;
        signals.push("family-scatter-5");
    }
    else if (cluster.uniqueFamilies >= 4) {
        score += 0.15;
        signals.push("family-scatter-4");
    }
    else if (cluster.uniqueFamilies >= 3 && !cluster.isCoherent) {
        score += 0.10;
        signals.push("family-incoherent-3");
    }
    if (ELD_GARBAGE_LANGS.has(eldLang)) {
        score += 0.20;
        signals.push("eld-garbage-lang");
    }
    // Low detector confidence.
    if (eldTopScore < 0.45) {
        score += 0.15;
        signals.push("very-low-eld");
    }
    else if (eldTopScore < 0.60) {
        score += 0.08;
        signals.push("low-eld");
    }
    if (!eldReliable) {
        score += 0.08;
        signals.push("eld-unreliable");
    }
    // Per-language romanization n-gram fingerprinting.
    const fp = trigramFingerprint(text);
    if (fp.triggered) {
        score += 0.40;
        signals.push(`trigram-${fp.bestFamily}(${(fp.bestHitRate * 100).toFixed(0)}%)`);
    }
    else if (fp.bestHitRate >= 0.10) {
        score += 0.20;
        signals.push(`trigram-moderate-${fp.bestFamily}(${(fp.bestHitRate * 100).toFixed(0)}%,decay=${fp.decayedScore.toFixed(2)})`);
    }
    else if (fp.decayedScore >= 0.10) {
        score += 0.05;
        signals.push(`trigram-noise(${(fp.decayedScore * 100).toFixed(0)}%)`);
    }
    const confidence = Math.min(1.0, score);
    const tier = confidence >= 0.60 ? "high" : confidence >= 0.30 ? "mixed" : "none";
    return { isRomanized: confidence >= 0.40, confidence, tier, signals };
}
769
/**
 * One-call entry point: runs ELD on `text`, summarises the family cluster of
 * its scores and classifies the text with isRomanized.
 *
 * @param {string} text - text to analyse.
 * @returns {object} the isRomanized result extended with `eldLanguage`,
 *   `eldTopScore` (highest ELD score, 0 when ELD returned no scores),
 *   `eldReliable` and `cluster`.
 */
export function detectRomanization(text) {
    const detection = eld.detect(text);
    const scores = detection.getScores();
    const cluster = analyzeCluster(scores);
    // Highest score first; an empty score table falls back to 0.
    const ranked = Object.values(scores).sort((x, y) => y - x);
    const best = ranked[0];
    const eldTopScore = best === null || best === undefined ? 0 : best;
    const verdict = isRomanized(text, detection.language, eldTopScore, detection.isReliable(), cluster);
    const detectorInfo = {
        eldLanguage: detection.language,
        eldTopScore,
        eldReliable: detection.isReliable(),
        cluster,
    };
    return Object.assign({}, verdict, detectorInfo);
}
779
+ //# sourceMappingURL=romanization-detector.js.map