bekindprofanityfilter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTORS.md +106 -0
- package/LICENSE +22 -0
- package/README.md +1015 -0
- package/allprofanity.config.example.json +35 -0
- package/bin/init.js +49 -0
- package/config.schema.json +163 -0
- package/dist/algos/aho-corasick.d.ts +75 -0
- package/dist/algos/aho-corasick.js +238 -0
- package/dist/algos/aho-corasick.js.map +1 -0
- package/dist/algos/bloom-filter.d.ts +103 -0
- package/dist/algos/bloom-filter.js +208 -0
- package/dist/algos/bloom-filter.js.map +1 -0
- package/dist/algos/context-patterns.d.ts +102 -0
- package/dist/algos/context-patterns.js +484 -0
- package/dist/algos/context-patterns.js.map +1 -0
- package/dist/index.d.ts +1332 -0
- package/dist/index.js +2631 -0
- package/dist/index.js.map +1 -0
- package/dist/innocence-scoring.d.ts +23 -0
- package/dist/innocence-scoring.js +118 -0
- package/dist/innocence-scoring.js.map +1 -0
- package/dist/language-detector.d.ts +162 -0
- package/dist/language-detector.js +952 -0
- package/dist/language-detector.js.map +1 -0
- package/dist/language-dicts.d.ts +60 -0
- package/dist/language-dicts.js +2718 -0
- package/dist/language-dicts.js.map +1 -0
- package/dist/languages/arabic-words.d.ts +10 -0
- package/dist/languages/arabic-words.js +1649 -0
- package/dist/languages/arabic-words.js.map +1 -0
- package/dist/languages/bengali-words.d.ts +10 -0
- package/dist/languages/bengali-words.js +1696 -0
- package/dist/languages/bengali-words.js.map +1 -0
- package/dist/languages/brazilian-words.d.ts +10 -0
- package/dist/languages/brazilian-words.js +2122 -0
- package/dist/languages/brazilian-words.js.map +1 -0
- package/dist/languages/chinese-words.d.ts +10 -0
- package/dist/languages/chinese-words.js +2728 -0
- package/dist/languages/chinese-words.js.map +1 -0
- package/dist/languages/english-primary-all-languages.d.ts +23 -0
- package/dist/languages/english-primary-all-languages.js +36894 -0
- package/dist/languages/english-primary-all-languages.js.map +1 -0
- package/dist/languages/english-words.d.ts +5 -0
- package/dist/languages/english-words.js +5156 -0
- package/dist/languages/english-words.js.map +1 -0
- package/dist/languages/french-words.d.ts +10 -0
- package/dist/languages/french-words.js +2326 -0
- package/dist/languages/french-words.js.map +1 -0
- package/dist/languages/german-words.d.ts +10 -0
- package/dist/languages/german-words.js +2633 -0
- package/dist/languages/german-words.js.map +1 -0
- package/dist/languages/hindi-words.d.ts +10 -0
- package/dist/languages/hindi-words.js +2341 -0
- package/dist/languages/hindi-words.js.map +1 -0
- package/dist/languages/innocent-words.d.ts +41 -0
- package/dist/languages/innocent-words.js +109 -0
- package/dist/languages/innocent-words.js.map +1 -0
- package/dist/languages/italian-words.d.ts +10 -0
- package/dist/languages/italian-words.js +2287 -0
- package/dist/languages/italian-words.js.map +1 -0
- package/dist/languages/japanese-words.d.ts +11 -0
- package/dist/languages/japanese-words.js +2557 -0
- package/dist/languages/japanese-words.js.map +1 -0
- package/dist/languages/korean-words.d.ts +10 -0
- package/dist/languages/korean-words.js +2509 -0
- package/dist/languages/korean-words.js.map +1 -0
- package/dist/languages/russian-words.d.ts +10 -0
- package/dist/languages/russian-words.js +2175 -0
- package/dist/languages/russian-words.js.map +1 -0
- package/dist/languages/spanish-words.d.ts +11 -0
- package/dist/languages/spanish-words.js +2536 -0
- package/dist/languages/spanish-words.js.map +1 -0
- package/dist/languages/tamil-words.d.ts +10 -0
- package/dist/languages/tamil-words.js +1722 -0
- package/dist/languages/tamil-words.js.map +1 -0
- package/dist/languages/telugu-words.d.ts +10 -0
- package/dist/languages/telugu-words.js +1739 -0
- package/dist/languages/telugu-words.js.map +1 -0
- package/dist/romanization-detector.d.ts +50 -0
- package/dist/romanization-detector.js +779 -0
- package/dist/romanization-detector.js.map +1 -0
- package/package.json +79 -0
|
@@ -0,0 +1,779 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Romanization Detector — identifies Latin-script transliterations of
|
|
3
|
+
* non-Latin-script languages (Hindi romaji, Pinyin, etc.).
|
|
4
|
+
*
|
|
5
|
+
* Uses three layers:
|
|
6
|
+
* 1. ELD n-gram language detection + cluster analysis
|
|
7
|
+
* 2. Per-language romanization n-gram fingerprinting (trigrams + quadgrams)
|
|
8
|
+
* 3. Heuristic signals (scatter, ELD confidence, reliability)
|
|
9
|
+
*/
|
|
10
|
+
// @ts-ignore — eld ships as JS with .d.ts but no proper ESM types
|
|
11
|
+
import { eld } from "eld/small";
|
|
12
|
+
// ---------------------------------------------------------------------------
// Language family mapping
// ---------------------------------------------------------------------------
/**
 * ISO-639-1 language codes grouped by family. Used to decide whether ELD's
 * top guesses cluster into one family (coherent) or scatter across several
 * (a hallmark of romanized text confusing the n-gram detector).
 */
export const LANGUAGE_FAMILIES = {
    romance: ["fr", "ca", "it", "pt", "es", "ro", "gl", "oc"],
    germanic: ["de", "nl", "da", "no", "sv", "af", "is", "lb", "en"],
    slavic: ["ru", "uk", "bg", "pl", "cs", "sk", "hr", "sr", "sl", "bs", "mk"],
    uralic: ["fi", "hu", "et"],
    turkic: ["tr", "az", "uz", "kk", "ky", "tk"],
    celtic: ["cy", "ga", "gd", "br"],
    baltic: ["lt", "lv"],
    semitic: ["ar", "he", "mt"],
    indic: ["hi", "bn", "mr", "gu", "pa", "ne", "si", "ur"],
    dravidian: ["ta", "te", "kn", "ml"],
    sinitic: ["zh"],
    japonic: ["ja"],
    koreanic: ["ko"],
    austronesian: ["tl", "ms", "id", "mg", "ceb", "jv", "su"],
    tai: ["th", "lo"],
};
// Inverted index: language code -> family name.
const LANG_TO_FAMILY = Object.fromEntries(Object.entries(LANGUAGE_FAMILIES).flatMap(([family, codes]) => codes.map((code) => [code, family])));
/**
 * Summarize how ELD's top-5 language guesses distribute across families.
 *
 * @param {Record<string, number>} scores - per-language ELD scores
 * @returns {{uniqueFamilies: number, dominantFamilyCount: number,
 *            isCoherent: boolean, isScattered: boolean}}
 */
export function analyzeCluster(scores) {
    const ranked = Object.entries(scores).sort((a, b) => b[1] - a[1]);
    // Tally families of the five highest-scoring languages; codes with no
    // known family are bucketed under "unknown".
    const counts = new Map();
    for (const [code] of ranked.slice(0, 5)) {
        const family = LANG_TO_FAMILY[code] !== undefined ? LANG_TO_FAMILY[code] : "unknown";
        counts.set(family, (counts.get(family) || 0) + 1);
    }
    const uniqueFamilies = counts.size;
    const dominantFamilyCount = Math.max(0, ...counts.values());
    return {
        uniqueFamilies,
        dominantFamilyCount,
        isCoherent: dominantFamilyCount >= 3, // >= 3 of the top-5 share one family
        isScattered: uniqueFamilies >= 4, // top-5 spread over >= 4 families
    };
}
|
|
52
|
+
// ---------------------------------------------------------------------------
// Per-language romanization n-gram sets
// ---------------------------------------------------------------------------
// Consumed by trigramFingerprint():
//   - trigrams:      looked up against 3-char substrings of each word
//   - quadgrams:     looked up against 4-char substrings of each word AND
//                    against space-padded short words (" ke ", " hai" ...)
//   - weakParticles: space-padded 2-letter words that are common across
//                    languages — scored at 0.5 instead of 1.0/1.5 to avoid
//                    false positives
//
// NOTE(review): lookups are always exactly 3 chars (trigrams) or 4 chars
// (quadgrams), so entries of any other length are dead and can never match —
// e.g. "pengy"/"ihuan" (pinyin), "hosh" in romaji trigrams, "aitai"/"kutar"
// (romaji), "reul"/"eseo" in korean trigrams plus several 5-6 char korean
// quadgrams ("aseyo", "annyeo", "chingu", ...), "akhi" in arabic trigrams,
// "fahim", "khleb", "reeng", "mikha", and the 3-char "unn" in a quadgram set.
// Confirm the intended lengths with the author before trimming.
export const ROMANIZATION_NGRAMS = {
    // Hindi/Urdu/Bengali/Nepali — aspirated clusters, retroflex, nasal combos
    indic: {
        trigrams: new Set([
            "bha", "bhe", "bhi", "bho", "bhu",
            "dha", "dhe", "dhi", "dho", "dhu",
            "gha", "ghe", "ghi", "gho", "ghu",
            "kha", "khe", "khi", "kho", "khu",
            "pha", "phe", "phi", "pho", "phu",
            "chh", "cch",
            "aal", "aam", "aan", "aap", "aar", "aas", "aat",
            "jhe", "jha", "jhi",
            "muj", "tuj", "yeh", "hai", "haa", "nah", "kya",
            "waa", "jaa", "bah", "saa", "tta",
            "ekh", "dek", "akh",
            "eek", "aaj", "aur",
            "ush", "shk", "hki",
            // Bengali-specific
            "cho", "ach", "oth", "pni", "mar",
            "kem", "oba", "bad",
            "she", "chi", "hec",
            // Telugu-specific
            "aru", "nna", "agu", "elu",
            "ava", "ulu",
            "ast", "sto",
            // Nepali-specific
            "hun", "hha",
            // Gujarati-specific
            "avn", "amy", "gam",
        ]),
        quadgrams: new Set([
            "bhai", "bhar", "bhut", "bhaa", "bhen",
            "dhar", "dhak", "dhoo", "dhan",
            "ghar", "ghoo", "ghum",
            "khan", "khaa", "khub", "khat", "khar",
            "thee", "thod", "thek",
            "chho", "chha", "chhe",
            "haal", "hain", "hame", "hama",
            "kaam", "jaan", "yaar", "raha", "rahe",
            "mush", "shki", "achh", "achc", "bahu",
            "chod", "saal", "kutt", "kami", "gaan", "baad", "baat", "naam", "mujh", "tujh",
            // Space-padded 2-letter particles (padded "xx" -> " xx " = 4 chars = quadgram)
            " ke ", " ka ", " ki ", " ko ", " se ", " na ", " pe ", " ho ",
            " ab ", " jo ", " ye ", " wo ", " tu ",
            // Space-padded 3-letter particles (padded "xxx" -> " xxx " -> quadgrams " xxx" and "xxx ")
            " sab", " yeh", " veh", " voh", " hai", " hum", " tum", " aur",
            " mat", " kab", " koi",
            "pahr", "parh",
            "sund", "unda", "ndar",
            "padh", "dhai",
            "insh", "nsha", "shal",
            "baar", "aari",
            "sadk",
            "bana", "anan",
            "seek", "eekh",
            "zaro", "aroo",
            // Bengali quadgrams
            "bhal", "halo",
            "koth", "otha", "thay",
            "jacc", "acch", "cche", "chen",
            "apni", "toma", "omar",
            "dhon", "honn", "nnob", "noba",
            "kemo", "emon",
            "eshe", "shec", "hech",
            "boud", "oudi",
            "dada",
            "gelo",
            "bair", "aire",
            "ghur", "hure",
            "lage", "agch", "gche",
            // Telugu quadgrams ("unn" is 3 chars — dead entry, see NOTE above)
            "unn", "nnar", "naru",
            "baag", "aagu", "gunn", "unna",
            "dhan", "hany", "anya", "nyav", "yava",
            "vast", "asta", "star",
            "velt", "eltu", "ltun", "tunn",
            "sant", "anto", "ntos", "tosh",
            "eppu", "ppud",
            // Nepali quadgrams
            "nama", "amas", "mast",
            "hunu", "unuh", "nuhu", "uhun",
            "dhan", "hany", "nyab", "yaba", "abaa",
            // Gujarati quadgrams
            "malv", "alva", "lvan", "vanu",
            "gamy",
        ]),
        // Weak particles: common across languages, scored at 0.5
        weakParticles: new Set([" na ", " to "]),
    },
    // Mandarin Pinyin
    pinyin: {
        trigrams: new Set([
            "zho", "zha", "zhe", "zhi", "zhu",
            "qia", "qie", "qin", "qiu",
            "xia", "xie", "xin", "xiu", "xue",
            "iao", "iou", "uai", "uei",
            "guo", "duo", "huo", "suo", "zuo",
            "jin", "jia", "jie", "jiu",
            "ngg", "ngr", "ngy",
            "gao", "hao", "bao", "dao", "lao",
        ]),
        quadgrams: new Set([
            "zhon", "hong", "nggu", "gguo",
            "zhen", "zhao", "zhua", "zhan",
            "qian", "qing", "qixi",
            "xian", "xing", "xiao", "xiex",
            "jint", "inti", "ntia",
            "tian", "huan", "yuan",
            "gong", "dong", "peng",
            "ming", "ting", "bing", "ling",
            "gaox", "aoxi", "oxin",
            "bang", "duos", "uosh", "shao",
            "guan", "dian", "nian",
            // "pengy" is 5 chars — dead entry, see NOTE above
            "pengy", "ngyo",
            "zaij", "aiji", "ijia",
            "piao", "iaol", "aoli", "olia",
            // "ihuan" is 5 chars — dead entry, see NOTE above
            "xihu", "ihuan",
            "feng", "engj", "ngji", "gjin",
            "difa", "ifan", "fang",
            "chif", "ifan",
            "wans", "ansh", "nsha", "shan",
            "shen", "henm",
        ]),
        // Weak particles: de (的), le (了), bu (不) — common but shared with other languages
        weakParticles: new Set([" de ", " le ", " bu "]),
    },
    // Japanese Romaji — polite, casual, literary, and slang forms
    romaji: {
        trigrams: new Set([
            // Polite verb endings
            "asu", "esu", "osu",
            "mas", "des",
            // Unique consonant clusters
            "tsu",
            "sho", "shi", "chi",
            // Common fragments
            "ata", "ima",
            // Gemination (doubled consonants)
            "tte", "tta", "kke", "ssa", "kka",
            // Compound consonants
            "nky", "nbe", "nbu", "nde", "nda",
            // Long vowel patterns (-ou, -uu, -ei common in romaji)
            "kou", "mou", "tou", "dou", "sou", "rou",
            "iku", "oku", "uku", "aku", "eku",
            // Polite/keigo fragments
            "sai", "goz", "aim",
            // Common literary/poetic — only distinctive combos
            "omo", "iro",
            "yum", "ume",
            "osh", "yor",
            "ait", "tai",
            "hik",
            "sak",
            // "hosh" is 4 chars in a trigram set — dead entry, see NOTE above
            "hosh",
            "kai", "sei",
            // Common particles as trigram context
            "wat", "nih",
            "mpo", "amp",
            // Profanity/slang fragments
            "kus", "uso", "aho",
            "kut", "tab",
            "tem", "mee",
        ]),
        quadgrams: new Set([
            // Space-padded 2-letter particles (avoid "no", "de" — English; "wa", "ni" — Swahili)
            " ga ", " wo ", " mo ", " ne ", " yo ",
            // Space-padded 3-letter particles
            " eki", " ima", " ano", " ore", " iku", " aru", " nai", " mae",
            // Polite forms
            "masu", "desu", "desh", "imas",
            "shim", "mash", "ashi",
            "kuda", "udas", "dasa",
            "goza", "ozai", "zaim",
            // Common words
            "wata", "atas",
            "niho", "ihon",
            "benk", "enky", "nkyo",
            "ganb", "anba", "nbat", "batt",
            "tomo", "omod", "moda", "odac",
            "omoi", "moir", "oiro",
            "tote", "otem",
            "sumi", "umim", "mima",
            "tano", "anos", "nosh",
            // Poetic/literary ("aitai" is 5 chars — dead entry, see NOTE above)
            "yume", "naka", "anat", "nata",
            "aitai", "hosh", "oshi",
            "kaga", "agay", "gaya", "ayak", "yaku", "yaki",
            "shit", "hite",
            "yoru", "saku", "akur", "kura",
            "kire", "irei",
            "hana", "haru", "kami",
            "tsuk", "suki",
            // Casual/informal
            "naru", "shir", "iren", "rena",
            "owar", "wari",
            "haji", "ajim", "jime",
            "yaro", "arou",
            // Profanity/slang ("kutar" is 5 chars — dead entry, see NOTE above)
            "kuso", "kutar", "utab", "taba", "abar", "bare",
            "ahou", "hond", "onda", "ndar",
            "kisa", "isam", "sama",
            "teme", "emee",
            "bokk", "okke",
        ]),
        // Weak particles: shared with Swahili/English, scored at 0.5
        // Excluded: " wa ", " ni " — too common in Swahili
        weakParticles: new Set([" no ", " de ", " ka ", " to "]),
    },
    // Korean Romanization — formal, casual, and slang forms
    korean: {
        trigrams: new Set([
            // Polite endings
            "eyo", "ayo",
            // Particles ("reul" is 4 chars in a trigram set — dead entry)
            "sey", "eun", "eul", "reul",
            // Verb stems
            "hag", "seo", "geo",
            "ham", "gam", "kam",
            // -nida formal ending
            "nid", "ida",
            // Tense markers
            "eos", "iss", "sse",
            // annyeong fragments
            "nye", "ngh", "ngs",
            // Common vowel combos
            "yeo", "ung", "eon",
            // Compound vowels
            "hae", "hoe", "hwa",
            // Common words
            "jal", "joh", "gbu",
            "lkk", "eok", "eog",
            "ase", "oyo",
            "sip", "ipe",
            "nge", "ngb",
            // Casual/informal — only distinctive patterns
            "eul", "gat",
            "gae", "rae", "jae",
            "dul", "iga",
            "ssi",
            "bap", "meo", "eok",
            "gac",
            "jeo",
            "peo",
            "oeg",
            "alk",
            // Particles ("eseo" is 4 chars in a trigram set — dead entry)
            "gwa", "eseo",
        ]),
        quadgrams: new Set([
            // Space-padded 3-letter particles
            " eun", " eul", " jal", " nae", " uri", " geu", " jeo",
            // Polite forms (several 5-6 char entries below are dead — see NOTE)
            "hase", "aseyo", "seyo",
            "isseo", "sseo", "seoyo",
            "gamsa", "amsa", "msah",
            "hamni", "amni", "mnid", "nida",
            // annyeong
            "annyeo", "nnyeo", "nyeon",
            // Common words
            "gongb", "ongbu",
            "hwai", "wait", "aiti",
            "haeb", "aebo", "bose",
            "meok", "eokgo",
            "sipeo", "ipeo", "peoyo",
            "nalss", "alss", "lssi",
            // Casual/conversational
            "oneul", "neul",
            "gachi", "achi", "gach",
            "aeba", "ebak", "oppa",
            "sanch", "anche", "nchek",
            "joey", "oeyo",
            "nals", "alss", "lssi", "ssig",
            "gayo",
            "chingu", "ingu",
            "haru",
            "dosi", "osir", "sirak",
            "sajin",
            "jeony", "eonye", "nyeo", "yeog",
            "jaem", "aemi", "emii",
            "bang", "angap", "ngap",
            "mann", "anna", "nnas", "nase",
        ]),
        // Weak particles: e (에), do (도), i (이) — common but shared
        weakParticles: new Set([" do ", " na ", " je "]),
    },
    // Arabic romanization — MSA and dialectal (Levantine, Egyptian, Gulf)
    arabic: {
        trigrams: new Set([
            // Core patterns
            "hab", "abi", "bib",
            "all", "lla", "lah",
            "shk", "shu", "shm",
            "akh", "ukh", "khr",
            "eef", "eek", "eel",
            "ahm", "hmd",
            "bik", "bil",
            "jaz", "aze", "zee",
            "yaw", "awm",
            "ana", "ant", "nta",
            "ahl", "hla", "sah",
            "kha", "las", "yal",
            "ukr", "kra",
            // Dialectal fragments
            "bid", "idd",
            "ray", "aye", "yeh",
            "ked", "eer",
            "kti", "tir",
            "hel", "elw",
            "shi", "way",
            "mni", "nit",
            "nti", "tik",
            // "akhi" is 4 chars in a trigram set — dead entry, see NOTE above
            "akhi", "khi",
            "zal", "ala", "lam",
            "bah", "ahe",
            "sad", "ade", "dee",
            "umm", // ummi
        ]),
        quadgrams: new Set([
            // Space-padded 2-letter particles (only distinctive ones — avoid "an", "la", "ma" which are common English)
            " ya ", " wa ", " fi ", " bi ",
            // Space-padded 3-letter particles
            " ana", " min", " lil", " ila", " ala", " law", " shu",
            // Core
            "habi", "abib", "bibi",
            "alla", "llah",
            "insh", "nsha", "shal",
            "mash", "asha",
            "shuk", "hukr", "ukra", "kran",
            "jaze", "azee", "zeel",
            "keef", "eefa", "efak",
            "ahla", "hlan",
            "sahl",
            "khal", "hala",
            "yall",
            "wall",
            "alha", "lham", "hamd",
            // Dialectal
            "bidd",
            "raye", "ayeh",
            "helw",
            "shwa", "hway",
            "akhi",
            "baha", "aheb", "heba",
            "sade", "adee", "deeq",
            "zala", "alam", "lame",
            "tihk",
            // "fahim" is 5 chars — dead entry, see NOTE above
            "mish", "fahim", // mish fahim
        ]),
        // Weak particles: la (لا), ma (ما), an (أن) — common but shared with English/Spanish
        weakParticles: new Set([" la ", " ma ", " an "]),
    },
    // Russian/Slavic transliteration — formal, casual, and slang
    russian: {
        trigrams: new Set([
            // Standard transliteration clusters
            "kho", "khr", "kha",
            "zho", "zhe", "zhi",
            "shc", "hch",
            "tsy", "tsa",
            "vst", "dra",
            "pri", "iye",
            "vsy",
            "ych", "yat", "oya",
            "oho", "ros", "osh",
            "seg", "ego", "odn",
            "its",
            "poy", "oyt", "yti",
            "ozh",
            "gul", "uly",
            "khl", "hle", "leb",
            "mol", "olo", "lok",
            // Slang/profanity fragments ("yat" repeats — Set dedupes silently)
            "bly", "lya", "yat",
            "piz", "izd", "zde",
            "nah", "ahu",
            "mud", "uda", "dak",
            "deb", "ebi",
            "dol", "olb", "lbo",
            "suk", "uka",
            "tup", "upo",
            // Casual/informal
            "chy", "zna", "poz",
            "kru", "tol",
            "dav", "ava",
            "poe", "oek", "ekh",
        ]),
        quadgrams: new Set([
            // Space-padded 3-letter particles
            " kak", " eto", " ves", " mne", " vot", " uzh", " tam",
            // Standard transliteration
            "khor", "horo", "oros", "rosh",
            "zhno", "mozh",
            "priv", "rive", "ivet",
            "sego", "egod", "godn", "odny",
            "vsyo",
            "nrav", "ravi", "avit", "vits",
            "poyt", "oyti",
            "guly", "ulya", "lyat",
            // "khleb" is 5 chars — dead entry, see NOTE above
            "khleb", "hleb",
            "molo", "olok", "loko",
            "spas", "pasi", "asib", "sibo",
            "bols", "olsh", "lsho", "shoy",
            "pomo", "omos", "mosh", "oshc",
            "priy", "riya", "iyat",
            "inte", "nter", "tere", "eres",
            "zdra", "drav",
            "drug", "ruzy", "uzya", "zyam",
            "posh", "shli",
            "poto", "otom", "tomu",
            "chto",
            "vzya", "zyal", "yali",
            "sobo", "oboi",
            "igra", "gral",
            "vrem", "remy", "emya",
            "prov", "rove", "ovel",
            "vech", "eche", "cher", "hero",
            "domo", "omoi",
            "usta", "stav", "tavs",
            "scha", "chas", "hast", "astl",
            "zame", "amec", "mech", "chat",
            "piko", "ikni",
            "futb", "utbo",
            // Slang/profanity
            "blya", "lyat",
            "pizd", "izde", "zdet",
            "nahu",
            "muda", "udak",
            "debi",
            "dolb", "olbo", "lboy", "boyo",
            "suka",
            "tupo", "upoy",
            // Casual patterns
            "dava", "avai",
            "poka",
            "tolk", "olko",
        ]),
        // Weak particles: ya (я), ne (не), on (он), no (но), vy (вы) — excluded "da" (too common: Hausa, English)
        weakParticles: new Set([" ya ", " ne ", " no ", " vy ", " on "]),
    },
    // Thai romanization
    thai: {
        trigrams: new Set([
            "kra", "rap", "kha",
            "sab", "aba", "bai",
            "saw", "awa", "wad", "ade",
            "kho", "hob", "obu",
            "khu", "hun",
            "pho", "hom",
            "rai", "ara", "nee",
            "pai", "nai",
            "dee", "mai", "cha",
            "anu", "nuk", "san",
        ]),
        quadgrams: new Set([
            "krap", "sawa", "awad", "wade", "adee",
            "saba", "abai",
            "khob", "khun", "phom",
            "arai",
            "sanu", "anuk",
        ]),
        // Weak particles: na (นะ), di (ดี), ja (จ๊ะ), ka (ค่ะ/ครับ shorthand)
        weakParticles: new Set([" na ", " di ", " ja ", " ka "]),
    },
    // Tamil romanization
    tamil: {
        trigrams: new Set([
            "kka", "kki",
            "nga", "ngi",
            "ppa", "adi",
            "iru", "ruk", "ukk",
            "nak", "akk",
            "ndr", "dri",
            "enn", "nna",
            "inn", "nni", "nik",
            "ela", "aik",
            "ree", "eng",
        ]),
        quadgrams: new Set([
            "vana", "anak", "nakk", "akka", "kkam",
            "iruk", "rukk", "ukki", "kkin", "king",
            "eppa", "ppad", "padi",
            "nand", "andr", "ndri",
            "inni", "nnik", "nikk", "ikku",
            "vela", "elai", "laik", "aikk",
            // "reeng" is 5 chars — dead entry, see NOTE above
            "panr", "anre", "nree", "reeng",
        ]),
        // Weak particles: la (லா), na (னா) — excluded "da" (too common cross-language)
        weakParticles: new Set([" la ", " na "]),
    },
    // Persian/Farsi romanization
    persian: {
        trigrams: new Set([
            "sal", "ala", "lam",
            "che", "het", "eto", "tor",
            "ale", "let",
            "khe", "hei", "eil",
            "khu", "hub", "ube",
            "doo", "oos", "ost",
            "iru", "run",
            "rim",
            "gha", "haz", "aza",
            "bok", "okh",
            "emr", "mru", "ruz",
        ]),
        quadgrams: new Set([
            "sala", "alam",
            "chet", "heto", "etor", "tori",
            "khei", "heil", "eili",
            "khub", "hube",
            "doos", "oost",
            "biru", "irun",
            "beri", "erim",
            "ghaz", "haza",
            "bokh", "okho", "khor",
            "emru", "mruz",
            // "mikha" is 5 chars — dead entry, see NOTE above
            "mikha", "ikha",
            "laze", "azem",
        ]),
        // Weak particles: ra (را), az (از), be (به), ke (که)
        weakParticles: new Set([" ra ", " az ", " be ", " ke "]),
    },
    // Vietnamese without diacritics (skipped entirely when the input already
    // carries real diacritics — see trigramFingerprint)
    vietnamese: {
        trigrams: new Set([
            "uoc", "uoi", "uon", "uot",
            "ngu", "ngh", "nho", "nha",
            "tro", "anh", "inh",
            "hom", "nay", "dep", "qua",
            "vui", "duc", "gap", "ban",
            "rat", "toi", "hay", "roi",
            "xin", "cam", "gia", "dia",
            "chu", "dun", "hoc",
            "lam", "mot", "cho",
            "phu",
            "uye", "yen",
        ]),
        quadgrams: new Set([
            "nguo", "guoi",
            "nghi", "truo", "ruon", "uong",
            "duoc",
            "chun", "hung",
            "tron", "rong",
            "khon", "hong",
            "nhun",
            "hoan", "oanh",
        ]),
        // Weak particles: la (là), va (và), co (có)
        weakParticles: new Set([" la ", " va ", " co "]),
    },
};
|
|
607
|
+
/**
 * Fingerprint `text` against every family in ROMANIZATION_NGRAMS.
 *
 * Words of >= 3 letters contribute sliding trigrams/quadgrams (these form the
 * hit-rate denominator); words of <= 3 letters are additionally space-padded
 * so particles like " ke " / " hai" can match quadgram entries (bonus only).
 *
 * @param {string} text
 * @returns {{bestFamily: string, bestHitRate: number,
 *            perFamilyRates: Record<string, number>, totalTrigrams: number,
 *            decayedScore: number, triggered: boolean}}
 */
export function trigramFingerprint(text) {
    const tokens = text.toLowerCase().split(/\s+/).filter((t) => t.length > 0);
    const wordTris = [];
    const wordQuads = [];
    const particleQuads = [];
    for (const token of tokens) {
        const letters = token.replace(/[^a-z]/g, "");
        if (letters.length === 0) {
            continue;
        }
        // Per-word n-grams — counted in the denominator below.
        if (letters.length >= 3) {
            for (let i = 0; i + 3 <= letters.length; i++) {
                wordTris.push(letters.slice(i, i + 3));
            }
            for (let i = 0; i + 4 <= letters.length; i++) {
                wordQuads.push(letters.slice(i, i + 4));
            }
        }
        // Short words: pad with spaces -> particle quadgrams (bonus signal,
        // not in denominator). e.g. "ke" -> " ke "; "hai" -> " hai", "hai ".
        if (letters.length <= 3) {
            const padded = " " + letters + " ";
            for (let i = 0; i + 4 <= padded.length; i++) {
                particleQuads.push(padded.slice(i, i + 4));
            }
        }
    }
    // Denominator only counts per-word n-grams; particles are bonus signal.
    const denominator = wordTris.length + wordQuads.length;
    if (denominator < 10) {
        return { bestFamily: "", bestHitRate: 0, perFamilyRates: {}, totalTrigrams: denominator, decayedScore: 0, triggered: false };
    }
    // Diacritic density gates the Vietnamese family: accented Latin text is
    // presumably already-native Vietnamese (or another accented language),
    // not a stripped-diacritic romanization.
    const accentMatches = text.match(/[À-ÿĀ-žƠ-ơƯ-ưẠ-ỹ]/g);
    const latinMatches = text.match(/[a-zA-ZÀ-ÿĀ-žƠ-ơƯ-ưẠ-ỹ]/g);
    const accentCount = accentMatches === null ? 0 : accentMatches.length;
    const latinCount = latinMatches === null ? 1 : latinMatches.length;
    const diacriticDensity = accentCount / latinCount;
    const familyHits = {};
    for (const [family, sets] of Object.entries(ROMANIZATION_NGRAMS)) {
        if (family === "vietnamese" && diacriticDensity > 0.05) {
            continue;
        }
        let hits = 0;
        // Standard n-gram matching: trigram hit = 1.0, quadgram hit = 1.5.
        for (const tri of wordTris) {
            if (sets.trigrams.has(tri)) {
                hits += 1;
            }
        }
        for (const quad of wordQuads) {
            if (sets.quadgrams.has(quad)) {
                hits += 1.5;
            }
        }
        // Particle bonus: a fully padded 2-letter word (" ke " — two spaces)
        // is strong evidence (1.5); an edge gram of a 3-letter word (" hai" —
        // one space) is weak (0.25).
        for (const quad of particleQuads) {
            if (sets.quadgrams.has(quad)) {
                const spaceCount = (quad.match(/ /g) || []).length;
                hits += spaceCount >= 2 ? 1.5 : 0.25;
            }
        }
        // Cross-language weak particles score only 0.5.
        if (sets.weakParticles) {
            for (const quad of particleQuads) {
                if (sets.weakParticles.has(quad)) {
                    hits += 0.5;
                }
            }
        }
        if (hits > 0) {
            familyHits[family] = hits;
        }
    }
    const perFamilyRates = {};
    for (const [family, hits] of Object.entries(familyHits)) {
        perFamilyRates[family] = Math.min(1.0, hits / denominator);
    }
    const ranked = Object.entries(perFamilyRates).sort((a, b) => b[1] - a[1]);
    const bestFamily = ranked.length > 0 ? ranked[0][0] : "";
    const bestHitRate = ranked.length > 0 ? ranked[0][1] : 0;
    // Decayed score: a strong best rate saturates at 1.0; a moderate one gets
    // 0.5 plus exponentially-decayed credit (x5) for runner-up families; a
    // weak one gets only the plain decayed sum over all families.
    let decayedScore = 0;
    if (bestHitRate >= 0.18) {
        decayedScore = 1.0;
    }
    else if (bestHitRate >= 0.10) {
        decayedScore = 0.5;
        for (let i = 1; i < ranked.length; i++) {
            decayedScore += ranked[i][1] * Math.pow(0.5, i) * 5;
        }
    }
    else {
        for (let i = 0; i < ranked.length; i++) {
            decayedScore += ranked[i][1] * Math.pow(0.5, i);
        }
    }
    return {
        bestFamily,
        bestHitRate,
        perFamilyRates,
        // NOTE(review): named totalTrigrams but holds only the word-trigram
        // count here, while the short-text early return above reports
        // trigrams + quadgrams — confirm which the callers expect.
        totalTrigrams: wordTris.length,
        decayedScore,
        triggered: bestHitRate >= 0.18,
    };
}
|
|
710
|
+
// Languages ELD frequently reports for input it cannot actually classify.
const ELD_GARBAGE_LANGS = new Set(["yo", "hmn"]);
/**
 * Combine heuristic signals into a romanization verdict.
 *
 * @param {string} text - input text (expected mostly Latin script)
 * @param {string} eldLang - ELD's top language guess
 * @param {number} eldTopScore - ELD's top score
 * @param {boolean} eldReliable - ELD's reliability flag
 * @param {{uniqueFamilies: number, isCoherent: boolean}} cluster - from analyzeCluster
 * @returns {{isRomanized: boolean, confidence: number, tier: string, signals: string[]}}
 */
export function isRomanized(text, eldLang, eldTopScore, eldReliable, cluster) {
    const signals = [];
    let score = 0;
    const words = text.split(/\s+/).filter((w) => w.length > 0);
    // Share of alphabetic characters that are Latin (incl. Latin Extended);
    // an empty alphabetic string counts as Latin.
    const alphaOnly = text.replace(/[^a-zA-ZÀ-ÿ\u0100-\u024F\u0370-\uFFFF]/g, "");
    const latinOnly = text.replace(/[^a-zA-ZÀ-ÿ\u0100-\u024F]/g, "");
    const mostlyLatin = alphaOnly.length === 0 || latinOnly.length / alphaOnly.length > 0.85;
    // Guard clauses: native-script text and very short text are never flagged.
    if (!mostlyLatin) {
        return { isRomanized: false, confidence: 0, tier: "none", signals: ["native-script"] };
    }
    if (words.length <= 3) {
        return { isRomanized: false, confidence: 0, tier: "none", signals: ["too-short"] };
    }
    // Graduated family-scatter signal from the ELD cluster analysis.
    if (cluster.uniqueFamilies >= 5) {
        score += 0.20;
        signals.push("family-scatter-5");
    }
    else if (cluster.uniqueFamilies >= 4) {
        score += 0.15;
        signals.push("family-scatter-4");
    }
    else if (cluster.uniqueFamilies >= 3 && !cluster.isCoherent) {
        score += 0.10;
        signals.push("family-incoherent-3");
    }
    // ELD picking a known "garbage" language is itself evidence.
    if (ELD_GARBAGE_LANGS.has(eldLang)) {
        score += 0.20;
        signals.push("eld-garbage-lang");
    }
    // Low ELD confidence, in two bands.
    if (eldTopScore < 0.45) {
        score += 0.15;
        signals.push("very-low-eld");
    }
    else if (eldTopScore < 0.60) {
        score += 0.08;
        signals.push("low-eld");
    }
    if (!eldReliable) {
        score += 0.08;
        signals.push("eld-unreliable");
    }
    // Strongest signal: per-language romanization n-gram fingerprinting.
    const fp = trigramFingerprint(text);
    if (fp.triggered) {
        score += 0.40;
        signals.push(`trigram-${fp.bestFamily}(${(fp.bestHitRate * 100).toFixed(0)}%)`);
    }
    else if (fp.bestHitRate >= 0.10) {
        score += 0.20;
        signals.push(`trigram-moderate-${fp.bestFamily}(${(fp.bestHitRate * 100).toFixed(0)}%,decay=${fp.decayedScore.toFixed(2)})`);
    }
    else if (fp.decayedScore >= 0.10) {
        score += 0.05;
        signals.push(`trigram-noise(${(fp.decayedScore * 100).toFixed(0)}%)`);
    }
    const confidence = Math.min(1.0, score);
    let tier;
    if (confidence >= 0.60) {
        tier = "high";
    }
    else if (confidence >= 0.30) {
        tier = "mixed";
    }
    else {
        tier = "none";
    }
    return { isRomanized: confidence >= 0.40, confidence, tier, signals };
}
|
|
769
|
+
/**
 * Top-level entry point: run ELD over `text`, analyze how its guesses cluster
 * across language families, and fold everything into a romanization verdict.
 *
 * @param {string} text
 * @returns {{isRomanized: boolean, confidence: number, tier: string,
 *            signals: string[], eldLanguage: string, eldTopScore: number,
 *            eldReliable: boolean, cluster: object}}
 */
export function detectRomanization(text) {
    const detection = eld.detect(text);
    const scores = detection.getScores();
    const cluster = analyzeCluster(scores);
    // Highest raw ELD score, or 0 when ELD produced no scores at all.
    const ranked = Object.values(scores).sort((x, y) => y - x);
    const eldTopScore = ranked.length > 0 ? ranked[0] : 0;
    const verdict = isRomanized(text, detection.language, eldTopScore, detection.isReliable(), cluster);
    return {
        ...verdict,
        eldLanguage: detection.language,
        eldTopScore,
        eldReliable: detection.isReliable(),
        cluster,
    };
}
|
|
779
|
+
//# sourceMappingURL=romanization-detector.js.map
|