bekindprofanityfilter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTORS.md +106 -0
- package/LICENSE +22 -0
- package/README.md +1015 -0
- package/allprofanity.config.example.json +35 -0
- package/bin/init.js +49 -0
- package/config.schema.json +163 -0
- package/dist/algos/aho-corasick.d.ts +75 -0
- package/dist/algos/aho-corasick.js +238 -0
- package/dist/algos/aho-corasick.js.map +1 -0
- package/dist/algos/bloom-filter.d.ts +103 -0
- package/dist/algos/bloom-filter.js +208 -0
- package/dist/algos/bloom-filter.js.map +1 -0
- package/dist/algos/context-patterns.d.ts +102 -0
- package/dist/algos/context-patterns.js +484 -0
- package/dist/algos/context-patterns.js.map +1 -0
- package/dist/index.d.ts +1332 -0
- package/dist/index.js +2631 -0
- package/dist/index.js.map +1 -0
- package/dist/innocence-scoring.d.ts +23 -0
- package/dist/innocence-scoring.js +118 -0
- package/dist/innocence-scoring.js.map +1 -0
- package/dist/language-detector.d.ts +162 -0
- package/dist/language-detector.js +952 -0
- package/dist/language-detector.js.map +1 -0
- package/dist/language-dicts.d.ts +60 -0
- package/dist/language-dicts.js +2718 -0
- package/dist/language-dicts.js.map +1 -0
- package/dist/languages/arabic-words.d.ts +10 -0
- package/dist/languages/arabic-words.js +1649 -0
- package/dist/languages/arabic-words.js.map +1 -0
- package/dist/languages/bengali-words.d.ts +10 -0
- package/dist/languages/bengali-words.js +1696 -0
- package/dist/languages/bengali-words.js.map +1 -0
- package/dist/languages/brazilian-words.d.ts +10 -0
- package/dist/languages/brazilian-words.js +2122 -0
- package/dist/languages/brazilian-words.js.map +1 -0
- package/dist/languages/chinese-words.d.ts +10 -0
- package/dist/languages/chinese-words.js +2728 -0
- package/dist/languages/chinese-words.js.map +1 -0
- package/dist/languages/english-primary-all-languages.d.ts +23 -0
- package/dist/languages/english-primary-all-languages.js +36894 -0
- package/dist/languages/english-primary-all-languages.js.map +1 -0
- package/dist/languages/english-words.d.ts +5 -0
- package/dist/languages/english-words.js +5156 -0
- package/dist/languages/english-words.js.map +1 -0
- package/dist/languages/french-words.d.ts +10 -0
- package/dist/languages/french-words.js +2326 -0
- package/dist/languages/french-words.js.map +1 -0
- package/dist/languages/german-words.d.ts +10 -0
- package/dist/languages/german-words.js +2633 -0
- package/dist/languages/german-words.js.map +1 -0
- package/dist/languages/hindi-words.d.ts +10 -0
- package/dist/languages/hindi-words.js +2341 -0
- package/dist/languages/hindi-words.js.map +1 -0
- package/dist/languages/innocent-words.d.ts +41 -0
- package/dist/languages/innocent-words.js +109 -0
- package/dist/languages/innocent-words.js.map +1 -0
- package/dist/languages/italian-words.d.ts +10 -0
- package/dist/languages/italian-words.js +2287 -0
- package/dist/languages/italian-words.js.map +1 -0
- package/dist/languages/japanese-words.d.ts +11 -0
- package/dist/languages/japanese-words.js +2557 -0
- package/dist/languages/japanese-words.js.map +1 -0
- package/dist/languages/korean-words.d.ts +10 -0
- package/dist/languages/korean-words.js +2509 -0
- package/dist/languages/korean-words.js.map +1 -0
- package/dist/languages/russian-words.d.ts +10 -0
- package/dist/languages/russian-words.js +2175 -0
- package/dist/languages/russian-words.js.map +1 -0
- package/dist/languages/spanish-words.d.ts +11 -0
- package/dist/languages/spanish-words.js +2536 -0
- package/dist/languages/spanish-words.js.map +1 -0
- package/dist/languages/tamil-words.d.ts +10 -0
- package/dist/languages/tamil-words.js +1722 -0
- package/dist/languages/tamil-words.js.map +1 -0
- package/dist/languages/telugu-words.d.ts +10 -0
- package/dist/languages/telugu-words.js +1739 -0
- package/dist/languages/telugu-words.js.map +1 -0
- package/dist/romanization-detector.d.ts +50 -0
- package/dist/romanization-detector.js +779 -0
- package/dist/romanization-detector.js.map +1 -0
- package/package.json +79 -0
|
@@ -0,0 +1,952 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language Detector — hybrid ELD n-gram + script + vocabulary detection.
|
|
3
|
+
*
|
|
4
|
+
* Three detection layers:
|
|
5
|
+
* 1. **ELD n-gram detection**: Corpus-trained n-gram model (Nito-ELD) provides
|
|
6
|
+
* per-word language scores and full-text Bayesian priors.
|
|
7
|
+
* 2. **Script detection**: Unicode codepoint ranges map characters to
|
|
8
|
+
* language families instantly and deterministically.
|
|
9
|
+
* 3. **Vocabulary detection**: Trie-based word matching against per-language
|
|
10
|
+
* dictionaries disambiguates languages sharing a script.
|
|
11
|
+
*
|
|
12
|
+
* High-signal suffixes/prefixes (accent-bearing, 0.3+ weight) provide additional
|
|
13
|
+
* disambiguation for accented/diacritical text.
|
|
14
|
+
*
|
|
15
|
+
* Designed for use alongside the profanity filter to weight matches by detected
|
|
16
|
+
* language context.
|
|
17
|
+
*/
|
|
18
|
+
// @ts-ignore — eld ships as JS with .d.ts but no proper ESM types
|
|
19
|
+
import { eld } from "eld/small";
|
|
20
|
+
import { languageTries, phraseSets } from "./language-dicts.js";
|
|
21
|
+
import { detectRomanization } from "./romanization-detector.js";
|
|
22
|
+
/**
 * Languages that have trie vocabulary dictionaries.
 * ISO 639-1 codes; the tries themselves come from ./language-dicts.js
 * (see the `languageTries` import above).
 */
const TRIE_LANGUAGES = new Set([
    "en", "es", "fr", "de", "it", "pt", "nl", "tr",
    "ru", "ar", "zh", "ja", "ko", "hi", "bn", "ta", "te",
    "sv",
]);
|
|
28
|
+
/**
 * Unicode codepoint ranges mapped to the languages they indicate.
 * Ranges are checked in order; first match wins for each character.
 * Each entry: inclusive [start, end] codepoints plus candidate language codes.
 */
const SCRIPT_RANGES = [
    // Cyrillic (base block + supplement)
    { start: 0x0400, end: 0x04FF, languages: ["ru"] },
    { start: 0x0500, end: 0x052F, languages: ["ru"] },
    // Arabic (includes Urdu, Farsi — we tag as "ar" for now)
    { start: 0x0600, end: 0x06FF, languages: ["ar"] },
    { start: 0x0750, end: 0x077F, languages: ["ar"] }, // Arabic Supplement
    { start: 0x08A0, end: 0x08FF, languages: ["ar"] }, // Arabic Extended-A
    { start: 0xFB50, end: 0xFDFF, languages: ["ar"] }, // Presentation Forms-A
    { start: 0xFE70, end: 0xFEFF, languages: ["ar"] }, // Presentation Forms-B
    // Devanagari (Hindi, Marathi, Sanskrit)
    { start: 0x0900, end: 0x097F, languages: ["hi"] },
    { start: 0xA8E0, end: 0xA8FF, languages: ["hi"] }, // Devanagari Extended
    // Bengali
    { start: 0x0980, end: 0x09FF, languages: ["bn"] },
    // Tamil
    { start: 0x0B80, end: 0x0BFF, languages: ["ta"] },
    // Telugu
    { start: 0x0C00, end: 0x0C7F, languages: ["te"] },
    // Hangul (Korean): jamo, compatibility jamo, syllables, jamo extended-B
    { start: 0x1100, end: 0x11FF, languages: ["ko"] },
    { start: 0x3130, end: 0x318F, languages: ["ko"] },
    { start: 0xAC00, end: 0xD7AF, languages: ["ko"] },
    { start: 0xD7B0, end: 0xD7FF, languages: ["ko"] },
    // Japanese-specific kana: hiragana, katakana, phonetic ext., half-width
    { start: 0x3040, end: 0x309F, languages: ["ja"] },
    { start: 0x30A0, end: 0x30FF, languages: ["ja"] },
    { start: 0x31F0, end: 0x31FF, languages: ["ja"] },
    { start: 0xFF65, end: 0xFF9F, languages: ["ja"] },
    // CJK Unified Ideographs — shared by Chinese and Japanese
    // When we see CJK without any kana, it's more likely Chinese.
    // When kana is also present, the kana ranges already tag "ja".
    { start: 0x4E00, end: 0x9FFF, languages: ["zh", "ja"] },
    { start: 0x3400, end: 0x4DBF, languages: ["zh", "ja"] }, // Extension A
    { start: 0x20000, end: 0x2A6DF, languages: ["zh", "ja"] }, // Extension B
    { start: 0x2A700, end: 0x2B73F, languages: ["zh", "ja"] }, // Extension C
    { start: 0x2B740, end: 0x2B81F, languages: ["zh", "ja"] }, // Extension D
    { start: 0x3000, end: 0x303F, languages: ["zh", "ja"] }, // CJK symbols/punct.
    // Latin Extended — covers accented chars used in European languages.
    // We can't distinguish en/es/fr/de/it/pt from characters alone, but certain
    // diacritical marks are strong signals for specific languages.
    { start: 0x00C0, end: 0x00FF, languages: ["es", "fr", "de", "it", "pt", "en"] },
    { start: 0x0100, end: 0x017F, languages: ["es", "fr", "de", "it", "pt", "tr"] }, // Latin Extended-A
    { start: 0x0180, end: 0x024F, languages: ["es", "fr", "de", "it", "pt", "tr"] }, // Latin Extended-B
    // Turkish-specific characters
    // ğ (0x011F), ş (0x015F), ı (0x0131), İ (0x0130), ç (0x00E7 shared with FR/PT)
    // These are already covered by Latin Extended-A above, but we boost Turkish
    // via vocabulary detection when these are present.
];
|
|
81
|
+
/**
 * Specific diacritical characters that are strong signals for particular languages.
 * Maps codepoints to the language codes they indicate; consulted before
 * SCRIPT_RANGES in classifyChar().
 */
const DIACRITICAL_SIGNALS = new Map([
    // German-specific
    [0x00C4, ["de"]], // Ä
    [0x00D6, ["de"]], // Ö
    [0x00DC, ["de"]], // Ü
    [0x00E4, ["de"]], // ä
    [0x00F6, ["de"]], // ö
    [0x00FC, ["de"]], // ü
    [0x00DF, ["de"]], // ß
    // Spanish-specific
    [0x00D1, ["es"]], // Ñ
    [0x00F1, ["es"]], // ñ
    [0x00BF, ["es"]], // ¿
    [0x00A1, ["es"]], // ¡
    [0x00E1, ["es", "pt"]], // á
    [0x00C1, ["es", "pt"]], // Á
    [0x00ED, ["es"]], // í
    [0x00CD, ["es"]], // Í
    [0x00F3, ["es", "pt", "it"]], // ó
    [0x00D3, ["es", "pt", "it"]], // Ó
    [0x00FA, ["es", "pt"]], // ú
    [0x00DA, ["es", "pt"]], // Ú
    // French-specific accents (shared with others, but weighted)
    [0x00E9, ["fr", "es"]], // é
    [0x00C9, ["fr", "es"]], // É
    [0x00E0, ["fr"]], // à
    [0x00C0, ["fr"]], // À
    [0x00E8, ["fr"]], // è
    [0x00C8, ["fr"]], // È
    [0x00EA, ["fr"]], // ê
    [0x00EB, ["fr"]], // ë
    [0x00EE, ["fr"]], // î
    [0x00EF, ["fr"]], // ï
    [0x00F4, ["fr"]], // ô
    [0x00F9, ["fr"]], // ù
    [0x00FB, ["fr"]], // û
    [0x0153, ["fr"]], // œ
    [0x00E7, ["fr", "pt", "tr"]], // ç
    // Portuguese-specific
    [0x00E3, ["pt"]], // ã
    [0x00F5, ["pt"]], // õ
    // Turkish-specific
    [0x011E, ["tr"]], // Ğ
    [0x011F, ["tr"]], // ğ
    [0x015E, ["tr"]], // Ş
    [0x015F, ["tr"]], // ş
    [0x0130, ["tr"]], // İ (capital dotted I)
    [0x0131, ["tr"]], // ı (dotless i)
]);
|
|
134
|
+
// ---------------------------------------------------------------------------
// Global Bayesian prior — distribution-weighted
// ---------------------------------------------------------------------------
// As words are processed left-to-right, each language accumulates a running
// proportion of the text. For each subsequent word, the raw score is nudged
// by (1 + PRIOR_ALPHA * cumulativeProportion). This means languages that have
// already accumulated evidence get a proportional boost on ambiguous words,
// without needing predefined family groupings. Mixed-language text naturally
// gives both languages a boost; noise languages stay suppressed.
//
// PRIOR_ALPHA controls the strength: 0.2 means a language with 100% of the
// running distribution gets a 20% score boost; one with 50% gets 10%.
// The effective alpha is further scaled by entropyGate() below.
const PRIOR_ALPHA = 0.2;
|
|
147
|
+
/**
 * Shannon entropy of the cumulative distribution (in bits), mapped to a
 * confidence gate for the Bayesian prior.
 * Low entropy = 1-2 languages dominate → prior is confident, apply full alpha.
 * High entropy = many languages roughly equal → back off, let raw scores speak.
 *
 * @param shares - Map of language code → accumulated share (non-negative)
 * @param total - Sum of all shares
 * @returns Scaling factor 0..1 to multiply PRIOR_ALPHA by.
 */
function entropyGate(shares, total) {
    if (total === 0) return 0;
    // H = -Σ p·log2(p) over the normalized shares.
    const bits = [...shares.values()].reduce((h, count) => {
        const p = count / total;
        return p > 0 ? h - p * Math.log2(p) : h;
    }, 0);
    // Max entropy for N languages = log2(N). With 17 languages, max ≈ 4.09 bits.
    // Full alpha at/below LOW (1-2 dominant languages), zero at/above HIGH
    // (many roughly equal), linear ramp between the thresholds.
    const LOW = 1.5;
    const HIGH = 3.5;
    if (bits <= LOW) return 1.0;
    if (bits >= HIGH) return 0.0;
    return (HIGH - bits) / (HIGH - LOW);
}
|
|
174
|
+
// ---------------------------------------------------------------------------
|
|
175
|
+
// Detection implementation
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
/**
 * Classify a single character by Unicode script to one or more language codes.
 * Returns undefined for basic Latin (A-Z, 0-9) and common punctuation — those
 * are ambiguous and need vocabulary-based disambiguation.
 */
function classifyChar(codepoint) {
    // Diacritical signals are the strongest per-language indicators;
    // consult them before the broader script ranges.
    const signal = DIACRITICAL_SIGNALS.get(codepoint);
    if (signal) return signal;
    // Fall back to the ordered script ranges; the first containing range wins.
    const range = SCRIPT_RANGES.find(
        (r) => codepoint >= r.start && codepoint <= r.end,
    );
    return range?.languages;
}
|
|
195
|
+
/**
 * Script-based detection: count characters belonging to each language's script.
 * Returns raw character counts per language; a character that maps to several
 * candidate languages increments each of them.
 */
function detectByScript(text) {
    const counts = new Map();
    for (const ch of text) {
        const cp = ch.codePointAt(0);
        if (cp === undefined) continue;
        const candidates = classifyChar(cp);
        if (!candidates) continue;
        for (const code of candidates) {
            counts.set(code, (counts.get(code) ?? 0) + 1);
        }
    }
    return counts;
}
|
|
214
|
+
/**
 * Minimum per-word match score to count a word as a hit for a language.
 * Trie matches score fractionally (see detectByVocabulary: "thre" vs
 * "three" = 0.8), so this threshold filters weak/coincidental overlaps.
 */
const MIN_MATCH_SCORE = 0.6;
|
|
216
|
+
/**
 * Trie-based vocabulary detection: split text into words, score each word
 * against every language trie, aggregate scores per language.
 *
 * Each input word is walked against each language's trie in O(word_len).
 * Partial matches produce fractional scores (e.g., "thre" vs "three" = 0.8).
 * Phrase matching is done separately for multi-word expressions.
 */
function detectByVocabulary(text) {
    const scores = new Map();
    const lowered = text.toLowerCase();
    const bump = (lang, amount) => scores.set(lang, (scores.get(lang) ?? 0) + amount);
    // Tokenize Unicode-aware: letters, combining marks, apostrophes, hyphens.
    const tokens = lowered.match(/[\p{L}\p{M}'-]+/gu) ?? [];
    for (const token of tokens) {
        // Single characters are too ambiguous to score.
        if (token.length < 2) continue;
        for (const [lang, trie] of languageTries) {
            const score = trie.matchScore(token);
            if (score >= MIN_MATCH_SCORE) bump(lang, score);
        }
    }
    // Multi-word phrases are a much stronger signal: 5 points per match.
    for (const [lang, phrases] of phraseSets) {
        for (const phrase of phrases) {
            if (lowered.includes(phrase)) bump(lang, 5);
        }
    }
    return scores;
}
|
|
250
|
+
// ---------------------------------------------------------------------------
// Shannon entropy utility
// ---------------------------------------------------------------------------
/**
 * Compute Shannon entropy (in bits) over a distribution of raw scores.
 * Normalizes internally so inputs don't need to sum to 1.
 * Filters out negligible entries (< 1% of total) before computing.
 *
 * Reference values for N equally likely outcomes: H = log2(N)
 * - 2 langs: 1.00 bits
 * - 3 langs: 1.58 bits
 * - 4 langs: 2.00 bits
 * - 8 langs: 3.00 bits
 */
export function shannonEntropy(values) {
    const total = values.reduce((sum, v) => sum + v, 0);
    if (total <= 0) return 0;
    // Drop entries carrying < 1% of the mass, then renormalize over the rest.
    const significant = values.filter((v) => v / total > 0.01);
    const sigTotal = significant.reduce((sum, v) => sum + v, 0);
    if (sigTotal <= 0) return 0;
    return significant.reduce((h, v) => {
        const p = v / sigTotal;
        return p > 0 ? h - p * Math.log2(p) : h;
    }, 0);
}
|
|
281
|
+
/**
 * Compute how "scrappy" / SMS-like the text is.
 * Romanized text tends to be informal: short words, no punctuation, no caps,
 * abbreviations, repeated chars, number substitutions.
 *
 * ELD is trained on formal text, so high scrappiness = low ELD trust.
 *
 * Fix: removed leftover TypeScript downlevel artifacts (`var _a` and the
 * `=== null || === void 0` chains) in favor of native `?.` / `??` — the file
 * is already an ES module, so these operators are safe here. Behavior is
 * unchanged.
 *
 * @param text - Raw input text (may be empty).
 * @returns {{score: number, signals: string[]}} score in 0..1 (capped at 1.0)
 *   plus a human-readable label for each heuristic that fired.
 */
export function computeScrappiness(text) {
    const signals = [];
    let score = 0;
    const words = text.split(/\s+/).filter((w) => w.length > 0);
    if (words.length === 0)
        return { score: 0, signals: [] };
    // Average word length — romanized/SMS text has shorter words
    const avgWordLen = words.reduce((s, w) => s + w.length, 0) / words.length;
    if (avgWordLen < 3.5) {
        score += 0.20;
        signals.push(`short-words(${avgWordLen.toFixed(1)})`);
    }
    else if (avgWordLen < 4.5) {
        score += 0.10;
        signals.push(`medium-words(${avgWordLen.toFixed(1)})`);
    }
    // No punctuation — informal text often lacks periods, commas, etc.
    // text.length > 0 is guaranteed here because `words` is non-empty.
    const punctuation = text.match(/[.,;:!?'"()[\]{}]/g);
    const punctDensity = (punctuation?.length ?? 0) / text.length;
    if (punctDensity < 0.005) {
        score += 0.15;
        signals.push("no-punctuation");
    }
    // No capitalization (or all lowercase) — SMS/chat style
    const alphaChars = text.match(/[a-zA-Z]/g) ?? [];
    const upperCount = alphaChars.filter((c) => c >= "A" && c <= "Z").length;
    const upperRatio = alphaChars.length > 0 ? upperCount / alphaChars.length : 0;
    if (upperRatio === 0 && alphaChars.length > 10) {
        score += 0.15;
        signals.push("all-lowercase");
    }
    else if (upperRatio < 0.03 && alphaChars.length > 10) {
        score += 0.05;
        signals.push("mostly-lowercase");
    }
    // Number substitutions (common in chat: 3=e, 7=t, 0=o, 2=to, etc.)
    const numInWords = words.filter((w) => /\d/.test(w) && /[a-zA-Z]/.test(w)).length;
    if (numInWords >= 2) {
        score += 0.15;
        signals.push(`num-substitution(${numInWords})`);
    }
    // Repeated characters (lol → loool, pleaaase): 3+ of the same char in a row
    const repeats = text.match(/(.)\1{2,}/g);
    if (repeats && repeats.length >= 1) {
        score += 0.10;
        signals.push(`char-repeats(${repeats.length})`);
    }
    // High proportion of very short words (≤ 2 chars) — particle-heavy / SMS
    const shortWords = words.filter((w) => w.replace(/[^a-zA-Z]/g, "").length <= 2).length;
    const shortRatio = shortWords / words.length;
    if (shortRatio >= 0.40) {
        score += 0.15;
        signals.push(`short-word-ratio(${(shortRatio * 100).toFixed(0)}%)`);
    }
    else if (shortRatio >= 0.25) {
        score += 0.05;
        signals.push(`moderate-short-words(${(shortRatio * 100).toFixed(0)}%)`);
    }
    return { score: Math.min(1.0, score), signals };
}
|
|
349
|
+
/**
 * Compute how much to trust ELD's language detection.
 *
 * Returns a factor 0–1 that should multiply ELD contributions:
 * - 1.0: ELD is fully trusted (formal English/European text)
 * - 0.5: ELD is partially trusted (some romanization or scrappiness)
 * - 0.0: ELD is not trusted at all (strong romanization + very scrappy)
 */
export function computeEldPenalty(text) {
    const penalties = [];
    let totalPenalty = 0;
    const apply = (amount, label) => {
        totalPenalty += amount;
        penalties.push(label);
    };
    // Romanized text is unreliable for ELD's n-gram model.
    const rom = detectRomanization(text);
    if (rom.tier === "high") {
        apply(0.50, `romanization-high(${rom.confidence.toFixed(2)})`);
    }
    else if (rom.tier === "mixed") {
        apply(0.30, `romanization-mixed(${rom.confidence.toFixed(2)})`);
    }
    else if (rom.confidence >= 0.15) {
        apply(0.10, `romanization-low(${rom.confidence.toFixed(2)})`);
    }
    // Informal / SMS-like text also degrades ELD accuracy.
    const scrappy = computeScrappiness(text);
    if (scrappy.score >= 0.60) {
        apply(0.30, `scrappy-high(${scrappy.score.toFixed(2)})`);
    }
    else if (scrappy.score >= 0.35) {
        apply(0.15, `scrappy-moderate(${scrappy.score.toFixed(2)})`);
    }
    else if (scrappy.score >= 0.20) {
        apply(0.05, `scrappy-low(${scrappy.score.toFixed(2)})`);
    }
    // ELD entropy — when ELD spreads probability across many languages,
    // it's confused. Apply as multiplicative damping: 1/(1 + entropy).
    const eldScores = eld.detect(text).getScores();
    const textEntropy = shannonEntropy(Object.values(eldScores));
    const entropyDamping = 1 / (1 + textEntropy);
    if (textEntropy >= 1.3) {
        penalties.push(`eld-entropy(${textEntropy.toFixed(2)})`);
    }
    // Additive penalties become a 0–1 factor, then multiply by entropy damping.
    const factor = Math.max(0, (1.0 - totalPenalty) * entropyDamping);
    return { factor, penalties };
}
|
|
401
|
+
// ---------------------------------------------------------------------------
|
|
402
|
+
// Main detection
|
|
403
|
+
// ---------------------------------------------------------------------------
|
|
404
|
+
/**
|
|
405
|
+
* Detect languages present in the input text.
|
|
406
|
+
*
|
|
407
|
+
* Uses a two-layer approach:
|
|
408
|
+
* 1. **Script detection** — Unicode character ranges identify non-Latin scripts
|
|
409
|
+
* with high certainty (Cyrillic → ru, Devanagari → hi, Hangul → ko, etc.)
|
|
410
|
+
* 2. **Vocabulary detection** — common words and phrases disambiguate Latin-script
|
|
411
|
+
* languages (en vs es vs fr vs de vs it vs pt vs nl vs tr)
|
|
412
|
+
*
|
|
413
|
+
* Script detection is weighted 2× relative to vocabulary detection since it's
|
|
414
|
+
* more deterministic.
|
|
415
|
+
*
|
|
416
|
+
* @param text - The text to analyze
|
|
417
|
+
* @param options - Detection options
|
|
418
|
+
* @returns Detection result with languages sorted by proportion
|
|
419
|
+
*
|
|
420
|
+
* @example
|
|
421
|
+
* ```typescript
|
|
422
|
+
* const result = detectLanguages("Bonjour, comment allez-vous?");
|
|
423
|
+
* // → { languages: [{ language: "fr", proportion: 0.82, present: 1.0, wordCount: 3 }, ...], text: "..." }
|
|
424
|
+
*
|
|
425
|
+
* const result2 = detectLanguages("こんにちは世界");
|
|
426
|
+
* // → { languages: [{ language: "ja", proportion: 0.85 }, { language: "zh", proportion: 0.15 }], text: "..." }
|
|
427
|
+
* ```
|
|
428
|
+
*/
|
|
429
|
+
export function detectLanguages(text, options = {}) {
|
|
430
|
+
var _a, _b;
|
|
431
|
+
const { minConfidence = 0.05, maxLanguages = 5, priorWeights } = options;
|
|
432
|
+
if (!text || typeof text !== "string") {
|
|
433
|
+
return { languages: [], text: text !== null && text !== void 0 ? text : "", totalWords: 0 };
|
|
434
|
+
}
|
|
435
|
+
// Split text into words (Unicode-aware)
|
|
436
|
+
const words = text.match(/[\p{L}\p{M}'-]+/gu) || [];
|
|
437
|
+
if (words.length === 0) {
|
|
438
|
+
return { languages: [], text, totalWords: 0 };
|
|
439
|
+
}
|
|
440
|
+
// For each word, find which language it best belongs to (winner-take-all per word).
|
|
441
|
+
// Track: word count per language, and max single-word score per language (presence signal).
|
|
442
|
+
const langWordCounts = new Map();
|
|
443
|
+
const langMaxScore = new Map();
|
|
444
|
+
let scoredWords = 0;
|
|
445
|
+
// Running cumulative word shares per language — global Bayesian prior.
|
|
446
|
+
// As we process words left-to-right, the running distribution nudges ambiguous
|
|
447
|
+
// words toward languages that already have evidence (e.g., if text is mostly
|
|
448
|
+
// English, "con" leans English over Spanish). No family groupings needed.
|
|
449
|
+
const cumulativeShares = new Map();
|
|
450
|
+
let totalCumulativeShares = 0;
|
|
451
|
+
// Compute ELD penalty once at the text level — romanized / scrappy text
|
|
452
|
+
// should deflate our reliance on ELD n-gram scores everywhere.
|
|
453
|
+
const eldPenalty = computeEldPenalty(text);
|
|
454
|
+
// Seed the Bayesian prior from ELD full-text analysis.
|
|
455
|
+
// ELD gives corpus-trained n-gram priors for the entire text,
|
|
456
|
+
// which are then refined per-word by trie + script matching.
|
|
457
|
+
// eldPenalty.factor already includes romanization + scrappiness + text-level entropy.
|
|
458
|
+
const eldPriors = getEldTextPriors(text);
|
|
459
|
+
for (const [lang, weight] of Object.entries(eldPriors)) {
|
|
460
|
+
const penalized = weight * eldPenalty.factor;
|
|
461
|
+
if (penalized > 0.01) { // Only seed languages with meaningful ELD signal
|
|
462
|
+
cumulativeShares.set(lang, penalized);
|
|
463
|
+
totalCumulativeShares += penalized;
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
// Layer user-provided weights on top of ELD priors if given.
|
|
467
|
+
if (priorWeights) {
|
|
468
|
+
const weightSum = Object.values(priorWeights).reduce((s, w) => s + (w || 0), 0);
|
|
469
|
+
if (weightSum > 0) {
|
|
470
|
+
for (const [lang, w] of Object.entries(priorWeights)) {
|
|
471
|
+
if (w > 0) {
|
|
472
|
+
const normalized = w / weightSum;
|
|
473
|
+
cumulativeShares.set(lang, (cumulativeShares.get(lang) || 0) + normalized);
|
|
474
|
+
totalCumulativeShares += normalized;
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
for (const word of words) {
|
|
480
|
+
if (word.length < 2)
|
|
481
|
+
continue; // Skip single chars
|
|
482
|
+
const wordScores = scoreWord(word, eldPenalty.factor);
|
|
483
|
+
const entries = Object.entries(wordScores);
|
|
484
|
+
if (entries.length === 0)
|
|
485
|
+
continue;
|
|
486
|
+
scoredWords++;
|
|
487
|
+
// Track max score per language (presence signal)
|
|
488
|
+
for (const [lang, score] of entries) {
|
|
489
|
+
langMaxScore.set(lang, Math.max(langMaxScore.get(lang) || 0, score));
|
|
490
|
+
}
|
|
491
|
+
// ── Tension-gated Bayesian prior ──
|
|
492
|
+
// Only apply when the word is genuinely contested (runner-up >= 50% of leader).
|
|
493
|
+
// Clear signals (Devanagari → Hindi, "the" → English) pass through unmodified.
|
|
494
|
+
// Entropy-gated: backs off when the cumulative distribution is flat.
|
|
495
|
+
const sorted = [...entries].sort((a, b) => b[1] - a[1]);
|
|
496
|
+
const topScore = ((_a = sorted[0]) === null || _a === void 0 ? void 0 : _a[1]) || 0;
|
|
497
|
+
const runnerUp = ((_b = sorted[1]) === null || _b === void 0 ? void 0 : _b[1]) || 0;
|
|
498
|
+
const tension = topScore > 0 && runnerUp > 0 ? runnerUp / topScore : 0;
|
|
499
|
+
const TENSION_THRESHOLD = 0.5;
|
|
500
|
+
const hasTension = tension >= TENSION_THRESHOLD;
|
|
501
|
+
let adjustedEntries;
|
|
502
|
+
if (hasTension && totalCumulativeShares > 0) {
|
|
503
|
+
const gate = entropyGate(cumulativeShares, totalCumulativeShares);
|
|
504
|
+
const effectiveAlpha = PRIOR_ALPHA * gate;
|
|
505
|
+
adjustedEntries = entries.map(([lang, score]) => {
|
|
506
|
+
if (effectiveAlpha === 0)
|
|
507
|
+
return [lang, score];
|
|
508
|
+
const cumProportion = (cumulativeShares.get(lang) || 0) / totalCumulativeShares;
|
|
509
|
+
return [lang, score * (1 + effectiveAlpha * cumProportion)];
|
|
510
|
+
});
|
|
511
|
+
}
|
|
512
|
+
else {
|
|
513
|
+
// No tension or no prior context — pass through unmodified
|
|
514
|
+
adjustedEntries = entries;
|
|
515
|
+
}
|
|
516
|
+
// ── Competitive cutoff: drop tail languages that aren't competitive ──
|
|
517
|
+
// Only languages scoring >= 50% of the top scorer get any share.
|
|
518
|
+
// This prevents noise (e.g., "allé" giving share to Arabic via bigrams)
|
|
519
|
+
// from diluting the true language's proportion.
|
|
520
|
+
const peakScore = adjustedEntries.reduce((mx, [, s]) => Math.max(mx, s), 0);
|
|
521
|
+
const COMPETITIVE_RATIO = 0.4;
|
|
522
|
+
const competitive = adjustedEntries.filter(([, s]) => s >= peakScore * COMPETITIVE_RATIO);
|
|
523
|
+
// Boost exact matches: if a language scored 1.0 (perfect trie + script +
|
|
524
|
+
// bigram), double its raw score so it captures the majority of this word's
|
|
525
|
+
// budget even among other competitive languages.
|
|
526
|
+
const boostedEntries = competitive.map(([lang, score]) => [
|
|
527
|
+
lang,
|
|
528
|
+
score >= 1.0 ? score * 2 : score,
|
|
529
|
+
]);
|
|
530
|
+
const totalScore = boostedEntries.reduce((sum, [, s]) => sum + s, 0);
|
|
531
|
+
if (totalScore > 0) {
|
|
532
|
+
for (const [lang, score] of boostedEntries) {
|
|
533
|
+
const share = score / totalScore; // Normalized share of this word
|
|
534
|
+
langWordCounts.set(lang, (langWordCounts.get(lang) || 0) + share);
|
|
535
|
+
// Update running cumulative for next word's prior
|
|
536
|
+
cumulativeShares.set(lang, (cumulativeShares.get(lang) || 0) + share);
|
|
537
|
+
totalCumulativeShares += share;
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
// Add phrase bonus as fractional word count (a matched phrase = +2 words for that language)
|
|
542
|
+
const textLower = text.toLowerCase();
|
|
543
|
+
for (const [lang, phrases] of phraseSets) {
|
|
544
|
+
for (const phrase of phrases) {
|
|
545
|
+
if (textLower.includes(phrase)) {
|
|
546
|
+
langWordCounts.set(lang, (langWordCounts.get(lang) || 0) + 2);
|
|
547
|
+
scoredWords += 2;
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
// If nothing matched, default to English for Latin text
|
|
552
|
+
if (langWordCounts.size === 0) {
|
|
553
|
+
if (/[a-zA-Z]/.test(text)) {
|
|
554
|
+
langWordCounts.set("en", scoredWords || 1);
|
|
555
|
+
if (scoredWords === 0)
|
|
556
|
+
scoredWords = 1;
|
|
557
|
+
}
|
|
558
|
+
else {
|
|
559
|
+
return { languages: [], text, totalWords: words.length };
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
// Build results: proportion = wordCount / totalScoredWords
|
|
563
|
+
// present = max single-word score for that language (1.0 = definitively present)
|
|
564
|
+
let languages = Array.from(langWordCounts.entries())
|
|
565
|
+
.map(([language, wc]) => {
|
|
566
|
+
const proportion = Math.round((wc / scoredWords) * 1000) / 1000;
|
|
567
|
+
const present = Math.round((langMaxScore.get(language) || 0) * 1000) / 1000;
|
|
568
|
+
return {
|
|
569
|
+
language,
|
|
570
|
+
present,
|
|
571
|
+
proportion,
|
|
572
|
+
wordCount: Math.round(wc * 100) / 100,
|
|
573
|
+
};
|
|
574
|
+
})
|
|
575
|
+
.filter((d) => d.proportion >= minConfidence || d.present >= 0.5)
|
|
576
|
+
.sort((a, b) => {
|
|
577
|
+
// Sort primarily by proportion, use present as tiebreaker
|
|
578
|
+
if (Math.abs(a.proportion - b.proportion) > 0.05) {
|
|
579
|
+
return b.proportion - a.proportion;
|
|
580
|
+
}
|
|
581
|
+
return b.present - a.present;
|
|
582
|
+
})
|
|
583
|
+
.slice(0, Math.max(maxLanguages, 10)); // Keep top 10 for re-normalization
|
|
584
|
+
// Re-normalize proportions among the top N candidates so they sum to ~1.0.
|
|
585
|
+
// Without this, Latin-family languages dilute each other (e.g., "de" the word
|
|
586
|
+
// splits across French, Spanish, Portuguese, etc.) and even long monolingual
|
|
587
|
+
// texts end up with the true language at only 0.3–0.4 proportion.
|
|
588
|
+
const topNSum = languages.reduce((sum, d) => sum + d.proportion, 0);
|
|
589
|
+
if (topNSum > 0) {
|
|
590
|
+
for (const lang of languages) {
|
|
591
|
+
lang.proportion = Math.round((lang.proportion / topNSum) * 1000) / 1000;
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
// Final slice to requested maxLanguages after re-normalization
|
|
595
|
+
languages = languages.slice(0, maxLanguages);
|
|
596
|
+
return { languages, text, totalWords: words.length };
|
|
597
|
+
}
|
|
598
|
+
/**
 * Get the single best-matching language for the text.
 *
 * Runs the full detector limited to one candidate; when detection yields no
 * candidates (e.g. text with no recognizable script), a zeroed English
 * placeholder is returned instead of `undefined`.
 *
 * @param text - The text to analyze
 * @returns The top language detection, or a default English result if unknown
 *
 * @example
 * ```typescript
 * const best = detectBestLanguage("Hola, ¿cómo estás?");
 * // → { language: "es", present: 1.0, proportion: 0.91, wordCount: 3 }
 * ```
 */
export function detectBestLanguage(text) {
    const detection = detectLanguages(text, { maxLanguages: 1 });
    const top = detection.languages[0];
    if (top !== null && top !== undefined) {
        return top;
    }
    // Nothing detected — fall back to a neutral English result.
    return {
        language: "en",
        present: 0,
        proportion: 0,
        wordCount: 0,
    };
}
|
|
620
|
+
/**
 * Detect the script/language of a single word based purely on its characters.
 * Useful for classifying individual profanity dictionary entries.
 *
 * @param word - A single word to classify
 * @returns Array of language codes the word's characters belong to, sorted by
 *          character count (most characters → first). Empty if purely basic Latin.
 *
 * @example
 * ```typescript
 * classifyWordScript("Scheiße") // → ["de"]
 * classifyWordScript("くそ")     // → ["ja"]
 * classifyWordScript("fuck")     // → [] (basic Latin, ambiguous)
 * classifyWordScript("씨발")     // → ["ko"]
 * ```
 */
export function classifyWordScript(word) {
    const scriptCounts = detectByScript(word);
    // Rank languages by how many of the word's characters fall in their script.
    const ranked = [...scriptCounts.entries()];
    ranked.sort((a, b) => b[1] - a[1]);
    return ranked.map((entry) => entry[0]);
}
|
|
644
|
+
// ---------------------------------------------------------------------------
|
|
645
|
+
// ELD n-gram helper — per-word language scores from corpus-trained model
|
|
646
|
+
// ---------------------------------------------------------------------------
|
|
647
|
+
/**
 * Get ELD n-gram scores for a single word, normalized to [0, 1].
 * Only scores above a minimum threshold (0.1) are kept; everything below is
 * treated as noise and dropped from the result.
 */
function getEldWordScores(word) {
    const raw = eld.detect(word).getScores();
    // Keep only languages with a meaningful n-gram signal for this word.
    return Object.fromEntries(Object.entries(raw).filter(([, value]) => value > 0.1));
}
|
|
662
|
+
/**
 * Get ELD n-gram scores for the full text, used as Bayesian priors.
 * Scores are normalized so positive entries sum to ~1.0; returns an empty
 * object when ELD produces no positive mass for the text.
 */
function getEldTextPriors(text) {
    const raw = eld.detect(text).getScores();
    const mass = Object.values(raw).reduce((acc, value) => acc + value, 0);
    const priors = {};
    if (mass > 0) {
        for (const [lang, value] of Object.entries(raw)) {
            if (value > 0) {
                priors[lang] = value / mass;
            }
        }
    }
    return priors;
}
|
|
680
|
+
// Word endings that strongly indicate a specific language. Weights of 0.8+
// are near-unique fingerprints (often accent-bearing); lower weights are
// merely suggestive. findHighSignalSuffixes() keeps only the highest-weight
// match per language, so ordering within each list is cosmetic.
const HIGH_SIGNAL_SUFFIXES = {
    // Spanish nominal/verbal endings; the ñ-bearing forms are unambiguous.
    es: [
        { pattern: "ción", weight: 0.8 },
        { pattern: "sión", weight: 0.8 },
        { pattern: "ñol", weight: 0.9 },
        { pattern: "ñar", weight: 0.9 },
        { pattern: "imiento", weight: 0.8 },
        { pattern: "miento", weight: 0.7 },
        { pattern: "ería", weight: 0.7 },
    ],
    // French endings; accent-bearing forms are highly distinctive.
    fr: [
        { pattern: "amment", weight: 0.9 },
        { pattern: "ément", weight: 0.9 },
        { pattern: "eaux", weight: 0.8 },
        { pattern: "euse", weight: 0.7 },
        { pattern: "ière", weight: 0.7 },
        { pattern: "eux", weight: 0.6 },
        { pattern: "ée", weight: 0.5 },
        { pattern: "ées", weight: 0.6 },
        { pattern: "esque", weight: 0.5 },
    ],
    // German endings; umlaut/ß-bearing forms practically guarantee German.
    de: [
        { pattern: "schaft", weight: 0.8 },
        { pattern: "mäßig", weight: 0.9 },
        { pattern: "igkeit", weight: 0.8 },
        { pattern: "ierung", weight: 0.8 },
        { pattern: "keit", weight: 0.8 },
        { pattern: "heit", weight: 0.7 },
        { pattern: "lich", weight: 0.35 },
        { pattern: "isch", weight: 0.3 },
        { pattern: "ung", weight: 0.35 },
        { pattern: "ös", weight: 0.3 },
    ],
    // Portuguese endings; ã/ç-bearing forms are unambiguous.
    pt: [
        { pattern: "ção", weight: 0.9 },
        { pattern: "ções", weight: 0.9 },
        { pattern: "ável", weight: 0.8 },
        { pattern: "ível", weight: 0.8 },
        { pattern: "ância", weight: 0.8 },
        { pattern: "ência", weight: 0.8 },
    ],
    // Turkish endings; dotless-ı / ü / ğ / ç forms are unambiguous.
    tr: [
        { pattern: "lık", weight: 0.8 },
        { pattern: "lığ", weight: 0.9 },
        { pattern: "lük", weight: 0.8 },
        { pattern: "lüğ", weight: 0.9 },
        { pattern: "sız", weight: 0.8 },
        { pattern: "süz", weight: 0.8 },
        { pattern: "ınca", weight: 0.8 },
        { pattern: "ünce", weight: 0.8 },
        { pattern: "dır", weight: 0.5 },
        { pattern: "dür", weight: 0.5 },
        { pattern: "siz", weight: 0.4 },
        { pattern: "suz", weight: 0.4 },
        { pattern: "çı", weight: 0.3 },
    ],
    // Japanese romaji verb endings — highly distinctive.
    ja: [
        { pattern: "mashita", weight: 0.9 },
        { pattern: "masen", weight: 0.9 },
        { pattern: "masu", weight: 0.8 },
        { pattern: "desu", weight: 0.8 },
    ],
    // Korean romanization endings — highly distinctive.
    ko: [
        { pattern: "imnida", weight: 0.9 },
        { pattern: "haseo", weight: 0.9 },
        { pattern: "hamni", weight: 0.8 },
        { pattern: "eoyo", weight: 0.8 },
    ],
    // Russian romanization endings (patronymics, abstract nouns).
    ru: [
        { pattern: "ovich", weight: 0.9 },
        { pattern: "evich", weight: 0.9 },
        { pattern: "ovna", weight: 0.9 },
        { pattern: "evna", weight: 0.9 },
        { pattern: "stvo", weight: 0.8 },
        { pattern: "nost", weight: 0.5 },
        { pattern: "skaya", weight: 0.3 },
        { pattern: "skoye", weight: 0.3 },
    ],
    // Dutch suffixes — weakly suggestive only.
    nl: [
        { pattern: "heid", weight: 0.3 },
        { pattern: "schap", weight: 0.3 },
    ],
    // Arabic romanization — weakly suggestive only.
    ar: [
        { pattern: "ullah", weight: 0.3 },
        { pattern: "allah", weight: 0.3 },
    ],
};
|
|
746
|
+
// ---------------------------------------------------------------------------
|
|
747
|
+
// High-signal prefixes — only accent-bearing / highly distinctive (0.3+)
|
|
748
|
+
// ---------------------------------------------------------------------------
|
|
749
|
+
// Word beginnings that strongly indicate a specific language. As with the
// suffix table, 0.8+ weights are near-unique (accent-bearing) fingerprints.
// findHighSignalPrefixes() requires the word to extend at least two
// characters beyond the prefix, and keeps the highest weight per language.
const HIGH_SIGNAL_PREFIXES = {
    // German prefixes; umlaut-bearing forms practically guarantee German.
    de: [
        { pattern: "über", weight: 0.9 },
        { pattern: "zurück", weight: 0.9 },
        { pattern: "wieder", weight: 0.5 },
        { pattern: "unter", weight: 0.3 },
    ],
    // French accent-bearing verbal prefixes.
    fr: [
        { pattern: "dé", weight: 0.35 },
        { pattern: "ré", weight: 0.3 },
        { pattern: "pré", weight: 0.35 },
    ],
    // Turkish prefixes; ş/ü-bearing forms are highly distinctive.
    tr: [
        { pattern: "karşı", weight: 0.9 },
        { pattern: "üst", weight: 0.7 },
        { pattern: "geri", weight: 0.3 },
        { pattern: "ön", weight: 0.3 },
    ],
};
|
|
766
|
+
// ---------------------------------------------------------------------------
|
|
767
|
+
// High-signal suffix/prefix matching helpers
|
|
768
|
+
// ---------------------------------------------------------------------------
|
|
769
|
+
/**
 * Match a word (case-insensitively) against HIGH_SIGNAL_SUFFIXES.
 *
 * @param word - The word to examine
 * @returns Map of language → highest matching suffix weight (only languages
 *          with at least one matching suffix appear)
 */
function findHighSignalSuffixes(word) {
    const lowered = word.toLowerCase();
    const best = new Map();
    for (const [lang, affixes] of Object.entries(HIGH_SIGNAL_SUFFIXES)) {
        for (const affix of affixes) {
            if (!lowered.endsWith(affix.pattern)) {
                continue;
            }
            // Keep only the strongest suffix signal per language.
            const previous = best.get(lang) || 0;
            if (affix.weight > previous) {
                best.set(lang, affix.weight);
            }
        }
    }
    return best;
}
|
|
783
|
+
/**
 * Match a word (case-insensitively) against HIGH_SIGNAL_PREFIXES.
 * A prefix only counts when the word runs at least two characters past it,
 * so the bare prefix itself (or prefix + 1 char) never triggers.
 *
 * @param word - The word to examine
 * @returns Map of language → highest matching prefix weight
 */
function findHighSignalPrefixes(word) {
    const lowered = word.toLowerCase();
    const best = new Map();
    for (const [lang, affixes] of Object.entries(HIGH_SIGNAL_PREFIXES)) {
        for (const affix of affixes) {
            const longEnough = lowered.length > affix.pattern.length + 1;
            if (!longEnough || !lowered.startsWith(affix.pattern)) {
                continue;
            }
            // Keep only the strongest prefix signal per language.
            const previous = best.get(lang) || 0;
            if (affix.weight > previous) {
                best.set(lang, affix.weight);
            }
        }
    }
    return best;
}
|
|
797
|
+
/**
|
|
798
|
+
* Score a single word against ALL languages (0–1 each).
|
|
799
|
+
*
|
|
800
|
+
* Combines four signals:
|
|
801
|
+
* 1. **Script** (weight 1.0) — Unicode chars belonging to a language's script.
|
|
802
|
+
* 2. **Trie** (weight 0.8) — vocabulary match via per-language tries.
|
|
803
|
+
* 3. **ELD n-grams** (weight 0.6) — corpus-trained n-gram model per word.
|
|
804
|
+
* 4. **High-signal suffix/prefix** — accent-bearing patterns (0.3+ weight).
|
|
805
|
+
*
|
|
806
|
+
* @param word - A single word to score
|
|
807
|
+
* @param eldPenaltyFactor - Multiplier (0–1) to deflate ELD n-gram contributions.
|
|
808
|
+
* 1.0 = full ELD trust (default), 0.0 = ignore ELD entirely.
|
|
809
|
+
* @returns Map of language → score (only includes languages with score > 0)
|
|
810
|
+
*/
|
|
811
|
+
export function scoreWord(word, eldPenaltyFactor = 1.0) {
    // Accumulator: language code → raw (later normalized) score.
    const scores = {};
    const charLen = [...word].length; // Unicode-safe length (code points, not UTF-16 units)
    if (charLen === 0)
        return scores;
    // ── Detect unique language fingerprints → guaranteed 1.0 floor ──
    const scriptCounts = detectByScript(word);
    // Count chars whose Unicode range maps to exactly one language.
    const uniqueLangChars = new Map();
    for (const char of word) {
        const cp = char.codePointAt(0);
        if (cp === undefined)
            continue;
        // Skip diacritics shared across languages — they are not unique signals.
        if (DIACRITICAL_SIGNALS.has(cp))
            continue;
        for (const range of SCRIPT_RANGES) {
            if (cp >= range.start && cp <= range.end) {
                // Only ranges owned by a single language count as a fingerprint.
                if (range.languages.length === 1) {
                    const lang = range.languages[0];
                    uniqueLangChars.set(lang, (uniqueLangChars.get(lang) || 0) + 1);
                }
                break;
            }
        }
    }
    const guaranteedLangs = new Set();
    // 2+ unique script chars → guaranteed 1.0 (one char could be a stray symbol)
    for (const [lang, count] of uniqueLangChars) {
        if (count >= 2) {
            guaranteedLangs.add(lang);
        }
    }
    // Unique exact vocabulary match (len > 4, only 1 trie matches) → guaranteed 1.0.
    // Short words are excluded: they collide across Latin-family vocabularies.
    if (charLen > 4) {
        const exactMatches = [];
        for (const [lang, trie] of languageTries) {
            if (trie.matchScore(word) === 1.0) {
                exactMatches.push(lang);
            }
        }
        if (exactMatches.length === 1) {
            guaranteedLangs.add(exactMatches[0]);
        }
    }
    // ── Scoring weights (relative trust: script > vocabulary > n-grams) ──
    const SCRIPT_WEIGHT = 1.0;
    const TRIE_WEIGHT = 0.8;
    const ELD_WEIGHT = 0.6;
    // Layer 1: Script detection — fraction of the word's chars in each script.
    if (charLen > 0) {
        for (const [lang, count] of scriptCounts) {
            const scriptScore = count / charLen;
            scores[lang] = (scores[lang] || 0) + scriptScore * SCRIPT_WEIGHT;
        }
    }
    // CJK disambiguation: pure CJK ideographs without kana → skew Chinese.
    // Only applied to longer runs (>= 10 chars) where the absence of kana is
    // statistically meaningful.
    if (charLen >= 10 && scriptCounts.has("zh") && scriptCounts.has("ja")) {
        const hasKana = [...word].some((ch) => {
            const cp = ch.codePointAt(0);
            // Hiragana, Katakana, Katakana Phonetic Extensions, Halfwidth Katakana.
            return (cp >= 0x3040 && cp <= 0x309f) || (cp >= 0x30a0 && cp <= 0x30ff) ||
                (cp >= 0x31f0 && cp <= 0x31ff) || (cp >= 0xff65 && cp <= 0xff9f);
        });
        if (!hasKana) {
            scores["zh"] = (scores["zh"] || 0) + 0.5;
            // Halve any Japanese score (writes 0 if "ja" had no score yet).
            scores["ja"] = (scores["ja"] || 0) * 0.5;
        }
    }
    // Layer 2: Trie matching — per-language vocabulary lookup.
    for (const [lang, trie] of languageTries) {
        const trieScore = trie.matchScore(word);
        if (trieScore > 0) {
            let effectiveScore;
            if (trieScore === 1.0) {
                // Exact vocabulary hit → full trie weight.
                effectiveScore = TRIE_WEIGHT;
            }
            else {
                // Partial match: deflate heavily (×0.1), then reward longer
                // matched spans with an exponential bonus capped at ×3.
                const matchedChars = Math.round(trieScore * charLen);
                const charBonus = Math.pow(1.1, matchedChars);
                effectiveScore = trieScore * TRIE_WEIGHT * 0.1 * Math.min(charBonus, 3);
            }
            const langKey = lang;
            scores[langKey] = (scores[langKey] || 0) + effectiveScore;
        }
    }
    // Layer 3: ELD n-gram scores (corpus-trained, replaces hand-tuned bigrams/trigrams)
    // Two ELD penalties stack multiplicatively:
    //   1. eldPenaltyFactor — text-level penalty from romanization + scrappiness
    //   2. Per-word entropy — if ELD is confused about THIS word, dampen further
    // Words shorter than 3 chars carry too little n-gram signal and are skipped.
    if (charLen >= 3) {
        const eldScores = getEldWordScores(word);
        const eldValues = Object.values(eldScores);
        // Per-word entropy: high entropy = ELD can't decide → dampen its scores.
        // 1 bit = 2 equally likely langs (ok), 2+ bits = 4+ langs (confused).
        // Damping: 1.0 at 0 bits, ~0.5 at 2 bits, ~0.25 at 3 bits.
        const wordEntropy = shannonEntropy(eldValues);
        const entropyDamping = 1 / (1 + wordEntropy);
        const effectiveEldWeight = ELD_WEIGHT * eldPenaltyFactor * entropyDamping;
        // Skip the whole layer when the damped weight is negligible.
        if (effectiveEldWeight > 0.01) {
            for (const [lang, score] of Object.entries(eldScores)) {
                scores[lang] = (scores[lang] || 0) + score * effectiveEldWeight;
            }
        }
    }
    // Normalize: max possible is SCRIPT + TRIE + ELD. Scores below 0.01 are
    // pruned here, BEFORE the affix layers, so a weak base score cannot be
    // resurrected by the multiplicative affix boost — only replaced by the
    // affix weight itself.
    const maxPossible = SCRIPT_WEIGHT + TRIE_WEIGHT + ELD_WEIGHT;
    for (const lang of Object.keys(scores)) {
        scores[lang] = Math.min(1, scores[lang] / maxPossible);
        if (scores[lang] < 0.01)
            delete scores[lang];
    }
    // Layer 4: High-signal suffix matching (accent-bearing, 0.3+ weight only).
    // With an existing base score: multiplicative boost plus additive weight;
    // without one: the affix weight stands alone.
    const suffixMatches = findHighSignalSuffixes(word);
    for (const [lang, weight] of suffixMatches) {
        const base = scores[lang] || 0;
        if (base > 0) {
            scores[lang] = Math.min(1, base * 1.3 + weight);
        }
        else {
            scores[lang] = weight;
        }
    }
    // Layer 5: High-signal prefix matching (accent-bearing, 0.3+ weight only).
    // Slightly weaker boost (×1.2) than suffixes (×1.3).
    const prefixMatches = findHighSignalPrefixes(word);
    for (const [lang, weight] of prefixMatches) {
        const base = scores[lang] || 0;
        if (base > 0) {
            scores[lang] = Math.min(1, base * 1.2 + weight);
        }
        else {
            scores[lang] = weight;
        }
    }
    // Apply guaranteed 1.0 floor for uniquely-fingerprinted languages.
    // Applied last (before rounding) so no earlier layer can lower it.
    for (const lang of guaranteedLangs) {
        scores[lang] = 1.0;
    }
    // Clip all scores to 3 decimal places
    for (const lang of Object.keys(scores)) {
        scores[lang] = Math.round(scores[lang] * 1000) / 1000;
    }
    return scores;
}
|
|
952
|
+
//# sourceMappingURL=language-detector.js.map
|