bekindprofanityfilter 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/CONTRIBUTORS.md +106 -0
  2. package/LICENSE +22 -0
  3. package/README.md +1015 -0
  4. package/allprofanity.config.example.json +35 -0
  5. package/bin/init.js +49 -0
  6. package/config.schema.json +163 -0
  7. package/dist/algos/aho-corasick.d.ts +75 -0
  8. package/dist/algos/aho-corasick.js +238 -0
  9. package/dist/algos/aho-corasick.js.map +1 -0
  10. package/dist/algos/bloom-filter.d.ts +103 -0
  11. package/dist/algos/bloom-filter.js +208 -0
  12. package/dist/algos/bloom-filter.js.map +1 -0
  13. package/dist/algos/context-patterns.d.ts +102 -0
  14. package/dist/algos/context-patterns.js +484 -0
  15. package/dist/algos/context-patterns.js.map +1 -0
  16. package/dist/index.d.ts +1332 -0
  17. package/dist/index.js +2631 -0
  18. package/dist/index.js.map +1 -0
  19. package/dist/innocence-scoring.d.ts +23 -0
  20. package/dist/innocence-scoring.js +118 -0
  21. package/dist/innocence-scoring.js.map +1 -0
  22. package/dist/language-detector.d.ts +162 -0
  23. package/dist/language-detector.js +952 -0
  24. package/dist/language-detector.js.map +1 -0
  25. package/dist/language-dicts.d.ts +60 -0
  26. package/dist/language-dicts.js +2718 -0
  27. package/dist/language-dicts.js.map +1 -0
  28. package/dist/languages/arabic-words.d.ts +10 -0
  29. package/dist/languages/arabic-words.js +1649 -0
  30. package/dist/languages/arabic-words.js.map +1 -0
  31. package/dist/languages/bengali-words.d.ts +10 -0
  32. package/dist/languages/bengali-words.js +1696 -0
  33. package/dist/languages/bengali-words.js.map +1 -0
  34. package/dist/languages/brazilian-words.d.ts +10 -0
  35. package/dist/languages/brazilian-words.js +2122 -0
  36. package/dist/languages/brazilian-words.js.map +1 -0
  37. package/dist/languages/chinese-words.d.ts +10 -0
  38. package/dist/languages/chinese-words.js +2728 -0
  39. package/dist/languages/chinese-words.js.map +1 -0
  40. package/dist/languages/english-primary-all-languages.d.ts +23 -0
  41. package/dist/languages/english-primary-all-languages.js +36894 -0
  42. package/dist/languages/english-primary-all-languages.js.map +1 -0
  43. package/dist/languages/english-words.d.ts +5 -0
  44. package/dist/languages/english-words.js +5156 -0
  45. package/dist/languages/english-words.js.map +1 -0
  46. package/dist/languages/french-words.d.ts +10 -0
  47. package/dist/languages/french-words.js +2326 -0
  48. package/dist/languages/french-words.js.map +1 -0
  49. package/dist/languages/german-words.d.ts +10 -0
  50. package/dist/languages/german-words.js +2633 -0
  51. package/dist/languages/german-words.js.map +1 -0
  52. package/dist/languages/hindi-words.d.ts +10 -0
  53. package/dist/languages/hindi-words.js +2341 -0
  54. package/dist/languages/hindi-words.js.map +1 -0
  55. package/dist/languages/innocent-words.d.ts +41 -0
  56. package/dist/languages/innocent-words.js +109 -0
  57. package/dist/languages/innocent-words.js.map +1 -0
  58. package/dist/languages/italian-words.d.ts +10 -0
  59. package/dist/languages/italian-words.js +2287 -0
  60. package/dist/languages/italian-words.js.map +1 -0
  61. package/dist/languages/japanese-words.d.ts +11 -0
  62. package/dist/languages/japanese-words.js +2557 -0
  63. package/dist/languages/japanese-words.js.map +1 -0
  64. package/dist/languages/korean-words.d.ts +10 -0
  65. package/dist/languages/korean-words.js +2509 -0
  66. package/dist/languages/korean-words.js.map +1 -0
  67. package/dist/languages/russian-words.d.ts +10 -0
  68. package/dist/languages/russian-words.js +2175 -0
  69. package/dist/languages/russian-words.js.map +1 -0
  70. package/dist/languages/spanish-words.d.ts +11 -0
  71. package/dist/languages/spanish-words.js +2536 -0
  72. package/dist/languages/spanish-words.js.map +1 -0
  73. package/dist/languages/tamil-words.d.ts +10 -0
  74. package/dist/languages/tamil-words.js +1722 -0
  75. package/dist/languages/tamil-words.js.map +1 -0
  76. package/dist/languages/telugu-words.d.ts +10 -0
  77. package/dist/languages/telugu-words.js +1739 -0
  78. package/dist/languages/telugu-words.js.map +1 -0
  79. package/dist/romanization-detector.d.ts +50 -0
  80. package/dist/romanization-detector.js +779 -0
  81. package/dist/romanization-detector.js.map +1 -0
  82. package/package.json +79 -0
@@ -0,0 +1,952 @@
1
+ /**
2
+ * Language Detector — hybrid ELD n-gram + script + vocabulary detection.
3
+ *
4
+ * Three detection layers:
5
+ * 1. **ELD n-gram detection**: Corpus-trained n-gram model (Nito-ELD) provides
6
+ * per-word language scores and full-text Bayesian priors.
7
+ * 2. **Script detection**: Unicode codepoint ranges map characters to
8
+ * language families instantly and deterministically.
9
+ * 3. **Vocabulary detection**: Trie-based word matching against per-language
10
+ * dictionaries disambiguates languages sharing a script.
11
+ *
12
+ * High-signal suffixes/prefixes (accent-bearing, 0.3+ weight) provide additional
13
+ * disambiguation for accented/diacritical text.
14
+ *
15
+ * Designed for use alongside the profanity filter to weight matches by detected
16
+ * language context.
17
+ */
18
+ // @ts-ignore — eld ships as JS with .d.ts but no proper ESM types
19
+ import { eld } from "eld/small";
20
+ import { languageTries, phraseSets } from "./language-dicts.js";
21
+ import { detectRomanization } from "./romanization-detector.js";
22
+ /** Languages that have trie vocabulary dictionaries */
23
+ const TRIE_LANGUAGES = new Set([
24
+ "en", "es", "fr", "de", "it", "pt", "nl", "tr",
25
+ "ru", "ar", "zh", "ja", "ko", "hi", "bn", "ta", "te",
26
+ "sv",
27
+ ]);
28
+ /**
29
+ * Unicode codepoint ranges mapped to the languages they indicate.
30
+ * Ranges are checked in order; first match wins for each character.
31
+ */
32
+ const SCRIPT_RANGES = [
33
+ // Cyrillic
34
+ { start: 0x0400, end: 0x04FF, languages: ["ru"] },
35
+ { start: 0x0500, end: 0x052F, languages: ["ru"] },
36
+ // Arabic (includes Urdu, Farsi — we tag as "ar" for now)
37
+ { start: 0x0600, end: 0x06FF, languages: ["ar"] },
38
+ { start: 0x0750, end: 0x077F, languages: ["ar"] },
39
+ { start: 0x08A0, end: 0x08FF, languages: ["ar"] },
40
+ { start: 0xFB50, end: 0xFDFF, languages: ["ar"] },
41
+ { start: 0xFE70, end: 0xFEFF, languages: ["ar"] },
42
+ // Devanagari (Hindi, Marathi, Sanskrit)
43
+ { start: 0x0900, end: 0x097F, languages: ["hi"] },
44
+ { start: 0xA8E0, end: 0xA8FF, languages: ["hi"] },
45
+ // Bengali
46
+ { start: 0x0980, end: 0x09FF, languages: ["bn"] },
47
+ // Tamil
48
+ { start: 0x0B80, end: 0x0BFF, languages: ["ta"] },
49
+ // Telugu
50
+ { start: 0x0C00, end: 0x0C7F, languages: ["te"] },
51
+ // Hangul (Korean)
52
+ { start: 0x1100, end: 0x11FF, languages: ["ko"] },
53
+ { start: 0x3130, end: 0x318F, languages: ["ko"] },
54
+ { start: 0xAC00, end: 0xD7AF, languages: ["ko"] },
55
+ { start: 0xD7B0, end: 0xD7FF, languages: ["ko"] },
56
+ // Japanese-specific kana
57
+ { start: 0x3040, end: 0x309F, languages: ["ja"] },
58
+ { start: 0x30A0, end: 0x30FF, languages: ["ja"] },
59
+ { start: 0x31F0, end: 0x31FF, languages: ["ja"] },
60
+ { start: 0xFF65, end: 0xFF9F, languages: ["ja"] },
61
+ // CJK Unified Ideographs — shared by Chinese and Japanese
62
+ // When we see CJK without any kana, it's more likely Chinese.
63
+ // When kana is also present, the kana ranges already tag "ja".
64
+ { start: 0x4E00, end: 0x9FFF, languages: ["zh", "ja"] },
65
+ { start: 0x3400, end: 0x4DBF, languages: ["zh", "ja"] },
66
+ { start: 0x20000, end: 0x2A6DF, languages: ["zh", "ja"] },
67
+ { start: 0x2A700, end: 0x2B73F, languages: ["zh", "ja"] },
68
+ { start: 0x2B740, end: 0x2B81F, languages: ["zh", "ja"] },
69
+ { start: 0x3000, end: 0x303F, languages: ["zh", "ja"] },
70
+ // Latin Extended — covers accented chars used in European languages
71
+ // We can't distinguish en/es/fr/de/it/pt from characters alone, but certain
72
+ // diacritical marks are strong signals for specific languages.
73
+ { start: 0x00C0, end: 0x00FF, languages: ["es", "fr", "de", "it", "pt", "en"] },
74
+ { start: 0x0100, end: 0x017F, languages: ["es", "fr", "de", "it", "pt", "tr"] },
75
+ { start: 0x0180, end: 0x024F, languages: ["es", "fr", "de", "it", "pt", "tr"] }, // Latin Extended-B
76
+ // Turkish-specific characters
77
+ // ğ (0x011F), ş (0x015F), ı (0x0131), İ (0x0130), ç (0x00E7 shared with FR/PT)
78
+ // These are already covered by Latin Extended-A above, but we boost Turkish
79
+ // via vocabulary detection when these are present.
80
+ ];
81
+ /**
82
+ * Specific diacritical characters that are strong signals for particular languages.
83
+ * Maps codepoints to language boosts.
84
+ */
85
+ const DIACRITICAL_SIGNALS = new Map([
86
+ // German-specific
87
+ [0x00C4, ["de"]],
88
+ [0x00D6, ["de"]],
89
+ [0x00DC, ["de"]],
90
+ [0x00E4, ["de"]],
91
+ [0x00F6, ["de"]],
92
+ [0x00FC, ["de"]],
93
+ [0x00DF, ["de"]],
94
+ // Spanish-specific
95
+ [0x00D1, ["es"]],
96
+ [0x00F1, ["es"]],
97
+ [0x00BF, ["es"]],
98
+ [0x00A1, ["es"]],
99
+ [0x00E1, ["es", "pt"]],
100
+ [0x00C1, ["es", "pt"]],
101
+ [0x00ED, ["es"]],
102
+ [0x00CD, ["es"]],
103
+ [0x00F3, ["es", "pt", "it"]],
104
+ [0x00D3, ["es", "pt", "it"]],
105
+ [0x00FA, ["es", "pt"]],
106
+ [0x00DA, ["es", "pt"]],
107
+ // French-specific accents (shared with others, but weighted)
108
+ [0x00E9, ["fr", "es"]],
109
+ [0x00C9, ["fr", "es"]],
110
+ [0x00E0, ["fr"]],
111
+ [0x00C0, ["fr"]],
112
+ [0x00E8, ["fr"]],
113
+ [0x00C8, ["fr"]],
114
+ [0x00EA, ["fr"]],
115
+ [0x00EB, ["fr"]],
116
+ [0x00EE, ["fr"]],
117
+ [0x00EF, ["fr"]],
118
+ [0x00F4, ["fr"]],
119
+ [0x00F9, ["fr"]],
120
+ [0x00FB, ["fr"]],
121
+ [0x0153, ["fr"]],
122
+ [0x00E7, ["fr", "pt", "tr"]],
123
+ // Portuguese-specific
124
+ [0x00E3, ["pt"]],
125
+ [0x00F5, ["pt"]],
126
+ // Turkish-specific
127
+ [0x011E, ["tr"]],
128
+ [0x011F, ["tr"]],
129
+ [0x015E, ["tr"]],
130
+ [0x015F, ["tr"]],
131
+ [0x0130, ["tr"]],
132
+ [0x0131, ["tr"]], // ı (dotless i)
133
+ ]);
134
+ // ---------------------------------------------------------------------------
135
+ // Global Bayesian prior — distribution-weighted
136
+ // ---------------------------------------------------------------------------
137
+ // As words are processed left-to-right, each language accumulates a running
138
+ // proportion of the text. For each subsequent word, the raw score is nudged
139
+ // by (1 + PRIOR_ALPHA * cumulativeProportion). This means languages that have
140
+ // already accumulated evidence get a proportional boost on ambiguous words,
141
+ // without needing predefined family groupings. Mixed-language text naturally
142
+ // gives both languages a boost; noise languages stay suppressed.
143
+ //
144
+ // PRIOR_ALPHA controls the strength: 0.2 means a language with 100% of the
145
+ // running distribution gets a 20% score boost; one with 50% gets 10%.
146
+ const PRIOR_ALPHA = 0.2;
147
+ /**
148
+ * Shannon entropy of the cumulative distribution (in bits).
149
+ * Low entropy = 1-2 languages dominate → prior is confident, apply full alpha.
150
+ * High entropy = many languages roughly equal → back off, let raw scores speak.
151
+ * Returns a scaling factor 0..1 to multiply PRIOR_ALPHA by.
152
+ */
153
+ function entropyGate(shares, total) {
154
+ if (total === 0)
155
+ return 0;
156
+ let entropy = 0;
157
+ for (const count of shares.values()) {
158
+ const p = count / total;
159
+ if (p > 0)
160
+ entropy -= p * Math.log2(p);
161
+ }
162
+ // Max entropy for N languages = log2(N). With 17 languages, max ≈ 4.09 bits.
163
+ // We want full alpha when entropy < 1.5 bits (1-2 dominant languages),
164
+ // and zero alpha when entropy > 3.5 bits (many roughly equal).
165
+ // Linear ramp between these thresholds.
166
+ const LOW = 1.5;
167
+ const HIGH = 3.5;
168
+ if (entropy <= LOW)
169
+ return 1.0;
170
+ if (entropy >= HIGH)
171
+ return 0.0;
172
+ return (HIGH - entropy) / (HIGH - LOW);
173
+ }
174
+ // ---------------------------------------------------------------------------
175
+ // Detection implementation
176
+ // ---------------------------------------------------------------------------
177
+ /**
178
+ * Classify a single character by Unicode script to one or more language codes.
179
+ * Returns undefined for basic Latin (A-Z, 0-9) and common punctuation — those
180
+ * are ambiguous and need vocabulary-based disambiguation.
181
+ */
182
+ function classifyChar(codepoint) {
183
+ // Check diacritical signals first (strong language indicators)
184
+ const diacritical = DIACRITICAL_SIGNALS.get(codepoint);
185
+ if (diacritical)
186
+ return diacritical;
187
+ // Check script ranges
188
+ for (const range of SCRIPT_RANGES) {
189
+ if (codepoint >= range.start && codepoint <= range.end) {
190
+ return range.languages;
191
+ }
192
+ }
193
+ return undefined;
194
+ }
195
+ /**
196
+ * Script-based detection: count characters belonging to each language's script.
197
+ * Returns raw character counts per language.
198
+ */
199
+ function detectByScript(text) {
200
+ const counts = new Map();
201
+ for (const char of text) {
202
+ const cp = char.codePointAt(0);
203
+ if (cp === undefined)
204
+ continue;
205
+ const langs = classifyChar(cp);
206
+ if (!langs)
207
+ continue;
208
+ for (const lang of langs) {
209
+ counts.set(lang, (counts.get(lang) || 0) + 1);
210
+ }
211
+ }
212
+ return counts;
213
+ }
214
+ /** Minimum match score to count a word as a hit for a language */
215
+ const MIN_MATCH_SCORE = 0.6;
216
+ /**
217
+ * Trie-based vocabulary detection: split text into words, score each word
218
+ * against every language trie, aggregate scores per language.
219
+ *
220
+ * Each input word is walked against each language's trie in O(word_len).
221
+ * Partial matches produce fractional scores (e.g., "thre" vs "three" = 0.8).
222
+ * Phrase matching is done separately for multi-word expressions.
223
+ */
224
+ function detectByVocabulary(text) {
225
+ const scores = new Map();
226
+ const textLower = text.toLowerCase();
227
+ // Split into words (Unicode-aware: includes accented Latin, apostrophes)
228
+ const words = textLower.match(/[\p{L}\p{M}'-]+/gu) || [];
229
+ // Score each word against each language trie
230
+ for (const word of words) {
231
+ if (word.length < 2)
232
+ continue; // Skip single chars — too ambiguous
233
+ for (const [lang, trie] of languageTries) {
234
+ const score = trie.matchScore(word);
235
+ if (score >= MIN_MATCH_SCORE) {
236
+ scores.set(lang, (scores.get(lang) || 0) + score);
237
+ }
238
+ }
239
+ }
240
+ // Phrase matching — weighted higher (5× per phrase match)
241
+ for (const [lang, phrases] of phraseSets) {
242
+ for (const phrase of phrases) {
243
+ if (textLower.includes(phrase)) {
244
+ scores.set(lang, (scores.get(lang) || 0) + 5);
245
+ }
246
+ }
247
+ }
248
+ return scores;
249
+ }
250
+ // ---------------------------------------------------------------------------
251
+ // Shannon entropy utility
252
+ // ---------------------------------------------------------------------------
253
+ /**
254
+ * Compute Shannon entropy (in bits) over a distribution of raw scores.
255
+ * Normalizes internally so inputs don't need to sum to 1.
256
+ * Filters out negligible entries (< 1% of total) before computing.
257
+ *
258
+ * Reference values for N equally likely outcomes: H = log2(N)
259
+ * - 2 langs: 1.00 bits
260
+ * - 3 langs: 1.58 bits
261
+ * - 4 langs: 2.00 bits
262
+ * - 8 langs: 3.00 bits
263
+ */
264
+ export function shannonEntropy(values) {
265
+ const total = values.reduce((s, v) => s + v, 0);
266
+ if (total <= 0)
267
+ return 0;
268
+ // Filter to significant entries and renormalize
269
+ const significant = values.filter(v => v / total > 0.01);
270
+ const sigTotal = significant.reduce((s, v) => s + v, 0);
271
+ if (sigTotal <= 0)
272
+ return 0;
273
+ let entropy = 0;
274
+ for (const v of significant) {
275
+ const p = v / sigTotal;
276
+ if (p > 0)
277
+ entropy -= p * Math.log2(p);
278
+ }
279
+ return entropy;
280
+ }
281
+ /**
282
+ * Compute how "scrappy" / SMS-like the text is.
283
+ * Romanized text tends to be informal: short words, no punctuation, no caps,
284
+ * abbreviations, repeated chars, number substitutions.
285
+ *
286
+ * ELD is trained on formal text, so high scrappiness = low ELD trust.
287
+ */
288
+ export function computeScrappiness(text) {
289
+ var _a;
290
+ const signals = [];
291
+ let score = 0;
292
+ const words = text.split(/\s+/).filter(w => w.length > 0);
293
+ if (words.length === 0)
294
+ return { score: 0, signals: [] };
295
+ // Average word length — romanized/SMS text has shorter words
296
+ const avgWordLen = words.reduce((s, w) => s + w.length, 0) / words.length;
297
+ if (avgWordLen < 3.5) {
298
+ score += 0.20;
299
+ signals.push(`short-words(${avgWordLen.toFixed(1)})`);
300
+ }
301
+ else if (avgWordLen < 4.5) {
302
+ score += 0.10;
303
+ signals.push(`medium-words(${avgWordLen.toFixed(1)})`);
304
+ }
305
+ // No punctuation — informal text often lacks periods, commas, etc.
306
+ const punctuation = text.match(/[.,;:!?'"()[\]{}]/g);
307
+ const punctDensity = ((_a = punctuation === null || punctuation === void 0 ? void 0 : punctuation.length) !== null && _a !== void 0 ? _a : 0) / text.length;
308
+ if (punctDensity < 0.005) {
309
+ score += 0.15;
310
+ signals.push("no-punctuation");
311
+ }
312
+ // No capitalization (or all lowercase) — SMS/chat style
313
+ const alphaChars = text.match(/[a-zA-Z]/g) || [];
314
+ const upperCount = alphaChars.filter(c => c >= "A" && c <= "Z").length;
315
+ const upperRatio = alphaChars.length > 0 ? upperCount / alphaChars.length : 0;
316
+ if (upperRatio === 0 && alphaChars.length > 10) {
317
+ score += 0.15;
318
+ signals.push("all-lowercase");
319
+ }
320
+ else if (upperRatio < 0.03 && alphaChars.length > 10) {
321
+ score += 0.05;
322
+ signals.push("mostly-lowercase");
323
+ }
324
+ // Number substitutions (common in chat: 3=e, 7=t, 0=o, 2=to, etc.)
325
+ const numInWords = words.filter(w => /\d/.test(w) && /[a-zA-Z]/.test(w)).length;
326
+ if (numInWords >= 2) {
327
+ score += 0.15;
328
+ signals.push(`num-substitution(${numInWords})`);
329
+ }
330
+ // Repeated characters (lol → loool, pleaaase)
331
+ const repeats = text.match(/(.)\1{2,}/g);
332
+ if (repeats && repeats.length >= 1) {
333
+ score += 0.10;
334
+ signals.push(`char-repeats(${repeats.length})`);
335
+ }
336
+ // High proportion of very short words (≤ 2 chars) — particle-heavy / SMS
337
+ const shortWords = words.filter(w => w.replace(/[^a-zA-Z]/g, "").length <= 2).length;
338
+ const shortRatio = shortWords / words.length;
339
+ if (shortRatio >= 0.40) {
340
+ score += 0.15;
341
+ signals.push(`short-word-ratio(${(shortRatio * 100).toFixed(0)}%)`);
342
+ }
343
+ else if (shortRatio >= 0.25) {
344
+ score += 0.05;
345
+ signals.push(`moderate-short-words(${(shortRatio * 100).toFixed(0)}%)`);
346
+ }
347
+ return { score: Math.min(1.0, score), signals };
348
+ }
349
+ /**
350
+ * Compute how much to trust ELD's language detection.
351
+ *
352
+ * Returns a factor 0–1 that should multiply ELD contributions:
353
+ * - 1.0: ELD is fully trusted (formal English/European text)
354
+ * - 0.5: ELD is partially trusted (some romanization or scrappiness)
355
+ * - 0.0: ELD is not trusted at all (strong romanization + very scrappy)
356
+ */
357
+ export function computeEldPenalty(text) {
358
+ const penalties = [];
359
+ let penalty = 0;
360
+ // Romanization penalty — if text looks romanized, ELD is unreliable
361
+ const rom = detectRomanization(text);
362
+ if (rom.tier === "high") {
363
+ penalty += 0.50;
364
+ penalties.push(`romanization-high(${rom.confidence.toFixed(2)})`);
365
+ }
366
+ else if (rom.tier === "mixed") {
367
+ penalty += 0.30;
368
+ penalties.push(`romanization-mixed(${rom.confidence.toFixed(2)})`);
369
+ }
370
+ else if (rom.confidence >= 0.15) {
371
+ penalty += 0.10;
372
+ penalties.push(`romanization-low(${rom.confidence.toFixed(2)})`);
373
+ }
374
+ // Scrappiness penalty — informal text degrades ELD accuracy
375
+ const scrappy = computeScrappiness(text);
376
+ if (scrappy.score >= 0.60) {
377
+ penalty += 0.30;
378
+ penalties.push(`scrappy-high(${scrappy.score.toFixed(2)})`);
379
+ }
380
+ else if (scrappy.score >= 0.35) {
381
+ penalty += 0.15;
382
+ penalties.push(`scrappy-moderate(${scrappy.score.toFixed(2)})`);
383
+ }
384
+ else if (scrappy.score >= 0.20) {
385
+ penalty += 0.05;
386
+ penalties.push(`scrappy-low(${scrappy.score.toFixed(2)})`);
387
+ }
388
+ // ELD entropy — when ELD spreads probability across many languages,
389
+ // it's confused. Apply as multiplicative damping: 1/(1 + entropy).
390
+ const eldResult = eld.detect(text);
391
+ const eldScores = eldResult.getScores();
392
+ const textEntropy = shannonEntropy(Object.values(eldScores));
393
+ const entropyDamping = 1 / (1 + textEntropy);
394
+ if (textEntropy >= 1.3) {
395
+ penalties.push(`eld-entropy(${textEntropy.toFixed(2)})`);
396
+ }
397
+ // Combine: additive penalties become a 0–1 factor, then multiply by entropy damping
398
+ const factor = Math.max(0, (1.0 - penalty) * entropyDamping);
399
+ return { factor, penalties };
400
+ }
401
+ // ---------------------------------------------------------------------------
402
+ // Main detection
403
+ // ---------------------------------------------------------------------------
404
+ /**
405
+ * Detect languages present in the input text.
406
+ *
407
+ * Uses a two-layer approach:
408
+ * 1. **Script detection** — Unicode character ranges identify non-Latin scripts
409
+ * with high certainty (Cyrillic → ru, Devanagari → hi, Hangul → ko, etc.)
410
+ * 2. **Vocabulary detection** — common words and phrases disambiguate Latin-script
411
+ * languages (en vs es vs fr vs de vs it vs pt vs nl vs tr)
412
+ *
413
+ * Script detection is weighted 2× relative to vocabulary detection since it's
414
+ * more deterministic.
415
+ *
416
+ * @param text - The text to analyze
417
+ * @param options - Detection options
418
+ * @returns Detection result with languages sorted by proportion
419
+ *
420
+ * @example
421
+ * ```typescript
422
+ * const result = detectLanguages("Bonjour, comment allez-vous?");
423
+ * // → { languages: [{ language: "fr", proportion: 0.82, present: 1.0, wordCount: 3 }, ...], text: "..." }
424
+ *
425
+ * const result2 = detectLanguages("こんにちは世界");
426
+ * // → { languages: [{ language: "ja", proportion: 0.85 }, { language: "zh", proportion: 0.15 }], text: "..." }
427
+ * ```
428
+ */
429
+ export function detectLanguages(text, options = {}) {
430
+ var _a, _b;
431
+ const { minConfidence = 0.05, maxLanguages = 5, priorWeights } = options;
432
+ if (!text || typeof text !== "string") {
433
+ return { languages: [], text: text !== null && text !== void 0 ? text : "", totalWords: 0 };
434
+ }
435
+ // Split text into words (Unicode-aware)
436
+ const words = text.match(/[\p{L}\p{M}'-]+/gu) || [];
437
+ if (words.length === 0) {
438
+ return { languages: [], text, totalWords: 0 };
439
+ }
440
+ // For each word, find which language it best belongs to (winner-take-all per word).
441
+ // Track: word count per language, and max single-word score per language (presence signal).
442
+ const langWordCounts = new Map();
443
+ const langMaxScore = new Map();
444
+ let scoredWords = 0;
445
+ // Running cumulative word shares per language — global Bayesian prior.
446
+ // As we process words left-to-right, the running distribution nudges ambiguous
447
+ // words toward languages that already have evidence (e.g., if text is mostly
448
+ // English, "con" leans English over Spanish). No family groupings needed.
449
+ const cumulativeShares = new Map();
450
+ let totalCumulativeShares = 0;
451
+ // Compute ELD penalty once at the text level — romanized / scrappy text
452
+ // should deflate our reliance on ELD n-gram scores everywhere.
453
+ const eldPenalty = computeEldPenalty(text);
454
+ // Seed the Bayesian prior from ELD full-text analysis.
455
+ // ELD gives corpus-trained n-gram priors for the entire text,
456
+ // which are then refined per-word by trie + script matching.
457
+ // eldPenalty.factor already includes romanization + scrappiness + text-level entropy.
458
+ const eldPriors = getEldTextPriors(text);
459
+ for (const [lang, weight] of Object.entries(eldPriors)) {
460
+ const penalized = weight * eldPenalty.factor;
461
+ if (penalized > 0.01) { // Only seed languages with meaningful ELD signal
462
+ cumulativeShares.set(lang, penalized);
463
+ totalCumulativeShares += penalized;
464
+ }
465
+ }
466
+ // Layer user-provided weights on top of ELD priors if given.
467
+ if (priorWeights) {
468
+ const weightSum = Object.values(priorWeights).reduce((s, w) => s + (w || 0), 0);
469
+ if (weightSum > 0) {
470
+ for (const [lang, w] of Object.entries(priorWeights)) {
471
+ if (w > 0) {
472
+ const normalized = w / weightSum;
473
+ cumulativeShares.set(lang, (cumulativeShares.get(lang) || 0) + normalized);
474
+ totalCumulativeShares += normalized;
475
+ }
476
+ }
477
+ }
478
+ }
479
+ for (const word of words) {
480
+ if (word.length < 2)
481
+ continue; // Skip single chars
482
+ const wordScores = scoreWord(word, eldPenalty.factor);
483
+ const entries = Object.entries(wordScores);
484
+ if (entries.length === 0)
485
+ continue;
486
+ scoredWords++;
487
+ // Track max score per language (presence signal)
488
+ for (const [lang, score] of entries) {
489
+ langMaxScore.set(lang, Math.max(langMaxScore.get(lang) || 0, score));
490
+ }
491
+ // ── Tension-gated Bayesian prior ──
492
+ // Only apply when the word is genuinely contested (runner-up >= 50% of leader).
493
+ // Clear signals (Devanagari → Hindi, "the" → English) pass through unmodified.
494
+ // Entropy-gated: backs off when the cumulative distribution is flat.
495
+ const sorted = [...entries].sort((a, b) => b[1] - a[1]);
496
+ const topScore = ((_a = sorted[0]) === null || _a === void 0 ? void 0 : _a[1]) || 0;
497
+ const runnerUp = ((_b = sorted[1]) === null || _b === void 0 ? void 0 : _b[1]) || 0;
498
+ const tension = topScore > 0 && runnerUp > 0 ? runnerUp / topScore : 0;
499
+ const TENSION_THRESHOLD = 0.5;
500
+ const hasTension = tension >= TENSION_THRESHOLD;
501
+ let adjustedEntries;
502
+ if (hasTension && totalCumulativeShares > 0) {
503
+ const gate = entropyGate(cumulativeShares, totalCumulativeShares);
504
+ const effectiveAlpha = PRIOR_ALPHA * gate;
505
+ adjustedEntries = entries.map(([lang, score]) => {
506
+ if (effectiveAlpha === 0)
507
+ return [lang, score];
508
+ const cumProportion = (cumulativeShares.get(lang) || 0) / totalCumulativeShares;
509
+ return [lang, score * (1 + effectiveAlpha * cumProportion)];
510
+ });
511
+ }
512
+ else {
513
+ // No tension or no prior context — pass through unmodified
514
+ adjustedEntries = entries;
515
+ }
516
+ // ── Competitive cutoff: drop tail languages that aren't competitive ──
517
+ // Only languages scoring >= 50% of the top scorer get any share.
518
+ // This prevents noise (e.g., "allé" giving share to Arabic via bigrams)
519
+ // from diluting the true language's proportion.
520
+ const peakScore = adjustedEntries.reduce((mx, [, s]) => Math.max(mx, s), 0);
521
+ const COMPETITIVE_RATIO = 0.4;
522
+ const competitive = adjustedEntries.filter(([, s]) => s >= peakScore * COMPETITIVE_RATIO);
523
+ // Boost exact matches: if a language scored 1.0 (perfect trie + script +
524
+ // bigram), double its raw score so it captures the majority of this word's
525
+ // budget even among other competitive languages.
526
+ const boostedEntries = competitive.map(([lang, score]) => [
527
+ lang,
528
+ score >= 1.0 ? score * 2 : score,
529
+ ]);
530
+ const totalScore = boostedEntries.reduce((sum, [, s]) => sum + s, 0);
531
+ if (totalScore > 0) {
532
+ for (const [lang, score] of boostedEntries) {
533
+ const share = score / totalScore; // Normalized share of this word
534
+ langWordCounts.set(lang, (langWordCounts.get(lang) || 0) + share);
535
+ // Update running cumulative for next word's prior
536
+ cumulativeShares.set(lang, (cumulativeShares.get(lang) || 0) + share);
537
+ totalCumulativeShares += share;
538
+ }
539
+ }
540
+ }
541
+ // Add phrase bonus as fractional word count (a matched phrase = +2 words for that language)
542
+ const textLower = text.toLowerCase();
543
+ for (const [lang, phrases] of phraseSets) {
544
+ for (const phrase of phrases) {
545
+ if (textLower.includes(phrase)) {
546
+ langWordCounts.set(lang, (langWordCounts.get(lang) || 0) + 2);
547
+ scoredWords += 2;
548
+ }
549
+ }
550
+ }
551
+ // If nothing matched, default to English for Latin text
552
+ if (langWordCounts.size === 0) {
553
+ if (/[a-zA-Z]/.test(text)) {
554
+ langWordCounts.set("en", scoredWords || 1);
555
+ if (scoredWords === 0)
556
+ scoredWords = 1;
557
+ }
558
+ else {
559
+ return { languages: [], text, totalWords: words.length };
560
+ }
561
+ }
562
+ // Build results: proportion = wordCount / totalScoredWords
563
+ // present = max single-word score for that language (1.0 = definitively present)
564
+ let languages = Array.from(langWordCounts.entries())
565
+ .map(([language, wc]) => {
566
+ const proportion = Math.round((wc / scoredWords) * 1000) / 1000;
567
+ const present = Math.round((langMaxScore.get(language) || 0) * 1000) / 1000;
568
+ return {
569
+ language,
570
+ present,
571
+ proportion,
572
+ wordCount: Math.round(wc * 100) / 100,
573
+ };
574
+ })
575
+ .filter((d) => d.proportion >= minConfidence || d.present >= 0.5)
576
+ .sort((a, b) => {
577
+ // Sort primarily by proportion, use present as tiebreaker
578
+ if (Math.abs(a.proportion - b.proportion) > 0.05) {
579
+ return b.proportion - a.proportion;
580
+ }
581
+ return b.present - a.present;
582
+ })
583
+ .slice(0, Math.max(maxLanguages, 10)); // Keep top 10 for re-normalization
584
+ // Re-normalize proportions among the top N candidates so they sum to ~1.0.
585
+ // Without this, Latin-family languages dilute each other (e.g., "de" the word
586
+ // splits across French, Spanish, Portuguese, etc.) and even long monolingual
587
+ // texts end up with the true language at only 0.3–0.4 proportion.
588
+ const topNSum = languages.reduce((sum, d) => sum + d.proportion, 0);
589
+ if (topNSum > 0) {
590
+ for (const lang of languages) {
591
+ lang.proportion = Math.round((lang.proportion / topNSum) * 1000) / 1000;
592
+ }
593
+ }
594
+ // Final slice to requested maxLanguages after re-normalization
595
+ languages = languages.slice(0, maxLanguages);
596
+ return { languages, text, totalWords: words.length };
597
+ }
598
+ /**
599
+ * Get the single best-matching language for the text.
600
+ *
601
+ * @param text - The text to analyze
602
+ * @returns The top language detection, or a default English result if unknown
603
+ *
604
+ * @example
605
+ * ```typescript
606
+ * const best = detectBestLanguage("Hola, ¿cómo estás?");
607
+ * // → { language: "es", present: 1.0, proportion: 0.91, wordCount: 3 }
608
+ * ```
609
+ */
610
+ export function detectBestLanguage(text) {
611
+ var _a;
612
+ const result = detectLanguages(text, { maxLanguages: 1 });
613
+ return (_a = result.languages[0]) !== null && _a !== void 0 ? _a : {
614
+ language: "en",
615
+ present: 0,
616
+ proportion: 0,
617
+ wordCount: 0,
618
+ };
619
+ }
620
/**
 * Detect the script/language of a single word based purely on its characters.
 * Useful for classifying individual profanity dictionary entries.
 *
 * @param word - A single word to classify
 * @returns Array of language codes the word's characters belong to, sorted by
 *          character count (most characters → first). Empty if purely basic Latin.
 *
 * @example
 * ```typescript
 * classifyWordScript("Scheiße") // → ["de"]
 * classifyWordScript("くそ")     // → ["ja"]
 * classifyWordScript("fuck")    // → [] (basic Latin, ambiguous)
 * classifyWordScript("씨발")     // → ["ko"]
 * ```
 */
export function classifyWordScript(word) {
    const scriptCounts = detectByScript(word);
    const ranked = [...scriptCounts.entries()];
    // Sort descending by character count so the dominant script comes first.
    ranked.sort(([, countA], [, countB]) => countB - countA);
    return ranked.map(([language]) => language);
}
// ---------------------------------------------------------------------------
// ELD n-gram helper — per-word language scores from corpus-trained model
// ---------------------------------------------------------------------------
/**
 * Get ELD n-gram scores for a single word, normalized to [0, 1].
 * Only scores above the 0.1 noise threshold are kept.
 */
function getEldWordScores(word) {
    const allScores = eld.detect(word).getScores();
    // Drop low-confidence entries; keep the rest as a plain record.
    return Object.fromEntries(Object.entries(allScores).filter(([, score]) => score > 0.1));
}
/**
 * Get ELD n-gram scores for the full text, used as Bayesian priors.
 * Returns normalized scores (sum to ~1.0), or an empty record when ELD
 * produced no usable signal.
 */
function getEldTextPriors(text) {
    const raw = eld.detect(text).getScores();
    const total = Object.values(raw).reduce((acc, value) => acc + value, 0);
    if (total <= 0) {
        return {};
    }
    const priors = {};
    for (const [lang, score] of Object.entries(raw)) {
        if (score > 0) {
            priors[lang] = score / total;
        }
    }
    return priors;
}
// ---------------------------------------------------------------------------
// High-signal suffixes — distinctive word endings per language. Higher weight
// means a stronger (often accent-bearing, near-unique) language fingerprint.
// ---------------------------------------------------------------------------
const HIGH_SIGNAL_SUFFIXES = {
    // ñ-bearing endings practically guarantee Spanish.
    es: [
        { pattern: "ción", weight: 0.8 },
        { pattern: "sión", weight: 0.8 },
        { pattern: "ñol", weight: 0.9 },
        { pattern: "ñar", weight: 0.9 },
        { pattern: "imiento", weight: 0.8 },
        { pattern: "miento", weight: 0.7 },
        { pattern: "ería", weight: 0.7 },
    ],
    // Accent-bearing endings practically guarantee French.
    fr: [
        { pattern: "amment", weight: 0.9 },
        { pattern: "ément", weight: 0.9 },
        { pattern: "eaux", weight: 0.8 },
        { pattern: "euse", weight: 0.7 },
        { pattern: "ière", weight: 0.7 },
        { pattern: "eux", weight: 0.6 },
        { pattern: "ée", weight: 0.5 },
        { pattern: "ées", weight: 0.6 },
        { pattern: "esque", weight: 0.5 },
    ],
    // Umlaut/ß-bearing endings practically guarantee German.
    de: [
        { pattern: "schaft", weight: 0.8 },
        { pattern: "mäßig", weight: 0.9 },
        { pattern: "igkeit", weight: 0.8 },
        { pattern: "ierung", weight: 0.8 },
        { pattern: "keit", weight: 0.8 },
        { pattern: "heit", weight: 0.7 },
        { pattern: "lich", weight: 0.35 },
        { pattern: "isch", weight: 0.3 },
        { pattern: "ung", weight: 0.35 },
        { pattern: "ös", weight: 0.3 },
    ],
    // ã/õ/ç-bearing endings practically guarantee Portuguese.
    pt: [
        { pattern: "ção", weight: 0.9 },
        { pattern: "ções", weight: 0.9 },
        { pattern: "ável", weight: 0.8 },
        { pattern: "ível", weight: 0.8 },
        { pattern: "ância", weight: 0.8 },
        { pattern: "ência", weight: 0.8 },
    ],
    // Turkish special characters practically guarantee Turkish.
    tr: [
        { pattern: "lık", weight: 0.8 },
        { pattern: "lığ", weight: 0.9 },
        { pattern: "lük", weight: 0.8 },
        { pattern: "lüğ", weight: 0.9 },
        { pattern: "sız", weight: 0.8 },
        { pattern: "süz", weight: 0.8 },
        { pattern: "ınca", weight: 0.8 },
        { pattern: "ünce", weight: 0.8 },
        { pattern: "dır", weight: 0.5 },
        { pattern: "dür", weight: 0.5 },
        { pattern: "siz", weight: 0.4 },
        { pattern: "suz", weight: 0.4 },
        { pattern: "çı", weight: 0.3 },
    ],
    // Unique romaji endings practically guarantee Japanese.
    ja: [
        { pattern: "mashita", weight: 0.9 },
        { pattern: "masen", weight: 0.9 },
        { pattern: "masu", weight: 0.8 },
        { pattern: "desu", weight: 0.8 },
    ],
    // Unique Korean romanization practically guarantees Korean.
    ko: [
        { pattern: "imnida", weight: 0.9 },
        { pattern: "haseo", weight: 0.9 },
        { pattern: "hamni", weight: 0.8 },
        { pattern: "eoyo", weight: 0.8 },
    ],
    // Unique Russian romanization practically guarantees Russian.
    ru: [
        { pattern: "ovich", weight: 0.9 },
        { pattern: "evich", weight: 0.9 },
        { pattern: "ovna", weight: 0.9 },
        { pattern: "evna", weight: 0.9 },
        { pattern: "stvo", weight: 0.8 },
        { pattern: "nost", weight: 0.5 },
        { pattern: "skaya", weight: 0.3 },
        { pattern: "skoye", weight: 0.3 },
    ],
    // Distinctive Dutch suffixes.
    nl: [
        { pattern: "heid", weight: 0.3 },
        { pattern: "schap", weight: 0.3 },
    ],
    // Distinctive Arabic romanization.
    ar: [
        { pattern: "ullah", weight: 0.3 },
        { pattern: "allah", weight: 0.3 },
    ],
};
// ---------------------------------------------------------------------------
// High-signal prefixes — only accent-bearing / highly distinctive (0.3+)
// ---------------------------------------------------------------------------
const HIGH_SIGNAL_PREFIXES = {
    // Umlaut-bearing prefixes practically guarantee German.
    de: [
        { pattern: "über", weight: 0.9 },
        { pattern: "zurück", weight: 0.9 },
        { pattern: "wieder", weight: 0.5 },
        { pattern: "unter", weight: 0.3 },
    ],
    // Accent-bearing prefixes — very distinctive French.
    fr: [
        { pattern: "dé", weight: 0.35 },
        { pattern: "ré", weight: 0.3 },
        { pattern: "pré", weight: 0.35 },
    ],
    // Accent-bearing prefixes — very distinctive Turkish.
    tr: [
        { pattern: "karşı", weight: 0.9 },
        { pattern: "üst", weight: 0.7 },
        { pattern: "geri", weight: 0.3 },
        { pattern: "ön", weight: 0.3 },
    ],
};
// ---------------------------------------------------------------------------
// High-signal suffix/prefix matching helpers
// ---------------------------------------------------------------------------
/**
 * Find languages whose high-signal suffixes match the end of the word.
 * Returns a Map of language → best (highest) matching suffix weight.
 */
function findHighSignalSuffixes(word) {
    const normalized = word.toLowerCase();
    const bestByLang = new Map();
    for (const [lang, affixes] of Object.entries(HIGH_SIGNAL_SUFFIXES)) {
        for (const affix of affixes) {
            if (!normalized.endsWith(affix.pattern)) {
                continue;
            }
            // Keep only the strongest matching suffix per language.
            const previous = bestByLang.get(lang) || 0;
            if (affix.weight > previous) {
                bestByLang.set(lang, affix.weight);
            }
        }
    }
    return bestByLang;
}
/**
 * Find languages whose high-signal prefixes match the start of the word.
 * Returns a Map of language → best (highest) matching prefix weight.
 */
function findHighSignalPrefixes(word) {
    const normalized = word.toLowerCase();
    const bestByLang = new Map();
    for (const [lang, affixes] of Object.entries(HIGH_SIGNAL_PREFIXES)) {
        for (const affix of affixes) {
            // Require at least two characters beyond the prefix so a bare
            // prefix word (e.g. "über" by itself) does not count as a match.
            const longEnough = normalized.length > affix.pattern.length + 1;
            if (!normalized.startsWith(affix.pattern) || !longEnough) {
                continue;
            }
            const previous = bestByLang.get(lang) || 0;
            if (affix.weight > previous) {
                bestByLang.set(lang, affix.weight);
            }
        }
    }
    return bestByLang;
}
/**
 * Score a single word against ALL languages (0–1 each).
 *
 * Combines four signals:
 * 1. **Script** (weight 1.0) — Unicode chars belonging to a language's script.
 * 2. **Trie** (weight 0.8) — vocabulary match via per-language tries.
 * 3. **ELD n-grams** (weight 0.6) — corpus-trained n-gram model per word.
 * 4. **High-signal suffix/prefix** — accent-bearing patterns (0.3+ weight),
 *    applied AFTER normalization so they can boost past the weighted layers.
 *
 * Words with a unique script fingerprint (2+ chars from a single-language
 * Unicode range) or a unique exact trie match get a guaranteed score of 1.0.
 *
 * @param word - A single word to score
 * @param eldPenaltyFactor - Multiplier (0–1) to deflate ELD n-gram contributions.
 *   1.0 = full ELD trust (default), 0.0 = ignore ELD entirely.
 * @returns Map of language → score (only includes languages with score > 0)
 */
export function scoreWord(word, eldPenaltyFactor = 1.0) {
    const scores = {};
    const charLen = [...word].length; // Unicode-safe length (code points, not UTF-16 units)
    if (charLen === 0)
        return scores;
    // ── Detect unique language fingerprints → guaranteed 1.0 floor ──
    const scriptCounts = detectByScript(word);
    // Count characters that fall in a Unicode range owned by exactly ONE language.
    const uniqueLangChars = new Map();
    for (const char of word) {
        const cp = char.codePointAt(0);
        if (cp === undefined)
            continue;
        // Combining diacritics are shared across languages — not a fingerprint.
        if (DIACRITICAL_SIGNALS.has(cp))
            continue;
        for (const range of SCRIPT_RANGES) {
            if (cp >= range.start && cp <= range.end) {
                if (range.languages.length === 1) {
                    const lang = range.languages[0];
                    uniqueLangChars.set(lang, (uniqueLangChars.get(lang) || 0) + 1);
                }
                // A code point belongs to at most one range — stop scanning.
                break;
            }
        }
    }
    const guaranteedLangs = new Set();
    // 2+ unique script chars → guaranteed 1.0 (one char could be a stray symbol).
    for (const [lang, count] of uniqueLangChars) {
        if (count >= 2) {
            guaranteedLangs.add(lang);
        }
    }
    // Unique exact vocabulary match (len > 4, only 1 trie matches) → guaranteed 1.0.
    // Short words are skipped: they collide across Latin-family vocabularies.
    if (charLen > 4) {
        const exactMatches = [];
        for (const [lang, trie] of languageTries) {
            if (trie.matchScore(word) === 1.0) {
                exactMatches.push(lang);
            }
        }
        if (exactMatches.length === 1) {
            guaranteedLangs.add(exactMatches[0]);
        }
    }
    // ── Scoring weights (relative trust of each signal layer) ──
    const SCRIPT_WEIGHT = 1.0;
    const TRIE_WEIGHT = 0.8;
    const ELD_WEIGHT = 0.6;
    // Layer 1: Script detection — fraction of chars in each language's script.
    if (charLen > 0) {
        for (const [lang, count] of scriptCounts) {
            const scriptScore = count / charLen;
            scores[lang] = (scores[lang] || 0) + scriptScore * SCRIPT_WEIGHT;
        }
    }
    // CJK disambiguation: pure CJK ideographs without kana → skew Chinese.
    // Only for longer runs (>= 10 chars) where the absence of kana is meaningful.
    if (charLen >= 10 && scriptCounts.has("zh") && scriptCounts.has("ja")) {
        const hasKana = [...word].some((ch) => {
            const cp = ch.codePointAt(0);
            // Hiragana, Katakana, Katakana Phonetic Extensions, Halfwidth Katakana.
            return (cp >= 0x3040 && cp <= 0x309f) || (cp >= 0x30a0 && cp <= 0x30ff) ||
                (cp >= 0x31f0 && cp <= 0x31ff) || (cp >= 0xff65 && cp <= 0xff9f);
        });
        if (!hasKana) {
            scores["zh"] = (scores["zh"] || 0) + 0.5;
            scores["ja"] = (scores["ja"] || 0) * 0.5;
        }
    }
    // Layer 2: Trie matching — vocabulary lookups per language.
    for (const [lang, trie] of languageTries) {
        const trieScore = trie.matchScore(word);
        if (trieScore > 0) {
            let effectiveScore;
            if (trieScore === 1.0) {
                // Exact vocabulary hit — full trie weight.
                effectiveScore = TRIE_WEIGHT;
            }
            else {
                // Partial match: scale down heavily (×0.1), then reward longer
                // matched spans with an exponential bonus capped at ×3.
                const matchedChars = Math.round(trieScore * charLen);
                const charBonus = Math.pow(1.1, matchedChars);
                effectiveScore = trieScore * TRIE_WEIGHT * 0.1 * Math.min(charBonus, 3);
            }
            const langKey = lang;
            scores[langKey] = (scores[langKey] || 0) + effectiveScore;
        }
    }
    // Layer 3: ELD n-gram scores (corpus-trained, replaces hand-tuned bigrams/trigrams)
    // Two ELD penalties stack multiplicatively:
    //   1. eldPenaltyFactor — text-level penalty from romanization + scrappiness
    //   2. Per-word entropy — if ELD is confused about THIS word, dampen further
    // Skipped for words under 3 chars: too short for n-grams to be meaningful.
    if (charLen >= 3) {
        const eldScores = getEldWordScores(word);
        const eldValues = Object.values(eldScores);
        // Per-word entropy: high entropy = ELD can't decide → dampen its scores.
        // 1 bit = 2 equally likely langs (ok), 2+ bits = 4+ langs (confused).
        // Damping 1/(1+H): 1.0 at 0 bits, 0.5 at 1 bit, ~0.33 at 2 bits, 0.25 at 3 bits.
        const wordEntropy = shannonEntropy(eldValues);
        const entropyDamping = 1 / (1 + wordEntropy);
        const effectiveEldWeight = ELD_WEIGHT * eldPenaltyFactor * entropyDamping;
        // Skip the layer entirely when the combined weight is negligible.
        if (effectiveEldWeight > 0.01) {
            for (const [lang, score] of Object.entries(eldScores)) {
                scores[lang] = (scores[lang] || 0) + score * effectiveEldWeight;
            }
        }
    }
    // Normalize: max possible is SCRIPT + TRIE + ELD; drop negligible languages.
    const maxPossible = SCRIPT_WEIGHT + TRIE_WEIGHT + ELD_WEIGHT;
    for (const lang of Object.keys(scores)) {
        scores[lang] = Math.min(1, scores[lang] / maxPossible);
        if (scores[lang] < 0.01)
            delete scores[lang];
    }
    // Layer 4: High-signal suffix matching (accent-bearing, 0.3+ weight only).
    // Applied post-normalization: multiplies an existing score (×1.3) and adds
    // the suffix weight, or seeds the score when the language had none.
    const suffixMatches = findHighSignalSuffixes(word);
    for (const [lang, weight] of suffixMatches) {
        const base = scores[lang] || 0;
        if (base > 0) {
            scores[lang] = Math.min(1, base * 1.3 + weight);
        }
        else {
            scores[lang] = weight;
        }
    }
    // Layer 5: High-signal prefix matching (accent-bearing, 0.3+ weight only).
    // Slightly weaker multiplier (×1.2) than suffixes.
    const prefixMatches = findHighSignalPrefixes(word);
    for (const [lang, weight] of prefixMatches) {
        const base = scores[lang] || 0;
        if (base > 0) {
            scores[lang] = Math.min(1, base * 1.2 + weight);
        }
        else {
            scores[lang] = weight;
        }
    }
    // Apply guaranteed 1.0 floor for uniquely-fingerprinted languages (last,
    // so no earlier layer can dilute it).
    for (const lang of guaranteedLangs) {
        scores[lang] = 1.0;
    }
    // Clip all scores to 3 decimal places for stable, comparable output.
    for (const lang of Object.keys(scores)) {
        scores[lang] = Math.round(scores[lang] * 1000) / 1000;
    }
    return scores;
}
//# sourceMappingURL=language-detector.js.map
+ //# sourceMappingURL=language-detector.js.map