bekindprofanityfilter 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,118 +0,0 @@
1
- /**
2
- * Innocence Scoring — adjusts profanity certainty based on cross-language context.
3
- *
4
- * When a word is profane in one language but innocent in another (e.g., "slut"
5
- * is profane in English but means "end" in Swedish), this module adjusts the
6
- * certainty score based on detected language signals.
7
- *
8
- * - Dampens certainty when the innocent language dominates the text context
9
- * - Boosts certainty when the profane language dominates
10
- * - Applies the word's dampeningFactor to control adjustment magnitude
11
- */
// ---------------------------------------------------------------------------
// Language family groups — for disambiguation of closely related languages.
// Per the original author's note, the ELD n-gram model often confuses closely
// related languages (e.g., Norwegian ↔ Danish), so related codes are grouped
// here and exposed through a reverse lookup.
// ---------------------------------------------------------------------------
const LANGUAGE_FAMILIES = Object.freeze({
  germanic_scandinavian: Object.freeze(["sv", "no", "da"]),
  germanic_continental: Object.freeze(["de", "nl"]),
});

/**
 * Reverse lookup: language code → all members of its family (including itself).
 *
 * Built on a null-prototype object so that only real language codes resolve;
 * a key such as "constructor" or "toString" yields `undefined` instead of an
 * inherited Object.prototype member. Every member of a family maps to the same
 * (frozen) array instance taken from LANGUAGE_FAMILIES.
 *
 * NOTE(review): nothing else in the visible part of this file reads
 * FAMILY_SIBLINGS — confirm whether it is still needed.
 */
const FAMILY_SIBLINGS = Object.create(null);
for (const members of Object.values(LANGUAGE_FAMILIES)) {
  for (const lang of members) {
    FAMILY_SIBLINGS[lang] = members;
  }
}
Object.freeze(FAMILY_SIBLINGS);
// Cross-family confusion map: for each language code, the codes whose signal
// may be borrowed for it. Per the original author's note, Swedish/Norwegian/
// Danish are often classified as German by the ELD n-gram detector.
const CONFUSION_MAP = Object.freeze({
  de: ["sv", "no", "da"],
  sv: ["de", "no", "da"],
  no: ["de", "sv", "da"],
  da: ["de", "sv", "no"],
  nl: ["de"], // Dutch sometimes classified as German
});

/**
 * Weight applied to a signal borrowed from a confused language. The discount
 * prevents over-attribution: e.g. German signal should not fully count as
 * Swedish, but should still contribute most of its strength.
 */
const CONFUSED_SIGNAL_DISCOUNT = 0.8;

/**
 * Read one language's signal from the `amplified` table.
 *
 * Anything that is not a number (a missing key, `null`, or an inherited
 * Object.prototype member such as `constructor`) is treated as "no signal".
 */
function readSignal(amplified, language) {
  const value = amplified[language];
  return typeof value === "number" ? value : 0;
}

/**
 * Get the effective amplified signal for a language, including signal
 * from languages listed for it in CONFUSION_MAP.
 *
 * For example, if the innocent language is "sv" (Swedish) but the text was
 * classified as "de" (German), the German signal — discounted by
 * CONFUSED_SIGNAL_DISCOUNT — is used when it exceeds the direct Swedish signal.
 *
 * @param language - Language code to evaluate
 * @param amplified - Table of language code → signal strength
 * @returns The direct signal when `language` has no CONFUSION_MAP entry;
 *          otherwise the larger of the direct signal and the strongest
 *          confused-language signal multiplied by the discount.
 */
function getEffectiveAmp(language, amplified) {
  const directAmp = readSignal(amplified, language);

  // Own-property check: inherited keys ("constructor", "toString", …) must not
  // be mistaken for confusion lists — iterating them would throw.
  if (!Object.prototype.hasOwnProperty.call(CONFUSION_MAP, language)) {
    return directAmp;
  }

  // Strongest signal among the confused languages (never below 0).
  let confusedAmp = 0;
  for (const confused of CONFUSION_MAP[language]) {
    const amp = readSignal(amplified, confused);
    if (amp > confusedAmp) {
      confusedAmp = amp;
    }
  }

  return Math.max(directAmp, confusedAmp * CONFUSED_SIGNAL_DISCOUNT);
}
/**
 * Select the dampening factor to apply for an innocent entry.
 *
 * For partial matches the entry's `partialDampeningFactor` is preferred, falling
 * back to `dampeningFactor` when it is null/undefined; otherwise
 * `dampeningFactor` is used directly.
 */
function pickDampeningFactor(entry, isPartialMatch) {
  if (isPartialMatch && entry.partialDampeningFactor != null) {
    return entry.partialDampeningFactor;
  }
  return entry.dampeningFactor;
}

/**
 * Adjust a word's certainty score based on cross-language innocence data
 * and pre-computed amplified language signals.
 *
 * - Innocent language signal stronger than the profane one → certainty is
 *   scaled by `1 - factor * innocentSignal` (dampened).
 * - Profane language signal stronger → certainty is scaled by
 *   `1 + factor * profaneSignal` (boosted).
 * - Equal signals → unchanged.
 *
 * @param certainty - Base certainty score (0-5)
 * @param profaneLanguage - ISO 639-1 code of the language where the word is profane
 * @param innocentEntries - Array of entries for languages where the word is
 *        innocent; each entry is read for `language`, `dampeningFactor` and
 *        (optionally) `partialDampeningFactor`. A null/undefined value is
 *        treated like an empty array.
 * @param amplified - Pre-computed weighted average of word + document language signals
 * @param isPartialMatch - When true, prefer each entry's `partialDampeningFactor`
 *        (used when the word is embedded inside another word). Defaults to false.
 * @returns Adjusted certainty, clamped to [0, 5]. When there are no innocent
 *          entries, or both signals are below 0.01, `certainty` is returned
 *          exactly as given (not clamped).
 */
export function adjustCertaintyForLanguage(certainty, profaneLanguage, innocentEntries, amplified, isPartialMatch = false) {
  // No innocent entries → no adjustment possible
  if (innocentEntries == null || innocentEntries.length === 0) {
    return certainty;
  }

  // Non-numeric lookups (missing key, inherited prototype member) count as no signal.
  const rawProfaneAmp = amplified[profaneLanguage];
  const profaneAmp = typeof rawProfaneAmp === "number" ? rawProfaneAmp : 0;

  // Find the strongest innocent signal among all innocent entries,
  // considering language confusion (see getEffectiveAmp).
  let bestInnocentAmp = 0;
  let bestEntry = null;
  for (const entry of innocentEntries) {
    const effectiveAmp = getEffectiveAmp(entry.language, amplified);
    if (effectiveAmp > bestInnocentAmp) {
      bestInnocentAmp = effectiveAmp;
      bestEntry = entry;
    }
  }

  // Both signals below threshold → no meaningful signal to act on
  if (profaneAmp < 0.01 && bestInnocentAmp < 0.01) {
    return certainty;
  }

  const clamp = (value) => Math.max(0, Math.min(5, value));
  const innocentDominates = bestInnocentAmp > profaneAmp;
  const profaneDominates = profaneAmp > bestInnocentAmp;

  // NOTE(review): bestEntry stays null when no innocent entry has a positive
  // signal, so the boost below is skipped in a purely profane-language context
  // even though it applies once any innocent signal is present. Behavior is
  // kept as-is; confirm whether this is intended.
  if (bestEntry === null || (!innocentDominates && !profaneDominates)) {
    return clamp(certainty);
  }

  const factor = pickDampeningFactor(bestEntry, isPartialMatch);
  const multiplier = innocentDominates
    ? 1 - factor * bestInnocentAmp // innocent language dominates → dampen
    : 1 + factor * profaneAmp; // profane language dominates → boost
  let adjusted = certainty * multiplier;

  // A missing or non-numeric dampening factor turns the product into NaN, which
  // would slip through the clamp; fall back to the unadjusted certainty instead.
  if (Number.isNaN(adjusted)) {
    adjusted = certainty;
  }

  return clamp(adjusted);
}
118
- //# sourceMappingURL=innocence-scoring.js.map