bekindprofanityfilter 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/index.js +6 -6
- package/dist/esm/index.d.ts +15 -0
- package/dist/esm/languages/english-primary-all-languages.d.ts +0 -17
- package/dist/esm.min.js +8 -0
- package/package.json +5 -6
- package/dist/esm/algos/aho-corasick.js +0 -238
- package/dist/esm/algos/bloom-filter.js +0 -208
- package/dist/esm/algos/context-patterns.js +0 -415
- package/dist/esm/index.js +0 -2640
- package/dist/esm/innocence-scoring.js +0 -118
- package/dist/esm/language-detector.js +0 -952
- package/dist/esm/language-dicts.js +0 -2718
- package/dist/esm/languages/english-primary-all-languages.js +0 -36894
- package/dist/esm/romanization-detector.js +0 -779
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
/**
 * Innocence Scoring — adjusts profanity certainty based on cross-language context.
 *
 * When a word is profane in one language but innocent in another (e.g., "slut"
 * is profane in English but means "end" in Swedish), this module adjusts the
 * certainty score based on detected language signals.
 *
 * - Dampens certainty when the innocent language dominates the text context
 * - Boosts certainty when the profane language dominates
 * - Applies the word's dampeningFactor to control adjustment magnitude
 */
|
|
12
|
-
// ---------------------------------------------------------------------------
// Language family groups — for disambiguation of closely related languages.
// The ELD n-gram model often confuses languages within the same family
// (e.g., Swedish ↔ German, Norwegian ↔ Danish). When checking innocent entries,
// we also consider signal from sibling languages in the same family.
// ---------------------------------------------------------------------------
const LANGUAGE_FAMILIES = {
    germanic_scandinavian: ["sv", "no", "da"],
    germanic_continental: ["de", "nl"],
};
/**
 * Reverse lookup: language code → family members (including itself).
 * Every member of a family maps to the same shared members array.
 * NOTE(review): not referenced elsewhere in this module — confirm external
 * use before removing.
 */
const FAMILY_SIBLINGS = {};
Object.values(LANGUAGE_FAMILIES).forEach((members) => {
    members.forEach((code) => {
        FAMILY_SIBLINGS[code] = members;
    });
});
// Cross-family confusion map: languages the detector often confuses with each other.
// Swedish/Norwegian/Danish are often classified as German by ELD n-gram.
const CONFUSION_MAP = {
    de: ["sv", "no", "da"],
    sv: ["de", "no", "da"],
    no: ["de", "sv", "da"],
    da: ["de", "sv", "no"],
    nl: ["de"], // Dutch sometimes classified as German
};
|
|
38
|
-
/**
 * Get the effective amplified signal for a language, folding in signal from
 * languages the detector commonly confuses it with.
 *
 * For example, if the innocent language is "sv" (Swedish) but the detector
 * classified the text as "de" (German), we add the German signal as a
 * partial boost to the Swedish signal, since it may actually be Swedish.
 *
 * @param {string} language - ISO 639-1 code to look up
 * @param {Object<string, number>} amplified - language code → amplified signal
 * @returns {number} the effective signal (0 when no signal is present)
 */
function getEffectiveAmp(language, amplified) {
    const rawDirect = amplified[language];
    const directAmp = rawDirect == null ? 0 : rawDirect;
    const confusedWith = CONFUSION_MAP[language];
    // Languages with no confusion entry contribute only their own signal.
    if (!confusedWith) {
        return directAmp;
    }
    // Strongest signal among the commonly-confused languages.
    let strongestConfused = 0;
    for (let i = 0; i < confusedWith.length; i += 1) {
        const rawAmp = amplified[confusedWith[i]];
        const siblingAmp = rawAmp == null ? 0 : rawAmp;
        if (siblingAmp > strongestConfused) {
            strongestConfused = siblingAmp;
        }
    }
    // Use the higher of direct signal or confused signal (discounted by 0.8).
    // The discount prevents over-attribution: German text shouldn't fully
    // count as Swedish, but mostly-German signal in a Scandinavian context
    // should still trigger dampening.
    return Math.max(directAmp, strongestConfused * 0.8);
}
|
|
66
|
-
/**
 * Adjust a word's certainty score based on cross-language innocence data
 * and pre-computed amplified language signals.
 *
 * @param certainty - Base certainty score (0-5)
 * @param profaneLanguage - ISO 639-1 code of the language where the word is profane
 * @param innocentEntries - Array of languages where the word is innocent
 * @param amplified - Pre-computed weighted average of word + document language signals
 * @param isPartialMatch - True when the word is embedded inside another word
 * @returns Adjusted certainty, clamped to [0, 5]
 */
export function adjustCertaintyForLanguage(certainty, profaneLanguage, innocentEntries, amplified, isPartialMatch = false) {
    // No innocent entries → no adjustment possible.
    if (innocentEntries.length === 0) {
        return certainty;
    }
    const rawProfane = amplified[profaneLanguage];
    const profaneAmp = rawProfane == null ? 0 : rawProfane;
    // Find the strongest innocent signal among all innocent entries,
    // considering language family confusion.
    let bestInnocentAmp = 0;
    let bestEntry = null;
    for (const candidate of innocentEntries) {
        const candidateAmp = getEffectiveAmp(candidate.language, amplified);
        if (candidateAmp > bestInnocentAmp) {
            bestInnocentAmp = candidateAmp;
            bestEntry = candidate;
        }
    }
    // Both signals below threshold → no meaningful signal to act on.
    if (profaneAmp < 0.01 && bestInnocentAmp < 0.01) {
        return certainty;
    }
    let adjusted = certainty;
    // Equal signals (or no winning entry) → no change.
    if (bestEntry !== null && bestInnocentAmp !== profaneAmp) {
        // Use partialDampeningFactor when the word is embedded inside another
        // word; fall back to the entry's regular dampeningFactor.
        const partialDf = bestEntry.partialDampeningFactor;
        const df = isPartialMatch && partialDf != null ? partialDf : bestEntry.dampeningFactor;
        adjusted = bestInnocentAmp > profaneAmp
            ? certainty * (1 - df * bestInnocentAmp) // innocent language dominates → dampen
            : certainty * (1 + df * profaneAmp); // profane language dominates → boost
    }
    // Clamp to [0, 5].
    return Math.max(0, Math.min(5, adjusted));
}
|
|
118
|
-
//# sourceMappingURL=innocence-scoring.js.map
|