npm - bekindprofanityfilter - Versions diffs - 0.0.5 → 0.0.6 - Mend

bekindprofanityfilter 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/cjs/index.js +6 -6
package/dist/esm/languages/english-primary-all-languages.d.ts +0 -17
package/dist/esm.min.js +8 -0
package/package.json +5 -6
package/dist/esm/algos/aho-corasick.js +0 -238
package/dist/esm/algos/bloom-filter.js +0 -208
package/dist/esm/algos/context-patterns.js +0 -415
package/dist/esm/index.js +0 -2640
package/dist/esm/innocence-scoring.js +0 -118
package/dist/esm/language-detector.js +0 -952
package/dist/esm/language-dicts.js +0 -2718
package/dist/esm/languages/english-primary-all-languages.js +0 -36894
package/dist/esm/romanization-detector.js +0 -779

package/dist/esm/algos/context-patterns.js DELETED Viewed

@@ -1,415 +0,0 @@
-/**
- * Universal context patterns for multi-language profanity detection
- */
-/**
- * Universal context patterns that work across multiple languages.
- *
- * Each entry describes one regex-based context signal:
- *   type        - category label; ContextPatternMatcher.getPriority maps it to a sort priority.
- *   pattern     - RegExp template. The literal text PROFANE_WORD is a placeholder that
- *                 ContextPatternMatcher.generateRules replaces with the regex-escaped word.
- *   weight      - multiplier applied by ContextAnalyzer.analyzeContext; generateRules labels
- *                 weights above 0.8 "increase_score" and everything else "reduce_score".
- *   delta       - signed adjustment summed by ContextAnalyzer.getCertaintyDelta
- *                 (negative = reducer, positive = booster).
- *   languages   - language codes the entry applies to; "*" matches every language.
- *   description / examples - human-readable notes; not read by the code in this file.
- *
- * NOTE(review): some examples do not exercise their own regex (e.g. "Hell Creek Road"
- * has no capitalized word before "Hell"; "stomp on my" contains no target word).
- */
-export const UNIVERSAL_CONTEXT_PATTERNS = [
-    // === REDUCER PATTERNS (negative delta, weight <= 0.3) ===
-    // Profane word sandwiched between two capitalized words: the regex needs one on EACH
-    // side (e.g. "Fort Dick California"), so "Some dick" alone does not match. No /i flag.
-    {
-        type: "proper_noun",
-        pattern: /\b[A-Z][a-z]+\s+PROFANE_WORD\s+[A-Z][a-z]+\b/,
-        weight: 0.3,
-        delta: -2,
-        languages: ["en", "fr", "de", "es", "it"],
-        description: "Profane word sandwiched between proper nouns (place/person name)",
-        examples: ["Hell Creek Road", "Fort Dick California"],
-    },
-    // Profane word followed by a capitalized word of 3+ letters (place names like "Ass Mountain")
-    {
-        type: "proper_noun",
-        pattern: /\bPROFANE_WORD\s+[A-Z][a-z]{2,}\b/,
-        weight: 0.3,
-        delta: -2,
-        languages: ["en", "fr", "de", "es", "it"],
-        description: "Potential profanity followed by proper noun (place/person name)",
-        examples: ["Ass Mountain", "Dick Cheney", "Hell Creek"],
-    },
-    // Medical context: a medical keyword at most 50 characters before the word
-    {
-        type: "medical",
-        pattern: /\b(medical|anatomy|doctor|hospital|clinic|patient|diagnosis|treatment|surgical|clinical)\b.{0,50}PROFANE_WORD/i,
-        weight: 0.1,
-        delta: -3,
-        languages: ["*"],
-        description: "Medical contexts where anatomical terms are appropriate",
-        examples: [
-            "medical examination of the ass",
-            "doctor checked the damn thing",
-        ],
-    },
-    // Anatomical context: a body-related keyword at most 30 characters before the word
-    {
-        type: "anatomical",
-        pattern: /\b(body|part|muscle|bone|skin|tissue|organ|limb|extremity)\b.{0,30}PROFANE_WORD/i,
-        weight: 0.3,
-        delta: -2,
-        languages: ["*"],
-        description: "Anatomical contexts for body parts",
-        examples: ["body part called ass", "muscle in the ass"],
-    },
-    // === BOOSTER PATTERNS (positive delta, weight > 1) ===
-    {
-        type: "sexual_verb_before",
-        pattern: /\b(suck|ride|lick|grab|stroke|jerk|squirt|bang|blow|pound|hump|grind|fondle|grope|spank|thrust|mount|penetrate|finger|fist|step|stomp|foot)\b.{0,10}PROFANE_WORD/i,
-        weight: 2.0,
-        delta: 3,
-        languages: ["*"],
-        description: "Sexual verb before target word — confirms profane intent",
-        examples: ["suck my cock", "ride that dick", "jerk that dick", "stomp on my"],
-    },
-    {
-        type: "sexual_verb_after",
-        pattern: /PROFANE_WORD.{0,10}\b(suck|ride|lick|grab|stroke|jerk|squirt|bang|blow|pound|hump|grind|fondle|grope|spank|thrust|mount|penetrate|finger|fist|step|stomp|foot|sucking|riding|licking|grabbing|stroking|jerking|squirting|banging|blowing|pounding|humping|grinding|fondling|groping|spanking|thrusting|mounting|penetrating|fingering|fisting|stepping|stomping|footing)\b/i,
-        weight: 2.0,
-        delta: 3,
-        languages: ["*"],
-        description: "Sexual verb after target word — confirms profane intent",
-        examples: ["cock sucking", "dick riding", "ass pounding"],
-    },
-    {
-        type: "compound_slur",
-        pattern: /PROFANE_WORD.{0,10}(hole|face|head|wipe|bag|job)\b/i,
-        weight: 2.0,
-        delta: 3,
-        languages: ["*"],
-        description: "Compound slur suffix — confirms profane intent (no \\b before suffix to match compounds like 'asshole')",
-        examples: ["asshole", "dickhead", "dickface"],
-    },
-    {
-        type: "insult_construction",
-        pattern: /\b(piece of|load of|full of)\s.{0,5}PROFANE_WORD/i,
-        weight: 1.4,
-        delta: 2,
-        languages: ["*"],
-        description: "Insult construction — likely profane",
-        examples: ["piece of ass", "load of cock"],
-    },
-    {
-        type: "direct_address",
-        pattern: /\b(you|your|u|ur)\b.{0,10}PROFANE_WORD/i,
-        weight: 1.3,
-        delta: 1,
-        languages: ["*"],
-        description: "Direct address — likely insult",
-        examples: ["you dick", "your ass"],
-    },
-    {
-        type: "pejorative_adj",
-        pattern: /\b(stupid|ugly|fat|dumb|dirty|nasty|filthy)\b.{0,10}PROFANE_WORD/i,
-        weight: 1.3,
-        delta: 1,
-        languages: ["*"],
-        description: "Pejorative adjective before target — likely profane",
-        examples: ["stupid ass", "fat dick", "dirty cock"],
-    },
-];
-/**
- * Language-specific context patterns, keyed by language code.
- *
- * ContextPatternMatcher's constructor looks up each requested language here and, when
- * an entry exists, registers that array for the language. Entries have the same fields
- * as the universal patterns (type, pattern with the PROFANE_WORD placeholder, weight,
- * delta, languages, description, examples).
- *
- * Only "de" currently has patterns; "en", "fr" and "es" are empty arrays.
- *
- * NOTE(review): the German suffix list is kopf|zeug|ding|sache, so the listed example
- * "Arschloch" cannot match this regex; only "Scheißzeug"-style compounds can.
- */
-export const LANGUAGE_SPECIFIC_PATTERNS = {
-    en: [],
-    fr: [],
-    de: [
-        {
-            type: "compound",
-            pattern: /\bPROFANE_WORD(kopf|zeug|ding|sache)\b/i,
-            weight: 0.5,
-            delta: -1,
-            languages: ["de"],
-            description: "German compound word patterns",
-            examples: ["Scheißzeug", "Arschloch"],
-        },
-    ],
-    es: [],
-};
-/**
- * Word-specific context patterns for disambiguating ambiguous profane words.
- * Keyed by the lowercase profane word: ContextPatternMatcher.generateRules looks a word
- * up with word.toLowerCase() and appends the matching entries after the universal and
- * language-specific patterns.
- *
- * Entries use the same fields as the universal patterns. The `type` value only selects
- * a sort priority (see ContextPatternMatcher.getPriority), which is why labels such as
- * "sexual_verb_before" and "compound" are reused here for adjective/possessive and
- * zoological contexts respectively.
- *
- * Reducers (weight 0.1, delta -3) cover the animal senses of "cock" and "ass" and the
- * given name "Dick"; boosters (weight 1.4-1.5, delta 2) cover insult/possessive use.
- */
-export const WORD_SPECIFIC_PATTERNS = {
-    cock: [
-        {
-            type: "sexual_verb_before",
-            pattern: /\b(big|hard|small|my|his)\b.{0,10}PROFANE_WORD/i,
-            weight: 1.5,
-            delta: 2,
-            languages: ["*"],
-            description: "Sexual/possessive context for cock",
-            examples: ["big cock", "my cock", "his hard cock"],
-        },
-        {
-            type: "compound",
-            pattern: /\b(crow|rooster|hen|farm|chicken|dawn|poultry|barnyard)\b.{0,30}PROFANE_WORD/i,
-            weight: 0.1,
-            delta: -3,
-            languages: ["*"],
-            description: "Farming/zoological context — cock as rooster",
-            examples: ["the cock crowed at dawn", "rooster and cock"],
-        },
-        {
-            type: "compound",
-            pattern: /PROFANE_WORD.{0,30}\b(crow|crowed|rooster|hen|farm|chicken|dawn|poultry|barnyard)\b/i,
-            weight: 0.1,
-            delta: -3,
-            languages: ["*"],
-            description: "Farming/zoological context after — cock as rooster",
-            examples: ["cock crowed at dawn", "cock and hen"],
-        },
-    ],
-    ass: [
-        {
-            type: "pejorative_adj",
-            pattern: /\b(fat|kick|dumb|lazy)\b.{0,10}PROFANE_WORD/i,
-            weight: 1.4,
-            delta: 2,
-            languages: ["*"],
-            description: "Insult context for ass",
-            examples: ["fat ass", "dumb ass", "kick your ass"],
-        },
-        {
-            type: "compound",
-            pattern: /\b(donkey|mule|equine|wild|herd|saddle|burro)\b.{0,30}PROFANE_WORD/i,
-            weight: 0.1,
-            delta: -3,
-            languages: ["*"],
-            description: "Zoological context — ass as donkey",
-            examples: ["wild ass", "the donkey or ass"],
-        },
-        {
-            type: "compound",
-            pattern: /PROFANE_WORD.{0,30}\b(donkey|mule|equine|herd|saddle|burro)\b/i,
-            weight: 0.1,
-            delta: -3,
-            languages: ["*"],
-            description: "Zoological context after — ass as donkey",
-            examples: ["ass is a species of equine"],
-        },
-    ],
-    dick: [
-        {
-            type: "sexual_verb_before",
-            pattern: /\b(big|small|my|his)\b.{0,10}PROFANE_WORD/i,
-            weight: 1.5,
-            delta: 2,
-            languages: ["*"],
-            description: "Sexual/possessive context for dick",
-            examples: ["big dick", "my dick"],
-        },
-        {
-            // Case-sensitive (no /i) and hard-codes "Dick" instead of the PROFANE_WORD placeholder:
-            // matches "Dick Cheney", not "dick cheney". Presumably relies on callers passing original-case text; verify.
-            type: "proper_noun",
-            pattern: /\bDick\s+[A-Z][a-z]+/,
-            weight: 0.1,
-            delta: -3,
-            languages: ["en"],
-            description: "Dick as proper name followed by surname",
-            examples: ["Dick Cheney", "Dick Van Dyke"],
-        },
-    ],
-};
-/**
- * Context rule generator
- */
-export class ContextPatternMatcher {
-    constructor(languages = ["en"]) {
-        this.patterns = [...UNIVERSAL_CONTEXT_PATTERNS];
-        this.languagePatterns = new Map();
-        // Load language-specific patterns
-        for (const lang of languages) {
-            if (LANGUAGE_SPECIFIC_PATTERNS[lang]) {
-                this.languagePatterns.set(lang, LANGUAGE_SPECIFIC_PATTERNS[lang]);
-            }
-        }
-    }
-    /**
-     * Generate context rules for a specific word
-     */
-    generateRules(word, languages = ["en"]) {
-        const rules = [];
-        const allPatterns = [...this.patterns];
-        // Add language-specific patterns
-        for (const lang of languages) {
-            const langPatterns = this.languagePatterns.get(lang) || [];
-            allPatterns.push(...langPatterns);
-        }
-        // Add word-specific patterns
-        const wordPatterns = WORD_SPECIFIC_PATTERNS[word.toLowerCase()];
-        if (wordPatterns) {
-            allPatterns.push(...wordPatterns);
-        }
-        for (const pattern of allPatterns) {
-            // Skip if pattern doesn't apply to any of the specified languages
-            if (!pattern.languages.includes("*") &&
-                !pattern.languages.some((lang) => languages.includes(lang))) {
-                continue;
-            }
-            // Replace PROFANE_WORD placeholder with actual word
-            const regexSource = pattern.pattern.source.replace("PROFANE_WORD", this.escapeRegex(word));
-            const regex = new RegExp(regexSource, pattern.pattern.flags);
-            let action;
-            if (pattern.weight < 0.3) {
-                action = "reduce_score";
-            }
-            else if (pattern.weight > 0.8) {
-                action = "increase_score";
-            }
-            else {
-                action = "reduce_score";
-            }
-            rules.push({
-                pattern: regex,
-                action,
-                weight: pattern.weight,
-                delta: pattern.delta,
-                priority: this.getPriority(pattern.type),
-            });
-        }
-        return rules.sort((a, b) => a.priority - b.priority);
-    }
-    /**
-     * Get priority for pattern type (reducers before boosters)
-     */
-    getPriority(type) {
-        const priorities = {
-            medical: 1,
-            anatomical: 2,
-            negation: 3,
-            quotation: 4,
-            proper_noun: 5,
-            possessive: 6,
-            article: 7,
-            compound: 8,
-            // Boosters after reducers
-            pejorative_adj: 10,
-            direct_address: 11,
-            insult_construction: 12,
-            compound_slur: 13,
-            sexual_verb_before: 14,
-            sexual_verb_after: 15,
-        };
-        return priorities[type] || 9;
-    }
-    /**
-     * Escape regex special characters
-     */
-    escapeRegex(str) {
-        return str.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&");
-    }
-    /**
-     * Add custom pattern
-     */
-    addPattern(pattern) {
-        this.patterns.push(pattern);
-    }
-    /**
-     * Add language-specific pattern
-     */
-    addLanguagePattern(language, pattern) {
-        if (!this.languagePatterns.has(language)) {
-            this.languagePatterns.set(language, []);
-        }
-        this.languagePatterns.get(language).push(pattern);
-    }
-    /**
-     * Get all patterns for debugging
-     */
-    getAllPatterns() {
-        return {
-            universal: [...this.patterns],
-            languageSpecific: new Map(this.languagePatterns),
-        };
-    }
-}
-/**
- * Context analyzer for scoring matches
- */
-export class ContextAnalyzer {
-    constructor(languages = ["en"]) {
-        this.contextWindow = 50; // Characters before and after the match
-        this.patternMatcher = new ContextPatternMatcher(languages);
-    }
-    /**
-     * Analyze context around a potential profanity match (legacy score-based model)
-     */
-    analyzeContext(text, matchStart, matchEnd, word) {
-        // Extract context window
-        const contextStart = Math.max(0, matchStart - this.contextWindow);
-        const contextEnd = Math.min(text.length, matchEnd + this.contextWindow);
-        const context = text.substring(contextStart, contextEnd);
-        // Get rules for this word
-        const rules = this.patternMatcher.generateRules(word);
-        let score = 1.0; // Start with full profanity score
-        const appliedRules = [];
-        // Apply context rules
-        for (const rule of rules) {
-            const matched = rule.pattern.test(context);
-            appliedRules.push({ rule, matched });
-            if (matched) {
-                if (rule.action === "reduce_score") {
-                    score *= rule.weight;
-                }
-                else if (rule.action === "increase_score") {
-                    score *= 2 - rule.weight; // Increase score
-                }
-                else if (rule.action === "whitelist") {
-                    score = 0; // Complete whitelist
-                    break;
-                }
-            }
-        }
-        // Determine confidence based on number of matching rules
-        const matchingRules = appliedRules.filter((ar) => ar.matched).length;
-        let confidence;
-        if (matchingRules === 0) {
-            confidence = "high"; // No context rules matched, likely profanity
-        }
-        else if (matchingRules <= 2) {
-            confidence = "medium";
-        }
-        else {
-            confidence = "low"; // Many context rules matched, likely innocent
-        }
-        return {
-            score: Math.max(0, Math.min(1, score)),
-            confidence,
-            appliedRules,
-            context,
-        };
-    }
-    /**
-     * Calculate the certainty delta for a word based on surrounding context.
-     * Positive delta = booster (more likely profane).
-     * Negative delta = reducer (more likely innocent).
-     * Returns the sum of all matching pattern deltas.
-     */
-    getCertaintyDelta(text, matchStart, matchEnd, word) {
-        const contextStart = Math.max(0, matchStart - this.contextWindow);
-        const contextEnd = Math.min(text.length, matchEnd + this.contextWindow);
-        const context = text.substring(contextStart, contextEnd);
-        const rules = this.patternMatcher.generateRules(word);
-        let totalDelta = 0;
-        for (const rule of rules) {
-            if (rule.pattern.test(context)) {
-                totalDelta += rule.delta;
-            }
-        }
-        return totalDelta;
-    }
-    /**
-     * Set context window size
-     */
-    setContextWindow(size) {
-        this.contextWindow = Math.max(10, Math.min(200, size));
-    }
-    /**
-     * Add custom pattern to the analyzer
-     */
-    addCustomPattern(pattern) {
-        this.patternMatcher.addPattern(pattern);
-    }
-}
-//# sourceMappingURL=context-patterns.js.map