bekindprofanityfilter 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,415 +0,0 @@
1
/**
 * Universal context patterns for multi-language profanity detection.
 *
 * Every entry has the same shape:
 *  - `type`        category name; ContextPatternMatcher.getPriority maps it to a sort order.
 *  - `pattern`     RegExp template. The literal text `PROFANE_WORD` is a placeholder that
 *                  ContextPatternMatcher.generateRules substitutes with the escaped word.
 *  - `weight`      multiplier used by the legacy score model (ContextAnalyzer.analyzeContext);
 *                  generateRules treats weight > 0.8 as a booster, anything else as a reducer.
 *  - `delta`       signed amount summed by ContextAnalyzer.getCertaintyDelta
 *                  (negative = more likely innocent, positive = more likely profane).
 *  - `languages`   language codes the rule applies to; "*" means every language.
 *  - `description` human-readable summary.
 *  - `examples`    sample phrases that the instantiated pattern matches.
 */
export const UNIVERSAL_CONTEXT_PATTERNS = [
  // === REDUCER PATTERNS ===

  // Target word sandwiched between two capitalized words ("Fort Dick California").
  // No `i` flag: the capitalization itself is the signal.
  {
    type: "proper_noun",
    pattern: /\b[A-Z][a-z]+\s+PROFANE_WORD\s+[A-Z][a-z]+\b/,
    weight: 0.3,
    delta: -2,
    languages: ["en", "fr", "de", "es", "it"],
    description: "Profane word sandwiched between proper nouns (place/person name)",
    examples: ["East Hell Creek", "Fort Dick California"],
  },

  // Target word followed by a capitalized word of 3+ letters (place names like
  // "Ass Mountain"). Also case-sensitive.
  {
    type: "proper_noun",
    pattern: /\bPROFANE_WORD\s+[A-Z][a-z]{2,}\b/,
    weight: 0.3,
    delta: -2,
    languages: ["en", "fr", "de", "es", "it"],
    description: "Potential profanity followed by proper noun (place/person name)",
    examples: ["Ass Mountain", "Dick Cheney", "Hell Creek"],
  },

  // Medical keyword up to 50 characters before the target word.
  {
    type: "medical",
    pattern: /\b(medical|anatomy|doctor|hospital|clinic|patient|diagnosis|treatment|surgical|clinical)\b.{0,50}PROFANE_WORD/i,
    weight: 0.1,
    delta: -3,
    languages: ["*"],
    description: "Medical contexts where anatomical terms are appropriate",
    examples: [
      "medical examination of the ass",
      "doctor checked the damn thing",
    ],
  },

  // Anatomical keyword up to 30 characters before the target word.
  {
    type: "anatomical",
    pattern: /\b(body|part|muscle|bone|skin|tissue|organ|limb|extremity)\b.{0,30}PROFANE_WORD/i,
    weight: 0.3,
    delta: -2,
    languages: ["*"],
    description: "Anatomical contexts for body parts",
    examples: ["body part called ass", "muscle in the ass"],
  },

  // === BOOSTER PATTERNS ===

  // Verb (base form only) up to 10 characters before the target word.
  {
    type: "sexual_verb_before",
    pattern: /\b(suck|ride|lick|grab|stroke|jerk|squirt|bang|blow|pound|hump|grind|fondle|grope|spank|thrust|mount|penetrate|finger|fist|step|stomp|foot)\b.{0,10}PROFANE_WORD/i,
    weight: 2.0,
    delta: 3,
    languages: ["*"],
    description: "Sexual verb before target word — confirms profane intent",
    examples: ["suck my cock", "ride that dick", "jerk that dick", "stomp on my ass"],
  },

  // Verb (base or -ing form) up to 10 characters after the target word.
  {
    type: "sexual_verb_after",
    pattern: /PROFANE_WORD.{0,10}\b(suck|ride|lick|grab|stroke|jerk|squirt|bang|blow|pound|hump|grind|fondle|grope|spank|thrust|mount|penetrate|finger|fist|step|stomp|foot|sucking|riding|licking|grabbing|stroking|jerking|squirting|banging|blowing|pounding|humping|grinding|fondling|groping|spanking|thrusting|mounting|penetrating|fingering|fisting|stepping|stomping|footing)\b/i,
    weight: 2.0,
    delta: 3,
    languages: ["*"],
    description: "Sexual verb after target word — confirms profane intent",
    examples: ["cock sucking", "dick riding", "ass pounding"],
  },

  // Deliberately no \b before the suffix group so fused compounds match.
  {
    type: "compound_slur",
    pattern: /PROFANE_WORD.{0,10}(hole|face|head|wipe|bag|job)\b/i,
    weight: 2.0,
    delta: 3,
    languages: ["*"],
    description: "Compound slur suffix — confirms profane intent (no \\b before suffix to match compounds like 'asshole')",
    examples: ["asshole", "dickhead", "dickface"],
  },

  // "piece of" / "load of" / "full of" immediately before the target word.
  {
    type: "insult_construction",
    pattern: /\b(piece of|load of|full of)\s.{0,5}PROFANE_WORD/i,
    weight: 1.4,
    delta: 2,
    languages: ["*"],
    description: "Insult construction — likely profane",
    examples: ["piece of ass", "load of cock"],
  },

  // Second-person pronoun up to 10 characters before the target word.
  {
    type: "direct_address",
    pattern: /\b(you|your|u|ur)\b.{0,10}PROFANE_WORD/i,
    weight: 1.3,
    delta: 1,
    languages: ["*"],
    description: "Direct address — likely insult",
    examples: ["you dick", "your ass"],
  },

  // Pejorative adjective up to 10 characters before the target word.
  {
    type: "pejorative_adj",
    pattern: /\b(stupid|ugly|fat|dumb|dirty|nasty|filthy)\b.{0,10}PROFANE_WORD/i,
    weight: 1.3,
    delta: 1,
    languages: ["*"],
    description: "Pejorative adjective before target — likely profane",
    examples: ["stupid ass", "fat dick", "dirty cock"],
  },
];
109
/**
 * Language-specific context patterns, keyed by language code.
 *
 * Entries use the same shape as the universal patterns: `pattern` contains the
 * `PROFANE_WORD` placeholder, `weight` feeds the legacy score model and `delta`
 * feeds the certainty-delta model. Languages with no dedicated rules keep an
 * empty array so lookups by those codes still succeed.
 */
export const LANGUAGE_SPECIFIC_PATTERNS = {
  en: [],
  fr: [],
  de: [
    // Target word fused with a generic noun suffix ("-kopf", "-zeug", "-ding",
    // "-sache"); treated as a mild reducer.
    {
      type: "compound",
      pattern: /\bPROFANE_WORD(kopf|zeug|ding|sache)\b/i,
      weight: 0.5,
      delta: -1,
      languages: ["de"],
      description: "German compound word patterns",
      examples: ["Scheißzeug", "Scheißding"],
    },
  ],
  es: [],
};
128
/**
 * Word-specific context patterns for disambiguating ambiguous profane words.
 * Keyed by the lowercase profane word.
 *
 * Entries use the same shape as the universal patterns (`PROFANE_WORD`
 * placeholder in `pattern`, `weight` for the legacy score model, `delta` for
 * the certainty-delta model).
 */
export const WORD_SPECIFIC_PATTERNS = {
  cock: [
    // Size adjective / possessive up to 10 characters before the word.
    // The type reuses "sexual_verb_before", which only affects rule ordering.
    {
      type: "sexual_verb_before",
      pattern: /\b(big|hard|small|my|his)\b.{0,10}PROFANE_WORD/i,
      weight: 1.5,
      delta: 2,
      languages: ["*"],
      description: "Sexual/possessive context for cock",
      examples: ["big cock", "my cock", "his hard cock"],
    },
    // Farm keyword up to 30 characters BEFORE the word.
    {
      type: "compound",
      pattern: /\b(crow|rooster|hen|farm|chicken|dawn|poultry|barnyard)\b.{0,30}PROFANE_WORD/i,
      weight: 0.1,
      delta: -3,
      languages: ["*"],
      description: "Farming/zoological context — cock as rooster",
      examples: ["at dawn the cock crowed", "rooster and cock"],
    },
    // Farm keyword up to 30 characters AFTER the word.
    {
      type: "compound",
      pattern: /PROFANE_WORD.{0,30}\b(crow|crowed|rooster|hen|farm|chicken|dawn|poultry|barnyard)\b/i,
      weight: 0.1,
      delta: -3,
      languages: ["*"],
      description: "Farming/zoological context after — cock as rooster",
      examples: ["cock crowed at dawn", "cock and hen"],
    },
  ],
  ass: [
    // Insult lead-in up to 10 characters before the word.
    {
      type: "pejorative_adj",
      pattern: /\b(fat|kick|dumb|lazy)\b.{0,10}PROFANE_WORD/i,
      weight: 1.4,
      delta: 2,
      languages: ["*"],
      description: "Insult context for ass",
      examples: ["fat ass", "dumb ass", "kick your ass"],
    },
    // Equine keyword up to 30 characters BEFORE the word.
    {
      type: "compound",
      pattern: /\b(donkey|mule|equine|wild|herd|saddle|burro)\b.{0,30}PROFANE_WORD/i,
      weight: 0.1,
      delta: -3,
      languages: ["*"],
      description: "Zoological context — ass as donkey",
      examples: ["wild ass", "the donkey or ass"],
    },
    // Equine keyword up to 30 characters AFTER the word ("wild" is not in this list).
    {
      type: "compound",
      pattern: /PROFANE_WORD.{0,30}\b(donkey|mule|equine|herd|saddle|burro)\b/i,
      weight: 0.1,
      delta: -3,
      languages: ["*"],
      description: "Zoological context after — ass as donkey",
      examples: ["ass is a species of equine"],
    },
  ],
  dick: [
    // Size adjective / possessive up to 10 characters before the word.
    // The type reuses "sexual_verb_before", which only affects rule ordering.
    {
      type: "sexual_verb_before",
      pattern: /\b(big|small|my|his)\b.{0,10}PROFANE_WORD/i,
      weight: 1.5,
      delta: 2,
      languages: ["*"],
      description: "Sexual/possessive context for dick",
      examples: ["big dick", "my dick"],
    },
    {
      // Case-sensitive: matches "Dick Cheney" but not "dick cheney".
      // Works because getCertaintyDelta tests against original (non-normalized) text.
      // This pattern has no PROFANE_WORD placeholder; the name is hard-coded.
      type: "proper_noun",
      pattern: /\bDick\s+[A-Z][a-z]+/,
      weight: 0.1,
      delta: -3,
      languages: ["en"],
      description: "Dick as proper name followed by surname",
      examples: ["Dick Cheney", "Dick Van Dyke"],
    },
  ],
};
214
/** Placeholder inside pattern templates that is swapped for the actual word. */
const PROFANE_WORD_PLACEHOLDER = "PROFANE_WORD";

/** Sort order per pattern type: reducers (1-8) run before boosters (10-15). */
const PATTERN_PRIORITIES = Object.freeze({
  medical: 1,
  anatomical: 2,
  negation: 3,
  quotation: 4,
  proper_noun: 5,
  possessive: 6,
  article: 7,
  compound: 8,
  // Boosters after reducers
  pejorative_adj: 10,
  direct_address: 11,
  insult_construction: 12,
  compound_slur: 13,
  sexual_verb_before: 14,
  sexual_verb_after: 15,
});

/** Priority for pattern types that are not listed above. */
const DEFAULT_PATTERN_PRIORITY = 9;

/** Own-property check that ignores keys inherited from Object.prototype. */
function hasOwnKey(obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * Context rule generator.
 *
 * Holds the universal patterns plus the language-specific patterns for the
 * languages given at construction, and turns them into concrete RegExp rules
 * for a given word.
 */
export class ContextPatternMatcher {
  /**
   * @param {string[]} [languages=["en"]] Language codes whose specific patterns to load.
   */
  constructor(languages = ["en"]) {
    this.patterns = [...UNIVERSAL_CONTEXT_PATTERNS];
    this.languagePatterns = new Map();
    for (const lang of languages) {
      if (hasOwnKey(LANGUAGE_SPECIFIC_PATTERNS, lang)) {
        // Copy the array so addLanguagePattern never mutates the shared
        // module-level table (and therefore other matcher instances).
        this.languagePatterns.set(lang, [...LANGUAGE_SPECIFIC_PATTERNS[lang]]);
      }
    }
  }

  /**
   * Generate context rules for a specific word.
   *
   * @param {string} word The matched word; it is regex-escaped before substitution.
   * @param {string[]} [languages=["en"]] Languages to generate rules for.
   * @returns {Array<{pattern: RegExp, action: string, weight: number, delta: number, priority: number}>}
   *   Rules sorted by ascending priority (reducers first).
   */
  generateRules(word, languages = ["en"]) {
    const candidates = [...this.patterns];

    // Language-specific patterns; a Set avoids loading a language twice when
    // the caller repeats a code.
    for (const lang of new Set(languages)) {
      candidates.push(...(this.languagePatterns.get(lang) || []));
    }

    // Word-specific patterns. An own-property check keeps words such as
    // "constructor" from resolving to inherited Object.prototype members.
    const wordKey = word.toLowerCase();
    if (hasOwnKey(WORD_SPECIFIC_PATTERNS, wordKey)) {
      candidates.push(...WORD_SPECIFIC_PATTERNS[wordKey]);
    }

    const escapedWord = this.escapeRegex(word);
    const rules = [];
    for (const template of candidates) {
      const appliesToLanguage =
        template.languages.includes("*") ||
        template.languages.some((lang) => languages.includes(lang));
      if (!appliesToLanguage) {
        continue;
      }

      // split/join instead of String.replace: the escaped word may contain
      // "$&", "$1", ... which replace() would interpret as replacement
      // patterns and corrupt the regex. It also fills every placeholder.
      const source = template.pattern.source
        .split(PROFANE_WORD_PLACEHOLDER)
        .join(escapedWord);

      rules.push({
        pattern: new RegExp(source, template.pattern.flags),
        // Weights above 0.8 mark boosters; everything else is a reducer.
        action: template.weight > 0.8 ? "increase_score" : "reduce_score",
        weight: template.weight,
        delta: template.delta,
        priority: this.getPriority(template.type),
      });
    }
    return rules.sort((a, b) => a.priority - b.priority);
  }

  /**
   * Get priority for pattern type (reducers before boosters).
   *
   * @param {string} type Pattern type name.
   * @returns {number} Listed priority, or 9 for unknown types.
   */
  getPriority(type) {
    return hasOwnKey(PATTERN_PRIORITIES, type)
      ? PATTERN_PRIORITIES[type]
      : DEFAULT_PATTERN_PRIORITY;
  }

  /**
   * Escape regex special characters so the string matches literally.
   *
   * @param {string} str Raw text.
   * @returns {string} Text with each of \ ^ $ . * + ? ( ) [ ] { } | backslash-escaped.
   */
  escapeRegex(str) {
    return str.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&");
  }

  /**
   * Add a custom pattern to this instance's universal list.
   *
   * @param {object} pattern Pattern entry in the same shape as the built-in ones.
   */
  addPattern(pattern) {
    this.patterns.push(pattern);
  }

  /**
   * Add a language-specific pattern to this instance.
   *
   * @param {string} language Language code.
   * @param {object} pattern Pattern entry in the same shape as the built-in ones.
   */
  addLanguagePattern(language, pattern) {
    if (!this.languagePatterns.has(language)) {
      this.languagePatterns.set(language, []);
    }
    this.languagePatterns.get(language).push(pattern);
  }

  /**
   * Get all patterns for debugging.
   *
   * @returns {{universal: object[], languageSpecific: Map<string, object[]>}}
   *   Shallow copies of the universal list and of the language map.
   */
  getAllPatterns() {
    return {
      universal: [...this.patterns],
      languageSpecific: new Map(this.languagePatterns),
    };
  }
}
327
/**
 * Context analyzer for scoring matches.
 *
 * Wraps a ContextPatternMatcher and evaluates its rules against a window of
 * text around a candidate match.
 */
export class ContextAnalyzer {
  /**
   * @param {string[]} [languages=["en"]] Language codes used both to load
   *   language-specific patterns and to select rules during analysis.
   */
  constructor(languages = ["en"]) {
    this.contextWindow = 50; // Characters before and after the match
    // Remembered so rule generation uses the same languages the matcher was
    // built with; otherwise non-English rules would be loaded but never applied.
    this.languages = [...languages];
    this.patternMatcher = new ContextPatternMatcher(languages);
  }

  /**
   * Slice the text surrounding a match, clamped to the bounds of `text`.
   *
   * @param {string} text Full text.
   * @param {number} matchStart Start index of the match.
   * @param {number} matchEnd End index (exclusive) of the match.
   * @returns {string} Up to `contextWindow` characters on each side plus the match.
   */
  extractContext(text, matchStart, matchEnd) {
    const from = Math.max(0, matchStart - this.contextWindow);
    const to = Math.min(text.length, matchEnd + this.contextWindow);
    return text.substring(from, to);
  }

  /**
   * Analyze context around a potential profanity match (legacy score-based model).
   *
   * The score starts at 1.0. Each matching reducer multiplies it by the rule's
   * weight; each matching booster multiplies it by a factor >= 1. The final
   * score is clamped to [0, 1], so boosters can only offset reducers.
   *
   * @param {string} text Full text.
   * @param {number} matchStart Start index of the match.
   * @param {number} matchEnd End index (exclusive) of the match.
   * @param {string} word The matched word.
   * @returns {{score: number, confidence: string, appliedRules: Array<{rule: object, matched: boolean}>, context: string}}
   */
  analyzeContext(text, matchStart, matchEnd, word) {
    const context = this.extractContext(text, matchStart, matchEnd);
    const rules = this.patternMatcher.generateRules(word, this.languages);

    let score = 1.0; // Start with full profanity score
    const appliedRules = [];

    for (const rule of rules) {
      const matched = rule.pattern.test(context);
      appliedRules.push({ rule, matched });
      if (!matched) {
        continue;
      }
      if (rule.action === "reduce_score") {
        score *= rule.weight;
      } else if (rule.action === "increase_score") {
        // Booster weights above 1 are direct multipliers. Weights in (0.8, 1]
        // keep the older "2 - weight" convention. Taking the max guarantees a
        // booster never lowers the score (2 - 2.0 would otherwise zero it).
        score *= Math.max(rule.weight, 2 - rule.weight);
      } else if (rule.action === "whitelist") {
        score = 0; // Complete whitelist
        break;
      }
    }

    // Confidence is derived purely from how many rules matched.
    const matchingRules = appliedRules.filter((entry) => entry.matched).length;
    let confidence;
    if (matchingRules === 0) {
      confidence = "high"; // No context rules matched, likely profanity
    } else if (matchingRules <= 2) {
      confidence = "medium";
    } else {
      confidence = "low"; // Many context rules matched
    }

    return {
      score: Math.max(0, Math.min(1, score)),
      confidence,
      appliedRules,
      context,
    };
  }

  /**
   * Calculate the certainty delta for a word based on surrounding context.
   * Positive delta = booster (more likely profane).
   * Negative delta = reducer (more likely innocent).
   *
   * @param {string} text Full (non-normalized) text.
   * @param {number} matchStart Start index of the match.
   * @param {number} matchEnd End index (exclusive) of the match.
   * @param {string} word The matched word.
   * @returns {number} Sum of `delta` over all rules whose pattern matches the context.
   */
  getCertaintyDelta(text, matchStart, matchEnd, word) {
    const context = this.extractContext(text, matchStart, matchEnd);
    const rules = this.patternMatcher.generateRules(word, this.languages);
    let totalDelta = 0;
    for (const rule of rules) {
      if (rule.pattern.test(context)) {
        totalDelta += rule.delta;
      }
    }
    return totalDelta;
  }

  /**
   * Set context window size, clamped to the range [10, 200].
   *
   * @param {number} size Requested number of characters on each side of the match.
   */
  setContextWindow(size) {
    this.contextWindow = Math.max(10, Math.min(200, size));
  }

  /**
   * Add custom pattern to the analyzer.
   *
   * @param {object} pattern Pattern entry forwarded to the underlying matcher.
   */
  addCustomPattern(pattern) {
    this.patternMatcher.addPattern(pattern);
  }
}
415
- //# sourceMappingURL=context-patterns.js.map