badwords-py 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
badwords/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """A library for effective moderation of content."""
2
+
3
+ from .check import ProfanityFilter
4
+
5
+ __all__ = ["ProfanityFilter"]
badwords/check.py ADDED
@@ -0,0 +1,148 @@
1
+ """Module for checking text for badwords."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from difflib import SequenceMatcher
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ from .exceptions import NotSupportedLanguage
10
+ from .text_processor import TextProcessor
11
+
12
+ if TYPE_CHECKING:
13
+ from typing import Self
14
+ else:
15
+ Self = "NotSupportedLanguage"
16
+
17
+
18
class ProfanityFilter:
    """Detect, and optionally censor, profanity using per-language word lists.

    The filter is configured by calling :meth:`init`; the remaining methods
    read the attributes that call sets up (``resource_dir``,
    ``text_processor``, ``language_files`` and ``bad_words``).
    """

    def init(
        self: Self,
        languages: list[str] | None = None,
        processing_normalize_text: bool = True,
        processing_aggressive_normalize: bool = True,
        processing_transliterate: bool = True,
        processing_replace_homoglyphs: bool = True,
    ) -> None:
        """Initialize the profanity filter.

        :param languages: List of languages to load profanity words for. When
            empty or None, every language found in the resource directory is
            loaded.
        :param processing_normalize_text: Enable text normalization for all
            loaded words and word to filter
        :param processing_aggressive_normalize: Enable aggressive text
            normalization for all loaded words and word to filter
        :param processing_transliterate: Enable transliteration of text for
            all loaded words and word to filter
        :param processing_replace_homoglyphs: Enable replacing of homoglyphs
            in the text for all loaded words and word to filter
        :raises NotSupportedLanguage: If any requested language has no word
            list in the resource directory.
        """
        self.resource_dir = Path(__file__).parent / "resource"

        self.text_processor = TextProcessor(
            processing_normalize_text=processing_normalize_text,
            processing_aggressive_normalize=processing_aggressive_normalize,
            processing_transliterate=processing_transliterate,
            processing_replace_homoglyphs=processing_replace_homoglyphs,
        )

        self.language_files = self.initialize_language_files()

        if languages:
            # Reject the whole request if even one language is unknown.
            if not all(lang in self.language_files for lang in languages):
                raise NotSupportedLanguage
            self.language_files = languages

        self.bad_words = self.initialize_bad_words()

    def initialize_language_files(self: Self) -> list[str]:
        """Discover the languages that have a word list.

        Only ``*.bdw`` entries in the resource directory are considered, so
        other resources stored alongside them are not reported as languages.

        :return: Sorted list of language codes (word-list file names without
            the ``.bdw`` suffix).
        """
        return sorted(
            path.stem
            for path in self.resource_dir.iterdir()
            if path.suffix == ".bdw"
        )

    def initialize_bad_words(self: Self) -> set[str]:
        """Load and process the bad words of every selected language.

        Loading is best-effort: a language whose code is not alphabetic or
        whose file is missing is skipped, and any error while reading a file
        is printed and that language is skipped.

        :return: Set of processed bad words.
        """
        bad_words: set[str] = set()

        for lang in self.language_files:
            try:
                # Sanitize the language code to prevent path traversal
                lang = lang.lower().strip()
                if not lang.isalpha():
                    continue

                file_path = self.resource_dir / f"{lang}.bdw"
                if not file_path.exists():
                    continue

                with file_path.open(encoding="utf-8") as f:
                    words = f.read().splitlines()

                # Blank lines would otherwise add an empty "bad word".
                bad_words.update(
                    self.text_processor.process_text(word)
                    for word in words
                    if word.strip()
                )
            except Exception as e:
                # Deliberately broad: one broken word list must not prevent
                # the remaining languages from loading.
                print(f"Error loading language file for {lang}: {e}")
                continue

        return bad_words

    def add_words(self: Self, words: list[str]) -> None:
        """Add custom profanity words to the filter.

        Each word goes through the same text processing as the bundled lists.

        :param words: List of custom profanity words.
        """
        self.bad_words.update(
            self.text_processor.process_text(word) for word in words
        )

    def similar(self: Self, a: str, b: str) -> float:
        """Compute similarity ratio between two strings.

        :param a: First string.
        :param b: Second string.
        :return: ``difflib.SequenceMatcher`` ratio, from 0.0 to 1.0.
        """
        return SequenceMatcher(None, a, b).ratio()

    def _is_profane(self: Self, word: str, match_threshold: float) -> bool:
        """Return whether an already processed word matches a bad word."""
        if word in self.bad_words:
            return True
        # Fuzzy matching only applies to thresholds strictly between 0 and 1.
        if not 0 < match_threshold < 1:
            return False
        return any(
            self.similar(word, bad_word) > match_threshold
            for bad_word in self.bad_words
        )

    def _censor(
        self: Self,
        text: str,
        match_threshold: float,
        replace_character: str,
    ) -> str:
        """Mask every whitespace-delimited token of ``text`` that is profane."""
        import re

        def mask(match: re.Match[str]) -> str:
            token = match.group()
            # Process the token on its own so that a match can be mapped back
            # to the exact characters the caller wrote, even when processing
            # changes them (case, homoglyphs, ...).
            processed = self.text_processor.process_text(token).split()
            if any(self._is_profane(word, match_threshold) for word in processed):
                return replace_character * len(token)
            return token

        # Substituting whole tokens keeps the original whitespace and leaves
        # longer words that merely contain a bad word untouched.
        return re.sub(r"\S+", mask, text)

    def filter_text(
        self: Self,
        text: str,
        match_threshold: float | None = None,
        replace_character: str | None = None,
    ) -> bool | str:
        """Check if the given text contains profanity.

        :param text: Input text to check.
        :param match_threshold: Threshold for similarity match. Values
            strictly between 0 and 1 enable fuzzy matching; anything else
            (including None) means exact matching only.
        :param replace_character: Character to replace profane words with. If
            None or empty, return True/False.
        :return: False if no profanity is found (regardless of
            replace_character). Otherwise True, or, when replace_character is
            given, the text with every profane token masked.
        """
        if not match_threshold:
            match_threshold = 1

        # Process the input text through all transformations
        processed_text = self.text_processor.process_text(text)
        found = any(
            self._is_profane(word, match_threshold)
            for word in processed_text.split()
        )

        if not found:
            return False
        if not replace_character:
            return True
        return self._censor(text, match_threshold, replace_character)

    def get_all_languages(self: Self) -> list[str]:
        """Get the list of languages currently held by the filter.

        This is every language found in the resource directory, or the
        subset that was passed to :meth:`init`.

        :return: List of language codes.
        """
        return self.language_files
badwords/exceptions.py ADDED
@@ -0,0 +1,16 @@
1
+ """Exceptions module."""
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from typing import Self
7
+ else:
8
+ Self = "NotSupportedLanguage"
9
+
10
+
11
class NotSupportedLanguage(Exception):
    """Error signalling that a requested language is not supported.

    Derives from ``Exception`` rather than ``BaseException`` so that ordinary
    ``except Exception`` handlers catch it, as they do any other
    application-level error.
    """

    def __str__(self) -> str:
        """Return the fixed, human-readable error message."""
        return "This language is not supported"
@@ -0,0 +1,119 @@
1
+ foderíamos
2
+ fodêssemos
3
+ foderíeis
4
+ fodêramos
5
+ fodêsseis
6
+ merdimbuca
7
+ putariinha
8
+ chibundas
9
+ chibundos
10
+ cácété
11
+ foderemos
12
+ fodêreis
13
+ fodíamos
14
+ fudeção
15
+ putariona
16
+ boasudas
17
+ boazudas
18
+ cassetas
19
+ chibunda
20
+ chibundo
21
+ culhões
22
+ cunhões
23
+ foderdes
24
+ fodereis
25
+ foderiam
26
+ foderias
27
+ fodermos
28
+ foderás
29
+ foderão
30
+ fodessem
31
+ fodesses
32
+ fodestes
33
+ fodíeis
34
+ fudecão
35
+ fudeçao
36
+ fudidas?
37
+ fudidos?
38
+ peithola
39
+ putarias
40
+ xibundas
41
+ xibundos
42
+ babacas
43
+ boasuda
44
+ boazuda
45
+ cacetas
46
+ cacetes
47
+ casseta
48
+ cassete
49
+ culhoes
50
+ culhão
51
+ cunhoes
52
+ cunhão
53
+ curalho
54
+ fodamos
55
+ fodasse
56
+ fodemos
57
+ fodendo
58
+ foderam
59
+ foderas
60
+ foderei
61
+ foderem
62
+ foderes
63
+ foderia
64
+ foderum
65
+ foderá
66
+ fodesse
67
+ fodeste
68
+ fodinha
69
+ fudecao
70
+ fudedor
71
+ fudendo
72
+ furonas
73
+ putaria
74
+ putães
75
+ putãos
76
+ putões
77
+ xibunda
78
+ xibundo
79
+ babaca
80
+ bostas
81
+ caceta
82
+ cacete
83
+ culhao
84
+ cunhao
85
+ fodais
86
+ fodeis
87
+ fodera
88
+ fodete
89
+ fodiam
90
+ fodias
91
+ fodão
92
+ furona
93
+ furão
94
+ merdas
95
+ putão
96
+ sefoda
97
+ bosta
98
+ fodam
99
+ fodao
100
+ fodas
101
+ fodei
102
+ fodem
103
+ foder
104
+ fodes
105
+ fodeu
106
+ fodia
107
+ fuder
108
+ fudeu
109
+ furao
110
+ merda
111
+ porra
112
+ putos
113
+ foda
114
+ fode
115
+ fodi
116
+ fodo
117
+ puto
118
+ cus
119
+ cu
@@ -0,0 +1,12 @@
1
+ {
2
+ "a": ["4", "@", "а"],
3
+ "b": ["8", "6", "в"],
4
+ "e": ["3", "ё", "е"],
5
+ "g": ["9", "6"],
6
+ "i": ["1", "!", "і"],
7
+ "l": ["1", "|", "!"],
8
+ "o": ["0", "о"],
9
+ "s": ["5", "$", "ѕ"],
10
+ "t": ["7", "+"],
11
+ "z": ["2", "z"]
12
+ }
@@ -0,0 +1,51 @@
1
+ pokurvená
2
+ pokurvení
3
+ pokurvený
4
+ pomočená
5
+ pomočený
6
+ zkurvysyni
7
+ pochcaná
8
+ pochcaný
9
+ pochčije
10
+ prasárna
11
+ prasárny
12
+ zkurvená
13
+ zkurvení
14
+ zkurvený
15
+ zkurvysyn
16
+ chcánky
17
+ hovadina
18
+ hovadiny
19
+ kadění
20
+ kokotina
21
+ kokotiny
22
+ pokurvit
23
+ pokurví
24
+ prdění
25
+ chcaní
26
+ chčije
27
+ pochcat
28
+ sračka
29
+ sračky
30
+ zkurvil
31
+ zkurvit
32
+ zkurví
33
+ chcát
34
+ mamrdi
35
+ prdele
36
+ prdět
37
+ řitě
38
+ hovna
39
+ hovno
40
+ kadit
41
+ kadí
42
+ kruci
43
+ mamrd
44
+ prdel
45
+ sakra
46
+ satan
47
+ srát
48
+ řiť
49
+ prdy
50
+ sere
51
+ prd
@@ -0,0 +1,60 @@
1
+ åndsforsnottet
2
+ skidespræller
3
+ skvadderhoved
4
+ pestspreder
5
+ skvatpisser
6
+ spytslikker
7
+ torskepande
8
+ undermåler
9
+ åndsamøbe
10
+ danglebær
11
+ dinglebær
12
+ lortefjæs
13
+ lusepuster
14
+ pattebørn
15
+ pladderabe
16
+ ringlebær
17
+ satanedeme
18
+ sjatpisser
19
+ tøsedreng
20
+ åndsbolle
21
+ ærkefjols
22
+ øgleyngel
23
+ agurketud
24
+ forpulede
25
+ kvabodder
26
+ kvajhoved
27
+ kvajpande
28
+ lorteøre
29
+ pattebarn
30
+ slapsvans
31
+ forpulet
32
+ nakkeost
33
+ narrehat
34
+ pikfjæs
35
+ pikhoved
36
+ skiderik
37
+ abelort
38
+ fandeme
39
+ fåking
40
+ fåkker
41
+ møgdyr
42
+ narrøv
43
+ urinere
44
+ bovlam
45
+ fanden
46
+ fandme
47
+ focker
48
+ narhat
49
+ satan
50
+ satme
51
+ skide
52
+ skvat
53
+ fock
54
+ fåk
55
+ lort
56
+ pjok
57
+ skid
58
+ svin
59
+ urin
60
+ sgu
@@ -0,0 +1,62 @@
1
+ leckmich[ai]m[aä]rsche?
2
+ huren?s[oö]hne?
3
+ schlappschwänze
4
+ geht?sterben
5
+ schwanzlutscher
6
+ muschileckerin
7
+ pferdescheiße
8
+ schei(ss|ß)e?
9
+ schlappschwanz
10
+ drecksbälger
11
+ mutterfick\w+
12
+ sackgesichter
13
+ schwanzköpfe
14
+ wichsvorlagen
15
+ angeschissen
16
+ dreckstücke
17
+ fickteuch
18
+ gottverdammt
19
+ kackbratzen?
20
+ muschilecker
21
+ wichsvorlage
22
+ dreckskerle
23
+ dreckstück
24
+ hackfressen
25
+ ihraffen
26
+ sackgesicht
27
+ saftärsche
28
+ scheißkerl
29
+ schwanzkopf
30
+ verfickt
31
+ drecksbalg
32
+ dreckskerl
33
+ drecksäue
34
+ geschissen
35
+ hackfresse
36
+ kackfresse
37
+ schlampen?
38
+ arschloch
39
+ drecksack
40
+ huren?
41
+ saftarsch
42
+ schnepfen
43
+ arsch
44
+ drecksau
45
+ fickdich
46
+ sauhunde
47
+ saukerle
48
+ schnepfe
49
+ brunzen
50
+ fettsau
51
+ sauhund
52
+ saukerl
53
+ seichen
54
+ wichser
55
+ kacke?
56
+ poppen
57
+ schiss
58
+ husos
59
+ pisse
60
+ pisst
61
+ huso
62
+ nutte
@@ -0,0 +1,39 @@
1
+ opsodemieteren
2
+ hoerententen
3
+ godverdomme
4
+ hoeretenten
5
+ volkomenkut
6
+ hoerentent
7
+ opdonderen
8
+ schijterts
9
+ hoeretent
10
+ rotzakken
11
+ schijterd
12
+ schijters
13
+ schijtert
14
+ slappelul
15
+ verkloten
16
+ goddomme
17
+ godsamme
18
+ kankeren
19
+ oprotten
20
+ schijten
21
+ schijter
22
+ verdomme
23
+ verkloot
24
+ pleuris
25
+ rukkers
26
+ schurft
27
+ verdomd
28
+ godver
29
+ rotzak
30
+ rukker
31
+ scheit
32
+ schijt
33
+ stront
34
+ tering
35
+ zeiken
36
+ klote
37
+ tyfus
38
+ gvd
39
+ kut
@@ -0,0 +1,66 @@
1
+ (ape|bat|bull?|butt|dip|dog|dumb|ebo|holy|horse|jack|pedo|pig|ubi)sh(it|ti)s?
2
+ d[il]+(ck|kc)(ass|bag|breath|eat|face|flip|head|hole|less|suck|weed)
3
+ f[chj]?(a|au|aw|e|o+|u|uy)[ch]*k(e?d|e?rs?|[ei]?n+g?|t)
4
+ m[aou](d+|t+|th|ht|z)(a|e|er|ir|ur)(c|f)c?[aou]c?k
5
+ m[ou]th(a|er)(f|ph)[vue]+c?[gkqx]+(e?d|e?r|[ei]?n+g?)
6
+ f[vu]+h*c+[hjvk]*(e?d|e?r|[ei]?n+g?|t)
7
+ b+i+a*t+c+h+(e+d|e*r?[sz]+|[ei]?n+g?|y)
8
+ f[vu]+c?[xkq]+(e?d|e?r|[ei]?n+g?|t)
9
+ p(oo+|u+)s+(a+y+|e+|eh|ey|i+|ie|y+)s*
10
+ m[ou]th(a|er)(f|ph)[vue]+c?[gkqx]+
11
+ f[vu]+h*q(e?d|e?r|[ei]?n+g?|t)
12
+ f[chj]?(a|au|aw|e|o+|u|uy)[ch]*k
13
+ f[ck]+(e?d|e?r|[ei]?n+g?|t)
14
+ kill your self
15
+ fu󠀡󠀡cki󠀡󠀡ng
16
+ dumb(f|ph)[vu]+c?[xkq]
17
+ c(oc?|aw)ksuc?k
18
+ d[il]+(ck|kc)(ed|ing)
19
+ ph[vu]+h*c+[hjvk]*
20
+ sonofabitch
21
+ who+re(ed|ing|s)
22
+ c(o|ah|aw)c?k(s|ed)
23
+ s+h+[ily]+t+[ersy]*
24
+ a+ss+w?hole?
25
+ ph[vu]+c?[xkq]+
26
+ f[vu]+c?[xkq]+
27
+ bull+sh+[ily]+t+
28
+ goddamn?it
29
+ goddamn?
30
+ a+ss+fu
31
+ c(o|ah|aw)c?k
32
+ d[il]+(ck|kc)
33
+ b+i+a*t+c+h+
34
+ buttfu
35
+ cumm(er|ing)
36
+ f[vu]+h*c
37
+ f[vu]+h*q
38
+ sh+i+[ae]+t+
39
+ c+u+n+t+
40
+ d+i+c+k+s*
41
+ a+sholes?
42
+ sh+e+i+t+
43
+ F\.U\.C\.K
44
+ chodes?
45
+ cumshot
46
+ fucking
47
+ shit
48
+ shtpost
49
+ who+re?
50
+ cumbag
51
+ twats?
52
+ cums?
53
+ dafuq
54
+ diсk
55
+ coq+
56
+ hore
57
+ kock
58
+ shat
59
+ dik
60
+ fck
61
+ negr[ao]+e?s?
62
+ f+u+c+k+
63
+ fцск
64
+ shіtty
65
+ f_uc_k
66
+ pussy
@@ -0,0 +1,57 @@
1
+ perkeleeseen
2
+ perkeleessä
3
+ perkeleestä
4
+ perkeleelle
5
+ perkelettä
6
+ vittupään
7
+ vittupäät
8
+ paskanaama
9
+ vittupää
10
+ jumalauta
11
+ paskattaa
12
+ paskoihin
13
+ perkeleen
14
+ perkeleet
15
+ mulkeron
16
+ mulkerot
17
+ paskalla
18
+ paskalle
19
+ paskalta
20
+ paskassa
21
+ paskasta
22
+ paskojen
23
+ saatanaa
24
+ saatanan
25
+ saatanat
26
+ aashole
27
+ hitolla
28
+ hitossa
29
+ hitosta
30
+ hittoon
31
+ kyrpiä
32
+ mulkero
33
+ paskaan
34
+ paskoja
35
+ perkele
36
+ perkule
37
+ perseen
38
+ pissata
39
+ saatana
40
+ kyrpii
41
+ paskaa
42
+ paskan
43
+ paskat
44
+ paskoi
45
+ paskoo
46
+ paskot
47
+ pissaa
48
+ hiton
49
+ hitot
50
+ hitto
51
+ paska
52
+ perse
53
+ pissa
54
+ prkle
55
+ viddu
56
+ prkl
57
+ prk