badwords-py 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- badwords/__init__.py +5 -0
- badwords/check.py +148 -0
- badwords/exceptions.py +16 -0
- badwords/resource/br.bdw +119 -0
- badwords/resource/character_frequency.json +12 -0
- badwords/resource/cz.bdw +51 -0
- badwords/resource/da.bdw +60 -0
- badwords/resource/de.bdw +62 -0
- badwords/resource/du.bdw +39 -0
- badwords/resource/en.bdw +66 -0
- badwords/resource/fi.bdw +57 -0
- badwords/resource/fr.bdw +112 -0
- badwords/resource/gr.bdw +470 -0
- badwords/resource/homoglyphs.json +28 -0
- badwords/resource/hu.bdw +79 -0
- badwords/resource/in.bdw +119 -0
- badwords/resource/it.bdw +153 -0
- badwords/resource/ja.bdw +24 -0
- badwords/resource/ko.bdw +175 -0
- badwords/resource/lt.bdw +153 -0
- badwords/resource/no.bdw +62 -0
- badwords/resource/pl.bdw +6834 -0
- badwords/resource/po.bdw +83 -0
- badwords/resource/ro.bdw +8 -0
- badwords/resource/ru.bdw +3693 -0
- badwords/resource/sp.bdw +395 -0
- badwords/resource/sw.bdw +16 -0
- badwords/resource/th.bdw +13 -0
- badwords/resource/transliteration.json +9 -0
- badwords/resource/tu.bdw +780 -0
- badwords/resource/ua.bdw +1965 -0
- badwords/resource/unicode_mappings.json +114 -0
- badwords/text_processor.py +207 -0
- badwords_py-2.1.0.dist-info/METADATA +201 -0
- badwords_py-2.1.0.dist-info/RECORD +38 -0
- badwords_py-2.1.0.dist-info/WHEEL +5 -0
- badwords_py-2.1.0.dist-info/licenses/LICENSE +9 -0
- badwords_py-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
{
|
|
2
|
+
"canadian_aboriginal": {
|
|
3
|
+
"ᑎ": "n", "ᑌ": "u", "ᙐ": "d", "ᗪ": "d", "ᗣ": "a",
|
|
4
|
+
"ᐃ": "i", "ᐅ": "o", "ᐊ": "a", "ᐁ": "e", "ᐄ": "i",
|
|
5
|
+
"ᐆ": "o", "ᐋ": "a", "ᐍ": "e", "ᐏ": "w", "ᐑ": "wi",
|
|
6
|
+
"ᐓ": "wo", "ᐕ": "wa", "ᐗ": "p", "ᐘ": "t", "ᐚ": "k",
|
|
7
|
+
"ᐜ": "c", "ᐞ": "m", "ᐠ": "n", "ᐢ": "s", "ᐤ": "y",
|
|
8
|
+
"ᐦ": "h", "ᐧ": "w", "ᐨ": "s", "ᐩ": "s", "ᐪ": "s"
|
|
9
|
+
},
|
|
10
|
+
"mathematical": {
|
|
11
|
+
"𝔞": "a", "𝔟": "b", "𝔠": "c", "𝔡": "d", "𝔢": "e",
|
|
12
|
+
"𝔣": "f", "𝔤": "g", "𝔥": "h", "𝔦": "i", "𝔧": "j",
|
|
13
|
+
"𝔨": "k", "𝔩": "l", "𝔪": "m", "𝔫": "n", "𝔬": "o",
|
|
14
|
+
"𝔭": "p", "𝔮": "q", "𝔯": "r", "𝔰": "s", "𝔱": "t",
|
|
15
|
+
"𝔲": "u", "𝔳": "v", "𝔴": "w", "𝔵": "x", "𝔶": "y",
|
|
16
|
+
"𝔷": "z"
|
|
17
|
+
},
|
|
18
|
+
"enclosed": {
|
|
19
|
+
"🅰": "a", "🅱": "b", "🅲": "c", "🅳": "d", "🅴": "e",
|
|
20
|
+
"🅵": "f", "🅶": "g", "🅷": "h", "🅸": "i", "🅹": "j",
|
|
21
|
+
"🅺": "k", "🅻": "l", "🅼": "m", "🅽": "n", "🅾": "o",
|
|
22
|
+
"🅿": "p", "🆀": "q", "🆁": "r", "🆂": "s", "🆃": "t",
|
|
23
|
+
"🆄": "u", "🆅": "v", "🆆": "w", "🆇": "x", "🆈": "y",
|
|
24
|
+
"🆉": "z",
|
|
25
|
+
"🅐": "a", "🅑": "b", "🅒": "c", "🅓": "d", "🅔": "e",
|
|
26
|
+
"🅕": "f", "🅖": "g", "🅗": "h", "🅘": "i", "🅙": "j",
|
|
27
|
+
"🅚": "k", "🅛": "l", "🅜": "m", "🅝": "n", "🅞": "o",
|
|
28
|
+
"🅟": "p", "🅠": "q", "🅡": "r", "🅢": "s", "🅣": "t",
|
|
29
|
+
"🅤": "u", "🅥": "v", "🅦": "w", "🅧": "x", "🅨": "y",
|
|
30
|
+
"🅩": "z"
|
|
31
|
+
},
|
|
32
|
+
"circled": {
|
|
33
|
+
"ⓐ": "a", "ⓑ": "b", "ⓒ": "c", "ⓓ": "d", "ⓔ": "e",
|
|
34
|
+
"ⓕ": "f", "ⓖ": "g", "ⓗ": "h", "ⓘ": "i", "ⓙ": "j",
|
|
35
|
+
"ⓚ": "k", "ⓛ": "l", "ⓜ": "m", "ⓝ": "n", "ⓞ": "o",
|
|
36
|
+
"ⓟ": "p", "ⓠ": "q", "ⓡ": "r", "ⓢ": "s", "ⓣ": "t",
|
|
37
|
+
"ⓤ": "u", "ⓥ": "v", "ⓦ": "w", "ⓧ": "x", "ⓨ": "y",
|
|
38
|
+
"ⓩ": "z"
|
|
39
|
+
},
|
|
40
|
+
"aesthetic": {
|
|
41
|
+
"a": "a", "b": "b", "c": "c", "d": "d", "e": "e",
|
|
42
|
+
"f": "f", "g": "g", "h": "h", "i": "i", "j": "j",
|
|
43
|
+
"k": "k", "l": "l", "m": "m", "n": "n", "o": "o",
|
|
44
|
+
"p": "p", "q": "q", "r": "r", "s": "s", "t": "t",
|
|
45
|
+
"u": "u", "v": "v", "w": "w", "x": "x", "y": "y",
|
|
46
|
+
"z": "z",
|
|
47
|
+
"𝐚": "a", "𝐛": "b", "𝐜": "c", "𝐝": "d", "𝐞": "e",
|
|
48
|
+
"𝐟": "f", "𝐠": "g", "𝐡": "h", "𝐢": "i", "𝐣": "j",
|
|
49
|
+
"𝐤": "k", "𝐥": "l", "𝐦": "m", "𝐧": "n", "𝐨": "o",
|
|
50
|
+
"𝐩": "p", "𝐪": "q", "𝐫": "r", "𝐬": "s", "𝐭": "t",
|
|
51
|
+
"𝐮": "u", "𝐯": "v", "𝐰": "w", "𝐱": "x", "𝐲": "y",
|
|
52
|
+
"𝐳": "z",
|
|
53
|
+
"𝒂": "a", "𝒃": "b", "𝒄": "c", "𝒅": "d", "𝒆": "e",
|
|
54
|
+
"𝒇": "f", "𝒈": "g", "𝒉": "h", "𝒊": "i", "𝒋": "j",
|
|
55
|
+
"𝒌": "k", "𝒍": "l", "𝒎": "m", "𝒏": "n", "𝒐": "o",
|
|
56
|
+
"𝒑": "p", "𝒒": "q", "𝒓": "r", "𝒔": "s", "𝒕": "t",
|
|
57
|
+
"𝒖": "u", "𝒗": "v", "𝒘": "w", "𝒙": "x", "𝒚": "y",
|
|
58
|
+
"𝒛": "z",
|
|
59
|
+
"𝓪": "a", "𝓫": "b", "𝓬": "c", "𝓭": "d", "𝓮": "e",
|
|
60
|
+
"𝓯": "f", "𝓰": "g", "𝓱": "h", "𝓲": "i", "𝓳": "j",
|
|
61
|
+
"𝓴": "k", "𝓵": "l", "𝓶": "m", "𝓷": "n", "𝓸": "o",
|
|
62
|
+
"𝓹": "p", "𝓺": "q", "𝓻": "r", "𝓼": "s", "𝓽": "t",
|
|
63
|
+
"𝓾": "u", "𝓿": "v", "𝔀": "w", "𝔁": "x", "𝔂": "y",
|
|
64
|
+
"𝔃": "z",
|
|
65
|
+
"𝔞": "a", "𝔟": "b", "𝔠": "c", "𝔡": "d", "𝔢": "e",
|
|
66
|
+
"𝔣": "f", "𝔤": "g", "𝔥": "h", "𝔦": "i", "𝔧": "j",
|
|
67
|
+
"𝔨": "k", "𝔩": "l", "𝔪": "m", "𝔫": "n", "𝔬": "o",
|
|
68
|
+
"𝔭": "p", "𝔮": "q", "𝔯": "r", "𝔰": "s", "𝔱": "t",
|
|
69
|
+
"𝔲": "u", "𝔳": "v", "𝔴": "w", "𝔵": "x", "𝔶": "y",
|
|
70
|
+
"𝔷": "z",
|
|
71
|
+
"𝕒": "a", "𝕓": "b", "𝕔": "c", "𝕕": "d", "𝕖": "e",
|
|
72
|
+
"𝕗": "f", "𝕘": "g", "𝕙": "h", "𝕚": "i", "𝕛": "j",
|
|
73
|
+
"𝕜": "k", "𝕝": "l", "𝕞": "m", "𝕟": "n", "𝕠": "o",
|
|
74
|
+
"𝕡": "p", "𝕢": "q", "𝕣": "r", "𝕤": "s", "𝕥": "t",
|
|
75
|
+
"𝕦": "u", "𝕧": "v", "𝕨": "w", "𝕩": "x", "𝕪": "y",
|
|
76
|
+
"𝕫": "z",
|
|
77
|
+
"𝖆": "a", "𝖇": "b", "𝖈": "c", "𝖉": "d", "𝖊": "e",
|
|
78
|
+
"𝖋": "f", "𝖌": "g", "𝖍": "h", "𝖎": "i", "𝖏": "j",
|
|
79
|
+
"𝖐": "k", "𝖑": "l", "𝖒": "m", "𝖓": "n", "𝖔": "o",
|
|
80
|
+
"𝖕": "p", "𝖖": "q", "𝖗": "r", "𝖘": "s", "𝖙": "t",
|
|
81
|
+
"𝖚": "u", "𝖛": "v", "𝖜": "w", "𝖝": "x", "𝖞": "y",
|
|
82
|
+
"𝖟": "z",
|
|
83
|
+
"𝗮": "a", "𝗯": "b", "𝗰": "c", "𝗱": "d", "𝗲": "e",
|
|
84
|
+
"𝗳": "f", "𝗴": "g", "𝗵": "h", "𝗶": "i", "𝗷": "j",
|
|
85
|
+
"𝗸": "k", "𝗹": "l", "𝗺": "m", "𝗻": "n", "𝗼": "o",
|
|
86
|
+
"𝗽": "p", "𝗾": "q", "𝗿": "r", "𝘀": "s", "𝘁": "t",
|
|
87
|
+
"𝘂": "u", "𝘃": "v", "𝘄": "w", "𝘅": "x", "𝘆": "y",
|
|
88
|
+
"𝘇": "z",
|
|
89
|
+
"𝘢": "a", "𝘣": "b", "𝘤": "c", "𝘥": "d", "𝘦": "e",
|
|
90
|
+
"𝘧": "f", "𝘨": "g", "𝘩": "h", "𝘪": "i", "𝘫": "j",
|
|
91
|
+
"𝘬": "k", "𝘭": "l", "𝘮": "m", "𝘯": "n", "𝘰": "o",
|
|
92
|
+
"𝘱": "p", "𝘲": "q", "𝘳": "r", "𝘴": "s", "𝘵": "t",
|
|
93
|
+
"𝘶": "u", "𝘷": "v", "𝘸": "w", "𝘹": "x", "𝘺": "y",
|
|
94
|
+
"𝘻": "z",
|
|
95
|
+
"𝙖": "a", "𝙗": "b", "𝙘": "c", "𝙙": "d", "𝙚": "e",
|
|
96
|
+
"𝙛": "f", "𝙜": "g", "𝙝": "h", "𝙞": "i", "𝙟": "j",
|
|
97
|
+
"𝙠": "k", "𝙡": "l", "𝙢": "m", "𝙣": "n", "𝙤": "o",
|
|
98
|
+
"𝙥": "p", "𝙦": "q", "𝙧": "r", "𝙨": "s", "𝙩": "t",
|
|
99
|
+
"𝙪": "u", "𝙫": "v", "𝙬": "w", "𝙭": "x", "𝙮": "y",
|
|
100
|
+
"𝙯": "z",
|
|
101
|
+
"𝚊": "a", "𝚋": "b", "𝚌": "c", "𝚍": "d", "𝚎": "e",
|
|
102
|
+
"𝚏": "f", "𝚐": "g", "𝚑": "h", "𝚒": "i", "𝚓": "j",
|
|
103
|
+
"𝚔": "k", "𝚕": "l", "𝚖": "m", "𝚗": "n", "𝚘": "o",
|
|
104
|
+
"𝚙": "p", "𝚚": "q", "𝚛": "r", "𝚜": "s", "𝚝": "t",
|
|
105
|
+
"𝚞": "u", "𝚟": "v", "𝚠": "w", "𝚡": "x", "𝚢": "y",
|
|
106
|
+
"𝚣": "z",
|
|
107
|
+
"【": "[", "】": "]", "〖": "[", "〗": "]", "〘": "[", "〙": "]",
|
|
108
|
+
"〚": "[", "〛": "]", "〝": "\"", "〞": "\"", "〟": "\"",
|
|
109
|
+
"〰": "-", "〱": "-", "〲": "-", "〳": "-", "〴": "-",
|
|
110
|
+
"〵": "-", "〶": "-", "〷": "-", "〸": "-", "〹": "-",
|
|
111
|
+
"〺": "-", "〻": "-", "〼": "-", "〽": "-", "〾": "-",
|
|
112
|
+
"〿": "-"
|
|
113
|
+
}
|
|
114
|
+
}
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Module for advanced text processing and normalization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import unicodedata
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, Set
|
|
10
|
+
|
|
11
|
+
class TextProcessor:
|
|
12
|
+
"""A class for advanced text processing and normalization."""
|
|
13
|
+
|
|
14
|
+
def __init__(
|
|
15
|
+
self,
|
|
16
|
+
processing_normalize_text: bool = True,
|
|
17
|
+
processing_aggressive_normalize: bool = True,
|
|
18
|
+
processing_transliterate: bool = True,
|
|
19
|
+
processing_replace_homoglyphs: bool = True,
|
|
20
|
+
) -> None:
|
|
21
|
+
"""Initialize the text processor."""
|
|
22
|
+
self.processing_normalize_text = processing_normalize_text
|
|
23
|
+
self.processing_aggressive_normalize = processing_aggressive_normalize
|
|
24
|
+
self.processing_transliterate = processing_transliterate
|
|
25
|
+
self.processing_replace_homoglyphs = processing_replace_homoglyphs
|
|
26
|
+
|
|
27
|
+
self.resource_dir = Path(__file__).parent / 'resource'
|
|
28
|
+
self.unicode_mappings = self._load_unicode_mappings()
|
|
29
|
+
|
|
30
|
+
if self.processing_replace_homoglyphs == True:
|
|
31
|
+
self.homoglyphs = self._load_homoglyphs()
|
|
32
|
+
|
|
33
|
+
self.character_frequency = self._load_character_frequency()
|
|
34
|
+
|
|
35
|
+
if self.processing_transliterate == True:
|
|
36
|
+
self.cyrillic_to_latin = self._load_transliteration()
|
|
37
|
+
self.latin_to_cyrillic = {v: k for k, v in self.cyrillic_to_latin.items()}
|
|
38
|
+
|
|
39
|
+
if self.processing_replace_homoglyphs == True:
|
|
40
|
+
self._build_homoglyph_map()
|
|
41
|
+
|
|
42
|
+
self._build_frequency_map()
|
|
43
|
+
|
|
44
|
+
def _load_unicode_mappings(self) -> Dict[str, str]:
|
|
45
|
+
"""Load Unicode mappings from JSON file."""
|
|
46
|
+
with open(self.resource_dir / 'unicode_mappings.json', 'r', encoding='utf-8') as f:
|
|
47
|
+
data = json.load(f)
|
|
48
|
+
mappings = {}
|
|
49
|
+
for category in data.values():
|
|
50
|
+
mappings.update(category)
|
|
51
|
+
return mappings
|
|
52
|
+
|
|
53
|
+
def _load_homoglyphs(self) -> Dict[str, list[str]]:
|
|
54
|
+
"""Load homoglyph mappings from JSON file."""
|
|
55
|
+
with open(self.resource_dir / 'homoglyphs.json', 'r', encoding='utf-8') as f:
|
|
56
|
+
return json.load(f)
|
|
57
|
+
|
|
58
|
+
def _load_character_frequency(self) -> Dict[str, list[str]]:
|
|
59
|
+
"""Load character frequency mappings from JSON file."""
|
|
60
|
+
with open(self.resource_dir / 'character_frequency.json', 'r', encoding='utf-8') as f:
|
|
61
|
+
return json.load(f)
|
|
62
|
+
|
|
63
|
+
def _load_transliteration(self) -> Dict[str, str]:
|
|
64
|
+
"""Load transliteration mappings from JSON file."""
|
|
65
|
+
with open(self.resource_dir / 'transliteration.json', 'r', encoding='utf-8') as f:
|
|
66
|
+
data = json.load(f)
|
|
67
|
+
return data['cyrillic_to_latin']
|
|
68
|
+
|
|
69
|
+
def _build_homoglyph_map(self) -> None:
|
|
70
|
+
"""Build a comprehensive homoglyph map."""
|
|
71
|
+
self.homoglyph_map: Dict[str, Set[str]] = {}
|
|
72
|
+
for standard, variants in self.homoglyphs.items():
|
|
73
|
+
self.homoglyph_map[standard] = set(variants)
|
|
74
|
+
for variant in variants:
|
|
75
|
+
if variant not in self.homoglyph_map:
|
|
76
|
+
self.homoglyph_map[variant] = set()
|
|
77
|
+
self.homoglyph_map[variant].add(standard)
|
|
78
|
+
|
|
79
|
+
def _build_frequency_map(self) -> None:
|
|
80
|
+
"""Build a comprehensive frequency-based substitution map."""
|
|
81
|
+
self.frequency_map: Dict[str, Set[str]] = {}
|
|
82
|
+
for standard, variants in self.character_frequency.items():
|
|
83
|
+
self.frequency_map[standard] = set(variants)
|
|
84
|
+
for variant in variants:
|
|
85
|
+
if variant not in self.frequency_map:
|
|
86
|
+
self.frequency_map[variant] = set()
|
|
87
|
+
self.frequency_map[variant].add(standard)
|
|
88
|
+
|
|
89
|
+
def normalize_unicode(self, text: str) -> str:
|
|
90
|
+
"""Normalize Unicode characters to their basic form.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
text: Input text to normalize.
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
Normalized text.
|
|
97
|
+
"""
|
|
98
|
+
text = unicodedata.normalize('NFKC', text)
|
|
99
|
+
|
|
100
|
+
text = ''.join(c for c in text if not unicodedata.combining(c))
|
|
101
|
+
|
|
102
|
+
text = text.lower()
|
|
103
|
+
|
|
104
|
+
result = []
|
|
105
|
+
for char in text:
|
|
106
|
+
if char in self.unicode_mappings:
|
|
107
|
+
result.append(self.unicode_mappings[char])
|
|
108
|
+
else:
|
|
109
|
+
result.append(char)
|
|
110
|
+
|
|
111
|
+
return ''.join(result)
|
|
112
|
+
|
|
113
|
+
def normalize_text(self, text: str) -> str:
|
|
114
|
+
"""Normalize text by converting to lowercase and removing diacritics.
|
|
115
|
+
|
|
116
|
+
Args:
|
|
117
|
+
text: Input text to normalize.
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
Normalized text.
|
|
121
|
+
"""
|
|
122
|
+
text = self.normalize_unicode(text)
|
|
123
|
+
|
|
124
|
+
text = re.sub(r'[^\w\s]', '', text)
|
|
125
|
+
|
|
126
|
+
return text
|
|
127
|
+
|
|
128
|
+
def aggressive_normalize(self, text: str) -> str:
|
|
129
|
+
"""Perform aggressive text normalization.
|
|
130
|
+
|
|
131
|
+
Args:
|
|
132
|
+
text: Input text to normalize.
|
|
133
|
+
|
|
134
|
+
Returns:
|
|
135
|
+
Aggressively normalized text.
|
|
136
|
+
"""
|
|
137
|
+
text = self.normalize_unicode(text)
|
|
138
|
+
|
|
139
|
+
text = ''.join(c for c in text if c.isalnum() or c.isspace())
|
|
140
|
+
|
|
141
|
+
text = ' '.join(text.split())
|
|
142
|
+
|
|
143
|
+
return text
|
|
144
|
+
|
|
145
|
+
def transliterate(self, text: str, to_latin: bool = True) -> str:
|
|
146
|
+
"""Transliterate text between Cyrillic and Latin.
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
text: Input text to transliterate.
|
|
150
|
+
to_latin: If True, convert to Latin; if False, convert to Cyrillic.
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
Transliterated text.
|
|
154
|
+
"""
|
|
155
|
+
mapping = self.cyrillic_to_latin if to_latin else self.latin_to_cyrillic
|
|
156
|
+
result = []
|
|
157
|
+
|
|
158
|
+
for char in text:
|
|
159
|
+
if char in mapping:
|
|
160
|
+
result.append(mapping[char])
|
|
161
|
+
else:
|
|
162
|
+
result.append(char)
|
|
163
|
+
|
|
164
|
+
return ''.join(result)
|
|
165
|
+
|
|
166
|
+
def replace_homoglyphs(self, text: str) -> str:
|
|
167
|
+
"""Replace homoglyphs with their standard equivalents.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
text: Input text to process.
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
Text with homoglyphs replaced.
|
|
174
|
+
"""
|
|
175
|
+
result = []
|
|
176
|
+
for char in text:
|
|
177
|
+
if char in self.homoglyph_map and self.homoglyph_map[char]:
|
|
178
|
+
result.append(next(iter(self.homoglyph_map[char])))
|
|
179
|
+
else:
|
|
180
|
+
result.append(char)
|
|
181
|
+
return ''.join(result)
|
|
182
|
+
|
|
183
|
+
def process_text(self, text: str) -> str:
|
|
184
|
+
"""Apply all text processing steps in sequence.
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
text: Input text to process.
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
Fully processed text.
|
|
191
|
+
"""
|
|
192
|
+
|
|
193
|
+
txt = text
|
|
194
|
+
if self.processing_normalize_text == True:
|
|
195
|
+
txt = self.normalize_text(txt)
|
|
196
|
+
|
|
197
|
+
if self.processing_aggressive_normalize == True:
|
|
198
|
+
txt = self.aggressive_normalize(txt)
|
|
199
|
+
|
|
200
|
+
if self.processing_transliterate == True:
|
|
201
|
+
txt = self.transliterate(txt, to_latin=True)
|
|
202
|
+
txt = self.transliterate(txt, to_latin=False)
|
|
203
|
+
|
|
204
|
+
if self.processing_replace_homoglyphs == True:
|
|
205
|
+
txt = self.replace_homoglyphs(txt)
|
|
206
|
+
|
|
207
|
+
return txt
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: badwords-py
|
|
3
|
+
Version: 2.1.0
|
|
4
|
+
Summary: This is a library for effective moderation of content.
|
|
5
|
+
Author-email: iamlostshe <vanamelcikov7275@gmail.com>, FlacSy <flacsy.x@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/FlacSy/badwords
|
|
8
|
+
Project-URL: Repository, https://github.com/FlacSy/badwords.git
|
|
9
|
+
Project-URL: Issues, https://github.com/FlacSy/badwords/issues
|
|
10
|
+
Keywords: moderation,content filtering,obscenity detection,mood analysis,image moderation
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
18
|
+
Classifier: Intended Audience :: Developers
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
<div align="center">
|
|
26
|
+
|
|
27
|
+
# 🚫 BadWords
|
|
28
|
+
|
|
29
|
+
**High-performance profanity filter for Python with multilingual support and evasion detection.**
|
|
30
|
+
|
|
31
|
+
[](https://www.python.org/)
|
|
32
|
+
[](https://opensource.org/licenses/MIT)
|
|
33
|
+
[](#)
|
|
34
|
+
[](https://pypi.org/project/bad-words/)
|
|
35
|
+
|
|
36
|
+
[Installation](#-installation) • [Quick Start](#-quick-start) • [Supported Languages](#-supported-languages) • [Advanced Evasion Detection](#-advanced-evasion-detection)
|
|
37
|
+
|
|
38
|
+
</div>
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## 📖 Description
|
|
43
|
+
|
|
44
|
+
`BadWords` is a sophisticated profanity filtering library designed to clean up user-generated content. Unlike simple keyword matching, it uses **similarity scoring**, **homoglyph detection**, and **transliteration** to catch even the most cleverly disguised insults.
|
|
45
|
+
|
|
46
|
+
## 📦 Installation
|
|
47
|
+
|
|
48
|
+
### Requirements
|
|
49
|
+
- **Recommended:** Python 3.13
|
|
50
|
+
- **Minimum:** Python 3.10+
|
|
51
|
+
|
|
52
|
+
### Install via GitHub
|
|
53
|
+
```bash
|
|
54
|
+
pip install git+https://github.com/FlacSy/badwords.git
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Install via PyPI
|
|
59
|
+
```bash
|
|
60
|
+
pip install badwords-py
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## ⚡ Quick Start
|
|
66
|
+
|
|
67
|
+
### Basic Initialization
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from badwords import ProfanityFilter
|
|
71
|
+
|
|
72
|
+
# Initialize filter
|
|
73
|
+
p = ProfanityFilter()
|
|
74
|
+
|
|
75
|
+
# Load specific languages (e.g., English and Russian)
|
|
76
|
+
p.init(languages=["en", "ru"])
|
|
77
|
+
|
|
78
|
+
# Or load ALL 26+ supported languages
|
|
79
|
+
p.init()
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Checking and Filtering Text
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
text = "Some very b4d text here"
|
|
87
|
+
|
|
88
|
+
# 1. Simple check (Returns Boolean)
|
|
89
|
+
is_bad = p.filter_text(text)
|
|
90
|
+
print(is_bad) # True
|
|
91
|
+
|
|
92
|
+
# 2. Censoring text (Returns String)
|
|
93
|
+
clean_text = p.filter_text(text, replace_character="*")
|
|
94
|
+
print(clean_text) # "Some very *** text here"
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## 🛠 Methods & API
|
|
101
|
+
|
|
102
|
+
### `filter_text(text, match_threshold=0.8, replace_character=None)`
|
|
103
|
+
|
|
104
|
+
The core method of the library.
|
|
105
|
+
|
|
106
|
+
| Parameter | Type | Default | Description |
|
|
107
|
+
| --- | --- | --- | --- |
|
|
108
|
+
| `text` | `str` | Required | Input text to check. |
|
|
109
|
+
| `match_threshold` | `float` | `0.8` | Similarity threshold (1.0 = exact match, 0.7 = aggressive). |
|
|
110
|
+
| `replace_character` | `str/None` | `None` | If provided, returns censored string. If None, returns bool. |
|
|
111
|
+
|
|
112
|
+
> [!WARNING]
|
|
113
|
+
> **Performance Tip:** Using `match_threshold < 1.0` enables fuzzy matching which is slower. Use `1.0` for high-traffic real-time filtering, or `0.95` for a good balance.
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## 🧩 Advanced Evasion Detection
|
|
118
|
+
|
|
119
|
+
Standard filters are easy to bypass. `BadWords` is built to detect:
|
|
120
|
+
|
|
121
|
+
* **Homoglyphs:** Detects `hеllo` (using Cyrillic 'е') or `h4llo` (numbers).
|
|
122
|
+
* **Transliteration:** Automatically handles mapping between Cyrillic and Latin alphabets.
|
|
123
|
+
* **Normalization:** Strips diacritics, special characters, and decorative Unicode symbols.
|
|
124
|
+
* **Similarity Analysis:** Uses fuzzy matching to find words with deliberate typos.
|
|
125
|
+
|
|
126
|
+
### Examples of detected evasions:
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
p.filter_text("hеllо") # Mixed alphabets (Cyrillic + Latin) -> DETECTED
|
|
130
|
+
p.filter_text("h3ll0") # Character substitution -> DETECTED
|
|
131
|
+
p.filter_text("h⍺llo") # Mathematical/Greek symbols -> DETECTED
|
|
132
|
+
p.filter_text("привет") # Transliterated matches -> DETECTED
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## 🌍 Supported Languages
|
|
139
|
+
|
|
140
|
+
`BadWords` currently supports **26 languages** out of the box:
|
|
141
|
+
|
|
142
|
+
| Code | Language | Code | Language | Code | Language |
|
|
143
|
+
| --- | --- | --- | --- | --- | --- |
|
|
144
|
+
| `en` | English | `ru` | Russian | `ua` | Ukrainian |
|
|
145
|
+
| `de` | German | `fr` | French | `it` | Italian |
|
|
146
|
+
| `sp` | Spanish | `pl` | Polish | `cz` | Czech |
|
|
147
|
+
| `ja` | Japanese | `ko` | Korean | `th` | Thai |
|
|
148
|
+
| ... | & 14 more | | | | |
|
|
149
|
+
|
|
150
|
+
*Use `p.get_all_languages()` to see the full list in your code.*
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## 🚀 Full Integration Example
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from badwords import ProfanityFilter
|
|
158
|
+
|
|
159
|
+
def monitor_chat():
|
|
160
|
+
# Setup for a global chat
|
|
161
|
+
profanity_filter = ProfanityFilter()
|
|
162
|
+
profanity_filter.init(["en", "ru", "de"])
|
|
163
|
+
|
|
164
|
+
# Custom project-specific banned words
|
|
165
|
+
profanity_filter.add_words(["spam_link_v1", "scam_bot_99"])
|
|
166
|
+
|
|
167
|
+
user_input = "Hey! Check out this b.a.d.w.o.r.d"
|
|
168
|
+
|
|
169
|
+
# Moderate with high accuracy
|
|
170
|
+
is_offensive = profanity_filter.filter_text(user_input, match_threshold=0.95)
|
|
171
|
+
|
|
172
|
+
if is_offensive:
|
|
173
|
+
print("Message blocked: Contains restricted language.")
|
|
174
|
+
else:
|
|
175
|
+
# Proceed with processing
|
|
176
|
+
pass
|
|
177
|
+
|
|
178
|
+
if __name__ == "__main__":
|
|
179
|
+
monitor_chat()
|
|
180
|
+
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## 🤝 Contributing
|
|
186
|
+
|
|
187
|
+
Contributions are what make the open-source community an amazing place to learn, inspire, and create.
|
|
188
|
+
|
|
189
|
+
1. Fork the Project
|
|
190
|
+
2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
|
|
191
|
+
3. Commit your Changes (`git commit -m 'Add AmazingFeature'`)
|
|
192
|
+
4. Push to the Branch (`git push origin feature/AmazingFeature`)
|
|
193
|
+
5. Open a Pull Request
|
|
194
|
+
|
|
195
|
+
## 📄 License
|
|
196
|
+
|
|
197
|
+
Distributed under the MIT License. See `LICENSE` for more information.
|
|
198
|
+
|
|
199
|
+
<div align="center">
|
|
200
|
+
<sub>Developed with ❤️ by <a href="https://github.com/FlacSy">FlacSy</a></sub>
|
|
201
|
+
</div>
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
badwords/__init__.py,sha256=vcOyALowOTBXKB6J_71Qc88mOJ3hr-phZsAGMKQ7mXI,120
|
|
2
|
+
badwords/check.py,sha256=Gbi0EmNKsc6GUBfHyLRavdpp9ZtM0i4GbHbdttRYojM,5575
|
|
3
|
+
badwords/exceptions.py,sha256=D3L-BuQdH5M-pGpk0uW7MQPr9h1ZOPNFzH6NtP4T_Jo,367
|
|
4
|
+
badwords/text_processor.py,sha256=dlFsoW678I9SCffGzNVp0OBG-D-8AkA1wGO-KuDCcsg,7107
|
|
5
|
+
badwords/resource/br.bdw,sha256=Pl9btZq3bO9i_aOjnU58fS7abJka5XQFZEOmE8sya_8,945
|
|
6
|
+
badwords/resource/character_frequency.json,sha256=60zRDkpvGiuttFYvaN_hXlxlNau1MluaMYPVxzGEyps,250
|
|
7
|
+
badwords/resource/cz.bdw,sha256=Tvc1-6BXbFQwEnKXJQvPRhv_aywhdnGN0ic5tn8j8k8,418
|
|
8
|
+
badwords/resource/da.bdw,sha256=eyZI-MtJTgIy-UvmeGTHM4ve50zn5ZVfokNXG_LFl20,550
|
|
9
|
+
badwords/resource/de.bdw,sha256=th4WYmpUrSs7qeOi6hOtJ25rXhWYrdqzJn3fmlGfP7c,683
|
|
10
|
+
badwords/resource/du.bdw,sha256=tPa6X8xDtWIH04cbmdJdozJvnB0KF_KjyrlvLBKMAmc,349
|
|
11
|
+
badwords/resource/en.bdw,sha256=Tv4QyDGlfs8gG2jf3xN4JKhfVLGpiA2Ewy2rYVtoxuU,1169
|
|
12
|
+
badwords/resource/fi.bdw,sha256=xrd-KtKJ2UzOuTjJfXkQGqoYiWur4xjN2KC7rej3aEw,485
|
|
13
|
+
badwords/resource/fr.bdw,sha256=R9Eog8PzzNqmfOqCmisoyq_bHAmxqAhKQ7FeDMGRKDg,1009
|
|
14
|
+
badwords/resource/gr.bdw,sha256=fUXS3-fulVN9iXn9Rk26fPHyK1Bx9lzG4ANOlstWK3c,9130
|
|
15
|
+
badwords/resource/homoglyphs.json,sha256=iNzL-X4Q_IHqyc5a5v2RviZ5v5GuVQM4TldBykVQ5HM,515
|
|
16
|
+
badwords/resource/hu.bdw,sha256=Nwi_yQ8TMa5GSeG3dFGNk5XC9gfxVpvLCnRzJM6tZJQ,618
|
|
17
|
+
badwords/resource/in.bdw,sha256=AkA2apiqw5mjswcsj3fIboJANt9rO6XTgzeN8Kgkj2o,813
|
|
18
|
+
badwords/resource/it.bdw,sha256=WnGXJBsVCDmbqNHSkiVo05GI2QRVEl70IO_8rzoa8OQ,1377
|
|
19
|
+
badwords/resource/ja.bdw,sha256=EfHxZ73C8LWYYiwlCyK9lJV8sFuGoZxC2YYCUnkQ_eE,194
|
|
20
|
+
badwords/resource/ko.bdw,sha256=vkPIdZIufbVqbite_6IiXNLyd8DVIRTYAHL8ce50q6I,1919
|
|
21
|
+
badwords/resource/lt.bdw,sha256=Cfzi7xS7ZO3ZYMDwK0iVkaBJW4Zf44HiN5BbLuLx_O4,1476
|
|
22
|
+
badwords/resource/no.bdw,sha256=noIurraqWgBZfta8HmqJoB0odGD9jkkgt2joHp23B-A,543
|
|
23
|
+
badwords/resource/pl.bdw,sha256=CG8sPO7Z-RXDgrkfQ_FMIJncpEo9RkOGelJFm3ZaOIY,79009
|
|
24
|
+
badwords/resource/po.bdw,sha256=OGtiiU6aMmhyTAm1H8KiMXgQC3Rzwa8O-xgoWe-IOfo,669
|
|
25
|
+
badwords/resource/ro.bdw,sha256=8zQVyfvnvhFhqW7nFo3lqTYUbK1lIs8ZPn4TFTp4eA8,50
|
|
26
|
+
badwords/resource/ru.bdw,sha256=U1X99mV7h9XFm5emLFIQoeq-hNaWACG0ieGpRbASJjM,81318
|
|
27
|
+
badwords/resource/sp.bdw,sha256=kLglEbtkN6Z7x2MFfUqfJt1jtkN1dC_gtlepqC5ocDU,3790
|
|
28
|
+
badwords/resource/sw.bdw,sha256=NOgNNv3DbKbUQkOkU_-UBLzar9tbnBqbzUN1vlDVlpE,120
|
|
29
|
+
badwords/resource/th.bdw,sha256=_hrKv4Rmfqeo8eDFTK-l-lHGN3_GGE9jxS4V5XF19U0,186
|
|
30
|
+
badwords/resource/transliteration.json,sha256=jCKYi-EVbnLSKbgEoFd7mfKwGMV4V9ko2QiLwVdjuXs,445
|
|
31
|
+
badwords/resource/tu.bdw,sha256=9qI5G2chVRHn4sqmw0rqtYJv4KrU4s1PGcBYel0jfq8,8864
|
|
32
|
+
badwords/resource/ua.bdw,sha256=1X2ld6-Dz8ocnRvOuaAy8dWyaQLNLSphZ9YQrOvPwDI,44475
|
|
33
|
+
badwords/resource/unicode_mappings.json,sha256=Q7lLaLUtI_LJaY2aas8C3FpR1V29_KlZgdnLHr5S5iY,6662
|
|
34
|
+
badwords_py-2.1.0.dist-info/licenses/LICENSE,sha256=HbGX33ESqv7YurvAg5UKhkwh-7CC2do77sMZsSj2Z6s,1029
|
|
35
|
+
badwords_py-2.1.0.dist-info/METADATA,sha256=PrZPm1oqMoy4ns3Dp7j1TCmzuuSVG01p81U7HVMoGlY,6318
|
|
36
|
+
badwords_py-2.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
37
|
+
badwords_py-2.1.0.dist-info/top_level.txt,sha256=SIGWaKBUlVaNjwc85Ypds1-U94feUaPyyo-hsdA9yCk,9
|
|
38
|
+
badwords_py-2.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 FlacSy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so.
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
badwords
|