profanity-filter 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +0 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +21 -0
- data/README.md +54 -0
- data/Rakefile +10 -0
- data/bin/console +7 -0
- data/bin/setup +8 -0
- data/lib/profanity_dictionaries/en.yaml +166 -0
- data/lib/profanity_dictionaries/es.yaml +68 -0
- data/lib/profanity_dictionaries/leet_strategy_dictionary.yaml +1716 -0
- data/lib/profanity_dictionaries/partial_match.yaml +1 -0
- data/lib/profanity_dictionaries/pt.yaml +74 -0
- data/lib/profanity_filter.rb +160 -0
- data/lib/profanity_filter/engines/allow_duplicate_characters_strategy.rb +15 -0
- data/lib/profanity_filter/engines/allow_symbols_in_words_strategy.rb +16 -0
- data/lib/profanity_filter/engines/component.rb +17 -0
- data/lib/profanity_filter/engines/composite.rb +37 -0
- data/lib/profanity_filter/engines/exact_match_strategy.rb +35 -0
- data/lib/profanity_filter/engines/leet_exact_match_strategy.rb +40 -0
- data/lib/profanity_filter/engines/partial_match_strategy.rb +28 -0
- data/lib/profanity_filter/engines/regexp_strategy.rb +37 -0
- data/lib/profanity_filter/version.rb +3 -0
- data/profanity_filter.gemspec +37 -0
- metadata +162 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
- 🖕
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
- aborto
|
|
2
|
+
- amador
|
|
3
|
+
- ânus
|
|
4
|
+
- aranha
|
|
5
|
+
- ariano
|
|
6
|
+
- balalao
|
|
7
|
+
- bastardo
|
|
8
|
+
- bicha
|
|
9
|
+
- biscate
|
|
10
|
+
- bissexual
|
|
11
|
+
- boceta
|
|
12
|
+
- boob
|
|
13
|
+
- bosta
|
|
14
|
+
- braulio de borracha
|
|
15
|
+
- bumbum
|
|
16
|
+
- burro
|
|
17
|
+
- cabrao
|
|
18
|
+
- cacete
|
|
19
|
+
- cagar
|
|
20
|
+
- camisinha
|
|
21
|
+
- caralho
|
|
22
|
+
- cerveja
|
|
23
|
+
- chochota
|
|
24
|
+
- chupar
|
|
25
|
+
- clitoris
|
|
26
|
+
- cocaína
|
|
27
|
+
- colhoes
|
|
28
|
+
- comer
|
|
29
|
+
- cona
|
|
30
|
+
- consolo
|
|
31
|
+
- corno
|
|
32
|
+
- cu
|
|
33
|
+
- dar o rabo
|
|
34
|
+
- dum raio
|
|
35
|
+
- esporra
|
|
36
|
+
- fecal
|
|
37
|
+
- filho da puta
|
|
38
|
+
- foda
|
|
39
|
+
- foda-se
|
|
40
|
+
- foder
|
|
41
|
+
- frango assado
|
|
42
|
+
- gozar
|
|
43
|
+
- grelho
|
|
44
|
+
- heroína
|
|
45
|
+
- heterosexual
|
|
46
|
+
- homem gay
|
|
47
|
+
- homoerótico
|
|
48
|
+
- homosexual
|
|
49
|
+
- inferno
|
|
50
|
+
- lésbica
|
|
51
|
+
- lolita
|
|
52
|
+
- mama
|
|
53
|
+
- merda
|
|
54
|
+
- paneleiro
|
|
55
|
+
- passar um cheque
|
|
56
|
+
- pau
|
|
57
|
+
- peidar
|
|
58
|
+
- pênis
|
|
59
|
+
- pinto
|
|
60
|
+
- porra
|
|
61
|
+
- puta
|
|
62
|
+
- puta que pariu
|
|
63
|
+
- puta que te pariu
|
|
64
|
+
- queca
|
|
65
|
+
- sacanagem
|
|
66
|
+
- saco
|
|
67
|
+
- torneira
|
|
68
|
+
- transar
|
|
69
|
+
- vai-te foder
|
|
70
|
+
- vai tomar no cu
|
|
71
|
+
- veado
|
|
72
|
+
- vibrador
|
|
73
|
+
- xana
|
|
74
|
+
- xochota
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'profanity_filter/version'
|
|
4
|
+
require 'profanity_filter/engines/composite'
|
|
5
|
+
require 'profanity_filter/engines/partial_match_strategy'
|
|
6
|
+
require 'profanity_filter/engines/allow_duplicate_characters_strategy'
|
|
7
|
+
require 'profanity_filter/engines/allow_symbols_in_words_strategy'
|
|
8
|
+
require 'profanity_filter/engines/leet_exact_match_strategy'
|
|
9
|
+
require 'web_purify'
|
|
10
|
+
|
|
11
|
+
require 'timeout'
require 'yaml'

# Entry point of the gem: checks phrases against the bundled dictionaries and,
# when an API key was supplied, falls back to the WebPurify service for
# phrases the local dictionaries consider clean.
class ProfanityFilter
  # Languages always sent to WebPurify.
  WP_DEFAULT_LANGS = %i[en sp pt].freeze
  # Language codes that may be added to a WebPurify request.
  WP_AVAILABLE_LANGS = %i[
    en ar fr de hi jp it pt ru sp th tr zh kr pa
  ].freeze
  # Maps two-letter codes passed by callers to the codes used for WebPurify.
  WP_LANG_CONVERSIONS = { es: :sp, ko: :kr, ja: :jp }.freeze

  attr_reader :strict_filter, :tolerant_filter

  # @param web_purifier_api_key [String, nil] when given, a WebPurify client is
  #   created and used as a fallback for phrases the local filters accept.
  def initialize(web_purifier_api_key: nil)
    @wp_client = web_purifier_api_key ? WebPurify::Client.new(web_purifier_api_key) : nil

    exact_match_dictionary = load_exact_match_dictionary
    partial_match_dictionary = load_partial_match_dictionary

    allow_symbol_strategy = ::ProfanityFilterEngine::AllowSymbolsInWordsStrategy.new(
      dictionary: exact_match_dictionary,
      ignore_case: true
    )
    duplicate_characters_strategy = ::ProfanityFilterEngine::AllowDuplicateCharactersStrategy.new(
      dictionary: exact_match_dictionary,
      ignore_case: true
    )
    leet_strategy = ::ProfanityFilterEngine::LeetExactMatchStrategy.new(
      dictionary: exact_match_dictionary,
      ignore_case: true
    )
    partial_match_strategy = ::ProfanityFilterEngine::PartialMatchStrategy.new(
      dictionary: partial_match_dictionary,
      ignore_case: true
    )

    # The strict filter runs every strategy.
    @strict_filter = ::ProfanityFilterEngine::Composite.new
    @strict_filter.add_strategies(
      leet_strategy,
      allow_symbol_strategy,
      partial_match_strategy,
      duplicate_characters_strategy
    )

    # The tolerant filter skips the leet and duplicate-character strategies.
    @tolerant_filter = ::ProfanityFilterEngine::Composite.new
    @tolerant_filter.add_strategies(
      allow_symbol_strategy,
      partial_match_strategy
    )
  end

  # Tells whether +phrase+ is considered profane.
  #
  # @param phrase [String, nil] text to check; nil and '' are never profane.
  # @param lang [String, Symbol, nil] extra language hint for WebPurify.
  # @param strictness [Symbol] :strict selects the strict filter; any other
  #   value selects the tolerant filter.
  # @return [Boolean]
  def profane?(phrase, lang: nil, strictness: :tolerant)
    return false if blank_phrase?(phrase)
    return true if pf_profane?(phrase, strictness: strictness)

    # Only consult WebPurify when the local filters found nothing.
    use_webpurify? && wp_profane?(phrase, lang: lang)
  end

  # Counts the profane words found in +phrase+.
  #
  # @param phrase [String, nil] text to check; nil and '' yield 0.
  # @param lang [String, Symbol, nil] extra language hint for WebPurify.
  # @param strictness [Symbol] :strict or :tolerant (see #profane?).
  # @return [Integer] the local count, or the WebPurify count when the local
  #   count is zero and WebPurify answered.
  def profanity_count(phrase, lang: nil, strictness: :tolerant)
    return 0 if blank_phrase?(phrase)

    local_count = pf_profanity_count(phrase, strictness: strictness)
    return local_count unless local_count == 0 && use_webpurify?

    # A nil answer means the remote call failed; keep the local count then.
    remote_count = wp_profanity_count(phrase, lang: lang)
    remote_count.nil? ? local_count : remote_count
  end

  private

  # @return [Boolean] true for nil and for the empty string.
  def blank_phrase?(phrase)
    phrase.nil? || phrase == ''
  end

  # @return [Boolean] whether a WebPurify client was configured.
  def use_webpurify?
    !!@wp_client
  end

  # Picks the composite filter for the requested strictness; anything other
  # than :strict uses the tolerant filter.
  def filter(strictness: :tolerant)
    strictness == :strict ? @strict_filter : @tolerant_filter
  end

  def pf_profane?(phrase, strictness: :tolerant)
    filter(strictness: strictness).profane?(phrase)
  end

  def pf_profanity_count(phrase, strictness: :tolerant)
    filter(strictness: strictness).profanity_count(phrase)
  end

  # @return [Boolean] true only when WebPurify answered with a non-zero count.
  def wp_profane?(phrase, lang: nil, timeout_duration: 5)
    count = wp_profanity_count(phrase, lang: lang, timeout_duration: timeout_duration)
    !count.nil? && count != 0
  end

  # Asks WebPurify for the profanity count, giving up after
  # +timeout_duration+ seconds.
  #
  # @return [Object, nil] whatever the client's check_count returns, or nil
  #   when the call raised a StandardError (including a timeout). The failure
  #   is deliberately swallowed so the remote check stays best-effort.
  def wp_profanity_count(phrase, lang: nil, timeout_duration: 5)
    Timeout.timeout(timeout_duration) do
      @wp_client.check_count(phrase, lang: wp_langs_list_with(lang))
    end
  rescue StandardError
    nil
  end

  # Builds the comma-separated language list sent to WebPurify: the default
  # languages plus +lang+ when it maps to an available code.
  #
  # @param lang [String, Symbol, nil] e.g. 'es-MX', :fr
  # @return [String] e.g. 'en,sp,pt,fr'
  def wp_langs_list_with(lang)
    langs = WP_DEFAULT_LANGS.dup

    if lang
      code = shorten_language(lang).to_sym
      code = WP_LANG_CONVERSIONS.fetch(code, code)
      # Array union keeps insertion order and drops duplicates without
      # depending on the Set library being loaded.
      langs |= [code] if WP_AVAILABLE_LANGS.include?(code)
    end

    langs.join(',')
  end

  # Reads one YAML dictionary from the profanity_dictionaries directory that
  # sits next to this file.
  #
  # @param file_path [String] dictionary name without the .yaml extension.
  def load_dictionary(file_path)
    path = File.join(File.dirname(__FILE__), 'profanity_dictionaries', "#{file_path}.yaml")
    # safe_load: the dictionaries are data files, so no arbitrary object
    # deserialisation is needed.
    YAML.safe_load(File.read(path))
  end

  # @return [Array] the en, es and pt dictionaries concatenated in that order.
  def load_exact_match_dictionary
    %w[en es pt].map { |code| load_dictionary(code) }.inject(:+)
  end

  def load_partial_match_dictionary
    load_dictionary('partial_match')
  end

  # Reduces a language tag to its first two characters, lower-cased
  # (e.g. 'EN-us' => 'en'). Falsy input is returned unchanged.
  def shorten_language(lang)
    lang && lang.to_s.downcase[0, 2]
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'exact_match_strategy'
|
|
4
|
+
|
|
5
|
+
module ProfanityFilterEngine
  # Exact-match strategy whose per-word pattern lets every character of a
  # dictionary word repeat one or more times.
  class AllowDuplicateCharactersStrategy < ExactMatchStrategy
    DEFAULT_IGNORE_CASE = true

    private

    # Turns +word+ into a regexp source in which each escaped character is
    # followed by the '+' quantifier.
    def build_word_regexp(word)
      pattern = word.each_char.map do |letter|
        "#{Regexp.escape(letter)}+"
      end
      pattern.join
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'exact_match_strategy'
|
|
4
|
+
|
|
5
|
+
module ProfanityFilterEngine
  # Exact-match strategy whose per-word pattern tolerates marks, separators,
  # symbols and punctuation between the characters of a dictionary word.
  class AllowSymbolsInWordsStrategy < ExactMatchStrategy
    # Regexp source matching any run (possibly empty) of such characters.
    SYMBOLS_REGEXP = '(?:\p{Mark}|\p{Separator}|\p{Symbol}|\p{Punctuation})*'
    DEFAULT_IGNORE_CASE = true

    private

    # Escapes every character of +word+ and glues the pieces together with
    # SYMBOLS_REGEXP.
    def build_word_regexp(word)
      escaped_letters = word.each_char.map { |letter| Regexp.escape(letter) }
      escaped_letters.join(SYMBOLS_REGEXP)
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ProfanityFilterEngine
  # Abstract base of the profanity engines. Subclasses must implement
  # #profane? and #profane_words; #profanity_count is derived from the latter.
  class Component
    # @param text [String] text to inspect.
    # @return [Boolean] whether +text+ contains profanity.
    # @raise [NotImplementedError] unless overridden by the subclass.
    def profane?(text)
      raise NotImplementedError, "#{self.class} must implement #profane?"
    end

    # @param text [String] text to inspect.
    # @return [Array] the profane words found in +text+.
    # @raise [NotImplementedError] unless overridden by the subclass.
    def profane_words(text)
      raise NotImplementedError, "#{self.class} must implement #profane_words"
    end

    # @param text [String] text to inspect.
    # @return [Integer] the size of the collection returned by #profane_words.
    def profanity_count(text)
      profane_words(text).size
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'yaml'
|
|
4
|
+
require_relative 'component'
|
|
5
|
+
|
|
6
|
+
module ProfanityFilterEngine
  # Component that delegates to an ordered list of strategies and combines
  # their answers.
  class Composite < Component
    attr_reader :strategies

    def initialize
      @strategies = []
    end

    # Appends a single strategy and returns the strategy list.
    def add_strategy(strategy)
      @strategies.push(strategy)
    end

    # Appends every given strategy, in order, and returns the strategy list.
    def add_strategies(*new_strategies)
      @strategies.concat(new_strategies)
    end

    # Removes +strategy+ from the list (Array#delete semantics).
    def delete_strategy(strategy)
      @strategies.delete(strategy)
    end

    # @return [Boolean] true as soon as one strategy reports +text+ as profane.
    def profane?(text)
      @strategies.any? { |engine| engine.profane?(text) }
    end

    # @return [Array] the de-duplicated words reported by all strategies,
    #   in strategy order.
    def profane_words(text)
      collected = @strategies.each_with_object([]) do |engine, found|
        found.concat(engine.profane_words(text))
      end
      collected.uniq
    end
  end
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'regexp_strategy'
|
|
4
|
+
|
|
5
|
+
module ProfanityFilterEngine
  # Regexp strategy that matches dictionary words only when they are
  # surrounded by DELIMITER on both sides. Subclasses customise the per-word
  # pattern by overriding #build_word_regexp and may override
  # DEFAULT_IGNORE_CASE.
  class ExactMatchStrategy < RegexpStrategy
    # Regexp source placed before and after every word pattern.
    DELIMITER = '(?:\b|^|$|_)'
    DEFAULT_IGNORE_CASE = false

    attr_reader :delimiter
    attr_reader :ignore_case

    # @param dictionary [Enumerable<String>] words to match.
    # @param ignore_case [Boolean] whether matching is case-insensitive.
    #   The default is looked up through self.class so that a subclass which
    #   redefines DEFAULT_IGNORE_CASE actually gets its own default; a bare
    #   constant reference here would always resolve to this class's value.
    def initialize(dictionary:, ignore_case: self.class::DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @delimiter = DELIMITER
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # Builds one delimiter-wrapped regexp per dictionary word and returns
    # their union.
    def build_profanity_regexp
      option = ignore_case ? Regexp::IGNORECASE : nil
      regexp_list = dictionary.map do |word|
        Regexp.new("#{delimiter}#{build_word_regexp(word)}#{delimiter}", option)
      end

      Regexp.union(regexp_list)
    end

    # @return [String] regexp source matching +word+ literally.
    def build_word_regexp(word)
      Regexp.escape(word)
    end
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'exact_match_strategy'
|
|
4
|
+
require 'pry'
|
|
5
|
+
|
|
6
|
+
require 'yaml'

module ProfanityFilterEngine
  # Exact-match strategy that replaces each character of a dictionary word
  # with the alternation listed for it in leet_strategy_dictionary.yaml.
  class LeetExactMatchStrategy < ExactMatchStrategy
    DEFAULT_IGNORE_CASE = true

    private

    # Builds the regexp source for +word+: characters that have an entry in
    # LEET_DICTIONARY (looked up lower-cased) use that entry, all other
    # characters are escaped literally.
    def build_word_regexp(word)
      # The dictionary constant is defined lazily on first use.
      build_leet_dictionary unless defined? LEET_DICTIONARY

      word.each_char.map do |char|
        LEET_DICTIONARY.fetch(char.downcase) { Regexp.escape(char) }
      end.join
    end

    # Loads the leet YAML file and stores the transformed mapping in the
    # LEET_DICTIONARY constant of this class.
    def build_leet_dictionary
      lib_dir = File.expand_path('../../../', __FILE__)
      file = File.read("#{lib_dir}/profanity_dictionaries/leet_strategy_dictionary.yaml")
      raw_data = YAML.safe_load(file)
      dict = transform_data_to_regex(raw_data)
      ::ProfanityFilterEngine::LeetExactMatchStrategy.const_set('LEET_DICTIONARY', dict)
    end

    # Converts each list of alternatives into a non-capturing alternation.
    # The alternatives are joined as-is, so they are presumably already valid
    # regexp fragments -- verify against the dictionary file.
    #
    # @param dict [Hash{String => Array<String>}] character => alternatives.
    # @return [Hash{String => String}] character => "(?:a|b|c)" source.
    def transform_data_to_regex(dict)
      dict.transform_values { |alternatives| "(?:#{alternatives.join('|')})" }
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'regexp_strategy'
|
|
4
|
+
|
|
5
|
+
module ProfanityFilterEngine
  # Regexp strategy that matches dictionary entries anywhere in the text,
  # without any surrounding delimiter.
  class PartialMatchStrategy < RegexpStrategy
    DEFAULT_IGNORE_CASE = false

    attr_reader :ignore_case

    # @param dictionary [Enumerable<String>] entries to look for.
    # @param ignore_case [Boolean] whether matching is case-insensitive.
    def initialize(dictionary:, ignore_case: DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # Escapes every dictionary entry into its own regexp and returns the
    # union of those regexps.
    def build_profanity_regexp
      flags = ignore_case ? Regexp::IGNORECASE : nil
      patterns = dictionary.map do |entry|
        Regexp.new(Regexp.escape(entry), flags)
      end

      Regexp.union(*patterns)
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'component'
|
|
4
|
+
|
|
5
|
+
module ProfanityFilterEngine
  # Component backed by a single regexp, either supplied by the caller or
  # built from a dictionary of words.
  class RegexpStrategy < Component
    # Regexp source placed before and after every word in the default pattern.
    DEFAULT_DELIMITER = '(?:\b|^|$|_)'

    attr_reader :dictionary, :profanity_regexp

    attr_writer :profanity_regexp
    private :profanity_regexp=

    # @param dictionary [Enumerable<String>] words to match.
    # @param profanity_regexp [Regexp, nil] ready-made pattern; when nil the
    #   pattern is built from +dictionary+.
    def initialize(dictionary:, profanity_regexp: nil)
      @dictionary = dictionary
      @profanity_regexp = profanity_regexp || build_profanity_regexp
    end

    # @param text [String, nil] text to scan.
    # @return [Array] the distinct matches of the pattern in +text+; empty
    #   for nil, mirroring #profane? which answers false for nil.
    def profane_words(text)
      return [] if text.nil?

      text.scan(profanity_regexp).uniq
    end

    # @param text [String, nil] text to test.
    # @return [Boolean] whether the pattern matches anywhere in +text+.
    def profane?(text)
      profanity_regexp.match?(text)
    end

    private

    # Wraps every escaped dictionary word in DEFAULT_DELIMITER and returns
    # the union of the resulting regexps.
    def build_profanity_regexp
      regexp_list = dictionary.map do |word|
        Regexp.new("#{DEFAULT_DELIMITER}#{Regexp.escape(word)}#{DEFAULT_DELIMITER}")
      end

      Regexp.union(regexp_list)
    end
  end
end
|