profanity-filter 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +0 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +21 -0
- data/README.md +54 -0
- data/Rakefile +10 -0
- data/bin/console +7 -0
- data/bin/setup +8 -0
- data/lib/profanity_dictionaries/en.yaml +166 -0
- data/lib/profanity_dictionaries/es.yaml +68 -0
- data/lib/profanity_dictionaries/leet_strategy_dictionary.yaml +1716 -0
- data/lib/profanity_dictionaries/partial_match.yaml +1 -0
- data/lib/profanity_dictionaries/pt.yaml +74 -0
- data/lib/profanity_filter.rb +160 -0
- data/lib/profanity_filter/engines/allow_duplicate_characters_strategy.rb +15 -0
- data/lib/profanity_filter/engines/allow_symbols_in_words_strategy.rb +16 -0
- data/lib/profanity_filter/engines/component.rb +17 -0
- data/lib/profanity_filter/engines/composite.rb +37 -0
- data/lib/profanity_filter/engines/exact_match_strategy.rb +35 -0
- data/lib/profanity_filter/engines/leet_exact_match_strategy.rb +40 -0
- data/lib/profanity_filter/engines/partial_match_strategy.rb +28 -0
- data/lib/profanity_filter/engines/regexp_strategy.rb +37 -0
- data/lib/profanity_filter/version.rb +3 -0
- data/profanity_filter.gemspec +37 -0
- metadata +162 -0
@@ -0,0 +1 @@
|
|
1
|
+
- 🖕
|
@@ -0,0 +1,74 @@
|
|
1
|
+
- aborto
|
2
|
+
- amador
|
3
|
+
- ânus
|
4
|
+
- aranha
|
5
|
+
- ariano
|
6
|
+
- balalao
|
7
|
+
- bastardo
|
8
|
+
- bicha
|
9
|
+
- biscate
|
10
|
+
- bissexual
|
11
|
+
- boceta
|
12
|
+
- boob
|
13
|
+
- bosta
|
14
|
+
- braulio de borracha
|
15
|
+
- bumbum
|
16
|
+
- burro
|
17
|
+
- cabrao
|
18
|
+
- cacete
|
19
|
+
- cagar
|
20
|
+
- camisinha
|
21
|
+
- caralho
|
22
|
+
- cerveja
|
23
|
+
- chochota
|
24
|
+
- chupar
|
25
|
+
- clitoris
|
26
|
+
- cocaína
|
27
|
+
- colhoes
|
28
|
+
- comer
|
29
|
+
- cona
|
30
|
+
- consolo
|
31
|
+
- corno
|
32
|
+
- cu
|
33
|
+
- dar o rabo
|
34
|
+
- dum raio
|
35
|
+
- esporra
|
36
|
+
- fecal
|
37
|
+
- filho da puta
|
38
|
+
- foda
|
39
|
+
- foda-se
|
40
|
+
- foder
|
41
|
+
- frango assado
|
42
|
+
- gozar
|
43
|
+
- grelho
|
44
|
+
- heroína
|
45
|
+
- heterosexual
|
46
|
+
- homem gay
|
47
|
+
- homoerótico
|
48
|
+
- homosexual
|
49
|
+
- inferno
|
50
|
+
- lésbica
|
51
|
+
- lolita
|
52
|
+
- mama
|
53
|
+
- merda
|
54
|
+
- paneleiro
|
55
|
+
- passar um cheque
|
56
|
+
- pau
|
57
|
+
- peidar
|
58
|
+
- pênis
|
59
|
+
- pinto
|
60
|
+
- porra
|
61
|
+
- puta
|
62
|
+
- puta que pariu
|
63
|
+
- puta que te pariu
|
64
|
+
- queca
|
65
|
+
- sacanagem
|
66
|
+
- saco
|
67
|
+
- torneira
|
68
|
+
- transar
|
69
|
+
- vai-te foder
|
70
|
+
- vai tomar no cu
|
71
|
+
- veado
|
72
|
+
- vibrador
|
73
|
+
- xana
|
74
|
+
- xochota
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'timeout'
require 'yaml'

require 'web_purify'

require 'profanity_filter/version'
require 'profanity_filter/engines/composite'
require 'profanity_filter/engines/partial_match_strategy'
require 'profanity_filter/engines/allow_duplicate_characters_strategy'
require 'profanity_filter/engines/allow_symbols_in_words_strategy'
require 'profanity_filter/engines/leet_exact_match_strategy'
|
10
|
+
|
11
|
+
# Detects profanity in a phrase using the bundled YAML dictionaries and,
# when an API key is supplied, the WebPurify client as a fallback for
# phrases the local filters consider clean.
class ProfanityFilter
  # Languages always included in the list sent to WebPurify.
  WP_DEFAULT_LANGS = %i[en sp pt].freeze
  # Language codes accepted when extending the WebPurify language list.
  WP_AVAILABLE_LANGS = %i[
    en ar fr de hi jp it pt ru sp th tr zh kr pa
  ].freeze
  # Two-letter codes a caller may pass, mapped to the codes used above.
  WP_LANG_CONVERSIONS = { es: :sp, ko: :kr, ja: :jp }.freeze

  attr_reader :strict_filter, :tolerant_filter

  # Builds the strict and tolerant composite filters from the bundled
  # dictionaries.
  #
  # @param web_purifier_api_key [String, nil] when given, a WebPurify client
  #   is created and used as a fallback by #profane? and #profanity_count.
  def initialize(web_purifier_api_key: nil)
    @wp_client = web_purifier_api_key ? WebPurify::Client.new(web_purifier_api_key) : nil

    exact_options = { dictionary: load_exact_match_dictionary, ignore_case: true }
    partial_options = { dictionary: load_partial_match_dictionary, ignore_case: true }

    allow_symbol_strategy = ::ProfanityFilterEngine::AllowSymbolsInWordsStrategy.new(**exact_options)
    duplicate_characters_strategy = ::ProfanityFilterEngine::AllowDuplicateCharactersStrategy.new(**exact_options)
    leet_strategy = ::ProfanityFilterEngine::LeetExactMatchStrategy.new(**exact_options)
    partial_match_strategy = ::ProfanityFilterEngine::PartialMatchStrategy.new(**partial_options)

    # Strict filter: every strategy.
    @strict_filter = ::ProfanityFilterEngine::Composite.new
    @strict_filter.add_strategies(
      leet_strategy,
      allow_symbol_strategy,
      partial_match_strategy,
      duplicate_characters_strategy
    )

    # Tolerant filter: symbol-tolerant exact matches and partial matches only.
    @tolerant_filter = ::ProfanityFilterEngine::Composite.new
    @tolerant_filter.add_strategies(
      allow_symbol_strategy,
      partial_match_strategy
    )
  end

  # @param phrase [String, nil] text to check; nil and '' are never profane.
  # @param lang [String, Symbol, nil] extra language hint for WebPurify.
  # @param strictness [Symbol] :strict selects the strict filter; any other
  #   value selects the tolerant filter.
  # @return [Boolean] true when the local filter or, failing that, WebPurify
  #   reports profanity.
  def profane?(phrase, lang: nil, strictness: :tolerant)
    return false if phrase.nil? || phrase == ''
    return true if pf_profane?(phrase, strictness: strictness)
    return false unless use_webpurify?

    wp_profane?(phrase, lang: lang)
  end

  # @param phrase [String, nil] text to check; nil and '' count as 0.
  # @param lang [String, Symbol, nil] extra language hint for WebPurify.
  # @param strictness [Symbol] see #profane?.
  # @return [Integer] the local filter's count, or WebPurify's count when the
  #   local count is zero and WebPurify answered.
  def profanity_count(phrase, lang: nil, strictness: :tolerant)
    return 0 if phrase.nil? || phrase == ''

    local_count = pf_profanity_count(phrase, strictness: strictness)
    return local_count unless local_count.zero? && use_webpurify?

    # A nil answer means the WebPurify call failed; keep the local result.
    wp_profanity_count(phrase, lang: lang) || local_count
  end

  private

  def use_webpurify?
    !!@wp_client
  end

  # Returns the composite filter for the requested strictness; anything
  # other than :strict falls back to the tolerant filter.
  def filter(strictness: :tolerant)
    strictness == :strict ? @strict_filter : @tolerant_filter
  end

  def pf_profane?(phrase, strictness: :tolerant)
    filter(strictness: strictness).profane?(phrase)
  end

  def pf_profanity_count(phrase, strictness: :tolerant)
    filter(strictness: strictness).profanity_count(phrase)
  end

  # True only when WebPurify answered with a non-zero count.
  def wp_profane?(phrase, lang: nil, timeout_duration: 5)
    count = wp_profanity_count(phrase, lang: lang, timeout_duration: timeout_duration)
    !count.nil? && count != 0
  end

  # Asks the WebPurify client for a profanity count, giving up after
  # +timeout_duration+ seconds. Best effort: any StandardError (a timeout
  # included) yields nil so callers can fall back to the local result.
  def wp_profanity_count(phrase, lang: nil, timeout_duration: 5)
    Timeout.timeout(timeout_duration) do
      @wp_client.check_count(phrase, lang: wp_langs_list_with(lang))
    end
  rescue StandardError
    nil
  end

  # Builds the comma-separated language list for WebPurify: the defaults
  # plus +lang+ when it maps to an available code.
  def wp_langs_list_with(lang)
    langs = WP_DEFAULT_LANGS.dup

    if lang
      code = shorten_language(lang).to_sym
      code = WP_LANG_CONVERSIONS.fetch(code, code)
      langs << code if WP_AVAILABLE_LANGS.include?(code)
    end

    langs.uniq.join(',')
  end

  # Loads profanity_dictionaries/<file_path>.yaml located next to this file.
  # Read explicitly as UTF-8 because the dictionaries contain non-ASCII
  # words and the process locale may not be UTF-8.
  def load_dictionary(file_path)
    path = File.join(__dir__, 'profanity_dictionaries', "#{file_path}.yaml")
    YAML.safe_load(File.read(path, encoding: 'UTF-8'))
  end

  # Concatenates the English, Spanish and Portuguese dictionaries.
  def load_exact_match_dictionary
    %w[en es pt].flat_map { |code| load_dictionary(code) }
  end

  def load_partial_match_dictionary
    load_dictionary('partial_match')
  end

  # Lower-cases +lang+ and keeps its first two characters ("es-MX" -> "es").
  def shorten_language(lang)
    lang && lang.to_s.downcase[0, 2]
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'exact_match_strategy'
|
4
|
+
|
5
|
+
module ProfanityFilterEngine
  # Exact-match variant whose per-word pattern lets every character of the
  # dictionary word appear one or more times in a row.
  class AllowDuplicateCharactersStrategy < ExactMatchStrategy
    DEFAULT_IGNORE_CASE = true

    private

    # Escapes each character of +word+ and appends a `+` quantifier to it.
    def build_word_regexp(word)
      word.each_char.map { |letter| "#{Regexp.escape(letter)}+" }.join
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'exact_match_strategy'
|
4
|
+
|
5
|
+
module ProfanityFilterEngine
  # Exact-match variant whose per-word pattern accepts any run of marks,
  # separators, symbols or punctuation between consecutive characters.
  class AllowSymbolsInWordsStrategy < ExactMatchStrategy
    SYMBOLS_REGEXP = '(?:\p{Mark}|\p{Separator}|\p{Symbol}|\p{Punctuation})*'
    DEFAULT_IGNORE_CASE = true

    private

    # Escapes each character of +word+ and glues them with SYMBOLS_REGEXP.
    def build_word_regexp(word)
      escaped_letters = word.each_char.map { |letter| Regexp.escape(letter) }
      escaped_letters.join(SYMBOLS_REGEXP)
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ProfanityFilterEngine
  # Abstract base for profanity detectors. Subclasses implement #profane?
  # and #profane_words; #profanity_count is derived from #profane_words.
  class Component
    # @param text [String] text to inspect
    # @return [Boolean] whether +text+ contains profanity
    # @raise [NotImplementedError] unless overridden by the subclass
    def profane?(text)
      raise NotImplementedError, "#{self.class} must implement #profane?"
    end

    # @param text [String] text to inspect
    # @return [Array] the profane words found in +text+
    # @raise [NotImplementedError] unless overridden by the subclass
    def profane_words(text)
      raise NotImplementedError, "#{self.class} must implement #profane_words"
    end

    # @param text [String] text to inspect
    # @return [Integer] size of the collection returned by #profane_words
    def profanity_count(text)
      profane_words(text).size
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require_relative 'component'
|
5
|
+
|
6
|
+
module ProfanityFilterEngine
  # Component that delegates to an ordered list of strategies.
  class Composite < Component
    attr_reader :strategies

    def initialize
      @strategies = []
    end

    # Appends one strategy and returns the strategy list.
    def add_strategy(strategy)
      strategies.push(strategy)
    end

    # Appends every given strategy and returns the strategy list.
    def add_strategies(*new_strategies)
      strategies.push(*new_strategies)
    end

    # Removes +strategy+ from the list.
    def delete_strategy(strategy)
      strategies.delete(strategy)
    end

    # True as soon as one strategy reports +text+ as profane.
    def profane?(text)
      strategies.each do |candidate|
        return true if candidate.profane?(text)
      end
      false
    end

    # Collects the words reported by every strategy, without duplicates.
    def profane_words(text)
      collected = strategies.each_with_object([]) do |candidate, words|
        words.concat(candidate.profane_words(text))
      end
      collected.uniq
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'regexp_strategy'
|
4
|
+
|
5
|
+
module ProfanityFilterEngine
  # Matches dictionary words only when DELIMITER matches on both sides of
  # the word. Subclasses change how a single word is turned into a pattern
  # by overriding #build_word_regexp, and may define their own
  # DEFAULT_IGNORE_CASE.
  class ExactMatchStrategy < RegexpStrategy
    DELIMITER = '(?:\b|^|$|_)'
    DEFAULT_IGNORE_CASE = false

    attr_reader :delimiter
    attr_reader :ignore_case

    # @param dictionary [Array<String>] words to match
    # @param ignore_case [Boolean] whether matching is case-insensitive.
    #   The default is looked up through self.class so that a subclass's
    #   DEFAULT_IGNORE_CASE is honoured; a bare constant here is resolved
    #   lexically and would always yield this class's value.
    def initialize(dictionary:, ignore_case: self.class::DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @delimiter = DELIMITER
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # Builds one delimited pattern per dictionary word and unions them.
    def build_profanity_regexp
      option = ignore_case ? Regexp::IGNORECASE : nil
      regexp_list = dictionary.map do |word|
        Regexp.new("#{delimiter}#{build_word_regexp(word)}#{delimiter}", option)
      end

      Regexp.union(*regexp_list)
    end

    # Pattern source for a single word; here the word matched literally.
    def build_word_regexp(word)
      Regexp.escape(word)
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'exact_match_strategy'
|
4
|
+
require 'pry'
|
5
|
+
|
6
|
+
module ProfanityFilterEngine
  # Exact-match variant that replaces each character having an entry in the
  # leet dictionary with an alternation of that entry's substitutes.
  class LeetExactMatchStrategy < ExactMatchStrategy
    DEFAULT_IGNORE_CASE = true

    private

    # Builds the pattern source for +word+: characters found (lower-cased)
    # in LEET_DICTIONARY use the stored alternation, others are escaped.
    def build_word_regexp(word)
      # The dictionary is loaded lazily, once, on first use.
      build_leet_dictionary unless defined? LEET_DICTIONARY
      word.each_char.map do |char|
        LEET_DICTIONARY.fetch(char.downcase) { Regexp.escape(char) }
      end.join
    end

    # Loads leet_strategy_dictionary.yaml from lib/profanity_dictionaries and
    # stores the transformed result in the LEET_DICTIONARY constant.
    def build_leet_dictionary
      # Lazy require: this file uses YAML but does not load it itself.
      require 'yaml'

      lib_dir = File.expand_path('../..', __dir__)
      path = File.join(lib_dir, 'profanity_dictionaries', 'leet_strategy_dictionary.yaml')
      # Read explicitly as UTF-8 so parsing does not depend on the locale.
      raw_data = YAML.safe_load(File.read(path, encoding: 'UTF-8'))
      dict = transform_data_to_regex(raw_data).freeze
      ::ProfanityFilterEngine::LeetExactMatchStrategy.const_set('LEET_DICTIONARY', dict)
    end

    # Turns { char => [alternatives] } into { char => "(?:a|b|c)" }.
    # NOTE(review): alternatives are joined verbatim, as before; presumably
    # the YAML entries are already valid regexp fragments - confirm.
    def transform_data_to_regex(dict)
      dict.transform_values { |alternatives| "(?:#{alternatives.join('|')})" }
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'regexp_strategy'
|
4
|
+
|
5
|
+
module ProfanityFilterEngine
  # Matches dictionary entries anywhere in the text, with no delimiters.
  class PartialMatchStrategy < RegexpStrategy
    DEFAULT_IGNORE_CASE = false

    attr_reader :ignore_case

    def initialize(dictionary:, ignore_case: DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # One literal pattern per dictionary entry, combined into a union.
    def build_profanity_regexp
      flags = ignore_case ? Regexp::IGNORECASE : nil
      literal_patterns = dictionary.map { |entry| Regexp.new(Regexp.escape(entry), flags) }

      Regexp.union(literal_patterns)
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'component'
|
4
|
+
|
5
|
+
module ProfanityFilterEngine
  # Component backed by a single regular expression, either supplied by the
  # caller or built from the dictionary.
  class RegexpStrategy < Component
    DEFAULT_DELIMITER = '(?:\b|^|$|_)'

    attr_reader :dictionary
    attr_accessor :profanity_regexp

    # Reading the regexp is public; replacing it is not.
    private :profanity_regexp=

    def initialize(dictionary:, profanity_regexp: nil)
      @dictionary = dictionary
      @profanity_regexp = profanity_regexp || build_profanity_regexp
    end

    # Distinct matches of the regexp found in +text+.
    def profane_words(text)
      matches = text.scan(profanity_regexp)
      matches.uniq
    end

    # Whether the regexp matches anywhere in +text+.
    def profane?(text)
      profanity_regexp.match?(text)
    end

    private

    # Wraps every escaped dictionary word in DEFAULT_DELIMITER and unions
    # the resulting patterns.
    def build_profanity_regexp
      delimited_patterns = dictionary.map do |entry|
        Regexp.new(DEFAULT_DELIMITER + Regexp.escape(entry) + DEFAULT_DELIMITER)
      end

      Regexp.union(delimited_patterns)
    end
  end
end
|