profanity-filter 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,74 @@
1
+ - aborto
2
+ - amador
3
+ - ânus
4
+ - aranha
5
+ - ariano
6
+ - balalao
7
+ - bastardo
8
+ - bicha
9
+ - biscate
10
+ - bissexual
11
+ - boceta
12
+ - boob
13
+ - bosta
14
+ - braulio de borracha
15
+ - bumbum
16
+ - burro
17
+ - cabrao
18
+ - cacete
19
+ - cagar
20
+ - camisinha
21
+ - caralho
22
+ - cerveja
23
+ - chochota
24
+ - chupar
25
+ - clitoris
26
+ - cocaína
27
+ - colhoes
28
+ - comer
29
+ - cona
30
+ - consolo
31
+ - corno
32
+ - cu
33
+ - dar o rabo
34
+ - dum raio
35
+ - esporra
36
+ - fecal
37
+ - filho da puta
38
+ - foda
39
+ - foda-se
40
+ - foder
41
+ - frango assado
42
+ - gozar
43
+ - grelho
44
+ - heroína
45
+ - heterosexual
46
+ - homem gay
47
+ - homoerótico
48
+ - homosexual
49
+ - inferno
50
+ - lésbica
51
+ - lolita
52
+ - mama
53
+ - merda
54
+ - paneleiro
55
+ - passar um cheque
56
+ - pau
57
+ - peidar
58
+ - pênis
59
+ - pinto
60
+ - porra
61
+ - puta
62
+ - puta que pariu
63
+ - puta que te pariu
64
+ - queca
65
+ - sacanagem
66
+ - saco
67
+ - torneira
68
+ - transar
69
+ - vai-te foder
70
+ - vai tomar no cu
71
+ - veado
72
+ - vibrador
73
+ - xana
74
+ - xochota
@@ -0,0 +1,160 @@
1
+ # frozen_string_literal: true
2
+
3
require 'timeout'
require 'yaml'

require 'profanity_filter/version'
require 'profanity_filter/engines/composite'
require 'profanity_filter/engines/partial_match_strategy'
require 'profanity_filter/engines/allow_duplicate_characters_strategy'
require 'profanity_filter/engines/allow_symbols_in_words_strategy'
require 'profanity_filter/engines/leet_exact_match_strategy'
require 'web_purify'
10
+
11
# Facade that checks a phrase for profanity using local dictionary-based
# strategies and, optionally, the WebPurify service as a second opinion.
#
# Two local filters are built at construction time:
# * the strict filter combines the leet, symbols-in-words, partial-match and
#   duplicate-characters strategies;
# * the tolerant filter combines only the symbols-in-words and partial-match
#   strategies.
#
# WebPurify is consulted only when an API key was supplied AND the local
# filter found nothing.
class ProfanityFilter
  # Languages always sent to WebPurify.
  WP_DEFAULT_LANGS = [:en, :sp, :pt].freeze
  # Language codes accepted for the WebPurify `lang` option.
  WP_AVAILABLE_LANGS = [
    :en, :ar, :fr, :de, :hi, :jp, :it, :pt, :ru, :sp, :th, :tr, :zh, :kr, :pa
  ].freeze
  # Two-letter codes that are rewritten to the code listed in WP_AVAILABLE_LANGS.
  WP_LANG_CONVERSIONS = { es: :sp, ko: :kr, ja: :jp }.freeze

  attr_reader :strict_filter, :tolerant_filter

  # @param web_purifier_api_key [String, nil] when given, a WebPurify client is
  #   created and used as a fallback for phrases the local filters let through.
  def initialize(web_purifier_api_key: nil)
    @wp_client = web_purifier_api_key ? WebPurify::Client.new(web_purifier_api_key) : nil

    exact_words = load_exact_match_dictionary
    partial_words = load_partial_match_dictionary

    allow_symbol_strategy = ::ProfanityFilterEngine::AllowSymbolsInWordsStrategy.new(
      dictionary: exact_words,
      ignore_case: true
    )
    duplicate_characters_strategy = ::ProfanityFilterEngine::AllowDuplicateCharactersStrategy.new(
      dictionary: exact_words,
      ignore_case: true
    )
    leet_strategy = ::ProfanityFilterEngine::LeetExactMatchStrategy.new(
      dictionary: exact_words,
      ignore_case: true
    )
    partial_match_strategy = ::ProfanityFilterEngine::PartialMatchStrategy.new(
      dictionary: partial_words,
      ignore_case: true
    )

    @strict_filter = ::ProfanityFilterEngine::Composite.new
    @strict_filter.add_strategies(
      leet_strategy,
      allow_symbol_strategy,
      partial_match_strategy,
      duplicate_characters_strategy
    )

    @tolerant_filter = ::ProfanityFilterEngine::Composite.new
    @tolerant_filter.add_strategies(
      allow_symbol_strategy,
      partial_match_strategy
    )
  end

  # Tells whether +phrase+ contains profanity.
  #
  # @param phrase [String, nil] text to check; nil and '' are never profane.
  # @param lang [String, Symbol, nil] extra language hint forwarded to WebPurify.
  # @param strictness [Symbol, String] :strict selects the strict filter; any
  #   other value selects the tolerant one.
  # @return [Boolean]
  def profane?(phrase, lang: nil, strictness: :tolerant)
    return false if phrase == '' || phrase.nil?
    return true if pf_profane?(phrase, strictness: strictness)

    # Only pay for the remote call when the local filters found nothing.
    use_webpurify? && wp_profane?(phrase, lang: lang)
  end

  # Counts the profane words found in +phrase+.
  #
  # The local count wins whenever it is non-zero; otherwise the WebPurify
  # count is used when a client is configured and the request succeeds.
  #
  # @param phrase [String, nil] text to check; nil and '' yield 0.
  # @param lang [String, Symbol, nil] extra language hint forwarded to WebPurify.
  # @param strictness [Symbol, String] see #profane?.
  # @return [Integer]
  def profanity_count(phrase, lang: nil, strictness: :tolerant)
    return 0 if phrase == '' || phrase.nil?

    local_count = pf_profanity_count(phrase, strictness: strictness)
    return local_count unless local_count.zero? && use_webpurify?

    # A nil remote result means the request failed; keep the local count.
    wp_profanity_count(phrase, lang: lang) || local_count
  end

  private

  # @return [Boolean] true when a WebPurify client was configured.
  def use_webpurify?
    !!@wp_client
  end

  # Picks the local filter. Strictness may be given as a Symbol or a String;
  # anything other than "strict" falls back to the tolerant filter.
  def filter(strictness: :tolerant)
    strictness.to_s == 'strict' ? @strict_filter : @tolerant_filter
  end

  def pf_profane?(phrase, strictness: :tolerant)
    filter(strictness: strictness).profane?(phrase)
  end

  def pf_profanity_count(phrase, strictness: :tolerant)
    filter(strictness: strictness).profanity_count(phrase)
  end

  # @return [Boolean] true only when WebPurify answered with a non-zero count.
  def wp_profane?(phrase, lang: nil, timeout_duration: 5)
    count = wp_profanity_count(phrase, lang: lang, timeout_duration: timeout_duration)
    !(count.nil? || count == 0)
  end

  # Asks WebPurify for the profanity count, giving up after +timeout_duration+
  # seconds.
  #
  # The remote check is best-effort: any StandardError (including a timeout)
  # is deliberately swallowed and reported as nil.
  #
  # @return [Object, nil] whatever the client's check_count returns, or nil on failure.
  def wp_profanity_count(phrase, lang: nil, timeout_duration: 5)
    Timeout.timeout(timeout_duration) do
      @wp_client.check_count(phrase, lang: wp_langs_list_with(lang))
    end
  rescue StandardError
    nil
  end

  # Builds the comma-separated language list for WebPurify: the defaults plus
  # +lang+ when, after shortening and conversion, it is an available code.
  #
  # @return [String] e.g. "en,sp,pt,fr"
  def wp_langs_list_with(lang)
    langs = WP_DEFAULT_LANGS

    if lang
      code = shorten_language(lang).to_sym
      code = WP_LANG_CONVERSIONS.fetch(code, code)
      # Array union keeps the original order and avoids duplicates.
      langs |= [code] if WP_AVAILABLE_LANGS.include?(code)
    end

    langs.join(',')
  end

  # Loads a bundled YAML dictionary from the profanity_dictionaries directory
  # next to this file.
  def load_dictionary(file_path)
    YAML.safe_load(File.read(File.join(__dir__, 'profanity_dictionaries', "#{file_path}.yaml")))
  end

  # @return [Array] the English, Spanish and Portuguese dictionaries concatenated.
  def load_exact_match_dictionary
    %w[en es pt].flat_map { |code| load_dictionary(code) }
  end

  def load_partial_match_dictionary
    load_dictionary('partial_match')
  end

  # Reduces a locale such as "pt-BR" to its first two lowercase characters.
  def shorten_language(lang)
    lang && lang.to_s.downcase[0, 2]
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'exact_match_strategy'
4
+
5
module ProfanityFilterEngine
  # Exact-match strategy whose per-word pattern lets every character of the
  # dictionary word repeat one or more times ("+" quantifier after each
  # escaped character).
  class AllowDuplicateCharactersStrategy < ExactMatchStrategy
    # NOTE(review): whether this override is honoured depends on how the
    # inherited initializer looks the constant up — a plain (lexical)
    # DEFAULT_IGNORE_CASE reference in the parent will not see it; confirm.
    DEFAULT_IGNORE_CASE = true

    private

    # @param word [String] dictionary entry
    # @return [String] regexp source such as "a+b+c+" for "abc"
    def build_word_regexp(word)
      word.each_char.map { |letter| "#{Regexp.escape(letter)}+" }.join
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'exact_match_strategy'
4
+
5
module ProfanityFilterEngine
  # Exact-match strategy whose per-word pattern tolerates any run of marks,
  # separators, symbols or punctuation between the characters of a word.
  class AllowSymbolsInWordsStrategy < ExactMatchStrategy
    # Regexp source (a String, not a Regexp) inserted between every pair of
    # adjacent characters of a dictionary word.
    SYMBOLS_REGEXP = '(?:\p{Mark}|\p{Separator}|\p{Symbol}|\p{Punctuation})*'
    # NOTE(review): whether this override is honoured depends on how the
    # inherited initializer looks the constant up — a plain (lexical)
    # DEFAULT_IGNORE_CASE reference in the parent will not see it; confirm.
    DEFAULT_IGNORE_CASE = true

    private

    # @param word [String] dictionary entry
    # @return [String] escaped characters of +word+ joined by SYMBOLS_REGEXP
    def build_word_regexp(word)
      escaped_letters = word.each_char.map { |letter| Regexp.escape(letter) }
      escaped_letters.join(SYMBOLS_REGEXP)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module ProfanityFilterEngine
  # Abstract base for everything that can inspect text for profanity.
  #
  # Subclasses must implement #profane? and #profane_words; #profanity_count
  # is derived from #profane_words.
  class Component
    # @param text [String] text to inspect
    # @return [Boolean] whether +text+ contains profanity
    # @raise [NotImplementedError] unless overridden by the subclass
    def profane?(text)
      raise NotImplementedError, "#{self.class} must implement #profane?"
    end

    # @param text [String] text to inspect
    # @return [Array] the profane words found in +text+
    # @raise [NotImplementedError] unless overridden by the subclass
    def profane_words(text)
      raise NotImplementedError, "#{self.class} must implement #profane_words"
    end

    # @param text [String] text to inspect
    # @return [Integer] size of the collection returned by #profane_words
    def profanity_count(text)
      profane_words(text).size
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+ require_relative 'component'
5
+
6
module ProfanityFilterEngine
  # Component that delegates to an ordered list of strategies and merges
  # their answers.
  class Composite < Component
    attr_reader :strategies

    def initialize
      @strategies = []
    end

    # Appends a single strategy.
    def add_strategy(strategy)
      strategies.push(strategy)
    end

    # Appends several strategies at once, keeping the given order.
    def add_strategies(*new_strategies)
      strategies.concat(new_strategies)
    end

    # Removes +strategy+ from the list.
    def delete_strategy(strategy)
      strategies.delete(strategy)
    end

    # @return [Boolean] true as soon as one strategy reports +text+ as profane
    def profane?(text)
      strategies.any? { |candidate| candidate.profane?(text) }
    end

    # @return [Array] de-duplicated union of the words reported by every strategy
    def profane_words(text)
      strategies.flat_map { |candidate| candidate.profane_words(text) }.uniq
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'regexp_strategy'
4
+
5
module ProfanityFilterEngine
  # Regexp strategy that matches dictionary words only when they are
  # surrounded by DELIMITER (word boundary, line start/end, or underscore).
  #
  # Subclasses customise matching by overriding #build_word_regexp and may
  # override DEFAULT_IGNORE_CASE to change their own default.
  class ExactMatchStrategy < RegexpStrategy
    # Regexp source placed before and after every word pattern.
    DELIMITER = '(?:\b|^|$|_)'
    DEFAULT_IGNORE_CASE = false

    attr_reader :delimiter
    attr_reader :ignore_case

    # @param dictionary [Enumerable<String>] words to match
    # @param ignore_case [Boolean] match case-insensitively; defaults to the
    #   DEFAULT_IGNORE_CASE of the class being instantiated. The constant is
    #   looked up through self.class because a bare reference resolves
    #   lexically to this class and would ignore subclass overrides.
    def initialize(dictionary:, ignore_case: self.class::DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @delimiter = DELIMITER
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # Builds one delimited regexp per dictionary word and unions them.
    def build_profanity_regexp
      option = ignore_case ? Regexp::IGNORECASE : nil
      patterns = dictionary.map do |word|
        Regexp.new("#{delimiter}#{build_word_regexp(word)}#{delimiter}", option)
      end

      Regexp.union(patterns)
    end

    # @param word [String] dictionary entry
    # @return [String] regexp source matching +word+ literally
    def build_word_regexp(word)
      Regexp.escape(word)
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'exact_match_strategy'
4
+ require 'pry'
5
+
6
module ProfanityFilterEngine
  # Exact-match strategy that replaces each character having an entry in the
  # leet dictionary with an alternation of that entry's substitutes.
  class LeetExactMatchStrategy < ExactMatchStrategy
    # NOTE(review): whether this override is honoured depends on how the
    # inherited initializer looks the constant up — a plain (lexical)
    # DEFAULT_IGNORE_CASE reference in the parent will not see it; confirm.
    DEFAULT_IGNORE_CASE = true

    private

    # @param word [String] dictionary entry
    # @return [String] regexp source where characters with a leet entry are
    #   replaced by that entry and all others are escaped literally
    def build_word_regexp(word)
      # LEET_DICTIONARY is defined lazily the first time a word is compiled.
      build_leet_dictionary unless defined? LEET_DICTIONARY
      pieces = word.each_char.map do |letter|
        LEET_DICTIONARY.fetch(letter.downcase) { Regexp.escape(letter) }
      end
      pieces.join
    end

    # Reads leet_strategy_dictionary.yaml and stores the transformed mapping
    # in the LEET_DICTIONARY constant of this class.
    def build_leet_dictionary
      lib_dir = File.expand_path('../../../', __FILE__)
      yaml_source = File.read("#{lib_dir}/profanity_dictionaries/leet_strategy_dictionary.yaml")
      mapping = transform_data_to_regex(YAML.safe_load(yaml_source))
      ::ProfanityFilterEngine::LeetExactMatchStrategy.const_set('LEET_DICTIONARY', mapping)
    end

    # Rewrites, in place, every value of +dict+ into a non-capturing
    # alternation of its elements.
    # NOTE(review): the elements are joined as-is (not escaped), so the YAML
    # entries are presumably already valid regexp source — verify.
    #
    # @return [Hash] the same +dict+ object
    def transform_data_to_regex(dict)
      dict.each_pair do |letter, variants|
        dict[letter] = "(?:#{variants.join('|')})"
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'regexp_strategy'
4
+
5
module ProfanityFilterEngine
  # Regexp strategy that matches dictionary words anywhere in the text,
  # without requiring delimiters around them.
  class PartialMatchStrategy < RegexpStrategy
    DEFAULT_IGNORE_CASE = false

    attr_reader :ignore_case

    # @param dictionary [Enumerable<String>] words to match
    # @param ignore_case [Boolean] match case-insensitively
    def initialize(dictionary:, ignore_case: DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # Unions one literal (escaped) regexp per dictionary word.
    def build_profanity_regexp
      option = ignore_case ? Regexp::IGNORECASE : nil
      patterns = dictionary.map { |word| Regexp.new(Regexp.escape(word), option) }

      Regexp.union(patterns)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'component'
4
+
5
module ProfanityFilterEngine
  # Component backed by a single regexp built from a dictionary of words.
  class RegexpStrategy < Component
    # Regexp source placed around each word: word boundary, line start/end,
    # or underscore.
    DEFAULT_DELIMITER = '(?:\b|^|$|_)'

    attr_reader :dictionary, :profanity_regexp

    attr_writer :profanity_regexp
    private :profanity_regexp=

    # @param dictionary [Enumerable<String>] words to match
    # @param profanity_regexp [Regexp, nil] pre-built regexp; when nil one is
    #   built from +dictionary+
    def initialize(dictionary:, profanity_regexp: nil)
      @dictionary = dictionary
      @profanity_regexp = profanity_regexp || build_profanity_regexp
    end

    # @return [Array] distinct matches of the regexp found in +text+
    def profane_words(text)
      matches = text.scan(profanity_regexp)
      matches.uniq
    end

    # @return [Boolean] whether the regexp matches anywhere in +text+
    def profane?(text)
      profanity_regexp.match?(text)
    end

    private

    # Unions one case-sensitive, delimited, literal regexp per dictionary word.
    def build_profanity_regexp
      patterns = dictionary.map do |entry|
        Regexp.new("#{DEFAULT_DELIMITER}#{Regexp.escape(entry)}#{DEFAULT_DELIMITER}")
      end

      Regexp.union(patterns)
    end
  end
end