profanity-filter 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ - aborto
2
+ - amador
3
+ - ânus
4
+ - aranha
5
+ - ariano
6
+ - balalao
7
+ - bastardo
8
+ - bicha
9
+ - biscate
10
+ - bissexual
11
+ - boceta
12
+ - boob
13
+ - bosta
14
+ - braulio de borracha
15
+ - bumbum
16
+ - burro
17
+ - cabrao
18
+ - cacete
19
+ - cagar
20
+ - camisinha
21
+ - caralho
22
+ - cerveja
23
+ - chochota
24
+ - chupar
25
+ - clitoris
26
+ - cocaína
27
+ - colhoes
28
+ - comer
29
+ - cona
30
+ - consolo
31
+ - corno
32
+ - cu
33
+ - dar o rabo
34
+ - dum raio
35
+ - esporra
36
+ - fecal
37
+ - filho da puta
38
+ - foda
39
+ - foda-se
40
+ - foder
41
+ - frango assado
42
+ - gozar
43
+ - grelho
44
+ - heroína
45
+ - heterosexual
46
+ - homem gay
47
+ - homoerótico
48
+ - homosexual
49
+ - inferno
50
+ - lésbica
51
+ - lolita
52
+ - mama
53
+ - merda
54
+ - paneleiro
55
+ - passar um cheque
56
+ - pau
57
+ - peidar
58
+ - pênis
59
+ - pinto
60
+ - porra
61
+ - puta
62
+ - puta que pariu
63
+ - puta que te pariu
64
+ - queca
65
+ - sacanagem
66
+ - saco
67
+ - torneira
68
+ - transar
69
+ - vai-te foder
70
+ - vai tomar no cu
71
+ - veado
72
+ - vibrador
73
+ - xana
74
+ - xochota
@@ -0,0 +1,160 @@
1
+ # frozen_string_literal: true
2
+
3
require 'timeout'
require 'yaml'

require 'web_purify'

require 'profanity_filter/version'
require 'profanity_filter/engines/composite'
require 'profanity_filter/engines/partial_match_strategy'
require 'profanity_filter/engines/allow_duplicate_characters_strategy'
require 'profanity_filter/engines/allow_symbols_in_words_strategy'
require 'profanity_filter/engines/leet_exact_match_strategy'
10
+
11
# Facade that decides whether a phrase is profane.
#
# A phrase is first checked against local, dictionary-driven regexp
# strategies. When a WebPurify API key was supplied and the local check finds
# nothing, the WebPurify client is consulted as a second opinion.
#
# Two local filters are built at construction time:
# * strict   - leet, symbols-in-words, partial-match and duplicate-character
#              strategies
# * tolerant - symbols-in-words and partial-match strategies only
class ProfanityFilter
  # Languages that are always sent to WebPurify.
  WP_DEFAULT_LANGS = %i[en sp pt].freeze
  # Language codes this class is willing to forward to WebPurify.
  WP_AVAILABLE_LANGS = %i[
    en ar fr de hi jp it pt ru sp th tr zh kr pa
  ].freeze
  # Maps caller-supplied two-letter codes onto the codes used in
  # WP_AVAILABLE_LANGS.
  WP_LANG_CONVERSIONS = { es: :sp, ko: :kr, ja: :jp }.freeze

  attr_reader :strict_filter, :tolerant_filter

  # @param web_purifier_api_key [String, nil] when given, a WebPurify client
  #   is created and used as a fallback for phrases the local filters accept
  def initialize(web_purifier_api_key: nil)
    @wp_client = web_purifier_api_key ? WebPurify::Client.new(web_purifier_api_key) : nil

    exact_match_dictionary = load_exact_match_dictionary
    partial_match_dictionary = load_partial_match_dictionary

    allow_symbol_strategy = ::ProfanityFilterEngine::AllowSymbolsInWordsStrategy.new(
      dictionary: exact_match_dictionary,
      ignore_case: true
    )
    duplicate_characters_strategy = ::ProfanityFilterEngine::AllowDuplicateCharactersStrategy.new(
      dictionary: exact_match_dictionary,
      ignore_case: true
    )
    leet_strategy = ::ProfanityFilterEngine::LeetExactMatchStrategy.new(
      dictionary: exact_match_dictionary,
      ignore_case: true
    )
    partial_match_strategy = ::ProfanityFilterEngine::PartialMatchStrategy.new(
      dictionary: partial_match_dictionary,
      ignore_case: true
    )

    # Strict filter: every available strategy.
    @strict_filter = ::ProfanityFilterEngine::Composite.new
    @strict_filter.add_strategies(
      leet_strategy,
      allow_symbol_strategy,
      partial_match_strategy,
      duplicate_characters_strategy
    )

    # Tolerant filter: only the symbol-tolerant and partial-match strategies.
    @tolerant_filter = ::ProfanityFilterEngine::Composite.new
    @tolerant_filter.add_strategies(
      allow_symbol_strategy,
      partial_match_strategy
    )
  end

  # Tells whether +phrase+ is profane.
  #
  # @param phrase [String, nil] text to check; nil and '' are never profane
  # @param lang [String, Symbol, nil] extra language hint forwarded to WebPurify
  # @param strictness [Symbol] :strict or :tolerant (anything else is
  #   treated as :tolerant)
  # @return [Boolean]
  def profane?(phrase, lang: nil, strictness: :tolerant)
    return false if phrase.nil? || phrase == ''
    return true if pf_profane?(phrase, strictness: strictness)
    return false unless use_webpurify?

    # Local filters found nothing, so WebPurify has the final word.
    wp_profane?(phrase, lang: lang)
  end

  # Counts the profane words found in +phrase+.
  #
  # The local count wins when it is non-zero; otherwise, if WebPurify is
  # configured and answers, its count is returned instead.
  #
  # @param phrase [String, nil] text to check; nil and '' yield 0
  # @param lang [String, Symbol, nil] extra language hint forwarded to WebPurify
  # @param strictness [Symbol] :strict or :tolerant (anything else is
  #   treated as :tolerant)
  # @return [Integer]
  def profanity_count(phrase, lang: nil, strictness: :tolerant)
    return 0 if phrase.nil? || phrase == ''

    local_count = pf_profanity_count(phrase, strictness: strictness)
    return local_count unless local_count == 0 && use_webpurify?

    remote_count = wp_profanity_count(phrase, lang: lang)
    remote_count.nil? ? local_count : remote_count
  end

  private

  # True when a WebPurify client was created in the constructor.
  def use_webpurify?
    !!@wp_client
  end

  # Picks the local filter for the requested strictness. Any value other
  # than :strict falls back to the tolerant filter.
  def filter(strictness: :tolerant)
    strictness == :strict ? @strict_filter : @tolerant_filter
  end

  # Local (dictionary-based) profanity check.
  def pf_profane?(phrase, strictness: :tolerant)
    filter(strictness: strictness).profane?(phrase)
  end

  # Local (dictionary-based) profanity count.
  def pf_profanity_count(phrase, strictness: :tolerant)
    filter(strictness: strictness).profanity_count(phrase)
  end

  # WebPurify-based check. A failed or timed-out request counts as
  # "not profane".
  def wp_profane?(phrase, lang: nil, timeout_duration: 5)
    count = wp_profanity_count(phrase, lang: lang, timeout_duration: timeout_duration)
    !count.nil? && count != 0
  end

  # Asks WebPurify how many profane words +phrase+ contains.
  #
  # The call is deliberately best-effort: any StandardError (including
  # Timeout::Error after +timeout_duration+ seconds) is swallowed and
  # reported as nil so that callers can fall back to the local result.
  #
  # @return [Integer, nil] the client's count, or nil when the request failed
  def wp_profanity_count(phrase, lang: nil, timeout_duration: 5)
    Timeout.timeout(timeout_duration) do
      @wp_client.check_count(phrase, lang: wp_langs_list_with(lang))
    end
  rescue StandardError
    nil
  end

  # Builds the comma-separated language list sent to WebPurify: the default
  # languages plus +lang+ when it maps to a supported code that is not
  # already in the list.
  #
  # A plain Array is used (rather than Set) so the method has no dependency
  # on the `set` library being loaded.
  def wp_langs_list_with(lang)
    langs = WP_DEFAULT_LANGS.dup

    if lang
      code = shorten_language(lang).to_sym
      code = WP_LANG_CONVERSIONS.fetch(code, code)
      langs << code if WP_AVAILABLE_LANGS.include?(code) && !langs.include?(code)
    end

    langs.join(',')
  end

  # Reads profanity_dictionaries/<file_path>.yaml located next to this file.
  # safe_load is used so the result is the same plain-data parse regardless
  # of the installed Psych version.
  def load_dictionary(file_path)
    path = File.join(File.dirname(__FILE__), 'profanity_dictionaries', "#{file_path}.yaml")
    YAML.safe_load(File.read(path))
  end

  # Concatenates the English, Spanish and Portuguese dictionaries, in that
  # order.
  def load_exact_match_dictionary
    load_dictionary('en') + load_dictionary('es') + load_dictionary('pt')
  end

  def load_partial_match_dictionary
    load_dictionary('partial_match')
  end

  # Reduces a locale such as "PT-br" to its lower-cased two-character prefix.
  def shorten_language(lang)
    lang && lang.to_s.downcase[0, 2]
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'exact_match_strategy'
4
+
5
module ProfanityFilterEngine
  # Exact-match variant whose per-word pattern lets every character of a
  # dictionary word repeat one or more times (for example "abc" is turned
  # into the fragment a+b+c+).
  class AllowDuplicateCharactersStrategy < ExactMatchStrategy
    # Declared default for ignore_case.
    # NOTE(review): the inherited initializer names DEFAULT_IGNORE_CASE in
    # ExactMatchStrategy's own scope - confirm this override is picked up
    # when ignore_case is not passed explicitly.
    DEFAULT_IGNORE_CASE = true

    private

    # Builds the pattern fragment for one word: each character is
    # regexp-escaped and followed by the `+` quantifier.
    def build_word_regexp(word)
      quantified_chars = word.each_char.map { |letter| "#{Regexp.escape(letter)}+" }
      quantified_chars.join
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'exact_match_strategy'
4
+
5
module ProfanityFilterEngine
  # Exact-match variant that tolerates any run of marks, separators, symbols
  # or punctuation between the characters of a dictionary word.
  class AllowSymbolsInWordsStrategy < ExactMatchStrategy
    # Pattern fragment inserted between consecutive characters of a word.
    SYMBOLS_REGEXP = '(?:\p{Mark}|\p{Separator}|\p{Symbol}|\p{Punctuation})*'
    # Declared default for ignore_case.
    # NOTE(review): the inherited initializer names DEFAULT_IGNORE_CASE in
    # ExactMatchStrategy's own scope - confirm this override is picked up
    # when ignore_case is not passed explicitly.
    DEFAULT_IGNORE_CASE = true

    private

    # Builds the pattern fragment for one word: every character is
    # regexp-escaped and the characters are glued together with
    # SYMBOLS_REGEXP.
    def build_word_regexp(word)
      escaped_chars = word.each_char.map { |letter| Regexp.escape(letter) }
      escaped_chars.join(SYMBOLS_REGEXP)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module ProfanityFilterEngine
  # Abstract base for everything that can inspect a text for profanity.
  # Subclasses must implement #profane? and #profane_words; #profanity_count
  # is derived from #profane_words.
  class Component
    # @param text [String] text to inspect
    # @return [Boolean] whether the text contains profanity
    # @raise [NotImplementedError] unless overridden by the subclass
    def profane?(text)
      raise NotImplementedError, "#{self.class} must implement #{__method__}"
    end

    # @param text [String] text to inspect
    # @return [Array] the profane words found in the text
    # @raise [NotImplementedError] unless overridden by the subclass
    def profane_words(text)
      raise NotImplementedError, "#{self.class} must implement #{__method__}"
    end

    # @param text [String] text to inspect
    # @return [Integer] number of entries returned by #profane_words
    def profanity_count(text)
      profane_words(text).size
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+ require_relative 'component'
5
+
6
module ProfanityFilterEngine
  # Component that delegates to an ordered list of strategies and merges
  # their answers.
  class Composite < Component
    attr_reader :strategies

    def initialize
      @strategies = []
    end

    # Appends a single strategy; returns the strategies list.
    def add_strategy(strategy)
      strategies.push(strategy)
    end

    # Appends every given strategy, in order; returns the strategies list.
    def add_strategies(*new_strategies)
      strategies.push(*new_strategies)
    end

    # Removes the strategy from the list; returns what Array#delete returns.
    def delete_strategy(strategy)
      strategies.delete(strategy)
    end

    # True as soon as one strategy reports the text as profane.
    def profane?(text)
      strategies.each do |candidate|
        return true if candidate.profane?(text)
      end
      false
    end

    # Collects the profane words reported by every strategy, without
    # duplicates.
    def profane_words(text)
      collected = strategies.each_with_object([]) do |candidate, found|
        found.concat(candidate.profane_words(text))
      end
      collected.uniq
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'regexp_strategy'
4
+
5
module ProfanityFilterEngine
  # Regexp strategy that matches whole dictionary words: every word pattern
  # is wrapped in DELIMITER on both sides. Subclasses customise how a single
  # word is turned into a pattern by overriding #build_word_regexp.
  class ExactMatchStrategy < RegexpStrategy
    # Pattern fragment placed before and after each word pattern.
    DELIMITER = '(?:\b|^|$|_)'
    # Default for ignore_case; subclasses may redefine it.
    DEFAULT_IGNORE_CASE = false

    attr_reader :delimiter, :ignore_case

    # @param dictionary [Array<String>] words to match
    # @param ignore_case [Boolean] build case-insensitive patterns when true.
    #   The default is looked up through self.class so that a subclass that
    #   redefines DEFAULT_IGNORE_CASE actually gets its own default (a bare
    #   constant reference here would always resolve to this class's value).
    def initialize(dictionary:, ignore_case: self.class::DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @delimiter = DELIMITER
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # Builds one delimiter-wrapped Regexp per dictionary word and unions them.
    def build_profanity_regexp
      option = ignore_case ? Regexp::IGNORECASE : nil
      regexp_list = dictionary.map do |word|
        Regexp.new("#{delimiter}#{build_word_regexp(word)}#{delimiter}", option)
      end

      Regexp.union(*regexp_list)
    end

    # Pattern fragment for a single word; the base implementation matches the
    # word literally.
    def build_word_regexp(word)
      Regexp.escape(word)
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'exact_match_strategy'
4
+ require 'pry'
5
+
6
require 'yaml'

module ProfanityFilterEngine
  # Exact-match variant that replaces characters of a dictionary word with a
  # group of alternatives read from leet_strategy_dictionary.yaml, so that
  # "leet" spellings of the word also match.
  #
  # NOTE(review): this file also does `require 'pry'` at the top, which looks
  # like a leftover debugging dependency - confirm it is needed at runtime.
  class LeetExactMatchStrategy < ExactMatchStrategy
    # Declared default for ignore_case.
    # NOTE(review): the inherited initializer names DEFAULT_IGNORE_CASE in
    # ExactMatchStrategy's own scope - confirm this override is picked up
    # when ignore_case is not passed explicitly.
    DEFAULT_IGNORE_CASE = true

    private

    # Builds the pattern fragment for one word. Characters that have an entry
    # in LEET_DICTIONARY (looked up lower-cased) are replaced by that entry;
    # all other characters are matched literally.
    def build_word_regexp(word)
      # LEET_DICTIONARY is created lazily the first time a word is compiled.
      build_leet_dictionary unless defined? LEET_DICTIONARY
      word.each_char.map do |char|
        LEET_DICTIONARY.fetch(char.downcase) { Regexp.escape(char) }
      end.join
    end

    # Loads the leet dictionary from disk and publishes it as the frozen
    # constant LeetExactMatchStrategy::LEET_DICTIONARY.
    def build_leet_dictionary
      lib_dir = File.expand_path('../../../', __FILE__)
      file = File.read("#{lib_dir}/profanity_dictionaries/leet_strategy_dictionary.yaml")
      dict = transform_data_to_regex(YAML.safe_load(file)).freeze
      ::ProfanityFilterEngine::LeetExactMatchStrategy.const_set('LEET_DICTIONARY', dict)
    end

    # Turns each list of alternatives into a non-capturing alternation group.
    # The alternatives are inserted as-is (presumably they are already valid
    # regexp fragments - verify against the YAML file). Returns a new hash;
    # the argument is left untouched.
    def transform_data_to_regex(dict)
      dict.transform_values { |alternatives| "(?:#{alternatives.join('|')})" }
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'regexp_strategy'
4
+
5
module ProfanityFilterEngine
  # Regexp strategy that matches dictionary words anywhere in the text,
  # including inside longer words (no delimiters are added).
  class PartialMatchStrategy < RegexpStrategy
    DEFAULT_IGNORE_CASE = false

    attr_reader :ignore_case

    # @param dictionary [Array<String>] words to match
    # @param ignore_case [Boolean] build case-insensitive patterns when true
    def initialize(dictionary:, ignore_case: DEFAULT_IGNORE_CASE)
      @dictionary = dictionary
      @ignore_case = ignore_case
      @profanity_regexp = build_profanity_regexp
    end

    private

    # Unions one literal (escaped) pattern per dictionary word.
    def build_profanity_regexp
      flags = ignore_case ? Regexp::IGNORECASE : nil
      literal_patterns = dictionary.map { |entry| Regexp.new(Regexp.escape(entry), flags) }
      Regexp.union(literal_patterns)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'component'
4
+
5
module ProfanityFilterEngine
  # Component backed by a single regular expression. The expression is
  # either supplied by the caller or built from the dictionary, with each
  # word wrapped in DEFAULT_DELIMITER.
  class RegexpStrategy < Component
    # Pattern fragment placed before and after each dictionary word.
    DEFAULT_DELIMITER = '(?:\b|^|$|_)'

    attr_reader :dictionary, :profanity_regexp

    attr_writer :profanity_regexp
    private :profanity_regexp=

    # @param dictionary [Array<String>] words to match
    # @param profanity_regexp [Regexp, nil] ready-made expression; when
    #   omitted one is built from the dictionary
    def initialize(dictionary:, profanity_regexp: nil)
      @dictionary = dictionary
      @profanity_regexp = profanity_regexp
      @profanity_regexp ||= build_profanity_regexp
    end

    # Distinct matches of the expression found in the text.
    def profane_words(text)
      matches = text.scan(profanity_regexp)
      matches.uniq
    end

    # True when the expression matches anywhere in the text.
    def profane?(text)
      profanity_regexp.match?(text)
    end

    private

    # Unions one delimiter-wrapped, escaped pattern per dictionary word.
    def build_profanity_regexp
      wrapped_patterns = dictionary.map do |entry|
        Regexp.new("#{DEFAULT_DELIMITER}#{Regexp.escape(entry)}#{DEFAULT_DELIMITER}")
      end
      Regexp.union(wrapped_patterns)
    end
  end
end
+ end