bad_word_detector 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +56 -0
- data/Rakefile +10 -0
- data/bad_word_detector.gemspec +33 -0
- data/bin/bad_word_detector +13 -0
- data/lib/bad_word_detector.rb +200 -0
- data/lib/bad_word_detector/bad_word.rb +30 -0
- data/lib/bad_word_detector/prefix_tree.rb +56 -0
- data/lib/bad_word_detector/rule.rb +35 -0
- data/lib/bad_word_detector/state.rb +41 -0
- data/lib/bad_word_detector/version.rb +3 -0
- data/lib/bad_word_detector/whitelist.rb +33 -0
- data/lib/conf/library.yaml +155 -0
- data/lib/conf/rules.yaml +59 -0
- data/lib/conf/whitelist.yaml +236921 -0
- data/lib/conf/words.yaml +459 -0
- data/test/rules.yaml +63 -0
- data/test/test.rb +26 -0
- data/test/test_bad_word_detector.rb +72 -0
- metadata +110 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Fedotov Daniil
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# BadWordDetector
|
2
|
+
|
3
|
+
Swear word detector
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'bad_word_detector'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install bad_word_detector
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
Detects `#uck` , `F|_|__C_K` and other variations of hidden swear words in text.
|
22
|
+
|
23
|
+
Usage:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
finder = BadWordDetector.new
|
27
|
+
finder.find("What the #uck")
|
28
|
+
```
|
29
|
+
|
30
|
+
It returns a `BadWord` object, or `nil` if no bad word is found.
|
31
|
+
|
32
|
+
Transformation rules are defined in the form:
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
{"#" => {"symbol"=>"f", "weight" => 2}} # weight is optional
|
36
|
+
```
|
37
|
+
|
38
|
+
Or in file conf/rules.yaml
|
39
|
+
|
40
|
+
The list of swear words is located in conf/library.yaml
|
41
|
+
|
42
|
+
The whitelist of English words is located in conf/whitelist.yaml
|
43
|
+
|
44
|
+
You can also supply your own rules, library and whitelist:
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
finder = BadWordDetector.new rules, library, whitelist
|
48
|
+
```
|
49
|
+
|
50
|
+
## Contributing
|
51
|
+
|
52
|
+
1. Fork it
|
53
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
54
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
55
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
56
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/bad_word_detector/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for bad_word_detector.
Gem::Specification.new do |gem|
  gem.authors = ["Fedotov Daniil"]
  gem.email = ["fedotov.danil@gmail.com"]
  gem.summary = %q{Swear word detector}
  gem.description = %q{
Detects #uck F|_|__C_K and other variations of hidden swear words in text.
Usage:
```
finder = BadWordDetector.new
finder.find("What the #uck")
it will return BadWord object
```
Transformation rules is defined in form: {"#" => {"symbol"=>"f", "weight" => 2}} (where weight is optional)
in file conf/rules.yaml
List of swear words is located in conf/library.yaml
Whitelist of english words in conf/whitelist.yaml
You can also set own rules:
finder = BadWordDetector.new rules, library, whitelist
}
  gem.homepage = "https://github.com/hairyhum/bad-words.ruby"

  # Split on the input record separator (newline by default). The previous
  # `$\` is nil by default, which makes String#split fall back to splitting on
  # any whitespace and would break file names that contain spaces.
  gem.files = `git ls-files`.split($/)
  gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.name = "bad_word_detector"
  gem.require_paths = ["lib"]
  gem.version = BadWordDetector::VERSION
  gem.add_development_dependency('yard')
  gem.add_development_dependency('redcarpet')
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
require "yaml"
|
2
|
+
require "set"
|
3
|
+
require "bad_word_detector/state"
|
4
|
+
require "bad_word_detector/prefix_tree"
|
5
|
+
require "bad_word_detector/rule"
|
6
|
+
require "bad_word_detector/version"
|
7
|
+
require "bad_word_detector/whitelist"
|
8
|
+
require "bad_word_detector/bad_word"
|
9
|
+
|
10
|
+
# Searches text for words from a library of bad words, applying per-character
# and per-substring transformation rules while walking a prefix tree of the
# library.
class BadWordDetector
  # Weight of the implicit rule mapping a single-character key to itself.
  SELF_RULE_WEIGHT = 3

  # States with a weight below this value are never queued for exploration.
  MIN_STATE_WEIGHT = 0.3

  # Create new badword checker
  #
  # @param rules [Hash<String,Array<Hash<String, any>>>] Hash where values are arrays
  #   of Hash<['symbol', 'weight'], any> where weight is optional. A single
  #   Hash value (the form shown in the README) is treated as a one-item array.
  #   Defaults to conf/rules.yaml.
  #
  # @param library [Array<String>] Array of bad words to find.
  #   Defaults to conf/library.yaml.
  #
  # @param whitelist [Array<String>] Array of words that is acceptable.
  #   Used in false-positive check. Defaults to conf/whitelist.yaml.
  #
  def initialize(rules = nil, library = nil, whitelist = nil)
    confdir = File.expand_path('conf', File.dirname(__FILE__))
    rules ||= YAML.load_file(File.join(confdir, 'rules.yaml'))
    library ||= YAML.load_file(File.join(confdir, 'library.yaml'))
    whitelist ||= YAML.load_file(File.join(confdir, 'whitelist.yaml'))

    @rule_sets = {}   # rules keyed by a single character
    @string_sets = {} # rules keyed by a multi-character string
    rules.each do |raw_key, raw_items|
      key = raw_key.to_s
      next if key.empty?

      items = raw_items.is_a?(Hash) ? [raw_items] : raw_items
      parsed = items.map { |item| Rule.new(key, item['symbol'], item['weight']) }
      if key.length == 1
        # Every single-character key may also stand for itself.
        parsed << Rule.new(key, key, SELF_RULE_WEIGHT)
        @rule_sets[key] = parsed
      else
        @string_sets[key] = parsed
      end
    end

    @library = PrefixTree.new(library)
    @whitelist = Whitelist.new(whitelist)
  end

  #
  # Searches string for some word in library
  #
  # @param text [String] String to search
  #
  # @param return_white [Boolean] Flag to indicate if search should return whitelist word
  #
  # @return [BadWord, nil] BadWord object, containing information about found
  #   word and its position in text, or nil when nothing is found
  #
  def find(text, return_white = false)
    downcased = text.downcase
    text.length.times do |index|
      found = find_part(downcased, index)
      next unless found

      word = BadWord.new(found[:word], text, index, found[:length], @whitelist)
      return word if return_white || !word.white?
    end
    nil
  end

  private

  # Tries to match a library word starting exactly at +index+ of +text+.
  #
  # @return [Hash, nil] {:length, :word} for a match, nil otherwise. Positions
  #   that start with a space are never matched.
  def find_part(text, index)
    input = text[index..-1]
    return nil if input.nil? || input.start_with?(' ')

    found = process(input, @library)
    return nil unless found

    word, length = found
    { :length => length, :word => word }
  end

  # Matches the beginning of +string+ against +library+.
  #
  # First looks the whole string up in the tree; otherwise explores State
  # objects taken from a queue kept in descending weight order until one
  # reports success or the queue runs dry.
  #
  # @return [Array(String, Integer), nil] matched word and consumed length
  def process(string, library)
    plain = library[string]
    return [plain.value, plain.value.length] if plain && plain.value

    passed = []     # states that produced successors
    bad_states = [] # states that produced none
    queue = [State.new([], library)]
    until queue.empty?
      state = queue.shift
      new_states = get_new_states(state, string)
      unless new_states
        bad_states << state
        next
      end

      passed << state
      winner = new_states.find(&:success?)
      return [winner.text, winner.length] if winner

      push_states(queue, (new_states - bad_states) - passed)
    end
    nil
  end

  # Inserts +states+ into +queue+, keeping it sorted by descending weight.
  # States already queued, or lighter than MIN_STATE_WEIGHT, are skipped.
  def push_states(queue, states)
    states.each do |state|
      next if queue.any? { |queued| queued == state }

      weight = state.weight
      next if weight < MIN_STATE_WEIGHT

      # Insert before the FIRST lighter state; the previous implementation
      # picked the last one, which broke the ordering of the queue.
      position = queue.index { |queued| queued.weight < weight } || queue.length
      queue.insert(position, state)
    end
  end

  # @return [Array<State>, nil] non-failed successors of +state+, or nil when
  #   the string is exhausted or every successor failed
  def get_new_states(state, string)
    next_symbols = get_next_symbols(state.length, string)
    return nil unless next_symbols

    new_states = append_path(next_symbols, state)
    new_states.empty? ? nil : new_states
  end

  # Appends every rule in +symbols+ to +state+ and drops the failed results.
  def append_path(symbols, state)
    symbols.map { |sym| state.append(sym) }.reject(&:failure?)
  end

  # @return [Array<Rule>, nil] rules applicable at +index+ of +string+,
  #   nil when +index+ is past the end of the string
  def get_next_symbols(index, string)
    char = string[index]
    return nil unless char

    get_rules(char.to_s, string[index..-1]) || []
  end

  # Collects the rules for +char+ plus every multi-character rule whose key is
  # a prefix of +string+. Always returns a fresh array so the rule sets built
  # in #initialize are never mutated between calls.
  def get_rules(char, string)
    char_rules = (@rule_sets[char] || [Rule.self(char)]).dup
    char_rules << Rule.empty(char) if char_rules.none? { |rule| rule.symbol == '' }

    string_rules = @string_sets.select do |key, _|
      key.start_with?(char) && string.start_with?(key)
    end.values.flatten

    char_rules + string_rules
  end
end
|
189
|
+
|
190
|
+
class Hash
  # Builds a new Hash from the [key, value] pair the block returns for each
  # entry of the receiver.
  #
  # @yieldparam key [Object] original key
  # @yieldparam value [Object] original value
  # @yieldreturn [Array(Object, Object)] new key and new value
  # @return [Hash] the mapped hash
  def hmap
    each_with_object({}) do |(key, value), mapped|
      new_key, new_value = yield key, value
      mapped[new_key] = new_value
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# A library word found in a source text, together with where it was found.
class BadWord
  attr_reader :text, :word, :index, :source, :white_words

  #
  # Create new BadWord
  #
  # @param word [String] found word
  # @param source [String] Source text where word was found
  # @param index [Integer] index of word in source
  # @param length [Integer] word length
  # @param whitelist [#check_bad_word] whitelist asked whether this word is acceptable
  def initialize(word, source, index, length, whitelist)
    @word = word
    @source = source
    @index = index
    @length = length

    # The exposed text runs from the match start up to (not including) the
    # first space at or after the last matched character, or to the end of
    # the source when there is no such space.
    last_char = @index + @length - 1
    stop = @source.index(' ', last_char)
    @text = stop ? @source[@index..stop - 1] : @source[@index..-1]

    # Must run last: the whitelist receives the fully initialised object.
    @white_words = whitelist.check_bad_word(self)
  end

  #
  # Check if word is in whitelist
  # @return [true, false]
  def white?
    @white_words ? true : false
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Prefix tree (trie) of strings stored as nested Hashes: each character of a
# word is a key leading to the sub-hash for the next character, and the
# complete word is stored under the :value key of its final node.
class PrefixTree
  # @param items [Array<String>] words to insert
  # @param hash_tree [Hash] existing nested-hash tree to start from; only its
  #   top level is copied, nested hashes stay shared with the original
  def initialize(items = [], hash_tree = {})
    @hash_tree = hash_tree.clone
    items.each { |item| self << item }
  end

  # @return [Hash] the underlying nested hash
  def hash_tree
    @hash_tree
  end

  # Inserts +string+ into the tree.
  #
  # @param string [String] word to add
  # @return [PrefixTree] self, so insertions can be chained
  def <<(string)
    node = hash_tree
    string.chars.each do |char|
      node[char] ||= {}
      node = node[char]
    end
    node[:value] = string
    self
  end

  # Looks up the subtree reached by following the characters of +string+.
  #
  # @param string [String] prefix to follow
  # @return [PrefixTree, nil] tree rooted at the prefix, nil if the prefix is absent
  def [](string = '')
    node = hash_tree
    string.chars.each do |char|
      node = node[char]
      return nil unless node
    end
    PrefixTree.new([], node)
  end

  # @return [String, nil] the word that ends exactly at the root of this tree
  def value
    hash_tree[:value]
  end

  # Tells whether any stored word continues past +string+.
  #
  # Only character keys count as children; the :value marker is ignored
  # (the previous implementation excluded the misspelt key :values, so a leaf
  # reported children). An absent prefix yields false instead of raising.
  #
  # @param string [String] prefix to inspect
  # @return [Boolean]
  def children?(string)
    subtree = self[string]
    return false unless subtree

    subtree.hash_tree.keys.any? { |key| key != :value }
  end

  # @return [PrefixTree] copy whose top-level hash is independent of this one
  def clone
    PrefixTree.new([], hash_tree)
  end

  # Two trees are equal when their underlying hashes are equal.
  # Objects that do not expose a hash_tree are never equal to a tree.
  def ==(sec)
    sec.respond_to?(:hash_tree) && sec.hash_tree == hash_tree
  end
end
|