bad_word_detector 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +56 -0
- data/Rakefile +10 -0
- data/bad_word_detector.gemspec +33 -0
- data/bin/bad_word_detector +13 -0
- data/lib/bad_word_detector.rb +200 -0
- data/lib/bad_word_detector/bad_word.rb +30 -0
- data/lib/bad_word_detector/prefix_tree.rb +56 -0
- data/lib/bad_word_detector/rule.rb +35 -0
- data/lib/bad_word_detector/state.rb +41 -0
- data/lib/bad_word_detector/version.rb +3 -0
- data/lib/bad_word_detector/whitelist.rb +33 -0
- data/lib/conf/library.yaml +155 -0
- data/lib/conf/rules.yaml +59 -0
- data/lib/conf/whitelist.yaml +236921 -0
- data/lib/conf/words.yaml +459 -0
- data/test/rules.yaml +63 -0
- data/test/test.rb +26 -0
- data/test/test_bad_word_detector.rb +72 -0
- metadata +110 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Fedotov Daniil
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# BadWordDetector
|
2
|
+
|
3
|
+
Swear word detector
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'bad_word_detector'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install bad_word_detector
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
Detects `#uck` , `F|_|__C_K` and other variations of hidden swear words in text.
|
22
|
+
|
23
|
+
Usage:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
finder = BadWordDetector.new
|
27
|
+
finder.find("What the #uck")
|
28
|
+
```
|
29
|
+
|
30
|
+
It will return a `BadWord` object, or `nil` when nothing is found.
|
31
|
+
|
32
|
+
Transformation rules are defined in the form:
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
{"#" => {"symbol"=>"f", "weight" => 2}} # weight is optional
|
36
|
+
```
|
37
|
+
|
38
|
+
Or in file conf/rules.yaml
|
39
|
+
|
40
|
+
List of swear words is located in conf/library.yaml
|
41
|
+
|
42
|
+
The whitelist of English words is located in conf/whitelist.yaml
|
43
|
+
|
44
|
+
You can also set your own rules:
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
finder = BadWordDetector.new rules, library, whitelist
|
48
|
+
```
|
49
|
+
|
50
|
+
## Contributing
|
51
|
+
|
52
|
+
1. Fork it
|
53
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
54
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
55
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
56
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/bad_word_detector/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for bad_word_detector.
#
# Expects BadWordDetector::VERSION to be loaded already (the version file is
# required before this block) and must be evaluated inside a git checkout,
# because the file list is taken from `git ls-files`.
Gem::Specification.new do |gem|
  gem.authors       = ["Fedotov Daniil"]
  gem.email         = ["fedotov.danil@gmail.com"]
  gem.summary       = %q{Swear word detector}
  gem.description   = %q{
Detects #uck F|_|__C_K and other variations of hidden swear words in text.
Usage:
```
finder = BadWordDetector.new
finder.find("What the #uck")
it will return BadWord object
```
Transformation rules is defined in form: {"#" => {"symbol"=>"f", "weight" => 2}} (where weight is optional)
in file conf/rules.yaml
List of swear words is located in conf/library.yaml
Whitelist of english words in conf/whitelist.yaml
You can also set own rules:
finder = BadWordDetector.new rules, library, whitelist
}
  gem.homepage      = "https://github.com/hairyhum/bad-words.ruby"

  # `git ls-files` prints one path per line. Split on the newline explicitly:
  # the previous `split($\)` used the output record separator, which is nil by
  # default and therefore split on any whitespace, breaking paths with spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "bad_word_detector"
  gem.require_paths = ["lib"]
  gem.version       = BadWordDetector::VERSION
  gem.add_development_dependency('yard')
  gem.add_development_dependency('redcarpet')
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
require "yaml"
|
2
|
+
require "set"
|
3
|
+
require "bad_word_detector/state"
|
4
|
+
require "bad_word_detector/prefix_tree"
|
5
|
+
require "bad_word_detector/rule"
|
6
|
+
require "bad_word_detector/version"
|
7
|
+
require "bad_word_detector/whitelist"
|
8
|
+
require "bad_word_detector/bad_word"
|
9
|
+
|
10
|
+
# Finds library ("bad") words in a text, including disguised spellings that
# are described by per-character and per-substring transformation rules.
class BadWordDetector

  # Weight passed to the identity rule (a character mapped to itself) that is
  # appended to every single-character rule set.
  SELF_RULE_WEIGHT = 3

  # States whose weight is below this value are never queued for exploration.
  MIN_STATE_WEIGHT = 0.3

  # Create new badword checker
  #
  # @param rules [Hash<String,Array<Hash<String, any>>>] Hash where values are arrays
  #   of Hash<['symbol', 'weight'], any> where weight is optional.
  #   Defaults to the bundled conf/rules.yaml.
  #
  # @param library [Array<String>] Array of bad words to find.
  #   Defaults to the bundled conf/library.yaml.
  #
  # @param whitelist [Array<String>] Array of words that is acceptable. Used in false-positive check.
  #   Defaults to the bundled conf/whitelist.yaml.
  #
  def initialize(rules = nil, library = nil, whitelist = nil)
    confdir = File.expand_path(File.dirname(__FILE__) + "/conf")
    rules ||= YAML.load_file("#{confdir}/rules.yaml")
    library ||= YAML.load_file("#{confdir}/library.yaml")

    # Keys of length 1 describe single characters; every such set also gets
    # an identity rule so the character can stand for itself.
    @rule_sets = build_rule_sets(rules) { |key| key.length == 1 }
    @rule_sets.each { |key, set| set << Rule.new(key, key, SELF_RULE_WEIGHT) }

    # Longer keys describe multi-character sequences.
    @string_sets = build_rule_sets(rules) { |key| key.length > 1 }

    @library = PrefixTree.new library
    @whitelist = Whitelist.new(whitelist || YAML.load_file("#{confdir}/whitelist.yaml"))
  end

  #
  # Searches string for some word in library
  #
  # @param text [String] String to search
  #
  # @param return_white [Boolean] Flag to indicate if search should return whitelist word
  #
  # @return [BadWord, nil] BadWord object, containing information about found word and its position in text,
  #   or nil when nothing was found
  #
  def find(text, return_white = false)
    downcased = text.downcase
    (0...text.length).each do |index|
      found = find_part(downcased, index)
      next unless found

      word = BadWord.new(found[:word], text, index, found[:length], @whitelist)
      return word if return_white || !word.white?
    end
    nil
  end

  private

  # Builds a Hash of stringified rule key => Array<Rule> for every entry of
  # +rules+ whose stringified key is accepted by the given block.
  def build_rule_sets(rules)
    rules.each_with_object({}) do |(key, items), sets|
      key = key.to_s
      next unless yield(key)

      sets[key] = items.map { |item| Rule.new(key, item['symbol'], item['weight']) }
    end
  end

  # Tries to match a library word starting exactly at +index+ of +text+.
  # Positions that start with a space are skipped.
  #
  # @return [Hash, nil] {:length => matched length, :word => library word} or nil
  def find_part(text, index)
    input = text[index..-1]
    # input is nil only when index lies beyond the string; treat as no match.
    return nil if input.nil? || input.start_with?(' ')

    found = process(input, @library)
    return nil unless found

    word, length = found
    {:length => length, :word => word}
  end

  # Searches for a library word at the beginning of +string+.
  #
  # First checks whether the whole string is itself a library word; otherwise
  # explores State objects taken from a queue ordered by push_states until a
  # successful state is produced or the queue runs dry.
  #
  # @return [Array(String, Integer), nil] matched word and consumed length, or nil
  def process(string, library)
    exact = library[string]
    return [exact.value, exact.value.length] if exact && exact.value

    passed = []      # states that produced successors
    bad_states = []  # states that produced none
    queue = [State.new([], library)]
    until queue.empty?
      state = queue.shift
      new_states = get_new_states state, string
      unless new_states
        bad_states << state
        next
      end

      passed << state
      winner = new_states.find(&:success?)
      return [winner.text, winner.length] if winner

      push_states queue, (new_states - bad_states) - passed
    end
    nil
  end

  # Inserts +states+ into +queue+, keeping the queue ordered by descending
  # weight. States already queued and states lighter than MIN_STATE_WEIGHT
  # are dropped. Among equal weights, earlier-queued states stay in front.
  def push_states(queue, states)
    states.each do |state|
      next if queue.include?(state)

      weight = state.weight
      next if weight < MIN_STATE_WEIGHT

      # Insert before the FIRST lighter element. The previous implementation
      # kept scanning and used the last lighter element, which broke the
      # descending order the other branches relied on.
      position = queue.index { |queued| queued.weight < weight } || queue.length
      queue.insert(position, state)
    end
  end

  # @return [Array<State>, nil] non-failed successor states of +state+, or nil
  #   when the input is exhausted or every successor failed
  def get_new_states(state, string)
    next_symbols = get_next_symbols state.length, string
    return nil unless next_symbols

    new_states = append_path next_symbols, state
    new_states.empty? ? nil : new_states
  end

  # Appends every rule in +symbols+ to +state+ and drops the failed results.
  def append_path(symbols, state)
    symbols.map { |sym| state.append sym }.reject(&:failure?)
  end

  # @return [Array<Rule>, nil] rules applicable at +index+ of +string+, or nil
  #   when +index+ is past the end of the string
  def get_next_symbols(index, string)
    char = string[index]
    return nil unless char

    get_rules(char.to_s, string[index..-1]) || []
  end

  # Collects the rules that apply to +char+, where +string+ is the remaining
  # input beginning with that character: the character's own rules (or a
  # self rule when none are configured), an empty-symbol rule if none is
  # present yet, and every multi-character rule whose key prefixes +string+.
  #
  # Works on a copy so the rule sets stored on the detector are never
  # modified; previously the matching multi-character rules were appended to
  # the stored array and leaked into all later lookups for the same character.
  def get_rules(char, string)
    char_rules = (@rule_sets[char] || [Rule.self(char)]).dup
    if char_rules.none? { |rule| rule.symbol == '' }
      char_rules << Rule.empty(char)
    end

    string_rules = @string_sets.select do |key, _|
      key.start_with?(char) && string.start_with?(key)
    end.values.flatten

    char_rules.concat(string_rules)
  end

end
|
189
|
+
|
190
|
+
class Hash
  # Builds a new Hash from the pairs returned by the block.
  #
  # The block receives each key and value and must return a two-element
  # +[new_key, new_value]+ array; later pairs overwrite earlier ones that
  # map to the same key.
  #
  # @yieldparam key [Object] current key
  # @yieldparam value [Object] current value
  # @yieldreturn [Array(Object, Object)] replacement key and value
  # @return [Hash] the transformed hash
  def hmap
    each_with_object({}) do |(key, value), mapped|
      new_key, new_value = yield key, value
      mapped[new_key] = new_value
    end
  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Result of a successful search: the library word that matched, where it was
# found in the source text, and the outcome of the whitelist check.
class BadWord
  attr_reader :text, :word, :index, :source, :white_words

  #
  # Create new BadWord
  #
  # @param word [String] library word that was matched
  # @param source [String] text in which the word was found
  # @param index [Integer] position in source where the match starts
  # @param length [Integer] number of source characters covered by the match
  # @param whitelist [#check_bad_word] object that receives this BadWord and
  #   returns the whitelisted words it corresponds to (or a falsy value)
  def initialize(word, source, index, length, whitelist)
    @word = word
    @source = source
    @index = index
    @length = length

    # The exposed text runs from the match start up to the character before
    # the first space found at or after the last matched character; when
    # there is no such space it runs to the end of the source.
    last_matched = @index + @length - 1
    boundary = @source.index(' ', last_matched)
    stop = boundary ? boundary - 1 : -1
    @text = @source[@index..stop]

    # Called last because the whitelist receives this object.
    @white_words = whitelist.check_bad_word(self)
  end

  #
  # Tells whether the whitelist check returned something truthy.
  # @return [true, false]
  def white?
    @white_words ? true : false
  end

end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Prefix tree over strings, stored as nested hashes keyed by single
# characters. A node at which an inserted string ends keeps that string
# under the +:value+ key.
class PrefixTree
  # @return [Hash] the nested hash backing this tree
  attr_reader :hash_tree

  # @param items [Array<String>] strings to insert
  # @param hash_tree [Hash] existing nested hash to start from; only the top
  #   level is copied, nested levels stay shared with the argument
  def initialize(items = [], hash_tree = {})
    @hash_tree = hash_tree.clone
    items.each { |item| self << item }
  end

  # Inserts +string+ into the tree.
  #
  # @param string [String]
  # @return [PrefixTree] self, so insertions can be chained
  def <<(string)
    node = hash_tree
    string.each_char { |char| node = (node[char] ||= {}) }
    node[:value] = string
    self
  end

  # Walks the tree along the characters of +string+.
  #
  # @param string [String] prefix to follow
  # @return [PrefixTree, nil] subtree reached by the prefix, or nil when the
  #   prefix is not present
  def [](string = '')
    node = hash_tree
    string.each_char do |char|
      node = node[char]
      return nil unless node
    end
    PrefixTree.new [], node
  end

  # @return [String, nil] the string that ends exactly at this node, if any
  def value
    hash_tree[:value]
  end

  # Tells whether any inserted string continues past +string+.
  #
  # The +:value+ marker is not a child, and an unknown prefix has no children
  # (the previous version excluded the misspelled key +:values+, so leaves
  # reported children, and raised on unknown prefixes).
  #
  # @param string [String] prefix to inspect
  # @return [true, false]
  def children?(string)
    subtree = self[string]
    return false unless subtree

    subtree.hash_tree.keys.any? { |key| key != :value }
  end

  # @return [PrefixTree] a new tree built from this tree's hash
  def clone
    PrefixTree.new [], hash_tree
  end

  # Two trees are equal when their backing hashes are equal; anything that
  # does not expose +hash_tree+ is never equal.
  def ==(sec)
    sec.respond_to?(:hash_tree) && sec.hash_tree == hash_tree
  end

end
|