bad_word_detector 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Fedotov Daniil
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # BadWordDetector
2
+
3
+ Swear word detector
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'bad_word_detector'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install bad_word_detector
18
+
19
+ ## Usage
20
+
21
+ Detects `#uck`, `F|_|__C_K` and other variations of hidden swear words in text.
22
+
23
+ Usage:
24
+
25
+ ```ruby
26
+ finder = BadWordDetector.new
27
+ finder.find("What the #uck")
28
+ ```
29
+
30
+ It returns a `BadWord` object, or `nil` when no bad word is found.
31
+
32
+ Transformation rules are defined in the following form (each key maps to a list of replacements):
33
+
34
+ ```ruby
35
+ {"#" => [{"symbol" => "f", "weight" => 2}]} # weight is optional
36
+ ```
37
+
38
+ Or in file conf/rules.yaml
39
+
40
+ List of swear words is located in conf/library.yaml
41
+
42
+ The whitelist of English words is located in conf/whitelist.yaml
43
+
44
+ You can also supply your own rules, library and whitelist:
45
+
46
+ ```ruby
47
+ finder = BadWordDetector.new rules, library, whitelist
48
+ ```
49
+
50
+ ## Contributing
51
+
52
+ 1. Fork it
53
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
54
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
55
+ 4. Push to the branch (`git push origin my-new-feature`)
56
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require "rake/testtask"
4
+
5
+
6
# Registers the test task (Rake::TestTask uses its default task name when none
# is given). It picks up every file matching test/test*.rb and puts the
# "test" directory on the load path.
Rake::TestTask.new { |task|
  task.libs.push("test")
  task.test_files = FileList['test/test*.rb']
  task.verbose = true
}
@@ -0,0 +1,33 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/bad_word_detector/version', __FILE__)
3
+
4
# Gem specification for bad_word_detector.
Gem::Specification.new do |gem|
  gem.name        = "bad_word_detector"
  gem.version     = BadWordDetector::VERSION
  gem.authors     = ["Fedotov Daniil"]
  gem.email       = ["fedotov.danil@gmail.com"]
  gem.summary     = %q{Swear word detector}
  gem.description = %q{
    Detects #uck F|_|__C_K and other variations of hidden swear words in text.
    Usage:
    ```
    finder = BadWordDetector.new
    finder.find("What the #uck")
    ```
    It will return a BadWord object.
    Transformation rules are defined in the form: {"#" => [{"symbol"=>"f", "weight" => 2}]} (where weight is optional)
    or in the file conf/rules.yaml
    The list of swear words is located in conf/library.yaml
    The whitelist of English words is located in conf/whitelist.yaml
    You can also supply your own rules:
    finder = BadWordDetector.new rules, library, whitelist
  }
  gem.homepage    = "https://github.com/hairyhum/bad-words.ruby"
  # The bundled LICENSE file is the MIT license.
  gem.license     = "MIT"

  # `git ls-files` prints one path per line. Split on newlines explicitly:
  # `$\` is the *output* record separator (nil by default), which makes
  # String#split fall back to splitting on any whitespace and mangles paths
  # that contain spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency('yard')
  gem.add_development_dependency('redcarpet')
end
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH << "#{File.expand_path(File.dirname(__FILE__))}/../lib"
4
+
5
+ require 'bad_word_detector'
6
+
7
# Command-line entry point: looks for a bad word in the first argument and
# prints whatever the detector returns, or prints a hint when no argument
# was supplied.
input = ARGV.first

if input.nil?
  puts 'Specify input string'
else
  puts BadWordDetector.new.find(input)
end
@@ -0,0 +1,200 @@
1
+ require "yaml"
2
+ require "set"
3
+ require "bad_word_detector/state"
4
+ require "bad_word_detector/prefix_tree"
5
+ require "bad_word_detector/rule"
6
+ require "bad_word_detector/version"
7
+ require "bad_word_detector/whitelist"
8
+ require "bad_word_detector/bad_word"
9
+
10
# Finds library words in a text even when their characters were replaced
# according to a set of transformation rules (e.g. "#" standing for "f").
#
# NOTE(review): Rule, State, PrefixTree, Whitelist and BadWord live in sibling
# files; what is said about them below is inferred from how they are called
# here and should be confirmed against their definitions.
class BadWordDetector

  # Weight of the implicit "character stands for itself" rule appended to
  # every single-character rule set.
  SELF_RULE_WEIGHT = 3

  # Candidate states whose weight is below this value are never queued.
  MIN_STATE_WEIGHT = 0.3

  # Create new badword checker
  #
  # @param rules [Hash<String,Array<Hash<String, any>>>] Hash where values are arrays
  #   of Hash<['symbol', 'weight'], any> where weight is optional.
  #   Keys of length 1 become per-character rules, longer keys become
  #   multi-character ("string") rules. Defaults to conf/rules.yaml.
  #
  # @param library [Array<String>] Array of bad words to find.
  #   Defaults to conf/library.yaml.
  #
  # @param whitelist [Array<String>] Array of words that is acceptable.
  #   Used in false-positive check. Defaults to conf/whitelist.yaml.
  #
  def initialize(rules = nil, library = nil, whitelist = nil)
    confdir = File.expand_path(File.dirname(__FILE__) + "/conf")
    rules ||= YAML.load_file("#{confdir}/rules.yaml")
    library ||= YAML.load_file("#{confdir}/library.yaml")

    @rule_sets = build_rule_sets(rules) { |key| key.length == 1 }
    # Every explicitly configured character may also simply mean itself.
    @rule_sets.each { |key, set| set << Rule.new(key, key, SELF_RULE_WEIGHT) }
    @string_sets = build_rule_sets(rules) { |key| key.length > 1 }

    @library = PrefixTree.new library
    @whitelist = Whitelist.new(whitelist || YAML.load_file("#{confdir}/whitelist.yaml"))
  end

  #
  # Searches string for some word in library
  #
  # @param text [String] String to search
  #
  # @param return_white [Boolean] Flag to indicate if search should return whitelist word
  #
  # @return [BadWord, nil] BadWord object, containing information about found word
  #   and its position in text, or nil when nothing was found
  #
  def find(text, return_white = false)
    downcased = text.downcase
    text.length.times do |index|
      found = find_part(downcased, index)
      next unless found

      word = BadWord.new(found[:word], text, index, found[:length], @whitelist)
      return word if !word.white? || return_white
    end
    nil
  end

  private

  # Converts the raw rules hash into { key => [Rule, ...] }, keeping only the
  # keys (stringified) for which the block returns true.
  def build_rule_sets(rules)
    rules.each_with_object({}) do |(key, items), sets|
      key = key.to_s
      next unless yield(key)

      sets[key] = items.map { |item| Rule.new(key, item['symbol'], item['weight']) }
    end
  end

  # Tries to match a library word starting exactly at +index+ of +text+.
  #
  # @return [Hash, nil] {:length, :word} for a match, nil otherwise. Positions
  #   that start with a space are never matched.
  def find_part(text, index)
    input = text[index..-1]
    return nil if input.start_with?(' ')

    found = process(input, @library)
    return nil unless found

    word, length = found
    {:length => length, :word => word}
  end

  # Searches +string+ for a library word that begins at its first character.
  #
  # First checks whether the whole string is itself a stored word; otherwise
  # explores candidate states taken from a queue that push_states keeps
  # ordered by descending weight, so the heaviest state is expanded first.
  #
  # @return [Array(String, Integer), nil] the matched word and the length
  #   reported for the match, or nil when the queue runs dry
  def process(string, library)
    plain = library[string]
    return [plain.value, plain.value.length] if plain && plain.value

    passed = []      # states that were expanded successfully
    bad_states = []  # states that produced no viable successor
    queue = [State.new([], library)]
    until queue.empty?
      state = queue.shift
      new_states = get_new_states(state, string)
      unless new_states
        bad_states << state
        next
      end

      passed << state
      winner = new_states.find(&:success?)
      return [winner.text, winner.length] if winner

      push_states(queue, (new_states - bad_states) - passed)
    end
    nil
  end

  # Inserts +states+ into +queue+, keeping it sorted by descending weight.
  #
  # States already queued and states lighter than MIN_STATE_WEIGHT are
  # skipped. A state is placed in front of the first queued state that is
  # strictly lighter, so states of equal weight keep their arrival order.
  def push_states(queue, states)
    states.each do |state|
      weight = state.weight
      next if queue.include?(state) || weight < MIN_STATE_WEIGHT

      position = queue.index { |queued| queued.weight < weight } || queue.length
      queue.insert(position, state)
    end
  end

  # @return [Array, nil] non-failed successors of +state+, or nil when the
  #   string is exhausted or every successor failed
  def get_new_states(state, string)
    next_symbols = get_next_symbols(state.length, string)
    return nil unless next_symbols

    new_states = append_path(next_symbols, state)
    new_states.empty? ? nil : new_states
  end

  # Appends each rule to +state+ and drops the resulting states that report failure.
  def append_path(symbols, state)
    symbols.map { |sym| state.append(sym) }.reject(&:failure?)
  end

  # @return [Array, nil] rules applicable at +index+ of +string+, or nil when
  #   +index+ is past the end of the string
  def get_next_symbols(index, string)
    char = string[index]
    return nil unless char

    get_rules(char.to_s, string[index..-1]) || []
  end

  # Collects the rules applicable to +char+ at the start of +string+:
  # the configured (or self) rules for the character, an empty-symbol rule if
  # none is present, and every multi-character rule whose key prefixes +string+.
  #
  # Works on a copy so the rule sets stored in @rule_sets are never mutated;
  # appending to the stored array would leak multi-character rules into later,
  # unrelated lookups and grow the array on every call.
  def get_rules(char, string)
    char_rules = (@rule_sets[char] || [Rule.self(char)]).dup
    char_rules << Rule.empty(char) if char_rules.none? { |rule| rule.symbol == '' }

    string_rules = @string_sets.select do |key, _|
      key.start_with?(char) && string.start_with?(key)
    end.values.flatten

    char_rules.concat(string_rules)
  end

end
189
+
190
class Hash
  # Builds a new Hash from the [key, value] pair the block returns for every
  # entry of this hash. Later entries win when the block produces the same
  # key twice.
  #
  # @yield [key, value] each entry of the hash
  # @yieldreturn [Array(Object, Object)] the new key and the new value
  # @return [Hash] the transformed hash, or an Enumerator when no block is given
  def hmap
    return to_enum(:hmap) unless block_given?

    each_with_object({}) do |(key, value), result|
      new_key, new_value = yield key, value
      result[new_key] = new_value
    end
  end

end
@@ -0,0 +1,30 @@
1
+ class BadWord
2
+ attr_reader :text, :word, :index, :source, :white_words
3
+
4
+ #
5
+ # Create new BadWord
6
+ #
7
+ # @param word [String] found word
8
+ # @param source [String] Source text where word was found
9
+ # @param index [Integer] index of word in source
10
+ # @param length [Integer] word length
11
+ # @param whitelist [Array<String>] Whitelist words
12
+ def initialize(word, source, index, length, whitelist)
13
+ @index = index
14
+ @length = length
15
+ @word = word
16
+ @source = source
17
+ word_end = @index+@length-1
18
+ space_location = @source.index(' ', word_end) || 0
19
+ @text = @source[@index..space_location-1]
20
+ @white_words = whitelist.check_bad_word(self)
21
+ end
22
+
23
+ #
24
+ # Check if word is in whitelist
25
+ # @return [true, false]
26
+ def white?
27
+ !!@white_words
28
+ end
29
+
30
+ end
@@ -0,0 +1,56 @@
1
# Prefix tree (trie) of strings stored as nested hashes: every character is a
# String key leading to the next level, and a node that terminates a stored
# string keeps that string under the Symbol key :value.
class PrefixTree
  # @return [Hash] the nested hash backing this (sub)tree
  attr_reader :hash_tree

  # @param items [Array<String>] strings to insert
  # @param hash_tree [Hash] existing node to wrap; only its top level is
  #   copied, deeper levels stay shared with the given hash
  def initialize(items = [], hash_tree = {})
    @hash_tree = hash_tree.clone
    items.each { |item| self << item }
  end

  # Inserts +string+ into the tree.
  #
  # @param string [String]
  # @return [PrefixTree] self, so insertions can be chained
  def <<(string)
    node = string.chars.inject(hash_tree) { |branch, char| branch[char] ||= {} }
    node[:value] = string
    self
  end

  # Walks the tree along the characters of +string+.
  #
  # @param string [String] prefix to follow; the empty string selects the
  #   current node
  # @return [PrefixTree, nil] the subtree reached, or nil when the prefix is
  #   not present
  def [](string = '')
    node = hash_tree
    string.chars.each do |char|
      node = node[char]
      return nil unless node
    end
    PrefixTree.new([], node)
  end

  # @return [String, nil] the string that ends exactly at this node, if any
  def value
    hash_tree[:value]
  end

  # Tells whether any stored string continues beyond +string+.
  #
  # The :value marker is not a child, so a prefix that only terminates a
  # stored string has no children; an absent prefix has none either.
  #
  # @param string [String] prefix to inspect
  # @return [Boolean]
  def children?(string)
    node = self[string]
    return false unless node

    node.hash_tree.keys.any? { |key| key != :value }
  end

  # @return [PrefixTree] a new tree wrapping a top-level copy of this node
  def clone
    PrefixTree.new([], hash_tree)
  end

  # Trees are equal when their backing hashes are equal; anything that does
  # not expose a hash_tree is never equal.
  def ==(sec)
    sec.respond_to?(:hash_tree) && sec.hash_tree == hash_tree
  end

end