bad_word_detector 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Fedotov Daniil
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # BadWordDetector
2
+
3
+ Swear word detector
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'bad_word_detector'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install bad_word_detector
18
+
19
+ ## Usage
20
+
21
+ Detects `#uck`, `F|_|__C_K`, and other variations of hidden swear words in text.
22
+
23
+ Usage:
24
+
25
+ ```ruby
26
+ finder = BadWordDetector.new
27
+ finder.find("What the #uck")
28
+ ```
29
+
30
+ It returns a `BadWord` object, or `nil` when no bad word is found.
31
+
32
+ Transformation rules are defined in the form:
33
+
34
+ ```ruby
35
+ {"#" => [{"symbol"=>"f", "weight" => 2}]} # each key maps to a list of rules; weight is optional
36
+ ```
37
+
38
+ The default rules are stored in the file `conf/rules.yaml`.
39
+
40
+ The list of swear words is located in `conf/library.yaml`.
41
+
42
+ The whitelist of English words is located in `conf/whitelist.yaml`.
43
+
44
+ You can also supply your own rules, library, and whitelist:
45
+
46
+ ```ruby
47
+ finder = BadWordDetector.new rules, library, whitelist
48
+ ```
49
+
50
+ ## Contributing
51
+
52
+ 1. Fork it
53
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
54
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
55
+ 4. Push to the branch (`git push origin my-new-feature`)
56
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require "rake/testtask"
4
+
5
+
6
+ Rake::TestTask.new do |t|
7
+ t.libs << "test"
8
+ t.test_files = FileList['test/test*.rb']
9
+ t.verbose = true
10
+ end
@@ -0,0 +1,33 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for bad_word_detector. The version constant is loaded from
# lib/bad_word_detector/version so it is defined in a single place.
require File.expand_path('../lib/bad_word_detector/version', __FILE__)

Gem::Specification.new do |gem|
  gem.name          = "bad_word_detector"
  gem.version       = BadWordDetector::VERSION
  gem.authors       = ["Fedotov Daniil"]
  gem.email         = ["fedotov.danil@gmail.com"]
  gem.summary       = %q{Swear word detector}
  gem.description   = %q{
Detects #uck F|_|__C_K and other variations of hidden swear words in text.
Usage:
```
finder = BadWordDetector.new
finder.find("What the #uck")
it will return BadWord object
```
Transformation rules is defined in form: {"#" => {"symbol"=>"f", "weight" => 2}} (where weight is optional)
in file conf/rules.yaml
List of swear words is located in conf/library.yaml
Whitelist of english words in conf/whitelist.yaml
You can also set own rules:
finder = BadWordDetector.new rules, library, whitelist
}
  gem.homepage      = "https://github.com/hairyhum/bad-words.ruby"
  # The bundled LICENSE file is the MIT license; declare it in the metadata too.
  gem.license       = "MIT"

  # Ask git for a NUL-separated listing and split on NUL. Splitting on the
  # default separator (as `split($\)` did, since `$\` is nil) breaks any file
  # name that contains whitespace.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency('yard')
  gem.add_development_dependency('redcarpet')
end
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby

# Command-line entry point: searches the first argument for a bad word and
# prints whatever BadWordDetector#find returns.

# Make the gem's lib directory (a sibling of this script's directory) loadable.
lib_dir = "#{File.expand_path(File.dirname(__FILE__))}/../lib"
$LOAD_PATH << lib_dir

require 'bad_word_detector'

input = ARGV.first

if input
  puts BadWordDetector.new.find(input)
else
  puts 'Specify input string'
end
@@ -0,0 +1,200 @@
1
+ require "yaml"
2
+ require "set"
3
+ require "bad_word_detector/state"
4
+ require "bad_word_detector/prefix_tree"
5
+ require "bad_word_detector/rule"
6
+ require "bad_word_detector/version"
7
+ require "bad_word_detector/whitelist"
8
+ require "bad_word_detector/bad_word"
9
+
10
# Finds library words hidden in a text by expanding every input character
# through substitution rules and following the resulting candidates through a
# prefix tree of known words.
#
# Collaborators (State, Rule, PrefixTree, Whitelist, BadWord) are defined in
# the files required at the top of this file.
class BadWordDetector
  # States whose weight is below this value are never queued by #push_states.
  MIN_STATE_WEIGHT = 0.3

  # Weight given to the implicit "character maps to itself" rule that is added
  # to every single-character rule set.
  SELF_RULE_WEIGHT = 3

  # Create new badword checker
  #
  # @param rules [Hash<String, Array<Hash<String, any>>>] substitution rules.
  #   Each value is an array of hashes with the keys 'symbol' and (optionally)
  #   'weight'. A single hash is accepted as shorthand for a one-element array.
  #   Defaults to conf/rules.yaml next to this file.
  #
  # @param library [Array<String>] Array of bad words to find.
  #   Defaults to conf/library.yaml.
  #
  # @param whitelist [Array<String>] Array of words that is acceptable.
  #   Used in false-positive check. Defaults to conf/whitelist.yaml.
  #
  def initialize(rules = nil, library = nil, whitelist = nil)
    confdir = File.expand_path(File.dirname(__FILE__) + "/conf")
    rules ||= YAML.load_file("#{confdir}/rules.yaml")
    library ||= YAML.load_file("#{confdir}/library.yaml")
    whitelist ||= YAML.load_file("#{confdir}/whitelist.yaml")

    @rule_sets = {}    # rules keyed by a single character
    @string_sets = {}  # rules keyed by a multi-character string
    rules.each do |key, items|
      key = key.to_s
      next if key.empty?

      parsed = rule_items(items).map do |item|
        Rule.new(key, item['symbol'], item['weight'])
      end
      if key.length == 1
        # A single character may always stand for itself.
        parsed << Rule.new(key, key, SELF_RULE_WEIGHT)
        @rule_sets[key] = parsed
      else
        @string_sets[key] = parsed
      end
    end

    @library = PrefixTree.new library
    @whitelist = Whitelist.new whitelist
  end

  #
  # Searches string for some word in library
  #
  # @param text [String] String to search
  #
  # @param return_white [Boolean] Flag to indicate if search should return whitelist word
  #
  # @return [BadWord, nil] BadWord object, containing information about found
  #   word and its position in text, or nil when nothing is found
  #
  def find(text, return_white = false)
    downcased = text.downcase
    (0...text.length).each do |index|
      found = find_part(downcased, index)
      next unless found

      word = BadWord.new(found[:word], text, index, found[:length], @whitelist)
      return word if return_white || !word.white?
    end
    nil
  end

  private

  # Normalize one rules entry to an array of rule hashes.
  def rule_items(items)
    items.is_a?(Hash) ? [items] : items
  end

  # Try to match a library word starting exactly at +index+ of +text+.
  # Positions that start with a space are skipped.
  #
  # @return [Hash, nil] {:length => matched length, :word => library word}
  def find_part(text, index)
    input = text[index..-1]
    return nil if input.start_with?(' ')

    found = process(input, @library)
    return nil unless found

    word, length = found
    {:length => length, :word => word}
  end

  # Search +string+ for a library word beginning at its first character.
  #
  # An exact hit (the whole string is a library word) is returned directly.
  # Otherwise candidate states are expanded from a queue that #push_states
  # keeps ordered by descending weight, until one reports success or the
  # queue runs dry.
  #
  # @return [Array(String, Integer), nil] the matched word and match length
  def process(string, library)
    plain = library[string]
    return [plain.value, plain.value.length] if plain && plain.value

    passed = []      # states that produced successors
    bad_states = []  # states that led nowhere
    queue = [State.new([], library)]
    until queue.empty?
      state = queue.shift
      new_states = get_new_states(state, string)
      unless new_states
        bad_states << state
        next
      end

      passed << state
      winner = new_states.find(&:success?)
      return [winner.text, winner.length] if winner

      push_states(queue, (new_states - bad_states) - passed)
    end
    nil
  end

  # Insert +states+ into +queue+, keeping it sorted by descending weight.
  # States already queued, and states lighter than MIN_STATE_WEIGHT, are
  # dropped.
  def push_states(queue, states)
    states.each do |state|
      next if queue.any? { |queued| queued == state }

      weight = state.weight
      next if weight < MIN_STATE_WEIGHT

      # Insert before the FIRST lighter entry; append when there is none.
      # (Picking the last lighter entry would break the ordering.)
      position = queue.index { |queued| queued.weight < weight } || queue.length
      queue.insert(position, state)
    end
  end

  # Expand +state+ by one input position.
  #
  # @return [Array<State>, nil] the non-failed successor states, or nil when
  #   the input is exhausted or every successor failed
  def get_new_states(state, string)
    next_symbols = get_next_symbols(state.length, string)
    return nil unless next_symbols

    new_states = append_path(next_symbols, state)
    new_states.empty? ? nil : new_states
  end

  # Append every candidate symbol to +state+ and keep the states that did not fail.
  def append_path(symbols, state)
    symbols.map { |sym| state.append(sym) }.reject(&:failure?)
  end

  # Rules applicable at position +index+ of +string+, or nil past the end.
  def get_next_symbols(index, string)
    char = string[index]
    return nil unless char

    get_rules(char.to_s, string[index..-1]) || []
  end

  # Collect the rules that apply to +char+ at the head of +string+:
  # the single-character rules (or the identity rule when none are configured),
  # an empty-symbol rule when none is present yet, and every multi-character
  # rule whose key is a prefix of +string+.
  #
  # A fresh array is returned on each call so the rule sets built in
  # #initialize are never mutated.
  def get_rules(char, string)
    rules = (@rule_sets[char] || [Rule.self(char)]).dup
    rules << Rule.empty(char) if rules.none? { |rule| rule.symbol == '' }

    @string_sets.each do |key, key_rules|
      rules.concat(key_rules) if key.start_with?(char) && string.start_with?(key)
    end
    rules
  end

end
189
+
190
class Hash
  # Build a new Hash from the [key, value] pair the block returns for every
  # entry of this hash.
  #
  # @yield [key, value] each entry of the receiver
  # @yieldreturn [Array(Object, Object)] the key and value to store
  # @return [Hash] the newly built hash
  def hmap
    each_with_object({}) do |(key, value), mapped|
      new_key, new_value = yield key, value
      mapped[new_key] = new_value
    end
  end

end
@@ -0,0 +1,30 @@
1
# A library word located inside a source text, together with the result of
# checking it against a whitelist.
class BadWord
  attr_reader :text, :word, :index, :source, :white_words

  #
  # Create new BadWord
  #
  # @param word [String] found word
  # @param source [String] Source text where word was found
  # @param index [Integer] index of word in source
  # @param length [Integer] word length
  # @param whitelist [#check_bad_word] whitelist object; it is handed this
  #   BadWord and its result is stored in #white_words
  def initialize(word, source, index, length, whitelist)
    @word = word
    @source = source
    @index = index
    @length = length
    @text = surrounding_text
    @white_words = whitelist.check_bad_word(self)
  end

  #
  # Check if word is in whitelist
  # @return [true, false]
  def white?
    @white_words ? true : false
  end

  private

  # Slice of the source running from the match start up to (not including)
  # the first space found at or after the last matched character. When no
  # such space exists the fallback of 0 turns the range end into -1, i.e.
  # the slice runs to the end of the source.
  def surrounding_text
    last_char = @index + @length - 1
    stop = @source.index(' ', last_char) || 0
    @source[@index..(stop - 1)]
  end

end
@@ -0,0 +1,56 @@
1
# Prefix tree (trie) of strings backed by nested hashes.
#
# Every node is a Hash whose String keys are the next characters; a node that
# terminates a stored string additionally holds that string under :value.
class PrefixTree
  # @return [Hash] the root node of this tree
  attr_reader :hash_tree

  # @param items [Array<String>] strings to insert
  # @param hash_tree [Hash] existing root node to start from. Only the top
  #   level is copied, so nested nodes stay shared with the given hash.
  def initialize(items = [], hash_tree = {})
    @hash_tree = hash_tree.clone
    items.each { |item| self << item }
  end

  # Insert +string+ into the tree.
  #
  # @param string [String]
  # @return [PrefixTree] self, so insertions can be chained
  def <<(string)
    node = string.chars.inject(hash_tree) { |hash, char| hash[char] ||= {} }
    node[:value] = string
    self
  end

  # Look up the subtree reached by following +string+ character by character.
  #
  # @param string [String]
  # @return [PrefixTree, nil] the subtree, or nil when the path does not exist
  def [](string = '')
    node = hash_tree
    string.chars.each do |char|
      node = node[char]
      return nil unless node
    end
    PrefixTree.new [], node
  end

  # @return [String, nil] the string stored at the root node of this (sub)tree
  def value
    hash_tree[:value]
  end

  # Whether any stored string continues past +string+.
  #
  # The :value marker is not a child, and an unknown prefix has no children
  # (instead of raising on the nil lookup result).
  #
  # @param string [String]
  # @return [Boolean]
  def children?(string)
    subtree = self[string]
    return false unless subtree

    subtree.hash_tree.keys.any? { |key| key != :value }
  end

  # @return [PrefixTree] a new tree over a top-level copy of this tree's root node
  def clone
    PrefixTree.new [], hash_tree
  end

  # Trees are equal when their underlying hashes are equal.
  def ==(sec)
    sec.hash_tree == hash_tree
  end

end