bayes_classifier 0.0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +22 -0
- data/README.md +55 -0
- data/Rakefile +1 -0
- data/bayes_classifier.gemspec +25 -0
- data/lib/bayes.rb +3 -0
- data/lib/bayes/category.rb +67 -0
- data/lib/bayes/classifier.rb +55 -0
- data/lib/bayes/string.rb +107 -0
- data/lib/bayes/test.rb +81 -0
- data/lib/bayes_classifier.rb +2 -0
- data/lib/bayes_classifier/version.rb +3 -0
- data/spec/category_spec.rb +144 -0
- data/spec/classifier_spec.rb +191 -0
- data/spec/data/negative +394 -0
- data/spec/data/positive +386 -0
- data/spec/spec_helper.rb +4 -0
- metadata +125 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d164296489c1f693f53f141b5233c8fc76babdcc
|
4
|
+
data.tar.gz: 65d421b448594b4e70c52b6841b4993c6276dcda
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b39634094b910f7cca0822803e10a378382b57d061f988a226dc62bae1d8685298959b7d8a16dc1b4c0091f13fb15332503f3fc71ce700478cb5fafe2ab790af
|
7
|
+
data.tar.gz: 953add18915f3bfa1881efb6c1a99658096fcdaffa14f4215778e3211517210b228e73405ecd76882856d810de424f7453e68e18c2607393deedeb9bc477ab3c
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
bayes_classifier (0.0.1.1)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.2.4)
|
10
|
+
fuubar (1.1.1)
|
11
|
+
rspec (~> 2.0)
|
12
|
+
rspec-instafail (~> 0.2.0)
|
13
|
+
ruby-progressbar (~> 1.0)
|
14
|
+
rake (10.1.0)
|
15
|
+
rspec (2.14.1)
|
16
|
+
rspec-core (~> 2.14.0)
|
17
|
+
rspec-expectations (~> 2.14.0)
|
18
|
+
rspec-mocks (~> 2.14.0)
|
19
|
+
rspec-core (2.14.5)
|
20
|
+
rspec-expectations (2.14.2)
|
21
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
22
|
+
rspec-instafail (0.2.4)
|
23
|
+
rspec-mocks (2.14.3)
|
24
|
+
ruby-progressbar (1.2.0)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
bayes_classifier!
|
31
|
+
bundler (~> 1.3)
|
32
|
+
fuubar
|
33
|
+
rake
|
34
|
+
rspec
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 DarthSim
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Bayes::Classifier
|
2
|
+
|
3
|
+
Bayes::Classifier allows you to classify strings with a naive Bayes classifier.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Just add the following line to your `Gemfile`:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'bayes_classifier'
|
11
|
+
```
|
12
|
+
|
13
|
+
Then run 'bundle install'.
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
# Create new classifier
|
19
|
+
classifier = Bayes::Classifier.new
|
20
|
+
|
21
|
+
# Train classifier with a string
|
22
|
+
classifier.train :category1, "lorem ipsum dolor sit amet"
|
23
|
+
|
24
|
+
# Train classifier with array of strings
|
25
|
+
classifier.train_with_array :category2, ["the first string", "the second string", "the third string"]
|
26
|
+
|
27
|
+
# Train classifier with textfile
|
28
|
+
classifier.train_with_file :category3, "data/category3.txt"
|
29
|
+
|
30
|
+
# Train classifier with CSV file (first column - string, second column - category)
|
31
|
+
classifier.train_with_csv "data/training.csv"
|
32
|
+
|
33
|
+
# Apply weighting to the top words of category
|
34
|
+
classifier.apply_weighting :category3, 10
|
35
|
+
|
36
|
+
# Remove empty categories
|
37
|
+
classifier.pop_unused
|
38
|
+
|
39
|
+
# Classify string
|
40
|
+
classifier.classify "the string"
|
41
|
+
|
42
|
+
# Reset categories
|
43
|
+
classifier.flush
|
44
|
+
|
45
|
+
# Remove all categories
|
46
|
+
classifier.flush_all
|
47
|
+
```
|
48
|
+
|
49
|
+
## Contributing
|
50
|
+
|
51
|
+
1. Fork it
|
52
|
+
2. Create your feature branch (git checkout -b my-new-feature)
|
53
|
+
3. Commit your changes (git commit -am 'Add some feature')
|
54
|
+
4. Push to the branch (git push origin my-new-feature)
|
55
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for bayes_classifier.
#
# Puts the gem's own lib/ directory on the load path so the version constant
# can be required without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'bayes_classifier/version'

Gem::Specification.new do |spec|
  spec.name          = "bayes_classifier"
  spec.version       = Bayes::VERSION
  spec.authors       = ["DarthSim"]
  spec.email         = ["darthsim@gmail.com"]
  spec.description   = "Naive Bayes classifier"
  spec.summary       = "Allows to classify strings with naive Bayes classifier"
  spec.homepage      = "https://github.com/DarthSim/bayes_classifier"
  spec.license       = "MIT"

  # Ask git for a NUL-delimited listing (-z) and split on NUL: this does not
  # depend on the mutable `$/` global, and git does not quote or escape
  # unusual path names in this mode.
  spec.files         = `git ls-files -z`.split("\x0")
  # Executables are the base names of everything tracked under bin/.
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^spec/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "fuubar"
end
|
data/lib/bayes.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
module Bayes
  # Word-frequency model for a single category of the naive Bayes classifier.
  #
  # Keeps a Hash of word => weight (@words) together with the sum of all
  # weights (@words_count). Texts are expected to respond to #word_hash and
  # return a Hash of word => occurrence count.
  class Category
    # Stand-in weight for a word that was never seen in this category, so
    # that Math.log is never asked for the logarithm of zero.
    MIN_SCORE = 0.0000001

    def initialize
      reset
    end

    # Forgets everything learned so far.
    def reset
      @words = {}
      @words_count = 0
    end

    # Adds the word counts of +text+ to this category.
    #
    # @param text [#word_hash] object returning a Hash of word => count
    def train(text)
      text.word_hash.each do |word, count|
        # fetch(word, 0) instead of to_i: weights may be Floats after
        # apply_weighting and must not be truncated.
        @words[word] = @words.fetch(word, 0) + count
        @words_count += count
      end
    end

    # Removes the word counts of +text+ from this category.
    #
    # A word is never reduced below zero: at most the weight currently stored
    # for it is removed, and words that were never learned are ignored. This
    # keeps both the per-word weights and the total non-negative, so
    # #score_for never takes the logarithm of a negative number.
    #
    # @param text [#word_hash] object returning a Hash of word => count
    def forget(text)
      text.word_hash.each do |word, count|
        known = @words[word]
        next unless known

        removed = [count, known].min
        remaining = known - removed
        if remaining > 0
          @words[word] = remaining
        else
          @words.delete(word)
        end
        @words_count -= removed
      end
    end

    # Multiplies the weight of each of the category's top words by +coeff+.
    #
    # @param coeff [Numeric] multiplication factor
    def apply_weighting(coeff)
      top_words.each do |word|
        apply_weighting_for word, coeff
      end
    end

    # Multiplies the weight of a single known word by +coeff+ and adjusts the
    # total accordingly. Unknown words are left alone.
    #
    # @param word [String]
    # @param coeff [Numeric] multiplication factor
    # @return [Numeric, nil] the new total weight, or nil for an unknown word
    def apply_weighting_for(word, coeff)
      old_weight = @words[word]
      return unless old_weight

      new_weight = old_weight * coeff
      @words[word] = new_weight
      @words_count += new_weight - old_weight
    end

    # Returns up to +num+ words, heaviest first.
    #
    # @param num [Integer] maximum number of words to return
    # @return [Array<String>]
    def top_words(num = 100)
      @words.sort_by { |_word, weight| -weight }.first(num).map(&:first)
    end

    # Log-likelihood score of the given words for this category.
    #
    # @param words [Array<String>, #word_hash] a list of words, or a text
    #   whose #word_hash keys are used
    # @return [Float] sum of log(weight / total) over the words, with
    #   MIN_SCORE standing in for unknown words; log(MIN_SCORE / total) when
    #   there are no words; -Infinity when the category has learned nothing
    def score_for(words)
      return -Float::INFINITY unless @words_count > 0

      words = words.word_hash.keys unless words.is_a?(Array)
      total = @words_count.to_f
      return Math.log(MIN_SCORE / total) if words.empty?

      words.map { |word| Math.log((@words[word] || MIN_SCORE) / total) }.inject(:+)
    end

    # True when the category holds no word weight at all.
    def blank?
      @words_count == 0
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require "csv"

module Bayes
  # Naive Bayes text classifier: a set of named categories, each trained with
  # example texts, against which new texts are scored.
  class Classifier
    # @return [Hash] category name => Bayes::Category
    attr_reader :categories

    def initialize
      @categories = {}
    end

    # Trains +category+ with a single text, creating the category on demand.
    #
    # @param category [Object] category name (any Hash key)
    # @param text [#word_hash] text to learn from
    def train(category, text)
      ensure_category(category).train(text)
    end

    # Returns the category stored under +category+, creating it if missing.
    def ensure_category(category)
      @categories[category] ||= Bayes::Category.new
    end

    # Trains +category+ with every element of +lines+.
    def train_with_array(category, lines)
      lines.each { |line| train(category, line) }
    end

    # Trains +category+ with every line of the file at +filename+.
    def train_with_file(category, filename)
      train_with_array category, File.read(filename).split(/\r?\n/)
    end

    # Trains from a delimited file whose first column is the text and whose
    # second column is the category name. Blank lines are skipped.
    #
    # @param filename [String] path of the file to read
    # @param separator [String] column separator
    def train_with_csv(filename, separator: "||")
      # "§" is used as the quote character in the hope that it never occurs in
      # the data, so ordinary quotes in the text are read literally.
      csv = CSV.new File.read(filename), col_sep: separator, quote_char: "§", skip_blanks: true
      csv.each do |row|
        train row[1], row[0]
      end
    end

    # Multiplies the weight of the top words of +category+ by +coeff+.
    def apply_weighting(category, coeff)
      ensure_category(category).apply_weighting(coeff)
    end

    # Returns the name of the category with the highest score for +string+.
    #
    # @param string [#word_hash] text to classify
    # @return [Object, nil] the winning category name, or nil when the
    #   classifier has no categories
    def classify(string)
      return nil if @categories.empty?

      words = string.word_hash.keys
      best_name, _best_category = @categories.max_by do |_name, category|
        category.score_for(words)
      end
      best_name
    end

    # Removes every category that reports itself as blank.
    def pop_unused
      @categories.delete_if { |_name, category| category.blank? }
    end

    # Resets every category but keeps the category names.
    def flush
      @categories.each { |_name, category| category.reset }
    end

    # Removes all categories.
    def flush_all
      @categories = {}
    end
  end
end
|
data/lib/bayes/string.rb
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# Text helpers added to String for the Bayes classifier.
class String
  # Words ignored by #word_hash.
  STOPWORDS = %w[
    a again all along are also an and as at
    but by came can cant couldnt did didn didnt do
    doesnt dont ever first from have her here him how
    i if in into is isnt it itll just last
    least like most my new no not now of on
    or should sinc so some th than this that the
    their then those to told too true try until url
    us were when whether while with within yes you youll
  ].freeze

  # Returns a Hash of words and their frequencies.
  #
  # Words are lower-cased; stopwords and words of two characters or fewer
  # are left out.
  #
  # @return [Hash{String => Integer}]
  def word_hash
    split_words.each_with_object({}) do |raw_word, counts|
      word = raw_word.downcase
      next if word.length <= 2 || word.stopword?

      counts[word] = counts.fetch(word, 0) + 1
    end
  end

  # Splits the string into words, treating every run of characters that are
  # neither word characters nor whitespace as a separator.
  #
  # [[:word:]] is used rather than \w because \w only matches ASCII word
  # characters, which would cut words such as "café" apart at the accented
  # letter.
  #
  # @return [Array<String>]
  def split_words
    gsub(/[^[:word:]\s]+/, " ").split
  end

  # True when the string is one of STOPWORDS (exact, case-sensitive match).
  def stopword?
    STOPWORDS.include?(self)
  end
end
|
data/lib/bayes/test.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
require "csv"
require "fileutils"
|
2
|
+
|
3
|
+
module Bayes
  # Helpers for measuring how well a classifier performs.
  module Stats

    ### Error Analysis ====================================

    # Computes confusion counts, precision, recall and F-score of
    # +classifier+ for one +category+.
    #
    # All counts are Floats. A ratio whose denominator is zero (for example
    # precision when nothing was classified as +category+) is reported as 0.0
    # instead of raising ZeroDivisionError or yielding NaN.
    #
    # @param classifier [#classify]
    # @param category [Object] the category under test
    # @param positive_items [#each] texts that belong to +category+
    # @param negative_items [#each] texts that do not belong to +category+
    # @return [Hash] :true_positives, :true_negatives, :false_negatives,
    #   :false_positives, :precision, :recall, :f_score
    def self.error_analysis(classifier, category, positive_items, negative_items)
      true_positives, false_negatives = tally_matches(classifier, category, positive_items)
      false_positives, true_negatives = tally_matches(classifier, category, negative_items)

      precision = safe_ratio(true_positives, true_positives + false_positives)
      recall    = safe_ratio(true_positives, true_positives + false_negatives)
      f_score   = 2 * safe_ratio(precision * recall, precision + recall)

      {
        true_positives:  true_positives,
        true_negatives:  true_negatives,
        false_negatives: false_negatives,
        false_positives: false_positives,
        precision:       precision,
        recall:          recall,
        f_score:         f_score,
      }
    end

    # Classifies every line of a delimited file (text first, expected
    # category last) and reports how many were classified correctly.
    #
    # Lines may end in LF or CRLF; blank lines are skipped. The error rate of
    # a file without any items is 0.0.
    #
    # @param classifier [#classify]
    # @param filename [String] path of the file to read
    # @param separator [String] column separator
    # @return [Hash] :correct, :incorrect, :error_rate
    def self.error_analysis_csv(classifier, filename, separator: "||")
      correct = 0
      incorrect = 0

      File.read(filename).split(/\r?\n/).each do |line|
        next if line.strip.empty?

        item = line.split(separator)
        if classifier.classify(item.first) == item.last
          correct += 1
        else
          incorrect += 1
        end
      end

      total = correct + incorrect
      {
        correct:    correct,
        incorrect:  incorrect,
        error_rate: total.zero? ? 0.0 : incorrect / total.to_f
      }
    end

    ### Helpers ===================================================

    # Writes +results+ to spec/reports/<name>.csv, using the keys of the
    # first result as the header row. Nothing is written for empty results.
    #
    # @param results [Array<Hash>]
    # @param name [String] base name of the report file
    def self.to_csv(results, name: "examples")
      return results if results.empty?

      FileUtils.mkdir_p("spec/reports")

      CSV.open("spec/reports/#{name}.csv", "w+") do |csv|
        csv << results.first.keys
        results.each do |result|
          csv << result.values
        end
      end
    end

    # Counts how many +items+ the classifier assigns to +category+ and how
    # many it does not; returns [matching, not_matching] as Floats.
    def self.tally_matches(classifier, category, items)
      matching = 0.0
      not_matching = 0.0
      items.each do |item|
        if classifier.classify(item) == category
          matching += 1.0
        else
          not_matching += 1.0
        end
      end
      [matching, not_matching]
    end
    private_class_method :tally_matches

    # numerator / denominator, or 0.0 when the denominator is zero.
    def self.safe_ratio(numerator, denominator)
      denominator.zero? ? 0.0 : numerator / denominator
    end
    private_class_method :safe_ratio

  end
end
|