bayes_classifier 0.0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d164296489c1f693f53f141b5233c8fc76babdcc
4
+ data.tar.gz: 65d421b448594b4e70c52b6841b4993c6276dcda
5
+ SHA512:
6
+ metadata.gz: b39634094b910f7cca0822803e10a378382b57d061f988a226dc62bae1d8685298959b7d8a16dc1b4c0091f13fb15332503f3fc71ce700478cb5fafe2ab790af
7
+ data.tar.gz: 953add18915f3bfa1881efb6c1a99658096fcdaffa14f4215778e3211517210b228e73405ecd76882856d810de424f7453e68e18c2607393deedeb9bc477ab3c
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ pkg
2
+ spec/reports
3
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format Fuubar
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in bayes_classifier.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,34 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ bayes_classifier (0.0.1.1)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.2.4)
10
+ fuubar (1.1.1)
11
+ rspec (~> 2.0)
12
+ rspec-instafail (~> 0.2.0)
13
+ ruby-progressbar (~> 1.0)
14
+ rake (10.1.0)
15
+ rspec (2.14.1)
16
+ rspec-core (~> 2.14.0)
17
+ rspec-expectations (~> 2.14.0)
18
+ rspec-mocks (~> 2.14.0)
19
+ rspec-core (2.14.5)
20
+ rspec-expectations (2.14.2)
21
+ diff-lcs (>= 1.1.3, < 2.0)
22
+ rspec-instafail (0.2.4)
23
+ rspec-mocks (2.14.3)
24
+ ruby-progressbar (1.2.0)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ bayes_classifier!
31
+ bundler (~> 1.3)
32
+ fuubar
33
+ rake
34
+ rspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 DarthSim
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,55 @@
1
+ # Bayes::Classifier
2
+
3
+ Bayes::Classifier allows you to classify strings with a naive Bayes classifier.
4
+
5
+ ## Installation
6
+
7
+ Just add the following line to your `Gemfile`:
8
+
9
+ ```ruby
10
+ gem 'bayes_classifier'
11
+ ```
12
+
13
+ Then run `bundle install`.
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ # Create new classifier
19
+ classifier = Bayes::Classifier.new
20
+
21
+ # Train classifier with a string
22
+ classifier.train :category1, "lorem ipsum dolor sit amet"
23
+
24
+ # Train classifier with array of strings
25
+ classifier.train_with_array :category2, ["the first string", "the second string", "the third string"]
26
+
27
+ # Train classifier with textfile
28
+ classifier.train_with_file :category3, "data/category3.txt"
29
+
30
+ # Train classifier with CSV file (first column - string, second column - category)
31
+ classifier.train_with_csv "data/training.csv"
32
+
33
+ # Apply weighting to the top words of category
34
+ classifier.apply_weighting :category3, 10
35
+
36
+ # Remove empty categories
37
+ classifier.pop_unused
38
+
39
+ # Classify string
40
+ classifier.classify "the string"
41
+
42
+ # Reset categories
43
+ classifier.flush
44
+
45
+ # Remove all categories
46
+ classifier.flush_all
47
+ ```
48
+
49
+ ## Contributing
50
+
51
+ 1. Fork it
52
+ 2. Create your feature branch (git checkout -b my-new-feature)
53
+ 3. Commit your changes (git commit -am 'Add some feature')
54
+ 4. Push to the branch (git push origin my-new-feature)
55
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,25 @@
1
# coding: utf-8
# Gem specification for bayes_classifier.

# Make the gem's own lib/ directory loadable so the version file can be
# required straight from the source tree.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'bayes_classifier/version'

Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name        = "bayes_classifier"
  gem.version     = Bayes::VERSION
  gem.authors     = ["DarthSim"]
  gem.email       = ["darthsim@gmail.com"]
  gem.description = "Naive Bayes classifier"
  gem.summary     = "Allows to classify strings with naive Bayes classifier"
  gem.homepage    = "https://github.com/DarthSim/bayes_classifier"
  gem.license     = "MIT"

  # Package contents are taken from the files tracked by git; executables
  # and test files are picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^spec/})
  gem.require_paths = ["lib"]

  # Tooling needed only while developing the gem.
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "rspec"
  gem.add_development_dependency "fuubar"
end
data/lib/bayes.rb ADDED
@@ -0,0 +1,3 @@
1
+ require "bayes/string"
2
+ require "bayes/classifier"
3
+ require "bayes/category"
@@ -0,0 +1,67 @@
1
module Bayes
  # Word-frequency model for a single category.
  #
  # Stores a count (weight) per word plus the running total of all counts,
  # and scores candidate words by summing the logarithms of their relative
  # frequencies within the category.
  class Category
    # Stand-in count used for words this category has never seen, so that
    # Math.log is never handed zero.
    MIN_SCORE = 0.0000001

    def initialize
      reset
    end

    # Forgets every word and zeroes the running total.
    def reset
      @words = {}
      @words_count = 0
    end

    # Adds the word counts of +text+ to the category.
    #
    # @param text [#word_hash] object whose +word_hash+ returns word => count
    def train(text)
      text.word_hash.each do |word, count|
        # fetch (not to_i) so fractional weights produced by
        # apply_weighting_for are not truncated, which would make the
        # per-word counts drift away from @words_count.
        @words[word] = @words.fetch(word, 0) + count
        @words_count += count
      end
    end

    # Removes the word counts of +text+ from the category.
    #
    # Never removes more than was recorded: unknown words are ignored and a
    # word's count cannot drop below zero (a negative count would later make
    # score_for take the logarithm of a negative number).
    #
    # @param text [#word_hash] object whose +word_hash+ returns word => count
    def forget(text)
      text.word_hash.each do |word, count|
        known = @words[word]
        next unless known

        removed = [count, known].min
        if known > removed
          @words[word] = known - removed
        else
          @words.delete(word)
        end
        @words_count -= removed
      end
    end

    # Multiplies the weight of each of the current top words by +coeff+.
    #
    # @param coeff [Numeric] multiplier
    def apply_weighting(coeff)
      top_words.each do |word|
        apply_weighting_for(word, coeff)
      end
    end

    # Multiplies the weight of +word+ by +coeff+ and adjusts the running
    # total by the difference. Does nothing for a word the category has
    # not seen.
    #
    # @param word [String]
    # @param coeff [Numeric] multiplier
    # @return [Numeric, nil] the new running total, or nil for an unknown word
    def apply_weighting_for(word, coeff)
      old_weight = @words[word]
      return unless old_weight

      new_weight = old_weight * coeff
      @words[word] = new_weight
      @words_count += new_weight - old_weight
    end

    # The +num+ words with the highest counts, heaviest first.
    #
    # @param num [Integer] maximum number of words to return
    # @return [Array<String>]
    def top_words(num = 100)
      @words.sort_by { |_word, count| -count }.first(num).map(&:first)
    end

    # Log-likelihood style score of +words+ for this category.
    #
    # @param words [Array<String>, #word_hash] a list of words, or an object
    #   whose +word_hash+ keys are used as the list
    # @return [Float] sum of log(count / total) over the words, with
    #   MIN_SCORE standing in for unseen words; -Float::INFINITY when the
    #   category holds no counts
    def score_for(words)
      return -Float::INFINITY unless @words_count > 0

      words = words.word_hash.keys unless words.is_a?(Array)
      total = @words_count.to_f
      return Math.log(MIN_SCORE / total) if words.empty?

      # inject(:+) rather than sum: plain left-to-right float addition.
      words.map { |word| Math.log(@words.fetch(word, MIN_SCORE) / total) }.inject(:+)
    end

    # @return [Boolean] true when the running total is zero
    def blank?
      @words_count == 0
    end
  end
end
@@ -0,0 +1,55 @@
1
module Bayes
  # Text classifier that keeps one Bayes::Category per category name and
  # assigns a text to the category giving it the highest score.
  class Classifier
    # @return [Hash] category name => Bayes::Category
    attr_reader :categories

    def initialize
      @categories = {}
    end

    # Trains +category+ with a single text, creating the category if needed.
    #
    # @param category [Object] category name (used as a hash key)
    # @param text [#word_hash]
    def train(category, text)
      ensure_category(category).train(text)
    end

    # Returns the Bayes::Category registered under +category+, creating and
    # registering an empty one first when it does not exist yet.
    def ensure_category(category)
      @categories[category] ||= Bayes::Category.new
    end

    # Trains +category+ with every element of +lines+.
    def train_with_array(category, lines)
      lines.each { |line| train(category, line) }
    end

    # Trains +category+ with each line of the file at +filename+
    # (both LF and CRLF line endings are accepted).
    def train_with_file(category, filename)
      train_with_array(category, File.read(filename).split(/\r?\n/))
    end

    # Trains from a delimited file: first column is the text, second column
    # is the category name. Rows missing either column (for example blank
    # lines) are skipped.
    #
    # @param filename [String]
    # @param separator [String] column separator
    def train_with_csv(filename, separator: "||")
      # Loaded on demand: this is the only method that needs CSV, and no
      # other file on this class's load path requires it.
      require "csv"

      # "§" is passed as the quote character in the hope that it never
      # appears in the data, so that ordinary quotes are read literally.
      rows = CSV.new(File.read(filename), col_sep: separator, quote_char: "§")
      rows.each do |text, category|
        next if text.nil? || category.nil?

        train(category, text)
      end
    end

    # Boosts the top words of +category+ by +coeff+ (the category is created
    # if it does not exist).
    def apply_weighting(category, coeff)
      ensure_category(category).apply_weighting(coeff)
    end

    # Picks the category with the highest score for +string+.
    #
    # @param string [#word_hash]
    # @return [Object, nil] the winning category name, or nil when no
    #   categories have been defined
    def classify(string)
      words = string.word_hash.keys
      best = @categories.max_by { |_name, category| category.score_for(words) }
      best && best.first
    end

    # Removes categories that report themselves as blank.
    def pop_unused
      @categories.delete_if { |_name, category| category.blank? }
    end

    # Resets every category while keeping the category names.
    def flush
      @categories.each { |_name, category| category.reset }
    end

    # Drops all categories.
    def flush_all
      @categories = {}
    end
  end
end
@@ -0,0 +1,107 @@
1
# Word-extraction helpers added to String for use by the Bayes classifier.
class String
  # Lower-case words that word_hash leaves out of its result.
  STOPWORDS = %w[
    a again all along are also an and as at but by came can cant couldnt
    did didn didnt do doesnt dont ever first from have her here him how
    i if in into is isnt it itll just last least like most my new no not
    now of on or should sinc so some th than this that the their then
    those to told too true try until url us were when whether while with
    within yes you youll
  ].freeze

  # Returns a Hash of words and their frequencies.
  #
  # Words are lower-cased; stopwords and words of two characters or fewer
  # are left out.
  #
  # @return [Hash{String => Integer}]
  def word_hash
    split_words.each_with_object({}) do |raw, counts|
      word = raw.downcase
      # Length is checked first because it is cheaper than scanning the
      # stopword list.
      next if word.length <= 2 || word.stopword?

      counts[word] = counts.fetch(word, 0) + 1
    end
  end

  # Splits the string into words, treating every run of characters that is
  # neither a word character nor whitespace as a separator.
  #
  # [[:word:]] is used instead of \w because \w only matches ASCII in Ruby,
  # which would strip accented and other non-ASCII letters and cut such
  # words apart.
  #
  # @return [Array<String>]
  def split_words
    gsub(/[^[:word:]\s]+/, " ").split
  end

  # @return [Boolean] true when the string is exactly one of STOPWORDS
  def stopword?
    STOPWORDS.include?(self)
  end
end
data/lib/bayes/test.rb ADDED
@@ -0,0 +1,81 @@
1
+ require "csv"
2
+
3
module Bayes
  # Helpers for measuring how well a classifier performs.
  module Stats

    ### Error Analysis ====================================

    # Measures how well +classifier+ recognises +category+.
    #
    # @param classifier [#classify]
    # @param category [Object] the category being evaluated
    # @param positive_items [Enumerable] items that belong to +category+
    # @param negative_items [Enumerable] items that do not belong to it
    # @return [Hash] :true_positives, :true_negatives, :false_negatives,
    #   :false_positives (Float counts) and :precision, :recall, :f_score.
    #   A ratio whose denominator is zero is reported as 0.0 instead of
    #   raising ZeroDivisionError or yielding NaN.
    def self.error_analysis(classifier, category, positive_items, negative_items)
      found, missed = positive_items.partition { |item| classifier.classify(item) == category }
      wrong, rejected = negative_items.partition { |item| classifier.classify(item) == category }

      true_positives  = found.size.to_f
      false_negatives = missed.size.to_f
      false_positives = wrong.size.to_f
      true_negatives  = rejected.size.to_f

      precision = ratio(true_positives, true_positives + false_positives)
      recall    = ratio(true_positives, true_positives + false_negatives)
      f_score   = 2 * ratio(precision * recall, precision + recall)

      {
        true_positives: true_positives,
        true_negatives: true_negatives,
        false_negatives: false_negatives,
        false_positives: false_positives,
        precision: precision,
        recall: recall,
        f_score: f_score,
      }
    end

    # Classifies every line of a "text||category" file and counts how many
    # predictions match the category stored on the line.
    #
    # LF and CRLF line endings are both accepted and empty lines are
    # skipped.
    #
    # @param classifier [#classify]
    # @param filename [String]
    # @return [Hash] :correct, :incorrect and :error_rate (0.0 when the file
    #   holds no items)
    def self.error_analysis_csv(classifier, filename)
      items = File.read(filename).split(/\r?\n/).reject(&:empty?).map { |line| line.split("||") }

      correct   = items.count { |item| classifier.classify(item.first) == item.last }
      incorrect = items.size - correct

      {
        correct: correct,
        incorrect: incorrect,
        error_rate: ratio(incorrect, items.size)
      }
    end

    ### Helpers ===================================================

    # Writes +results+ (an array of hashes) to spec/reports/<name>.csv,
    # using the keys of the first hash as the header row. With empty
    # +results+ the file is created without any rows.
    #
    # @param results [Array<Hash>]
    # @param name [String] file name without extension
    def self.to_csv(results, name: "examples")
      # Loaded on demand; replaces shelling out to `mkdir -p`.
      require "fileutils"
      FileUtils.mkdir_p("spec/reports")

      CSV.open("spec/reports/#{name}.csv", "w+") do |csv|
        header_source = results.first
        csv << header_source.keys if header_source
        results.each do |r|
          csv << r.values
        end
      end
    end

    # numerator / denominator as a Float, or 0.0 when the denominator is zero.
    def self.ratio(numerator, denominator)
      denominator.zero? ? 0.0 : numerator / denominator.to_f
    end
    private_class_method :ratio

  end
end
+ end