bae 0.0.7-java → 0.0.8-java
- checksums.yaml +4 -4
- data/README.md +37 -1
- data/lib/bae/classifier.rb +165 -8
- data/lib/bae/native_classifier.rb +26 -0
- data/lib/bae/version.rb +1 -1
- data/lib/bae.rb +2 -4
- data/spec/lib/bae/classifier_spec.rb +51 -2
- data/spec/lib/bae/native_classifier_spec.rb +33 -0
- data/spec/spec_helper.rb +1 -0
- metadata +5 -2
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cb626ff0b92f80f096cebc7248a64a8f47f02fda
+  data.tar.gz: 87c41e0571e1a31c303f9ab346eef119cec83e6f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 478e21b1c13f82037a5773cbb5b960ff98387b69cd024ece14c17eee7e9bf5784b658ea555e223e0affcc689ff6aa3fed9f59460ce586f283e06eeecc0d2291f
+  data.tar.gz: a3ceddd6c99d9ca8826f2142b2f383b9426ca0f9b28f88fbf966d954595b368c286551c94a6202d1add93709f196b4704bbe6ef5657b371dc1f41a2ac80317ed
```
data/README.md
CHANGED
````diff
@@ -3,6 +3,15 @@ Bae
 
 Bae is a multinomial naive bayes classifier based on another gem ["naivebayes"](https://github.com/id774/naivebayes), only this one uses java to do the heavy lifting.
 
+By default this will use the vanilla ruby implementation, but you can use the native classifier written in java.
+
+```ruby
+require 'bae/native_classifier'
+
+classifier = ::Bae::NativeClassifier.new
+```
+
+
 ## Installation
 
 Add this line to your application's Gemfile:
@@ -50,10 +59,37 @@ classifier.classify("aaa bbb")
 #=> {"positive"=>0.8962655601659751, "negative"=>0.0663900414937759, "neutral"=>0.037344398340248955}
 ```
 
+### Saving State
+
+You can save a snapshot of the trained classifier to disk and load it back into memory.
+
+```ruby
+# From the example above...
+classifier = ::Bae::Classifier.new
+classifier.train("positive", {"aaa" => 0, "bbb" => 1})
+classifier.train("negative", {"ccc" => 2, "ddd" => 3})
+
+classifier.finish_training!
+
+classifier.classify({"aaa" => 1, "bbb" => 1})
+#=> {"positive" => 0.8767123287671234, "negative" => 0.12328767123287669}
+
+# Now let's save it to disk
+classifier.save_state("/tmp/some_state.json")
+
+# Let's create a new classifier and load from the state we just saved
+classifier = ::Bae::Classifier.new
+classifier.load_state("/tmp/some_state.json")
+
+# Now we can classify without retraining
+classifier.classify({"aaa" => 1, "bbb" => 1})
+#=> {"positive" => 0.8767123287671234, "negative" => 0.12328767123287669}
+```
+
 
 ## Contributing
 
-1. Fork it ( https://github.com/
+1. Fork it ( https://github.com/film42/bae/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)
````
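The context lines in the second hunk only quote the tail of the README's string-based example. For reference, here is the full flow that produces that output, assembled from classifier_spec.rb in this release rather than quoted from the README; the `require 'bae'` entry point is assumed, not shown in this diff.

```ruby
require 'bae' # assumed entry point; the specs load the library via spec_helper instead

classifier = ::Bae::Classifier.new

classifier.train("positive", "aaa aaa bbb")
classifier.train("negative", "ccc ccc ddd ddd")
classifier.train("neutral", "eee eee eee fff fff fff")

classifier.finish_training!

classifier.classify("aaa bbb")
#=> {"positive"=>0.8962655601659751, "negative"=>0.0663900414937759, "neutral"=>0.037344398340248955}
```
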
data/lib/bae/classifier.rb
CHANGED
```diff
@@ -1,23 +1,180 @@
 module Bae
   class Classifier
 
-
+    attr_accessor :frequency_table, :label_index, :label_index_sequence,
+      :label_instance_count, :total_terms
 
     def initialize
-      @
+      @frequency_table = ::Hash.new { |hash, feature| hash[feature] = [] }
+      @label_instance_count = ::Hash.new { |hash, label| hash[label] = 0 }
+      @label_index = ::Hash.new { |hash, label| hash[label] = 0 }
+      @label_index_sequence = -1 # start at -1 so 0 is first value
+      @total_terms = 0.0
     end
 
-    def
-
+    def finish_training!
+      calculate_likelihoods!
+      calculate_priors!
     end
 
-    def
-
+    def train(label, training_data)
+      if training_data.is_a?(::String)
+        train_from_string(label, training_data)
+      elsif training_data.is_a?(::Hash)
+        train_from_hash(label, training_data)
+      else
+        fail 'Training data must either be a string or hash'
+      end
     end
 
-    def
-
+    def train_from_string(label, document)
+      words = document.split
+
+      words.each do |word|
+        update_label_index(label)
+        update_frequency_table(label, word, 1)
+      end
+      @label_instance_count[label] += 1
+      @total_terms += 1
+    end
+
+    def train_from_hash(label, frequency_hash)
+      frequency_hash.each do |word, frequency|
+        update_label_index(label)
+        update_frequency_table(label, word, frequency)
+      end
+      @label_instance_count[label] += 1
+      @total_terms += 1
+    end
+
+    def classify(data)
+      if data.is_a?(::String)
+        classify_from_string(data)
+      elsif data.is_a?(::Hash)
+        classify_from_hash(data)
+      else
+        fail 'Training data must either be a string or hash'
+      end
+    end
+
+    def classify_from_hash(frequency_hash)
+      document = frequency_hash.map{ |word, frequency| (word + ' ') * frequency }.join
+
+      classify_from_string(document)
+    end
+
+    def classify_from_string(document)
+      words = document.split.uniq
+      likelihoods = @likelihoods.dup
+      posterior = {}
+
+      vocab_size = frequency_table.keys.size
+
+      label_index.each do |label, index|
+        words.map do |word|
+          row = frequency_table[word]
+
+          unless row.empty?
+            laplace_word_likelihood = (row[index] + 1.0).to_f / (label_instance_count[label] + vocab_size).to_f
+            likelihoods[label] *= laplace_word_likelihood / (1.0 - laplace_word_likelihood)
+          end
+        end
+
+        posterior[label] = @priors[label] * likelihoods[label]
+      end
+
+      normalize(posterior)
+    end
+
+    def save_state(path)
+      state = {}
+      state['frequency_table'] = frequency_table
+      state['label_instance_count'] = label_instance_count
+      state['label_index'] = label_index
+      state['label_index_sequence'] = label_index_sequence
+      state['total_terms'] = total_terms
+
+      ::File.open(::File.expand_path(path), 'w') do |handle|
+        handle.write(state.to_json)
+      end
+    end
+
+    def load_state(path)
+      state = ::JSON.parse(::File.read(::File.expand_path(path)))
+
+      fail 'Missing frequency_table' unless state['frequency_table']
+      fail 'Missing label_instance_count' unless state['label_instance_count']
+      fail 'Missing label_index' unless state['label_index']
+      fail 'Missing label_index_sequence' unless state['label_index_sequence']
+      fail 'Missing total_terms' unless state['total_terms']
+
+      @frequency_table = state['frequency_table']
+      @label_instance_count = state['label_instance_count']
+      @label_index = state['label_index']
+      @label_index_sequence = state['label_index_sequence']
+      @total_terms = state['total_terms']
+
+      finish_training!
+    end
+
+    private
+
+    def calculate_likelihoods!
+      @likelihoods = label_index.inject({}) do |accumulator, (label, index)|
+        initial_likelihood = 1.0
+        vocab_size = frequency_table.keys.size
+
+        frequency_table.each do |feature, row|
+          laplace_word_likelihood = (row[index] + 1.0).to_f / (label_instance_count[label] + vocab_size).to_f
+          initial_likelihood *= (1.0 - laplace_word_likelihood)
+        end
+
+        accumulator[label] = initial_likelihood
+        accumulator
+      end
+    end
+
+    def calculate_priors!
+      @priors = label_instance_count.inject({}) do |hash, (label, count)|
+        hash[label] = count / total_terms
+        hash
+      end
     end
 
+    def get_next_sequence_value
+      @label_index_sequence += 1
+    end
+
+    def normalize(posterior)
+      sum = posterior.inject(0.0) { |accumulator, (key, value)| accumulator + value }
+
+      posterior.inject({}) do |accumulator, (key, value)|
+        accumulator[key] = value / sum
+        accumulator
+      end
+    end
+
+    def update_label_index(label)
+      unless label_index.keys.include?(label)
+        index = get_next_sequence_value
+        label_index[label] = index
+
+        frequency_table.each do |feature, value|
+          value[index] = 0
+        end
+      end
+    end
+
+    def update_frequency_table(label, word, frequency)
+      row = frequency_table[word]
+      index = label_index[label]
+
+      if row[index]
+        row[index] += frequency
+      else
+        row[0..1] = label_index.keys.map { |label| 0 }
+        row[index] = frequency
+      end
+    end
   end
 end
```
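A note on the math above: `calculate_likelihoods!` seeds each label with the product of `(1 - p_w)` over the whole vocabulary, where `p_w` is the Laplace-smoothed word likelihood `(count + 1) / (instances + vocab_size)`, and `classify_from_string` then multiplies in `p_w / (1 - p_w)` for each distinct word present, leaving `p_w` factors for seen words and `(1 - p_w)` for absent ones. The following is a minimal hand computation of that procedure using the hash-trained example from the specs below; the local variable names are illustrative only and not part of the gem.

```ruby
# Hand computation of the Laplace-smoothed posterior for the hash-trained
# example used in the specs: train("positive", {"aaa" => 0, "bbb" => 1}) and
# train("negative", {"ccc" => 2, "ddd" => 3}), then classify({"aaa" => 1, "bbb" => 1}).
frequency_table      = { "aaa" => [0, 0], "bbb" => [1, 0], "ccc" => [0, 2], "ddd" => [0, 3] }
label_index          = { "positive" => 0, "negative" => 1 }
label_instance_count = { "positive" => 1, "negative" => 1 }
priors               = { "positive" => 0.5, "negative" => 0.5 } # one training instance each, two total
vocab_size           = frequency_table.size                    # four distinct words

words = %w[aaa bbb] # the document being classified

posterior = label_index.map do |label, index|
  # Seed with the product of (1 - p_w) over the whole vocabulary,
  # as calculate_likelihoods! does...
  likelihood = frequency_table.values.reduce(1.0) do |acc, row|
    p_w = (row[index] + 1.0) / (label_instance_count[label] + vocab_size)
    acc * (1.0 - p_w)
  end
  # ...then fold in p_w / (1 - p_w) for each word that actually occurs,
  # as classify_from_string does.
  words.each do |word|
    p_w = (frequency_table[word][index] + 1.0) / (label_instance_count[label] + vocab_size)
    likelihood *= p_w / (1.0 - p_w)
  end
  [label, priors[label] * likelihood]
end.to_h

sum = posterior.values.sum
posterior.transform_values { |v| v / sum }
#=> roughly {"positive"=>0.9412, "negative"=>0.0588}, matching the spec expectations
```
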
data/lib/bae/native_classifier.rb
ADDED
```diff
@@ -0,0 +1,26 @@
+require 'java'
+require ::File.join(::File.dirname(__FILE__), "..", "..", "target" , "bae.jar")
+
+module Bae
+  class NativeClassifier
+
+    attr_reader :internal_classifier
+
+    def initialize
+      @internal_classifier = ::Java::Bae::NaiveBayesClassifier.new
+    end
+
+    def train(label, feature)
+      internal_classifier.train(label, ::Java::Bae::Document.new(feature))
+    end
+
+    def classify(feature)
+      internal_classifier.classify(::Java::Bae::Document.new(feature))
+    end
+
+    def finish_training!
+      internal_classifier.calculateInitialLikelihoods()
+    end
+
+  end
+end
```
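Since this file is new in the release, a brief usage sketch may help: the native class mirrors the Ruby classifier's train / finish_training! / classify API and, per native_classifier_spec.rb below, is expected to produce the same posteriors. This sketch assumes JRuby, since the class loads the bundled `bae.jar`.

```ruby
# Usage sketch for the new NativeClassifier (JRuby only; it requires target/bae.jar).
# Expectations mirror native_classifier_spec.rb in this release.
require 'bae/native_classifier'

classifier = ::Bae::NativeClassifier.new

classifier.train("positive", "aaa aaa bbb")
classifier.train("negative", "ccc ccc ddd ddd")
classifier.train("neutral", "eee eee eee fff fff fff")

classifier.finish_training!

results = classifier.classify("aaa bbb")
results["positive"] #=> ~0.896, per the spec's be_within(0.001).of(0.89626)
results["negative"] #=> ~0.066
results["neutral"]  #=> ~0.037
```
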
data/lib/bae/version.rb
CHANGED
data/lib/bae.rb
CHANGED
data/spec/lib/bae/classifier_spec.rb
CHANGED
```diff
@@ -1,10 +1,17 @@
 require 'spec_helper'
 
+require 'bae/native_classifier'
+
 describe ::Bae::Classifier do
 
   subject { described_class.new }
 
-
+  let(:state_json) {
+    '{"frequency_table":{"aaa":[0,0],"bbb":[1,0],"ccc":[0,2],"ddd":[0,3]},"label_instance_count":{"positive":1,"negative":1},"label_index":{"positive":0,"negative":1},"label_index_sequence":1,"total_terms":2.0}'
+  }
+  let(:state) { ::JSON.parse(state_json) }
+
+  it "can classify a hash document" do
     subject.train("positive", {"aaa" => 0, "bbb" => 1})
     subject.train("negative", {"ccc" => 2, "ddd" => 3})
 
@@ -16,7 +23,7 @@ describe ::Bae::Classifier do
     expect(results["negative"]).to be_within(0.001).of(0.05882)
   end
 
-  it "can classify from
+  it "can classify from a string based document" do
     subject.train("positive", "aaa aaa bbb");
     subject.train("negative", "ccc ccc ddd ddd");
     subject.train("neutral", "eee eee eee fff fff fff");
@@ -30,4 +37,46 @@ describe ::Bae::Classifier do
     expect(results["neutral"]).to be_within(0.001).of(0.03734)
   end
 
+  it "fails when you attempt to train or test anything other than a hash or string" do
+    subject.train("positive", "aaa aaa bbb");
+    expect{ subject.train("a", 1337) }.to raise_error 'Training data must either be a string or hash'
+
+    subject.finish_training!
+
+    subject.classify("aaa bbb")
+    expect{ subject.classify(1337) }.to raise_error 'Training data must either be a string or hash'
+  end
+
+  it "can save the classifier state" do
+    subject.train("positive", {"aaa" => 0, "bbb" => 1})
+    subject.train("negative", {"ccc" => 2, "ddd" => 3})
+
+    subject.finish_training!
+
+    temp_file = ::Tempfile.new('some_state')
+    subject.save_state(temp_file.path)
+
+    temp_file.rewind
+    expect(temp_file.read).to eq(state_json)
+
+    temp_file.close
+    temp_file.unlink
+  end
+
+  it "can correctly load a classifier state and correctly classify" do
+    temp_file = ::Tempfile.new('some_state')
+    temp_file.write(state_json)
+    temp_file.rewind
+
+    subject.load_state(temp_file.path)
+
+    results = subject.classify({"aaa" => 1, "bbb" => 1})
+
+    expect(results["positive"]).to be_within(0.001).of(0.94117)
+    expect(results["negative"]).to be_within(0.001).of(0.05882)
+
+    temp_file.close
+    temp_file.unlink
+  end
+
 end
```
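The `state_json` fixture above also documents the on-disk format: `save_state` writes a single JSON object with five keys, and `load_state` refuses a file missing any of them. A small sketch, using only values copied from that fixture, that pretty-prints the same structure:

```ruby
require 'json'

# The saved state is plain JSON with five required keys; the per-word rows in
# frequency_table are indexed by the positions recorded in label_index.
state = {
  "frequency_table"      => { "aaa" => [0, 0], "bbb" => [1, 0], "ccc" => [0, 2], "ddd" => [0, 3] },
  "label_instance_count" => { "positive" => 1, "negative" => 1 },
  "label_index"          => { "positive" => 0, "negative" => 1 },
  "label_index_sequence" => 1,
  "total_terms"          => 2.0
}

puts JSON.pretty_generate(state)
```
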
data/spec/lib/bae/native_classifier_spec.rb
ADDED
```diff
@@ -0,0 +1,33 @@
+require 'spec_helper'
+
+describe ::Bae::NativeClassifier do
+
+  subject { described_class.new }
+
+  it "can classify a hash document" do
+    subject.train("positive", {"aaa" => 0, "bbb" => 1})
+    subject.train("negative", {"ccc" => 2, "ddd" => 3})
+
+    subject.finish_training!
+
+    results = subject.classify({"aaa" => 1, "bbb" => 1})
+
+    expect(results["positive"]).to be_within(0.001).of(0.94117)
+    expect(results["negative"]).to be_within(0.001).of(0.05882)
+  end
+
+  it "can classify from a string based document" do
+    subject.train("positive", "aaa aaa bbb");
+    subject.train("negative", "ccc ccc ddd ddd");
+    subject.train("neutral", "eee eee eee fff fff fff");
+
+    subject.finish_training!
+
+    results = subject.classify("aaa bbb")
+
+    expect(results["positive"]).to be_within(0.001).of(0.89626)
+    expect(results["negative"]).to be_within(0.001).of(0.06639)
+    expect(results["neutral"]).to be_within(0.001).of(0.03734)
+  end
+
+end
```
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bae
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.8
 platform: java
 authors:
 - Garrett Thornburg
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-02-
+date: 2015-02-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -68,8 +68,10 @@ files:
 - build.xml
 - lib/bae.rb
 - lib/bae/classifier.rb
+- lib/bae/native_classifier.rb
 - lib/bae/version.rb
 - spec/lib/bae/classifier_spec.rb
+- spec/lib/bae/native_classifier_spec.rb
 - spec/spec_helper.rb
 - src/main/java/bae/Document.java
 - src/main/java/bae/FrequencyTable.java
@@ -104,4 +106,5 @@ specification_version: 4
 summary: Multinomial naive bayes classifier with a kick of java
 test_files:
 - spec/lib/bae/classifier_spec.rb
+- spec/lib/bae/native_classifier_spec.rb
 - spec/spec_helper.rb
```