bae 0.0.7-java → 0.0.8-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +37 -1
- data/lib/bae/classifier.rb +165 -8
- data/lib/bae/native_classifier.rb +26 -0
- data/lib/bae/version.rb +1 -1
- data/lib/bae.rb +2 -4
- data/spec/lib/bae/classifier_spec.rb +51 -2
- data/spec/lib/bae/native_classifier_spec.rb +33 -0
- data/spec/spec_helper.rb +1 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cb626ff0b92f80f096cebc7248a64a8f47f02fda
+  data.tar.gz: 87c41e0571e1a31c303f9ab346eef119cec83e6f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 478e21b1c13f82037a5773cbb5b960ff98387b69cd024ece14c17eee7e9bf5784b658ea555e223e0affcc689ff6aa3fed9f59460ce586f283e06eeecc0d2291f
+  data.tar.gz: a3ceddd6c99d9ca8826f2142b2f383b9426ca0f9b28f88fbf966d954595b368c286551c94a6202d1add93709f196b4704bbe6ef5657b371dc1f41a2ac80317ed
data/README.md
CHANGED
@@ -3,6 +3,15 @@ Bae
 
 Bae is a multinomial naive bayes classifier based on another gem ["naivebayes"](https://github.com/id774/naivebayes), only this one uses java to do the heavy lifting.
 
+By default this will use the vanilla ruby implementation, but you can use the native classifier written in java.
+
+```ruby
+require 'bae/native_classifier'
+
+classifier = ::Bae::NativeClassifier.new
+```
+
+
 ## Installation
 
 Add this line to your application's Gemfile:
@@ -50,10 +59,37 @@ classifier.classify("aaa bbb")
 #=> {"positive"=>0.8962655601659751, "negative"=>0.0663900414937759, "neutral"=>0.037344398340248955}
 ```
 
+### Saving State
+
+You can actually save a snapshot of the trained classifier to disk and load it into memory.
+
+```ruby
+# From the example above...
+classifier = ::Bae::Classifier.new
+classifier.train("positive", {"aaa" => 0, "bbb" => 1})
+classifier.train("negative", {"ccc" => 2, "ddd" => 3})
+
+classifier.finish_training!
+
+classifier.classify({"aaa" => 1, "bbb" => 1})
+#=> {"positive" => 0.8767123287671234, "negative" => 0.12328767123287669}
+
+# Now let's save it to disk
+classifier.save_state("/tmp/some_state.json")
+
+# Let's create a new classifier and load from the state we just saved
+classifier = ::Bae::Classifier.new
+classifier.load_state("/tmp/some_state.json")
+
+# Now we can classify without retraining
+classifier.classify({"aaa" => 1, "bbb" => 1})
+#=> {"positive" => 0.8767123287671234, "negative" => 0.12328767123287669}
+```
+
 
 ## Contributing
 
-1. Fork it ( https://github.com/
+1. Fork it ( https://github.com/film42/bae/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)
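The file written by `save_state` in the snippet above is plain JSON holding the five pieces of classifier state. Here is a sketch of its contents for this training data, expressed as a Ruby hash for readability; the values come from the `state_json` fixture in `spec/lib/bae/classifier_spec.rb` further down in this diff.

```ruby
# Shape of /tmp/some_state.json for the README training data above
# (values taken from the state_json fixture in spec/lib/bae/classifier_spec.rb).
{
  "frequency_table"      => { "aaa" => [0, 0], "bbb" => [1, 0],   # column 0 => "positive"
                              "ccc" => [0, 2], "ddd" => [0, 3] }, # column 1 => "negative"
  "label_instance_count" => { "positive" => 1, "negative" => 1 },
  "label_index"          => { "positive" => 0, "negative" => 1 },
  "label_index_sequence" => 1,
  "total_terms"          => 2.0
}
```

`Classifier#load_state` rejects a file missing any of these five keys and re-runs `finish_training!` after restoring them, which is why the reloaded classifier can classify without retraining.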
data/lib/bae/classifier.rb
CHANGED
@@ -1,23 +1,180 @@
 module Bae
   class Classifier
 
-
+    attr_accessor :frequency_table, :label_index, :label_index_sequence,
+                  :label_instance_count, :total_terms
 
     def initialize
-      @
+      @frequency_table = ::Hash.new { |hash, feature| hash[feature] = [] }
+      @label_instance_count = ::Hash.new { |hash, label| hash[label] = 0 }
+      @label_index = ::Hash.new { |hash, label| hash[label] = 0 }
+      @label_index_sequence = -1 # start at -1 so 0 is first value
+      @total_terms = 0.0
     end
 
-    def
-
+    def finish_training!
+      calculate_likelihoods!
+      calculate_priors!
     end
 
-    def
-
+    def train(label, training_data)
+      if training_data.is_a?(::String)
+        train_from_string(label, training_data)
+      elsif training_data.is_a?(::Hash)
+        train_from_hash(label, training_data)
+      else
+        fail 'Training data must either be a string or hash'
+      end
     end
 
-    def
-
+    def train_from_string(label, document)
+      words = document.split
+
+      words.each do |word|
+        update_label_index(label)
+        update_frequency_table(label, word, 1)
+      end
+      @label_instance_count[label] += 1
+      @total_terms += 1
+    end
+
+    def train_from_hash(label, frequency_hash)
+      frequency_hash.each do |word, frequency|
+        update_label_index(label)
+        update_frequency_table(label, word, frequency)
+      end
+      @label_instance_count[label] += 1
+      @total_terms += 1
+    end
+
+    def classify(data)
+      if data.is_a?(::String)
+        classify_from_string(data)
+      elsif data.is_a?(::Hash)
+        classify_from_hash(data)
+      else
+        fail 'Training data must either be a string or hash'
+      end
+    end
+
+    def classify_from_hash(frequency_hash)
+      document = frequency_hash.map{ |word, frequency| (word + ' ') * frequency }.join
+
+      classify_from_string(document)
+    end
+
+    def classify_from_string(document)
+      words = document.split.uniq
+      likelihoods = @likelihoods.dup
+      posterior = {}
+
+      vocab_size = frequency_table.keys.size
+
+      label_index.each do |label, index|
+        words.map do |word|
+          row = frequency_table[word]
+
+          unless row.empty?
+            laplace_word_likelihood = (row[index] + 1.0).to_f / (label_instance_count[label] + vocab_size).to_f
+            likelihoods[label] *= laplace_word_likelihood / (1.0 - laplace_word_likelihood)
+          end
+        end
+
+        posterior[label] = @priors[label] * likelihoods[label]
+      end
+
+      normalize(posterior)
+    end
+
+    def save_state(path)
+      state = {}
+      state['frequency_table'] = frequency_table
+      state['label_instance_count'] = label_instance_count
+      state['label_index'] = label_index
+      state['label_index_sequence'] = label_index_sequence
+      state['total_terms'] = total_terms
+
+      ::File.open(::File.expand_path(path), 'w') do |handle|
+        handle.write(state.to_json)
+      end
+    end
+
+    def load_state(path)
+      state = ::JSON.parse(::File.read(::File.expand_path(path)))
+
+      fail 'Missing frequency_table' unless state['frequency_table']
+      fail 'Missing label_instance_count' unless state['label_instance_count']
+      fail 'Missing label_index' unless state['label_index']
+      fail 'Missing label_index_sequence' unless state['label_index_sequence']
+      fail 'Missing total_terms' unless state['total_terms']
+
+      @frequency_table = state['frequency_table']
+      @label_instance_count = state['label_instance_count']
+      @label_index = state['label_index']
+      @label_index_sequence = state['label_index_sequence']
+      @total_terms = state['total_terms']
+
+      finish_training!
+    end
+
+    private
+
+    def calculate_likelihoods!
+      @likelihoods = label_index.inject({}) do |accumulator, (label, index)|
+        initial_likelihood = 1.0
+        vocab_size = frequency_table.keys.size
+
+        frequency_table.each do |feature, row|
+          laplace_word_likelihood = (row[index] + 1.0).to_f / (label_instance_count[label] + vocab_size).to_f
+          initial_likelihood *= (1.0 - laplace_word_likelihood)
+        end
+
+        accumulator[label] = initial_likelihood
+        accumulator
+      end
+    end
+
+    def calculate_priors!
+      @priors = label_instance_count.inject({}) do |hash, (label, count)|
+        hash[label] = count / total_terms
+        hash
+      end
     end
 
+    def get_next_sequence_value
+      @label_index_sequence += 1
+    end
+
+    def normalize(posterior)
+      sum = posterior.inject(0.0) { |accumulator, (key, value)| accumulator + value }
+
+      posterior.inject({}) do |accumulator, (key, value)|
+        accumulator[key] = value / sum
+        accumulator
+      end
+    end
+
+    def update_label_index(label)
+      unless label_index.keys.include?(label)
+        index = get_next_sequence_value
+        label_index[label] = index
+
+        frequency_table.each do |feature, value|
+          value[index] = 0
+        end
+      end
+    end
+
+    def update_frequency_table(label, word, frequency)
+      row = frequency_table[word]
+      index = label_index[label]
+
+      if row[index]
+        row[index] += frequency
+      else
+        row[0..1] = label_index.keys.map { |label| 0 }
+        row[index] = frequency
+      end
+    end
   end
 end
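To make the arithmetic concrete: `calculate_likelihoods!` precomputes, per label, the product of `(1 - p)` over the whole vocabulary, and `classify_from_string` then multiplies in `p / (1 - p)` for each distinct word of the document, where `p` is the Laplace-smoothed word likelihood `(count + 1) / (label_instance_count + vocab_size)`. Below is a small standalone Ruby sketch of that same calculation for the two-label training data used in the specs; the variable names are illustrative only and not part of the gem.

```ruby
# Standalone illustration of the smoothing used by Classifier#classify_from_string.
# Training: "positive" => {"aaa" => 0, "bbb" => 1}, "negative" => {"ccc" => 2, "ddd" => 3}
frequency_table      = {
  'aaa' => [0, 0], 'bbb' => [1, 0],   # column 0 => "positive"
  'ccc' => [0, 2], 'ddd' => [0, 3]    # column 1 => "negative"
}
label_index          = { 'positive' => 0, 'negative' => 1 }
label_instance_count = { 'positive' => 1, 'negative' => 1 }
priors               = { 'positive' => 0.5, 'negative' => 0.5 } # count / total_terms
vocab_size           = frequency_table.size                     # 4
document_words       = %w[aaa bbb]                              # classify({"aaa" => 1, "bbb" => 1})

posterior = label_index.map do |label, index|
  # Baseline likelihood: product of (1 - prob) over every word in the vocabulary.
  likelihood = frequency_table.values.inject(1.0) do |acc, row|
    prob = (row[index] + 1.0) / (label_instance_count[label] + vocab_size)
    acc * (1.0 - prob)
  end

  # Each word present in the document then contributes prob / (1 - prob).
  document_words.each do |word|
    prob = (frequency_table[word][index] + 1.0) / (label_instance_count[label] + vocab_size)
    likelihood *= prob / (1.0 - prob)
  end

  [label, priors[label] * likelihood]
end.to_h

sum = posterior.values.inject(:+)
posterior.each { |label, value| puts format('%s: %.5f', label, value / sum) }
# positive: 0.94118
# negative: 0.05882   (the values the classifier specs expect, within 0.001)
```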
data/lib/bae/native_classifier.rb
ADDED
@@ -0,0 +1,26 @@
+require 'java'
+require ::File.join(::File.dirname(__FILE__), "..", "..", "target" , "bae.jar")
+
+module Bae
+  class NativeClassifier
+
+    attr_reader :internal_classifier
+
+    def initialize
+      @internal_classifier = ::Java::Bae::NaiveBayesClassifier.new
+    end
+
+    def train(label, feature)
+      internal_classifier.train(label, ::Java::Bae::Document.new(feature))
+    end
+
+    def classify(feature)
+      internal_classifier.classify(::Java::Bae::Document.new(feature))
+    end
+
+    def finish_training!
+      internal_classifier.calculateInitialLikelihoods()
+    end
+
+  end
+end
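Because it calls `require 'java'` and loads the bundled `target/bae.jar`, `NativeClassifier` only runs under JRuby. Here is a usage sketch mirroring the README and the spec below; it shows that the wrapper exposes the same `train` / `finish_training!` / `classify` surface as the pure-Ruby classifier.

```ruby
# JRuby only: NativeClassifier delegates to the Java NaiveBayesClassifier in bae.jar.
require 'bae/native_classifier'

classifier = ::Bae::NativeClassifier.new

# As in the specs, both string documents and {word => count} hashes are accepted.
classifier.train("positive", "aaa aaa bbb")
classifier.train("negative", "ccc ccc ddd ddd")
classifier.train("neutral",  "eee eee eee fff fff fff")

classifier.finish_training!   # invokes calculateInitialLikelihoods() on the Java object

classifier.classify("aaa bbb")
# Per native_classifier_spec: "positive" ~ 0.896, "negative" ~ 0.066, "neutral" ~ 0.037
```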
data/lib/bae/version.rb
CHANGED
data/lib/bae.rb
CHANGED
data/spec/lib/bae/classifier_spec.rb
CHANGED
@@ -1,10 +1,17 @@
 require 'spec_helper'
 
+require 'bae/native_classifier'
+
 describe ::Bae::Classifier do
 
   subject { described_class.new }
 
-
+  let(:state_json) {
+    '{"frequency_table":{"aaa":[0,0],"bbb":[1,0],"ccc":[0,2],"ddd":[0,3]},"label_instance_count":{"positive":1,"negative":1},"label_index":{"positive":0,"negative":1},"label_index_sequence":1,"total_terms":2.0}'
+  }
+  let(:state) { ::JSON.parse(state_json) }
+
+  it "can classify a hash document" do
     subject.train("positive", {"aaa" => 0, "bbb" => 1})
     subject.train("negative", {"ccc" => 2, "ddd" => 3})
 
@@ -16,7 +23,7 @@ describe ::Bae::Classifier do
     expect(results["negative"]).to be_within(0.001).of(0.05882)
   end
 
-  it "can classify from
+  it "can classify from a string based document" do
     subject.train("positive", "aaa aaa bbb");
     subject.train("negative", "ccc ccc ddd ddd");
     subject.train("neutral", "eee eee eee fff fff fff");
@@ -30,4 +37,46 @@ describe ::Bae::Classifier do
     expect(results["neutral"]).to be_within(0.001).of(0.03734)
   end
 
+  it "fails when you attempt to train or test anything other than a hash or string" do
+    subject.train("positive", "aaa aaa bbb");
+    expect{ subject.train("a", 1337) }.to raise_error 'Training data must either be a string or hash'
+
+    subject.finish_training!
+
+    subject.classify("aaa bbb")
+    expect{ subject.classify(1337) }.to raise_error 'Training data must either be a string or hash'
+  end
+
+  it "can save the classifier state" do
+    subject.train("positive", {"aaa" => 0, "bbb" => 1})
+    subject.train("negative", {"ccc" => 2, "ddd" => 3})
+
+    subject.finish_training!
+
+    temp_file = ::Tempfile.new('some_state')
+    subject.save_state(temp_file.path)
+
+    temp_file.rewind
+    expect(temp_file.read).to eq(state_json)
+
+    temp_file.close
+    temp_file.unlink
+  end
+
+  it "can correctly load a classifier state and correctly classify" do
+    temp_file = ::Tempfile.new('some_state')
+    temp_file.write(state_json)
+    temp_file.rewind
+
+    subject.load_state(temp_file.path)
+
+    results = subject.classify({"aaa" => 1, "bbb" => 1})
+
+    expect(results["positive"]).to be_within(0.001).of(0.94117)
+    expect(results["negative"]).to be_within(0.001).of(0.05882)
+
+    temp_file.close
+    temp_file.unlink
+  end
+
 end
data/spec/lib/bae/native_classifier_spec.rb
ADDED
@@ -0,0 +1,33 @@
+require 'spec_helper'
+
+describe ::Bae::NativeClassifier do
+
+  subject { described_class.new }
+
+  it "can classify a hash document" do
+    subject.train("positive", {"aaa" => 0, "bbb" => 1})
+    subject.train("negative", {"ccc" => 2, "ddd" => 3})
+
+    subject.finish_training!
+
+    results = subject.classify({"aaa" => 1, "bbb" => 1})
+
+    expect(results["positive"]).to be_within(0.001).of(0.94117)
+    expect(results["negative"]).to be_within(0.001).of(0.05882)
+  end
+
+  it "can classify from a string based document" do
+    subject.train("positive", "aaa aaa bbb");
+    subject.train("negative", "ccc ccc ddd ddd");
+    subject.train("neutral", "eee eee eee fff fff fff");
+
+    subject.finish_training!
+
+    results = subject.classify("aaa bbb")
+
+    expect(results["positive"]).to be_within(0.001).of(0.89626)
+    expect(results["negative"]).to be_within(0.001).of(0.06639)
+    expect(results["neutral"]).to be_within(0.001).of(0.03734)
+  end
+
+end
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bae
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: java
 authors:
 - Garrett Thornburg
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-02-
+date: 2015-02-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -68,8 +68,10 @@ files:
 - build.xml
 - lib/bae.rb
 - lib/bae/classifier.rb
+- lib/bae/native_classifier.rb
 - lib/bae/version.rb
 - spec/lib/bae/classifier_spec.rb
+- spec/lib/bae/native_classifier_spec.rb
 - spec/spec_helper.rb
 - src/main/java/bae/Document.java
 - src/main/java/bae/FrequencyTable.java
@@ -104,4 +106,5 @@ specification_version: 4
 summary: Multinomial naive bayes classifier with a kick of java
 test_files:
 - spec/lib/bae/classifier_spec.rb
+- spec/lib/bae/native_classifier_spec.rb
 - spec/spec_helper.rb