RubyGems - bae - Versions diffs - 0.0.1 → 0.0.9 - Mend

bae 0.0.1 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/.gitignore +2 -1
data/README.md +43 -1
data/build.xml +3 -3
data/lib/bae/classifier.rb +167 -6
data/lib/bae/native_classifier.rb +26 -0
data/lib/bae/version.rb +1 -1
data/lib/bae.rb +2 -7
data/spec/lib/bae/classifier_spec.rb +57 -2
data/spec/lib/bae/native_classifier_spec.rb +33 -0
data/spec/spec_helper.rb +1 -0
data/src/main/java/bae/Document.java +3 -1
data/src/main/java/bae/FrequencyTable.java +6 -2
data/src/main/java/bae/NaiveBayesClassifier.java +43 -19
data/target/bae.jar +0 -0
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 4184eb7a5632271cf3c4ca2b8f44a47351027058
-  data.tar.gz: b3b84d5ed0aca936a95b602015b929b32b990fb1
+  metadata.gz: c28a60c92163259beddf8af99cd31357cf3a95a8
+  data.tar.gz: 34cc3ee332ec6f79d74e2ab04f8d60227c49c1a9
 SHA512:
-  metadata.gz: 1734b128dcccfc3229bb3c0d0226d598eea392a4f79986e8a9983f401f37113950e84b9a0fb1d297a7623709dc2109fa0cbccd90361a677f2bd3b757be876e15
-  data.tar.gz: ae9c7adf6237e6c3451bfcf7813c56cc250acb21f20bee5baa4d8014c9ec5204f0417b38b27e32a57b80575b11620f75068678a420df7a359da3cf5dc0dae404
+  metadata.gz: 4f92cc52a40438b18bf543299345b1a9ae57443b53e8b8cae7169181a436dfc2e4c86627527239c6c0c29d99f0ff723c4f9272a994f4595d58c1f6d5c8acbd5c
+  data.tar.gz: dfca0d36849a088fdc5c60cb62c9256a7743c329d8acbbdba87cac34f0fb3de1195909598474283a12862a5a1b92688258189b47945330dcf8cc6df563b77e19

data/.gitignore CHANGED Viewed

@@ -2,6 +2,7 @@
 *.rbc
 .bundle
 .config
+.ruby-*
 .yardoc
 Gemfile.lock
 InstalledFiles
@@ -23,4 +24,4 @@ mkmf.log
 .idea
 bae.iml
-out
+out

data/README.md CHANGED Viewed

@@ -3,6 +3,15 @@ Bae
 Bae is a multinomial naive bayes classifier based on another gem ["naivebayes"](https://github.com/id774/naivebayes), only this one uses java to do the heavy lifting.
+By default this will use the vanilla ruby implementation, but you can use the native classifier written in java.
+```ruby
+require 'bae/native_classifier'
+classifier = ::Bae::NativeClassifier.new
+```
 ## Installation
 Add this line to your application's Gemfile:
@@ -28,6 +37,9 @@ You can refer to ["naivebayes"](https://github.com/id774/naivebayes) gem for mor
 classifier = ::Bae::Classifier.new
 classifier.train("positive", {"aaa" => 0, "bbb" => 1})
 classifier.train("negative", {"ccc" => 2, "ddd" => 3})
+classifier.finish_training!
 classifier.classify({"aaa" => 1, "bbb" => 1})
 #=> {"positive" => 0.8767123287671234, "negative" => 0.12328767123287669}
@@ -39,15 +51,45 @@ classifier = ::Bae::Classifier.new
 classifier.train("positive", "aaa aaa bbb");
 classifier.train("negative", "ccc ccc ddd ddd");
 classifier.train("neutral", "eee eee eee fff fff fff");
+classifier.finish_training!
 classifier.classify("aaa bbb")
 #=> {"positive"=>0.8962655601659751, "negative"=>0.0663900414937759, "neutral"=>0.037344398340248955}
 ```
+### Saving State
+You can actually save a snapshot of the trained classifier to disk and load it into memory.
+```ruby
+# From the example above...
+classifier = ::Bae::Classifier.new
+classifier.train("positive", {"aaa" => 0, "bbb" => 1})
+classifier.train("negative", {"ccc" => 2, "ddd" => 3})
+classifier.finish_training!
+classifier.classify({"aaa" => 1, "bbb" => 1})
+#=> {"positive" => 0.8767123287671234, "negative" => 0.12328767123287669}
+# Now let's save it to disk
+classifier.save_state("/tmp/some_state.json")
+# Let's create a new classifier and load from the state we just saved
+classifier = ::Bae::Classifier.new
+classifier.load_state("/tmp/some_state.json")
+# Now we can classify without retraining
+classifier.classify({"aaa" => 1, "bbb" => 1})
+#=> {"positive" => 0.8767123287671234, "negative" => 0.12328767123287669}
+```
 ## Contributing
-1. Fork it ( https://github.com/[my-github-username]/bae/fork )
+1. Fork it ( https://github.com/film42/bae/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)

data/build.xml CHANGED Viewed

@@ -1,13 +1,13 @@
 <project>
     <target name="clean">
-        <delete dir="build"/>
+        <delete dir="out/classes"/>
     </target>
-    <target name="compile">
+    <target name="compile" depends="clean">
         <mkdir dir="out"/>
         <mkdir dir="out/classes"/>
-        <javac srcdir="src/main/java" destdir="out/classes"/>
+        <javac srcdir="src/main/java" destdir="out/classes" source="1.7" target="1.7" includeantruntime="false" />
     </target>
     <target name="jar" depends="compile">

data/lib/bae/classifier.rb CHANGED Viewed

@@ -1,19 +1,180 @@
 module Bae
   class Classifier
-    attr_reader :internal_classifier
+    attr_accessor :frequency_table, :label_index, :label_index_sequence,
+      :label_instance_count, :total_terms
     def initialize
-      @internal_classifier = ::Java::Bae::NaiveBayesClassifier.new
+      @frequency_table = ::Hash.new
+      @label_instance_count = ::Hash.new { |hash, label| hash[label] = 0 }
+      @label_index = ::Hash.new { |hash, label| hash[label] = 0 }
+      @label_index_sequence = -1 # start at -1 so 0 is first value
+      @total_terms = 0.0
     end
-    def train(label, feature)
-      internal_classifier.train(label, ::Java::Bae::Document.new(feature))
+    def finish_training!
+      calculate_likelihoods!
+      calculate_priors!
     end
-    def classify(feature)
-      internal_classifier.classify(::Java::Bae::Document.new(feature))
+    def train(label, training_data)
+      if training_data.is_a?(::String)
+        train_from_string(label, training_data)
+      elsif training_data.is_a?(::Hash)
+        train_from_hash(label, training_data)
+      else
+        fail 'Training data must either be a string or hash'
+      end
     end
+    def train_from_string(label, document)
+      words = document.split
+      words.each do |word|
+        update_label_index(label)
+        update_frequency_table(label, word, 1)
+      end
+      @label_instance_count[label] += 1
+      @total_terms += 1
+    end
+    def train_from_hash(label, frequency_hash)
+      frequency_hash.each do |word, frequency|
+        update_label_index(label)
+        update_frequency_table(label, word, frequency)
+      end
+      @label_instance_count[label] += 1
+      @total_terms += 1
+    end
+    def classify(data)
+      if data.is_a?(::String)
+        classify_from_string(data)
+      elsif data.is_a?(::Hash)
+        classify_from_hash(data)
+      else
+        fail 'Training data must either be a string or hash'
+      end
+    end
+    def classify_from_hash(frequency_hash)
+      document = frequency_hash.map{ |word, frequency| (word + ' ') * frequency }.join
+      classify_from_string(document)
+    end
+    def classify_from_string(document)
+      words = document.split.uniq
+      likelihoods = @likelihoods.dup
+      posterior = {}
+      vocab_size = frequency_table.keys.size
+      label_index.each do |label, index|
+        words.map do |word|
+          row = frequency_table[word]
+          unless row.nil?
+            laplace_word_likelihood = (row[index] + 1.0).to_f / (label_instance_count[label] + vocab_size).to_f
+            likelihoods[label] *= laplace_word_likelihood / (1.0 - laplace_word_likelihood)
+          end
+        end
+        posterior[label] = @priors[label] * likelihoods[label]
+      end
+      normalize(posterior)
+    end
+    def save_state(path)
+      state = {}
+      state['frequency_table'] = frequency_table
+      state['label_instance_count'] = label_instance_count
+      state['label_index'] = label_index
+      state['label_index_sequence'] = label_index_sequence
+      state['total_terms'] = total_terms
+      ::File.open(::File.expand_path(path), 'w') do |handle|
+        handle.write(state.to_json)
+      end
+    end
+    def load_state(path)
+      state = ::JSON.parse(::File.read(::File.expand_path(path)))
+      fail 'Missing frequency_table' unless state['frequency_table']
+      fail 'Missing label_instance_count' unless state['label_instance_count']
+      fail 'Missing label_index' unless state['label_index']
+      fail 'Missing label_index_sequence' unless state['label_index_sequence']
+      fail 'Missing total_terms' unless state['total_terms']
+      @frequency_table = state['frequency_table']
+      @label_instance_count = state['label_instance_count']
+      @label_index = state['label_index']
+      @label_index_sequence = state['label_index_sequence']
+      @total_terms = state['total_terms']
+      finish_training!
+    end
+  private
+    def calculate_likelihoods!
+      @likelihoods = label_index.inject({}) do |accumulator, (label, index)|
+        initial_likelihood = 1.0
+        vocab_size = frequency_table.keys.size
+        frequency_table.each do |feature, row|
+          laplace_word_likelihood = (row[index] + 1.0).to_f / (label_instance_count[label] + vocab_size).to_f
+          initial_likelihood *= (1.0 - laplace_word_likelihood)
+        end
+        accumulator[label] = initial_likelihood
+        accumulator
+      end
+    end
+    def calculate_priors!
+      @priors = label_instance_count.inject({}) do |hash, (label, count)|
+        hash[label] = count / total_terms
+        hash
+      end
+    end
+    def get_next_sequence_value
+      @label_index_sequence += 1
+    end
+    def normalize(posterior)
+      sum = posterior.inject(0.0) { |accumulator, (key, value)| accumulator + value }
+      posterior.inject({}) do |accumulator, (key, value)|
+        accumulator[key] = value / sum
+        accumulator
+      end
+    end
+    def update_label_index(label)
+      unless label_index.keys.include?(label)
+        index = get_next_sequence_value
+        label_index[label] = index
+        frequency_table.each do |feature, value|
+          value[index] = 0
+        end
+      end
+    end
+    def update_frequency_table(label, word, frequency)
+      row = frequency_table[word]
+      index = label_index[label]
+      if row
+        row[index] += frequency
+      else
+        frequency_table[word] = label_index.keys.map { |label| 0 }
+        frequency_table[word][index] += frequency
+      end
+    end
   end
 end

data/lib/bae/native_classifier.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require 'java'
+require ::File.join(::File.dirname(__FILE__), "..", "..", "target" , "bae.jar")
+module Bae
+  class NativeClassifier
+    attr_reader :internal_classifier
+    def initialize
+      @internal_classifier = ::Java::Bae::NaiveBayesClassifier.new
+    end
+    def train(label, feature)
+      internal_classifier.train(label, ::Java::Bae::Document.new(feature))
+    end
+    def classify(feature)
+      internal_classifier.classify(::Java::Bae::Document.new(feature))
+    end
+    def finish_training!
+      internal_classifier.calculateInitialLikelihoods()
+    end
+  end
+end

data/lib/bae/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Bae
-  VERSION = "0.0.1"
+  VERSION = "0.0.9"
 end

data/lib/bae.rb CHANGED Viewed

@@ -1,11 +1,6 @@
-require "bae/version"
-require "target/bae.jar"
-java_import "bae.Document"
-java_import "bae.FrequencyTable"
-java_import "bae.NaiveBayesClassifier"
+require 'json'
+require "bae/version"
 require "bae/classifier"
 module Bae

data/spec/lib/bae/classifier_spec.rb CHANGED Viewed

@@ -1,22 +1,35 @@
 require 'spec_helper'
+require 'bae/native_classifier'
 describe ::Bae::Classifier do
   subject { described_class.new }
-  it "can classify from ruby to java with a hash document" do
+  let(:state_json) {
+    '{"frequency_table":{"aaa":[0,0],"bbb":[1,0],"ccc":[0,2],"ddd":[0,3]},"label_instance_count":{"positive":1,"negative":1},"label_index":{"positive":0,"negative":1},"label_index_sequence":1,"total_terms":2.0}'
+  }
+  let(:state) { ::JSON.parse(state_json) }
+  it "can classify a hash document" do
     subject.train("positive", {"aaa" => 0, "bbb" => 1})
     subject.train("negative", {"ccc" => 2, "ddd" => 3})
+    subject.finish_training!
     results = subject.classify({"aaa" => 1, "bbb" => 1})
     expect(results["positive"]).to be_within(0.001).of(0.94117)
     expect(results["negative"]).to be_within(0.001).of(0.05882)
   end
-  it "can classify from ruby to java with a string based document" do
+  it "can classify from a string based document" do
     subject.train("positive", "aaa aaa bbb");
     subject.train("negative", "ccc ccc ddd ddd");
     subject.train("neutral", "eee eee eee fff fff fff");
+    subject.finish_training!
     results = subject.classify("aaa bbb")
     expect(results["positive"]).to be_within(0.001).of(0.89626)
@@ -24,4 +37,46 @@ describe ::Bae::Classifier do
     expect(results["neutral"]).to be_within(0.001).of(0.03734)
   end
+  it "fails when you attempt to train or test anything other than a hash or string" do
+    subject.train("positive", "aaa aaa bbb");
+    expect{ subject.train("a", 1337) }.to raise_error 'Training data must either be a string or hash'
+    subject.finish_training!
+    subject.classify("aaa bbb")
+    expect{ subject.classify(1337) }.to raise_error 'Training data must either be a string or hash'
+  end
+  it "can save the classifier state" do
+    subject.train("positive", {"aaa" => 0, "bbb" => 1})
+    subject.train("negative", {"ccc" => 2, "ddd" => 3})
+    subject.finish_training!
+    temp_file = ::Tempfile.new('some_state')
+    subject.save_state(temp_file.path)
+    temp_file.rewind
+    expect(temp_file.read).to eq(state_json)
+    temp_file.close
+    temp_file.unlink
+  end
+  it "can correctly load a classifier state and correctly classify" do
+    temp_file = ::Tempfile.new('some_state')
+    temp_file.write(state_json)
+    temp_file.rewind
+    subject.load_state(temp_file.path)
+    results = subject.classify({"aaa" => 1, "bbb" => 1})
+    expect(results["positive"]).to be_within(0.001).of(0.94117)
+    expect(results["negative"]).to be_within(0.001).of(0.05882)
+    temp_file.close
+    temp_file.unlink
+  end
 end

data/spec/lib/bae/native_classifier_spec.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require 'spec_helper'
+describe ::Bae::NativeClassifier do
+  subject { described_class.new }
+  it "can classify a hash document" do
+    subject.train("positive", {"aaa" => 0, "bbb" => 1})
+    subject.train("negative", {"ccc" => 2, "ddd" => 3})
+    subject.finish_training!
+    results = subject.classify({"aaa" => 1, "bbb" => 1})
+    expect(results["positive"]).to be_within(0.001).of(0.94117)
+    expect(results["negative"]).to be_within(0.001).of(0.05882)
+  end
+  it "can classify from a string based document" do
+    subject.train("positive", "aaa aaa bbb");
+    subject.train("negative", "ccc ccc ddd ddd");
+    subject.train("neutral", "eee eee eee fff fff fff");
+    subject.finish_training!
+    results = subject.classify("aaa bbb")
+    expect(results["positive"]).to be_within(0.001).of(0.89626)
+    expect(results["negative"]).to be_within(0.001).of(0.06639)
+    expect(results["neutral"]).to be_within(0.001).of(0.03734)
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'bundler/setup'
 require 'bae'
+require 'tempfile'
 require 'rspec'
 RSpec.configure do |c|

data/src/main/java/bae/Document.java CHANGED Viewed

@@ -33,7 +33,9 @@ public class Document {
             // Set initial count if it doesn't have one yet
             // Use zero because we'll add counts in the next line.
-            this.frequencyMap.putIfAbsent(wordToken, 0L);
+            if(!this.frequencyMap.containsKey(wordToken)) {
+                this.frequencyMap.put(wordToken, 0L);
+            }
             // Update count
             this.frequencyMap.put(wordToken, this.frequencyMap.get(wordToken) + 1);

data/src/main/java/bae/FrequencyTable.java CHANGED Viewed

@@ -14,7 +14,9 @@ public class FrequencyTable {
     public void insertOrIgnore(String label) {
         // Add new hash to frequency table if it's not already there
-        this.frequencyTable.putIfAbsent(label, new HashMap<String, Long>());
+        if(!this.frequencyTable.containsKey(label)) {
+            this.frequencyTable.put(label, new HashMap<String, Long>());
+        }
     }
     public void increaseFrequencyBy(String label, String word, long frequency) {
@@ -24,7 +26,9 @@ public class FrequencyTable {
         Map<String, Long> frequencyRow = this.frequencyTable.get(label);
         // Make sure we have a frequency for that position in the table
-        frequencyRow.putIfAbsent(word, 0L);
+        if(!frequencyRow.containsKey(word)) {
+            frequencyRow.put(word, 0L);
+        }
         // Update frequency
         frequencyRow.put(word, frequencyRow.get(word) + frequency);

data/src/main/java/bae/NaiveBayesClassifier.java CHANGED Viewed

@@ -8,12 +8,16 @@ public class NaiveBayesClassifier {
     private FrequencyTable frequencyTable;
     private Map<String, Long> wordTable;
     private Map<String, Long> instanceCountOf;
+    private Map<String, Double> initialLikelihoodOf;
+    Map<String, Double> classPriorOf;
     private double totalCount = 0;
     public NaiveBayesClassifier() {
         this.frequencyTable = new FrequencyTable();
         this.wordTable = new HashMap<>();
         this.instanceCountOf = new HashMap<>();
+        this.initialLikelihoodOf = new HashMap<>();
+        this.classPriorOf = new HashMap<>();
     }
     public void train(String label, Document document) {
@@ -37,12 +41,23 @@ public class NaiveBayesClassifier {
         updateIntegerCountBy(this.instanceCountOf, label, 1);
     }
-    public Map<String, Double> classify(Document document) {
-        Map<String, Double> classPriorOf = new HashMap<>();
-        Map<String, Double> likelihoodOf = new HashMap<>();
-        Map<String, Double> classPosteriorOf = new HashMap<>();
-        Map<String, Long> frequencyMap = document.getFrequencyMap();
-        double evidence = 0;
+    public void calculateInitialLikelihoods() {
+        // Update likelihood counts
+        for(String label : this.frequencyTable.getLabels()) {
+            // Set initial likelihood
+            initialLikelihoodOf.put(label, 1d);
+            // Calculate likelihoods
+            for (String word : this.wordTable.keySet()) {
+                double laplaceWordLikelihood =
+                        (this.frequencyTable.get(label, word) + 1d) /
+                        (this.instanceCountOf.get(label) + this.wordTable.size());
+                // Update likelihood
+                double likelihood = initialLikelihoodOf.get(label);
+                initialLikelihoodOf.put(label, likelihood * (1d - laplaceWordLikelihood));
+            }
+        }
         // Update the prior
         for(Map.Entry<String, Long> entry : this.instanceCountOf.entrySet()) {
@@ -50,34 +65,39 @@ public class NaiveBayesClassifier {
             double frequency = entry.getValue();
             // Update instance count
-            classPriorOf.put(label, (frequency / this.totalCount));
+            this.classPriorOf.put(label, (frequency / this.totalCount));
         }
+    }
+    public Map<String, Double> classify(Document document) {
+        Map<String, Double> likelihoodOf = new HashMap<>();
+        Map<String, Double> classPosteriorOf = new HashMap<>();
+        Map<String, Long> featureFrequencyMap = document.getFrequencyMap();
+        double evidence = 0;
         // Update likelihood counts
         for(String label : this.frequencyTable.getLabels()) {
             // Set initial likelihood
-            likelihoodOf.put(label, 1d);
+            likelihoodOf.put(label, this.initialLikelihoodOf.get(label));
-            // Calculate likelihoods
-            for(String word : wordTable.keySet()) {
+            // Calculate actual likelihoods
+            for(String word : featureFrequencyMap.keySet()) {
                 double laplaceWordLikelihood =
                         (this.frequencyTable.get(label, word) + 1d) /
                         (this.instanceCountOf.get(label) + this.wordTable.size());
-                // Update likelihood
+                // Update likelihood for words present in the features
                 double likelihood = likelihoodOf.get(label);
-                if(frequencyMap.containsKey(word)) {
-                    likelihoodOf.put(label, likelihood * laplaceWordLikelihood);
-                } else {
-                    likelihoodOf.put(label, likelihood * (1d - laplaceWordLikelihood));
+                if(featureFrequencyMap.containsKey(word)) {
+                    likelihoodOf.put(label, (likelihood * laplaceWordLikelihood) / (1d - laplaceWordLikelihood));
                 }
             }
             // Default class posterior of label to 1.0
-            classPosteriorOf.putIfAbsent(label, 1d);
+            classPosteriorOf.put(label, 1d);
             // Update class posterior
-            double classPosterior = classPriorOf.get(label) * likelihoodOf.get(label);
+            double classPosterior = this.classPriorOf.get(label) * likelihoodOf.get(label);
             classPosteriorOf.put(label, classPosterior);
             evidence += classPosterior;
         }
@@ -93,12 +113,16 @@ public class NaiveBayesClassifier {
     }
     public void updateIntegerCountBy(Map<String, Long> someMap, String someKey, long count) {
-        someMap.putIfAbsent(someKey, 0L);
+        if(!someMap.containsKey(someKey)) {
+            someMap.put(someKey, 0L);
+        }
         someMap.put(someKey, someMap.get(someKey) + count);
     }
     public void updateDoubleCountBy(Map<String, Double> someMap, String someKey, double count) {
-        someMap.putIfAbsent(someKey, 0.0);
+        if(!someMap.containsKey(someKey)) {
+            someMap.put(someKey, 0.0);
+        }
         someMap.put(someKey, someMap.get(someKey) + count);
     }

data/target/bae.jar CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bae
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.9
 platform: ruby
 authors:
 - Garrett Thornburg
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-02-18 00:00:00.000000000 Z
+date: 2015-02-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -68,8 +68,10 @@ files:
 - build.xml
 - lib/bae.rb
 - lib/bae/classifier.rb
+- lib/bae/native_classifier.rb
 - lib/bae/version.rb
 - spec/lib/bae/classifier_spec.rb
+- spec/lib/bae/native_classifier_spec.rb
 - spec/spec_helper.rb
 - src/main/java/bae/Document.java
 - src/main/java/bae/FrequencyTable.java
@@ -104,4 +106,5 @@ specification_version: 4
 summary: Multinomial naive bayes classifier with a kick of java
 test_files:
 - spec/lib/bae/classifier_spec.rb
+- spec/lib/bae/native_classifier_spec.rb
 - spec/spec_helper.rb