RubyGems - rjspotter-basset - Versions diffs - 1.0.5 - Mend

rjspotter-basset 1.0.5

Files changed (31) hide show

data/History.txt +7 -0
data/License.txt +20 -0
data/Manifest.txt +21 -0
data/README.rdoc +58 -0
data/Rakefile +63 -0
data/VERSION.yml +4 -0
data/basset.gemspec +38 -0
data/examples/example.rb +25 -0
data/lib/basset.rb +9 -0
data/lib/basset/classification_evaluator.rb +170 -0
data/lib/basset/classifier.rb +188 -0
data/lib/basset/core_extensions.rb +93 -0
data/lib/basset/document.rb +84 -0
data/lib/basset/document_override_example.rb +11 -0
data/lib/basset/feature.rb +26 -0
data/lib/basset/feature_extractor.rb +52 -0
data/lib/basset/feature_selector.rb +126 -0
data/lib/basset/naive_bayes.rb +151 -0
data/lib/basset/svm.rb +180 -0
data/lib/basset/yaml_serialization.rb +41 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +12 -0
data/spec/unit/classifier_spec.rb +166 -0
data/spec/unit/core_extension_spec.rb +33 -0
data/spec/unit/document_spec.rb +59 -0
data/spec/unit/feature_extractor_spec.rb +33 -0
data/spec/unit/feature_selector_spec.rb +108 -0
data/spec/unit/feature_spec.rb +40 -0
data/spec/unit/naive_bayes_spec.rb +119 -0
data/spec/unit/svm_spec.rb +83 -0
metadata +115 -0

data/spec/unit/feature_extractor_spec.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper")
+describe FeatureExtractor do
+  DocumentMock = Struct.new(:vector_of_features)
+  it "should save to file"
+  it "should be loadable from file"
+  it "should return number of features" do
+    FeatureExtractor.new(%w[one two]).number_of_features.should == 2
+  end
+  it "should throw away extra features" do
+    doc = DocumentMock.new([Feature.new("keep"), Feature.new("throwaway")])
+    FeatureExtractor.new(%w[keep]).extract(doc).should == [Feature.new("keep")]
+  end
+  it "should extract no features from a doc with no features" do
+    FeatureExtractor.new(%w[keep]).extract(DocumentMock.new([])).should == []
+  end
+  it "should extract numbered features" do
+    doc = DocumentMock.new([Feature.new("keep", 0)])
+    FeatureExtractor.new(%w[keep]).extract_numbered(doc).should == [Feature.new(1, 0)]
+  end
+  it "should sort extracted numbered features" do
+    feature_extractor = FeatureExtractor.new(%w[keep1 keep2])
+    doc = DocumentMock.new([Feature.new("keep2", 10), Feature.new("keep1", 20)])
+    feature_extractor.extract_numbered(doc).should == [Feature.new(1, 20), Feature.new(2, 10)]
+  end
+end

data/spec/unit/feature_selector_spec.rb ADDED Viewed

@@ -0,0 +1,108 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper")
+describe FeatureSelector do
+  DocumentMock = Struct.new(:vector_of_features, :classification)
+  it "should count documents" do
+    feature_selector = FeatureSelector.new
+    feature_selector.docs.should == 0
+    feature_selector.add_document(DocumentMock.new([]))
+    feature_selector.docs.should == 1
+  end
+  it "should return all feature names" do
+    feature_selector = FeatureSelector.new
+    feature_selector.all_feature_names.should == []
+    feature_selector.add_document(DocumentMock.new([Feature.new("a")]))
+    feature_selector.add_document(DocumentMock.new([Feature.new("b")]))
+    feature_selector.all_feature_names.should == %w[a b]
+  end
+  # TODO
+  # it "should return_all_features_as_best
+  #   feature_selector = FeatureSelector.new
+  #   feature_selector.add_document(DocumentMock.new([Feature.new("a")], :test))
+  #   assert_equal %w[a], feature_selector.best_features_for_classification(:test, 10)
+  # end
+  it "should count docs with feature and class" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_with_feature_and_class, "viagra", :spam).should == 1
+    feature_selector.__send__(:docs_with_feature_and_class, "viagra", :ham).should == 0
+  end
+  it "should count docs with feature and not class" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_with_feature_and_not_class, "puppy", :spam).should == 1
+    feature_selector.__send__(:docs_with_feature_and_not_class,"puppy", :ham).should == 0
+  end
+  it "should count docs with class and not feature" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "puppy").should == 1
+    feature_selector.__send__(:docs_with_class_and_not_feature, :spam, "viagra").should == 0
+  end
+  it "should count docs without feature or class" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.__send__(:docs_without_feature_or_class, "viagra", :spam).should == 1
+    feature_selector.__send__(:docs_without_feature_or_class, "viagra", :ham).should == 0
+  end
+  it "should return zero chi if all docs contain feature" do
+    feature_selector = FeatureSelector.new
+    the = Feature.new("the", 1)
+    feature_selector.add_document(doc([the], :spam))
+    feature_selector.add_document(doc([the], :ham))
+    feature_selector.features_with_chi(:spam).should == [Feature.new("the", 0.0)]
+  end
+  it "should compute chi squared" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.features_with_chi(:spam).should == [Feature.new("viagra", 2.0), Feature.new("puppy", 2.0)]
+  end
+  it "should not select any features if they are all insignificant" do
+    feature_selector = FeatureSelector.new
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("puppy", 1)], :ham))
+    feature_selector.select_features.should == []
+  end
+  it "should not select features in only one doc" do
+    feature_selector = FeatureSelector.new
+    the = Feature.new("the", 1)
+    feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
+    feature_selector.select_features.should == []
+  end
+  it "should select significant features occuring in more than one doc" do
+    feature_selector = FeatureSelector.new
+    the = Feature.new("the", 1)
+    feature_selector.add_document(doc([the, Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([Feature.new("viagra", 1)], :spam))
+    feature_selector.add_document(doc([the, Feature.new("puppy", 1)], :ham))
+    feature_selector.select_features.should == %w[viagra]
+  end
+  it "should return selected features sorted by chi squared descending"
+  it "should select based on first feature by default"
+  it "should select with a chi squared of 1 by default"
+private
+  def doc(*args)
+    DocumentMock.new(*args)
+  end
+end

data/spec/unit/feature_spec.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require File.join(File.dirname(__FILE__), "..", "spec_helper")
+describe Feature do
+  it "should store name" do
+    Feature.new("hello").name.should == "hello"
+  end
+  it "should require name" do
+    lambda { Feature.new }.should raise_error(ArgumentError)
+  end
+  it "should store values" do
+    Feature.new("name", 2).value.should ==2
+  end
+  it "should default value to zero" do
+    Feature.new("name").value.should == 0
+  end
+  it "should be equal with same name and no value" do
+    Feature.new("hello").should == Feature.new("hello")
+  end
+  it "should be equal with same name and same value" do
+    Feature.new("hello", 1).should == Feature.new("hello", 1)
+  end
+  it "should not be equal with different name" do
+    Feature.new("hello").should_not == Feature.new("test")
+  end
+  it "should not be equal with same name and different value" do
+    Feature.new("hello", 1).should_not == Feature.new("hello", 2)
+  end
+  it "should sort by name ascending then value ascending" do
+    [Feature.new("b", 3), Feature.new("a", 2), Feature.new("a", 1)].sort.should ==
+    [Feature.new("a", 1), Feature.new("a", 2), Feature.new("b", 3)]
+  end
+end

data/spec/unit/naive_bayes_spec.rb ADDED Viewed

@@ -0,0 +1,119 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe NaiveBayes::FeatureCount do
+  it "should be equal to another feature count if the feature name and counts per class are equal" do
+    NaiveBayes::FeatureCount.new("rspec", :sweet, 1).should == NaiveBayes::FeatureCount.new("rspec", :sweet, 1)
+  end
+  it "should give the sum of all occurrences of a feature for a given class" do
+    fc = NaiveBayes::FeatureCount.new("rspec", :sweet, 1)
+    fc.add_count_for_class(2, :sweet)
+    fc.add_count_for_class(6, :super_sweet)
+    fc.count_for_class(:sweet).should == 3
+    fc.count_for_class(:super_sweet).should == 6
+  end
+end
+describe NaiveBayes do
+  before(:each) do
+    @nbayes = NaiveBayes.new
+    @doc = Document.new("here are some interesting words", :interesting)
+    @feature_vectors = @doc.feature_vectors
+    @other_vectors = Document.new("these words are interesting", :interesting).feature_vectors
+    @test_vectors = Document.new("this word seems interesting", :interesting).feature_vectors
+  end
+  def feature_counts_for(classification, *feature_count_tuples)
+    feature_counts = {}
+    feature_count_tuples.each do |tuple|
+      feature_counts[tuple.first] = NaiveBayes::FeatureCount.new(tuple.first, classification, tuple.last)
+    end
+    feature_counts
+  end
+  def add_interesting_docs
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.add_document(:interesting, @other_vectors)
+  end
+  def add_boring_docs
+    @nbayes.add_document(:boring, Document.new("yawn lets go flame").feature_vectors)
+    @nbayes.add_document(:boring, Document.new("yawn lets flame and troll").feature_vectors)
+  end
+  it "should keep track of the total docs and total docs for class when adding new docs" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.total_docs.should == 1
+    @nbayes.total_docs_in_class[:interesting].should == 1
+  end
+  it "should create a feature count for each feature with the # of occurances and class" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    expected = feature_counts_for(:interesting, ["here", 1], ["ar", 1],["some", 1],["interest", 1],["word", 1])
+    @nbayes.feature_counts.should == expected
+  end
+  it "should sum the number of all occurances of all features for a given class" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5
+    add_boring_docs
+    @nbayes.occurrences_of_all_features_in_class(:interesting).should == 5
+  end
+  it "should give a list of classes it knows about" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    @nbayes.add_document(:kinda_interesting, @feature_vectors)
+    @nbayes.classes.sort_to_s.should == [:kinda_interesting, :interesting].sort_to_s
+  end
+  it "should not divide by zero or nil when determining the probability of a feature vector for a class" do
+    @nbayes.occurrences_of_all_features_in_class(:interesting).should be_nil
+    probability_computation = lambda {@nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)}
+    probability_computation.should_not raise_error
+  end
+  it "should compute the probability that a given (singular) feature vector belongs to a given class" do
+    @nbayes.add_document(:interesting, @feature_vectors)
+    probablity = @nbayes.probability_of_vector_for_class(@test_vectors.first, :interesting)
+    probablity.should be_a Float
+    probablity.round.should == -1
+  end
+  it "should compute the probability that a given set of feature vectors belongs to a given class" do
+    add_interesting_docs
+    probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting)
+    probability.should be_a(Float)
+    probability.round.should == -5
+  end
+  it "should compute a probability of a class for a set of vectors normalized by the number of features" do
+    add_interesting_docs
+    probability = @nbayes.probability_of_vectors_for_class(@test_vectors, :interesting, :normalize => true)
+    probability.should be_a Float
+    probability.round.should == -1
+  end
+  it "should determine the most likely class of a set feature vectors" do
+    add_interesting_docs
+    add_boring_docs
+    test_vectors = Document.new("some interesting words").feature_vectors
+    classification = @nbayes.classify(test_vectors, :normalize_classes => false)
+    classification.last.should == :interesting
+    classification.first.should be_a Float
+    classification.first.round.should == -2
+  end
+  it "should account for the relative probabilities of classes by default when classifying" do
+    add_interesting_docs
+    add_boring_docs
+    vectors = Document.new("some interesting words").feature_vectors
+    classification = @nbayes.classify(vectors)
+    classification.last.should == :interesting
+    classification.first.should be_a Float
+    classification.first.round.should == -2
+  end
+end

data/spec/unit/svm_spec.rb ADDED Viewed

@@ -0,0 +1,83 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe Svm do
+  before(:each) do
+    @svm = Svm.new
+    @doc = Document.new("here are some interesting words", :interesting)
+    @feature_vectors = @doc.feature_vectors
+    @negative_doc = Document.new("let's eat some spamiagra", :uninteresting)
+    @negative_fvs = @negative_doc.feature_vectors
+  end
+  def add_simple_docs_to_svm
+    @svm.add_document(:interesting, @feature_vectors)
+    @svm.add_document(:uninteresting, @negative_fvs)
+  end
+  it "should list document classes it knows about" do
+    add_simple_docs_to_svm
+    @svm.classes.sort_to_s.should == [:interesting, :uninteresting].sort_to_s
+  end
+  it "should express classes as SVM friendly unit integer labels"do
+    add_simple_docs_to_svm
+    spam_doc = Document.new("make your junk repulsive to women with free v14gra")
+    @svm.add_document(:uninteresting, spam_doc.feature_vectors)
+    @svm.class_labels.should == {:interesting => 0, :uninteresting => 1}
+  end
+  it "should create a feature dictionary based on training docs" do
+    add_simple_docs_to_svm
+    expected = (@feature_vectors + @negative_fvs).map { |fv| fv.name }.uniq
+    @svm.feature_dictionary.should == expected
+  end
+  it "should express documents as SVM friendly vectors using the binary method" do
+    # for a brief but usable description of binary, frequency, tf-idf, and
+    # Hadamard vector representations of documents, see section 2.1 (2nd page) of:
+    # http://jmlr.csail.mit.edu/papers/volume2/manevitz01a/manevitz01a.pdf
+    add_simple_docs_to_svm
+    @svm.vectorized_docs(:interesting).first.should == [1,1,1,1,1,0,0,0]
+    @svm.vectorized_docs(:uninteresting).first.should == [0,0,0,0,1,1,1,1]
+  end
+  it "should set SVM parameters to reasonable defaults and allow access via a block" do
+    @svm.parameters do |params|
+      params.C.should == 100
+      params.svm_type.should == NU_SVC
+      params.degree.should == 1
+      params.coef0.should == 0
+      params.eps.should == 0.001
+      params.kernel_type.should == RBF
+    end
+  end
+  it "should construct the list of labels and document feature vectors" do
+    add_simple_docs_to_svm
+    result = @svm.labels_and_document_vectors
+    result[:labels].sort.should == [0,1]
+    result[:features].sort.should == [[0,0,0,0,1,1,1,1], [1,1,1,1,1,0,0,0]]
+    # can't count on consistent Hash#each ordering, hence this:
+    expected_result_with_hash_ordering_workaround = {1 => [0,0,0,0,1,1,1,1], 0 => [1,1,1,1,1,0,0,0]}
+    stabilized_actual_result = {result[:labels].first => result[:features].first,
+      result[:labels].last => result[:features].last}
+    stabilized_actual_result.should == expected_result_with_hash_ordering_workaround
+  end
+  it "should classify unlabeled documents" do
+    # examples from http://www.igvita.com/2008/01/07/support-vector-machines-svm-in-ruby/
+    non_spam_texts = ["Peter and Stewie are hilarious", "New episode rocks, Peter and Stewie are hilarious",
+      "Peter is my fav!"]
+    spam_texts = ["FREE NATIONAL TREASURE", "FREE TV for EVERY visitor", "AS SEEN ON NATIONAL TV",
+      "FREE drugs"]
+    non_spam_texts.each { |t| @svm.add_document(:nonspam, Document.new(t, :nonspam).feature_vectors) }
+    spam_texts.each { |t| @svm.add_document(:spam, Document.new(t, :spam).feature_vectors) }
+    test_non_spams = ["Stewie is hilarious", "Poor Peter is hilarious"]
+    test_spam = "FREE lotterry for the NATIONAL TREASURE !!!"
+    @svm.classify(Document.new(test_non_spams.first).feature_vectors).should == :nonspam
+    @svm.classify(Document.new(test_non_spams.last).feature_vectors).should == :nonspam
+    @svm.classify(Document.new(test_spam).feature_vectors).should == :spam
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,115 @@
+--- !ruby/object:Gem::Specification
+name: rjspotter-basset
+version: !ruby/object:Gem::Version
+  version: 1.0.5
+platform: ruby
+authors:
+- Paul Dix
+- Bryan Helmkamp
+- Daniel DeLeo
+- R. Potter
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-05-09 00:00:00 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stemmer
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.0.1
+    version:
+- !ruby/object:Gem::Dependency
+  name: tomz-libsvm-ruby-swig
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.3.3
+    version:
+- !ruby/object:Gem::Dependency
+  name: igrigorik-bloomfilter
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.2
+    version:
+description: A library for machine learning and classification
+email: rjspotter@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- README.rdoc
+files:
+- History.txt
+- License.txt
+- Manifest.txt
+- README.rdoc
+- Rakefile
+- VERSION.yml
+- basset.gemspec
+- examples/example.rb
+- lib/basset.rb
+- lib/basset/classification_evaluator.rb
+- lib/basset/classifier.rb
+- lib/basset/core_extensions.rb
+- lib/basset/document.rb
+- lib/basset/document_override_example.rb
+- lib/basset/feature.rb
+- lib/basset/feature_extractor.rb
+- lib/basset/feature_selector.rb
+- lib/basset/naive_bayes.rb
+- lib/basset/svm.rb
+- lib/basset/yaml_serialization.rb
+- spec/spec.opts
+- spec/spec_helper.rb
+- spec/unit/classifier_spec.rb
+- spec/unit/core_extension_spec.rb
+- spec/unit/document_spec.rb
+- spec/unit/feature_extractor_spec.rb
+- spec/unit/feature_selector_spec.rb
+- spec/unit/feature_spec.rb
+- spec/unit/naive_bayes_spec.rb
+- spec/unit/svm_spec.rb
+has_rdoc: false
+homepage: http://github.com/danielsdeleo/basset
+post_install_message:
+rdoc_options:
+- --inline-source
+- --charset=UTF-8
+require_paths:
+- - lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 3
+summary: A library for machine learning and classification
+test_files: []