RubyGems - ankusa - Versions diffs - 0.0.16 → 0.1.0 - Mend

ankusa 0.0.16 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

data/.gitignore +2 -0
data/.ruby-gemset +1 -0
data/.ruby-version +1 -0
data/Gemfile +0 -2
data/Gemfile.lock +1 -1
data/README.rdoc +2 -1
data/Rakefile +2 -0
data/ankusa.gemspec +19 -0
data/lib/ankusa/extensions.rb +1 -5
data/lib/ankusa/version.rb +1 -1
data/test/cassandra_classifier_test.rb +19 -0
data/test/classifier_base.rb +130 -0
data/test/config.yml +7 -0
data/test/file_system_classifier_test.rb +27 -0
data/test/hasher_test.rb +25 -0
data/test/hbase_classifier_test.rb +23 -0
data/test/helper.rb +8 -0
data/test/memory_classifier_test.rb +20 -0
data/test/mongo_db_classifier_test.rb +21 -0
metadata +82 -45

data/.gitignore ADDED

	@@ -0,0 +1,2 @@
1	+ docs
2	+ pkg

data/.ruby-gemset ADDED

	@@ -0,0 +1 @@
1	+ ankusa

data/.ruby-version ADDED

	@@ -0,0 +1 @@
1	+ ruby-1.9.3

data/Gemfile CHANGED

@@ -2,5 +2,3 @@ source "http://rubygems.org"
 # Specify your gem's dependencies in bandit.gemspec
 gemspec
-gem 'rake'

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ankusa (0.0.16)
+    ankusa (0.1.0)
       fast-stemmer (>= 1.0.0)
 GEM

data/README.rdoc CHANGED

@@ -1,4 +1,6 @@
 = ankusa
+{<img src="https://secure.travis-ci.org/bmuller/ankusa.png?branch=master" alt="Build Status" />}[https://travis-ci.org/bmuller/ankusa]
+{<img src="https://gemnasium.com/bmuller/ankusa.png" alt="Dependency Status" />}[https://gemnasium.com/bmuller/ankusa]
 Ankusa is a text classifier in Ruby that can use either Hadoop's HBase, Mongo, or Cassandra for storage.  Because it uses HBase/Mongo/Cassandra as a backend, the training corpus can be many terabytes in size (though additional memory and single file storage abilities also exist for smaller corpora).
@@ -18,7 +20,6 @@ If you're using HBase, make sure the HBase Thrift interface has been started as
 == Basic Usage
 Using the naive Bayes classifier:
-  require 'rubygems'
   require 'ankusa'
   require 'ankusa/hbase_storage'

data/Rakefile CHANGED

@@ -5,6 +5,8 @@ require 'rake/alt_system'
 require 'rake/testtask'
 require 'rdoc/task'
+task :default => [:test_filesystem]
 Bundler::GemHelper.install_tasks
 desc "Create documentation"

data/ankusa.gemspec ADDED

@@ -0,0 +1,19 @@
+$:.push File.expand_path("../lib", __FILE__)
+require "ankusa/version"
+Gem::Specification.new do |s|
+  s.name = "ankusa"
+  s.version = Ankusa::VERSION
+  s.authors = ["Brian Muller"]
+  s.description = "Text classifier with HBase, Cassandra, or Mongo storage"
+  s.summary = "Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for storage"
+  s.email = "bamuller@gmail.com"
+  s.files = `git ls-files`.split($/)
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+  s.homepage = "https://github.com/bmuller/ankusa"
+  s.require_paths = ["lib"]
+  s.add_dependency('fast-stemmer', '>= 1.0.0')
+  s.add_development_dependency("rake")
+  s.requirements << "Either hbaserb >= 0.0.3 or cassandra >= 0.7"
+  s.rubyforge_project = "ankusa"
+end

data/lib/ankusa/extensions.rb CHANGED

@@ -1,13 +1,9 @@
-require 'iconv'
 class String
   def numeric?
     true if Float(self) rescue false
   end
   def to_ascii
-    # from http://www.jroller.com/obie/tags/unicode
-    converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
-    converter.iconv(self).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
+    encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "").force_encoding('UTF-8') rescue ""
   end
 end

data/lib/ankusa/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Ankusa
-  VERSION = "0.0.16"
+  VERSION = "0.1.0"
 end

data/test/cassandra_classifier_test.rb ADDED

@@ -0,0 +1,19 @@
+require File.join File.dirname(__FILE__), 'classifier_base'
+require 'ankusa/cassandra_storage'
+module CassandraClassifierBase
+  def initialize(name)
+    @storage = Ankusa::CassandraStorage.new CONFIG['cassandra_host'], CONFIG['cassandra_port'], "ankusa_test"
+    super(name)
+  end
+end
+class NBClassifierTest < Test::Unit::TestCase
+  include CassandraClassifierBase
+  include NBClassifierBase
+end
+class KLClassifierTest < Test::Unit::TestCase
+  include CassandraClassifierBase
+  include KLClassifierBase
+end

data/test/classifier_base.rb ADDED

@@ -0,0 +1,130 @@
+require File.join File.dirname(__FILE__), 'helper'
+module ClassifierBase
+  def train
+    @classifier.train :spam, "spam and great spam"   # spam:2 great:1
+    @classifier.train :good, "words for processing" # word:1 process:1
+    @classifier.train :good, "good word"            # word:1 good:1
+  end
+  def test_train
+    counts = @storage.get_word_counts(:spam)
+    assert_equal counts[:spam], 2
+    counts = @storage.get_word_counts(:word)
+    assert_equal counts[:good], 2
+    assert_equal @storage.get_total_word_count(:good), 4
+    assert_equal @storage.get_doc_count(:good), 2
+    assert_equal @storage.get_total_word_count(:spam), 3
+    assert_equal @storage.get_doc_count(:spam), 1
+    totals = @storage.doc_count_totals
+    assert_equal totals.values.inject { |x,y| x+y }, 3
+    assert_equal totals[:spam], 1
+    assert_equal totals[:good], 2
+    vocab = @storage.get_vocabulary_sizes
+    assert_equal vocab[:spam], 2
+    assert_equal vocab[:good], 3
+  end
+  def teardown
+    @storage.drop_tables
+    @storage.close
+  end
+end
+module NBClassifierBase
+  include ClassifierBase
+  def setup
+    @classifier = Ankusa::NaiveBayesClassifier.new @storage
+    train
+  end
+  def test_untrained
+    @storage.reset
+    string = "spam is tastey"
+    hash = {:spam => 0, :good => 0}
+    assert_equal hash, @classifier.classifications(string)
+    assert_equal nil, @classifier.classify(string)
+  end
+  def test_probs
+    spamlog = Math.log(3.0 / 5.0) + Math.log(1.0 / 5.0) + Math.log(2.0 / 5.0)
+    goodlog = Math.log(1.0 / 7.0) + Math.log(1.0 / 7.0) + Math.log(3.0 / 5.0)
+    # exponentiate
+    spamex = Math.exp(spamlog)
+    goodex = Math.exp(goodlog)
+    # normalize
+    spam = spamex / (spamex + goodex)
+    good = goodex / (spamex + goodex)
+    cs = @classifier.classifications("spam is tastey")
+    assert_equal cs[:spam], spam
+    assert_equal cs[:good], good
+    cs = @classifier.log_likelihoods("spam is tastey")
+    assert_equal cs[:spam], spamlog
+    assert_equal cs[:good], goodlog
+    @classifier.train :somethingelse, "this is something else entirely spam"
+    cs = @classifier.classifications("spam is tastey", [:spam, :good])
+    assert_equal cs[:spam], spam
+    assert_equal cs[:good], good
+    # test for class we didn't train on
+    cs = @classifier.classifications("spam is super tastey if you are a zombie", [:spam, :nothing])
+    assert_equal cs[:nothing], 0
+  end
+  def test_prob_result
+    cs = @classifier.classifications("spam is tastey").sort_by { |c| -c[1] }.first.first
+    klass = @classifier.classify("spam is tastey")
+    assert_equal cs, klass
+    assert_equal klass, :spam
+  end
+end
+module KLClassifierBase
+  include ClassifierBase
+  def setup
+    @classifier = Ankusa::KLDivergenceClassifier.new @storage
+    train
+  end
+  def test_distances
+    ds = @classifier.distances("spam is tastey")
+    thprob_spam = 1.0 / 2.0
+    thprob_tastey = 1.0 / 2.0
+    train_prob_spam = (2 + 1).to_f / (3 + 2).to_f
+    train_prob_tastey = (0 + 1).to_f / (3 + 2).to_f
+    dist = thprob_spam * Math.log(thprob_spam / train_prob_spam)
+    dist += thprob_tastey * Math.log(thprob_tastey / train_prob_tastey)
+    assert_equal ds[:spam], dist
+    train_prob_spam = 1.0 / (4 + 3).to_f
+    train_prob_tastey = 1.0 / (4 + 3).to_f
+    dist = thprob_spam * Math.log(thprob_spam / train_prob_spam)
+    dist += thprob_tastey * Math.log(thprob_tastey / train_prob_tastey)
+    assert_equal ds[:good], dist
+  end
+  def test_distances_result
+    cs = @classifier.distances("spam is tastey").sort_by { |c| c[1] }.first.first
+    klass = @classifier.classify("spam is tastey")
+    assert_equal cs, klass
+    assert_equal klass, :spam
+    # assert distance from class we didn't train with is Infinity (1.0/0.0 is a way to get at Infinity)
+    cs = @classifier.distances("spam is tastey", [:spam, :nothing])
+    assert_equal cs[:nothing], (1.0/0.0)
+  end
+end

data/test/config.yml ADDED

@@ -0,0 +1,7 @@
+hbase_host: 127.0.0.1
+hbase_port: 9090
+cassandra_host: 127.0.0.1
+cassandra_port: 9160
+mongo_db_host: 127.0.0.1
+mongo_db_port: 27017
+file_system_storage_file: training.anuska

data/test/file_system_classifier_test.rb ADDED

@@ -0,0 +1,27 @@
+require File.join File.dirname(__FILE__), 'classifier_base'
+require 'ankusa/file_system_storage'
+module FileSystemClassifierBase
+  def initialize(name)
+    @storage = Ankusa::FileSystemStorage.new CONFIG['file_system_storage_file']
+    super name
+  end
+  def test_storage
+    # train will be called in setup method, now reload storage and test training
+    @storage.save
+    @storage = Ankusa::FileSystemStorage.new CONFIG['file_system_storage_file']
+    test_train
+  end
+end
+class NBMemoryClassifierTest < Test::Unit::TestCase
+  include FileSystemClassifierBase
+  include NBClassifierBase
+end
+class KLMemoryClassifierTest < Test::Unit::TestCase
+  include FileSystemClassifierBase
+  include KLClassifierBase
+end

data/test/hasher_test.rb ADDED

@@ -0,0 +1,25 @@
+require File.join File.dirname(__FILE__), 'helper'
+class HasherTest < Test::Unit::TestCase
+  def setup
+    string = "Words word a the at fish fishing fishes? /^/  The at a of! @#$!"
+    @text_hash = Ankusa::TextHash.new string
+    @array = Ankusa::TextHash.new [string]
+  end
+  def test_stemming
+    assert_equal @text_hash.length, 2
+    assert_equal @text_hash.word_count, 5
+    assert_equal @array.length, 2
+    assert_equal @array.word_count, 5
+  end
+  def test_valid_word
+    assert (not Ankusa::TextHash.valid_word? "accordingly")
+    assert (not Ankusa::TextHash.valid_word? "appropriate")
+    assert Ankusa::TextHash.valid_word? "^*&@"
+    assert Ankusa::TextHash.valid_word? "mother"
+    assert (not Ankusa::TextHash.valid_word? "21675")
+  end
+end

data/test/hbase_classifier_test.rb ADDED

@@ -0,0 +1,23 @@
+require File.join File.dirname(__FILE__), 'classifier_base'
+require 'ankusa/hbase_storage'
+module HBaseClassifierBase
+  def initialize(name)
+    @freq_tablename = "ankusa_word_frequencies_test"
+    @sum_tablename = "ankusa_summary_test"
+    @storage = Ankusa::HBaseStorage.new CONFIG['hbase_host'], CONFIG['hbase_port'], @freq_tablename, @sum_tablename
+    @freq_table = @storage.hbase.get_table(@freq_tablename)
+    @sum_table = @storage.hbase.get_table(@sum_tablename)
+    super(name)
+  end
+end
+class NBClassifierTest < Test::Unit::TestCase
+  include HBaseClassifierBase
+  include NBClassifierBase
+end
+class KLClassifierTest < Test::Unit::TestCase
+  include HBaseClassifierBase
+  include KLClassifierBase
+end

data/test/helper.rb ADDED

@@ -0,0 +1,8 @@
+require 'rubygems'
+require 'test/unit'
+require 'yaml'
+$:.unshift(File.join File.dirname(__FILE__), '..', 'lib')
+require 'ankusa'
+CONFIG = YAML.load_file File.join(File.dirname(__FILE__), "config.yml")

data/test/memory_classifier_test.rb ADDED

@@ -0,0 +1,20 @@
+require File.join File.dirname(__FILE__), 'classifier_base'
+require 'ankusa/memory_storage'
+module MemoryClassifierBase
+  def initialize(name)
+    @storage = Ankusa::MemoryStorage.new
+    super name
+  end
+end
+class NBMemoryClassifierTest < Test::Unit::TestCase
+  include MemoryClassifierBase
+  include NBClassifierBase
+end
+class KLMemoryClassifierTest < Test::Unit::TestCase
+  include MemoryClassifierBase
+  include KLClassifierBase
+end

data/test/mongo_db_classifier_test.rb ADDED

@@ -0,0 +1,21 @@
+require File.join File.dirname(__FILE__), 'classifier_base'
+require 'ankusa/mongo_db_storage'
+module MongoDbClassifierBase
+  def initialize(name)
+    @storage = Ankusa::MongoDbStorage.new :host => CONFIG['mongo_db_host'], :port => CONFIG['mongo_db_port'],
+                                          :username => CONFIG['mongo_db_username'], :password => CONFIG['mongo_db_password'],
+                                          :db => 'ankusa-test'
+    super(name)
+  end
+end
+class NBClassifierTest < Test::Unit::TestCase
+  include MongoDbClassifierBase
+  include NBClassifierBase
+end
+class KLClassifierTest < Test::Unit::TestCase
+  include MongoDbClassifierBase
+  include KLClassifierBase
+end

metadata CHANGED

@@ -1,37 +1,64 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: ankusa
-version: !ruby/object:Gem::Version
+version: !ruby/object:Gem::Version
+  version: 0.1.0
   prerelease:
-  version: 0.0.16
 platform: ruby
-authors:
+authors:
 - Brian Muller
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-11-30 00:00:00 Z
-dependencies:
-- !ruby/object:Gem::Dependency
+date: 2013-06-19 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
   name: fast-stemmer
-  requirement: &id001 !ruby/object:Gem::Requirement
+  requirement: !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
         version: 1.0.0
   type: :runtime
   prerelease: false
-  version_requirements: *id001
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.0.0
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Text classifier with HBase, Cassandra, or Mongo storage
 email: bamuller@gmail.com
 executables: []
 extensions: []
 extra_rdoc_files: []
-files:
+files:
+- .gitignore
+- .ruby-gemset
+- .ruby-version
+- Gemfile
+- Gemfile.lock
+- LICENSE
+- README.rdoc
+- Rakefile
+- ankusa.gemspec
+- lib/ankusa.rb
 - lib/ankusa/cassandra_storage.rb
 - lib/ankusa/classifier.rb
 - lib/ankusa/extensions.rb
@@ -44,44 +71,54 @@ files:
 - lib/ankusa/naive_bayes.rb
 - lib/ankusa/stopwords.rb
 - lib/ankusa/version.rb
-- lib/ankusa.rb
-- Gemfile
-- Gemfile.lock
-- LICENSE
-- Rakefile
-- README.rdoc
+- test/cassandra_classifier_test.rb
+- test/classifier_base.rb
+- test/config.yml
+- test/file_system_classifier_test.rb
+- test/hasher_test.rb
+- test/hbase_classifier_test.rb
+- test/helper.rb
+- test/memory_classifier_test.rb
+- test/mongo_db_classifier_test.rb
 homepage: https://github.com/bmuller/ankusa
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 2264057684891862333
-      segments:
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
       - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+      hash: 3381126087859790337
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 2264057684891862333
-      segments:
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
       - 0
-      version: "0"
-requirements:
+      hash: 3381126087859790337
+requirements:
 - Either hbaserb >= 0.0.3 or cassandra >= 0.7
 rubyforge_project: ankusa
-rubygems_version: 1.8.24
+rubygems_version: 1.8.25
 signing_key:
 specification_version: 3
-summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for storage
-test_files: []
+summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for
+  storage
+test_files:
+- test/cassandra_classifier_test.rb
+- test/classifier_base.rb
+- test/config.yml
+- test/file_system_classifier_test.rb
+- test/hasher_test.rb
+- test/hbase_classifier_test.rb
+- test/helper.rb
+- test/memory_classifier_test.rb
+- test/mongo_db_classifier_test.rb