ankusa 0.0.16 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ docs
2
+ pkg
@@ -0,0 +1 @@
1
+ ankusa
@@ -0,0 +1 @@
1
+ ruby-1.9.3
data/Gemfile CHANGED
@@ -2,5 +2,3 @@ source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in ankusa.gemspec
4
4
  gemspec
5
-
6
- gem 'rake'
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ankusa (0.0.16)
4
+ ankusa (0.1.0)
5
5
  fast-stemmer (>= 1.0.0)
6
6
 
7
7
  GEM
@@ -1,4 +1,6 @@
1
1
  = ankusa
2
+ {<img src="https://secure.travis-ci.org/bmuller/ankusa.png?branch=master" alt="Build Status" />}[https://travis-ci.org/bmuller/ankusa]
3
+ {<img src="https://gemnasium.com/bmuller/ankusa.png" alt="Dependency Status" />}[https://gemnasium.com/bmuller/ankusa]
2
4
 
3
5
  Ankusa is a text classifier in Ruby that can use Hadoop's HBase, Mongo, or Cassandra for storage. Because it uses HBase/Mongo/Cassandra as a backend, the training corpus can be many terabytes in size (though in-memory and single-file storage options also exist for smaller corpora).
4
6
 
@@ -18,7 +20,6 @@ If you're using HBase, make sure the HBase Thrift interface has been started as
18
20
  == Basic Usage
19
21
  Using the naive Bayes classifier:
20
22
 
21
- require 'rubygems'
22
23
  require 'ankusa'
23
24
  require 'ankusa/hbase_storage'
24
25
 
data/Rakefile CHANGED
@@ -5,6 +5,8 @@ require 'rake/alt_system'
5
5
  require 'rake/testtask'
6
6
  require 'rdoc/task'
7
7
 
8
+ task :default => [:test_filesystem]
9
+
8
10
  Bundler::GemHelper.install_tasks
9
11
 
10
12
  desc "Create documentation"
@@ -0,0 +1,19 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+ require "ankusa/version"
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "ankusa"
6
+ s.version = Ankusa::VERSION
7
+ s.authors = ["Brian Muller"]
8
+ s.description = "Text classifier with HBase, Cassandra, or Mongo storage"
9
+ s.summary = "Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for storage"
10
+ s.email = "bamuller@gmail.com"
11
+ s.files = `git ls-files`.split($/)
12
+ s.test_files = s.files.grep(%r{^(test|spec|features)/})
13
+ s.homepage = "https://github.com/bmuller/ankusa"
14
+ s.require_paths = ["lib"]
15
+ s.add_dependency('fast-stemmer', '>= 1.0.0')
16
+ s.add_development_dependency("rake")
17
+ s.requirements << "Either hbaserb >= 0.0.3 or cassandra >= 0.7"
18
+ s.rubyforge_project = "ankusa"
19
+ end
@@ -1,13 +1,9 @@
1
- require 'iconv'
2
-
3
1
  class String
4
2
  def numeric?
5
3
  true if Float(self) rescue false
6
4
  end
7
5
 
8
6
  def to_ascii
9
- # from http://www.jroller.com/obie/tags/unicode
10
- converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
11
- converter.iconv(self).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
7
+ encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "").force_encoding('UTF-8') rescue ""
12
8
  end
13
9
  end
@@ -1,3 +1,3 @@
1
1
  module Ankusa
2
- VERSION = "0.0.16"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -0,0 +1,19 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/cassandra_storage'
3
+
4
+ module CassandraClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::CassandraStorage.new CONFIG['cassandra_host'], CONFIG['cassandra_port'], "ankusa_test"
7
+ super(name)
8
+ end
9
+ end
10
+
11
+ class NBClassifierTest < Test::Unit::TestCase
12
+ include CassandraClassifierBase
13
+ include NBClassifierBase
14
+ end
15
+
16
+ class KLClassifierTest < Test::Unit::TestCase
17
+ include CassandraClassifierBase
18
+ include KLClassifierBase
19
+ end
@@ -0,0 +1,130 @@
1
+ require File.join File.dirname(__FILE__), 'helper'
2
+
3
+ module ClassifierBase
4
+ def train
5
+ @classifier.train :spam, "spam and great spam" # spam:2 great:1
6
+ @classifier.train :good, "words for processing" # word:1 process:1
7
+ @classifier.train :good, "good word" # word:1 good:1
8
+ end
9
+
10
+ def test_train
11
+ counts = @storage.get_word_counts(:spam)
12
+ assert_equal counts[:spam], 2
13
+ counts = @storage.get_word_counts(:word)
14
+ assert_equal counts[:good], 2
15
+ assert_equal @storage.get_total_word_count(:good), 4
16
+ assert_equal @storage.get_doc_count(:good), 2
17
+ assert_equal @storage.get_total_word_count(:spam), 3
18
+ assert_equal @storage.get_doc_count(:spam), 1
19
+ totals = @storage.doc_count_totals
20
+ assert_equal totals.values.inject { |x,y| x+y }, 3
21
+ assert_equal totals[:spam], 1
22
+ assert_equal totals[:good], 2
23
+
24
+ vocab = @storage.get_vocabulary_sizes
25
+ assert_equal vocab[:spam], 2
26
+ assert_equal vocab[:good], 3
27
+ end
28
+
29
+ def teardown
30
+ @storage.drop_tables
31
+ @storage.close
32
+ end
33
+ end
34
+
35
+
36
+ module NBClassifierBase
37
+ include ClassifierBase
38
+
39
+ def setup
40
+ @classifier = Ankusa::NaiveBayesClassifier.new @storage
41
+ train
42
+ end
43
+
44
+ def test_untrained
45
+ @storage.reset
46
+
47
+ string = "spam is tastey"
48
+
49
+ hash = {:spam => 0, :good => 0}
50
+ assert_equal hash, @classifier.classifications(string)
51
+ assert_equal nil, @classifier.classify(string)
52
+ end
53
+
54
+
55
+ def test_probs
56
+ spamlog = Math.log(3.0 / 5.0) + Math.log(1.0 / 5.0) + Math.log(2.0 / 5.0)
57
+ goodlog = Math.log(1.0 / 7.0) + Math.log(1.0 / 7.0) + Math.log(3.0 / 5.0)
58
+
59
+ # exponentiate
60
+ spamex = Math.exp(spamlog)
61
+ goodex = Math.exp(goodlog)
62
+
63
+ # normalize
64
+ spam = spamex / (spamex + goodex)
65
+ good = goodex / (spamex + goodex)
66
+
67
+ cs = @classifier.classifications("spam is tastey")
68
+ assert_equal cs[:spam], spam
69
+ assert_equal cs[:good], good
70
+
71
+ cs = @classifier.log_likelihoods("spam is tastey")
72
+ assert_equal cs[:spam], spamlog
73
+ assert_equal cs[:good], goodlog
74
+
75
+ @classifier.train :somethingelse, "this is something else entirely spam"
76
+ cs = @classifier.classifications("spam is tastey", [:spam, :good])
77
+ assert_equal cs[:spam], spam
78
+ assert_equal cs[:good], good
79
+
80
+ # test for class we didn't train on
81
+ cs = @classifier.classifications("spam is super tastey if you are a zombie", [:spam, :nothing])
82
+ assert_equal cs[:nothing], 0
83
+ end
84
+
85
+ def test_prob_result
86
+ cs = @classifier.classifications("spam is tastey").sort_by { |c| -c[1] }.first.first
87
+ klass = @classifier.classify("spam is tastey")
88
+ assert_equal cs, klass
89
+ assert_equal klass, :spam
90
+ end
91
+ end
92
+
93
+
94
+ module KLClassifierBase
95
+ include ClassifierBase
96
+
97
+ def setup
98
+ @classifier = Ankusa::KLDivergenceClassifier.new @storage
99
+ train
100
+ end
101
+
102
+ def test_distances
103
+ ds = @classifier.distances("spam is tastey")
104
+ thprob_spam = 1.0 / 2.0
105
+ thprob_tastey = 1.0 / 2.0
106
+
107
+ train_prob_spam = (2 + 1).to_f / (3 + 2).to_f
108
+ train_prob_tastey = (0 + 1).to_f / (3 + 2).to_f
109
+ dist = thprob_spam * Math.log(thprob_spam / train_prob_spam)
110
+ dist += thprob_tastey * Math.log(thprob_tastey / train_prob_tastey)
111
+ assert_equal ds[:spam], dist
112
+
113
+ train_prob_spam = 1.0 / (4 + 3).to_f
114
+ train_prob_tastey = 1.0 / (4 + 3).to_f
115
+ dist = thprob_spam * Math.log(thprob_spam / train_prob_spam)
116
+ dist += thprob_tastey * Math.log(thprob_tastey / train_prob_tastey)
117
+ assert_equal ds[:good], dist
118
+ end
119
+
120
+ def test_distances_result
121
+ cs = @classifier.distances("spam is tastey").sort_by { |c| c[1] }.first.first
122
+ klass = @classifier.classify("spam is tastey")
123
+ assert_equal cs, klass
124
+ assert_equal klass, :spam
125
+
126
+ # assert distance from class we didn't train with is Infinity (1.0/0.0 is a way to get at Infinity)
127
+ cs = @classifier.distances("spam is tastey", [:spam, :nothing])
128
+ assert_equal cs[:nothing], (1.0/0.0)
129
+ end
130
+ end
@@ -0,0 +1,7 @@
1
+ hbase_host: 127.0.0.1
2
+ hbase_port: 9090
3
+ cassandra_host: 127.0.0.1
4
+ cassandra_port: 9160
5
+ mongo_db_host: 127.0.0.1
6
+ mongo_db_port: 27017
7
+ file_system_storage_file: training.anuska
@@ -0,0 +1,27 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/file_system_storage'
3
+
4
+ module FileSystemClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::FileSystemStorage.new CONFIG['file_system_storage_file']
7
+ super name
8
+ end
9
+
10
+ def test_storage
11
+ # train has already been called by the setup method; save the storage, reload it from the configured file, and re-run the training assertions
12
+ @storage.save
13
+ @storage = Ankusa::FileSystemStorage.new CONFIG['file_system_storage_file']
14
+ test_train
15
+ end
16
+ end
17
+
18
+ class NBMemoryClassifierTest < Test::Unit::TestCase
19
+ include FileSystemClassifierBase
20
+ include NBClassifierBase
21
+ end
22
+
23
+
24
+ class KLMemoryClassifierTest < Test::Unit::TestCase
25
+ include FileSystemClassifierBase
26
+ include KLClassifierBase
27
+ end
@@ -0,0 +1,25 @@
1
+ require File.join File.dirname(__FILE__), 'helper'
2
+
3
+ class HasherTest < Test::Unit::TestCase
4
+ def setup
5
+ string = "Words word a the at fish fishing fishes? /^/ The at a of! @#$!"
6
+ @text_hash = Ankusa::TextHash.new string
7
+ @array = Ankusa::TextHash.new [string]
8
+ end
9
+
10
+ def test_stemming
11
+ assert_equal @text_hash.length, 2
12
+ assert_equal @text_hash.word_count, 5
13
+
14
+ assert_equal @array.length, 2
15
+ assert_equal @array.word_count, 5
16
+ end
17
+
18
+ def test_valid_word
19
+ assert (not Ankusa::TextHash.valid_word? "accordingly")
20
+ assert (not Ankusa::TextHash.valid_word? "appropriate")
21
+ assert Ankusa::TextHash.valid_word? "^*&@"
22
+ assert Ankusa::TextHash.valid_word? "mother"
23
+ assert (not Ankusa::TextHash.valid_word? "21675")
24
+ end
25
+ end
@@ -0,0 +1,23 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/hbase_storage'
3
+
4
+ module HBaseClassifierBase
5
+ def initialize(name)
6
+ @freq_tablename = "ankusa_word_frequencies_test"
7
+ @sum_tablename = "ankusa_summary_test"
8
+ @storage = Ankusa::HBaseStorage.new CONFIG['hbase_host'], CONFIG['hbase_port'], @freq_tablename, @sum_tablename
9
+ @freq_table = @storage.hbase.get_table(@freq_tablename)
10
+ @sum_table = @storage.hbase.get_table(@sum_tablename)
11
+ super(name)
12
+ end
13
+ end
14
+
15
+ class NBClassifierTest < Test::Unit::TestCase
16
+ include HBaseClassifierBase
17
+ include NBClassifierBase
18
+ end
19
+
20
+ class KLClassifierTest < Test::Unit::TestCase
21
+ include HBaseClassifierBase
22
+ include KLClassifierBase
23
+ end
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'yaml'
4
+
5
+ $:.unshift(File.join File.dirname(__FILE__), '..', 'lib')
6
+ require 'ankusa'
7
+
8
+ CONFIG = YAML.load_file File.join(File.dirname(__FILE__), "config.yml")
@@ -0,0 +1,20 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/memory_storage'
3
+
4
+ module MemoryClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::MemoryStorage.new
7
+ super name
8
+ end
9
+ end
10
+
11
+ class NBMemoryClassifierTest < Test::Unit::TestCase
12
+ include MemoryClassifierBase
13
+ include NBClassifierBase
14
+ end
15
+
16
+
17
+ class KLMemoryClassifierTest < Test::Unit::TestCase
18
+ include MemoryClassifierBase
19
+ include KLClassifierBase
20
+ end
@@ -0,0 +1,21 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/mongo_db_storage'
3
+
4
+ module MongoDbClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::MongoDbStorage.new :host => CONFIG['mongo_db_host'], :port => CONFIG['mongo_db_port'],
7
+ :username => CONFIG['mongo_db_username'], :password => CONFIG['mongo_db_password'],
8
+ :db => 'ankusa-test'
9
+ super(name)
10
+ end
11
+ end
12
+
13
+ class NBClassifierTest < Test::Unit::TestCase
14
+ include MongoDbClassifierBase
15
+ include NBClassifierBase
16
+ end
17
+
18
+ class KLClassifierTest < Test::Unit::TestCase
19
+ include MongoDbClassifierBase
20
+ include KLClassifierBase
21
+ end
metadata CHANGED
@@ -1,37 +1,64 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: ankusa
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
4
5
  prerelease:
5
- version: 0.0.16
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Brian Muller
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2012-11-30 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
12
+ date: 2013-06-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
16
15
  name: fast-stemmer
17
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
18
17
  none: false
19
- requirements:
20
- - - ">="
21
- - !ruby/object:Gem::Version
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
22
21
  version: 1.0.0
23
22
  type: :runtime
24
23
  prerelease: false
25
- version_requirements: *id001
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.0.0
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
26
46
  description: Text classifier with HBase, Cassandra, or Mongo storage
27
47
  email: bamuller@gmail.com
28
48
  executables: []
29
-
30
49
  extensions: []
31
-
32
50
  extra_rdoc_files: []
33
-
34
- files:
51
+ files:
52
+ - .gitignore
53
+ - .ruby-gemset
54
+ - .ruby-version
55
+ - Gemfile
56
+ - Gemfile.lock
57
+ - LICENSE
58
+ - README.rdoc
59
+ - Rakefile
60
+ - ankusa.gemspec
61
+ - lib/ankusa.rb
35
62
  - lib/ankusa/cassandra_storage.rb
36
63
  - lib/ankusa/classifier.rb
37
64
  - lib/ankusa/extensions.rb
@@ -44,44 +71,54 @@ files:
44
71
  - lib/ankusa/naive_bayes.rb
45
72
  - lib/ankusa/stopwords.rb
46
73
  - lib/ankusa/version.rb
47
- - lib/ankusa.rb
48
- - Gemfile
49
- - Gemfile.lock
50
- - LICENSE
51
- - Rakefile
52
- - README.rdoc
74
+ - test/cassandra_classifier_test.rb
75
+ - test/classifier_base.rb
76
+ - test/config.yml
77
+ - test/file_system_classifier_test.rb
78
+ - test/hasher_test.rb
79
+ - test/hbase_classifier_test.rb
80
+ - test/helper.rb
81
+ - test/memory_classifier_test.rb
82
+ - test/mongo_db_classifier_test.rb
53
83
  homepage: https://github.com/bmuller/ankusa
54
84
  licenses: []
55
-
56
85
  post_install_message:
57
86
  rdoc_options: []
58
-
59
- require_paths:
87
+ require_paths:
60
88
  - lib
61
- required_ruby_version: !ruby/object:Gem::Requirement
89
+ required_ruby_version: !ruby/object:Gem::Requirement
62
90
  none: false
63
- requirements:
64
- - - ">="
65
- - !ruby/object:Gem::Version
66
- hash: 2264057684891862333
67
- segments:
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ segments:
68
96
  - 0
69
- version: "0"
70
- required_rubygems_version: !ruby/object:Gem::Requirement
97
+ hash: 3381126087859790337
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
99
  none: false
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- hash: 2264057684891862333
76
- segments:
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ segments:
77
105
  - 0
78
- version: "0"
79
- requirements:
106
+ hash: 3381126087859790337
107
+ requirements:
80
108
  - Either hbaserb >= 0.0.3 or cassandra >= 0.7
81
109
  rubyforge_project: ankusa
82
- rubygems_version: 1.8.24
110
+ rubygems_version: 1.8.25
83
111
  signing_key:
84
112
  specification_version: 3
85
- summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for storage
86
- test_files: []
87
-
113
+ summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for
114
+ storage
115
+ test_files:
116
+ - test/cassandra_classifier_test.rb
117
+ - test/classifier_base.rb
118
+ - test/config.yml
119
+ - test/file_system_classifier_test.rb
120
+ - test/hasher_test.rb
121
+ - test/hbase_classifier_test.rb
122
+ - test/helper.rb
123
+ - test/memory_classifier_test.rb
124
+ - test/mongo_db_classifier_test.rb