ankusa 0.0.16 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2 @@
1
+ docs
2
+ pkg
@@ -0,0 +1 @@
1
+ ankusa
@@ -0,0 +1 @@
1
+ ruby-1.9.3
data/Gemfile CHANGED
@@ -2,5 +2,3 @@ source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in bandit.gemspec
4
4
  gemspec
5
-
6
- gem 'rake'
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ankusa (0.0.16)
4
+ ankusa (0.1.0)
5
5
  fast-stemmer (>= 1.0.0)
6
6
 
7
7
  GEM
@@ -1,4 +1,6 @@
1
1
  = ankusa
2
+ {<img src="https://secure.travis-ci.org/bmuller/ankusa.png?branch=master" alt="Build Status" />}[https://travis-ci.org/bmuller/ankusa]
3
+ {<img src="https://gemnasium.com/bmuller/ankusa.png" alt="Dependency Status" />}[https://gemnasium.com/bmuller/ankusa]
2
4
 
3
5
  Ankusa is a text classifier in Ruby that can use either Hadoop's HBase, Mongo, or Cassandra for storage. Because it uses HBase/Mongo/Cassandra as a backend, the training corpus can be many terabytes in size (though additional memory and single file storage abilities also exist for smaller corpora).
4
6
 
@@ -18,7 +20,6 @@ If you're using HBase, make sure the HBase Thrift interface has been started as
18
20
  == Basic Usage
19
21
  Using the naive Bayes classifier:
20
22
 
21
- require 'rubygems'
22
23
  require 'ankusa'
23
24
  require 'ankusa/hbase_storage'
24
25
 
data/Rakefile CHANGED
@@ -5,6 +5,8 @@ require 'rake/alt_system'
5
5
  require 'rake/testtask'
6
6
  require 'rdoc/task'
7
7
 
8
+ task :default => [:test_filesystem]
9
+
8
10
  Bundler::GemHelper.install_tasks
9
11
 
10
12
  desc "Create documentation"
@@ -0,0 +1,19 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+ require "ankusa/version"
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "ankusa"
6
+ s.version = Ankusa::VERSION
7
+ s.authors = ["Brian Muller"]
8
+ s.description = "Text classifier with HBase, Cassandra, or Mongo storage"
9
+ s.summary = "Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for storage"
10
+ s.email = "bamuller@gmail.com"
11
+ s.files = `git ls-files`.split($/)
12
+ s.test_files = s.files.grep(%r{^(test|spec|features)/})
13
+ s.homepage = "https://github.com/bmuller/ankusa"
14
+ s.require_paths = ["lib"]
15
+ s.add_dependency('fast-stemmer', '>= 1.0.0')
16
+ s.add_development_dependency("rake")
17
+ s.requirements << "Either hbaserb >= 0.0.3 or cassandra >= 0.7"
18
+ s.rubyforge_project = "ankusa"
19
+ end
@@ -1,13 +1,9 @@
1
- require 'iconv'
2
-
3
1
  class String
4
2
  def numeric?
5
3
  true if Float(self) rescue false
6
4
  end
7
5
 
8
6
  def to_ascii
9
- # from http://www.jroller.com/obie/tags/unicode
10
- converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
11
- converter.iconv(self).unpack('U*').select { |cp| cp < 127 }.pack('U*') rescue ""
7
+ encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "").force_encoding('UTF-8') rescue ""
12
8
  end
13
9
  end
@@ -1,3 +1,3 @@
1
1
  module Ankusa
2
- VERSION = "0.0.16"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -0,0 +1,19 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/cassandra_storage'
3
+
4
+ module CassandraClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::CassandraStorage.new CONFIG['cassandra_host'], CONFIG['cassandra_port'], "ankusa_test"
7
+ super(name)
8
+ end
9
+ end
10
+
11
+ class NBClassifierTest < Test::Unit::TestCase
12
+ include CassandraClassifierBase
13
+ include NBClassifierBase
14
+ end
15
+
16
+ class KLClassifierTest < Test::Unit::TestCase
17
+ include CassandraClassifierBase
18
+ include KLClassifierBase
19
+ end
@@ -0,0 +1,130 @@
1
+ require File.join File.dirname(__FILE__), 'helper'
2
+
3
+ module ClassifierBase
4
+ def train
5
+ @classifier.train :spam, "spam and great spam" # spam:2 great:1
6
+ @classifier.train :good, "words for processing" # word:1 process:1
7
+ @classifier.train :good, "good word" # word:1 good:1
8
+ end
9
+
10
+ def test_train
11
+ counts = @storage.get_word_counts(:spam)
12
+ assert_equal counts[:spam], 2
13
+ counts = @storage.get_word_counts(:word)
14
+ assert_equal counts[:good], 2
15
+ assert_equal @storage.get_total_word_count(:good), 4
16
+ assert_equal @storage.get_doc_count(:good), 2
17
+ assert_equal @storage.get_total_word_count(:spam), 3
18
+ assert_equal @storage.get_doc_count(:spam), 1
19
+ totals = @storage.doc_count_totals
20
+ assert_equal totals.values.inject { |x,y| x+y }, 3
21
+ assert_equal totals[:spam], 1
22
+ assert_equal totals[:good], 2
23
+
24
+ vocab = @storage.get_vocabulary_sizes
25
+ assert_equal vocab[:spam], 2
26
+ assert_equal vocab[:good], 3
27
+ end
28
+
29
+ def teardown
30
+ @storage.drop_tables
31
+ @storage.close
32
+ end
33
+ end
34
+
35
+
36
+ module NBClassifierBase
37
+ include ClassifierBase
38
+
39
+ def setup
40
+ @classifier = Ankusa::NaiveBayesClassifier.new @storage
41
+ train
42
+ end
43
+
44
+ def test_untrained
45
+ @storage.reset
46
+
47
+ string = "spam is tastey"
48
+
49
+ hash = {:spam => 0, :good => 0}
50
+ assert_equal hash, @classifier.classifications(string)
51
+ assert_equal nil, @classifier.classify(string)
52
+ end
53
+
54
+
55
+ def test_probs
56
+ spamlog = Math.log(3.0 / 5.0) + Math.log(1.0 / 5.0) + Math.log(2.0 / 5.0)
57
+ goodlog = Math.log(1.0 / 7.0) + Math.log(1.0 / 7.0) + Math.log(3.0 / 5.0)
58
+
59
+ # exponentiate
60
+ spamex = Math.exp(spamlog)
61
+ goodex = Math.exp(goodlog)
62
+
63
+ # normalize
64
+ spam = spamex / (spamex + goodex)
65
+ good = goodex / (spamex + goodex)
66
+
67
+ cs = @classifier.classifications("spam is tastey")
68
+ assert_equal cs[:spam], spam
69
+ assert_equal cs[:good], good
70
+
71
+ cs = @classifier.log_likelihoods("spam is tastey")
72
+ assert_equal cs[:spam], spamlog
73
+ assert_equal cs[:good], goodlog
74
+
75
+ @classifier.train :somethingelse, "this is something else entirely spam"
76
+ cs = @classifier.classifications("spam is tastey", [:spam, :good])
77
+ assert_equal cs[:spam], spam
78
+ assert_equal cs[:good], good
79
+
80
+ # test for class we didn't train on
81
+ cs = @classifier.classifications("spam is super tastey if you are a zombie", [:spam, :nothing])
82
+ assert_equal cs[:nothing], 0
83
+ end
84
+
85
+ def test_prob_result
86
+ cs = @classifier.classifications("spam is tastey").sort_by { |c| -c[1] }.first.first
87
+ klass = @classifier.classify("spam is tastey")
88
+ assert_equal cs, klass
89
+ assert_equal klass, :spam
90
+ end
91
+ end
92
+
93
+
94
+ module KLClassifierBase
95
+ include ClassifierBase
96
+
97
+ def setup
98
+ @classifier = Ankusa::KLDivergenceClassifier.new @storage
99
+ train
100
+ end
101
+
102
+ def test_distances
103
+ ds = @classifier.distances("spam is tastey")
104
+ thprob_spam = 1.0 / 2.0
105
+ thprob_tastey = 1.0 / 2.0
106
+
107
+ train_prob_spam = (2 + 1).to_f / (3 + 2).to_f
108
+ train_prob_tastey = (0 + 1).to_f / (3 + 2).to_f
109
+ dist = thprob_spam * Math.log(thprob_spam / train_prob_spam)
110
+ dist += thprob_tastey * Math.log(thprob_tastey / train_prob_tastey)
111
+ assert_equal ds[:spam], dist
112
+
113
+ train_prob_spam = 1.0 / (4 + 3).to_f
114
+ train_prob_tastey = 1.0 / (4 + 3).to_f
115
+ dist = thprob_spam * Math.log(thprob_spam / train_prob_spam)
116
+ dist += thprob_tastey * Math.log(thprob_tastey / train_prob_tastey)
117
+ assert_equal ds[:good], dist
118
+ end
119
+
120
+ def test_distances_result
121
+ cs = @classifier.distances("spam is tastey").sort_by { |c| c[1] }.first.first
122
+ klass = @classifier.classify("spam is tastey")
123
+ assert_equal cs, klass
124
+ assert_equal klass, :spam
125
+
126
+ # assert distance from class we didn't train with is Infinity (1.0/0.0 is a way to get at Infinity)
127
+ cs = @classifier.distances("spam is tastey", [:spam, :nothing])
128
+ assert_equal cs[:nothing], (1.0/0.0)
129
+ end
130
+ end
@@ -0,0 +1,7 @@
1
+ hbase_host: 127.0.0.1
2
+ hbase_port: 9090
3
+ cassandra_host: 127.0.0.1
4
+ cassandra_port: 9160
5
+ mongo_db_host: 127.0.0.1
6
+ mongo_db_port: 27017
7
+ file_system_storage_file: training.anuska
@@ -0,0 +1,27 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/file_system_storage'
3
+
4
+ module FileSystemClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::FileSystemStorage.new CONFIG['file_system_storage_file']
7
+ super name
8
+ end
9
+
10
+ def test_storage
11
+ # train will be called in setup method, now reload storage and test training
12
+ @storage.save
13
+ @storage = Ankusa::FileSystemStorage.new CONFIG['file_system_storage_file']
14
+ test_train
15
+ end
16
+ end
17
+
18
+ class NBMemoryClassifierTest < Test::Unit::TestCase
19
+ include FileSystemClassifierBase
20
+ include NBClassifierBase
21
+ end
22
+
23
+
24
+ class KLMemoryClassifierTest < Test::Unit::TestCase
25
+ include FileSystemClassifierBase
26
+ include KLClassifierBase
27
+ end
@@ -0,0 +1,25 @@
1
+ require File.join File.dirname(__FILE__), 'helper'
2
+
3
+ class HasherTest < Test::Unit::TestCase
4
+ def setup
5
+ string = "Words word a the at fish fishing fishes? /^/ The at a of! @#$!"
6
+ @text_hash = Ankusa::TextHash.new string
7
+ @array = Ankusa::TextHash.new [string]
8
+ end
9
+
10
+ def test_stemming
11
+ assert_equal @text_hash.length, 2
12
+ assert_equal @text_hash.word_count, 5
13
+
14
+ assert_equal @array.length, 2
15
+ assert_equal @array.word_count, 5
16
+ end
17
+
18
+ def test_valid_word
19
+ assert (not Ankusa::TextHash.valid_word? "accordingly")
20
+ assert (not Ankusa::TextHash.valid_word? "appropriate")
21
+ assert Ankusa::TextHash.valid_word? "^*&@"
22
+ assert Ankusa::TextHash.valid_word? "mother"
23
+ assert (not Ankusa::TextHash.valid_word? "21675")
24
+ end
25
+ end
@@ -0,0 +1,23 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/hbase_storage'
3
+
4
+ module HBaseClassifierBase
5
+ def initialize(name)
6
+ @freq_tablename = "ankusa_word_frequencies_test"
7
+ @sum_tablename = "ankusa_summary_test"
8
+ @storage = Ankusa::HBaseStorage.new CONFIG['hbase_host'], CONFIG['hbase_port'], @freq_tablename, @sum_tablename
9
+ @freq_table = @storage.hbase.get_table(@freq_tablename)
10
+ @sum_table = @storage.hbase.get_table(@sum_tablename)
11
+ super(name)
12
+ end
13
+ end
14
+
15
+ class NBClassifierTest < Test::Unit::TestCase
16
+ include HBaseClassifierBase
17
+ include NBClassifierBase
18
+ end
19
+
20
+ class KLClassifierTest < Test::Unit::TestCase
21
+ include HBaseClassifierBase
22
+ include KLClassifierBase
23
+ end
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'yaml'
4
+
5
+ $:.unshift(File.join File.dirname(__FILE__), '..', 'lib')
6
+ require 'ankusa'
7
+
8
+ CONFIG = YAML.load_file File.join(File.dirname(__FILE__), "config.yml")
@@ -0,0 +1,20 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/memory_storage'
3
+
4
+ module MemoryClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::MemoryStorage.new
7
+ super name
8
+ end
9
+ end
10
+
11
+ class NBMemoryClassifierTest < Test::Unit::TestCase
12
+ include MemoryClassifierBase
13
+ include NBClassifierBase
14
+ end
15
+
16
+
17
+ class KLMemoryClassifierTest < Test::Unit::TestCase
18
+ include MemoryClassifierBase
19
+ include KLClassifierBase
20
+ end
@@ -0,0 +1,21 @@
1
+ require File.join File.dirname(__FILE__), 'classifier_base'
2
+ require 'ankusa/mongo_db_storage'
3
+
4
+ module MongoDbClassifierBase
5
+ def initialize(name)
6
+ @storage = Ankusa::MongoDbStorage.new :host => CONFIG['mongo_db_host'], :port => CONFIG['mongo_db_port'],
7
+ :username => CONFIG['mongo_db_username'], :password => CONFIG['mongo_db_password'],
8
+ :db => 'ankusa-test'
9
+ super(name)
10
+ end
11
+ end
12
+
13
+ class NBClassifierTest < Test::Unit::TestCase
14
+ include MongoDbClassifierBase
15
+ include NBClassifierBase
16
+ end
17
+
18
+ class KLClassifierTest < Test::Unit::TestCase
19
+ include MongoDbClassifierBase
20
+ include KLClassifierBase
21
+ end
metadata CHANGED
@@ -1,37 +1,64 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: ankusa
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
4
5
  prerelease:
5
- version: 0.0.16
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Brian Muller
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2012-11-30 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
12
+ date: 2013-06-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
16
15
  name: fast-stemmer
17
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
18
17
  none: false
19
- requirements:
20
- - - ">="
21
- - !ruby/object:Gem::Version
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
22
21
  version: 1.0.0
23
22
  type: :runtime
24
23
  prerelease: false
25
- version_requirements: *id001
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.0.0
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
26
46
  description: Text classifier with HBase, Cassandra, or Mongo storage
27
47
  email: bamuller@gmail.com
28
48
  executables: []
29
-
30
49
  extensions: []
31
-
32
50
  extra_rdoc_files: []
33
-
34
- files:
51
+ files:
52
+ - .gitignore
53
+ - .ruby-gemset
54
+ - .ruby-version
55
+ - Gemfile
56
+ - Gemfile.lock
57
+ - LICENSE
58
+ - README.rdoc
59
+ - Rakefile
60
+ - ankusa.gemspec
61
+ - lib/ankusa.rb
35
62
  - lib/ankusa/cassandra_storage.rb
36
63
  - lib/ankusa/classifier.rb
37
64
  - lib/ankusa/extensions.rb
@@ -44,44 +71,54 @@ files:
44
71
  - lib/ankusa/naive_bayes.rb
45
72
  - lib/ankusa/stopwords.rb
46
73
  - lib/ankusa/version.rb
47
- - lib/ankusa.rb
48
- - Gemfile
49
- - Gemfile.lock
50
- - LICENSE
51
- - Rakefile
52
- - README.rdoc
74
+ - test/cassandra_classifier_test.rb
75
+ - test/classifier_base.rb
76
+ - test/config.yml
77
+ - test/file_system_classifier_test.rb
78
+ - test/hasher_test.rb
79
+ - test/hbase_classifier_test.rb
80
+ - test/helper.rb
81
+ - test/memory_classifier_test.rb
82
+ - test/mongo_db_classifier_test.rb
53
83
  homepage: https://github.com/bmuller/ankusa
54
84
  licenses: []
55
-
56
85
  post_install_message:
57
86
  rdoc_options: []
58
-
59
- require_paths:
87
+ require_paths:
60
88
  - lib
61
- required_ruby_version: !ruby/object:Gem::Requirement
89
+ required_ruby_version: !ruby/object:Gem::Requirement
62
90
  none: false
63
- requirements:
64
- - - ">="
65
- - !ruby/object:Gem::Version
66
- hash: 2264057684891862333
67
- segments:
91
+ requirements:
92
+ - - ! '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ segments:
68
96
  - 0
69
- version: "0"
70
- required_rubygems_version: !ruby/object:Gem::Requirement
97
+ hash: 3381126087859790337
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
99
  none: false
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- hash: 2264057684891862333
76
- segments:
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ segments:
77
105
  - 0
78
- version: "0"
79
- requirements:
106
+ hash: 3381126087859790337
107
+ requirements:
80
108
  - Either hbaserb >= 0.0.3 or cassandra >= 0.7
81
109
  rubyforge_project: ankusa
82
- rubygems_version: 1.8.24
110
+ rubygems_version: 1.8.25
83
111
  signing_key:
84
112
  specification_version: 3
85
- summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for storage
86
- test_files: []
87
-
113
+ summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for
114
+ storage
115
+ test_files:
116
+ - test/cassandra_classifier_test.rb
117
+ - test/classifier_base.rb
118
+ - test/config.yml
119
+ - test/file_system_classifier_test.rb
120
+ - test/hasher_test.rb
121
+ - test/hbase_classifier_test.rb
122
+ - test/helper.rb
123
+ - test/memory_classifier_test.rb
124
+ - test/mongo_db_classifier_test.rb