ankusa 0.0.14 → 0.0.15
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -0
- data/Gemfile.lock +3 -1
- data/README.rdoc +7 -7
- data/Rakefile +3 -1
- data/lib/ankusa/classifier.rb +1 -1
- data/lib/ankusa/hbase_storage.rb +6 -4
- data/lib/ankusa/naive_bayes.rb +19 -8
- data/lib/ankusa/version.rb +1 -1
- metadata +35 -35
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ankusa (0.0.14)
|
4
|
+
ankusa (0.0.15)
|
5
5
|
fast-stemmer (>= 1.0.0)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: http://rubygems.org/
|
9
9
|
specs:
|
10
10
|
fast-stemmer (1.0.1)
|
11
|
+
rake (10.0.2)
|
11
12
|
|
12
13
|
PLATFORMS
|
13
14
|
ruby
|
14
15
|
|
15
16
|
DEPENDENCIES
|
16
17
|
ankusa!
|
18
|
+
rake
|
data/README.rdoc
CHANGED
@@ -9,7 +9,7 @@ First, install HBase/Hadoop, Mongo, or Cassandra (>= 0.7.0-rc2). Then, install
|
|
9
9
|
gem install hbaserb
|
10
10
|
# or
|
11
11
|
gem install cassandra
|
12
|
-
# or
|
12
|
+
# or
|
13
13
|
gem install mongo
|
14
14
|
|
15
15
|
If you're using HBase, make sure the HBase Thrift interface has been started as well. Then:
|
@@ -35,7 +35,7 @@ Using the naive Bayes classifier:
|
|
35
35
|
# This will return the most likely class (as symbol)
|
36
36
|
puts c.classify "This is some spammy text"
|
37
37
|
|
38
|
-
# This will return Hash with classes as keys and
|
38
|
+
# This will return Hash with classes as keys and
|
39
39
|
# membership probability as values
|
40
40
|
puts c.classifications "This is some spammy text"
|
41
41
|
|
@@ -54,13 +54,13 @@ Using the naive Bayes classifier:
|
|
54
54
|
== KL Divergence Classifier
|
55
55
|
There is a Kullback–Leibler divergence classifier as well. KL divergence is a distance measure (though not a true metric because it does not satisfy the triangle inequality). The KL classifier simply measures the relative entropy between the text you want to classify and each of the classes. The class with the shortest "distance" is the best class. You may find that for an especially large corpus it may be slightly faster to use this classifier (since prior probabilities are never calculated, only likelihoods).
|
56
56
|
|
57
|
-
The API is the same as the NaiveBayesClassifier, except rather than calling "classifications" if you want actual numbers you call "distances".
|
57
|
+
The API is the same as the NaiveBayesClassifier, except rather than calling "classifications" if you want actual numbers you call "distances".
|
58
58
|
|
59
59
|
require 'rubygems'
|
60
60
|
require 'ankusa'
|
61
61
|
require 'ankusa/hbase_storage'
|
62
62
|
|
63
|
-
# connect to HBase
|
63
|
+
# connect to HBase
|
64
64
|
storage = Ankusa::HBaseStorage.new 'localhost'
|
65
65
|
c = Ankusa::KLDivergenceClassifier.new storage
|
66
66
|
|
@@ -72,7 +72,7 @@ The API is the same as the NaiveBayesClassifier, except rather than calling "cla
|
|
72
72
|
# This will return the most likely class (as symbol)
|
73
73
|
puts c.classify "This is some spammy text"
|
74
74
|
|
75
|
-
# This will return Hash with classes as keys and
|
75
|
+
# This will return Hash with classes as keys and
|
76
76
|
# distances >= 0 as values
|
77
77
|
puts c.distances "This is some spammy text"
|
78
78
|
|
@@ -104,13 +104,13 @@ HBase storage:
|
|
104
104
|
|
105
105
|
For Cassandra storage:
|
106
106
|
* You will need Cassandra version 0.7.0-rc2 or greater.
|
107
|
-
* You will need to set a max number classes since current implementation of the Ruby Cassandra client doesn't support table scans.
|
107
|
+
* You will need to set a max number classes since current implementation of the Ruby Cassandra client doesn't support table scans.
|
108
108
|
* Prior to using the Cassandra storage you will need to run the following command from the cassandra-cli: "create keyspace ankusa with replication_factor = 1". This should be fixed with a new release candidate for Cassandra.
|
109
109
|
|
110
110
|
To use the Cassandra storage class:
|
111
111
|
require 'ankusa/cassandra_storage'
|
112
112
|
# defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
|
113
|
-
storage = Ankusa::
|
113
|
+
storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
|
114
114
|
|
115
115
|
For MongoDB storage:
|
116
116
|
require 'ankusa/mongo_db_storage'
|
data/Rakefile
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'bundler'
|
3
|
+
require 'rake/dsl_definition'
|
4
|
+
require 'rake/alt_system'
|
3
5
|
require 'rake/testtask'
|
4
6
|
require 'rdoc/task'
|
5
7
|
|
@@ -23,7 +25,7 @@ Rake::TestTask.new("test_memory") { |t|
|
|
23
25
|
desc "Run all unit tests with HBase storage"
|
24
26
|
Rake::TestTask.new("test_hbase") { |t|
|
25
27
|
t.libs += ["lib", "."]
|
26
|
-
t.test_files = FileList['test/hasher_test.rb']
|
28
|
+
t.test_files = FileList['test/hasher_test.rb', 'test/hbase_classifier_test.rb']
|
27
29
|
t.verbose = true
|
28
30
|
}
|
29
31
|
|
data/lib/ankusa/classifier.rb
CHANGED
@@ -49,7 +49,7 @@ module Ankusa
|
|
49
49
|
probs = Hash.new 0
|
50
50
|
@storage.get_word_counts(word).each { |k,v| probs[k] = v if classnames.include? k }
|
51
51
|
vs = vocab_sizes
|
52
|
-
classnames.each { |cn|
|
52
|
+
classnames.each { |cn|
|
53
53
|
# if we've never seen the class, the word prob is 0
|
54
54
|
next unless vs.has_key? cn
|
55
55
|
|
data/lib/ankusa/hbase_storage.rb
CHANGED
@@ -26,7 +26,7 @@ module Ankusa
|
|
26
26
|
drop_tables
|
27
27
|
init_tables
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
def drop_tables
|
31
31
|
freq_table.delete
|
32
32
|
summary_table.delete
|
@@ -69,10 +69,12 @@ module Ankusa
|
|
69
69
|
@klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
|
70
70
|
}
|
71
71
|
end
|
72
|
-
|
72
|
+
|
73
73
|
def get_doc_count(klass)
|
74
74
|
@klass_doc_counts.fetch(klass) {
|
75
|
-
|
75
|
+
totals = summary_table.get(klass, "totals:doccount")
|
76
|
+
totals = (totals.size === 0) ? 0 : totals.first.to_i64.to_f
|
77
|
+
@klass_doc_counts[klass] = totals
|
76
78
|
}
|
77
79
|
end
|
78
80
|
|
@@ -83,7 +85,7 @@ module Ankusa
|
|
83
85
|
if size == count
|
84
86
|
summary_table.atomic_increment klass, "totals:vocabsize"
|
85
87
|
elsif size == 0
|
86
|
-
summary_table.atomic_increment klass, "totals:vocabsize", -1
|
88
|
+
summary_table.atomic_increment klass, "totals:vocabsize", -1
|
87
89
|
end
|
88
90
|
size
|
89
91
|
end
|
data/lib/ankusa/naive_bayes.rb
CHANGED
@@ -6,9 +6,16 @@ module Ankusa
|
|
6
6
|
|
7
7
|
def classify(text, classes=nil)
|
8
8
|
# return the most probable class
|
9
|
-
|
9
|
+
|
10
|
+
result = log_likelihoods(text, classes)
|
11
|
+
if result.values.uniq.size. === 1
|
12
|
+
# if all classes are equally likely, return nil
|
13
|
+
return nil
|
14
|
+
else
|
15
|
+
result.sort_by { |c| -c[1] }.first.first
|
16
|
+
end
|
10
17
|
end
|
11
|
-
|
18
|
+
|
12
19
|
# Classes is an array of classes to look at
|
13
20
|
def classifications(text, classnames=nil)
|
14
21
|
result = log_likelihoods text, classnames
|
@@ -17,8 +24,10 @@ module Ankusa
|
|
17
24
|
}
|
18
25
|
|
19
26
|
# normalize to get probs
|
20
|
-
sum = result.values.inject
|
21
|
-
result.keys.each { |k|
|
27
|
+
sum = result.values.inject{ |x,y| x+y }
|
28
|
+
result.keys.each { |k|
|
29
|
+
result[k] = result[k] / sum
|
30
|
+
} unless sum.zero?
|
22
31
|
result
|
23
32
|
end
|
24
33
|
|
@@ -29,7 +38,7 @@ module Ankusa
|
|
29
38
|
|
30
39
|
TextHash.new(text).each { |word, count|
|
31
40
|
probs = get_word_probs(word, classnames)
|
32
|
-
classnames.each { |k|
|
41
|
+
classnames.each { |k|
|
33
42
|
# log likelihood should be negative infinity if we've never seen the klass
|
34
43
|
result[k] += probs[k] > 0 ? (Math.log(probs[k]) * count) : -INFTY
|
35
44
|
}
|
@@ -37,9 +46,11 @@ module Ankusa
|
|
37
46
|
|
38
47
|
# add the prior
|
39
48
|
doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
|
40
|
-
|
41
|
-
|
42
|
-
|
49
|
+
|
50
|
+
doc_count_total = (doc_counts.inject(0){ |x,y| x+y } + classnames.length).to_f
|
51
|
+
|
52
|
+
classnames.each { |k|
|
53
|
+
result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
|
43
54
|
}
|
44
55
|
|
45
56
|
result
|
data/lib/ankusa/version.rb
CHANGED
metadata
CHANGED
@@ -1,33 +1,37 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: ankusa
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.14
|
3
|
+
version: !ruby/object:Gem::Version
|
5
4
|
prerelease:
|
5
|
+
version: 0.0.15
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Brian Muller
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
|
13
|
+
date: 2012-11-30 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
15
16
|
name: fast-stemmer
|
16
|
-
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
17
19
|
none: false
|
18
|
-
requirements:
|
19
|
-
- -
|
20
|
-
- !ruby/object:Gem::Version
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
21
23
|
version: 1.0.0
|
22
24
|
type: :runtime
|
23
|
-
|
24
|
-
version_requirements: *70197701802820
|
25
|
+
version_requirements: *id001
|
25
26
|
description: Text classifier with HBase, Cassandra, or Mongo storage
|
26
27
|
email: brian.muller@livingsocial.com
|
27
28
|
executables: []
|
29
|
+
|
28
30
|
extensions: []
|
31
|
+
|
29
32
|
extra_rdoc_files: []
|
30
|
-
|
33
|
+
|
34
|
+
files:
|
31
35
|
- lib/ankusa/cassandra_storage.rb
|
32
36
|
- lib/ankusa/classifier.rb
|
33
37
|
- lib/ankusa/extensions.rb
|
@@ -48,34 +52,30 @@ files:
|
|
48
52
|
- README.rdoc
|
49
53
|
homepage: https://github.com/livingsocial/ankusa
|
50
54
|
licenses: []
|
55
|
+
|
51
56
|
post_install_message:
|
52
57
|
rdoc_options: []
|
53
|
-
|
58
|
+
|
59
|
+
require_paths:
|
54
60
|
- lib
|
55
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
62
|
none: false
|
57
|
-
requirements:
|
58
|
-
- -
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version:
|
61
|
-
|
62
|
-
- 0
|
63
|
-
hash: 2837888903817045284
|
64
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0"
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
68
|
none: false
|
66
|
-
requirements:
|
67
|
-
- -
|
68
|
-
- !ruby/object:Gem::Version
|
69
|
-
version:
|
70
|
-
|
71
|
-
- 0
|
72
|
-
hash: 2837888903817045284
|
73
|
-
requirements:
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: "0"
|
73
|
+
requirements:
|
74
74
|
- Either hbaserb >= 0.0.3 or cassandra >= 0.7
|
75
75
|
rubyforge_project: ankusa
|
76
|
-
rubygems_version: 1.8.
|
76
|
+
rubygems_version: 1.8.24
|
77
77
|
signing_key:
|
78
78
|
specification_version: 3
|
79
|
-
summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for
|
80
|
-
storage
|
79
|
+
summary: Text classifier in Ruby that uses Hadoop's HBase, Cassandra, or Mongo for storage
|
81
80
|
test_files: []
|
81
|
+
|