ankusa 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +80 -6
- data/Rakefile +22 -10
- data/docs/classes/Ankusa.html +29 -1
- data/docs/classes/Ankusa/CassandraStorage.html +615 -0
- data/docs/classes/Ankusa/Classifier.html +23 -131
- data/docs/classes/Ankusa/HBaseStorage.html +102 -102
- data/docs/classes/Ankusa/KLDivergenceClassifier.html +194 -0
- data/docs/classes/Ankusa/MemoryStorage.html +84 -84
- data/docs/classes/Ankusa/NaiveBayesClassifier.html +231 -0
- data/docs/classes/Ankusa/TextHash.html +30 -30
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +132 -11
- data/docs/files/lib/ankusa/cassandra_storage_rb.html +108 -0
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/kl_divergence_rb.html +101 -0
- data/docs/files/lib/ankusa/naive_bayes_rb.html +101 -0
- data/docs/files/lib/ankusa_rb.html +3 -3
- data/docs/fr_class_index.html +3 -0
- data/docs/fr_file_index.html +3 -0
- data/docs/fr_method_index.html +59 -42
- data/lib/ankusa.rb +2 -2
- data/lib/ankusa/cassandra_storage.rb +194 -0
- data/lib/ankusa/classifier.rb +1 -39
- data/lib/ankusa/kl_divergence.rb +31 -0
- data/lib/ankusa/naive_bayes.rb +46 -0
- metadata +19 -26
data/lib/ankusa/cassandra_storage.rb
ADDED
@@ -0,0 +1,194 @@
require 'cassandra/0.7'

#
# At the moment you'll have to do:
#
#   create keyspace ankusa with replication_factor = 1
#
# from the cassandra-cli. This should be fixed with new release candidate for
# cassandra
#
module Ankusa

  class CassandraStorage
    attr_reader :cassandra

    #
    # Necessary to set max classes since current implementation of ruby
    # cassandra client doesn't support table scans. Using crufty get_range
    # method at the moment.
    #
    def initialize(host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100)
      @cassandra = Cassandra.new('system', "#{host}:#{port}")
      @klass_word_counts = {}
      @klass_doc_counts = {}
      @keyspace = keyspace
      @max_classes = max_classes
      init_tables
    end

    #
    # Fetch the names of the distinct classes for classification:
    # eg. :spam, :good, etc
    #
    def classnames
      @cassandra.get_range(:totals, {:start => '', :finish => '', :count => @max_classes}).inject([]) do |cs, key_slice|
        cs << key_slice.key.to_sym
      end
    end

    def reset
      drop_tables
      init_tables
    end

    #
    # Drop ankusa keyspace, reset internal caches
    #
    # FIXME: truncate doesn't work with cassandra-beta2
    #
    def drop_tables
      @cassandra.truncate!('classes')
      @cassandra.truncate!('totals')
      @cassandra.drop_keyspace(@keyspace)
      @klass_word_counts = {}
      @klass_doc_counts = {}
    end

    #
    # Create required keyspace and column families
    #
    def init_tables
      # Do nothing if keyspace already exists
      if @cassandra.keyspaces.include?(@keyspace)
        @cassandra.keyspace = @keyspace
      else
        freq_table = Cassandra::ColumnFamily.new({:keyspace => @keyspace, :name => "classes"})   # word => {classname => count}
        summary_table = Cassandra::ColumnFamily.new({:keyspace => @keyspace, :name => "totals"}) # class => {wordcount => count}
        ks_def = Cassandra::Keyspace.new({
          :name => @keyspace,
          :strategy_class => 'org.apache.cassandra.locator.SimpleStrategy',
          :replication_factor => 1,
          :cf_defs => [freq_table, summary_table]
        })
        @cassandra.add_keyspace ks_def
        @cassandra.keyspace = @keyspace
      end
    end

    #
    # Fetch hash of word counts as a single row from cassandra.
    # Here column_name is the class and column value is the count
    #
    def get_word_counts(word)
      # fetch all (class, count) pairs for a given word
      row = @cassandra.get(:classes, word.to_s)
      return row.to_hash if row.empty?
      row.inject({}){ |counts, col| counts[col.first.to_sym] = [col.last.to_f, 0].max; counts }
    end

    #
    # Does a table 'scan' of summary table pulling out the 'vocabsize' column
    # from each row. Generates a hash of (class, vocab_size) key value pairs
    #
    def get_vocabulary_sizes
      get_summary "vocabsize"
    end

    #
    # Fetch total word count for a given class and cache it
    #
    def get_total_word_count(klass)
      @klass_word_counts[klass] = @cassandra.get(:totals, klass.to_s, "wordcount").values.last.to_f
    end

    #
    # Fetch total documents for a given class and cache it
    #
    def get_doc_count(klass)
      @klass_doc_counts[klass] = @cassandra.get(:totals, klass.to_s, "doc_count").values.last.to_f
    end

    #
    # Increment the count for a given (word,class) pair. Evidently, cassandra
    # does not support atomic increment/decrement. Psh. HBase uses ZooKeeper to
    # implement atomic operations, ain't it special?
    #
    def incr_word_count(klass, word, count)
      # Only wants strings
      klass = klass.to_s
      word = word.to_s

      prior_count = @cassandra.get(:classes, word, klass).values.last.to_i
      new_count = prior_count + count
      @cassandra.insert(:classes, word, {klass => new_count.to_s})

      if (prior_count == 0 && count > 0)
        #
        # we've never seen this word before and we're not trying to unlearn it
        #
        vocab_size = @cassandra.get(:totals, klass, "vocabsize").values.last.to_i
        vocab_size += 1
        @cassandra.insert(:totals, klass, {"vocabsize" => vocab_size.to_s})
      elsif new_count == 0
        #
        # we've seen this word before but we're trying to unlearn it
        #
        vocab_size = @cassandra.get(:totals, klass, "vocabsize").values.last.to_i
        vocab_size -= 1
        @cassandra.insert(:totals, klass, {"vocabsize" => vocab_size.to_s})
      end
      new_count
    end

    #
    # Increment total word count for a given class by 'count'
    #
    def incr_total_word_count(klass, count)
      klass = klass.to_s
      wordcount = @cassandra.get(:totals, klass, "wordcount").values.last.to_i
      wordcount += count
      @cassandra.insert(:totals, klass, {"wordcount" => wordcount.to_s})
      @klass_word_counts[klass.to_sym] = wordcount
    end

    #
    # Increment total document count for a given class by 'count'
    #
    def incr_doc_count(klass, count)
      klass = klass.to_s
      doc_count = @cassandra.get(:totals, klass, "doc_count").values.last.to_i
      doc_count += count
      @cassandra.insert(:totals, klass, {"doc_count" => doc_count.to_s})
      @klass_doc_counts[klass.to_sym] = doc_count
    end

    def doc_count_totals
      get_summary "doc_count"
    end

    #
    # Doesn't do anything
    #
    def close
    end

    protected

    #
    # Fetch 100 rows from summary table, yes, increase if necessary
    #
    def get_summary(name)
      counts = {}
      @cassandra.get_range(:totals, {:start => '', :finish => '', :count => @max_classes}).each do |key_slice|
        # key_slice is a clunky thrift object, map it into a ruby hash
        row = key_slice.columns.inject({}){ |hsh, c| hsh[c.column.name] = c.column.value; hsh }
        counts[key_slice.key.to_sym] = row[name].to_f
      end
      counts
    end

  end

end
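For orientation, a minimal usage sketch of the new backend. It is not part of the diff: it assumes a local Cassandra 0.7 node, the train/classify API documented in the gem's README.rdoc, and the require paths matching the files added in this release.

require 'rubygems'
require 'ankusa'
require 'ankusa/cassandra_storage'
require 'ankusa/naive_bayes'

# Connects to a local Cassandra 0.7 node; init_tables creates the 'ankusa'
# keyspace (or, per the comment at the top of the file, create it first
# from cassandra-cli).
storage    = Ankusa::CassandraStorage.new('127.0.0.1', 9160, 'ankusa')
classifier = Ankusa::NaiveBayesClassifier.new(storage)

classifier.train :spam, "cheap meds cheap watches buy now"
classifier.train :good, "draft agenda for tomorrow's meeting"

puts classifier.classify("cheap watches")   # most likely => :spam
storage.close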
data/lib/ankusa/classifier.rb
CHANGED
@@ -1,6 +1,6 @@
 module Ankusa
 
-  class Classifier
+  module Classifier
     attr_reader :classnames
 
     def initialize(storage)
@@ -44,44 +44,6 @@ module Ankusa
       th
     end
 
-    def classify(text, classes=nil)
-      # return the most probable class
-      log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
-    end
-
-    # Classes is an array of classes to look at
-    def classifications(text, classnames=nil)
-      result = log_likelihoods text, classnames
-      result.keys.each { |k|
-        result[k] = Math.exp result[k]
-      }
-
-      # normalize to get probs
-      sum = result.values.inject { |x,y| x+y }
-      result.keys.each { |k| result[k] = result[k] / sum }
-      result
-    end
-
-    # Classes is an array of classes to look at
-    def log_likelihoods(text, classnames=nil)
-      classnames ||= @classnames
-      result = Hash.new 0
-
-      TextHash.new(text).each { |word, count|
-        probs = get_word_probs(word, classnames)
-        classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
-      }
-
-      # add the prior and exponentiate
-      doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
-      doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
-      classnames.each { |k|
-        result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
-      }
-
-      result
-    end
-
     protected
     def get_word_probs(word, classnames)
       probs = Hash.new 0
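The effect of this change is that Classifier keeps only the shared storage and training plumbing and becomes a mixin, while the classification methods move into the concrete classes added below. Callers that presumably instantiated Ankusa::Classifier directly under 0.0.6 now have to pick one of the new classes; a sketch of the migration (the 0.0.6 line is shown for illustration only, not taken from the diff):

# 0.0.6 (illustrative): Classifier was a concrete class
# classifier = Ankusa::Classifier.new(storage)

# 0.0.7: choose a concrete classifier; both mix in Ankusa::Classifier
classifier = Ankusa::NaiveBayesClassifier.new(storage)
# ...or the new distance-based alternative:
classifier = Ankusa::KLDivergenceClassifier.new(storage)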
data/lib/ankusa/kl_divergence.rb
ADDED
@@ -0,0 +1,31 @@
module Ankusa

  class KLDivergenceClassifier
    include Classifier

    def classify(text, classes=nil)
      # return the class with the least distance from the word
      # distribution of the given text
      distances(text, classes).sort_by { |c| c[1] }.first.first
    end


    # Classes is an array of classes to look at
    def distances(text, classnames=nil)
      classnames ||= @classnames
      distances = Hash.new 0

      th = TextHash.new(text)
      th.each { |word, count|
        thprob = count.to_f / th.length.to_f
        probs = get_word_probs(word, classnames)
        classnames.each { |k|
          distances[k] += (thprob * Math.log(thprob / probs[k]) * count)
        }
      }

      distances
    end
  end

end
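What distances accumulates is a KL-divergence-style score between the word distribution of the input text (thprob) and each class's smoothed word model (probs[k]), weighted by each word's count; classify then picks the class with the smallest total. A toy illustration of the per-word term, with made-up numbers:

# Suppose "cheap" occurs 2 times in a 10-term TextHash, so thprob = 0.2,
# and the smoothed probability of "cheap" under some class is 0.05:
term = 0.2 * Math.log(0.2 / 0.05) * 2   # => ~0.5545
# A class whose model puts "cheap" closer to 0.2 contributes a smaller term,
# so it ends up with a smaller total distance and wins in classify.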
data/lib/ankusa/naive_bayes.rb
ADDED
@@ -0,0 +1,46 @@
module Ankusa

  class NaiveBayesClassifier
    include Classifier

    def classify(text, classes=nil)
      # return the most probable class
      log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
    end

    # Classes is an array of classes to look at
    def classifications(text, classnames=nil)
      result = log_likelihoods text, classnames
      result.keys.each { |k|
        result[k] = Math.exp result[k]
      }

      # normalize to get probs
      sum = result.values.inject { |x,y| x+y }
      result.keys.each { |k| result[k] = result[k] / sum }
      result
    end

    # Classes is an array of classes to look at
    def log_likelihoods(text, classnames=nil)
      classnames ||= @classnames
      result = Hash.new 0

      TextHash.new(text).each { |word, count|
        probs = get_word_probs(word, classnames)
        classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
      }

      # add the prior and exponentiate
      doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
      doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
      classnames.each { |k|
        result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
      }

      result
    end

  end

end
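log_likelihoods is the usual multinomial naive Bayes sum: count * log P(word|class) over the words of the text, plus a log prior estimated from per-class document counts with add-one smoothing; classifications then exponentiates and normalizes those scores into probabilities that sum to 1. A small sketch against the in-memory backend already shipped with the gem (train per README.rdoc; the printed numbers are illustrative, not actual output):

require 'ankusa'
require 'ankusa/memory_storage'
require 'ankusa/naive_bayes'

nb = Ankusa::NaiveBayesClassifier.new(Ankusa::MemoryStorage.new)
nb.train :spam, "cheap meds buy now"
nb.train :good, "see you at lunch"

p nb.log_likelihoods("cheap lunch")    # e.g. {:spam=>-7.1, :good=>-7.9}
p nb.classifications("cheap lunch")    # e.g. {:spam=>0.69, :good=>0.31}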
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: ankusa
 version: !ruby/object:Gem::Version
-  hash: 19
+  hash: 17
   prerelease: false
   segments:
   - 0
   - 0
-  - 6
-  version: 0.0.6
+  - 7
+  version: 0.0.7
 platform: ruby
 authors:
 - Brian Muller
@@ -15,29 +15,13 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-12-
+date: 2010-12-12 00:00:00 -05:00
 default_executable:
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: hbaserb
-  prerelease: false
-  requirement: &id001 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 25
-        segments:
-        - 0
-        - 0
-        - 3
-        version: 0.0.3
-  type: :runtime
-  version_requirements: *id001
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
   prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
+  requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ">="
@@ -49,8 +33,8 @@ dependencies:
   - 0
   version: 1.0.0
   type: :runtime
-  version_requirements: *id002
-description: Text classifier with HBase storage
+  version_requirements: *id001
+description: Text classifier with HBase or Cassandra storage
 email: brian.muller@livingsocial.com
 executables: []
 
@@ -59,28 +43,37 @@ extensions: []
 extra_rdoc_files: []
 
 files:
+- lib/ankusa/cassandra_storage.rb
 - lib/ankusa/classifier.rb
 - lib/ankusa/extensions.rb
 - lib/ankusa/hasher.rb
 - lib/ankusa/hbase_storage.rb
+- lib/ankusa/kl_divergence.rb
 - lib/ankusa/memory_storage.rb
+- lib/ankusa/naive_bayes.rb
 - lib/ankusa/stopwords.rb
 - lib/ankusa.rb
 - LICENSE
 - Rakefile
 - README.rdoc
+- docs/classes/Ankusa/CassandraStorage.html
 - docs/classes/Ankusa/Classifier.html
 - docs/classes/Ankusa/HBaseStorage.html
+- docs/classes/Ankusa/KLDivergenceClassifier.html
 - docs/classes/Ankusa/MemoryStorage.html
+- docs/classes/Ankusa/NaiveBayesClassifier.html
 - docs/classes/Ankusa/TextHash.html
 - docs/classes/Ankusa.html
 - docs/classes/String.html
 - docs/created.rid
+- docs/files/lib/ankusa/cassandra_storage_rb.html
 - docs/files/lib/ankusa/classifier_rb.html
 - docs/files/lib/ankusa/extensions_rb.html
 - docs/files/lib/ankusa/hasher_rb.html
 - docs/files/lib/ankusa/hbase_storage_rb.html
+- docs/files/lib/ankusa/kl_divergence_rb.html
 - docs/files/lib/ankusa/memory_storage_rb.html
+- docs/files/lib/ankusa/naive_bayes_rb.html
 - docs/files/lib/ankusa/stopwords_rb.html
 - docs/files/lib/ankusa_rb.html
 - docs/files/README_rdoc.html
@@ -116,12 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   segments:
   - 0
   version: "0"
-requirements:
-
+requirements:
+- Either hbaserb >= 0.0.3 or cassandra >= 0.7
 rubyforge_project:
 rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
-summary: Text classifier in Ruby that uses Hadoop's HBase for storage
+summary: Text classifier in Ruby that uses Hadoop's HBase or Cassandra for storage
 test_files: []
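One practical consequence of the metadata change: the hard hbaserb runtime dependency is gone and the storage backend is now only recorded as a free-form requirement, so installing ankusa no longer pulls in any storage client. An application has to declare whichever client it uses itself; for example, a hypothetical Gemfile (not part of this package):

source "http://rubygems.org"

gem "ankusa", "0.0.7"
gem "cassandra", ">= 0.7"      # for Ankusa::CassandraStorage
# gem "hbaserb", ">= 0.0.3"    # or, for Ankusa::HBaseStorage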