ankusa 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  require 'ankusa/extensions'
2
2
  require 'ankusa/classifier'
3
+ require 'ankusa/naive_bayes'
4
+ require 'ankusa/kl_divergence'
3
5
  require 'ankusa/hasher'
4
- require 'ankusa/memory_storage'
5
- require 'ankusa/hbase_storage'
@@ -0,0 +1,194 @@
1
+ require 'cassandra/0.7'
2
+
3
+ #
4
+ # At the moment you'll have to do:
5
+ #
6
+ # create keyspace ankusa with replication_factor = 1
7
+ #
8
+ # from the cassandra-cli. This should be fixed with new release candidate for
9
+ # cassandra
10
+ #
11
module Ankusa

  #
  # Storage backend that keeps the classifier's counts in Cassandra.
  #
  # Two column families are used inside the configured keyspace:
  #
  #   classes - row key is a word, columns are classname => count
  #   totals  - row key is a class name, columns are "wordcount",
  #             "doc_count" and "vocabsize"
  #
  # Every count is written as a string and converted back to a number
  # when it is read.
  #
  class CassandraStorage
    attr_reader :cassandra

    #
    # Necessary to set max classes since current implementation of ruby
    # cassandra client doesn't support table scans. Using crufty get_range
    # method at the moment.
    #
    # host, port  - address of the cassandra node to connect to
    # keyspace    - keyspace holding the two column families; created by
    #               init_tables when it does not exist yet
    # max_classes - upper bound on the number of rows requested by each
    #               scan of the totals column family
    #
    def initialize(host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100)
      # Connect through the 'system' keyspace first; init_tables switches
      # the client over to the requested keyspace.
      @cassandra = Cassandra.new('system', "#{host}:#{port}")
      @klass_word_counts = {}
      @klass_doc_counts = {}
      @keyspace = keyspace
      @max_classes = max_classes
      init_tables
    end

    #
    # Fetch the names of the distinct classes for classification:
    # eg. :spam, :good, etc
    #
    # Returns an array of symbols, one per row key found in the totals
    # column family (at most max_classes of them).
    #
    def classnames
      scan_totals.map { |key_slice| key_slice.key.to_sym }
    end

    #
    # Drop everything that has been stored and recreate empty tables.
    #
    def reset
      drop_tables
      init_tables
    end

    #
    # Drop ankusa keyspace, reset internal caches
    #
    # FIXME: truncate doesn't work with cassandra-beta2
    #
    def drop_tables
      @cassandra.truncate!('classes')
      @cassandra.truncate!('totals')
      @cassandra.drop_keyspace(@keyspace)
      @klass_word_counts = {}
      @klass_doc_counts = {}
    end

    #
    # Create required keyspace and column families when the keyspace does
    # not exist yet, then point the client at that keyspace.
    #
    def init_tables
      unless @cassandra.keyspaces.include?(@keyspace)
        freq_table = Cassandra::ColumnFamily.new({:keyspace => @keyspace, :name => "classes"}) # word => {classname => count}
        summary_table = Cassandra::ColumnFamily.new({:keyspace => @keyspace, :name => "totals"}) # class => {wordcount => count}
        ks_def = Cassandra::Keyspace.new({
          :name => @keyspace,
          :strategy_class => 'org.apache.cassandra.locator.SimpleStrategy',
          :replication_factor => 1,
          :cf_defs => [freq_table, summary_table]
        })
        @cassandra.add_keyspace ks_def
      end
      @cassandra.keyspace = @keyspace
    end

    #
    # Fetch hash of word counts as a single row from cassandra.
    # Here column_name is the class and column value is the count
    #
    # Returns a hash of class symbol => count. Stored counts below zero
    # are reported as 0. An unknown word yields an empty hash.
    #
    def get_word_counts(word)
      # fetch all (class,count) pairs for a given word
      row = @cassandra.get(:classes, word.to_s)
      return row.to_hash if row.empty?
      row.inject({}) do |counts, col|
        counts[col.first.to_sym] = [col.last.to_f, 0].max
        counts
      end
    end

    #
    # Does a table 'scan' of summary table pulling out the 'vocabsize' column
    # from each row. Generates a hash of (class, vocab_size) key value pairs
    #
    def get_vocabulary_sizes
      get_summary "vocabsize"
    end

    #
    # Fetch total word count for a given class and cache it
    #
    # Returns the count as a float (0.0 when nothing is stored).
    #
    def get_total_word_count(klass)
      @klass_word_counts[klass] = @cassandra.get(:totals, klass.to_s, "wordcount").values.last.to_f
    end

    #
    # Fetch total documents for a given class and cache it
    #
    # Returns the count as a float (0.0 when nothing is stored).
    #
    def get_doc_count(klass)
      @klass_doc_counts[klass] = @cassandra.get(:totals, klass.to_s, "doc_count").values.last.to_f
    end

    #
    # Increment the count for a given (word,class) pair by 'count' (which
    # may be negative when unlearning) and keep the class' vocabulary size
    # in step. This is a read-modify-write sequence, not an atomic
    # increment.
    #
    # Returns the new count for the (word,class) pair.
    #
    def incr_word_count(klass, word, count)
      # Only wants strings
      klass = klass.to_s
      word = word.to_s

      prior_count = @cassandra.get(:classes, word, klass).values.last.to_i
      new_count = prior_count + count
      @cassandra.insert(:classes, word, {klass => new_count.to_s})

      if prior_count == 0 && count > 0
        # we've never seen this word before and we're not trying to unlearn it
        incr_total(klass, "vocabsize", 1)
      elsif prior_count > 0 && new_count == 0
        # we've seen this word before and it has now been fully unlearned.
        # Requiring a positive prior count keeps a zero increment on an
        # unseen word from shrinking the vocabulary.
        incr_total(klass, "vocabsize", -1)
      end
      new_count
    end

    #
    # Increment total word count for a given class by 'count'
    #
    # Returns the new total, which is also cached.
    #
    def incr_total_word_count(klass, count)
      klass = klass.to_s
      @klass_word_counts[klass.to_sym] = incr_total(klass, "wordcount", count)
    end

    #
    # Increment total document count for a given class by 'count'
    #
    # Returns the new total, which is also cached.
    #
    def incr_doc_count(klass, count)
      klass = klass.to_s
      @klass_doc_counts[klass.to_sym] = incr_total(klass, "doc_count", count)
    end

    #
    # Hash of (class, document count) pairs for every class in the totals
    # column family.
    #
    def doc_count_totals
      get_summary "doc_count"
    end

    #
    # Doesn't do anything
    #
    def close
    end

    protected

    #
    # Fetch up to max_classes rows from the summary table and pull the
    # column called 'name' out of each one. Returns a hash of
    # class symbol => float; a row without that column maps to 0.0.
    #
    def get_summary(name)
      counts = {}
      scan_totals.each do |key_slice|
        # keyslice is a clunky thrift object, map into a ruby hash
        row = key_slice.columns.inject({}) { |hsh, c| hsh[c.column.name] = c.column.value; hsh }
        counts[key_slice.key.to_sym] = row[name].to_f
      end
      counts
    end

    #
    # Range query over the totals column family, limited to max_classes rows.
    #
    def scan_totals
      @cassandra.get_range(:totals, {:start => '', :finish => '', :count => @max_classes})
    end

    #
    # Read the integer stored in 'column' of the totals row for 'klass'
    # (a string), add 'delta', write the result back as a string and
    # return it.
    #
    def incr_total(klass, column, delta)
      total = @cassandra.get(:totals, klass, column).values.last.to_i + delta
      @cassandra.insert(:totals, klass, {column => total.to_s})
      total
    end

  end

end
@@ -1,6 +1,6 @@
1
1
  module Ankusa
2
2
 
3
- class Classifier
3
+ module Classifier
4
4
  attr_reader :classnames
5
5
 
6
6
  def initialize(storage)
@@ -44,44 +44,6 @@ module Ankusa
44
44
  th
45
45
  end
46
46
 
47
- def classify(text, classes=nil)
48
- # return the most probable class
49
- log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
50
- end
51
-
52
- # Classes is an array of classes to look at
53
- def classifications(text, classnames=nil)
54
- result = log_likelihoods text, classnames
55
- result.keys.each { |k|
56
- result[k] = Math.exp result[k]
57
- }
58
-
59
- # normalize to get probs
60
- sum = result.values.inject { |x,y| x+y }
61
- result.keys.each { |k| result[k] = result[k] / sum }
62
- result
63
- end
64
-
65
- # Classes is an array of classes to look at
66
- def log_likelihoods(text, classnames=nil)
67
- classnames ||= @classnames
68
- result = Hash.new 0
69
-
70
- TextHash.new(text).each { |word, count|
71
- probs = get_word_probs(word, classnames)
72
- classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
73
- }
74
-
75
- # add the prior and exponentiate
76
- doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
77
- doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
78
- classnames.each { |k|
79
- result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
80
- }
81
-
82
- result
83
- end
84
-
85
47
  protected
86
48
  def get_word_probs(word, classnames)
87
49
  probs = Hash.new 0
@@ -0,0 +1,31 @@
1
module Ankusa

  #
  # Classifier that scores each class by a KL-divergence style distance
  # between the word distribution of the text and the word probabilities
  # of the class, and picks the class with the smallest distance.
  #
  class KLDivergenceClassifier
    include Classifier

    #
    # Return the class whose word distribution is the least distant from
    # that of the given text. 'classes' optionally restricts the candidates.
    #
    def classify(text, classes=nil)
      ranked = distances(text, classes).sort_by { |pair| pair[1] }
      ranked.first.first
    end

    #
    # Compute the distance of the text from each class.
    #
    # classnames is an array of classes to look at; defaults to every
    # class known to the classifier. Returns a hash of class => distance
    # whose default value is 0.
    #
    def distances(text, classnames=nil)
      classnames ||= @classnames
      totals = Hash.new 0
      hashed = TextHash.new(text)

      hashed.each do |word, count|
        # NOTE(review): the denominator is hashed.length -- presumably the
        # number of distinct words rather than the total word count; confirm
        # against TextHash that this is the intended normalisation.
        text_prob = count.to_f / hashed.length.to_f
        class_probs = get_word_probs(word, classnames)
        classnames.each do |k|
          totals[k] += (text_prob * Math.log(text_prob / class_probs[k]) * count)
        end
      end

      totals
    end
  end

end
@@ -0,0 +1,46 @@
1
module Ankusa

  #
  # Naive Bayes classifier: scores each class by the sum of the log
  # probabilities of the words in the text plus the log of a smoothed
  # class prior, and picks the class with the highest score.
  #
  class NaiveBayesClassifier
    include Classifier

    #
    # Return the most probable class for the text. 'classes' optionally
    # restricts the candidates.
    #
    def classify(text, classes=nil)
      log_likelihoods(text, classes).max_by { |pair| pair[1] }.first
    end

    #
    # Return a hash of class => probability, normalised to sum to 1.
    #
    # classnames is an array of classes to look at; defaults to every
    # class known to the classifier.
    #
    def classifications(text, classnames=nil)
      result = log_likelihoods text, classnames
      return result if result.empty?

      # Shift every log likelihood by the largest one before exponentiating.
      # The normalised probabilities are unchanged, but the best class maps
      # to exp(0) = 1, so the sum cannot underflow to zero on long texts.
      peak = result.values.max
      result.keys.each { |k| result[k] = Math.exp(result[k] - peak) }

      # normalize to get probs
      sum = result.values.inject(0.0) { |x, y| x + y }
      result.keys.each { |k| result[k] = result[k] / sum }
      result
    end

    #
    # Return a hash of class => log likelihood (word evidence plus prior).
    #
    # classnames is an array of classes to look at; defaults to every
    # class known to the classifier. The returned hash has a default
    # value of 0.
    #
    def log_likelihoods(text, classnames=nil)
      classnames ||= @classnames
      result = Hash.new 0

      TextHash.new(text).each { |word, count|
        probs = get_word_probs(word, classnames)
        classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
      }

      # add the log of the prior: (documents in class + 1) divided by
      # (documents in all considered classes + number of classes)
      doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
      # seed the sum so that having no matching document counts yields 0
      # rather than nil
      doc_count_total = (doc_counts.inject(0) { |x,y| x+y } + classnames.length).to_f
      classnames.each { |k|
        result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
      }

      result
    end

  end

end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ankusa
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
4
+ hash: 17
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 6
10
- version: 0.0.6
9
+ - 7
10
+ version: 0.0.7
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Muller
@@ -15,29 +15,13 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-06 00:00:00 -05:00
18
+ date: 2010-12-12 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
- - !ruby/object:Gem::Dependency
22
- name: hbaserb
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ">="
28
- - !ruby/object:Gem::Version
29
- hash: 25
30
- segments:
31
- - 0
32
- - 0
33
- - 3
34
- version: 0.0.3
35
- type: :runtime
36
- version_requirements: *id001
37
21
  - !ruby/object:Gem::Dependency
38
22
  name: fast-stemmer
39
23
  prerelease: false
40
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ requirement: &id001 !ruby/object:Gem::Requirement
41
25
  none: false
42
26
  requirements:
43
27
  - - ">="
@@ -49,8 +33,8 @@ dependencies:
49
33
  - 0
50
34
  version: 1.0.0
51
35
  type: :runtime
52
- version_requirements: *id002
53
- description: Text classifier with HBase storage
36
+ version_requirements: *id001
37
+ description: Text classifier with HBase or Cassandra storage
54
38
  email: brian.muller@livingsocial.com
55
39
  executables: []
56
40
 
@@ -59,28 +43,37 @@ extensions: []
59
43
  extra_rdoc_files: []
60
44
 
61
45
  files:
46
+ - lib/ankusa/cassandra_storage.rb
62
47
  - lib/ankusa/classifier.rb
63
48
  - lib/ankusa/extensions.rb
64
49
  - lib/ankusa/hasher.rb
65
50
  - lib/ankusa/hbase_storage.rb
51
+ - lib/ankusa/kl_divergence.rb
66
52
  - lib/ankusa/memory_storage.rb
53
+ - lib/ankusa/naive_bayes.rb
67
54
  - lib/ankusa/stopwords.rb
68
55
  - lib/ankusa.rb
69
56
  - LICENSE
70
57
  - Rakefile
71
58
  - README.rdoc
59
+ - docs/classes/Ankusa/CassandraStorage.html
72
60
  - docs/classes/Ankusa/Classifier.html
73
61
  - docs/classes/Ankusa/HBaseStorage.html
62
+ - docs/classes/Ankusa/KLDivergenceClassifier.html
74
63
  - docs/classes/Ankusa/MemoryStorage.html
64
+ - docs/classes/Ankusa/NaiveBayesClassifier.html
75
65
  - docs/classes/Ankusa/TextHash.html
76
66
  - docs/classes/Ankusa.html
77
67
  - docs/classes/String.html
78
68
  - docs/created.rid
69
+ - docs/files/lib/ankusa/cassandra_storage_rb.html
79
70
  - docs/files/lib/ankusa/classifier_rb.html
80
71
  - docs/files/lib/ankusa/extensions_rb.html
81
72
  - docs/files/lib/ankusa/hasher_rb.html
82
73
  - docs/files/lib/ankusa/hbase_storage_rb.html
74
+ - docs/files/lib/ankusa/kl_divergence_rb.html
83
75
  - docs/files/lib/ankusa/memory_storage_rb.html
76
+ - docs/files/lib/ankusa/naive_bayes_rb.html
84
77
  - docs/files/lib/ankusa/stopwords_rb.html
85
78
  - docs/files/lib/ankusa_rb.html
86
79
  - docs/files/README_rdoc.html
@@ -116,12 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
109
  segments:
117
110
  - 0
118
111
  version: "0"
119
- requirements: []
120
-
112
+ requirements:
113
+ - Either hbaserb >= 0.0.3 or cassandra >= 0.7
121
114
  rubyforge_project:
122
115
  rubygems_version: 1.3.7
123
116
  signing_key:
124
117
  specification_version: 3
125
- summary: Text classifier in Ruby that uses Hadoop's HBase for storage
118
+ summary: Text classifier in Ruby that uses Hadoop's HBase or Cassandra for storage
126
119
  test_files: []
127
120