ankusa 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/docs/created.rid CHANGED
@@ -1 +1 @@
1
- Fri, 03 Dec 2010 15:29:25 -0500
1
+ Mon, 06 Dec 2010 15:40:49 -0500
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Thu Dec 02 16:24:11 -0500 2010</td>
59
+ <td>Mon Dec 06 15:30:41 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -71,10 +71,16 @@
71
71
  <div id="description">
72
72
  <h1>ankusa</h1>
73
73
  <p>
74
- <a href="../classes/Ankusa.html">Ankusa</a> is a Naive Bayes classifier in
75
- Ruby that uses Hadoop&#8216;s HBase for storage. Because it uses HBase as a
74
+ <a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
75
+ that uses Hadoop&#8216;s HBase for storage. Because it uses HBase as a
76
76
  backend, the training corpus can be many terabytes in size.
77
77
  </p>
78
+ <p>
79
+ <a href="../classes/Ankusa.html">Ankusa</a> currently uses a Naive Bayes
80
+ classifier. It ignores common words (a.k.a, stop words) and stems all
81
+ others. Additionally, it uses Laplacian smoothing in the classification
82
+ method.
83
+ </p>
78
84
  <h2>Installation</h2>
79
85
  <p>
80
86
  First, install HBase / Hadoop. Make sure the HBase Thrift interface has
@@ -92,6 +98,8 @@ been started as well. Then:
92
98
  storage = Ankusa::HBaseStorage.new 'localhost'
93
99
  c = Ankusa::Classifier.new storage
94
100
 
101
+ # Each of these calls will return a bag-of-words
102
+ # hash with stemmed words as keys and counts as values
95
103
  c.train :spam, &quot;This is some spammy text&quot;
96
104
  c.train :good, &quot;This is not the bad stuff&quot;
97
105
 
@@ -102,6 +110,11 @@ been started as well. Then:
102
110
  # membership probability as values
103
111
  puts c.classifications &quot;This is some spammy text&quot;
104
112
 
113
+ # If you have a large corpus, the probabilities will
114
+ # likely all be 0. In that case, you must use log
115
+ # likelihood values
116
+ puts c.log_likelihoods &quot;This is some spammy text&quot;
117
+
105
118
  # get a list of all classes
106
119
  puts c.classes
107
120
 
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Dec 03 07:36:13 -0500 2010</td>
59
+ <td>Mon Dec 06 15:17:27 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Thu Dec 02 13:19:40 -0500 2010</td>
59
+ <td>Mon Dec 06 15:04:34 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Thu Dec 02 10:10:26 -0500 2010</td>
59
+ <td>Mon Dec 06 15:03:20 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -20,47 +20,53 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Methods</h1>
22
22
  <div id="index-entries">
23
- <a href="classes/Ankusa/TextHash.html#M000038">add_text (Ankusa::TextHash)</a><br />
24
- <a href="classes/Ankusa/TextHash.html#M000039">add_word (Ankusa::TextHash)</a><br />
25
- <a href="classes/Ankusa/TextHash.html#M000040">atomize (Ankusa::TextHash)</a><br />
23
+ <a href="classes/Ankusa/TextHash.html#M000044">add_text (Ankusa::TextHash)</a><br />
24
+ <a href="classes/Ankusa/TextHash.html#M000045">add_word (Ankusa::TextHash)</a><br />
25
+ <a href="classes/Ankusa/TextHash.html#M000046">atomize (Ankusa::TextHash)</a><br />
26
26
  <a href="classes/Ankusa/Classifier.html#M000007">classifications (Ankusa::Classifier)</a><br />
27
27
  <a href="classes/Ankusa/Classifier.html#M000006">classify (Ankusa::Classifier)</a><br />
28
- <a href="classes/Ankusa/HBaseStorage.html#M000023">classnames (Ankusa::HBaseStorage)</a><br />
29
- <a href="classes/Ankusa/MemoryStorage.html#M000010">classnames (Ankusa::MemoryStorage)</a><br />
30
- <a href="classes/Ankusa/HBaseStorage.html#M000034">close (Ankusa::HBaseStorage)</a><br />
31
- <a href="classes/Ankusa/MemoryStorage.html#M000021">close (Ankusa::MemoryStorage)</a><br />
32
- <a href="classes/Ankusa/MemoryStorage.html#M000020">doc_count_total (Ankusa::MemoryStorage)</a><br />
33
- <a href="classes/Ankusa/HBaseStorage.html#M000033">doc_count_total (Ankusa::HBaseStorage)</a><br />
34
- <a href="classes/Ankusa/MemoryStorage.html#M000012">drop_tables (Ankusa::MemoryStorage)</a><br />
35
- <a href="classes/Ankusa/HBaseStorage.html#M000025">drop_tables (Ankusa::HBaseStorage)</a><br />
36
- <a href="classes/Ankusa/HBaseStorage.html#M000036">freq_table (Ankusa::HBaseStorage)</a><br />
37
- <a href="classes/Ankusa/MemoryStorage.html#M000016">get_doc_count (Ankusa::MemoryStorage)</a><br />
38
- <a href="classes/Ankusa/HBaseStorage.html#M000029">get_doc_count (Ankusa::HBaseStorage)</a><br />
39
- <a href="classes/Ankusa/HBaseStorage.html#M000028">get_total_word_count (Ankusa::HBaseStorage)</a><br />
40
- <a href="classes/Ankusa/MemoryStorage.html#M000015">get_total_word_count (Ankusa::MemoryStorage)</a><br />
41
- <a href="classes/Ankusa/MemoryStorage.html#M000014">get_word_counts (Ankusa::MemoryStorage)</a><br />
42
- <a href="classes/Ankusa/HBaseStorage.html#M000027">get_word_counts (Ankusa::HBaseStorage)</a><br />
43
- <a href="classes/Ankusa/Classifier.html#M000008">get_word_probs (Ankusa::Classifier)</a><br />
44
- <a href="classes/Ankusa/MemoryStorage.html#M000019">incr_doc_count (Ankusa::MemoryStorage)</a><br />
45
- <a href="classes/Ankusa/HBaseStorage.html#M000032">incr_doc_count (Ankusa::HBaseStorage)</a><br />
46
- <a href="classes/Ankusa/HBaseStorage.html#M000031">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
47
- <a href="classes/Ankusa/MemoryStorage.html#M000018">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
48
- <a href="classes/Ankusa/HBaseStorage.html#M000030">incr_word_count (Ankusa::HBaseStorage)</a><br />
49
- <a href="classes/Ankusa/MemoryStorage.html#M000017">incr_word_count (Ankusa::MemoryStorage)</a><br />
50
- <a href="classes/Ankusa/HBaseStorage.html#M000026">init_tables (Ankusa::HBaseStorage)</a><br />
51
- <a href="classes/Ankusa/MemoryStorage.html#M000013">init_tables (Ankusa::MemoryStorage)</a><br />
52
- <a href="classes/Ankusa/TextHash.html#M000037">new (Ankusa::TextHash)</a><br />
28
+ <a href="classes/Ankusa/HBaseStorage.html#M000027">classnames (Ankusa::HBaseStorage)</a><br />
29
+ <a href="classes/Ankusa/MemoryStorage.html#M000013">classnames (Ankusa::MemoryStorage)</a><br />
30
+ <a href="classes/Ankusa/HBaseStorage.html#M000039">close (Ankusa::HBaseStorage)</a><br />
31
+ <a href="classes/Ankusa/MemoryStorage.html#M000025">close (Ankusa::MemoryStorage)</a><br />
32
+ <a href="classes/Ankusa/Classifier.html#M000010">doc_count_totals (Ankusa::Classifier)</a><br />
33
+ <a href="classes/Ankusa/MemoryStorage.html#M000024">doc_count_totals (Ankusa::MemoryStorage)</a><br />
34
+ <a href="classes/Ankusa/HBaseStorage.html#M000038">doc_count_totals (Ankusa::HBaseStorage)</a><br />
35
+ <a href="classes/Ankusa/HBaseStorage.html#M000029">drop_tables (Ankusa::HBaseStorage)</a><br />
36
+ <a href="classes/Ankusa/MemoryStorage.html#M000015">drop_tables (Ankusa::MemoryStorage)</a><br />
37
+ <a href="classes/Ankusa/HBaseStorage.html#M000042">freq_table (Ankusa::HBaseStorage)</a><br />
38
+ <a href="classes/Ankusa/MemoryStorage.html#M000020">get_doc_count (Ankusa::MemoryStorage)</a><br />
39
+ <a href="classes/Ankusa/HBaseStorage.html#M000034">get_doc_count (Ankusa::HBaseStorage)</a><br />
40
+ <a href="classes/Ankusa/HBaseStorage.html#M000040">get_summary (Ankusa::HBaseStorage)</a><br />
41
+ <a href="classes/Ankusa/MemoryStorage.html#M000019">get_total_word_count (Ankusa::MemoryStorage)</a><br />
42
+ <a href="classes/Ankusa/HBaseStorage.html#M000033">get_total_word_count (Ankusa::HBaseStorage)</a><br />
43
+ <a href="classes/Ankusa/HBaseStorage.html#M000032">get_vocabulary_sizes (Ankusa::HBaseStorage)</a><br />
44
+ <a href="classes/Ankusa/MemoryStorage.html#M000017">get_vocabulary_sizes (Ankusa::MemoryStorage)</a><br />
45
+ <a href="classes/Ankusa/HBaseStorage.html#M000031">get_word_counts (Ankusa::HBaseStorage)</a><br />
46
+ <a href="classes/Ankusa/MemoryStorage.html#M000018">get_word_counts (Ankusa::MemoryStorage)</a><br />
47
+ <a href="classes/Ankusa/Classifier.html#M000009">get_word_probs (Ankusa::Classifier)</a><br />
48
+ <a href="classes/Ankusa/MemoryStorage.html#M000023">incr_doc_count (Ankusa::MemoryStorage)</a><br />
49
+ <a href="classes/Ankusa/HBaseStorage.html#M000037">incr_doc_count (Ankusa::HBaseStorage)</a><br />
50
+ <a href="classes/Ankusa/MemoryStorage.html#M000022">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
51
+ <a href="classes/Ankusa/HBaseStorage.html#M000036">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
52
+ <a href="classes/Ankusa/MemoryStorage.html#M000021">incr_word_count (Ankusa::MemoryStorage)</a><br />
53
+ <a href="classes/Ankusa/HBaseStorage.html#M000035">incr_word_count (Ankusa::HBaseStorage)</a><br />
54
+ <a href="classes/Ankusa/MemoryStorage.html#M000016">init_tables (Ankusa::MemoryStorage)</a><br />
55
+ <a href="classes/Ankusa/HBaseStorage.html#M000030">init_tables (Ankusa::HBaseStorage)</a><br />
56
+ <a href="classes/Ankusa/Classifier.html#M000008">log_likelihoods (Ankusa::Classifier)</a><br />
57
+ <a href="classes/Ankusa/HBaseStorage.html#M000026">new (Ankusa::HBaseStorage)</a><br />
53
58
  <a href="classes/Ankusa/Classifier.html#M000003">new (Ankusa::Classifier)</a><br />
54
- <a href="classes/Ankusa/MemoryStorage.html#M000009">new (Ankusa::MemoryStorage)</a><br />
55
- <a href="classes/Ankusa/HBaseStorage.html#M000022">new (Ankusa::HBaseStorage)</a><br />
59
+ <a href="classes/Ankusa/TextHash.html#M000043">new (Ankusa::TextHash)</a><br />
60
+ <a href="classes/Ankusa/MemoryStorage.html#M000012">new (Ankusa::MemoryStorage)</a><br />
56
61
  <a href="classes/String.html#M000001">numeric? (String)</a><br />
57
- <a href="classes/Ankusa/MemoryStorage.html#M000011">reset (Ankusa::MemoryStorage)</a><br />
58
- <a href="classes/Ankusa/HBaseStorage.html#M000024">reset (Ankusa::HBaseStorage)</a><br />
59
- <a href="classes/Ankusa/HBaseStorage.html#M000035">summary_table (Ankusa::HBaseStorage)</a><br />
62
+ <a href="classes/Ankusa/HBaseStorage.html#M000028">reset (Ankusa::HBaseStorage)</a><br />
63
+ <a href="classes/Ankusa/MemoryStorage.html#M000014">reset (Ankusa::MemoryStorage)</a><br />
64
+ <a href="classes/Ankusa/HBaseStorage.html#M000041">summary_table (Ankusa::HBaseStorage)</a><br />
60
65
  <a href="classes/String.html#M000002">to_ascii (String)</a><br />
61
66
  <a href="classes/Ankusa/Classifier.html#M000004">train (Ankusa::Classifier)</a><br />
62
67
  <a href="classes/Ankusa/Classifier.html#M000005">untrain (Ankusa::Classifier)</a><br />
63
- <a href="classes/Ankusa/TextHash.html#M000041">valid_word? (Ankusa::TextHash)</a><br />
68
+ <a href="classes/Ankusa/TextHash.html#M000047">valid_word? (Ankusa::TextHash)</a><br />
69
+ <a href="classes/Ankusa/Classifier.html#M000011">vocab_sizes (Ankusa::Classifier)</a><br />
64
70
  </div>
65
71
  </div>
66
72
  </body>
@@ -21,6 +21,9 @@ module Ankusa
21
21
  doccount = (text.kind_of? Array) ? text.length : 1
22
22
  @storage.incr_doc_count klass, doccount
23
23
  @classnames << klass if not @classnames.include? klass
24
+ # cache of these vars is now dirty
25
+ @doc_count_totals = nil
26
+ @vocab_sizes = nil
24
27
  th
25
28
  end
26
29
 
@@ -35,44 +38,70 @@ module Ankusa
35
38
  @storage.incr_total_word_count klass, -th.word_count
36
39
  doccount = (text.kind_of? Array) ? text.length : 1
37
40
  @storage.incr_doc_count klass, -doccount
41
+ # cache of these vars is now dirty
42
+ @doc_count_totals = nil
43
+ @vocab_sizes = nil
38
44
  th
39
45
  end
40
46
 
41
- def classify(text)
47
+ def classify(text, classes=nil)
42
48
  # return the most probable class
43
- classifications(text).sort_by { |c| -c[1] }.first.first
49
+ log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
44
50
  end
45
51
 
46
- def classifications(text)
52
+ # Classes is an array of classes to look at
53
+ def classifications(text, classnames=nil)
54
+ result = log_likelihoods text, classnames
55
+ result.keys.each { |k|
56
+ result[k] = Math.exp result[k]
57
+ }
58
+
59
+ # normalize to get probs
60
+ sum = result.values.inject { |x,y| x+y }
61
+ result.keys.each { |k| result[k] = result[k] / sum }
62
+ result
63
+ end
64
+
65
+ # Classes is an array of classes to look at
66
+ def log_likelihoods(text, classnames=nil)
67
+ classnames ||= @classnames
47
68
  result = Hash.new 0
48
69
 
49
70
  TextHash.new(text).each { |word, count|
50
- probs = get_word_probs(word)
51
- @classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
71
+ probs = get_word_probs(word, classnames)
72
+ classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
52
73
  }
53
74
 
54
75
  # add the prior and exponentiate
55
- @classnames.each { |k|
56
- result[k] += Math.log(@storage.get_doc_count(k).to_f / @storage.doc_count_total.to_f)
57
- result[k] = Math.exp(result[k])
76
+ doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
77
+ doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
78
+ classnames.each { |k|
79
+ result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
58
80
  }
59
81
 
60
- # normalize to get probs
61
- sum = result.values.inject { |x,y| x+y }
62
- @classnames.each { |k| result[k] = result[k] / sum }
63
82
  result
64
83
  end
65
84
 
66
85
  protected
67
- def get_word_probs(word)
68
- probs = @storage.get_word_counts(word)
69
- @classnames.each { |cn|
86
+ def get_word_probs(word, classnames)
87
+ probs = Hash.new 0
88
+ @storage.get_word_counts(word).each { |k,v| probs[k] = v if classnames.include? k }
89
+ vs = vocab_sizes
90
+ classnames.each { |cn|
70
91
  # use a laplacian smoother
71
- probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + 1).to_f
92
+ probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + vs[cn]).to_f
72
93
  }
73
94
  probs
74
95
  end
75
96
 
97
+ def doc_count_totals
98
+ @doc_count_totals ||= @storage.doc_count_totals
99
+ end
100
+
101
+ def vocab_sizes
102
+ @vocab_sizes ||= @storage.get_vocabulary_sizes
103
+ end
104
+
76
105
  end
77
106
 
78
107
  end
@@ -53,12 +53,17 @@ module Ankusa
53
53
 
54
54
  row.first.columns.each { |colname, cell|
55
55
  classname = colname.split(':')[1].intern
56
- counts[classname] = cell.to_i64.to_f
56
+ # in case untrain has been called too many times
57
+ counts[classname] = [cell.to_i64.to_f, 0].max
57
58
  }
58
59
 
59
60
  counts
60
61
  end
61
62
 
63
+ def get_vocabulary_sizes
64
+ get_summary "totals:vocabsize"
65
+ end
66
+
62
67
  def get_total_word_count(klass)
63
68
  @klass_word_counts.fetch(klass) {
64
69
  @klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
@@ -72,7 +77,15 @@ module Ankusa
72
77
  end
73
78
 
74
79
  def incr_word_count(klass, word, count)
75
- freq_table.atomic_increment word, "classes:#{klass.to_s}", count
80
+ size = freq_table.atomic_increment word, "classes:#{klass.to_s}", count
81
+ # if this is a new word, increase the klass's vocab size. If the new word
82
+ # count is 0, then we need to decrement our vocab size
83
+ if size == count
84
+ summary_table.atomic_increment klass, "totals:vocabsize"
85
+ elsif size == 0
86
+ summary_table.atomic_increment klass, "totals:vocabsize", -1
87
+ end
88
+ size
76
89
  end
77
90
 
78
91
  def incr_total_word_count(klass, count)
@@ -83,12 +96,8 @@ module Ankusa
83
96
  @klass_doc_counts[klass] = summary_table.atomic_increment klass, "totals:doccount", count
84
97
  end
85
98
 
86
- def doc_count_total
87
- total = 0
88
- summary_table.create_scanner("", "totals:doccount") { |row|
89
- total += row.columns["totals:doccount"].to_i64
90
- }
91
- total
99
+ def doc_count_totals
100
+ get_summary "totals:doccount"
92
101
  end
93
102
 
94
103
  def close
@@ -96,6 +105,14 @@ module Ankusa
96
105
  end
97
106
 
98
107
  protected
108
+ def get_summary(name)
109
+ counts = Hash.new 0
110
+ summary_table.create_scanner("", name) { |row|
111
+ counts[row.row.intern] = row.columns[name].to_i64
112
+ }
113
+ counts
114
+ end
115
+
99
116
  def summary_table
100
117
  @stable ||= @hbase.get_table @stablename
101
118
  end
@@ -24,6 +24,14 @@ module Ankusa
24
24
  @klass_doc_counts = {}
25
25
  end
26
26
 
27
+ def get_vocabulary_sizes
28
+ count = Hash.new 0
29
+ @freqs.each { |w, ks|
30
+ ks.keys.each { |k| count[k] += 1 }
31
+ }
32
+ count
33
+ end
34
+
27
35
  def get_word_counts(word)
28
36
  @freqs.fetch word, Hash.new(0)
29
37
  end
@@ -49,8 +57,8 @@ module Ankusa
49
57
  @total_doc_counts[klass] += count
50
58
  end
51
59
 
52
- def doc_count_total
53
- @total_doc_counts.values.inject { |x,y| x+y }
60
+ def doc_count_totals
61
+ @total_doc_counts
54
62
  end
55
63
 
56
64
  def close
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ankusa
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 5
10
- version: 0.0.5
9
+ - 6
10
+ version: 0.0.6
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Muller
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-12-03 00:00:00 -05:00
18
+ date: 2010-12-06 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -50,7 +50,7 @@ dependencies:
50
50
  version: 1.0.0
51
51
  type: :runtime
52
52
  version_requirements: *id002
53
- description: Naive Bayes classifier with HBase storage
53
+ description: Text classifier with HBase storage
54
54
  email: brian.muller@livingsocial.com
55
55
  executables: []
56
56
 
@@ -122,6 +122,6 @@ rubyforge_project:
122
122
  rubygems_version: 1.3.7
123
123
  signing_key:
124
124
  specification_version: 3
125
- summary: Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage
125
+ summary: Text classifier in Ruby that uses Hadoop's HBase for storage
126
126
  test_files: []
127
127