ankusa 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Tue Nov 30 12:15:06 -0500 2010</td>
59
+ <td>Thu Dec 02 07:40:55 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -73,9 +73,11 @@
73
73
  <h3 class="section-bar">Required files</h3>
74
74
 
75
75
  <div class="name-list">
76
+ ankusa/extensions&nbsp;&nbsp;
76
77
  ankusa/classifier&nbsp;&nbsp;
77
78
  ankusa/hasher&nbsp;&nbsp;
78
- ankusa/nbclass&nbsp;&nbsp;
79
+ ankusa/memory_storage&nbsp;&nbsp;
80
+ ankusa/hbase_storage&nbsp;&nbsp;
79
81
  </div>
80
82
  </div>
81
83
 
@@ -22,8 +22,10 @@
22
22
  <div id="index-entries">
23
23
  <a href="classes/Ankusa.html">Ankusa</a><br />
24
24
  <a href="classes/Ankusa/Classifier.html">Ankusa::Classifier</a><br />
25
- <a href="classes/Ankusa/NBClass.html">Ankusa::NBClass</a><br />
25
+ <a href="classes/Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a><br />
26
+ <a href="classes/Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a><br />
26
27
  <a href="classes/Ankusa/TextHash.html">Ankusa::TextHash</a><br />
28
+ <a href="classes/String.html">String</a><br />
27
29
  </div>
28
30
  </div>
29
31
  </body>
@@ -23,8 +23,10 @@
23
23
  <a href="files/README_rdoc.html">README.rdoc</a><br />
24
24
  <a href="files/lib/ankusa_rb.html">lib/ankusa.rb</a><br />
25
25
  <a href="files/lib/ankusa/classifier_rb.html">lib/ankusa/classifier.rb</a><br />
26
+ <a href="files/lib/ankusa/extensions_rb.html">lib/ankusa/extensions.rb</a><br />
26
27
  <a href="files/lib/ankusa/hasher_rb.html">lib/ankusa/hasher.rb</a><br />
27
- <a href="files/lib/ankusa/nbclass_rb.html">lib/ankusa/nbclass.rb</a><br />
28
+ <a href="files/lib/ankusa/hbase_storage_rb.html">lib/ankusa/hbase_storage.rb</a><br />
29
+ <a href="files/lib/ankusa/memory_storage_rb.html">lib/ankusa/memory_storage.rb</a><br />
28
30
  <a href="files/lib/ankusa/stopwords_rb.html">lib/ankusa/stopwords.rb</a><br />
29
31
  </div>
30
32
  </div>
@@ -20,23 +20,47 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Methods</h1>
22
22
  <div id="index-entries">
23
- <a href="classes/Ankusa/TextHash.html#M000016">add_text (Ankusa::TextHash)</a><br />
24
- <a href="classes/Ankusa/TextHash.html#M000017">add_word (Ankusa::TextHash)</a><br />
25
- <a href="classes/Ankusa/Classifier.html#M000005">classifications (Ankusa::Classifier)</a><br />
26
- <a href="classes/Ankusa/Classifier.html#M000004">classify (Ankusa::Classifier)</a><br />
27
- <a href="classes/Ankusa/Classifier.html#M000009">doc_count_total (Ankusa::Classifier)</a><br />
28
- <a href="classes/Ankusa/Classifier.html#M000007">drop_tables (Ankusa::Classifier)</a><br />
29
- <a href="classes/Ankusa/Classifier.html#M000013">freq_table (Ankusa::Classifier)</a><br />
30
- <a href="classes/Ankusa/Classifier.html#M000010">get_word_probs (Ankusa::Classifier)</a><br />
31
- <a href="classes/Ankusa/Classifier.html#M000011">init_tables (Ankusa::Classifier)</a><br />
32
- <a href="classes/Ankusa/NBClass.html#M000014">new (Ankusa::NBClass)</a><br />
33
- <a href="classes/Ankusa/TextHash.html#M000015">new (Ankusa::TextHash)</a><br />
34
- <a href="classes/Ankusa/Classifier.html#M000001">new (Ankusa::Classifier)</a><br />
35
- <a href="classes/Ankusa/Classifier.html#M000006">refresh_classnames (Ankusa::Classifier)</a><br />
36
- <a href="classes/Ankusa/Classifier.html#M000008">reset (Ankusa::Classifier)</a><br />
37
- <a href="classes/Ankusa/Classifier.html#M000012">summary_table (Ankusa::Classifier)</a><br />
38
- <a href="classes/Ankusa/Classifier.html#M000002">train (Ankusa::Classifier)</a><br />
39
- <a href="classes/Ankusa/Classifier.html#M000003">untrain (Ankusa::Classifier)</a><br />
23
+ <a href="classes/Ankusa/TextHash.html#M000038">add_text (Ankusa::TextHash)</a><br />
24
+ <a href="classes/Ankusa/TextHash.html#M000039">add_word (Ankusa::TextHash)</a><br />
25
+ <a href="classes/Ankusa/TextHash.html#M000040">atomize (Ankusa::TextHash)</a><br />
26
+ <a href="classes/Ankusa/Classifier.html#M000007">classifications (Ankusa::Classifier)</a><br />
27
+ <a href="classes/Ankusa/Classifier.html#M000006">classify (Ankusa::Classifier)</a><br />
28
+ <a href="classes/Ankusa/HBaseStorage.html#M000023">classnames (Ankusa::HBaseStorage)</a><br />
29
+ <a href="classes/Ankusa/MemoryStorage.html#M000010">classnames (Ankusa::MemoryStorage)</a><br />
30
+ <a href="classes/Ankusa/HBaseStorage.html#M000034">close (Ankusa::HBaseStorage)</a><br />
31
+ <a href="classes/Ankusa/MemoryStorage.html#M000021">close (Ankusa::MemoryStorage)</a><br />
32
+ <a href="classes/Ankusa/MemoryStorage.html#M000020">doc_count_total (Ankusa::MemoryStorage)</a><br />
33
+ <a href="classes/Ankusa/HBaseStorage.html#M000033">doc_count_total (Ankusa::HBaseStorage)</a><br />
34
+ <a href="classes/Ankusa/MemoryStorage.html#M000012">drop_tables (Ankusa::MemoryStorage)</a><br />
35
+ <a href="classes/Ankusa/HBaseStorage.html#M000025">drop_tables (Ankusa::HBaseStorage)</a><br />
36
+ <a href="classes/Ankusa/HBaseStorage.html#M000036">freq_table (Ankusa::HBaseStorage)</a><br />
37
+ <a href="classes/Ankusa/MemoryStorage.html#M000016">get_doc_count (Ankusa::MemoryStorage)</a><br />
38
+ <a href="classes/Ankusa/HBaseStorage.html#M000029">get_doc_count (Ankusa::HBaseStorage)</a><br />
39
+ <a href="classes/Ankusa/HBaseStorage.html#M000028">get_total_word_count (Ankusa::HBaseStorage)</a><br />
40
+ <a href="classes/Ankusa/MemoryStorage.html#M000015">get_total_word_count (Ankusa::MemoryStorage)</a><br />
41
+ <a href="classes/Ankusa/MemoryStorage.html#M000014">get_word_counts (Ankusa::MemoryStorage)</a><br />
42
+ <a href="classes/Ankusa/HBaseStorage.html#M000027">get_word_counts (Ankusa::HBaseStorage)</a><br />
43
+ <a href="classes/Ankusa/Classifier.html#M000008">get_word_probs (Ankusa::Classifier)</a><br />
44
+ <a href="classes/Ankusa/MemoryStorage.html#M000019">incr_doc_count (Ankusa::MemoryStorage)</a><br />
45
+ <a href="classes/Ankusa/HBaseStorage.html#M000032">incr_doc_count (Ankusa::HBaseStorage)</a><br />
46
+ <a href="classes/Ankusa/HBaseStorage.html#M000031">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
47
+ <a href="classes/Ankusa/MemoryStorage.html#M000018">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
48
+ <a href="classes/Ankusa/HBaseStorage.html#M000030">incr_word_count (Ankusa::HBaseStorage)</a><br />
49
+ <a href="classes/Ankusa/MemoryStorage.html#M000017">incr_word_count (Ankusa::MemoryStorage)</a><br />
50
+ <a href="classes/Ankusa/HBaseStorage.html#M000026">init_tables (Ankusa::HBaseStorage)</a><br />
51
+ <a href="classes/Ankusa/MemoryStorage.html#M000013">init_tables (Ankusa::MemoryStorage)</a><br />
52
+ <a href="classes/Ankusa/TextHash.html#M000037">new (Ankusa::TextHash)</a><br />
53
+ <a href="classes/Ankusa/Classifier.html#M000003">new (Ankusa::Classifier)</a><br />
54
+ <a href="classes/Ankusa/MemoryStorage.html#M000009">new (Ankusa::MemoryStorage)</a><br />
55
+ <a href="classes/Ankusa/HBaseStorage.html#M000022">new (Ankusa::HBaseStorage)</a><br />
56
+ <a href="classes/String.html#M000001">numeric? (String)</a><br />
57
+ <a href="classes/Ankusa/MemoryStorage.html#M000011">reset (Ankusa::MemoryStorage)</a><br />
58
+ <a href="classes/Ankusa/HBaseStorage.html#M000024">reset (Ankusa::HBaseStorage)</a><br />
59
+ <a href="classes/Ankusa/HBaseStorage.html#M000035">summary_table (Ankusa::HBaseStorage)</a><br />
60
+ <a href="classes/String.html#M000002">to_ascii (String)</a><br />
61
+ <a href="classes/Ankusa/Classifier.html#M000004">train (Ankusa::Classifier)</a><br />
62
+ <a href="classes/Ankusa/Classifier.html#M000005">untrain (Ankusa::Classifier)</a><br />
63
+ <a href="classes/Ankusa/TextHash.html#M000041">valid_word? (Ankusa::TextHash)</a><br />
40
64
  </div>
41
65
  </div>
42
66
  </body>
@@ -1,3 +1,5 @@
1
+ require 'ankusa/extensions'
1
2
  require 'ankusa/classifier'
2
3
  require 'ankusa/hasher'
3
- require 'ankusa/nbclass'
4
+ require 'ankusa/memory_storage'
5
+ require 'ankusa/hbase_storage'
@@ -1,125 +1,76 @@
1
1
  module Ankusa
2
- SMALL_PROB = 0.0001
3
2
 
4
3
  class Classifier
5
4
  attr_reader :classnames
6
5
 
7
- def initialize(hbase_client, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
8
- @hbase = hbase_client
9
- @ftablename = frequency_tablename
10
- @stablename = summary_tablename
11
- init_tables
12
- @classnames = refresh_classnames
6
+ def initialize(storage)
7
+ @storage = storage
8
+ @storage.init_tables
9
+ @classnames = @storage.classnames
13
10
  end
14
-
11
+
12
+ # text can be either an array of strings or a string
13
+ # klass is a symbol
15
14
  def train(klass, text)
16
15
  th = TextHash.new(text)
17
16
  th.each { |word, count|
18
- freq_table.atomic_increment word, "classes:#{klass.to_s}", count
17
+ @storage.incr_word_count klass, word, count
18
+ yield word, count if block_given?
19
19
  }
20
- summary_table.atomic_increment klass, "totals:wordcount", th.word_count
21
- summary_table.atomic_increment klass, "totals:doccount"
20
+ @storage.incr_total_word_count klass, th.word_count
21
+ doccount = (text.kind_of? Array) ? text.length : 1
22
+ @storage.incr_doc_count klass, doccount
22
23
  @classnames << klass if not @classnames.include? klass
23
24
  end
24
25
 
26
+ # text can be either an array of strings or a string
27
+ # klass is a symbol
25
28
  def untrain(klass, text)
26
29
  th = TextHash.new(text)
27
30
  th.each { |word, count|
28
- freq_table.atomic_increment word, "classes:#{klass.to_s}", -count
31
+ @storage.incr_word_count klass, word, -count
32
+ yield word, count if block_given?
29
33
  }
30
- summary_table.atomic_increment klass, "totals:wordcount", -th.word_count
31
- summary_table.atomic_increment klass, "totals:doccount", -1
34
+ @storage.incr_total_word_count klass, -th.word_count
35
+ doccount = (text.kind_of? Array) ? text.length : 1
36
+ @storage.incr_doc_count klass, -doccount
32
37
  end
33
38
 
34
39
  def classify(text)
35
40
  # return the most probable class
36
- classifications(text).sort { |o,t| o[1] <=> t[1] }.first.first
41
+ classifications(text).sort_by { |c| -c[1] }.first.first
37
42
  end
38
43
 
39
44
  def classifications(text)
40
- classes = {}
41
- result = {}
42
- @classnames.each { |k|
43
- classes[k] = NBClass.new k, summary_table, freq_table
44
- result[k] = 0
45
- }
45
+ result = Hash.new 0
46
46
 
47
- TextHash.new(text).each { |word,count|
48
- probs = get_word_probs(word, classes)
49
- @classnames.each { |k| result[k] += Math.log(probs[k]) }
47
+ TextHash.new(text).each { |word, count|
48
+ probs = get_word_probs(word)
49
+ @classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
50
50
  }
51
-
52
- @classnames.each { |k| result[k] += Math.log(classes[k].doc_count / doc_count_total) }
53
51
 
54
- result.keys.each { |k| result[k] = Math.exp(result[k]) }
55
- sum = result.values.inject { |x,y| x+y }
56
- result.keys.each { |klass|
57
- result[klass] = result[klass] / sum
52
+ # add the prior and exponentiate
53
+ @classnames.each { |k|
54
+ result[k] += Math.log(@storage.get_doc_count(k).to_f / @storage.doc_count_total.to_f)
55
+ result[k] = Math.exp(result[k])
58
56
  }
59
-
57
+
58
+ # normalize to get probs
59
+ sum = result.values.inject { |x,y| x+y }
60
+ @classnames.each { |k| result[k] = result[k] / sum }
60
61
  result
61
62
  end
62
63
 
63
- # get all classes
64
- def refresh_classnames
65
- cs = []
66
- summary_table.create_scanner("", "totals") { |row|
67
- cs << row.row.intern
68
- }
69
- cs
70
- end
71
-
72
- def drop_tables
73
- freq_table.delete
74
- summary_table.delete
75
- @stable = nil
76
- @ftable = nil
77
- end
78
-
79
- def reset
80
- drop_tables
81
- init_tables
82
- end
83
-
84
- def doc_count_total
85
- total = 0
86
- summary_table.create_scanner("", "totals:doccount") { |row|
87
- total += row.columns["totals:doccount"].to_i64
88
- }
89
- total
90
- end
91
-
92
64
  protected
93
- def get_word_probs(word, classes)
94
- probs = {}
95
- @classnames.each { |cn| probs[cn] = Ankusa::SMALL_PROB / classes[cn].word_count }
96
- row = freq_table.get_row(word)
97
- return probs if row.length == 0
98
-
99
- row.first.columns.each { |colname, cell|
100
- classname = colname.split(':')[1].intern
101
- probs[classname] = cell.to_i64.to_f / classes[classname].word_count
65
+ def get_word_probs(word)
66
+ probs = @storage.get_word_counts(word)
67
+ @classnames.each { |cn|
68
+ # use a laplacian smoother
69
+ probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + 1).to_f
102
70
  }
103
71
  probs
104
72
  end
105
73
 
106
- def init_tables
107
- if not @hbase.has_table? @ftablename
108
- @hbase.create_table @ftablename, "classes", "total"
109
- end
110
-
111
- if not @hbase.has_table? @stablename
112
- @hbase.create_table @stablename, "totals"
113
- end
114
- end
115
-
116
- def summary_table
117
- @stable ||= @hbase.get_table @stablename
118
- end
119
-
120
- def freq_table
121
- @ftable ||= @hbase.get_table @ftablename
122
- end
123
74
  end
124
75
 
125
76
  end
@@ -0,0 +1,13 @@
1
+ require 'iconv'
2
+
3
+ class String
4
+ def numeric?
5
+ true if Float(self) rescue false
6
+ end
7
+
8
+ def to_ascii
9
+ # from http://www.jroller.com/obie/tags/unicode
10
+ converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
11
+ converter.iconv(self).unpack('U*').select { |cp| cp < 127 }.pack('U*')
12
+ end
13
+ end
@@ -13,21 +13,35 @@ module Ankusa
13
13
  end
14
14
 
15
15
  def add_text(text)
16
- # replace dashes with spaces, then get rid of non-word/non-space characters,
17
- # then split by space to get words
18
- words = text.tr('-', ' ').gsub(/[^\w\s]/,"").split
19
- words.each { |word| add_word word }
16
+ if text.kind_of? Array
17
+ text.each { |t| add_text t }
18
+ else
19
+ # replace dashes with spaces, then get rid of non-word/non-space characters,
20
+ # then split by space to get words
21
+ words = TextHash.atomize text
22
+ words.each { |word| add_word(word) if TextHash.valid_word?(word) }
23
+ end
20
24
  self
21
25
  end
22
26
 
23
27
  def add_word(word)
24
- word = word.downcase
25
- if not Ankusa::STOPWORDS.include? word
26
- @word_count += 1
27
- key = word.stem.intern
28
- store key, fetch(key, 0)+1
29
- end
28
+ @word_count += 1
29
+ key = word.stem.intern
30
+ store key, fetch(key, 0)+1
31
+ end
32
+
33
+ def self.atomize(text)
34
+ text.to_ascii.tr('-', ' ').gsub(/[^\w\s]/," ").split.map { |w| w.downcase }
30
35
  end
36
+
37
+ # word should be only alphanum chars at this point
38
+ def self.valid_word?(word)
39
+ return false if Ankusa::STOPWORDS.include? word
40
+ return false if word.length < 3
41
+ return false if word.numeric?
42
+ true
43
+ end
44
+
31
45
  end
32
46
 
33
47
  end
@@ -0,0 +1,109 @@
1
+ require 'hbaserb'
2
+
3
+ module Ankusa
4
+
5
+ class HBaseStorage
6
+ attr_reader :hbase
7
+
8
+ def initialize(host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
9
+ @hbase = HBaseRb::Client.new host, port
10
+ @ftablename = frequency_tablename
11
+ @stablename = summary_tablename
12
+ @klass_word_counts = {}
13
+ @klass_doc_counts = {}
14
+ init_tables
15
+ end
16
+
17
+ def classnames
18
+ cs = []
19
+ summary_table.create_scanner("", "totals") { |row|
20
+ cs << row.row.intern
21
+ }
22
+ cs
23
+ end
24
+
25
+ def reset
26
+ drop_tables
27
+ init_tables
28
+ end
29
+
30
+ def drop_tables
31
+ freq_table.delete
32
+ summary_table.delete
33
+ @stable = nil
34
+ @ftable = nil
35
+ @klass_word_counts = {}
36
+ @klass_doc_counts = {}
37
+ end
38
+
39
+ def init_tables
40
+ if not @hbase.has_table? @ftablename
41
+ @hbase.create_table @ftablename, "classes", "total"
42
+ end
43
+
44
+ if not @hbase.has_table? @stablename
45
+ @hbase.create_table @stablename, "totals"
46
+ end
47
+ end
48
+
49
+ def get_word_counts(word)
50
+ counts = Hash.new(0)
51
+ row = freq_table.get_row(word)
52
+ return counts if row.length == 0
53
+
54
+ row.first.columns.each { |colname, cell|
55
+ classname = colname.split(':')[1].intern
56
+ counts[classname] = cell.to_i64.to_f
57
+ }
58
+
59
+ counts
60
+ end
61
+
62
+ def get_total_word_count(klass)
63
+ @klass_word_counts.fetch(klass) {
64
+ @klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
65
+ }
66
+ end
67
+
68
+ def get_doc_count(klass)
69
+ @klass_doc_counts.fetch(klass) {
70
+ @klass_doc_counts[klass] = summary_table.get(klass, "totals:doccount").first.to_i64.to_f
71
+ }
72
+ end
73
+
74
+ def incr_word_count(klass, word, count)
75
+ freq_table.atomic_increment word, "classes:#{klass.to_s}", count
76
+ end
77
+
78
+ def incr_total_word_count(klass, count)
79
+ @klass_word_counts[klass] = summary_table.atomic_increment klass, "totals:wordcount", count
80
+ end
81
+
82
+ def incr_doc_count(klass, count)
83
+ @klass_doc_counts[klass] = summary_table.atomic_increment klass, "totals:doccount", count
84
+ end
85
+
86
+ def doc_count_total
87
+ total = 0
88
+ summary_table.create_scanner("", "totals:doccount") { |row|
89
+ total += row.columns["totals:doccount"].to_i64
90
+ }
91
+ total
92
+ end
93
+
94
+ def close
95
+ @hbase.close
96
+ end
97
+
98
+ protected
99
+ def summary_table
100
+ @stable ||= @hbase.get_table @stablename
101
+ end
102
+
103
+ def freq_table
104
+ @ftable ||= @hbase.get_table @ftablename
105
+ end
106
+
107
+ end
108
+
109
+ end