ankusa 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Tue Nov 30 12:15:06 -0500 2010</td>
59
+ <td>Thu Dec 02 07:40:55 -0500 2010</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -73,9 +73,11 @@
73
73
  <h3 class="section-bar">Required files</h3>
74
74
 
75
75
  <div class="name-list">
76
+ ankusa/extensions&nbsp;&nbsp;
76
77
  ankusa/classifier&nbsp;&nbsp;
77
78
  ankusa/hasher&nbsp;&nbsp;
78
- ankusa/nbclass&nbsp;&nbsp;
79
+ ankusa/memory_storage&nbsp;&nbsp;
80
+ ankusa/hbase_storage&nbsp;&nbsp;
79
81
  </div>
80
82
  </div>
81
83
 
@@ -22,8 +22,10 @@
22
22
  <div id="index-entries">
23
23
  <a href="classes/Ankusa.html">Ankusa</a><br />
24
24
  <a href="classes/Ankusa/Classifier.html">Ankusa::Classifier</a><br />
25
- <a href="classes/Ankusa/NBClass.html">Ankusa::NBClass</a><br />
25
+ <a href="classes/Ankusa/HBaseStorage.html">Ankusa::HBaseStorage</a><br />
26
+ <a href="classes/Ankusa/MemoryStorage.html">Ankusa::MemoryStorage</a><br />
26
27
  <a href="classes/Ankusa/TextHash.html">Ankusa::TextHash</a><br />
28
+ <a href="classes/String.html">String</a><br />
27
29
  </div>
28
30
  </div>
29
31
  </body>
@@ -23,8 +23,10 @@
23
23
  <a href="files/README_rdoc.html">README.rdoc</a><br />
24
24
  <a href="files/lib/ankusa_rb.html">lib/ankusa.rb</a><br />
25
25
  <a href="files/lib/ankusa/classifier_rb.html">lib/ankusa/classifier.rb</a><br />
26
+ <a href="files/lib/ankusa/extensions_rb.html">lib/ankusa/extensions.rb</a><br />
26
27
  <a href="files/lib/ankusa/hasher_rb.html">lib/ankusa/hasher.rb</a><br />
27
- <a href="files/lib/ankusa/nbclass_rb.html">lib/ankusa/nbclass.rb</a><br />
28
+ <a href="files/lib/ankusa/hbase_storage_rb.html">lib/ankusa/hbase_storage.rb</a><br />
29
+ <a href="files/lib/ankusa/memory_storage_rb.html">lib/ankusa/memory_storage.rb</a><br />
28
30
  <a href="files/lib/ankusa/stopwords_rb.html">lib/ankusa/stopwords.rb</a><br />
29
31
  </div>
30
32
  </div>
@@ -20,23 +20,47 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Methods</h1>
22
22
  <div id="index-entries">
23
- <a href="classes/Ankusa/TextHash.html#M000016">add_text (Ankusa::TextHash)</a><br />
24
- <a href="classes/Ankusa/TextHash.html#M000017">add_word (Ankusa::TextHash)</a><br />
25
- <a href="classes/Ankusa/Classifier.html#M000005">classifications (Ankusa::Classifier)</a><br />
26
- <a href="classes/Ankusa/Classifier.html#M000004">classify (Ankusa::Classifier)</a><br />
27
- <a href="classes/Ankusa/Classifier.html#M000009">doc_count_total (Ankusa::Classifier)</a><br />
28
- <a href="classes/Ankusa/Classifier.html#M000007">drop_tables (Ankusa::Classifier)</a><br />
29
- <a href="classes/Ankusa/Classifier.html#M000013">freq_table (Ankusa::Classifier)</a><br />
30
- <a href="classes/Ankusa/Classifier.html#M000010">get_word_probs (Ankusa::Classifier)</a><br />
31
- <a href="classes/Ankusa/Classifier.html#M000011">init_tables (Ankusa::Classifier)</a><br />
32
- <a href="classes/Ankusa/NBClass.html#M000014">new (Ankusa::NBClass)</a><br />
33
- <a href="classes/Ankusa/TextHash.html#M000015">new (Ankusa::TextHash)</a><br />
34
- <a href="classes/Ankusa/Classifier.html#M000001">new (Ankusa::Classifier)</a><br />
35
- <a href="classes/Ankusa/Classifier.html#M000006">refresh_classnames (Ankusa::Classifier)</a><br />
36
- <a href="classes/Ankusa/Classifier.html#M000008">reset (Ankusa::Classifier)</a><br />
37
- <a href="classes/Ankusa/Classifier.html#M000012">summary_table (Ankusa::Classifier)</a><br />
38
- <a href="classes/Ankusa/Classifier.html#M000002">train (Ankusa::Classifier)</a><br />
39
- <a href="classes/Ankusa/Classifier.html#M000003">untrain (Ankusa::Classifier)</a><br />
23
+ <a href="classes/Ankusa/TextHash.html#M000038">add_text (Ankusa::TextHash)</a><br />
24
+ <a href="classes/Ankusa/TextHash.html#M000039">add_word (Ankusa::TextHash)</a><br />
25
+ <a href="classes/Ankusa/TextHash.html#M000040">atomize (Ankusa::TextHash)</a><br />
26
+ <a href="classes/Ankusa/Classifier.html#M000007">classifications (Ankusa::Classifier)</a><br />
27
+ <a href="classes/Ankusa/Classifier.html#M000006">classify (Ankusa::Classifier)</a><br />
28
+ <a href="classes/Ankusa/HBaseStorage.html#M000023">classnames (Ankusa::HBaseStorage)</a><br />
29
+ <a href="classes/Ankusa/MemoryStorage.html#M000010">classnames (Ankusa::MemoryStorage)</a><br />
30
+ <a href="classes/Ankusa/HBaseStorage.html#M000034">close (Ankusa::HBaseStorage)</a><br />
31
+ <a href="classes/Ankusa/MemoryStorage.html#M000021">close (Ankusa::MemoryStorage)</a><br />
32
+ <a href="classes/Ankusa/MemoryStorage.html#M000020">doc_count_total (Ankusa::MemoryStorage)</a><br />
33
+ <a href="classes/Ankusa/HBaseStorage.html#M000033">doc_count_total (Ankusa::HBaseStorage)</a><br />
34
+ <a href="classes/Ankusa/MemoryStorage.html#M000012">drop_tables (Ankusa::MemoryStorage)</a><br />
35
+ <a href="classes/Ankusa/HBaseStorage.html#M000025">drop_tables (Ankusa::HBaseStorage)</a><br />
36
+ <a href="classes/Ankusa/HBaseStorage.html#M000036">freq_table (Ankusa::HBaseStorage)</a><br />
37
+ <a href="classes/Ankusa/MemoryStorage.html#M000016">get_doc_count (Ankusa::MemoryStorage)</a><br />
38
+ <a href="classes/Ankusa/HBaseStorage.html#M000029">get_doc_count (Ankusa::HBaseStorage)</a><br />
39
+ <a href="classes/Ankusa/HBaseStorage.html#M000028">get_total_word_count (Ankusa::HBaseStorage)</a><br />
40
+ <a href="classes/Ankusa/MemoryStorage.html#M000015">get_total_word_count (Ankusa::MemoryStorage)</a><br />
41
+ <a href="classes/Ankusa/MemoryStorage.html#M000014">get_word_counts (Ankusa::MemoryStorage)</a><br />
42
+ <a href="classes/Ankusa/HBaseStorage.html#M000027">get_word_counts (Ankusa::HBaseStorage)</a><br />
43
+ <a href="classes/Ankusa/Classifier.html#M000008">get_word_probs (Ankusa::Classifier)</a><br />
44
+ <a href="classes/Ankusa/MemoryStorage.html#M000019">incr_doc_count (Ankusa::MemoryStorage)</a><br />
45
+ <a href="classes/Ankusa/HBaseStorage.html#M000032">incr_doc_count (Ankusa::HBaseStorage)</a><br />
46
+ <a href="classes/Ankusa/HBaseStorage.html#M000031">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
47
+ <a href="classes/Ankusa/MemoryStorage.html#M000018">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
48
+ <a href="classes/Ankusa/HBaseStorage.html#M000030">incr_word_count (Ankusa::HBaseStorage)</a><br />
49
+ <a href="classes/Ankusa/MemoryStorage.html#M000017">incr_word_count (Ankusa::MemoryStorage)</a><br />
50
+ <a href="classes/Ankusa/HBaseStorage.html#M000026">init_tables (Ankusa::HBaseStorage)</a><br />
51
+ <a href="classes/Ankusa/MemoryStorage.html#M000013">init_tables (Ankusa::MemoryStorage)</a><br />
52
+ <a href="classes/Ankusa/TextHash.html#M000037">new (Ankusa::TextHash)</a><br />
53
+ <a href="classes/Ankusa/Classifier.html#M000003">new (Ankusa::Classifier)</a><br />
54
+ <a href="classes/Ankusa/MemoryStorage.html#M000009">new (Ankusa::MemoryStorage)</a><br />
55
+ <a href="classes/Ankusa/HBaseStorage.html#M000022">new (Ankusa::HBaseStorage)</a><br />
56
+ <a href="classes/String.html#M000001">numeric? (String)</a><br />
57
+ <a href="classes/Ankusa/MemoryStorage.html#M000011">reset (Ankusa::MemoryStorage)</a><br />
58
+ <a href="classes/Ankusa/HBaseStorage.html#M000024">reset (Ankusa::HBaseStorage)</a><br />
59
+ <a href="classes/Ankusa/HBaseStorage.html#M000035">summary_table (Ankusa::HBaseStorage)</a><br />
60
+ <a href="classes/String.html#M000002">to_ascii (String)</a><br />
61
+ <a href="classes/Ankusa/Classifier.html#M000004">train (Ankusa::Classifier)</a><br />
62
+ <a href="classes/Ankusa/Classifier.html#M000005">untrain (Ankusa::Classifier)</a><br />
63
+ <a href="classes/Ankusa/TextHash.html#M000041">valid_word? (Ankusa::TextHash)</a><br />
40
64
  </div>
41
65
  </div>
42
66
  </body>
@@ -1,3 +1,5 @@
1
+ require 'ankusa/extensions'
1
2
  require 'ankusa/classifier'
2
3
  require 'ankusa/hasher'
3
- require 'ankusa/nbclass'
4
+ require 'ankusa/memory_storage'
5
+ require 'ankusa/hbase_storage'
@@ -1,125 +1,76 @@
1
1
  module Ankusa
2
- SMALL_PROB = 0.0001
3
2
 
4
3
  class Classifier
5
4
  attr_reader :classnames
6
5
 
7
- def initialize(hbase_client, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
8
- @hbase = hbase_client
9
- @ftablename = frequency_tablename
10
- @stablename = summary_tablename
11
- init_tables
12
- @classnames = refresh_classnames
6
+ def initialize(storage)
7
+ @storage = storage
8
+ @storage.init_tables
9
+ @classnames = @storage.classnames
13
10
  end
14
-
11
+
12
+ # text can be either an array of strings or a string
13
+ # klass is a symbol
15
14
  def train(klass, text)
16
15
  th = TextHash.new(text)
17
16
  th.each { |word, count|
18
- freq_table.atomic_increment word, "classes:#{klass.to_s}", count
17
+ @storage.incr_word_count klass, word, count
18
+ yield word, count if block_given?
19
19
  }
20
- summary_table.atomic_increment klass, "totals:wordcount", th.word_count
21
- summary_table.atomic_increment klass, "totals:doccount"
20
+ @storage.incr_total_word_count klass, th.word_count
21
+ doccount = (text.kind_of? Array) ? text.length : 1
22
+ @storage.incr_doc_count klass, doccount
22
23
  @classnames << klass if not @classnames.include? klass
23
24
  end
24
25
 
26
+ # text can be either an array of strings or a string
27
+ # klass is a symbol
25
28
  def untrain(klass, text)
26
29
  th = TextHash.new(text)
27
30
  th.each { |word, count|
28
- freq_table.atomic_increment word, "classes:#{klass.to_s}", -count
31
+ @storage.incr_word_count klass, word, -count
32
+ yield word, count if block_given?
29
33
  }
30
- summary_table.atomic_increment klass, "totals:wordcount", -th.word_count
31
- summary_table.atomic_increment klass, "totals:doccount", -1
34
+ @storage.incr_total_word_count klass, -th.word_count
35
+ doccount = (text.kind_of? Array) ? text.length : 1
36
+ @storage.incr_doc_count klass, -doccount
32
37
  end
33
38
 
34
39
  def classify(text)
35
40
  # return the most probable class
36
- classifications(text).sort { |o,t| o[1] <=> t[1] }.first.first
41
+ classifications(text).sort_by { |c| -c[1] }.first.first
37
42
  end
38
43
 
39
44
  def classifications(text)
40
- classes = {}
41
- result = {}
42
- @classnames.each { |k|
43
- classes[k] = NBClass.new k, summary_table, freq_table
44
- result[k] = 0
45
- }
45
+ result = Hash.new 0
46
46
 
47
- TextHash.new(text).each { |word,count|
48
- probs = get_word_probs(word, classes)
49
- @classnames.each { |k| result[k] += Math.log(probs[k]) }
47
+ TextHash.new(text).each { |word, count|
48
+ probs = get_word_probs(word)
49
+ @classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
50
50
  }
51
-
52
- @classnames.each { |k| result[k] += Math.log(classes[k].doc_count / doc_count_total) }
53
51
 
54
- result.keys.each { |k| result[k] = Math.exp(result[k]) }
55
- sum = result.values.inject { |x,y| x+y }
56
- result.keys.each { |klass|
57
- result[klass] = result[klass] / sum
52
+ # add the prior and exponentiate
53
+ @classnames.each { |k|
54
+ result[k] += Math.log(@storage.get_doc_count(k).to_f / @storage.doc_count_total.to_f)
55
+ result[k] = Math.exp(result[k])
58
56
  }
59
-
57
+
58
+ # normalize to get probs
59
+ sum = result.values.inject { |x,y| x+y }
60
+ @classnames.each { |k| result[k] = result[k] / sum }
60
61
  result
61
62
  end
62
63
 
63
- # get all classes
64
- def refresh_classnames
65
- cs = []
66
- summary_table.create_scanner("", "totals") { |row|
67
- cs << row.row.intern
68
- }
69
- cs
70
- end
71
-
72
- def drop_tables
73
- freq_table.delete
74
- summary_table.delete
75
- @stable = nil
76
- @ftable = nil
77
- end
78
-
79
- def reset
80
- drop_tables
81
- init_tables
82
- end
83
-
84
- def doc_count_total
85
- total = 0
86
- summary_table.create_scanner("", "totals:doccount") { |row|
87
- total += row.columns["totals:doccount"].to_i64
88
- }
89
- total
90
- end
91
-
92
64
  protected
93
- def get_word_probs(word, classes)
94
- probs = {}
95
- @classnames.each { |cn| probs[cn] = Ankusa::SMALL_PROB / classes[cn].word_count }
96
- row = freq_table.get_row(word)
97
- return probs if row.length == 0
98
-
99
- row.first.columns.each { |colname, cell|
100
- classname = colname.split(':')[1].intern
101
- probs[classname] = cell.to_i64.to_f / classes[classname].word_count
65
+ def get_word_probs(word)
66
+ probs = @storage.get_word_counts(word)
67
+ @classnames.each { |cn|
68
+ # use a laplacian smoother
69
+ probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + 1).to_f
102
70
  }
103
71
  probs
104
72
  end
105
73
 
106
- def init_tables
107
- if not @hbase.has_table? @ftablename
108
- @hbase.create_table @ftablename, "classes", "total"
109
- end
110
-
111
- if not @hbase.has_table? @stablename
112
- @hbase.create_table @stablename, "totals"
113
- end
114
- end
115
-
116
- def summary_table
117
- @stable ||= @hbase.get_table @stablename
118
- end
119
-
120
- def freq_table
121
- @ftable ||= @hbase.get_table @ftablename
122
- end
123
74
  end
124
75
 
125
76
  end
@@ -0,0 +1,13 @@
1
+ require 'iconv'
2
+
3
+ class String
4
+ def numeric?
5
+ true if Float(self) rescue false
6
+ end
7
+
8
+ def to_ascii
9
+ # from http://www.jroller.com/obie/tags/unicode
10
+ converter = Iconv.new('ASCII//IGNORE//TRANSLIT', 'UTF-8')
11
+ converter.iconv(self).unpack('U*').select { |cp| cp < 127 }.pack('U*')
12
+ end
13
+ end
@@ -13,21 +13,35 @@ module Ankusa
13
13
  end
14
14
 
15
15
  def add_text(text)
16
- # replace dashes with spaces, then get rid of non-word/non-space characters,
17
- # then split by space to get words
18
- words = text.tr('-', ' ').gsub(/[^\w\s]/,"").split
19
- words.each { |word| add_word word }
16
+ if text.kind_of? Array
17
+ text.each { |t| add_text t }
18
+ else
19
+ # replace dashes with spaces, then get rid of non-word/non-space characters,
20
+ # then split by space to get words
21
+ words = TextHash.atomize text
22
+ words.each { |word| add_word(word) if TextHash.valid_word?(word) }
23
+ end
20
24
  self
21
25
  end
22
26
 
23
27
  def add_word(word)
24
- word = word.downcase
25
- if not Ankusa::STOPWORDS.include? word
26
- @word_count += 1
27
- key = word.stem.intern
28
- store key, fetch(key, 0)+1
29
- end
28
+ @word_count += 1
29
+ key = word.stem.intern
30
+ store key, fetch(key, 0)+1
31
+ end
32
+
33
+ def self.atomize(text)
34
+ text.to_ascii.tr('-', ' ').gsub(/[^\w\s]/," ").split.map { |w| w.downcase }
30
35
  end
36
+
37
+ # word should be only alphanum chars at this point
38
+ def self.valid_word?(word)
39
+ return false if Ankusa::STOPWORDS.include? word
40
+ return false if word.length < 3
41
+ return false if word.numeric?
42
+ true
43
+ end
44
+
31
45
  end
32
46
 
33
47
  end
@@ -0,0 +1,109 @@
1
+ require 'hbaserb'
2
+
3
+ module Ankusa
4
+
5
+ class HBaseStorage
6
+ attr_reader :hbase
7
+
8
+ def initialize(host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
9
+ @hbase = HBaseRb::Client.new host, port
10
+ @ftablename = frequency_tablename
11
+ @stablename = summary_tablename
12
+ @klass_word_counts = {}
13
+ @klass_doc_counts = {}
14
+ init_tables
15
+ end
16
+
17
+ def classnames
18
+ cs = []
19
+ summary_table.create_scanner("", "totals") { |row|
20
+ cs << row.row.intern
21
+ }
22
+ cs
23
+ end
24
+
25
+ def reset
26
+ drop_tables
27
+ init_tables
28
+ end
29
+
30
+ def drop_tables
31
+ freq_table.delete
32
+ summary_table.delete
33
+ @stable = nil
34
+ @ftable = nil
35
+ @klass_word_counts = {}
36
+ @klass_doc_counts = {}
37
+ end
38
+
39
+ def init_tables
40
+ if not @hbase.has_table? @ftablename
41
+ @hbase.create_table @ftablename, "classes", "total"
42
+ end
43
+
44
+ if not @hbase.has_table? @stablename
45
+ @hbase.create_table @stablename, "totals"
46
+ end
47
+ end
48
+
49
+ def get_word_counts(word)
50
+ counts = Hash.new(0)
51
+ row = freq_table.get_row(word)
52
+ return counts if row.length == 0
53
+
54
+ row.first.columns.each { |colname, cell|
55
+ classname = colname.split(':')[1].intern
56
+ counts[classname] = cell.to_i64.to_f
57
+ }
58
+
59
+ counts
60
+ end
61
+
62
+ def get_total_word_count(klass)
63
+ @klass_word_counts.fetch(klass) {
64
+ @klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
65
+ }
66
+ end
67
+
68
+ def get_doc_count(klass)
69
+ @klass_doc_counts.fetch(klass) {
70
+ @klass_doc_counts[klass] = summary_table.get(klass, "totals:doccount").first.to_i64.to_f
71
+ }
72
+ end
73
+
74
+ def incr_word_count(klass, word, count)
75
+ freq_table.atomic_increment word, "classes:#{klass.to_s}", count
76
+ end
77
+
78
+ def incr_total_word_count(klass, count)
79
+ @klass_word_counts[klass] = summary_table.atomic_increment klass, "totals:wordcount", count
80
+ end
81
+
82
+ def incr_doc_count(klass, count)
83
+ @klass_doc_counts[klass] = summary_table.atomic_increment klass, "totals:doccount", count
84
+ end
85
+
86
+ def doc_count_total
87
+ total = 0
88
+ summary_table.create_scanner("", "totals:doccount") { |row|
89
+ total += row.columns["totals:doccount"].to_i64
90
+ }
91
+ total
92
+ end
93
+
94
+ def close
95
+ @hbase.close
96
+ end
97
+
98
+ protected
99
+ def summary_table
100
+ @stable ||= @hbase.get_table @stablename
101
+ end
102
+
103
+ def freq_table
104
+ @ftable ||= @hbase.get_table @ftablename
105
+ end
106
+
107
+ end
108
+
109
+ end