ankusa 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +10 -1
- data/Rakefile +4 -4
- data/docs/classes/Ankusa/Classifier.html +125 -32
- data/docs/classes/Ankusa/HBaseStorage.html +165 -108
- data/docs/classes/Ankusa/MemoryStorage.html +117 -89
- data/docs/classes/Ankusa/TextHash.html +30 -30
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +16 -3
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/hbase_storage_rb.html +1 -1
- data/docs/files/lib/ankusa/memory_storage_rb.html +1 -1
- data/docs/fr_method_index.html +40 -34
- data/lib/ankusa/classifier.rb +44 -15
- data/lib/ankusa/hbase_storage.rb +25 -8
- data/lib/ankusa/memory_storage.rb +10 -2
- metadata +6 -6
data/docs/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Mon, 06 Dec 2010 15:40:49 -0500
|
data/docs/files/README_rdoc.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Mon Dec 06 15:30:41 -0500 2010</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -71,10 +71,16 @@
|
|
71
71
|
<div id="description">
|
72
72
|
<h1>ankusa</h1>
|
73
73
|
<p>
|
74
|
-
<a href="../classes/Ankusa.html">Ankusa</a> is a
|
75
|
-
|
74
|
+
<a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
|
75
|
+
that uses Hadoop's HBase for storage. Because it uses HBase as a
|
76
76
|
backend, the training corpus can be many terabytes in size.
|
77
77
|
</p>
|
78
|
+
<p>
|
79
|
+
<a href="../classes/Ankusa.html">Ankusa</a> currently uses a Naive Bayes
|
80
|
+
classifier. It ignores common words (a.k.a. stop words) and stems all
|
81
|
+
others. Additionally, it uses Laplacian smoothing in the classification
|
82
|
+
method.
|
83
|
+
</p>
|
78
84
|
<h2>Installation</h2>
|
79
85
|
<p>
|
80
86
|
First, install HBase / Hadoop. Make sure the HBase Thrift interface has
|
@@ -92,6 +98,8 @@ been started as well. Then:
|
|
92
98
|
storage = Ankusa::HBaseStorage.new 'localhost'
|
93
99
|
c = Ankusa::Classifier.new storage
|
94
100
|
|
101
|
+
# Each of these calls will return a bag-of-words
|
102
|
+
# hash with stemmed words as keys and counts as values
|
95
103
|
c.train :spam, "This is some spammy text"
|
96
104
|
c.train :good, "This is not the bad stuff"
|
97
105
|
|
@@ -102,6 +110,11 @@ been started as well. Then:
|
|
102
110
|
# membership probability as values
|
103
111
|
puts c.classifications "This is some spammy text"
|
104
112
|
|
113
|
+
# If you have a large corpus, the probabilities will
|
114
|
+
# likely all be 0. In that case, you must use log
|
115
|
+
# likelihood values
|
116
|
+
puts c.log_likelihoods "This is some spammy text"
|
117
|
+
|
105
118
|
# get a list of all classes
|
106
119
|
puts c.classes
|
107
120
|
|
data/docs/fr_method_index.html
CHANGED
@@ -20,47 +20,53 @@
|
|
20
20
|
<div id="index">
|
21
21
|
<h1 class="section-bar">Methods</h1>
|
22
22
|
<div id="index-entries">
|
23
|
-
<a href="classes/Ankusa/TextHash.html#
|
24
|
-
<a href="classes/Ankusa/TextHash.html#
|
25
|
-
<a href="classes/Ankusa/TextHash.html#
|
23
|
+
<a href="classes/Ankusa/TextHash.html#M000044">add_text (Ankusa::TextHash)</a><br />
|
24
|
+
<a href="classes/Ankusa/TextHash.html#M000045">add_word (Ankusa::TextHash)</a><br />
|
25
|
+
<a href="classes/Ankusa/TextHash.html#M000046">atomize (Ankusa::TextHash)</a><br />
|
26
26
|
<a href="classes/Ankusa/Classifier.html#M000007">classifications (Ankusa::Classifier)</a><br />
|
27
27
|
<a href="classes/Ankusa/Classifier.html#M000006">classify (Ankusa::Classifier)</a><br />
|
28
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
29
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
30
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
31
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
32
|
-
<a href="classes/Ankusa/
|
33
|
-
<a href="classes/Ankusa/
|
34
|
-
<a href="classes/Ankusa/
|
35
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
36
|
-
<a href="classes/Ankusa/
|
37
|
-
<a href="classes/Ankusa/
|
38
|
-
<a href="classes/Ankusa/
|
39
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
40
|
-
<a href="classes/Ankusa/
|
41
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
42
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
43
|
-
<a href="classes/Ankusa/
|
44
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
45
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
46
|
-
<a href="classes/Ankusa/
|
47
|
-
<a href="classes/Ankusa/
|
48
|
-
<a href="classes/Ankusa/
|
49
|
-
<a href="classes/Ankusa/
|
50
|
-
<a href="classes/Ankusa/
|
51
|
-
<a href="classes/Ankusa/
|
52
|
-
<a href="classes/Ankusa/
|
28
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000027">classnames (Ankusa::HBaseStorage)</a><br />
|
29
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000013">classnames (Ankusa::MemoryStorage)</a><br />
|
30
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000039">close (Ankusa::HBaseStorage)</a><br />
|
31
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000025">close (Ankusa::MemoryStorage)</a><br />
|
32
|
+
<a href="classes/Ankusa/Classifier.html#M000010">doc_count_totals (Ankusa::Classifier)</a><br />
|
33
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000024">doc_count_totals (Ankusa::MemoryStorage)</a><br />
|
34
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000038">doc_count_totals (Ankusa::HBaseStorage)</a><br />
|
35
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000029">drop_tables (Ankusa::HBaseStorage)</a><br />
|
36
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000015">drop_tables (Ankusa::MemoryStorage)</a><br />
|
37
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000042">freq_table (Ankusa::HBaseStorage)</a><br />
|
38
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000020">get_doc_count (Ankusa::MemoryStorage)</a><br />
|
39
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000034">get_doc_count (Ankusa::HBaseStorage)</a><br />
|
40
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000040">get_summary (Ankusa::HBaseStorage)</a><br />
|
41
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000019">get_total_word_count (Ankusa::MemoryStorage)</a><br />
|
42
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000033">get_total_word_count (Ankusa::HBaseStorage)</a><br />
|
43
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000032">get_vocabulary_sizes (Ankusa::HBaseStorage)</a><br />
|
44
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000017">get_vocabulary_sizes (Ankusa::MemoryStorage)</a><br />
|
45
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000031">get_word_counts (Ankusa::HBaseStorage)</a><br />
|
46
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000018">get_word_counts (Ankusa::MemoryStorage)</a><br />
|
47
|
+
<a href="classes/Ankusa/Classifier.html#M000009">get_word_probs (Ankusa::Classifier)</a><br />
|
48
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000023">incr_doc_count (Ankusa::MemoryStorage)</a><br />
|
49
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000037">incr_doc_count (Ankusa::HBaseStorage)</a><br />
|
50
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000022">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
|
51
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000036">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
|
52
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000021">incr_word_count (Ankusa::MemoryStorage)</a><br />
|
53
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000035">incr_word_count (Ankusa::HBaseStorage)</a><br />
|
54
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000016">init_tables (Ankusa::MemoryStorage)</a><br />
|
55
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000030">init_tables (Ankusa::HBaseStorage)</a><br />
|
56
|
+
<a href="classes/Ankusa/Classifier.html#M000008">log_likelihoods (Ankusa::Classifier)</a><br />
|
57
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000026">new (Ankusa::HBaseStorage)</a><br />
|
53
58
|
<a href="classes/Ankusa/Classifier.html#M000003">new (Ankusa::Classifier)</a><br />
|
54
|
-
<a href="classes/Ankusa/
|
55
|
-
<a href="classes/Ankusa/
|
59
|
+
<a href="classes/Ankusa/TextHash.html#M000043">new (Ankusa::TextHash)</a><br />
|
60
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000012">new (Ankusa::MemoryStorage)</a><br />
|
56
61
|
<a href="classes/String.html#M000001">numeric? (String)</a><br />
|
57
|
-
<a href="classes/Ankusa/
|
58
|
-
<a href="classes/Ankusa/
|
59
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
62
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000028">reset (Ankusa::HBaseStorage)</a><br />
|
63
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000014">reset (Ankusa::MemoryStorage)</a><br />
|
64
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000041">summary_table (Ankusa::HBaseStorage)</a><br />
|
60
65
|
<a href="classes/String.html#M000002">to_ascii (String)</a><br />
|
61
66
|
<a href="classes/Ankusa/Classifier.html#M000004">train (Ankusa::Classifier)</a><br />
|
62
67
|
<a href="classes/Ankusa/Classifier.html#M000005">untrain (Ankusa::Classifier)</a><br />
|
63
|
-
<a href="classes/Ankusa/TextHash.html#
|
68
|
+
<a href="classes/Ankusa/TextHash.html#M000047">valid_word? (Ankusa::TextHash)</a><br />
|
69
|
+
<a href="classes/Ankusa/Classifier.html#M000011">vocab_sizes (Ankusa::Classifier)</a><br />
|
64
70
|
</div>
|
65
71
|
</div>
|
66
72
|
</body>
|
data/lib/ankusa/classifier.rb
CHANGED
@@ -21,6 +21,9 @@ module Ankusa
|
|
21
21
|
doccount = (text.kind_of? Array) ? text.length : 1
|
22
22
|
@storage.incr_doc_count klass, doccount
|
23
23
|
@classnames << klass if not @classnames.include? klass
|
24
|
+
# cache is now dirty of these vars
|
25
|
+
@doc_count_totals = nil
|
26
|
+
@vocab_sizes = nil
|
24
27
|
th
|
25
28
|
end
|
26
29
|
|
@@ -35,44 +38,70 @@ module Ankusa
|
|
35
38
|
@storage.incr_total_word_count klass, -th.word_count
|
36
39
|
doccount = (text.kind_of? Array) ? text.length : 1
|
37
40
|
@storage.incr_doc_count klass, -doccount
|
41
|
+
# cache is now dirty of these vars
|
42
|
+
@doc_count_totals = nil
|
43
|
+
@vocab_sizes = nil
|
38
44
|
th
|
39
45
|
end
|
40
46
|
|
41
|
-
def classify(text)
|
47
|
+
def classify(text, classes=nil)
|
42
48
|
# return the most probable class
|
43
|
-
|
49
|
+
log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
|
44
50
|
end
|
45
51
|
|
46
|
-
|
52
|
+
# classnames is an optional array of classes to look at
|
53
|
+
def classifications(text, classnames=nil)
|
54
|
+
result = log_likelihoods text, classnames
|
55
|
+
result.keys.each { |k|
|
56
|
+
result[k] = Math.exp result[k]
|
57
|
+
}
|
58
|
+
|
59
|
+
# normalize to get probs
|
60
|
+
sum = result.values.inject { |x,y| x+y }
|
61
|
+
result.keys.each { |k| result[k] = result[k] / sum }
|
62
|
+
result
|
63
|
+
end
|
64
|
+
|
65
|
+
# classnames is an optional array of classes to look at (defaults to @classnames)
|
66
|
+
def log_likelihoods(text, classnames=nil)
|
67
|
+
classnames ||= @classnames
|
47
68
|
result = Hash.new 0
|
48
69
|
|
49
70
|
TextHash.new(text).each { |word, count|
|
50
|
-
probs = get_word_probs(word)
|
51
|
-
|
71
|
+
probs = get_word_probs(word, classnames)
|
72
|
+
classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
|
52
73
|
}
|
53
74
|
|
54
75
|
# add the prior and exponentiate
|
55
|
-
|
56
|
-
|
57
|
-
|
76
|
+
doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
|
77
|
+
doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
|
78
|
+
classnames.each { |k|
|
79
|
+
result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
|
58
80
|
}
|
59
81
|
|
60
|
-
# normalize to get probs
|
61
|
-
sum = result.values.inject { |x,y| x+y }
|
62
|
-
@classnames.each { |k| result[k] = result[k] / sum }
|
63
82
|
result
|
64
83
|
end
|
65
84
|
|
66
85
|
protected
|
67
|
-
def get_word_probs(word)
|
68
|
-
probs =
|
69
|
-
@
|
86
|
+
def get_word_probs(word, classnames)
|
87
|
+
probs = Hash.new 0
|
88
|
+
@storage.get_word_counts(word).each { |k,v| probs[k] = v if classnames.include? k }
|
89
|
+
vs = vocab_sizes
|
90
|
+
classnames.each { |cn|
|
70
91
|
# use a laplacian smoother
|
71
|
-
probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) +
|
92
|
+
probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + vs[cn]).to_f
|
72
93
|
}
|
73
94
|
probs
|
74
95
|
end
|
75
96
|
|
97
|
+
def doc_count_totals
|
98
|
+
@doc_count_totals ||= @storage.doc_count_totals
|
99
|
+
end
|
100
|
+
|
101
|
+
def vocab_sizes
|
102
|
+
@vocab_sizes ||= @storage.get_vocabulary_sizes
|
103
|
+
end
|
104
|
+
|
76
105
|
end
|
77
106
|
|
78
107
|
end
|
data/lib/ankusa/hbase_storage.rb
CHANGED
@@ -53,12 +53,17 @@ module Ankusa
|
|
53
53
|
|
54
54
|
row.first.columns.each { |colname, cell|
|
55
55
|
classname = colname.split(':')[1].intern
|
56
|
-
|
56
|
+
# in case untrain has been called too many times
|
57
|
+
counts[classname] = [cell.to_i64.to_f, 0].max
|
57
58
|
}
|
58
59
|
|
59
60
|
counts
|
60
61
|
end
|
61
62
|
|
63
|
+
def get_vocabulary_sizes
|
64
|
+
get_summary "totals:vocabsize"
|
65
|
+
end
|
66
|
+
|
62
67
|
def get_total_word_count(klass)
|
63
68
|
@klass_word_counts.fetch(klass) {
|
64
69
|
@klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
|
@@ -72,7 +77,15 @@ module Ankusa
|
|
72
77
|
end
|
73
78
|
|
74
79
|
def incr_word_count(klass, word, count)
|
75
|
-
freq_table.atomic_increment word, "classes:#{klass.to_s}", count
|
80
|
+
size = freq_table.atomic_increment word, "classes:#{klass.to_s}", count
|
81
|
+
# if this is a new word, increase the klass's vocab size. If the new word
|
82
|
+
# count is 0, then we need to decrement our vocab size
|
83
|
+
if size == count
|
84
|
+
summary_table.atomic_increment klass, "totals:vocabsize"
|
85
|
+
elsif size == 0
|
86
|
+
summary_table.atomic_increment klass, "totals:vocabsize", -1
|
87
|
+
end
|
88
|
+
size
|
76
89
|
end
|
77
90
|
|
78
91
|
def incr_total_word_count(klass, count)
|
@@ -83,12 +96,8 @@ module Ankusa
|
|
83
96
|
@klass_doc_counts[klass] = summary_table.atomic_increment klass, "totals:doccount", count
|
84
97
|
end
|
85
98
|
|
86
|
-
def
|
87
|
-
|
88
|
-
summary_table.create_scanner("", "totals:doccount") { |row|
|
89
|
-
total += row.columns["totals:doccount"].to_i64
|
90
|
-
}
|
91
|
-
total
|
99
|
+
def doc_count_totals
|
100
|
+
get_summary "totals:doccount"
|
92
101
|
end
|
93
102
|
|
94
103
|
def close
|
@@ -96,6 +105,14 @@ module Ankusa
|
|
96
105
|
end
|
97
106
|
|
98
107
|
protected
|
108
|
+
def get_summary(name)
|
109
|
+
counts = Hash.new 0
|
110
|
+
summary_table.create_scanner("", name) { |row|
|
111
|
+
counts[row.row.intern] = row.columns[name].to_i64
|
112
|
+
}
|
113
|
+
counts
|
114
|
+
end
|
115
|
+
|
99
116
|
def summary_table
|
100
117
|
@stable ||= @hbase.get_table @stablename
|
101
118
|
end
|
@@ -24,6 +24,14 @@ module Ankusa
|
|
24
24
|
@klass_doc_counts = {}
|
25
25
|
end
|
26
26
|
|
27
|
+
def get_vocabulary_sizes
|
28
|
+
count = Hash.new 0
|
29
|
+
@freqs.each { |w, ks|
|
30
|
+
ks.keys.each { |k| count[k] += 1 }
|
31
|
+
}
|
32
|
+
count
|
33
|
+
end
|
34
|
+
|
27
35
|
def get_word_counts(word)
|
28
36
|
@freqs.fetch word, Hash.new(0)
|
29
37
|
end
|
@@ -49,8 +57,8 @@ module Ankusa
|
|
49
57
|
@total_doc_counts[klass] += count
|
50
58
|
end
|
51
59
|
|
52
|
-
def
|
53
|
-
@total_doc_counts
|
60
|
+
def doc_count_totals
|
61
|
+
@total_doc_counts
|
54
62
|
end
|
55
63
|
|
56
64
|
def close
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ankusa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 6
|
10
|
+
version: 0.0.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Muller
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-06 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -50,7 +50,7 @@ dependencies:
|
|
50
50
|
version: 1.0.0
|
51
51
|
type: :runtime
|
52
52
|
version_requirements: *id002
|
53
|
-
description:
|
53
|
+
description: Text classifier with HBase storage
|
54
54
|
email: brian.muller@livingsocial.com
|
55
55
|
executables: []
|
56
56
|
|
@@ -122,6 +122,6 @@ rubyforge_project:
|
|
122
122
|
rubygems_version: 1.3.7
|
123
123
|
signing_key:
|
124
124
|
specification_version: 3
|
125
|
-
summary:
|
125
|
+
summary: Text classifier in Ruby that uses Hadoop's HBase for storage
|
126
126
|
test_files: []
|
127
127
|
|