ankusa 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +10 -1
- data/Rakefile +4 -4
- data/docs/classes/Ankusa/Classifier.html +125 -32
- data/docs/classes/Ankusa/HBaseStorage.html +165 -108
- data/docs/classes/Ankusa/MemoryStorage.html +117 -89
- data/docs/classes/Ankusa/TextHash.html +30 -30
- data/docs/created.rid +1 -1
- data/docs/files/README_rdoc.html +16 -3
- data/docs/files/lib/ankusa/classifier_rb.html +1 -1
- data/docs/files/lib/ankusa/hbase_storage_rb.html +1 -1
- data/docs/files/lib/ankusa/memory_storage_rb.html +1 -1
- data/docs/fr_method_index.html +40 -34
- data/lib/ankusa/classifier.rb +44 -15
- data/lib/ankusa/hbase_storage.rb +25 -8
- data/lib/ankusa/memory_storage.rb +10 -2
- metadata +6 -6
data/docs/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Mon, 06 Dec 2010 15:40:49 -0500
|
data/docs/files/README_rdoc.html
CHANGED
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Mon Dec 06 15:30:41 -0500 2010</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -71,10 +71,16 @@
|
|
71
71
|
<div id="description">
|
72
72
|
<h1>ankusa</h1>
|
73
73
|
<p>
|
74
|
-
<a href="../classes/Ankusa.html">Ankusa</a> is a
|
75
|
-
|
74
|
+
<a href="../classes/Ankusa.html">Ankusa</a> is a text classifier in Ruby
|
75
|
+
that uses Hadoop's HBase for storage. Because it uses HBase as a
|
76
76
|
backend, the training corpus can be many terabytes in size.
|
77
77
|
</p>
|
78
|
+
<p>
|
79
|
+
<a href="../classes/Ankusa.html">Ankusa</a> currently uses a Naive Bayes
|
80
|
+
classifier. It ignores common words (a.k.a. stop words) and stems all
|
81
|
+
others. Additionally, it uses Laplacian smoothing in the classification
|
82
|
+
method.
|
83
|
+
</p>
|
78
84
|
<h2>Installation</h2>
|
79
85
|
<p>
|
80
86
|
First, install HBase / Hadoop. Make sure the HBase Thrift interface has
|
@@ -92,6 +98,8 @@ been started as well. Then:
|
|
92
98
|
storage = Ankusa::HBaseStorage.new 'localhost'
|
93
99
|
c = Ankusa::Classifier.new storage
|
94
100
|
|
101
|
+
# Each of these calls will return a bag-of-words
|
102
|
+
# hash with stemmed words as keys and counts as values
|
95
103
|
c.train :spam, "This is some spammy text"
|
96
104
|
c.train :good, "This is not the bad stuff"
|
97
105
|
|
@@ -102,6 +110,11 @@ been started as well. Then:
|
|
102
110
|
# membership probability as values
|
103
111
|
puts c.classifications "This is some spammy text"
|
104
112
|
|
113
|
+
# If you have a large corpus, the probabilities will
|
114
|
+
# likely all be 0. In that case, you must use log
|
115
|
+
# likelihood values
|
116
|
+
puts c.log_likelihoods "This is some spammy text"
|
117
|
+
|
105
118
|
# get a list of all classes
|
106
119
|
puts c.classes
|
107
120
|
|
data/docs/fr_method_index.html
CHANGED
@@ -20,47 +20,53 @@
|
|
20
20
|
<div id="index">
|
21
21
|
<h1 class="section-bar">Methods</h1>
|
22
22
|
<div id="index-entries">
|
23
|
-
<a href="classes/Ankusa/TextHash.html#
|
24
|
-
<a href="classes/Ankusa/TextHash.html#
|
25
|
-
<a href="classes/Ankusa/TextHash.html#
|
23
|
+
<a href="classes/Ankusa/TextHash.html#M000044">add_text (Ankusa::TextHash)</a><br />
|
24
|
+
<a href="classes/Ankusa/TextHash.html#M000045">add_word (Ankusa::TextHash)</a><br />
|
25
|
+
<a href="classes/Ankusa/TextHash.html#M000046">atomize (Ankusa::TextHash)</a><br />
|
26
26
|
<a href="classes/Ankusa/Classifier.html#M000007">classifications (Ankusa::Classifier)</a><br />
|
27
27
|
<a href="classes/Ankusa/Classifier.html#M000006">classify (Ankusa::Classifier)</a><br />
|
28
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
29
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
30
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
31
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
32
|
-
<a href="classes/Ankusa/
|
33
|
-
<a href="classes/Ankusa/
|
34
|
-
<a href="classes/Ankusa/
|
35
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
36
|
-
<a href="classes/Ankusa/
|
37
|
-
<a href="classes/Ankusa/
|
38
|
-
<a href="classes/Ankusa/
|
39
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
40
|
-
<a href="classes/Ankusa/
|
41
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
42
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
43
|
-
<a href="classes/Ankusa/
|
44
|
-
<a href="classes/Ankusa/MemoryStorage.html#
|
45
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
46
|
-
<a href="classes/Ankusa/
|
47
|
-
<a href="classes/Ankusa/
|
48
|
-
<a href="classes/Ankusa/
|
49
|
-
<a href="classes/Ankusa/
|
50
|
-
<a href="classes/Ankusa/
|
51
|
-
<a href="classes/Ankusa/
|
52
|
-
<a href="classes/Ankusa/
|
28
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000027">classnames (Ankusa::HBaseStorage)</a><br />
|
29
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000013">classnames (Ankusa::MemoryStorage)</a><br />
|
30
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000039">close (Ankusa::HBaseStorage)</a><br />
|
31
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000025">close (Ankusa::MemoryStorage)</a><br />
|
32
|
+
<a href="classes/Ankusa/Classifier.html#M000010">doc_count_totals (Ankusa::Classifier)</a><br />
|
33
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000024">doc_count_totals (Ankusa::MemoryStorage)</a><br />
|
34
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000038">doc_count_totals (Ankusa::HBaseStorage)</a><br />
|
35
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000029">drop_tables (Ankusa::HBaseStorage)</a><br />
|
36
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000015">drop_tables (Ankusa::MemoryStorage)</a><br />
|
37
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000042">freq_table (Ankusa::HBaseStorage)</a><br />
|
38
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000020">get_doc_count (Ankusa::MemoryStorage)</a><br />
|
39
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000034">get_doc_count (Ankusa::HBaseStorage)</a><br />
|
40
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000040">get_summary (Ankusa::HBaseStorage)</a><br />
|
41
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000019">get_total_word_count (Ankusa::MemoryStorage)</a><br />
|
42
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000033">get_total_word_count (Ankusa::HBaseStorage)</a><br />
|
43
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000032">get_vocabulary_sizes (Ankusa::HBaseStorage)</a><br />
|
44
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000017">get_vocabulary_sizes (Ankusa::MemoryStorage)</a><br />
|
45
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000031">get_word_counts (Ankusa::HBaseStorage)</a><br />
|
46
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000018">get_word_counts (Ankusa::MemoryStorage)</a><br />
|
47
|
+
<a href="classes/Ankusa/Classifier.html#M000009">get_word_probs (Ankusa::Classifier)</a><br />
|
48
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000023">incr_doc_count (Ankusa::MemoryStorage)</a><br />
|
49
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000037">incr_doc_count (Ankusa::HBaseStorage)</a><br />
|
50
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000022">incr_total_word_count (Ankusa::MemoryStorage)</a><br />
|
51
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000036">incr_total_word_count (Ankusa::HBaseStorage)</a><br />
|
52
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000021">incr_word_count (Ankusa::MemoryStorage)</a><br />
|
53
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000035">incr_word_count (Ankusa::HBaseStorage)</a><br />
|
54
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000016">init_tables (Ankusa::MemoryStorage)</a><br />
|
55
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000030">init_tables (Ankusa::HBaseStorage)</a><br />
|
56
|
+
<a href="classes/Ankusa/Classifier.html#M000008">log_likelihoods (Ankusa::Classifier)</a><br />
|
57
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000026">new (Ankusa::HBaseStorage)</a><br />
|
53
58
|
<a href="classes/Ankusa/Classifier.html#M000003">new (Ankusa::Classifier)</a><br />
|
54
|
-
<a href="classes/Ankusa/
|
55
|
-
<a href="classes/Ankusa/
|
59
|
+
<a href="classes/Ankusa/TextHash.html#M000043">new (Ankusa::TextHash)</a><br />
|
60
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000012">new (Ankusa::MemoryStorage)</a><br />
|
56
61
|
<a href="classes/String.html#M000001">numeric? (String)</a><br />
|
57
|
-
<a href="classes/Ankusa/
|
58
|
-
<a href="classes/Ankusa/
|
59
|
-
<a href="classes/Ankusa/HBaseStorage.html#
|
62
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000028">reset (Ankusa::HBaseStorage)</a><br />
|
63
|
+
<a href="classes/Ankusa/MemoryStorage.html#M000014">reset (Ankusa::MemoryStorage)</a><br />
|
64
|
+
<a href="classes/Ankusa/HBaseStorage.html#M000041">summary_table (Ankusa::HBaseStorage)</a><br />
|
60
65
|
<a href="classes/String.html#M000002">to_ascii (String)</a><br />
|
61
66
|
<a href="classes/Ankusa/Classifier.html#M000004">train (Ankusa::Classifier)</a><br />
|
62
67
|
<a href="classes/Ankusa/Classifier.html#M000005">untrain (Ankusa::Classifier)</a><br />
|
63
|
-
<a href="classes/Ankusa/TextHash.html#
|
68
|
+
<a href="classes/Ankusa/TextHash.html#M000047">valid_word? (Ankusa::TextHash)</a><br />
|
69
|
+
<a href="classes/Ankusa/Classifier.html#M000011">vocab_sizes (Ankusa::Classifier)</a><br />
|
64
70
|
</div>
|
65
71
|
</div>
|
66
72
|
</body>
|
data/lib/ankusa/classifier.rb
CHANGED
@@ -21,6 +21,9 @@ module Ankusa
|
|
21
21
|
doccount = (text.kind_of? Array) ? text.length : 1
|
22
22
|
@storage.incr_doc_count klass, doccount
|
23
23
|
@classnames << klass if not @classnames.include? klass
|
24
|
+
# cache is now dirty of these vars
|
25
|
+
@doc_count_totals = nil
|
26
|
+
@vocab_sizes = nil
|
24
27
|
th
|
25
28
|
end
|
26
29
|
|
@@ -35,44 +38,70 @@ module Ankusa
|
|
35
38
|
@storage.incr_total_word_count klass, -th.word_count
|
36
39
|
doccount = (text.kind_of? Array) ? text.length : 1
|
37
40
|
@storage.incr_doc_count klass, -doccount
|
41
|
+
# cache is now dirty of these vars
|
42
|
+
@doc_count_totals = nil
|
43
|
+
@vocab_sizes = nil
|
38
44
|
th
|
39
45
|
end
|
40
46
|
|
41
|
-
def classify(text)
|
47
|
+
def classify(text, classes=nil)
|
42
48
|
# return the most probable class
|
43
|
-
|
49
|
+
log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
|
44
50
|
end
|
45
51
|
|
46
|
-
|
52
|
+
# Classes is an array of classes to look at
|
53
|
+
def classifications(text, classnames=nil)
|
54
|
+
result = log_likelihoods text, classnames
|
55
|
+
result.keys.each { |k|
|
56
|
+
result[k] = Math.exp result[k]
|
57
|
+
}
|
58
|
+
|
59
|
+
# normalize to get probs
|
60
|
+
sum = result.values.inject { |x,y| x+y }
|
61
|
+
result.keys.each { |k| result[k] = result[k] / sum }
|
62
|
+
result
|
63
|
+
end
|
64
|
+
|
65
|
+
# Classes is an array of classes to look at
|
66
|
+
def log_likelihoods(text, classnames=nil)
|
67
|
+
classnames ||= @classnames
|
47
68
|
result = Hash.new 0
|
48
69
|
|
49
70
|
TextHash.new(text).each { |word, count|
|
50
|
-
probs = get_word_probs(word)
|
51
|
-
|
71
|
+
probs = get_word_probs(word, classnames)
|
72
|
+
classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
|
52
73
|
}
|
53
74
|
|
54
75
|
# add the prior and exponentiate
|
55
|
-
|
56
|
-
|
57
|
-
|
76
|
+
doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
|
77
|
+
doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
|
78
|
+
classnames.each { |k|
|
79
|
+
result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
|
58
80
|
}
|
59
81
|
|
60
|
-
# normalize to get probs
|
61
|
-
sum = result.values.inject { |x,y| x+y }
|
62
|
-
@classnames.each { |k| result[k] = result[k] / sum }
|
63
82
|
result
|
64
83
|
end
|
65
84
|
|
66
85
|
protected
|
67
|
-
def get_word_probs(word)
|
68
|
-
probs =
|
69
|
-
@
|
86
|
+
def get_word_probs(word, classnames)
|
87
|
+
probs = Hash.new 0
|
88
|
+
@storage.get_word_counts(word).each { |k,v| probs[k] = v if classnames.include? k }
|
89
|
+
vs = vocab_sizes
|
90
|
+
classnames.each { |cn|
|
70
91
|
# use a laplacian smoother
|
71
|
-
probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) +
|
92
|
+
probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + vs[cn]).to_f
|
72
93
|
}
|
73
94
|
probs
|
74
95
|
end
|
75
96
|
|
97
|
+
def doc_count_totals
|
98
|
+
@doc_count_totals ||= @storage.doc_count_totals
|
99
|
+
end
|
100
|
+
|
101
|
+
def vocab_sizes
|
102
|
+
@vocab_sizes ||= @storage.get_vocabulary_sizes
|
103
|
+
end
|
104
|
+
|
76
105
|
end
|
77
106
|
|
78
107
|
end
|
data/lib/ankusa/hbase_storage.rb
CHANGED
@@ -53,12 +53,17 @@ module Ankusa
|
|
53
53
|
|
54
54
|
row.first.columns.each { |colname, cell|
|
55
55
|
classname = colname.split(':')[1].intern
|
56
|
-
|
56
|
+
# in case untrain has been called too many times
|
57
|
+
counts[classname] = [cell.to_i64.to_f, 0].max
|
57
58
|
}
|
58
59
|
|
59
60
|
counts
|
60
61
|
end
|
61
62
|
|
63
|
+
def get_vocabulary_sizes
|
64
|
+
get_summary "totals:vocabsize"
|
65
|
+
end
|
66
|
+
|
62
67
|
def get_total_word_count(klass)
|
63
68
|
@klass_word_counts.fetch(klass) {
|
64
69
|
@klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
|
@@ -72,7 +77,15 @@ module Ankusa
|
|
72
77
|
end
|
73
78
|
|
74
79
|
def incr_word_count(klass, word, count)
|
75
|
-
freq_table.atomic_increment word, "classes:#{klass.to_s}", count
|
80
|
+
size = freq_table.atomic_increment word, "classes:#{klass.to_s}", count
|
81
|
+
# if this is a new word, increase the klass's vocab size. If the new word
|
82
|
+
# count is 0, then we need to decrement our vocab size
|
83
|
+
if size == count
|
84
|
+
summary_table.atomic_increment klass, "totals:vocabsize"
|
85
|
+
elsif size == 0
|
86
|
+
summary_table.atomic_increment klass, "totals:vocabsize", -1
|
87
|
+
end
|
88
|
+
size
|
76
89
|
end
|
77
90
|
|
78
91
|
def incr_total_word_count(klass, count)
|
@@ -83,12 +96,8 @@ module Ankusa
|
|
83
96
|
@klass_doc_counts[klass] = summary_table.atomic_increment klass, "totals:doccount", count
|
84
97
|
end
|
85
98
|
|
86
|
-
def
|
87
|
-
|
88
|
-
summary_table.create_scanner("", "totals:doccount") { |row|
|
89
|
-
total += row.columns["totals:doccount"].to_i64
|
90
|
-
}
|
91
|
-
total
|
99
|
+
def doc_count_totals
|
100
|
+
get_summary "totals:doccount"
|
92
101
|
end
|
93
102
|
|
94
103
|
def close
|
@@ -96,6 +105,14 @@ module Ankusa
|
|
96
105
|
end
|
97
106
|
|
98
107
|
protected
|
108
|
+
def get_summary(name)
|
109
|
+
counts = Hash.new 0
|
110
|
+
summary_table.create_scanner("", name) { |row|
|
111
|
+
counts[row.row.intern] = row.columns[name].to_i64
|
112
|
+
}
|
113
|
+
counts
|
114
|
+
end
|
115
|
+
|
99
116
|
def summary_table
|
100
117
|
@stable ||= @hbase.get_table @stablename
|
101
118
|
end
|
@@ -24,6 +24,14 @@ module Ankusa
|
|
24
24
|
@klass_doc_counts = {}
|
25
25
|
end
|
26
26
|
|
27
|
+
def get_vocabulary_sizes
|
28
|
+
count = Hash.new 0
|
29
|
+
@freqs.each { |w, ks|
|
30
|
+
ks.keys.each { |k| count[k] += 1 }
|
31
|
+
}
|
32
|
+
count
|
33
|
+
end
|
34
|
+
|
27
35
|
def get_word_counts(word)
|
28
36
|
@freqs.fetch word, Hash.new(0)
|
29
37
|
end
|
@@ -49,8 +57,8 @@ module Ankusa
|
|
49
57
|
@total_doc_counts[klass] += count
|
50
58
|
end
|
51
59
|
|
52
|
-
def
|
53
|
-
@total_doc_counts
|
60
|
+
def doc_count_totals
|
61
|
+
@total_doc_counts
|
54
62
|
end
|
55
63
|
|
56
64
|
def close
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ankusa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 6
|
10
|
+
version: 0.0.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Muller
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-06 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -50,7 +50,7 @@ dependencies:
|
|
50
50
|
version: 1.0.0
|
51
51
|
type: :runtime
|
52
52
|
version_requirements: *id002
|
53
|
-
description:
|
53
|
+
description: Text classifier with HBase storage
|
54
54
|
email: brian.muller@livingsocial.com
|
55
55
|
executables: []
|
56
56
|
|
@@ -122,6 +122,6 @@ rubyforge_project:
|
|
122
122
|
rubygems_version: 1.3.7
|
123
123
|
signing_key:
|
124
124
|
specification_version: 3
|
125
|
-
summary:
|
125
|
+
summary: Text classifier in Ruby that uses Hadoop's HBase for storage
|
126
126
|
test_files: []
|
127
127
|
|