ankusa 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +674 -0
- data/README.rdoc +30 -0
- data/Rakefile +45 -0
- data/docs/classes/Ankusa.html +149 -0
- data/docs/classes/Ankusa/Classifier.html +517 -0
- data/docs/classes/Ankusa/NBClass.html +168 -0
- data/docs/classes/Ankusa/TextHash.html +220 -0
- data/docs/created.rid +1 -0
- data/docs/files/README_rdoc.html +141 -0
- data/docs/files/lib/ankusa/classifier_rb.html +101 -0
- data/docs/files/lib/ankusa/hasher_rb.html +109 -0
- data/docs/files/lib/ankusa/nbclass_rb.html +101 -0
- data/docs/files/lib/ankusa/stopwords_rb.html +101 -0
- data/docs/files/lib/ankusa_rb.html +110 -0
- data/docs/fr_class_index.html +30 -0
- data/docs/fr_file_index.html +32 -0
- data/docs/fr_method_index.html +43 -0
- data/docs/index.html +24 -0
- data/docs/rdoc-style.css +208 -0
- data/lib/ankusa.rb +3 -0
- data/lib/ankusa/classifier.rb +125 -0
- data/lib/ankusa/hasher.rb +33 -0
- data/lib/ankusa/nbclass.rb +15 -0
- data/lib/ankusa/stopwords.rb +4 -0
- metadata +121 -0
@@ -0,0 +1,30 @@
|
|
1
|
+
|
2
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
3
|
+
<!DOCTYPE html
|
4
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
5
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
6
|
+
|
7
|
+
<!--
|
8
|
+
|
9
|
+
Classes
|
10
|
+
|
11
|
+
-->
|
12
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
13
|
+
<head>
|
14
|
+
<title>Classes</title>
|
15
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
16
|
+
<link rel="stylesheet" href="rdoc-style.css" type="text/css" />
|
17
|
+
<base target="docwin" />
|
18
|
+
</head>
|
19
|
+
<body>
|
20
|
+
<div id="index">
|
21
|
+
<h1 class="section-bar">Classes</h1>
|
22
|
+
<div id="index-entries">
|
23
|
+
<a href="classes/Ankusa.html">Ankusa</a><br />
|
24
|
+
<a href="classes/Ankusa/Classifier.html">Ankusa::Classifier</a><br />
|
25
|
+
<a href="classes/Ankusa/NBClass.html">Ankusa::NBClass</a><br />
|
26
|
+
<a href="classes/Ankusa/TextHash.html">Ankusa::TextHash</a><br />
|
27
|
+
</div>
|
28
|
+
</div>
|
29
|
+
</body>
|
30
|
+
</html>
|
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
3
|
+
<!DOCTYPE html
|
4
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
5
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
6
|
+
|
7
|
+
<!--
|
8
|
+
|
9
|
+
Files
|
10
|
+
|
11
|
+
-->
|
12
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
13
|
+
<head>
|
14
|
+
<title>Files</title>
|
15
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
16
|
+
<link rel="stylesheet" href="rdoc-style.css" type="text/css" />
|
17
|
+
<base target="docwin" />
|
18
|
+
</head>
|
19
|
+
<body>
|
20
|
+
<div id="index">
|
21
|
+
<h1 class="section-bar">Files</h1>
|
22
|
+
<div id="index-entries">
|
23
|
+
<a href="files/README_rdoc.html">README.rdoc</a><br />
|
24
|
+
<a href="files/lib/ankusa_rb.html">lib/ankusa.rb</a><br />
|
25
|
+
<a href="files/lib/ankusa/classifier_rb.html">lib/ankusa/classifier.rb</a><br />
|
26
|
+
<a href="files/lib/ankusa/hasher_rb.html">lib/ankusa/hasher.rb</a><br />
|
27
|
+
<a href="files/lib/ankusa/nbclass_rb.html">lib/ankusa/nbclass.rb</a><br />
|
28
|
+
<a href="files/lib/ankusa/stopwords_rb.html">lib/ankusa/stopwords.rb</a><br />
|
29
|
+
</div>
|
30
|
+
</div>
|
31
|
+
</body>
|
32
|
+
</html>
|
@@ -0,0 +1,43 @@
|
|
1
|
+
|
2
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
3
|
+
<!DOCTYPE html
|
4
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
5
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
6
|
+
|
7
|
+
<!--
|
8
|
+
|
9
|
+
Methods
|
10
|
+
|
11
|
+
-->
|
12
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
13
|
+
<head>
|
14
|
+
<title>Methods</title>
|
15
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
16
|
+
<link rel="stylesheet" href="rdoc-style.css" type="text/css" />
|
17
|
+
<base target="docwin" />
|
18
|
+
</head>
|
19
|
+
<body>
|
20
|
+
<div id="index">
|
21
|
+
<h1 class="section-bar">Methods</h1>
|
22
|
+
<div id="index-entries">
|
23
|
+
<a href="classes/Ankusa/TextHash.html#M000016">add_text (Ankusa::TextHash)</a><br />
|
24
|
+
<a href="classes/Ankusa/TextHash.html#M000017">add_word (Ankusa::TextHash)</a><br />
|
25
|
+
<a href="classes/Ankusa/Classifier.html#M000005">classifications (Ankusa::Classifier)</a><br />
|
26
|
+
<a href="classes/Ankusa/Classifier.html#M000004">classify (Ankusa::Classifier)</a><br />
|
27
|
+
<a href="classes/Ankusa/Classifier.html#M000009">doc_count_total (Ankusa::Classifier)</a><br />
|
28
|
+
<a href="classes/Ankusa/Classifier.html#M000007">drop_tables (Ankusa::Classifier)</a><br />
|
29
|
+
<a href="classes/Ankusa/Classifier.html#M000013">freq_table (Ankusa::Classifier)</a><br />
|
30
|
+
<a href="classes/Ankusa/Classifier.html#M000010">get_word_probs (Ankusa::Classifier)</a><br />
|
31
|
+
<a href="classes/Ankusa/Classifier.html#M000011">init_tables (Ankusa::Classifier)</a><br />
|
32
|
+
<a href="classes/Ankusa/NBClass.html#M000014">new (Ankusa::NBClass)</a><br />
|
33
|
+
<a href="classes/Ankusa/TextHash.html#M000015">new (Ankusa::TextHash)</a><br />
|
34
|
+
<a href="classes/Ankusa/Classifier.html#M000001">new (Ankusa::Classifier)</a><br />
|
35
|
+
<a href="classes/Ankusa/Classifier.html#M000006">refresh_classnames (Ankusa::Classifier)</a><br />
|
36
|
+
<a href="classes/Ankusa/Classifier.html#M000008">reset (Ankusa::Classifier)</a><br />
|
37
|
+
<a href="classes/Ankusa/Classifier.html#M000012">summary_table (Ankusa::Classifier)</a><br />
|
38
|
+
<a href="classes/Ankusa/Classifier.html#M000002">train (Ankusa::Classifier)</a><br />
|
39
|
+
<a href="classes/Ankusa/Classifier.html#M000003">untrain (Ankusa::Classifier)</a><br />
|
40
|
+
</div>
|
41
|
+
</div>
|
42
|
+
</body>
|
43
|
+
</html>
|
data/docs/index.html
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
|
5
|
+
|
6
|
+
<!--
|
7
|
+
|
8
|
+
HBaseRb - Naive Bayes classifier with HBase storage
|
9
|
+
|
10
|
+
-->
|
11
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
12
|
+
<head>
|
13
|
+
<title>HBaseRb - Naive Bayes classifier with HBase storage</title>
|
14
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
15
|
+
</head>
|
16
|
+
<frameset rows="20%, 80%">
|
17
|
+
<frameset cols="25%,35%,45%">
|
18
|
+
<frame src="fr_file_index.html" title="Files" name="Files" />
|
19
|
+
<frame src="fr_class_index.html" name="Classes" />
|
20
|
+
<frame src="fr_method_index.html" name="Methods" />
|
21
|
+
</frameset>
|
22
|
+
<frame src="files/README_rdoc.html" name="docwin" />
|
23
|
+
</frameset>
|
24
|
+
</html>
|
data/docs/rdoc-style.css
ADDED
@@ -0,0 +1,208 @@
|
|
1
|
+
|
2
|
+
body {
|
3
|
+
font-family: Verdana,Arial,Helvetica,sans-serif;
|
4
|
+
font-size: 90%;
|
5
|
+
margin: 0;
|
6
|
+
margin-left: 40px;
|
7
|
+
padding: 0;
|
8
|
+
background: white;
|
9
|
+
}
|
10
|
+
|
11
|
+
h1,h2,h3,h4 { margin: 0; color: #efefef; background: transparent; }
|
12
|
+
h1 { font-size: 150%; }
|
13
|
+
h2,h3,h4 { margin-top: 1em; }
|
14
|
+
|
15
|
+
a { background: #eef; color: #039; text-decoration: none; }
|
16
|
+
a:hover { background: #039; color: #eef; }
|
17
|
+
|
18
|
+
/* Override the base stylesheet's Anchor inside a table cell */
|
19
|
+
td > a {
|
20
|
+
background: transparent;
|
21
|
+
color: #039;
|
22
|
+
text-decoration: none;
|
23
|
+
}
|
24
|
+
|
25
|
+
/* and inside a section title */
|
26
|
+
.section-title > a {
|
27
|
+
background: transparent;
|
28
|
+
color: #eee;
|
29
|
+
text-decoration: none;
|
30
|
+
}
|
31
|
+
|
32
|
+
/* === Structural elements =================================== */
|
33
|
+
|
34
|
+
div#index {
|
35
|
+
margin: 0;
|
36
|
+
margin-left: -40px;
|
37
|
+
padding: 0;
|
38
|
+
font-size: 90%;
|
39
|
+
}
|
40
|
+
|
41
|
+
|
42
|
+
div#index a {
|
43
|
+
margin-left: 0.7em;
|
44
|
+
}
|
45
|
+
|
46
|
+
div#index .section-bar {
|
47
|
+
margin-left: 0px;
|
48
|
+
padding-left: 0.7em;
|
49
|
+
background: #ccc;
|
50
|
+
font-size: small;
|
51
|
+
}
|
52
|
+
|
53
|
+
|
54
|
+
div#classHeader, div#fileHeader {
|
55
|
+
width: auto;
|
56
|
+
color: white;
|
57
|
+
padding: 0.5em 1.5em 0.5em 1.5em;
|
58
|
+
margin: 0;
|
59
|
+
margin-left: -40px;
|
60
|
+
border-bottom: 3px solid #006;
|
61
|
+
}
|
62
|
+
|
63
|
+
div#classHeader a, div#fileHeader a {
|
64
|
+
background: inherit;
|
65
|
+
color: white;
|
66
|
+
}
|
67
|
+
|
68
|
+
div#classHeader td, div#fileHeader td {
|
69
|
+
background: inherit;
|
70
|
+
color: white;
|
71
|
+
}
|
72
|
+
|
73
|
+
|
74
|
+
div#fileHeader {
|
75
|
+
background: #057;
|
76
|
+
}
|
77
|
+
|
78
|
+
div#classHeader {
|
79
|
+
background: #048;
|
80
|
+
}
|
81
|
+
|
82
|
+
|
83
|
+
.class-name-in-header {
|
84
|
+
font-size: 180%;
|
85
|
+
font-weight: bold;
|
86
|
+
}
|
87
|
+
|
88
|
+
|
89
|
+
div#bodyContent {
|
90
|
+
padding: 0 1.5em 0 1.5em;
|
91
|
+
}
|
92
|
+
|
93
|
+
div#description {
|
94
|
+
padding: 0.5em 1.5em;
|
95
|
+
background: #efefef;
|
96
|
+
border: 1px dotted #999;
|
97
|
+
}
|
98
|
+
|
99
|
+
div#description h1,h2,h3,h4,h5,h6 {
|
100
|
+
color: #125;;
|
101
|
+
background: transparent;
|
102
|
+
}
|
103
|
+
|
104
|
+
div#validator-badges {
|
105
|
+
text-align: center;
|
106
|
+
}
|
107
|
+
div#validator-badges img { border: 0; }
|
108
|
+
|
109
|
+
div#copyright {
|
110
|
+
color: #333;
|
111
|
+
background: #efefef;
|
112
|
+
font: 0.75em sans-serif;
|
113
|
+
margin-top: 5em;
|
114
|
+
margin-bottom: 0;
|
115
|
+
padding: 0.5em 2em;
|
116
|
+
}
|
117
|
+
|
118
|
+
|
119
|
+
/* === Classes =================================== */
|
120
|
+
|
121
|
+
table.header-table {
|
122
|
+
color: white;
|
123
|
+
font-size: small;
|
124
|
+
}
|
125
|
+
|
126
|
+
.type-note {
|
127
|
+
font-size: small;
|
128
|
+
color: #DEDEDE;
|
129
|
+
}
|
130
|
+
|
131
|
+
.xxsection-bar {
|
132
|
+
background: #eee;
|
133
|
+
color: #333;
|
134
|
+
padding: 3px;
|
135
|
+
}
|
136
|
+
|
137
|
+
.section-bar {
|
138
|
+
color: #333;
|
139
|
+
border-bottom: 1px solid #999;
|
140
|
+
margin-left: -20px;
|
141
|
+
}
|
142
|
+
|
143
|
+
|
144
|
+
.section-title {
|
145
|
+
background: #79a;
|
146
|
+
color: #eee;
|
147
|
+
padding: 3px;
|
148
|
+
margin-top: 2em;
|
149
|
+
margin-left: -30px;
|
150
|
+
border: 1px solid #999;
|
151
|
+
}
|
152
|
+
|
153
|
+
.top-aligned-row { vertical-align: top }
|
154
|
+
.bottom-aligned-row { vertical-align: bottom }
|
155
|
+
|
156
|
+
/* --- Context section classes ----------------------- */
|
157
|
+
|
158
|
+
.context-row { }
|
159
|
+
.context-item-name { font-family: monospace; font-weight: bold; color: black; }
|
160
|
+
.context-item-value { font-size: small; color: #448; }
|
161
|
+
.context-item-desc { color: #333; padding-left: 2em; }
|
162
|
+
|
163
|
+
/* --- Method classes -------------------------- */
|
164
|
+
.method-detail {
|
165
|
+
background: #efefef;
|
166
|
+
padding: 0;
|
167
|
+
margin-top: 0.5em;
|
168
|
+
margin-bottom: 1em;
|
169
|
+
border: 1px dotted #ccc;
|
170
|
+
}
|
171
|
+
.method-heading {
|
172
|
+
color: black;
|
173
|
+
background: #ccc;
|
174
|
+
border-bottom: 1px solid #666;
|
175
|
+
padding: 0.2em 0.5em 0 0.5em;
|
176
|
+
}
|
177
|
+
.method-signature { color: black; background: inherit; }
|
178
|
+
.method-name { font-weight: bold; }
|
179
|
+
.method-args { font-style: italic; }
|
180
|
+
.method-description { padding: 0 0.5em 0 0.5em; }
|
181
|
+
|
182
|
+
/* --- Source code sections -------------------- */
|
183
|
+
|
184
|
+
a.source-toggle { font-size: 90%; }
|
185
|
+
div.method-source-code {
|
186
|
+
background: #262626;
|
187
|
+
color: #ffdead;
|
188
|
+
margin: 1em;
|
189
|
+
padding: 0.5em;
|
190
|
+
border: 1px dashed #999;
|
191
|
+
overflow: hidden;
|
192
|
+
}
|
193
|
+
|
194
|
+
div.method-source-code pre { color: #ffdead; overflow: hidden; }
|
195
|
+
|
196
|
+
/* --- Ruby keyword styles --------------------- */
|
197
|
+
|
198
|
+
.standalone-code { background: #221111; color: #ffdead; overflow: hidden; }
|
199
|
+
|
200
|
+
.ruby-constant { color: #7fffd4; background: transparent; }
|
201
|
+
.ruby-keyword { color: #00ffff; background: transparent; }
|
202
|
+
.ruby-ivar { color: #eedd82; background: transparent; }
|
203
|
+
.ruby-operator { color: #00ffee; background: transparent; }
|
204
|
+
.ruby-identifier { color: #ffdead; background: transparent; }
|
205
|
+
.ruby-node { color: #ffa07a; background: transparent; }
|
206
|
+
.ruby-comment { color: #b22222; font-weight: bold; background: transparent; }
|
207
|
+
.ruby-regexp { color: #ffa07a; background: transparent; }
|
208
|
+
.ruby-value { color: #7fffd4; background: transparent; }
|
data/lib/ankusa/classifier.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
module Ankusa
  # Numerator of the fallback probability used for a word that has no
  # positive count for a class (see Classifier#get_word_probs, which divides
  # it by the class's word count).
  SMALL_PROB = 0.0001

  # Naive Bayes text classifier whose counts live in two tables obtained from
  # the supplied client:
  #
  # * a frequency table: one row per word, one "classes:<name>" column per
  #   class holding that word's count for the class;
  # * a summary table: one row per class with "totals:wordcount" and
  #   "totals:doccount" columns.
  #
  # NOTE(review): the client is assumed to respond to has_table?,
  # create_table and get_table, and tables to atomic_increment, get_row,
  # create_scanner and delete -- these are the only calls made here; confirm
  # against the client library in use.
  class Classifier
    # Class names known to this classifier, as symbols.
    attr_reader :classnames

    # hbase_client::        client object used to create and look up tables
    # frequency_tablename:: name of the per-word frequency table
    # summary_tablename::   name of the per-class totals table
    #
    # Creates any missing table, then loads the class names already stored.
    def initialize(hbase_client, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
      @hbase = hbase_client
      @ftablename = frequency_tablename
      @stablename = summary_tablename
      init_tables
      @classnames = refresh_classnames
    end

    # Adds the words of +text+ to the counts for class +klass+ and bumps the
    # class's word and document totals.
    def train(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        freq_table.atomic_increment word, "classes:#{klass}", count
      end
      summary_table.atomic_increment klass, "totals:wordcount", th.word_count
      # No amount given: relies on the client's default increment
      # (presumably 1, mirroring the explicit -1 in untrain).
      summary_table.atomic_increment klass, "totals:doccount"

      # refresh_classnames and get_word_probs both work with symbols, so keep
      # the in-memory list in the same form; a String name stored here would
      # never match the columns read back from the frequency table.
      name = klass.to_s.intern
      @classnames << name unless @classnames.include? name
    end

    # Reverses a previous train call: subtracts the words of +text+ from the
    # counts for +klass+ and decrements its word and document totals.
    def untrain(klass, text)
      th = TextHash.new(text)
      th.each do |word, count|
        freq_table.atomic_increment word, "classes:#{klass}", -count
      end
      summary_table.atomic_increment klass, "totals:wordcount", -th.word_count
      summary_table.atomic_increment klass, "totals:doccount", -1
    end

    # Returns the most probable class for +text+, or nil when no class has
    # been trained yet.
    def classify(text)
      best = classifications(text).max_by { |_klass, prob| prob }
      best && best.first
    end

    # Returns a hash mapping every known class name to its normalized
    # probability for +text+ (values sum to 1). Returns an empty hash when no
    # class is known.
    def classifications(text)
      classes = {}
      scores = {}
      @classnames.each do |k|
        classes[k] = NBClass.new k, summary_table, freq_table
        scores[k] = 0.0
      end

      # Sum log-likelihoods; a word seen +count+ times contributes +count+
      # times.
      TextHash.new(text).each do |word, count|
        probs = get_word_probs(word, classes)
        @classnames.each { |k| scores[k] += count * Math.log(probs[k]) }
      end

      # Class prior. The total is scanned once (not once per class) and
      # coerced to Float so an Integer doc_count cannot truncate the ratio
      # to 0.
      total_docs = doc_count_total.to_f
      @classnames.each { |k| scores[k] += Math.log(classes[k].doc_count / total_docs) }

      # Shift by the largest finite score before exponentiating. The shift
      # cancels in the normalization below, but keeps long texts from
      # underflowing every exp() to 0.0 (which would yield 0/0 = NaN).
      shift = scores.values.select(&:finite?).max || 0.0
      weights = {}
      scores.each { |k, score| weights[k] = Math.exp(score - shift) }
      sum = weights.values.inject(0.0) { |acc, w| acc + w }

      result = {}
      weights.each { |k, w| result[k] = w / sum }
      result
    end

    # Scans the summary table and returns the row keys as an array of
    # symbols (one per stored class).
    def refresh_classnames
      cs = []
      summary_table.create_scanner("", "totals") { |row| cs << row.row.intern }
      cs
    end

    # Deletes both tables and forgets the cached table handles.
    def drop_tables
      freq_table.delete
      summary_table.delete
      @stable = nil
      @ftable = nil
    end

    # Drops and recreates both tables, then reloads the class list so it no
    # longer names classes whose data was just removed.
    def reset
      drop_tables
      init_tables
      @classnames = refresh_classnames
    end

    # Returns the sum of "totals:doccount" over all rows of the summary
    # table.
    def doc_count_total
      total = 0
      summary_table.create_scanner("", "totals:doccount") do |row|
        total += row.columns["totals:doccount"].to_i64
      end
      total
    end

    protected

    # Returns a hash of class name => P(word | class) for every known class.
    # +classes+ maps class names to NBClass objects. A class with no positive
    # count for +word+ gets SMALL_PROB / word_count instead of 0, so the
    # caller never takes log(0).
    def get_word_probs(word, classes)
      probs = {}
      @classnames.each { |cn| probs[cn] = Ankusa::SMALL_PROB / classes[cn].word_count }
      row = freq_table.get_row(word)
      return probs if row.length == 0

      row.first.columns.each do |colname, cell|
        family, name = colname.split(':', 2)
        next unless family == "classes" && name

        classname = name.intern
        # Skip columns for classes this instance does not know about.
        next unless classes.key?(classname)

        # A count driven to zero (or below) by untrain is treated like a
        # word never seen for the class: keep the SMALL_PROB floor.
        count = cell.to_i64
        probs[classname] = count.to_f / classes[classname].word_count if count > 0
      end
      probs
    end

    # Creates the frequency and summary tables if they do not exist yet.
    def init_tables
      unless @hbase.has_table? @ftablename
        @hbase.create_table @ftablename, "classes", "total"
      end

      unless @hbase.has_table? @stablename
        @hbase.create_table @stablename, "totals"
      end
    end

    # Memoized handle to the summary table.
    def summary_table
      @stable ||= @hbase.get_table @stablename
    end

    # Memoized handle to the frequency table.
    def freq_table
      @ftable ||= @hbase.get_table @ftablename
    end
  end
end
|