clusterer 0.1.0 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/README +29 -7
- data/examples/google_search_cluster.rb +13 -7
- data/examples/yahoo_search_cluster.rb +18 -31
- data/lib/clusterer.rb +36 -95
- data/lib/clusterer/algorithms.rb +95 -0
- data/lib/clusterer/bayes.rb +255 -0
- data/lib/clusterer/cluster.rb +56 -0
- data/lib/clusterer/clustering.rb +35 -0
- data/lib/clusterer/document.rb +71 -0
- data/lib/clusterer/document_array.rb +79 -0
- data/lib/clusterer/document_base.rb +32 -0
- data/lib/clusterer/documents_centroid.rb +44 -0
- data/lib/clusterer/inverse_document_frequency.rb +83 -0
- data/lib/clusterer/lsi/dmatrix.rb +132 -0
- data/lib/clusterer/lsi/document_vector.rb +54 -0
- data/lib/clusterer/lsi/documents_centroid_vector.rb +51 -0
- data/lib/clusterer/lsi/lsi.rb +95 -0
- data/lib/clusterer/similarity.rb +34 -0
- data/lib/{word_hash.rb → clusterer/stop_words.rb} +21 -23
- data/lib/clusterer/tokenizer.rb +70 -0
- data/tests/algorithms_test.rb +48 -0
- data/tests/bayes_test.rb +68 -0
- data/tests/cluster_test.rb +54 -0
- data/tests/document_array_test.rb +64 -0
- data/tests/document_centroid_test.rb +64 -0
- data/tests/document_test.rb +71 -0
- data/tests/inverse_document_frequency_test.rb +76 -0
- data/tests/lsi_test.rb +77 -0
- data/tests/similarity_test.rb +62 -0
- data/tests/tokenizer_test.rb +72 -0
- metadata +35 -9
- data/lib/similarity.rb +0 -27
- data/tests/clusterer_test.rb +0 -20
data/README
CHANGED
@@ -1,8 +1,15 @@
|
|
1
|
-
A ruby library which implements
|
2
|
-
mining.
|
1
|
+
A ruby library which implements clustering and classification
|
2
|
+
algorithms for text mining.
|
3
3
|
|
4
|
-
|
5
|
-
clustering.
|
4
|
+
Clustering algorithms currently implemented are - K-Means, and
|
5
|
+
Hierarchical clustering, LSI. Many variations of these algorithms are
|
6
|
+
also available, where you can change the similarity function or use the
|
7
|
+
refined version, i.e., hierarchical/bisecting clustering followed by
|
8
|
+
Kmeans.
|
9
|
+
|
10
|
+
LSI can also be used for clustering. In this approach, first an SVD transformation
|
11
|
+
is done, and then the documents in the new space are clustered (any of
|
12
|
+
KMeans, hierarchical/bisecting clustering can be used).
|
6
13
|
|
7
14
|
Hierarchical clustering gives better results, but its complexity is roughly O(n*n)
|
8
15
|
|
@@ -13,9 +20,24 @@ ysearch-rb from
|
|
13
20
|
|
14
21
|
http://developer.yahoo.com/download/download.html
|
15
22
|
|
23
|
+
The multinomial, complement and weightnormalized complement Bayes
|
24
|
+
algorithms are also implemented.
|
25
|
+
|
26
|
+
LSI can also be used for classification.
|
16
27
|
|
17
|
-
|
18
|
-
clustering... coming soon ... (or submit pacthes/show keen interest, if
|
19
|
-
you want faster results)
|
28
|
+
Thanks a lot to the many people who researched and worked on these ideas.
|
20
29
|
|
21
30
|
Happy hacking......
|
31
|
+
|
32
|
+
|
33
|
+
ToDo:
|
34
|
+
Add more documentation, and explain the API.
|
35
|
+
Add more examples.
|
36
|
+
Incorporate the C version of Gorrell and Simon Funk's GHA and SVD algorithm, also
|
37
|
+
write a Ruby version.
|
38
|
+
Incorporate my own C version of Kernel SVD, and GHA.
|
39
|
+
Explore ways to improve and make a better tokenizer, and introduce other NLP techniques.
|
40
|
+
Add more classification algos: Decision Trees, and various extensions of it.
|
41
|
+
Ruby and C version of SVM and NN.
|
42
|
+
Feature Selection.
|
43
|
+
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
4
|
|
5
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
6
|
+
|
5
7
|
require 'soap/wsdlDriver'
|
6
8
|
require 'clusterer'
|
7
9
|
|
@@ -9,7 +11,7 @@ require 'clusterer'
|
|
9
11
|
|
10
12
|
WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
|
11
13
|
driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
|
12
|
-
query = '
|
14
|
+
query = 'kolkata'
|
13
15
|
key = ""
|
14
16
|
|
15
17
|
results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
|
@@ -17,28 +19,32 @@ count= results.resultElements.size
|
|
17
19
|
max_count = results.estimatedTotalResultsCount.to_i
|
18
20
|
results = results.resultElements
|
19
21
|
|
20
|
-
while (count <
|
22
|
+
while (count < 20 && count <= max_count)
|
21
23
|
more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
|
22
24
|
results.concat(more_results.resultElements)
|
23
25
|
count += more_results.resultElements.size
|
24
26
|
end
|
25
27
|
|
26
|
-
clusters = Clusterer::Clustering.
|
27
|
-
|
28
|
+
clusters = Clusterer::Clustering.cluster(:kmeans, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer) {|r|
|
29
|
+
r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")}
|
28
30
|
|
29
31
|
#writing the output
|
30
32
|
File.open("temp.html","w") do |f|
|
31
33
|
f.write("<ul>")
|
32
34
|
clusters.each do |clus|
|
33
35
|
f.write("<li>")
|
36
|
+
f.write("<h4>")
|
37
|
+
clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
|
38
|
+
f.write("</h4>")
|
34
39
|
f.write("<ul>")
|
35
|
-
clus.each do |
|
40
|
+
clus.documents.each do |doc|
|
41
|
+
result = doc.object
|
36
42
|
f.write("<li>")
|
37
43
|
f.write("<span class='title'>")
|
38
|
-
f.write(
|
44
|
+
f.write(result.title)
|
39
45
|
f.write("</span>")
|
40
46
|
f.write("<span class='snippet'>")
|
41
|
-
f.write(
|
47
|
+
f.write(result.snippet)
|
42
48
|
f.write("</span>")
|
43
49
|
f.write("</li>")
|
44
50
|
end
|
@@ -2,53 +2,40 @@
|
|
2
2
|
|
3
3
|
#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
4
|
|
5
|
-
|
6
|
-
require 'ysearch-rb/lib/ysearch'
|
7
|
-
|
8
|
-
## try using HTML stripping to get better results
|
5
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
9
6
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
##
|
14
|
-
# create a web search object:
|
15
|
-
# Arguments:
|
16
|
-
# 1. App ID (You can get one at http://developer.yahoo.net)
|
17
|
-
# 2. The query
|
18
|
-
# 3. type can be one of: 'all', 'any' or 'phrase'
|
19
|
-
# 4. The no. of results
|
20
|
-
##
|
21
|
-
obj = WebSearch.new('YahooDemo', query, 'all', 100)
|
7
|
+
require 'clusterer'
|
8
|
+
require 'rubygems'
|
9
|
+
require 'yahoo/web_search'
|
22
10
|
|
23
|
-
results = obj.parse_results
|
24
11
|
|
25
|
-
|
26
|
-
|
27
|
-
|
12
|
+
ys = Yahoo::WebSearch.new "mUZGF4TV34F2H2aNPZat57sIgR7P2aKBwQm4aEq3TxOg1lYrlhRdMbbmdhmSwyYoRA4bOw--"
|
13
|
+
query = "kolkata"
|
14
|
+
results, = ys.search query, 10
|
28
15
|
|
29
|
-
|
30
|
-
# more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
|
31
|
-
# results.concat(more_results.resultElements)
|
32
|
-
# count += more_results.resultElements.size
|
33
|
-
# end
|
16
|
+
## try using HTML stripping to get better results
|
34
17
|
|
35
|
-
#
|
36
|
-
clusters = Clusterer::Clustering.
|
37
|
-
|
18
|
+
#kmeans
|
19
|
+
clusters = Clusterer::Clustering.cluster(:hierarchical, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer){|r|
|
20
|
+
r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.summary.to_s.gsub(/<\/?[^>]*>/, "")}
|
38
21
|
|
39
22
|
#writing the output
|
40
23
|
File.open("temp.html","w") do |f|
|
41
24
|
f.write("<ul>")
|
42
25
|
clusters.each do |clus|
|
43
26
|
f.write("<li>")
|
27
|
+
f.write("<h4>")
|
28
|
+
clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
|
29
|
+
f.write("</h4>")
|
44
30
|
f.write("<ul>")
|
45
|
-
clus.each do |
|
31
|
+
clus.documents.each do |doc|
|
32
|
+
result = doc.object
|
46
33
|
f.write("<li>")
|
47
34
|
f.write("<span class='title'>")
|
48
|
-
f.write(
|
35
|
+
f.write(result.title)
|
49
36
|
f.write("</span>")
|
50
37
|
f.write("<span class='snippet'>")
|
51
|
-
f.write(
|
38
|
+
f.write(result.summary)
|
52
39
|
f.write("</span>")
|
53
40
|
f.write("</li>")
|
54
41
|
end
|
data/lib/clusterer.rb
CHANGED
@@ -1,100 +1,41 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
#--
|
3
2
|
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
4
22
|
|
5
|
-
require 'word_hash'
|
6
|
-
require 'similarity'
|
7
23
|
|
8
24
|
module Clusterer
|
9
|
-
|
10
|
-
class << self
|
11
|
-
#returns clusters containing index of the elements in doc
|
12
|
-
def kmeans_clustering (docs, k = nil, max_iter = 10, &similarity_function)
|
13
|
-
similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
|
14
|
-
k = Math.sqrt(docs.size) unless k
|
15
|
-
docs_hash = Array.new(docs.size)
|
16
|
-
clusters = Array.new(k)
|
17
|
-
cluster_centers = Array.new(k)
|
18
|
-
old_cluster_centers = Array.new(k)
|
19
|
-
docs.each_with_index {|d,i| docs_hash[i] = d.clean_word_hash}
|
20
|
-
0.upto(k - 1) {|i| x = rand(docs.size); clusters[i], cluster_centers[i] = [x],docs_hash[x].clone }
|
21
|
-
iter = 0
|
22
|
-
while (!max_iter || iter < max_iter) && !convergence(cluster_centers,old_cluster_centers)
|
23
|
-
puts "Iteration ....#{iter}...#{clusters.inspect}"
|
24
|
-
0.upto(k - 1) {|i| clusters[i] = []; old_cluster_centers[i] = cluster_centers[i]}
|
25
|
-
docs_hash.each_with_index do |doc, i|
|
26
|
-
max_value, max_index = 0, 0
|
27
|
-
cluster_centers.each_with_index do |cen, j|
|
28
|
-
sim = similarity_function.call(doc,cen)
|
29
|
-
max_value, max_index = sim,j if sim >= max_value
|
30
|
-
end
|
31
|
-
clusters[max_index] << i
|
32
|
-
end
|
33
|
-
recalculate_centers(cluster_centers,clusters,docs_hash)
|
34
|
-
iter += 1
|
35
|
-
end
|
36
|
-
clusters
|
37
|
-
end
|
38
|
-
|
39
|
-
def hierarchical_clustering (docs, k = nil, &similarity_function)
|
40
|
-
similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
|
41
|
-
k = Math.sqrt(docs.size) unless k
|
42
|
-
docs_hash = Array.new(docs.size)
|
43
|
-
clusters = Array.new(docs.size)
|
44
|
-
cluster_centers = Array.new(docs.size)
|
45
|
-
docs.each_with_index do |d,i|
|
46
|
-
cluster_centers[i] = d.clean_word_hash
|
47
|
-
clusters[i] = [i]
|
48
|
-
end
|
49
|
-
iter = 0
|
50
|
-
while (clusters.size > k)
|
51
|
-
puts "Iteration ....#{iter}...#{clusters.inspect}"
|
52
|
-
min_value, min_index = clusters.size[0], 0
|
53
|
-
clusters.each_with_index {|a, i| (min_value, min_index = a.size, i) if a.size <= min_value}
|
54
|
-
p = cluster_centers.delete_at(min_index)
|
55
|
-
c = clusters.delete_at(min_index)
|
56
|
-
max_value, max_index = 0, 0
|
57
|
-
cluster_centers.each_with_index do |cen, j|
|
58
|
-
sim = similarity_function.call(p,cen)
|
59
|
-
max_value, max_index = sim,j if sim >= max_value
|
60
|
-
end
|
61
|
-
merge_clusters(clusters[max_index],cluster_centers[max_index],c,p)
|
62
|
-
iter += 1
|
63
|
-
end
|
64
|
-
clusters
|
65
|
-
end
|
66
|
-
|
67
|
-
private
|
68
|
-
#merge cluster 2 into cluster 1
|
69
|
-
def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
|
70
|
-
cluster_center1.each_key {|k| cluster_center1[k] *= cluster1.size}
|
71
|
-
cluster_center2.each_key {|k| cluster_center2[k] *= cluster2.size}
|
72
|
-
cluster_center1.merge!(cluster_center2) {|k,o,n| cluster_center1[k] = o + n}
|
73
|
-
cluster1.concat(cluster2)
|
74
|
-
cluster_center1.each_key {|k| cluster_center1[k] /= cluster1.size.to_f}
|
75
|
-
cluster_center1.delete(:total)
|
76
|
-
end
|
77
|
-
|
78
|
-
def recalculate_centers(cluster_centers,clusters,docs_hash)
|
79
|
-
clusters.each_with_index do |cluster,i|
|
80
|
-
center = { }
|
81
|
-
cluster.each {|d| docs_hash[d].each {|w,v| center[w] = (center[w] || 0) + v} }
|
82
|
-
total = 0
|
83
|
-
count = cluster.size
|
84
|
-
count = 1 if count ==0
|
85
|
-
center.each_key {|w| next unless w.class == String;
|
86
|
-
center[w] /=count.to_f; total += center[w]**2}
|
87
|
-
total = 1 if total == 0
|
88
|
-
center[:total] = total
|
89
|
-
cluster_centers[i] = center
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
|
94
|
-
def convergence(new_centers,old_centers)
|
95
|
-
new_centers.each_with_index {|c,i| return false unless c == old_centers[i]}
|
96
|
-
return true
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
25
|
+
VERSION = '0.2.0'
|
100
26
|
end
|
27
|
+
|
28
|
+
require 'mathn'
|
29
|
+
require 'clusterer/stop_words'
|
30
|
+
require 'clusterer/similarity'
|
31
|
+
require 'clusterer/tokenizer'
|
32
|
+
require 'clusterer/document_base'
|
33
|
+
require 'clusterer/inverse_document_frequency'
|
34
|
+
require 'clusterer/document'
|
35
|
+
require 'clusterer/documents_centroid'
|
36
|
+
require 'clusterer/document_array'
|
37
|
+
require 'clusterer/cluster'
|
38
|
+
require 'clusterer/algorithms'
|
39
|
+
require 'clusterer/clustering'
|
40
|
+
require 'clusterer/lsi/lsi'
|
41
|
+
require 'clusterer/bayes'
|
@@ -0,0 +1,95 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Clustering strategies, exposed as class-level methods.
  #
  # Every method takes the documents to cluster, the desired number of
  # clusters +k+ and an options hash, and returns an Array of Cluster objects.
  # The options hash passed in by the caller is never modified.
  class Algorithms
    class << self
      # K-Means clustering.
      #
      # Options:
      # [:maximum_iterations]  upper bound on the number of passes (default 10)
      # [:seeds]               Array of +k+ initial clusters (default: random seeds)
      # [:similarity_function] name of the method called on each document with a
      #                        cluster centroid (default :cosine_similarity)
      #
      # Iteration stops when the clusters produced by a pass compare equal
      # (via ==) to those of the previous pass, or when the bound is reached.
      def kmeans(documents, k, options = {})
        max_iter = options[:maximum_iterations] || 10
        sim_fun = options[:similarity_function] || :cosine_similarity
        # Work on a copy so a caller-supplied :seeds array is left untouched.
        clusters = (options[:seeds] || random_cluster_seeds(documents, k)).dup
        old_clusters = Array.new(k)

        iter = 0
        while iter < max_iter && clusters != old_clusters
          puts "Iteration ....#{iter}"
          k.times do |i|
            old_clusters[i] = clusters[i]
            clusters[i] = []
          end

          documents.each do |document|
            # max_by evaluates each similarity once; ties go to the lowest index,
            # the same cluster the pairwise comparison would have selected.
            nearest = (0...k).max_by { |i| document.send(sim_fun, old_clusters[i].centroid) }
            clusters[nearest] << document
          end

          k.times { |i| clusters[i] = Cluster.new(clusters[i]) }
          iter += 1
        end
        clusters
      end

      # Bisecting K-Means: repeatedly splits the largest cluster in two with
      # kmeans until +k+ clusters exist. With options[:refined] the result is
      # used as the seed of a final kmeans pass over all documents.
      def bisecting_kmeans(documents, k, options = {})
        clusters = [Cluster.new(documents)]
        while clusters.size < k
          largest = clusters.max { |a, b| a.documents.size <=> b.documents.size }
          # Remove by identity: deleting by == could also drop other clusters
          # that merely compare equal to the one being split.
          clusters.delete_if { |c| c.equal?(largest) }
          clusters.concat(kmeans(largest.documents, 2))
        end
        return clusters unless options[:refined]

        kmeans(documents, k, options.merge(:seeds => clusters))
      end

      # Agglomerative hierarchical clustering: starts with one cluster per
      # document and merges the most similar pair until +k+ clusters remain.
      #
      # options[:similarity_function] names the method called on one cluster
      # with another (default :upgma). With options[:refined] the result seeds
      # a final kmeans pass, which always uses kmeans' own default document
      # similarity rather than the cluster-level function.
      def hierarchical(documents, k, options = {})
        clusters = documents.collect { |d| Cluster.new([d]) }
        sim_fun = options[:similarity_function] || :upgma
        iter = 0
        while clusters.size > k
          puts "Iteration ....#{iter}"

          # Same pair order as nested i < j loops; first best pair wins ties.
          keep, absorb = clusters.combination(2).max_by { |a, b| a.send(sim_fun, b) }
          # Identity-based removal so equal-looking clusters are not lost.
          clusters.delete_if { |c| c.equal?(absorb) }
          keep.merge!(absorb)

          iter += 1
        end
        return clusters unless options[:refined]

        # Build a fresh hash instead of writing nil into the caller's options.
        kmeans(documents, k, options.merge(:seeds => clusters, :similarity_function => nil))
      end

      private

      # Builds +k+ single-document seed clusters from randomly chosen documents.
      # Seeds that compare equal (==) are avoided while distinct candidates
      # remain; if fewer than +k+ distinct seeds exist, the rest are filled with
      # random picks so the method always terminates.
      def random_cluster_seeds(documents, k)
        seeds = []
        documents.shuffle.each do |document|
          break if seeds.size >= k

          candidate = Cluster.new([document])
          seeds << candidate unless seeds.include?(candidate)
        end
        while seeds.size < k
          seeds << Cluster.new([documents[rand(documents.size)]])
        end
        seeds
      end
    end
  end
end
|
@@ -0,0 +1,255 @@
|
|
1
|
+
#--
|
2
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
|
23
|
+
module Clusterer
  # Base class for the Naive Bayes text classifiers defined below.
  #
  # Bayes' rule:  P(y|x)    = P(x|y)     * P(y)  / P(x)
  #               posterior = likelihood * prior / evidence
  #
  # Given the evidence (a document) the posterior of every category is
  # predicted. The variants below differ only in how they compute the
  # likelihood. The evidence is the same for every category, so it is never
  # computed; the posteriors are simply normalized to sum to 1.
  #
  # Documents are expected to respond to +each+, yielding term/frequency pairs.
  #
  # The class is abstract by convention: #classify calls
  # <tt>distribution(document)</tt>, a signature only the subclasses provide.
  class Bayes
    # Matches the dynamic train_<category> / untrain_<category> method names.
    TRAINER_PATTERN = /\A(un)?train_(.*)\z/m

    # The categories (as symbols), in the order used by #distribution.
    attr_accessor :categories

    protected

    # The first argument is an Array of categories (converted to symbols).
    # Currently no options are supported.
    def initialize(categories, options = {})
      @prior_count = {}       # number of training documents per category
      @categories = categories.collect { |c| c.to_sym }
      @likelihood_numer = {}  # per category: term => accumulated frequency
      @likelihood_denom = {}  # per category: total accumulated frequency
      @documents_count = 0    # total number of training documents
      @categories.each do |cl|
        @likelihood_numer[cl] = {}
        @likelihood_denom[cl] = 0.0
        @prior_count[cl] = 0
      end
    end

    # Records one more training document for +category+.
    # Raises ArgumentError for an unknown category.
    def train(document, category)
      check_class(category)
      @prior_count[category] += 1
      @documents_count += 1
    end

    # Removes one training document from +category+.
    # Raises ArgumentError for an unknown category and StandardError when the
    # category has no documents left.
    def untrain(document, category)
      check_class(category)
      raise StandardError, "There are no documents for this class.", caller if @prior_count[category] <= 0
      @prior_count[category] -= 1
      @documents_count -= 1
    end

    # Returns the probability distribution over the categories, in the same
    # order as the categories array. The block receives (category, index) and
    # must return the log-likelihood of the document for that category.
    def distribution
      log_posterior = @categories.each_with_index.map do |cl, ind|
        yield(cl, ind) + Math.log((@prior_count[cl] + 1) / (@documents_count + 1).to_f)
      end
      return log_posterior if log_posterior.empty?

      # Shift by the largest score before exponentiating. The normalized result
      # is mathematically unchanged, but long documents no longer under- or
      # overflow Math.exp and turn the whole distribution into NaN.
      peak = log_posterior.max
      weights = log_posterior.map { |score| Math.exp(score - peak) }
      total = weights.inject(0.0) { |acc, w| acc + w }
      weights.map { |w| w / total }
    end

    public

    # Returns the category with the highest posterior probability for the
    # document. +weight+ is accepted for interface compatibility and unused.
    def classify(document, weight = nil)
      posterior = distribution(document)
      best = (0...@categories.size).max_by { |i| posterior[i] }
      @categories[best]
    end

    # Provides training and untraining methods that carry the category in
    # their name. For example:
    #
    #   train_good document
    #   untrain_good document
    #
    def method_missing(name, *args)
      match = TRAINER_PATTERN.match(name.to_s)
      return super unless match

      # send is required: train/untrain are protected on the base class.
      send("#{match[1]}train", args[0], match[2].to_sym)
    end

    # Keeps respond_to? in agreement with method_missing for known categories.
    def respond_to_missing?(name, include_private = false)
      match = TRAINER_PATTERN.match(name.to_s)
      (match && @categories.include?(match[2].to_sym)) ? true : super
    end

    private

    def check_class(category)
      raise ArgumentError, "Unknown class. It should be one of the following #{categories}.", caller unless categories.include?(category)
    end
  end

  # Based on the description given in "Tackling the Poor Assumptions of Naive
  # Bayes Text Classifiers" by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan
  # and David R. Karger, ICML - 2003
  #
  # The likelihood of a document for a category grows with how often the
  # document's terms were seen while training that category.
  class MultinomialBayes < Bayes
    # Adds the term frequencies of +document+ to +category+.
    def train(document, category)
      category = category.to_sym
      super
      counts = @likelihood_numer[category]
      total = 0.0
      document.each do |term, freq|
        counts[term] = (counts[term] || 0) + freq
        total += freq
      end
      @likelihood_denom[category] += total
    end

    # Subtracts the term frequencies of +document+ from +category+; counts are
    # clamped at zero and terms never seen for the category are ignored.
    def untrain(document, category)
      category = category.to_sym
      super
      counts = @likelihood_numer[category]
      removed = 0.0
      document.each do |term, freq|
        next unless counts[term]

        counts[term] = [counts[term] - freq, 0].max
        removed += freq
      end
      @likelihood_denom[category] = [@likelihood_denom[category] - removed, 0.0].max
    end

    # Posterior distribution of +document+ over the categories, using add-one
    # smoothed term frequencies of each category.
    def distribution(document)
      super() do |cl, _ind|
        counts = @likelihood_numer[cl]
        denom = 1 + @likelihood_denom[cl]
        score = 0.0
        document.each { |term, freq| score += freq * Math.log((1 + (counts[term] || 0)) / denom) }
        score
      end
    end
  end

  # Based on the description given in "Tackling the Poor Assumptions of Naive
  # Bayes Text Classifiers" by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan
  # and David R. Karger, ICML - 2003
  #
  # The likelihood of a document for a category is inversely related to how
  # often its terms were seen while training the *other* categories; hence the
  # name "complement". The term counts stored for a category are therefore the
  # counts of every document trained for any other category.
  #
  # The authors claim this performs better than MultinomialBayes, but take the
  # results with a pinch of salt: MultinomialBayes may be better on balanced
  # datasets. If the dataset is skewed and the minority class is important,
  # use ComplementBayes.
  class ComplementBayes < Bayes
    # Adds the term frequencies of +document+ to every category except
    # +category+.
    def train(document, category)
      category = category.to_sym
      super
      (@categories - [category]).each do |cl|
        counts = @likelihood_numer[cl]
        total = 0.0
        document.each do |term, freq|
          counts[term] = (counts[term] || 0) + freq
          total += freq
        end
        @likelihood_denom[cl] += total
      end
    end

    # Reverses #train: subtracts the term frequencies of +document+ from every
    # category except +category+ (the same categories #train updated).
    def untrain(document, category)
      category = category.to_sym
      super
      (@categories - [category]).each do |cl|
        counts = @likelihood_numer[cl]
        removed = 0.0
        document.each do |term, freq|
          next unless counts[term]

          counts[term] = [counts[term] - freq, 0].max
          removed += freq
        end
        @likelihood_denom[cl] = [@likelihood_denom[cl] - removed, 0.0].max
      end
    end

    # Posterior distribution of +document+ over the categories. The score of
    # a category is the negated log-likelihood under its complement counts.
    def distribution(document)
      super() do |cl, _ind|
        counts = @likelihood_numer[cl]
        denom = 1 + @likelihood_denom[cl]
        score = 0.0
        document.each { |term, freq| score += freq * Math.log((1 + (counts[term] || 0)) / denom) }
        -score
      end
    end
  end

  # Mixin implementing the weighted form of MultinomialBayes and
  # ComplementBayes. For performance reasons the normalized classifier weights
  # are cached; they are computed the first time a category is needed for
  # prediction. Training or untraining a document clears the cache.
  module WeightNormalized
    def initialize(categories, options = {})
      super
      @weighted_likelihood = {}
    end

    def train(document, category)
      super
      @weighted_likelihood.clear
    end

    def untrain(document, category)
      super
      @weighted_likelihood.clear
    end

    private

    # Returns (and caches) term => normalized weight for +category+: the
    # smoothed log-likelihood of each term divided by the sum of all of them.
    #
    # NOTE(review): the logs are <= 0, so dividing by their (negative) sum
    # yields non-negative weights, whereas the paper divides by the sum of
    # absolute values and keeps the sign. Kept as originally written - confirm
    # the intended ranking of terms before changing it.
    def weighted_likelihood(category)
      @weighted_likelihood[category] ||= begin
        denom = 1 + @likelihood_denom[category]
        weights = {}
        total = 0.0
        @likelihood_numer[category].each do |term, freq|
          weights[term] = Math.log((1 + freq) / denom)
          total += weights[term]
        end
        # A category with a single distinct term has log(1) == 0 as its only
        # weight; dividing by that zero sum would turn the weight into NaN.
        weights.keys.each { |term| weights[term] /= total } unless total.zero?
        weights
      end
    end
  end

  # Based on the description given in "Tackling the Poor Assumptions of Naive
  # Bayes Text Classifiers" by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan
  # and David R. Karger, ICML - 2003
  #
  # An improved complement Bayes; the authors claim that this algorithm
  # performs better than ComplementBayes. The weights are normalized before
  # they are used.
  class WeightNormalizedComplementBayes < ComplementBayes
    include WeightNormalized

    def distribution(document)
      # Call the base implementation directly (skipping ComplementBayes'
      # override). Naming Bayes explicitly keeps this working for subclasses.
      Bayes.instance_method(:distribution).bind(self).call do |cl, _ind|
        weights = weighted_likelihood(cl)
        score = 0.0
        document.each { |term, freq| score += freq * (weights[term] || 0) }
        -score
      end
    end
  end

  # Hopefully an improved MultinomialBayes, based on the same ideas as
  # WeightNormalizedComplementBayes but using MultinomialBayes as the base.
  # The weights are normalized before they are used.
  class WeightNormalizedMultinomialBayes < MultinomialBayes
    include WeightNormalized

    def distribution(document)
      # Call the base implementation directly (skipping MultinomialBayes'
      # override). Naming Bayes explicitly keeps this working for subclasses.
      Bayes.instance_method(:distribution).bind(self).call do |cl, _ind|
        weights = weighted_likelihood(cl)
        score = 0.0
        document.each { |term, freq| score += freq * (weights[term] || 0) }
        score
      end
    end
  end
end
|