clusterer 0.1.0 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +29 -7
- data/examples/google_search_cluster.rb +13 -7
- data/examples/yahoo_search_cluster.rb +18 -31
- data/lib/clusterer.rb +36 -95
- data/lib/clusterer/algorithms.rb +95 -0
- data/lib/clusterer/bayes.rb +255 -0
- data/lib/clusterer/cluster.rb +56 -0
- data/lib/clusterer/clustering.rb +35 -0
- data/lib/clusterer/document.rb +71 -0
- data/lib/clusterer/document_array.rb +79 -0
- data/lib/clusterer/document_base.rb +32 -0
- data/lib/clusterer/documents_centroid.rb +44 -0
- data/lib/clusterer/inverse_document_frequency.rb +83 -0
- data/lib/clusterer/lsi/dmatrix.rb +132 -0
- data/lib/clusterer/lsi/document_vector.rb +54 -0
- data/lib/clusterer/lsi/documents_centroid_vector.rb +51 -0
- data/lib/clusterer/lsi/lsi.rb +95 -0
- data/lib/clusterer/similarity.rb +34 -0
- data/lib/{word_hash.rb → clusterer/stop_words.rb} +21 -23
- data/lib/clusterer/tokenizer.rb +70 -0
- data/tests/algorithms_test.rb +48 -0
- data/tests/bayes_test.rb +68 -0
- data/tests/cluster_test.rb +54 -0
- data/tests/document_array_test.rb +64 -0
- data/tests/document_centroid_test.rb +64 -0
- data/tests/document_test.rb +71 -0
- data/tests/inverse_document_frequency_test.rb +76 -0
- data/tests/lsi_test.rb +77 -0
- data/tests/similarity_test.rb +62 -0
- data/tests/tokenizer_test.rb +72 -0
- metadata +35 -9
- data/lib/similarity.rb +0 -27
- data/tests/clusterer_test.rb +0 -20
data/README
CHANGED
@@ -1,8 +1,15 @@
-A ruby library which implements
-mining.
+A ruby library which implements clustering and classification
+algorithms for text mining.
 
-
-clustering.
+Clustering algorithms currently implemented are K-Means,
+Hierarchical clustering, and LSI. Many variations of these algorithms
+are also available: you can change the similarity measure, or use a
+refined version, i.e., hierarchical/bisecting clustering followed by
+K-Means.
+
+LSI can also be used for clustering. Here an SVD transformation is
+done first, and then the documents in the new space are clustered
+(any of K-Means, hierarchical/bisecting clustering can be used).
 
 Hierarchical gives better results, but complexity roughly O(n*n)
 
@@ -13,9 +20,24 @@ ysearch-rb from
 
 http://developer.yahoo.com/download/download.html
 
+The multinomial, complement, and weight-normalized complement Bayes
+algorithms are also implemented.
+
+LSI can also be used for classification.
 
-
-clustering... coming soon ... (or submit pacthes/show keen interest, if
-you want faster results)
+Thanks a lot to the several people who researched and worked on these ideas.
 
 Happy hacking......
+
+
+ToDo:
+Add more documentation, and explain the API.
+Add more examples.
+Incorporate the C version of Gorrell and Simon Funk's GHA and SVD
+algorithms, and also write a Ruby version.
+Incorporate my own C version of Kernel SVD, and GHA.
+Explore ways to improve the tokenizer, and introduce other NLP techniques.
+Add more classification algos: Decision Trees, and various extensions of it.
+Ruby and C versions of SVM and NN.
+Feature Selection.
+
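For orientation (the README itself does not spell out the API): the bundled example scripts below drive everything through Clusterer::Clustering.cluster, passing an algorithm symbol, the raw items, an options hash, and a block that extracts the text of each item. A minimal sketch along those lines; the :kmeans symbol, the :no_stem and :tokenizer options, and the centroid/documents accessors are taken verbatim from the example diffs below, while the plain-string corpus and the implicit default number of clusters are assumptions:

  require 'clusterer'

  docs = ["kmeans and hierarchical clustering",
          "bisecting kmeans clustering of text",
          "naive bayes text classification",
          "complement bayes spam filtering"]

  # :kmeans or :hierarchical, as in the example scripts; the block maps
  # each item to the text that should be tokenized.
  clusters = Clusterer::Clustering.cluster(:kmeans, docs,
               :no_stem => true, :tokenizer => :simple_ngram_tokenizer) {|d| d}

  clusters.each do |clus|
    # print the top centroid terms sorted by weight, as the examples do
    clus.centroid.to_a.sort {|a,b| b[1] <=> a[1]}.slice(0,2).each {|w| print "#{w[0]} "}
    puts
    clus.documents.each {|doc| puts "  #{doc.object}"}
  end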
data/examples/google_search_cluster.rb
CHANGED
@@ -2,6 +2,8 @@
 
 #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
 
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
 require 'soap/wsdlDriver'
 require 'clusterer'
 
@@ -9,7 +11,7 @@ require 'clusterer'
 
 WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
 driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
-query = '
+query = 'kolkata'
 key = ""
 
 results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
@@ -17,28 +19,32 @@ count= results.resultElements.size
 max_count = results.estimatedTotalResultsCount.to_i
 results = results.resultElements
 
-while (count <
+while (count < 20 && count <= max_count)
   more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
   results.concat(more_results.resultElements)
   count += more_results.resultElements.size
 end
 
-clusters = Clusterer::Clustering.
-
+clusters = Clusterer::Clustering.cluster(:kmeans, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer) {|r|
+  r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")}
 
 #writing the output
 File.open("temp.html","w") do |f|
   f.write("<ul>")
   clusters.each do |clus|
     f.write("<li>")
+    f.write("<h4>")
+    clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
+    f.write("</h4>")
     f.write("<ul>")
-    clus.each do |
+    clus.documents.each do |doc|
+      result = doc.object
       f.write("<li>")
      f.write("<span class='title'>")
-      f.write(
+      f.write(result.title)
      f.write("</span>")
      f.write("<span class='snippet'>")
-      f.write(
+      f.write(result.snippet)
      f.write("</span>")
      f.write("</li>")
    end
data/examples/yahoo_search_cluster.rb
CHANGED
@@ -2,53 +2,40 @@
 
 #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
 
-
-require 'ysearch-rb/lib/ysearch'
-
-## try using HTML stripping to get better results
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
 
-
-
-
-##
-# create a web search object:
-# Arguments:
-# 1. App ID (You can get one at http://developer.yahoo.net)
-# 2. The query
-# 3. type can be one of: 'all', 'any' or 'phrase'
-# 4. The no. of results
-##
-obj = WebSearch.new('YahooDemo', query, 'all', 100)
+require 'clusterer'
+require 'rubygems'
+require 'yahoo/web_search'
 
-results = obj.parse_results
 
-
-
-
+ys = Yahoo::WebSearch.new "mUZGF4TV34F2H2aNPZat57sIgR7P2aKBwQm4aEq3TxOg1lYrlhRdMbbmdhmSwyYoRA4bOw--"
+query = "kolkata"
+results, = ys.search query, 10
 
-
-# more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
-# results.concat(more_results.resultElements)
-# count += more_results.resultElements.size
-# end
+## try using HTML stripping to get better results
 
-#
-clusters = Clusterer::Clustering.
-
+#kmeans
+clusters = Clusterer::Clustering.cluster(:hierarchical, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer){|r|
+  r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.summary.to_s.gsub(/<\/?[^>]*>/, "")}
 
 #writing the output
 File.open("temp.html","w") do |f|
   f.write("<ul>")
   clusters.each do |clus|
     f.write("<li>")
+    f.write("<h4>")
+    clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
+    f.write("</h4>")
     f.write("<ul>")
-    clus.each do |
+    clus.documents.each do |doc|
+      result = doc.object
       f.write("<li>")
      f.write("<span class='title'>")
-      f.write(
+      f.write(result.title)
      f.write("</span>")
      f.write("<span class='snippet'>")
-      f.write(
+      f.write(result.summary)
      f.write("</span>")
      f.write("</li>")
    end
data/lib/clusterer.rb
CHANGED
@@ -1,100 +1,41 @@
-
-
+#--
 ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-require 'word_hash'
-require 'similarity'
 
 module Clusterer
-
-  class << self
-    #returns clusters containing index of the elements in doc
-    def kmeans_clustering (docs, k = nil, max_iter = 10, &similarity_function)
-      similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
-      k = Math.sqrt(docs.size) unless k
-      docs_hash = Array.new(docs.size)
-      clusters = Array.new(k)
-      cluster_centers = Array.new(k)
-      old_cluster_centers = Array.new(k)
-      docs.each_with_index {|d,i| docs_hash[i] = d.clean_word_hash}
-      0.upto(k - 1) {|i| x = rand(docs.size); clusters[i], cluster_centers[i] = [x],docs_hash[x].clone }
-      iter = 0
-      while (!max_iter || iter < max_iter) && !convergence(cluster_centers,old_cluster_centers)
-        puts "Iteration ....#{iter}...#{clusters.inspect}"
-        0.upto(k - 1) {|i| clusters[i] = []; old_cluster_centers[i] = cluster_centers[i]}
-        docs_hash.each_with_index do |doc, i|
-          max_value, max_index = 0, 0
-          cluster_centers.each_with_index do |cen, j|
-            sim = similarity_function.call(doc,cen)
-            max_value, max_index = sim,j if sim >= max_value
-          end
-          clusters[max_index] << i
-        end
-        recalculate_centers(cluster_centers,clusters,docs_hash)
-        iter += 1
-      end
-      clusters
-    end
-
-    def hierarchical_clustering (docs, k = nil, &similarity_function)
-      similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
-      k = Math.sqrt(docs.size) unless k
-      docs_hash = Array.new(docs.size)
-      clusters = Array.new(docs.size)
-      cluster_centers = Array.new(docs.size)
-      docs.each_with_index do |d,i|
-        cluster_centers[i] = d.clean_word_hash
-        clusters[i] = [i]
-      end
-      iter = 0
-      while (clusters.size > k)
-        puts "Iteration ....#{iter}...#{clusters.inspect}"
-        min_value, min_index = clusters.size[0], 0
-        clusters.each_with_index {|a, i| (min_value, min_index = a.size, i) if a.size <= min_value}
-        p = cluster_centers.delete_at(min_index)
-        c = clusters.delete_at(min_index)
-        max_value, max_index = 0, 0
-        cluster_centers.each_with_index do |cen, j|
-          sim = similarity_function.call(p,cen)
-          max_value, max_index = sim,j if sim >= max_value
-        end
-        merge_clusters(clusters[max_index],cluster_centers[max_index],c,p)
-        iter += 1
-      end
-      clusters
-    end
-
-    private
-    #merge cluster 2 into cluster 1
-    def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
-      cluster_center1.each_key {|k| cluster_center1[k] *= cluster1.size}
-      cluster_center2.each_key {|k| cluster_center2[k] *= cluster2.size}
-      cluster_center1.merge!(cluster_center2) {|k,o,n| cluster_center1[k] = o + n}
-      cluster1.concat(cluster2)
-      cluster_center1.each_key {|k| cluster_center1[k] /= cluster1.size.to_f}
-      cluster_center1.delete(:total)
-    end
-
-    def recalculate_centers(cluster_centers,clusters,docs_hash)
-      clusters.each_with_index do |cluster,i|
-        center = { }
-        cluster.each {|d| docs_hash[d].each {|w,v| center[w] = (center[w] || 0) + v} }
-        total = 0
-        count = cluster.size
-        count = 1 if count ==0
-        center.each_key {|w| next unless w.class == String;
-          center[w] /=count.to_f; total += center[w]**2}
-        total = 1 if total == 0
-        center[:total] = total
-        cluster_centers[i] = center
-      end
-    end
-
-
-    def convergence(new_centers,old_centers)
-      new_centers.each_with_index {|c,i| return false unless c == old_centers[i]}
-      return true
-    end
-  end
-end
+  VERSION = '0.2.0'
 end
+
+require 'mathn'
+require 'clusterer/stop_words'
+require 'clusterer/similarity'
+require 'clusterer/tokenizer'
+require 'clusterer/document_base'
+require 'clusterer/inverse_document_frequency'
+require 'clusterer/document'
+require 'clusterer/documents_centroid'
+require 'clusterer/document_array'
+require 'clusterer/cluster'
+require 'clusterer/algorithms'
+require 'clusterer/clustering'
+require 'clusterer/lsi/lsi'
+require 'clusterer/bayes'
data/lib/clusterer/algorithms.rb
ADDED
@@ -0,0 +1,95 @@
+#--
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+module Clusterer
+  class Algorithms
+    class << self
+
+      private
+      def random_cluster_seeds(documents,k)
+        temp = []
+        (1..k).collect do
+          t = nil
+          while(!t || temp.include?(t))
+            t = Cluster.new([documents[rand(documents.size)]]);
+          end
+          temp << t
+          t
+        end
+      end
+
+      public
+      def kmeans(documents, k, options = { })
+        old_clusters = Array.new(k)
+        max_iter = options[:maximum_iterations] || 10
+        clusters = options[:seeds] || random_cluster_seeds(documents, k)
+        sim_fun = options[:similarity_function] || :cosine_similarity
+
+        iter = 0
+        while (!max_iter || iter < max_iter) && clusters != old_clusters
+          puts "Iteration ....#{iter}"
+          k.times {|i| old_clusters[i] = clusters[i]; clusters[i] = []}
+
+          documents.each do |document|
+            max_index = (0..k-1).max do |i,j|
+              document.send(sim_fun, old_clusters[i].centroid) <=> document.send(sim_fun, old_clusters[j].centroid)
+            end
+            clusters[max_index] << document
+          end
+
+          k.times {|i| clusters[i] = Cluster.new(clusters[i])}
+          iter += 1
+        end
+        return clusters
+      end
+
+      def bisecting_kmeans(documents, k, options = { })
+        clusters = [Cluster.new(documents)]
+        while clusters.size < k
+          lg_clus = clusters.max {|a, b| a.documents.size <=> b.documents.size} #largest cluster
+          clusters.delete(lg_clus)
+          clusters.concat(kmeans(lg_clus.documents,2))
+        end
+        options[:refined] ? clusters = kmeans(documents, k, options.merge(:seeds => clusters)) : clusters
+      end
+
+      def hierarchical(documents, k, options = { })
+        clusters = documents.collect {|d| Cluster.new([d])}
+        iter = 0
+        sim_fun = options[:similarity_function] || :upgma
+        options[:similarity_function] = nil
+        while clusters.size > k
+          puts "Iteration ....#{iter}"
+
+          pairs = []
+          clusters.each_with_index {|c,i| pairs.concat(clusters.slice(i+1,clusters.size).collect{|f| [c,f] })}
+          pair = pairs.max {|a,b| a[0].send(sim_fun, a[1]) <=> b[0].send(sim_fun, b[1]) }
+          clusters.delete(pair[1])
+          pair[0].merge!(pair[1])
+
+          iter += 1
+        end
+        options[:refined] ? clusters = kmeans(documents, k, options.merge(:seeds => clusters)) : clusters
+      end
+    end
+  end
+end
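The Algorithms class methods above can also be called directly, bypassing the Clustering facade used by the examples. A hedged sketch: the options (:maximum_iterations, :seeds, :similarity_function, :refined) and the Cluster/centroid interplay come straight from the code above, but document.rb is not part of this diff, so the Clusterer::Document.new call below is an assumed constructor.

  require 'clusterer'

  # Assumption: Document.new wraps a piece of text (document.rb is not shown here).
  documents = ["kmeans clustering", "bisecting kmeans", "bayes classifier",
               "naive bayes"].collect {|text| Clusterer::Document.new(text)}

  # kmeans requires documents to respond to the similarity function
  # (:cosine_similarity by default) against a cluster centroid.
  clusters = Clusterer::Algorithms.kmeans(documents, 2, :maximum_iterations => 5)

  # :refined => true reruns kmeans seeded with the clusters found so far,
  # for both bisecting_kmeans and hierarchical.
  refined = Clusterer::Algorithms.bisecting_kmeans(documents, 2, :refined => true)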
data/lib/clusterer/bayes.rb
ADDED
@@ -0,0 +1,255 @@
+#--
+###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+module Clusterer
+  #The class Bayes is the base class for implementing different types of Naive
+  #Bayes classifier. The initialize method of this class is protected, so objects
+  #of this class cannot be instantiated.
+  #The Bayesian formula is P(y|x) = P(x|y) * P(y) / P(x)
+  # posterior = likelihood * prior / evidence
+  #Given the evidence, we have to predict the posterior. The different Bayesian variants
+  #given below calculate the likelihood using different methods.
+  #While calculating the posterior, since the evidence value is the same for all the
+  #categories, it is not calculated. Also, the posterior distribution over all possible categories sums to 1.
+  class Bayes
+    #an attribute for storing the different types of classes or categories
+    attr_accessor :categories
+
+    protected
+    #The first argument is an Array of categories. Currently no options are supported.
+    def initialize(categories, options = { })
+      @prior_count = Hash.new #stores the number of documents of different classes/categories
+      @categories = categories.collect {|c| c.to_sym }
+      @likelihood_numer = Hash.new #hash of hashes storing the numerator of the likelihood value for each class
+      @likelihood_denom = Hash.new #hash storing the denominator of the likelihood value for each class
+      @documents_count = 0 #total number of documents
+      @categories.each {|cl| @likelihood_numer[cl] = Hash.new; @likelihood_denom[cl] = 0.0; @prior_count[cl] = 0}
+    end
+
+    #The first argument is the document, which will be used for training, and the
+    #second is the category.
+    def train(document, category)
+      check_class(category)
+      @prior_count[category] += 1
+      @documents_count += 1
+    end
+
+    #The first argument is the document, which should be removed, and the
+    #second is the category to which it belonged.
+    def untrain(document, category)
+      check_class(category)
+      raise StandardError, "There are no documents for this class.", caller if @prior_count[category] <= 0
+      @prior_count[category] -= 1
+      @documents_count -= 1
+    end
+
+    #For an input document returns the probability distribution of the different
+    #categories in the same order as the order in the categories array.
+    def distribution
+      posterior = Array.new(@categories.size, 0.0)
+      @categories.each_with_index do |cl,ind|
+        posterior[ind] = yield(cl,ind) + Math.log((@prior_count[cl] + 1)/(@documents_count + 1).to_f)
+      end
+      sum = 0
+      posterior.each_with_index {|v,i| posterior[i] = Math.exp(v); sum += posterior[i]}
+      posterior.each_with_index {|v,i| posterior[i] /= sum}
+      posterior
+    end
+
+    public
+    #For an input document returns the prediction in favor of the class with the
+    #highest probability.
+    def classify(document, weight = nil)
+      posterior = distribution(document)
+      @categories[(0..(@categories.size - 1)).max {|i,j| posterior[i] <=> posterior[j]}]
+    end
+
+    #This method_missing provides train and untrain methods which have the
+    #category appended to their names. For example:
+    #
+    #  train_good document
+    #
+    def method_missing (name, *args)
+      if name.to_s =~ /^(un)?train_/
+        category = name.to_s.gsub(/(un)?train_/, '').to_sym
+        send("#{$1}train",args[0],category)
+      else
+        super
+      end
+    end
+
+    private
+    def check_class(category)
+      raise ArgumentError,"Unknown class. It should be one of the following #{categories}.",caller unless categories.include?(category)
+    end
+  end
+
+  #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
+  #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
+  #
+  #The basic idea is that the likelihood of a document for a certain category is directly proportional to
+  #the number of other documents containing the same terms seen while training for that
+  #particular class.
+  class MultinomialBayes < Bayes
+    def train(document, category)
+      category = category.to_sym
+      super
+      numer, sum = @likelihood_numer[category], 0.0
+      document.each do |term,freq|
+        numer[term] = (numer[term] || 0) + freq
+        sum += freq
+      end
+      @likelihood_denom[category] += sum
+    end
+
+    def untrain(document, category)
+      category = category.to_sym
+      super
+      numer, sum = @likelihood_numer[category], 0.0
+      document.each do |term,freq|
+        if numer[term]
+          numer[term] = [numer[term] - freq, 0].max
+          sum += freq
+        end
+      end
+      @likelihood_denom[category] = [@likelihood_denom[category] - sum, 0.0].max
+    end
+
+    def distribution(document)
+      super() do |cl,ind|
+        numer, denom, sum = @likelihood_numer[cl], (1 + @likelihood_denom[cl]), 0.0
+        document.each {|term,freq| sum += freq * Math.log((1 + (numer[term] || 0))/denom)}
+        sum
+      end
+    end
+  end
+
+  #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
+  #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
+  #
+  #The idea is that the likelihood of a document for a certain category is inversely proportional to
+  #the number of other documents containing the same terms appearing in other classes. Note the
+  #difference from MultinomialBayes; hence it is called complement.
+  #The authors claim that this performs better than MultinomialBayes, but take the results
+  #with a pinch of salt; the performance of MultinomialBayes may be better with balanced datasets.
+  #If the dataset is skewed with the minority class being important, use ComplementBayes.
+  class ComplementBayes < Bayes
+    def train(document, category)
+      category = category.to_sym
+      super
+      (@categories - [category]).each_with_index do |cl,ind|
+        numer, sum = @likelihood_numer[cl], 0.0
+        document.each do |term,freq|
+          numer[term] = (numer[term] || 0) + freq
+          sum += freq
+        end
+        @likelihood_denom[cl] += sum
+      end
+    end
+
+    def untrain(document, category)
+      category = category.to_sym
+      super
+      (@categories - [category]).each_with_index do |cl,ind|
+        numer, sum = @likelihood_numer[category], 0.0
+        document.each do |term,freq|
+          if numer[term]
+            numer[term] = [numer[term] - freq, 0].max
+            sum += freq
+          end
+        end
+        @likelihood_denom[category] = [@likelihood_denom[category] - sum, 0.0].max
+      end
+    end
+
+    def distribution(document)
+      super() do |cl,ind|
+        numer, denom, sum = @likelihood_numer[cl], (1 + @likelihood_denom[cl]), 0.0
+        document.each {|term, freq| sum += freq * Math.log((1 + (numer[term] || 0))/denom)}
+        -sum
+      end
+    end
+  end
+
+  ##Module to help in implementing the weighted forms of MultinomialBayes and ComplementBayes.
+  ##For performance reasons the normalized classifier weights are cached. These weights
+  #are calculated only when the classifier is first used for training or prediction.
+  #Training or untraining an instance clears the cached normalized weights.
+  module WeightNormalized
+    def initialize(categories, options = { })
+      super
+      @weighted_likelihood = Hash.new
+    end
+
+    def train(document, category)
+      super
+      @weighted_likelihood.clear
+    end
+
+    def untrain(document, category)
+      super
+      @weighted_likelihood.clear
+    end
+
+    private
+    def weighted_likelihood(category)
+      @weighted_likelihood[category] ||= begin
+        sum, le, denom = 0.0, Hash.new, (1 + @likelihood_denom[category])
+        numer =
+          @likelihood_numer[category].each do |term,freq|
+            le[term] = Math.log((1 + freq)/denom)
+            sum += le[term]
+          end
+        le.each {|term, weight| le[term] = weight/sum }
+      end
+    end
+  end
+
+  #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
+  #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
+  #
+  #An improved complement Bayes; the authors claim that this algorithm performs better than
+  #ComplementBayes. The weights are normalized before using this algorithm.
+  class WeightNormalizedComplementBayes < ComplementBayes
+    include WeightNormalized
+    def distribution(document)
+      self.class.superclass.superclass.instance_method(:distribution).bind(self).call do |cl,ind|
+        we, sum = weighted_likelihood(cl), 0.0
+        document.each {|term,freq| sum += freq * (we[term] || 0)}
+        -sum
+      end
+    end
+  end
+
+  #Hopefully an improved MultinomialBayes, based on the same ideas as WeightNormalizedComplementBayes
+  #but using MultinomialBayes as the base. The weights are normalized before using this algorithm.
+  class WeightNormalizedMultinomialBayes < MultinomialBayes
+    include WeightNormalized
+    def distribution(document)
+      self.class.superclass.superclass.instance_method(:distribution).bind(self).call do |cl,ind|
+        we, sum = weighted_likelihood(cl), 0.0
+        document.each {|term,freq| sum += freq * (we[term] || 0)}
+        sum
+      end
+    end
+  end
+end
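A usage sketch for the classifiers above: train and distribution only ever iterate the document as [term, frequency] pairs, so a plain Hash stands in here for a tokenized Document, and the category symbols are arbitrary. The train_<category> shorthand goes through method_missing as documented above.

  require 'clusterer'

  classifier = Clusterer::WeightNormalizedComplementBayes.new([:spam, :ham])

  classifier.train({"viagra" => 3, "offer" => 2}, :spam)
  classifier.train_ham({"meeting" => 1, "agenda" => 2}) # method_missing shorthand

  # distribution returns probabilities in the order of the categories array,
  # normalized to sum to 1; classify picks the most probable category.
  p classifier.distribution({"offer" => 2, "agenda" => 1})
  p classifier.classify({"offer" => 2, "agenda" => 1})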