clusterer 0.1.0 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -1,8 +1,15 @@
1
- A ruby library which implements clustering algorithms for text
2
- mining.
1
+ A ruby library which implements clustering and classification
2
+ algorithms for text mining.
3
3
 
4
- Currently implemented algorithms are K-Means, and Hierarchical
5
- clustering.
4
+ Clustering algorithms currently implemented are - K-Means, and
5
+ Hierarchical clustering, LSI. Many variations of these algorithms are
6
+ also available, where you can change the similarity matrix, use
7
+ a refined version, i.e., hierarchical/bisecting clustering followed by
8
+ Kmeans.
9
+
10
+ LSI can also be used for clustering. In this, first an SVD transformation
11
+ is done, and then the documents in the new space are clustered (any of
12
+ KMeans, hierarchical/bisecting clustering can be used).
6
13
 
7
14
  Hierarchical gives better results, but complexity roughly O(n*n)
8
15
 
@@ -13,9 +20,24 @@ ysearch-rb from
13
20
 
14
21
  http://developer.yahoo.com/download/download.html
15
22
 
23
+ The multinomial, complement and weightnormalized complement Bayes
24
+ algorithms are also implemented.
25
+
26
+ LSI can also be used for classification.
16
27
 
17
- Hybrid clustering algorithms + more similarity metrics + semi-supervised
18
- clustering... coming soon ... (or submit pacthes/show keen interest, if
19
- you want faster results)
28
+ Thanks a lot to the several people who researched and worked on these ideas.
20
29
 
21
30
  Happy hacking......
31
+
32
+
33
+ ToDo:
34
+ Add more documentation, and explain the API.
35
+ Add more examples.
36
+ Incorporate the C version of Gorrell and Simon Funk's GHA and SVD algorithm, also
37
+ write a Ruby version.
38
+ Incorporate my own C version of Kernel SVD, and GHA.
39
+ Explore ways to improve and make a better tokenizer, and introduce other NLP techniques.
40
+ Add more classification algos: Decision Trees, and various extensions of it.
41
+ Ruby and C version of SVM and NN.
42
+ Feature Selection.
43
+
@@ -2,6 +2,8 @@
2
2
 
3
3
  #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
4
 
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
+
5
7
  require 'soap/wsdlDriver'
6
8
  require 'clusterer'
7
9
 
@@ -9,7 +11,7 @@ require 'clusterer'
9
11
 
10
12
  WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
11
13
  driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
12
- query = 'kreeti'
14
+ query = 'kolkata'
13
15
  key = ""
14
16
 
15
17
  results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
@@ -17,28 +19,32 @@ count= results.resultElements.size
17
19
  max_count = results.estimatedTotalResultsCount.to_i
18
20
  results = results.resultElements
19
21
 
20
- while (count < 100 && count <= max_count)
22
+ while (count < 20 && count <= max_count)
21
23
  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
22
24
  results.concat(more_results.resultElements)
23
25
  count += more_results.resultElements.size
24
26
  end
25
27
 
26
- clusters = Clusterer::Clustering.kmeans_clustering(results.collect {|r| r.title.to_s.gsub(/<\/?[^>]*>/, "") +
27
- " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")})
28
+ clusters = Clusterer::Clustering.cluster(:kmeans, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer) {|r|
29
+ r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")}
28
30
 
29
31
  #writing the output
30
32
  File.open("temp.html","w") do |f|
31
33
  f.write("<ul>")
32
34
  clusters.each do |clus|
33
35
  f.write("<li>")
36
+ f.write("<h4>")
37
+ clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
38
+ f.write("</h4>")
34
39
  f.write("<ul>")
35
- clus.each do |d|
40
+ clus.documents.each do |doc|
41
+ result = doc.object
36
42
  f.write("<li>")
37
43
  f.write("<span class='title'>")
38
- f.write(results[d].title)
44
+ f.write(result.title)
39
45
  f.write("</span>")
40
46
  f.write("<span class='snippet'>")
41
- f.write(results[d].snippet)
47
+ f.write(result.snippet)
42
48
  f.write("</span>")
43
49
  f.write("</li>")
44
50
  end
@@ -2,53 +2,40 @@
2
2
 
3
3
  #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
4
 
5
- require 'clusterer'
6
- require 'ysearch-rb/lib/ysearch'
7
-
8
- ## try using HTML stripping to get better results
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
9
6
 
10
- # get the query parameter
11
- query = "kreeti"
12
-
13
- ##
14
- # create a web search object:
15
- # Arguments:
16
- # 1. App ID (You can get one at http://developer.yahoo.net)
17
- # 2. The query
18
- # 3. type can be one of: 'all', 'any' or 'phrase'
19
- # 4. The no. of results
20
- ##
21
- obj = WebSearch.new('YahooDemo', query, 'all', 100)
7
+ require 'clusterer'
8
+ require 'rubygems'
9
+ require 'yahoo/web_search'
22
10
 
23
- results = obj.parse_results
24
11
 
25
- # count= results.resultElements.size
26
- # max_count = results.estimatedTotalResultsCount.to_i
27
- # results = results.resultElements
12
+ ys = Yahoo::WebSearch.new "mUZGF4TV34F2H2aNPZat57sIgR7P2aKBwQm4aEq3TxOg1lYrlhRdMbbmdhmSwyYoRA4bOw--"
13
+ query = "kolkata"
14
+ results, = ys.search query, 10
28
15
 
29
- # while (count < 100 && count <= max_count)
30
- # more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
31
- # results.concat(more_results.resultElements)
32
- # count += more_results.resultElements.size
33
- # end
16
+ ## try using HTML stripping to get better results
34
17
 
35
- #kmeans_clustering
36
- clusters = Clusterer::Clustering.hierarchical_clustering(results.collect {|r| r['Title'].to_s.gsub(/<\/?[^>]*>/, "") +
37
- " " + r['Summary'].to_s.gsub(/<\/?[^>]*>/, "")})
18
+ #kmeans
19
+ clusters = Clusterer::Clustering.cluster(:hierarchical, results, :no_stem => true, :tokenizer => :simple_ngram_tokenizer){|r|
20
+ r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.summary.to_s.gsub(/<\/?[^>]*>/, "")}
38
21
 
39
22
  #writing the output
40
23
  File.open("temp.html","w") do |f|
41
24
  f.write("<ul>")
42
25
  clusters.each do |clus|
43
26
  f.write("<li>")
27
+ f.write("<h4>")
28
+ clus.centroid.to_a.sort{|a,b| b[1] <=> a[1]}.slice(0,5).each {|w| f.write("#{w[0]} - #{format '%.2f',w[1]}, ")}
29
+ f.write("</h4>")
44
30
  f.write("<ul>")
45
- clus.each do |d|
31
+ clus.documents.each do |doc|
32
+ result = doc.object
46
33
  f.write("<li>")
47
34
  f.write("<span class='title'>")
48
- f.write(results[d]['Title'])
35
+ f.write(result.title)
49
36
  f.write("</span>")
50
37
  f.write("<span class='snippet'>")
51
- f.write(results[d]['Summary'])
38
+ f.write(result.summary)
52
39
  f.write("</span>")
53
40
  f.write("</li>")
54
41
  end
@@ -1,100 +1,41 @@
1
- #The MIT License
2
-
1
+ #--
3
2
  ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
4
22
 
5
- require 'word_hash'
6
- require 'similarity'
7
23
 
8
24
  module Clusterer
9
- class Clustering
10
- class << self
11
- #returns clusters containing index of the elements in doc
12
- def kmeans_clustering (docs, k = nil, max_iter = 10, &similarity_function)
13
- similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
14
- k = Math.sqrt(docs.size) unless k
15
- docs_hash = Array.new(docs.size)
16
- clusters = Array.new(k)
17
- cluster_centers = Array.new(k)
18
- old_cluster_centers = Array.new(k)
19
- docs.each_with_index {|d,i| docs_hash[i] = d.clean_word_hash}
20
- 0.upto(k - 1) {|i| x = rand(docs.size); clusters[i], cluster_centers[i] = [x],docs_hash[x].clone }
21
- iter = 0
22
- while (!max_iter || iter < max_iter) && !convergence(cluster_centers,old_cluster_centers)
23
- puts "Iteration ....#{iter}...#{clusters.inspect}"
24
- 0.upto(k - 1) {|i| clusters[i] = []; old_cluster_centers[i] = cluster_centers[i]}
25
- docs_hash.each_with_index do |doc, i|
26
- max_value, max_index = 0, 0
27
- cluster_centers.each_with_index do |cen, j|
28
- sim = similarity_function.call(doc,cen)
29
- max_value, max_index = sim,j if sim >= max_value
30
- end
31
- clusters[max_index] << i
32
- end
33
- recalculate_centers(cluster_centers,clusters,docs_hash)
34
- iter += 1
35
- end
36
- clusters
37
- end
38
-
39
- def hierarchical_clustering (docs, k = nil, &similarity_function)
40
- similarity_function = Proc.new {|*args| Similarity.vector_similarity(*args)} unless similarity_function
41
- k = Math.sqrt(docs.size) unless k
42
- docs_hash = Array.new(docs.size)
43
- clusters = Array.new(docs.size)
44
- cluster_centers = Array.new(docs.size)
45
- docs.each_with_index do |d,i|
46
- cluster_centers[i] = d.clean_word_hash
47
- clusters[i] = [i]
48
- end
49
- iter = 0
50
- while (clusters.size > k)
51
- puts "Iteration ....#{iter}...#{clusters.inspect}"
52
- min_value, min_index = clusters.size[0], 0
53
- clusters.each_with_index {|a, i| (min_value, min_index = a.size, i) if a.size <= min_value}
54
- p = cluster_centers.delete_at(min_index)
55
- c = clusters.delete_at(min_index)
56
- max_value, max_index = 0, 0
57
- cluster_centers.each_with_index do |cen, j|
58
- sim = similarity_function.call(p,cen)
59
- max_value, max_index = sim,j if sim >= max_value
60
- end
61
- merge_clusters(clusters[max_index],cluster_centers[max_index],c,p)
62
- iter += 1
63
- end
64
- clusters
65
- end
66
-
67
- private
68
- #merge cluster 2 into cluster 1
69
- def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
70
- cluster_center1.each_key {|k| cluster_center1[k] *= cluster1.size}
71
- cluster_center2.each_key {|k| cluster_center2[k] *= cluster2.size}
72
- cluster_center1.merge!(cluster_center2) {|k,o,n| cluster_center1[k] = o + n}
73
- cluster1.concat(cluster2)
74
- cluster_center1.each_key {|k| cluster_center1[k] /= cluster1.size.to_f}
75
- cluster_center1.delete(:total)
76
- end
77
-
78
- def recalculate_centers(cluster_centers,clusters,docs_hash)
79
- clusters.each_with_index do |cluster,i|
80
- center = { }
81
- cluster.each {|d| docs_hash[d].each {|w,v| center[w] = (center[w] || 0) + v} }
82
- total = 0
83
- count = cluster.size
84
- count = 1 if count ==0
85
- center.each_key {|w| next unless w.class == String;
86
- center[w] /=count.to_f; total += center[w]**2}
87
- total = 1 if total == 0
88
- center[:total] = total
89
- cluster_centers[i] = center
90
- end
91
- end
92
-
93
-
94
- def convergence(new_centers,old_centers)
95
- new_centers.each_with_index {|c,i| return false unless c == old_centers[i]}
96
- return true
97
- end
98
- end
99
- end
25
+ VERSION = '0.2.0'
100
26
  end
27
+
28
+ require 'mathn'
29
+ require 'clusterer/stop_words'
30
+ require 'clusterer/similarity'
31
+ require 'clusterer/tokenizer'
32
+ require 'clusterer/document_base'
33
+ require 'clusterer/inverse_document_frequency'
34
+ require 'clusterer/document'
35
+ require 'clusterer/documents_centroid'
36
+ require 'clusterer/document_array'
37
+ require 'clusterer/cluster'
38
+ require 'clusterer/algorithms'
39
+ require 'clusterer/clustering'
40
+ require 'clusterer/lsi/lsi'
41
+ require 'clusterer/bayes'
@@ -0,0 +1,95 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ module Clusterer
24
+ class Algorithms
25
+ class << self
26
+
27
+ private
28
+ def random_cluster_seeds(documents,k)
29
+ temp = []
30
+ (1..k).collect do
31
+ t= nil
32
+ while(!t || temp.include?(t))
33
+ t= Cluster.new([documents[rand(documents.size)]]);
34
+ end
35
+ temp << t
36
+ t
37
+ end
38
+ end
39
+
40
+ public
41
+ def kmeans(documents, k, options = { })
42
+ old_clusters = Array.new(k)
43
+ max_iter = options[:maximum_iterations] || 10
44
+ clusters = options[:seeds] || random_cluster_seeds(documents, k)
45
+ sim_fun = options[:similarity_function] || :cosine_similarity
46
+
47
+ iter = 0
48
+ while (!max_iter || iter < max_iter) && clusters != old_clusters
49
+ puts "Iteration ....#{iter}"
50
+ k.times {|i| old_clusters[i] = clusters[i]; clusters[i] = []}
51
+
52
+ documents.each do |document|
53
+ max_index = (0..k-1).max do |i,j|
54
+ document.send(sim_fun, old_clusters[i].centroid) <=> document.send(sim_fun, old_clusters[j].centroid)
55
+ end
56
+ clusters[max_index] << document
57
+ end
58
+
59
+ k.times {|i| clusters[i] = Cluster.new(clusters[i])}
60
+ iter += 1
61
+ end
62
+ return clusters
63
+ end
64
+
65
+ def bisecting_kmeans(documents, k, options = { })
66
+ clusters = [Cluster.new(documents)]
67
+ while clusters.size < k
68
+ lg_clus = clusters.max {|a, b| a.documents.size <=> b.documents.size} #largest cluster
69
+ clusters.delete(lg_clus)
70
+ clusters.concat(kmeans(lg_clus.documents,2))
71
+ end
72
+ options[:refined] ? clusters = kmeans(documents, k, options.merge(:seeds => clusters)) : clusters
73
+ end
74
+
75
+ def hierarchical(documents, k, options = { })
76
+ clusters = documents.collect {|d| Cluster.new([d])}
77
+ iter = 0
78
+ sim_fun = options[:similarity_function] || :upgma
79
+ options[:similarity_function] = nil
80
+ while clusters.size > k
81
+ puts "Iteration ....#{iter}"
82
+
83
+ pairs = []
84
+ clusters.each_with_index {|c,i| pairs.concat(clusters.slice(i+1,clusters.size).collect{|f| [c,f] })}
85
+ pair = pairs.max {|a,b| a[0].send(sim_fun, a[1]) <=> b[0].send(sim_fun, b[1]) }
86
+ clusters.delete(pair[1])
87
+ pair[0].merge!(pair[1])
88
+
89
+ iter += 1
90
+ end
91
+ options[:refined] ? clusters = kmeans(documents, k, options.merge(:seeds => clusters)) : clusters
92
+ end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,255 @@
1
+ #--
2
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
+ module Clusterer
24
+ #The class Bayes is the base class for implementing different types of Naive
25
+ #Bayes classifier. The initialize method of this class is protected, so objects
26
+ #of this class cannot be instantiated.
27
+ #The Bayesian Formula is P(y|x) = P(x/y) * P(y) / P(x)
28
+ # posterior = likelihood * prior / evidence
29
+ #Given the evidence, we have to predict the posterior. The different Bayesian variants
30
+ #given below calculate likelihood using different methods.
31
+ #While calculating the posterior, since the evidence value is the same for all the categories, this
32
+ #value is not calculated. Also, the posterior distribution over all possible categories sums up to 1.
33
+ class Bayes
34
+ #an attribute for storing the different types of classes or categories
35
+ attr_accessor :categories
36
+
37
+ protected
38
+ #The first argument is an Array of categories. Currently no options are supported.
39
+ def initialize(categories, options = { })
40
+ @prior_count = Hash.new #stores the number of document of diffrent classes/categories.
41
+ @categories = categories.collect {|c| c.to_sym }
42
+ @likelihood_numer = Hash.new #hash of hash for storing the numerator in the likelihood value for each class
43
+ @likelihood_denom = Hash.new #hash of hash for storing the denominator in the likelihood value for each class
44
+ @documents_count = 0 #total number of documents
45
+ @categories.each {|cl| @likelihood_numer[cl] = Hash.new; @likelihood_denom[cl] = 0.0; @prior_count[cl] = 0}
46
+ end
47
+
48
+ #The first argument is the document, which will be used for training, and the
49
+ #second is the category.
50
+ def train(document, category)
51
+ check_class(category)
52
+ @prior_count[category] +=1
53
+ @documents_count += 1
54
+ end
55
+
56
+ #The first argument is the document, which should be removed, and the
57
+ #second is the category to which it belonged.
58
+ def untrain(document, category)
59
+ check_class(category)
60
+ raise StandardError, "There are no documents for this class.",caller if @prior_count[category] <= 0
61
+ @prior_count[category] -= 1
62
+ @documents_count -= 1
63
+ end
64
+
65
+ #For an input document returns the probability distribution of the different
66
+ #categories in the same order as the order in categories array.
67
+ def distribution
68
+ posterior = Array.new(@categories.size,0.0)
69
+ @categories.each_with_index do |cl,ind|
70
+ posterior[ind] = yield(cl,ind) + Math.log((@prior_count[cl] + 1)/(@documents_count + 1).to_f)
71
+ end
72
+ sum = 0
73
+ posterior.each_with_index {|v,i| posterior[i] = Math.exp(v); sum += posterior[i]}
74
+ posterior.each_with_index {|v,i| posterior[i] /= sum}
75
+ posterior
76
+ end
77
+
78
+ public
79
+ #For an input document returns the prediction in favor of class with the
80
+ #highest probability.
81
+ def classify(document, weight = nil)
82
+ posterior = distribution(document)
83
+ @categories[(0..(@categories.size - 1)).max {|i,j| posterior[i] <=> posterior[j]}]
84
+ end
85
+
86
+ #This method missing helps in having training and untraining method which have the
87
+ #category appended to their front. For example:
88
+ #
89
+ # train_good document
90
+ #
91
+ def method_missing (name, *args)
92
+ if name.to_s =~ /^(un)?train_/
93
+ category = name.to_s.gsub(/(un)?train_/, '').to_sym
94
+ send("#{$1}train",args[0],category)
95
+ else
96
+ super
97
+ end
98
+ end
99
+
100
+ private
101
+ def check_class(category)
102
+ raise ArgumentError,"Unknown class. It should be one of the following #{categories}.",caller unless categories.include?(category)
103
+ end
104
+ end
105
+
106
+ #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
107
+ #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
108
+ #
109
+ #The basic idea is that likelihood of a document for certain category is directly proportional to
110
+ #the number of other documents containing the same terms appearing while training for the
111
+ #particular class.
112
+ class MultinomialBayes < Bayes
113
+ def train(document, category)
114
+ category = category.to_sym
115
+ super
116
+ numer, sum = @likelihood_numer[category], 0.0
117
+ document.each do |term,freq|
118
+ numer[term] = (numer[term] || 0) + freq
119
+ sum += freq
120
+ end
121
+ @likelihood_denom[category] += sum
122
+ end
123
+
124
+ def untrain(document, category)
125
+ category = category.to_sym
126
+ super
127
+ numer, sum = @likelihood_numer[category], 0.0
128
+ document.each do |term,freq|
129
+ if numer[term]
130
+ numer[term] = [numer[term] - freq, 0].max
131
+ sum += freq
132
+ end
133
+ end
134
+ @likelihood_denom[category] = [@likelihood_denom[category] - sum, 0.0].max
135
+ end
136
+
137
+ def distribution(document)
138
+ super() do |cl,ind|
139
+ numer, denom, sum = @likelihood_numer[cl], (1 + @likelihood_denom[cl]), 0.0
140
+ document.each {|term,freq| sum += freq * Math.log((1 + (numer[term] || 0))/denom)}
141
+ sum
142
+ end
143
+ end
144
+ end
145
+
146
+ #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
147
+ #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
148
+ #
149
+ #The idea is that likelihood of a document for certain category is inversely proportional to
150
+ #the number of other documents containing the same terms appearing in other classes. Notice the
151
+ #difference with MultinomialBayes, and hence it is called complement.
152
+ #Though the authors claim that this performs better than MultinomialBayes, take the results
153
+ #with a pinch of salt; the performance of MultinomialBayes may be better with balanced datasets.
154
+ #If the dataset is skewed with the minority class being important, use ComplementBayes.
155
+ class ComplementBayes < Bayes
156
+ def train(document, category)
157
+ category = category.to_sym
158
+ super
159
+ (@categories - [category]).each_with_index do |cl,ind|
160
+ numer, sum = @likelihood_numer[cl], 0.0
161
+ document.each do |term,freq|
162
+ numer[term] = (numer[term] || 0) + freq
163
+ sum += freq
164
+ end
165
+ @likelihood_denom[cl] += sum
166
+ end
167
+ end
168
+
169
+ def untrain(document, category)
170
+ category = category.to_sym
171
+ super
172
+ (@categories - [category]).each_with_index do |cl,ind|
173
+ numer, sum = @likelihood_numer[category], 0.0
174
+ document.each do |term,freq|
175
+ if numer[term]
176
+ numer[term] = [numer[term] - freq, 0].max
177
+ sum += freq
178
+ end
179
+ end
180
+ @likelihood_denom[category] = [@likelihood_denom[category] - sum, 0.0].max
181
+ end
182
+ end
183
+
184
+ def distribution(document)
185
+ super() do |cl,ind|
186
+ numer, denom, sum = @likelihood_numer[cl], (1 + @likelihood_denom[cl]), 0.0
187
+ document.each {|term, freq| sum += freq * Math.log((1 + (numer[term] || 0))/denom)}
188
+ -sum
189
+ end
190
+ end
191
+ end
192
+
193
+ ##Module to help in implementing the weighted form of MultinomialBayes and ComplementBayes.
194
+ ##For performance reasons the normalized classifier weights are cached. These weights
195
+ #are calculated only when the classifier is first used for training or prediction.
196
+ #Training or Untraining an instance clears the cached normalized weights.
197
+ module WeightNormalized
198
+ def initialize(categories, options = { })
199
+ super
200
+ @weighted_likelihood = Hash.new
201
+ end
202
+
203
+ def train(document, category)
204
+ super
205
+ @weighted_likelihood.clear
206
+ end
207
+
208
+ def untrain(document, category)
209
+ super
210
+ @weighted_likelihood.clear
211
+ end
212
+
213
+ private
214
+ def weighted_likelihood(category)
215
+ @weighted_likelihood[category] ||= begin
216
+ sum, le, denom = 0.0, Hash.new, (1 + @likelihood_denom[category])
217
+ numer =
218
+ @likelihood_numer[category].each do |term,freq|
219
+ le[term] = Math.log((1 + freq)/denom)
220
+ sum += le[term]
221
+ end
222
+ le.each {|term, weight| le[term] = weight/sum }
223
+ end
224
+ end
225
+ end
226
+
227
+ #Based on the description given in "Tackling the Poor Assumptions of Naive Bayes Text Classifiers"
228
+ #by Jason D. M. Rennie, Lawrence Shih, Jaime Teevan and David R. Karger, ICML - 2003
229
+ #
230
+ #An improved complement bayes, the authors claim that this algorithm performs better, then the
231
+ #ComplementBayes. The weights are normalized, before using this algorithm.
232
+ class WeightNormalizedComplementBayes < ComplementBayes
233
+ include WeightNormalized
234
+ def distribution(document)
235
+ self.class.superclass.superclass.instance_method(:distribution).bind(self).call do |cl,ind|
236
+ we, sum = weighted_likelihood(cl), 0.0
237
+ document.each {|term,freq| sum += freq * (we[term] || 0)}
238
+ -sum
239
+ end
240
+ end
241
+ end
242
+
243
+ #Hopefully an improved MultinomialBayes, based on the same ideas as the WeightNormalizedComplementBayes
244
+ #only using MultinomialBayes as the base. The weights are normalized, before using this algorithm.
245
+ class WeightNormalizedMultinomialBayes < MultinomialBayes
246
+ include WeightNormalized
247
+ def distribution(document)
248
+ self.class.superclass.superclass.instance_method(:distribution).bind(self).call do |cl,ind|
249
+ we, sum = weighted_likelihood(cl), 0.0
250
+ document.each {|term,freq| sum += freq * (we[term] || 0)}
251
+ sum
252
+ end
253
+ end
254
+ end
255
+ end