sclust 1.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,294 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'log4r'
27
+ require 'sclust/util/word'
28
+
29
+ module SClust
30
+ module KMean
31
class CosineDistance

  # Compute the cosine distance between two sparse term vectors
  # (anything indexable whose missing keys read as 0), defined as
  # 1 - (a . b) / (|a| * |b|).
  #
  # Returns nil when the vectors share no terms (the distance would be
  # exactly 1, i.e. no similarity detected) or when either vector has
  # zero magnitude. BUG FIX: a zero-magnitude vector previously produced
  # 0.0/0.0 = NaN, which then poisoned the nearest-cluster comparisons.
  def self.distance(a,b)

    dot   = 0.0 # a . b
    mag_a = 0.0 # |a|^2
    mag_b = 0.0 # |b|^2

    # Iterate the union of both key sets; absent entries contribute 0
    # via the vectors' default value.
    a.merge(b).keys.each do |i|
      dot   += a[i]*b[i]
      mag_a += a[i]*a[i]
      mag_b += b[i]*b[i]
    end

    # Guard against division by zero for empty/zero vectors.
    return nil if mag_a == 0.0 || mag_b == 0.0

    v = 1 - ( dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b)) )

    # Return nil if the documents have no terms in common.
    (v==1)? nil : v
  end
end
52
+
53
class ClusterPoint

  attr_reader :values, :cluster, :source_object
  attr_writer :cluster, :source_object

  # Initialize the ClusterPoint with a SparseVector or SparseLabeledVector.
  # source_object is an optional handle back to whatever produced this
  # point (e.g. a document), stored untouched.
  def initialize(sparse_vector, source_object = nil)
    @values = sparse_vector
    @cluster = nil
    @source_object = source_object
  end

  # Cosine distance to another ClusterPoint (nil means no similarity).
  def distance(clusterPoint)
    CosineDistance.distance(@values, clusterPoint.values)
  end

  # Add each item in the cluster point to this cluster point adjusting the
  # values per the given weight. Weight is a value from 0.0 - 1.0,
  # inclusive. A value of 1 means that clusterPoint is 100% assigned to
  # this cluster point while a weight value of 0 will have no effect.
  def add(clusterPoint, weight)
    @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] * (1-weight) ) + (clusterPoint.values[i] * weight)}
  end

  # Inverse of #add: remove the weighted contribution of clusterPoint.
  def sub(clusterPoint, weight)
    @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / ( 1 - weight ) }
  end

  # Return the top n terms as SClust::Util::Word objects, highest weight
  # first. Returns ALL terms, sorted, when n is 0.
  # BUG FIX: n == 0 previously returned an empty list, contradicting the
  # documented contract (the intended clamp existed only as a comment).
  def get_max_terms(n=3)

    values_to_terms = {}

    # Bucket terms by weight so ties stay together.
    @values.each do |t, v|
      values_to_terms[v] ||= []
      values_to_terms[v] << SClust::Util::Word.new(t, v, {:stemmed_word => t})
    end

    sorted_values = values_to_terms.keys.sort { |x,y| y <=> x }

    # Honor the documented n == 0 behavior and clamp oversized requests.
    n = @values.length if ( n > @values.length || n == 0 )

    result = []

    catch(:haveEnough) do
      sorted_values.each do |value|
        result += values_to_terms[value]
        throw :haveEnough if result.length >= n
      end
    end

    # Trim our results to exactly the requested size (ties can overshoot).
    result[0...n]
  end

  # Weight of a single term in this point's vector.
  def get_term_value(term)
    @values[term]
  end

end
120
+
121
# A cluster of points: a running center plus a member count.
class Cluster

  attr_reader :center, :size

  # Seed the cluster with a copy of centerPoint as its center.
  def initialize(centerPoint)
    @fixed  = false
    @center = centerPoint.clone
    @size   = 1
  end

  # Claim the point for this cluster and fold it into the center,
  # weighted 1/size so every member contributes equally.
  def +(point)
    point.cluster = self
    @size += 1
    @center.add(point, 1.0 / @size)
  end

  # Release the point and back its weighted contribution out of the center.
  def -(point)
    point.cluster = nil
    @center.sub(point, 1.0 / @size)
    @size -= 1
  end

  # Delegate: the top n terms of the cluster center.
  def get_max_terms(n=3)
    @center.get_max_terms(n)
  end

end
152
+
153
class Clusterer

  attr_reader :clusters, :points, :cluster_count, :iterations, :logger
  attr_writer :clusters, :points, :cluster_count, :iterations, :logger

  # Create a clusterer over an optional initial list of ClusterPoints.
  def initialize(points=[])
    @iterations = 3
    @cluster_count = 0
    @points = points
    @clusters = []
    @logger = Log4r::Logger.new('Clusterer')
    @logger.add('default')
  end

  # Drop all existing clusters and recreate them using the given method.
  # If `process` is an Integer, that many clusters are created and the
  # centers are randomly chosen from the points in @points.
  # If it is the string "crp", the Chinese Restaurant Process is used:
  # document i seeds a new cluster with probability 1/i, so cluster count
  # grows roughly logarithmically with the number of documents.
  def topics=(process)

    @clusters = []

    if ( process.is_a?(Integer))
      @logger.info("Building cluster of constant cluster count #{process}.")
      @cluster_count = process
      @cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }

    elsif(process.is_a?(String))
      if ( process == "crp" )

        @logger.info("Building clusters using CRP.")

        # BUG FIX: reset the count once, before the loop. It was
        # previously zeroed on every iteration, so the reported count
        # was always 0 or 1 no matter how many clusters formed.
        @cluster_count = 0

        1.upto(@points.length) do |i|
          if ( rand(i) == 0 )
            @clusters << Cluster.new(@points[i-1])
            @cluster_count += 1
          end
        end

        @logger.info("Built #{@cluster_count} clusters.")
      end
    end
  end

  # Append a point to the working set. It is not assigned to any
  # cluster until #cluster / #assign_all_points runs.
  def +(point)
    @points << point
  end

  # Yield each cluster in turn.
  def each_cluster(&c)
    @clusters.each { |cluster| yield cluster }
  end

  # One k-mean pass: move every point to its nearest cluster center,
  # updating centers as points change hands.
  def assign_all_points

    @points.each do |pt|

      # Randomize the first selection to ensure that in the case where there are
      # many centers that are close, each has a (statistically) equal chance of
      # getting the document, thus moving the center, changing the center,
      # and perhaps matching other documents better because of more terms.
      min_cluster = @clusters[rand(@clusters.length)]
      min_dst = min_cluster.center.distance(pt)

      @clusters.each do |cluster|

        tmp_distance = cluster.center.distance(pt)

        # A nil distance means "no similarity detected" — skip it.
        if tmp_distance.nil?
          next

        elsif min_dst.nil?
          min_dst = tmp_distance
          min_cluster = cluster

        elsif tmp_distance < min_dst
          min_cluster = cluster
          min_dst = tmp_distance

        end
      end

      # Move the point only if its best cluster changed; re-adding a
      # point to its own cluster would skew the center.
      if pt.cluster
        unless pt.cluster.equal? min_cluster
          pt.cluster - pt
          min_cluster + pt
        end
      else
        min_cluster + pt
      end
    end
  end

  # Run the clustering: seed clusters if none exist, then iterate.
  def cluster

    # If we are not initialized, initialize the clusters via CRP.
    # BUG FIX: this previously called build_empty_clusters('crp'), a
    # method that does not exist in this class (it survives only as a
    # commented-out line in initialize); topics= implements that behavior.
    self.topics = 'crp' unless @clusters && @clusters.size > 0

    iterations.times do |i|
      @logger.info("Starting iteration #{i+1} of #{iterations}.")
      assign_all_points
    end
  end

  # Top n terms per cluster: returns a list of lists of Words.
  def get_max_terms(n=3)
    r = []

    each_cluster do |cluster|
      r << cluster.get_max_terms(n)
    end

    r
  end

  # If you edit the document collection behind the scenes in an LDA
  # clusterer, you need to run this to avoid terms with 0 showing up.
  # K-Mean keeps so little document-related state that this is a no-op
  # kept only for API compatibility with the LDA implementation.
  def rebuild_document_collection()
  end

end
293
+ end
294
+ end
@@ -0,0 +1,83 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'sclust/kmean/cluster'
26
+ require 'sclust/util/sparse_vector'
27
+
28
+ module SClust
29
+
30
+ module KMean
31
+
32
# A document clusterer that overrides the << operator
# to allow for adding Document objects directly.
class DocumentClusterer < Clusterer

  attr_reader :document_collection

  def initialize()
    @document_collection = SClust::Util::DocumentCollection.new()
    super()
  end

  # Accept either a ready-made Document, or anything else coercible to
  # a string, which is wrapped in a fresh Document.
  def <<(d)
    if ( d.is_a?(SClust::Util::Document) )
      @document_collection << d
    else
      @document_collection << SClust::Util::Document.new(d.to_s)
    end
  end

  # This must be run to convert the document collection into
  # the points the clusterer operates on.
  def initialize_points()

    self.points = @document_collection.doclist.map do |doc|

      # Build one big sparse term vector for this document.
      doc_terms = SClust::Util::SparseVector.new(0)

      doc.terms.each_key do |term|
        # NOTE(review): this combines tf and idf by subtraction; classic
        # tf-idf multiplies them — confirm the subtraction is intentional.
        doc_terms[term] = doc.tf(term) - @document_collection.idf(term)
      end

      # ClusterPoint carries the source document along with its vector.
      ClusterPoint.new(doc_terms, doc)
    end

  end

  # Seed topics, materializing points from the documents first if needed.
  def topics=(n)
    initialize_points unless ( self.points && self.points.size > 0 )
    super(n)
  end

end
82
+ end
83
+ end
@@ -0,0 +1,243 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+ require 'rubygems'
25
+ require 'sclust/util/word'
26
+ require 'log4r'
27
+
28
+ module SClust
29
+ module LDA
30
+
31
# Mutable per-topic state for LDA: word tallies, a running word total,
# and tallies of the documents contributing those words.
class Topic

  attr_accessor :words, :wordcount, :docs

  # Start empty: no words seen, no documents attached.
  def initialize()
    @words     = {}  # term => occurrence count within this topic
    @wordcount = 0   # running sum of all counts in @words
    @docs      = {}  # document => number of its words in this topic
  end
end
42
+
43
class LDA

  attr_reader :logger, :iterations, :doclist, :topics
  attr_writer :logger, :iterations, :doclist

  # Documents may be added after LDA is created, unlike k-mean clustering.
  def initialize(docCol=nil)
    @iterations = 3
    @wordlist = []
    @doclist = []
    @logger = Log4r::Logger.new('Clusterer')

    # Array the same size as @wordlist but stores the document object at
    # index i that produced @wordlist[i].
    @word2doc = []

    self.topics = 10

    if ( docCol )
      docCol.each {|d| self << d}
    end
  end

  # Add a document: record it, splice its words onto the global word
  # list, and remember which document each word position came from.
  def <<(document)
    @doclist << document
    @wordlist += document.words
    document.words.length.times {@word2doc << document}
  end

  # Reset the model to `count` empty topics.
  def topics=(count)
    @topics = []
    count.times { @topics << Topic.new() }
    # NOTE: a stray reference to an undefined @topic2doc was removed
    # here; it evaluated to nil and had no effect.
  end

  # Build a wordlist index array. This is an array that contains indexes
  # into @wordlist, randomized so that we visit @wordlist in a random
  # order (random-swap shuffle).
  def build_randomized_index_into_words()
    @randomized_word_index = []

    @wordlist.each_index { |i| @randomized_word_index << i }

    @wordlist.each_index do |i|
      new_home = (@wordlist.length * rand).to_i
      tmp = @randomized_word_index[i]
      @randomized_word_index[i] = @randomized_word_index[new_home]
      @randomized_word_index[new_home] = tmp
    end
  end

  # Compute P(z=j | z_-i, w): the (unnormalized) probability that the
  # given word belongs to the given topic, smoothed by @beta over words
  # and @alpha over topics. Returns 0 for a word the topic has not seen.
  def p_of_z(topic, word)

    return 0 unless topic.words[word]

    ((topic.words[word] - 1 + @beta) / (topic.wordcount - topic.words[word] - 1 + @beta * @wordlist.length)) *
      ((topic.docs.size - 1 + @alpha) / (@doclist.size - 1 + @alpha * @topics.size))

  end

  # Yield each index into @wordlist in the precomputed random order.
  def each_radomized_word_index(&call)
    @randomized_word_index.each &call
  end

  # Correctly spelled alias; the misspelled name above is kept so
  # existing callers keep working.
  alias each_randomized_word_index each_radomized_word_index

  # Initialize sampler state: hyperparameters, the shuffled word order,
  # and a random initial topic assignment for every word occurrence.
  def lda_setup()
    @beta  = 0.01
    @alpha = 50.0 / @topics.length

    build_randomized_index_into_words()

    @word2topic = []
    @doc2topic = []

    each_radomized_word_index do |i|
      topic = (@topics.size * rand).to_i

      @word2topic[i] = topic                   # Record that this word goes to this topic.
      @topics[topic].words[@wordlist[i]] ||= 0
      @topics[topic].docs[@word2doc[i]]  ||= 0

      @topics[topic].words[@wordlist[i]] += 1  # Record a new word in this topic.
      @topics[topic].wordcount += 1            # Total sum of words.
      @topics[topic].docs[@word2doc[i]] += 1   # Record this doc index in this topic.
    end

  end

  # Perform one Gibbs-sampling sweep over every word (in random order).
  def lda_once()
    each_radomized_word_index do |random_word_index|

      random_word = @wordlist[random_word_index]

      zdist = []
      ztotal = 0.0 # Track the actual total in case zdist doesn't sum to 1.0.

      # Compute the distribution over z for this word.
      @topics.each do |topic|
        z = p_of_z(topic, random_word)
        ztotal += z
        zdist << z
      end

      r = rand * ztotal # Random value to pick a topic with.
      zacc = 0.0        # Accumulator of seen values of zdist.

      # Uniform random fallback; also covers the rare case where
      # floating-point rounding keeps r >= the accumulated mass below.
      topici = (rand() * @topics.size).to_i

      # Pick a topic proportionally to zdist.
      # BUG FIX: the original assigned the each_index block parameter
      # `topici`; since Ruby 1.9 block parameters are block-local, the
      # chosen index never escaped the block and the sampled topic was
      # effectively random. Capture the choice explicitly instead.
      catch(:picked_topic) do
        @topics.each_index do |i|
          zacc += zdist[i]
          if r < zacc
            topici = i
            throw :picked_topic
          end
        end
      end

      topic = @topics[topici]

      previous_topic = @topics[@word2topic[random_word_index]]

      # Skip if src and dst topic are the same.
      next if @word2topic[random_word_index] == topici

      # Remove the word from its previous topic.
      if ( previous_topic.words[@wordlist[random_word_index]] > 0 )
        previous_topic.words[@wordlist[random_word_index]] -= 1 # Remove the word from that topic.
        previous_topic.wordcount -= 1                           # Reduce the sum of words.
        previous_topic.docs[@word2doc[random_word_index]] -= 1  # Remove this doc index from that topic.

        # Drop the doc entry entirely once it contributes no words.
        previous_topic.docs.delete(@word2doc[random_word_index]) if previous_topic.docs[@word2doc[random_word_index]] <= 0
      end

      topic.words[@wordlist[random_word_index]] ||= 0 # If word was not in this topic yet, seed it.
      topic.docs[@word2doc[random_word_index]]  ||= 0 # If doc was not previously here, seed it.

      # Add the word to the chosen topic.
      @word2topic[random_word_index] = topici        # Record that this word goes to this topic.
      topic.words[@wordlist[random_word_index]] += 1 # Record a new word in this topic.
      topic.wordcount += 1                           # Total sum of words.
      topic.docs[@word2doc[random_word_index]] += 1  # Record this doc index in this topic.
    end
  end

  # Run LDA. Options:
  #   :iterations - number of sweeps (defaults to @iterations)
  #   :continue   - when truthy, keep existing sampler state instead of
  #                 reinitializing via lda_setup.
  def lda(opts={})
    opts[:iterations] ||= @iterations

    unless (opts[:continue])
      lda_setup()
    end

    opts[:iterations].times do |i|
      lda_once()
    end
  end

  # Takes {|topic| ... }
  def each_topic(&topicproc)
    @topics.each &topicproc
  end

  # Return the top n words of the topic as SClust::Util::Word objects,
  # highest weight first.
  def get_top_words_for_topic(topic, n = 3)

    tupleList = []

    topic.words.each_key do |word|
      tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic } )
    end

    # Reversed comparison so the list sorts descending by weight.
    tupleList.sort! { |x, y| y.weight <=> x.weight }

    tupleList[0...n]

  end

  # Returns one list per topic, each containing that topic's top n words.
  def get_max_terms(n=3)
    topics = []

    each_topic { |t| topics << get_top_words_for_topic(t, n) }

    topics
  end

  # K-Mean-compatible entry point.
  alias cluster lda

end
242
+ end
243
+ end