sclust 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,294 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'rubygems'
26
+ require 'log4r'
27
+ require 'sclust/util/word'
28
+
29
+ module SClust
30
+ module KMean
31
# Utility class providing a cosine-based distance metric for sparse
# term vectors (hash-like objects whose missing keys default to 0).
class CosineDistance

    # Compute the cosine distance (1 - cosine similarity) between the
    # two vectors +a+ and +b+.
    #
    # Returns nil when the distance is exactly 1 (no shared weight),
    # signalling "no detectable relationship" to callers.
    def self.distance(a, b)

        dot_product = 0.0
        norm_a_sq   = 0.0
        norm_b_sq   = 0.0

        # merge(b) yields the union of dimensions present in either vector.
        a.merge(b).each_key do |term|
            dot_product += a[term] * b[term]
            norm_a_sq   += a[term] * a[term]
            norm_b_sq   += b[term] * b[term]
        end

        d = 1 - (dot_product / (Math.sqrt(norm_a_sq) * Math.sqrt(norm_b_sq)))

        # Return nil if we detect no distance between documents.
        d == 1 ? nil : d
    end
end
52
+
53
# A point in the clustering space: wraps a sparse term vector plus an
# optional opaque back-reference to the object (e.g. document) that
# produced it.
class ClusterPoint

    attr_reader :values, :cluster, :source_object
    attr_writer :cluster, :source_object

    # Initialize the ClusterPoint with a SparseVector or SparseLabeledVector.
    # source_object is stored but never interpreted here.
    def initialize(sparse_vector, source_object = nil)
        @values        = sparse_vector
        @cluster       = nil
        @source_object = source_object
    end

    # Cosine distance to another ClusterPoint; may be nil when
    # CosineDistance detects no relationship between the vectors.
    def distance(clusterPoint)
        CosineDistance.distance(@values, clusterPoint.values)
    end

    # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
    # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
    # this cluster point while a weight value of 0 will have no effect.
    def add(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] * (1-weight) ) + (clusterPoint.values[i] * weight)}
    end

    # Similar to add, but subtract.
    # NOTE(review): weight == 1.0 divides by zero here — callers must not pass 1.0.
    def sub(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / ( 1 - weight ) }
    end

    # Return the top n words (as SClust::Util::Word), highest weight first.
    # Return all the terms sorted if n is 0.
    def get_max_terms(n=3)

        # Bucket terms by weight so equal-weight terms stay together.
        values_to_terms = {}

        @values.each do |t, v|
            values_to_terms[v] ||= []
            values_to_terms[v] << SClust::Util::Word.new(t, v, {:stemmed_word => t})
        end

        sorted_values = values_to_terms.keys.sort { |x,y| y <=> x }

        # FIX: honour the documented "n == 0 means all terms" contract. The
        # early-exit below checks result.length >= n, which for n == 0 fired
        # on the first bucket and the final trim returned [] instead of all.
        n = @values.length if n == 0

        result = []

        catch(:haveEnough) do

            sorted_values.each do |value|

                result += values_to_terms[value]

                throw :haveEnough if result.length >= n

            end

        end

        # Trim our results to exactly the requested size.
        result[0...n]

    end

    # Raw stored weight for +term+ (sparse default applies to unknown terms).
    def get_term_value(term)
        @values[term]
    end

end
120
+
121
# A single cluster: a moving center point plus a member count.
class Cluster

    attr_reader :center, :size

    # Seed the cluster from a starting point. The center is cloned so
    # later center adjustments never mutate the caller's point.
    def initialize(centerPoint)
        @fixed  = false
        @center = centerPoint.clone
        @size   = 1
    end

    # Claim +point+ for this cluster and pull the center toward it,
    # weighted by the new membership count.
    def +(point)
        point.cluster = self

        @size += 1

        @center.add(point, 1.0 / @size.to_f)
    end

    # Release +point+ from this cluster and push the center away from
    # its contribution, then shrink the membership count.
    def -(point)
        point.cluster = nil

        @center.sub(point, 1.0 / @size.to_f)

        @size -= 1
    end

    # Top n terms describing this cluster, delegated to the center point.
    def get_max_terms(n = 3)
        @center.get_max_terms(n)
    end

end
152
+
153
# K-mean clusterer: holds a list of ClusterPoints and a set of Clusters,
# and repeatedly reassigns points to their nearest (cosine) center.
class Clusterer

    attr_reader :clusters, :points, :cluster_count, :iterations, :logger
    attr_writer :clusters, :points, :cluster_count, :iterations, :logger

    # Optionally takes the initial list of ClusterPoint objects.
    def initialize(points=[])
        @iterations    = 3
        @cluster_count = 0
        @points        = points
        @clusters      = []
        @logger        = Log4r::Logger.new('Clusterer')
        @logger.add('default')
    end

    # Drop all existing clusters and recreate them using the given method.
    # If the given method is an integer, then that many clusters are created
    # and the centers are randomly chosen from the documents contained in the @points attribute.
    # If it is "crp", then the Chinese Restaurant Process is used, considering each document
    # and creating a cluster with that document as the center stochastically and proportionally
    # the number of documents already considered.
    def topics=(process)

        @clusters = []

        if ( process.is_a?(Integer))
            @logger.info("Building cluster of constant cluster count #{process}.")
            @cluster_count = process
            @cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }

        elsif(process.is_a?(String))
            if ( process == "crp" )

                @logger.info("Building clusters using CRP.")

                # FIX: the counter was previously reset to 0 *inside* the loop,
                # so @cluster_count (and the log line below) always reported 0 or 1.
                @cluster_count = 0

                1.upto(@points.length) do |i|

                    # Seed a new cluster with probability 1/i — proportional to
                    # the number of documents already considered.
                    if ( rand(i) == 0 )
                        @clusters << Cluster.new(@points[i-1])
                        @cluster_count += 1
                    end

                end

                @logger.info("Built #{@cluster_count} clusters.")
            end
        end
    end

    # Append a point to the collection (no cluster assignment yet).
    def +(point)
        @points << point
    end

    # Yield each cluster to the given block.
    def each_cluster(&c)
        @clusters.each { |cluster| yield cluster }
    end

    # One k-mean sweep: (re)assign every point to its nearest cluster
    # center, moving centers as membership changes.
    def assign_all_points

        @points.each do |pt|

            # Randomize the first selection to ensure that in the case where there are
            # many centers that are close, each has a (statistically) equal chance of
            # getting the document, thus moving the center, changing the center,
            # and perhaps matching other documents better because of more terms.
            min_cluster = @clusters[rand(@clusters.length)]
            min_dst = min_cluster.center.distance(pt)

            @clusters.each do |cluster|

                tmp_distance = cluster.center.distance(pt)

                # distance may be nil ("no relationship"); skip such centers.
                if tmp_distance.nil?
                    next

                elsif min_dst.nil?
                    min_dst = tmp_distance
                    min_cluster = cluster

                elsif tmp_distance < min_dst
                    min_cluster = cluster
                    min_dst = tmp_distance

                end
            end

            # If a point already has a cluster, only move it when the winner differs.
            if pt.cluster

                unless pt.cluster.equal? min_cluster
                    pt.cluster - pt
                    min_cluster + pt
                end
            else
                min_cluster + pt
            end

        end
    end

    # Run the clustering: seed clusters if none exist, then iterate
    # point assignment @iterations times.
    def cluster

        # FIX: the original called self.build_empty_clusters('crp'), a method
        # that no longer exists (seeding moved to the topics= accessor), which
        # raised NoMethodError whenever clusters had not been seeded manually.
        self.topics = 'crp' unless @clusters && @clusters.size > 0

        iterations.times do |i|
            @logger.info("Starting iteration #{i+1} of #{iterations}.")
            assign_all_points
        end
    end

    # Collect the top n terms from every cluster; returns a list of lists.
    def get_max_terms(n=3)
        r = []

        each_cluster do |cluster|
            r << cluster.get_max_terms(n)
        end

        r
    end

    # If you edit the document collection behind the scenes in an LDA clusterer, you need to run
    # this to avoid terms with 0 showing up. However, K-Mean has so little document-related
    # state that this method does nothing and is only here for API compatibility.
    # We would like LDA and KMean implementations that are drop-in replacements.
    def rebuild_document_collection()
    end

end
293
+ end
294
+ end
@@ -0,0 +1,83 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'sclust/kmean/cluster'
26
+ require 'sclust/util/sparse_vector'
27
+
28
+ module SClust
29
+
30
+ module KMean
31
+
32
+ # A document clusterer that overrides the + operator
33
+ # to allow for adding Document objects.
34
# A document clusterer that overrides the << operator to accept
# Document objects directly; anything else is coerced via to_s.
class DocumentClusterer < Clusterer

    attr_reader :document_collection

    def initialize()
        @document_collection = SClust::Util::DocumentCollection.new()
        super()
    end

    # Add a document to the collection. Non-Document arguments are
    # stringified and wrapped in a fresh Document.
    def <<(d)
        if d.is_a?(SClust::Util::Document)
            @document_collection << d
        else
            @document_collection << SClust::Util::Document.new(d.to_s)
        end
    end

    # This must be run to convert the document collection into
    # the points in a cluster.
    def initialize_points()

        point_list = []

        @document_collection.doclist.each do |doc|

            doc_terms = SClust::Util::SparseVector.new(0)

            # Build a term-weight vector for this document.
            # NOTE(review): terms are scored as tf(term) - idf(term)
            # (subtraction), not the conventional tf * idf product —
            # confirm this is intentional before changing it.
            doc.terms.each_key do |term|
                doc_terms[term] = doc.tf(term) - @document_collection.idf(term)
            end

            point_list << ClusterPoint.new(doc_terms, doc)
        end

        self.points = point_list

    end

    # Lazily build the cluster points from the document collection,
    # then delegate topic/cluster seeding to Clusterer#topics=.
    def topics=(n)

        initialize_points unless self.points && self.points.size > 0

        super(n)

    end

end
82
+ end
83
+ end
@@ -0,0 +1,243 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+ require 'rubygems'
25
+ require 'sclust/util/word'
26
+ require 'log4r'
27
+
28
+ module SClust
29
+ module LDA
30
+
31
# Accumulator for a single LDA topic: per-word counts, a running total
# of all word assignments, and per-document counts.
class Topic

    attr_reader :words, :wordcount, :docs
    attr_writer :words, :wordcount, :docs

    # Start empty: no words, no documents, zero total assignments.
    def initialize()
        @words     = {}
        @wordcount = 0
        @docs      = {}
    end
end
42
+
43
# Collapsed Gibbs-sampling LDA over a bag-of-words document collection.
class LDA

    attr_reader :logger, :iterations, :doclist, :topics
    attr_writer :logger, :iterations, :doclist

    # Documents may be added after LDA is created, unlike k-mean clustering.
    def initialize(docCol=nil)
        @iterations = 3
        @wordlist   = []
        @doclist    = []
        @logger     = Log4r::Logger.new('Clusterer')

        # Array the same size as @wordlist but stores the document object at index i
        # that produced @wordlist[i].
        @word2doc = []

        self.topics = 10

        if ( docCol )
            docCol.each {|d| self << d}
        end
    end

    # Append a document: record it, its words, and the word-to-document mapping.
    def <<(document)
        @doclist << document
        @wordlist += document.words
        document.words.length.times {@word2doc << document}
    end

    # Reset the model to +count+ fresh, empty topics.
    # (A stray no-op reference to an undefined @topic2doc was removed.)
    def topics=(count)
        @topics = []
        count.times { @topics << Topic.new() }
    end

    # Build a wordlist index array. This is an array that contains indexes into @wordlist.
    # However, instead of being simply {0,1,2,3...} this array is randomized so that
    # we index into @wordlist in a random order.
    def build_randomized_index_into_words()
        @randomized_word_index = []

        @wordlist.each_index { |i| @randomized_word_index << i }

        # Shuffle by random pairwise swaps across the index array.
        @wordlist.each_index do |i|
            new_home = (@wordlist.length * rand).to_i
            tmp = @randomized_word_index[i]
            @randomized_word_index[i] = @randomized_word_index[new_home]
            @randomized_word_index[new_home] = tmp
        end

    end

    # Compute P(z=j | z..._i, w). Or, the probability that
    # a topic z is the topic j represented by the given word given that word.
    def p_of_z(topic, word)

        return 0 unless topic.words[word]

        ((topic.words[word] - 1 + @beta) / (topic.wordcount - topic.words[word] - 1 + @beta * @wordlist.length)) *
        ((topic.docs.size - 1 + @alpha) / (@doclist.size - 1 + @alpha * @topics.size))

    end

    # Yield each randomized word index in turn.
    # (Method name keeps the original "radomized" spelling for API compatibility.)
    def each_radomized_word_index(&call)
        @randomized_word_index.each &call
    end

    # Initialize hyperparameters and randomly assign every word occurrence
    # to a topic, seeding the count tables.
    def lda_setup()
        @beta  = 0.01
        @alpha = 50.0 / @topics.length

        build_randomized_index_into_words()

        @word2topic = []
        @doc2topic  = []

        each_radomized_word_index do |i|
            topic = (@topics.size * rand).to_i

            @word2topic[i] = topic                  # Record that this word goes to this topic.
            @topics[topic].words[@wordlist[i]] ||= 0
            @topics[topic].docs[@word2doc[i]]  ||= 0

            @topics[topic].words[@wordlist[i]] += 1 # Record a new word in this topic
            @topics[topic].wordcount += 1           # Total sum of words
            @topics[topic].docs[@word2doc[i]] += 1  # Record this doc index in this topic
        end

    end

    # Perform 1 phase of lda: a full Gibbs sweep over every word occurrence.
    def lda_once()
        each_radomized_word_index do |random_word_index|

            random_word = @wordlist[random_word_index]

            zdist  = []
            ztotal = 0.0 # Track actual total in case the sum of zdist isn't quite 1.0.

            # Compute distribution over z for word i.
            @topics.each do |topic|
                z = p_of_z(topic, random_word)
                ztotal += z
                zdist << z
            end

            r    = rand * ztotal # Random value to pick topic with.
            zacc = 0.0           # Accumulator of seen values of zdist[topici].

            # Fallback topic if the sampling loop below never fires.
            topici = (rand() * @topics.size).to_i

            # Pick a topic proportionally to zdist.
            #
            # FIX: the original iterated with block parameter |topici|, which in
            # Ruby 1.9+ is block-local and shadows the outer topici — the sampled
            # topic was silently discarded and the random fallback always used.
            # Assign through a distinctly named block variable instead.
            catch(:picked_topic) do
                @topics.each_index do |candidate|
                    zacc += zdist[candidate]
                    if r < zacc
                        topici = candidate
                        throw :picked_topic
                    end
                end
            end

            topic = @topics[topici]

            previous_topic = @topics[@word2topic[random_word_index]]

            # Skip if src and dst topic are the same
            next if @word2topic[random_word_index] == topici

            # Remove word from previous topic.
            if ( previous_topic.words[@wordlist[random_word_index]] > 0 )
                previous_topic.words[@wordlist[random_word_index]] -= 1 # Remove a word from this topic
                previous_topic.wordcount -= 1                           # Reduce sum of words
                previous_topic.docs[@word2doc[random_word_index]] -= 1  # Remove this doc index from this topic

                previous_topic.docs.delete(@word2doc[random_word_index]) if previous_topic.docs[@word2doc[random_word_index]] <= 0
            end

            topic.words[@wordlist[random_word_index]] ||= 0 # If word was not in previous topic, add to this one.
            topic.docs[@word2doc[random_word_index]]  ||= 0 # If doc was not previously here.

            # Add word to chosen topic.
            @word2topic[random_word_index] = topici        # Record that this word goes to this topic.
            topic.words[@wordlist[random_word_index]] += 1 # Record a new word in this topic
            topic.wordcount += 1                           # Total sum of words
            topic.docs[@word2doc[random_word_index]] += 1  # Record this doc index in this topic
        end
    end

    # Run LDA. Options:
    #   :iterations — number of sweeps (defaults to @iterations)
    #   :continue   — when truthy, keep existing assignments instead of reinitializing
    def lda(opts={})
        opts[:iterations] ||= @iterations

        unless (opts[:continue])
            lda_setup()
        end

        opts[:iterations].times do |i|
            lda_once()
        end
    end

    # Takes {|topic| ... }
    def each_topic(&topicproc)
        @topics.each &topicproc
    end

    # Return the n highest-probability words in the given topic as a
    # list of SClust::Util::Word.
    def get_top_words_for_topic(topic, n = 3)

        # List of (z, topic, word)
        tupleList = []

        topic.words.each_key do |word|
            tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic } )
        end

        # Yes, rev the comparison so the list sorts backwards.
        tupleList.sort! { |x, y| y.weight <=> x.weight }

        tupleList[0...n]

    end

    # Returns list list list.
    # Each list is a topic list.
    # Each topic list contains a word list.
    # [ [ z, word, topic ], ... ]
    def get_max_terms(n=3)
        topics = []

        each_topic { |t| topics << get_top_words_for_topic(t, n) }

        topics
    end

    alias cluster lda

end
242
+ end
243
+ end