sclust 1.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
@@ -0,0 +1,294 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'log4r'
|
27
|
+
require 'sclust/util/word'
|
28
|
+
|
29
|
+
module SClust
|
30
|
+
module KMean
|
31
|
+
class CosineDistance

    # Compute the cosine distance (1 - cosine similarity) between two
    # sparse vectors +a+ and +b+. Vectors are hash-like with a default
    # value of 0 for missing terms (SparseVector or Hash.new(0)).
    #
    # Returns nil when the vectors share no terms (distance of exactly 1)
    # or when either vector has zero magnitude. Callers treat nil as
    # "no measurable relation between documents"; guarding the
    # zero-magnitude case also prevents a 0/0 division returning NaN,
    # which would silently poison later distance comparisons.
    def self.distance(a,b)

        dot   = 0.0
        mag_a = 0.0
        mag_b = 0.0

        # merge produces the union of both key sets; missing terms read
        # back as the sparse default (0), contributing nothing.
        a.merge(b).keys.each do |term|
            dot   += a[term] * b[term]
            mag_a += a[term] * a[term]
            mag_b += b[term] * b[term]
        end

        # A zero-length vector has no direction; cosine is undefined.
        return nil if mag_a == 0.0 || mag_b == 0.0

        v = 1 - ( dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b)) )

        # Return nil if we detect no similarity between documents.
        (v == 1) ? nil : v
    end
end
|
52
|
+
|
53
|
+
class ClusterPoint

    # The sparse term vector backing this point.
    attr_accessor :values
    # The Cluster this point is currently assigned to (nil when unassigned).
    attr_accessor :cluster
    # Optional object (typically a document) this point was built from.
    attr_accessor :source_object

    # Initialize the ClusterPoint with a SparseVector or SparseLabeledVector.
    def initialize(sparse_vector, source_object = nil)
        @values        = sparse_vector
        @cluster       = nil
        @source_object = source_object
    end

    # Cosine distance to another ClusterPoint. May return nil when the
    # points share no terms (see CosineDistance.distance).
    def distance(clusterPoint)
        CosineDistance.distance(@values, clusterPoint.values)
    end

    # Add each item in the given cluster point to this cluster point,
    # adjusting the values per the given weight. Weight is a value from
    # 0.0 - 1.0, inclusive. A weight of 1 means that clusterPoint is 100%
    # assigned to this cluster point while a weight of 0 has no effect.
    def add(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each do |i|
            @values[i] = ( @values[i] * (1-weight) ) + (clusterPoint.values[i] * weight)
        end
    end

    # Similar to add, but subtract: remove the weighted contribution of
    # the given point.
    # NOTE(review): weight == 1 divides by zero here -- callers are
    # expected to pass weights strictly below 1 (Cluster always does,
    # since @size >= 1 when sub is called).
    def sub(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each do |i|
            @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / ( 1 - weight )
        end
    end

    # Return the top n terms as SClust::Util::Word objects, ordered by
    # descending value. Return ALL the terms sorted if n is 0.
    # BUGFIX: the n == 0 case was documented but the implementing line was
    # commented out, so n == 0 previously returned an empty list.
    def get_max_terms(n=3)

        values_to_terms = {}

        @values.each do |t, v|
            values_to_terms[v] ||= []
            values_to_terms[v] << SClust::Util::Word.new(t, v, {:stemmed_word => t})
        end

        # Sort values descending so the highest-weighted terms come first.
        sorted_values = values_to_terms.keys.sort { |x,y| y <=> x }

        result = []

        # n == 0 means "no limit": collect every term.
        n = @values.length if ( n > @values.length || n == 0 )

        catch(:haveEnough) do

            sorted_values.each do |value|

                result += values_to_terms[value]

                throw :haveEnough if result.length >= n

            end

        end

        # Trim our results to exactly the requested size.
        result[0...n]

    end

    # Value stored for the given term (the vector's default when absent).
    def get_term_value(term)
        @values[term]
    end

end
|
120
|
+
|
121
|
+
class Cluster

    attr_reader :center, :size

    # Create a cluster whose center is a copy of the given point, so that
    # later center adjustments never mutate the caller's original point.
    def initialize(centerPoint)
        @fixed  = false
        @center = centerPoint.clone
        @size   = 1
    end

    # Assign a point to this cluster: record the membership on the point,
    # grow the cluster, then pull the center toward the newcomer in
    # proportion to its 1/size share of the cluster.
    def +(point)
        point.cluster = self

        @size += 1

        @center.add(point, 1.0 / @size.to_f)
    end

    # Remove a point from this cluster: clear the membership, push the
    # center away from the departing point, then shrink the cluster.
    def -(point)
        point.cluster = nil

        @center.sub(point, 1.0 / @size.to_f)

        @size -= 1
    end

    # Delegate the top-n term listing to the center point.
    def get_max_terms(n=3)
        @center.get_max_terms(n)
    end

end
|
152
|
+
|
153
|
+
class Clusterer

    attr_reader :clusters, :points, :cluster_count, :iterations, :logger
    attr_writer :clusters, :points, :cluster_count, :iterations, :logger

    # Create a clusterer over an optional initial list of ClusterPoints.
    # Clusters themselves are built lazily (see #cluster / #topics=).
    def initialize(points=[])
        @iterations    = 3
        @cluster_count = 0
        @points        = points
        @clusters      = []
        @logger        = Log4r::Logger.new('Clusterer')
        @logger.add('default')
    end

    # Drop all existing clusters and recreate them using the given method.
    # If the given method is an Integer, that many clusters are created and
    # the centers are randomly chosen from the documents in @points.
    # If it is "crp", the Chinese Restaurant Process is used: document i may
    # found a new cluster (with itself as the center) with probability 1/i,
    # i.e. inversely proportional to the number of documents considered.
    def topics=(process)

        @clusters = []

        if ( process.is_a?(Integer) )
            @logger.info("Building cluster of constant cluster count #{process}.")
            @cluster_count = process
            @cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }

        elsif ( process.is_a?(String) )
            if ( process == "crp" )

                @logger.info("Building clusters using CRP.")

                # BUGFIX: the tally must be reset once, BEFORE the loop.
                # The original reset it inside the loop, discarding all but
                # the final iteration's count.
                @cluster_count = 0

                1.upto(@points.length) do |i|

                    # rand(1) == 0 always, so the first document always
                    # founds a cluster; later ones do so with prob. 1/i.
                    if ( rand(i) == 0 )
                        @clusters << Cluster.new(@points[i-1])
                        @cluster_count += 1
                    end

                end

                @logger.info("Built #{@cluster_count} clusters.")
            end
        end
    end

    # Named form of topics=. BUGFIX: #cluster referenced this method but the
    # original release never defined it, raising NoMethodError on the first
    # #cluster call with no pre-built clusters.
    def build_empty_clusters(process)
        self.topics = process
    end

    # Add a point to the candidate point list (not yet assigned a cluster).
    def +(point)
        @points << point
    end

    # Yield each cluster to the given block.
    def each_cluster(&c)
        @clusters.each { |cluster| yield cluster }
    end

    # Assign every point to its closest cluster center, moving points (and
    # therefore centers) as assignments change.
    def assign_all_points

        @points.each do |pt|

            # Randomize the first candidate so that, when several centers
            # are equally close, each has a (statistically) equal chance of
            # receiving the document -- moving that center and perhaps
            # matching other documents better because of more terms.
            min_cluster = @clusters[rand(@clusters.length)]
            min_dst     = min_cluster.center.distance(pt)

            @clusters.each do |cluster|

                tmp_distance = cluster.center.distance(pt)

                # distance() returns nil when a point and a center share no
                # terms; such candidates are skipped.
                if tmp_distance.nil?
                    next

                elsif min_dst.nil?
                    min_dst     = tmp_distance
                    min_cluster = cluster

                elsif tmp_distance < min_dst
                    min_cluster = cluster
                    min_dst     = tmp_distance

                end
            end

            # If a point already has a cluster...
            if pt.cluster

                # ...move it only when a different cluster won.
                unless pt.cluster.equal? min_cluster
                    pt.cluster - pt
                    min_cluster + pt
                end
            else
                min_cluster + pt
            end

        end
    end

    # Run the clustering: lazily initialize clusters via CRP if none exist,
    # then perform @iterations rounds of point assignment.
    def cluster

        # If we are not initialized, initialize the clusters! :)
        self.build_empty_clusters('crp') unless @clusters && @clusters.size > 0

        iterations.times do |i|
            @logger.info("Starting iteration #{i+1} of #{iterations}.")
            assign_all_points
        end
    end

    # Top n terms of each cluster; returns an array of per-cluster arrays.
    def get_max_terms(n=3)
        r = []

        each_cluster do |cluster|
            r << cluster.get_max_terms(n)
        end

        r
    end

    # If you edit the document collection behind the scenes in an LDA
    # clusterer, you need to run this to avoid terms with 0 showing up.
    # K-Mean has so little document-related state that this method does
    # nothing and exists only for API compatibility: we would like LDA and
    # KMean implementations that are drop-in replacements.
    def rebuild_document_collection()
    end

end
|
293
|
+
end
|
294
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'sclust/kmean/cluster'
|
26
|
+
require 'sclust/util/sparse_vector'
|
27
|
+
|
28
|
+
module SClust
|
29
|
+
|
30
|
+
module KMean
|
31
|
+
|
32
|
+
# A document clusterer that overrides the << operator
# to allow for adding Document objects.
class DocumentClusterer < Clusterer

    attr_reader :document_collection

    # Start with an empty document collection; points are built lazily.
    def initialize()
        @document_collection = SClust::Util::DocumentCollection.new()
        super()
    end

    # Append a document to the collection. Anything that is not already a
    # SClust::Util::Document is stringified and wrapped in one.
    def <<(d)
        if ( d.is_a?(SClust::Util::Document) )
            @document_collection << d
        else
            @document_collection << SClust::Util::Document.new(d.to_s)
        end
    end

    # This must be run to convert the document collection into
    # the points in a cluster.
    def initialize_points()

        point_list = @document_collection.doclist.map do |doc|

            term_vector = SClust::Util::SparseVector.new(0)

            # Build one big term vector for this document.
            # NOTE(review): terms are scored as tf - idf here; classic
            # tf-idf multiplies the two. Confirm whether the subtraction
            # is intentional before changing it.
            doc.terms.each_key do |term|
                term_vector[term] = doc.tf(term) - @document_collection.idf(term)
            end

            ClusterPoint.new(term_vector, doc)
        end

        self.points = point_list

    end

    # Lazily build the points from the collected documents, then delegate
    # cluster construction to Clusterer#topics=.
    def topics=(n)

        initialize_points unless ( self.points && self.points.size > 0 )
        super(n)

    end

end
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,243 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
require 'rubygems'
|
25
|
+
require 'sclust/util/word'
|
26
|
+
require 'log4r'
|
27
|
+
|
28
|
+
module SClust
|
29
|
+
module LDA
|
30
|
+
|
31
|
+
# Per-topic state for LDA: a word-assignment table, the total number of
# word slots currently assigned to the topic, and a per-document tally.
class Topic

    attr_accessor :words, :wordcount, :docs

    def initialize()
        @words     = {}  # term => count of assignments to this topic
        @wordcount = 0   # running sum of all counts in @words
        @docs      = {}  # document => count of its words in this topic
    end
end
|
42
|
+
|
43
|
+
class LDA

    attr_reader :logger, :iterations, :doclist, :topics
    attr_writer :logger, :iterations, :doclist

    # Build an LDA clusterer. Documents may be added after LDA is created,
    # unlike k-mean clustering. When docCol (anything that yields documents
    # from #each) is given, its documents are added immediately.
    def initialize(docCol=nil)
        @iterations = 3
        @wordlist   = []
        @doclist    = []
        @logger     = Log4r::Logger.new('Clusterer')

        # Array the same size as @wordlist that stores, at index i, the
        # document object that produced @wordlist[i].
        @word2doc = []

        self.topics = 10

        if ( docCol )
            docCol.each { |d| self << d }
        end
    end

    # Add a document: record it and append its words -- plus a parallel
    # word-index -> document mapping -- to the global word list.
    def <<(document)
        @doclist << document
        @wordlist += document.words
        document.words.length.times { @word2doc << document }
    end

    # Replace all topics with +count+ fresh, empty ones.
    # (A stray dead `@topic2doc` expression in the original loop body has
    # been removed; it referenced an ivar that is never defined or used.)
    def topics=(count)
        @topics = []
        count.times { @topics << Topic.new() }
    end

    # Build a wordlist index array: an array of indexes into @wordlist,
    # shuffled so that we can visit @wordlist in a random order.
    def build_randomized_index_into_words()
        @randomized_word_index = []

        @wordlist.each_index { |i| @randomized_word_index << i }

        # Swap each slot with a randomly chosen partner (kept from the
        # original implementation; not a strictly unbiased shuffle).
        @wordlist.each_index do |i|
            new_home = (@wordlist.length * rand).to_i
            tmp = @randomized_word_index[i]
            @randomized_word_index[i] = @randomized_word_index[new_home]
            @randomized_word_index[new_home] = tmp
        end

    end

    # Compute P(z=j | z..._i, w): the probability that a topic z is the
    # topic j represented by the given word. Returns 0 for words the topic
    # has never seen.
    def p_of_z(topic, word)

        return 0 unless topic.words[word]

        ((topic.words[word] - 1 + @beta) / (topic.wordcount - topic.words[word] - 1 + @beta * @wordlist.length)) *
        ((topic.docs.size - 1 + @alpha) / (@doclist.size - 1 + @alpha * @topics.size))

    end

    # Yield each index of the randomized word order.
    # (Original, typo'd name kept for backwards compatibility; a correctly
    # spelled alias is provided below.)
    def each_radomized_word_index(&call)
        @randomized_word_index.each &call
    end

    alias each_randomized_word_index each_radomized_word_index

    # Initialize hyperparameters and randomly assign every word occurrence
    # to a topic.
    def lda_setup()
        @beta  = 0.01
        @alpha = 50.0 / @topics.length

        build_randomized_index_into_words()

        @word2topic = []
        @doc2topic  = []

        each_radomized_word_index do |i|
            topic = (@topics.size * rand).to_i

            @word2topic[i] = topic                      # Record that this word goes to this topic.
            @topics[topic].words[@wordlist[i]] ||= 0
            @topics[topic].docs[@word2doc[i]]  ||= 0

            @topics[topic].words[@wordlist[i]] += 1     # Record a new word in this topic.
            @topics[topic].wordcount += 1               # Total sum of words.
            @topics[topic].docs[@word2doc[i]] += 1      # Record this doc index in this topic.
        end

    end

    # Perform one Gibbs-sampling sweep of LDA over every word occurrence.
    def lda_once()
        each_radomized_word_index do |random_word_index|

            random_word = @wordlist[random_word_index]

            zdist  = []
            ztotal = 0.0 # Track the actual total in case the sum of zdist isn't quite 1.0.

            # Compute the (unnormalized) distribution over z for word i.
            @topics.each do |topic|
                z = p_of_z(topic, random_word)
                ztotal += z
                zdist << z
            end

            r    = rand * ztotal # Random value to pick a topic with.
            zacc = 0.0           # Accumulator of seen values of zdist.

            # Default to a uniformly random topic; used when ztotal is 0 or
            # floating-point error keeps r from ever being reached.
            topici = (rand() * @topics.size).to_i

            # Pick a topic proportionally to zdist.
            # BUGFIX: the original shadowed `topici` with the each_index
            # block parameter, so the sampled index never escaped the block
            # and the random default above was always used instead.
            catch(:picked_topic) do
                @topics.each_index do |candidate|
                    zacc += zdist[candidate]
                    if r < zacc
                        topici = candidate
                        throw :picked_topic
                    end
                end
            end

            topic = @topics[topici]

            previous_topic = @topics[@word2topic[random_word_index]]

            # Skip if the source and destination topics are the same.
            next if @word2topic[random_word_index] == topici

            # Remove the word from its previous topic.
            if ( previous_topic.words[@wordlist[random_word_index]] > 0 )
                previous_topic.words[@wordlist[random_word_index]] -= 1 # Remove a word from this topic.
                previous_topic.wordcount -= 1                           # Reduce the sum of words.
                previous_topic.docs[@word2doc[random_word_index]] -= 1  # Remove this doc index from this topic.

                previous_topic.docs.delete(@word2doc[random_word_index]) if previous_topic.docs[@word2doc[random_word_index]] <= 0
            end

            topic.words[@wordlist[random_word_index]] ||= 0 # If the word was not in this topic yet.
            topic.docs[@word2doc[random_word_index]]  ||= 0 # If the doc was not in this topic yet.

            # Add the word to the chosen topic.
            @word2topic[random_word_index] = topici         # Record that this word goes to this topic.
            topic.words[@wordlist[random_word_index]] += 1  # Record a new word in this topic.
            topic.wordcount += 1                            # Total sum of words.
            topic.docs[@word2doc[random_word_index]] += 1   # Record this doc index in this topic.
        end
    end

    # Run LDA. Options:
    #   :iterations -- number of sweeps (defaults to @iterations).
    #   :continue   -- when truthy, resume without re-randomizing state.
    def lda(opts={})
        opts[:iterations] ||= @iterations

        unless (opts[:continue])
            lda_setup()
        end

        opts[:iterations].times do |i|
            lda_once()
        end
    end

    # Takes {|topic| ... }
    def each_topic(&topicproc)
        @topics.each &topicproc
    end

    # Return the top n words for the given topic as SClust::Util::Word
    # objects, ordered by descending P(z|w).
    def get_top_words_for_topic(topic, n = 3)

        # List of (weight, word, topic) tuples.
        tupleList = []

        topic.words.each_key do |word|
            tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic } )
        end

        # Yes, rev the comparison so the list sorts backwards (descending).
        tupleList.sort! { |x, y| y.weight <=> x.weight }

        tupleList[0...n]

    end

    # Returns one list per topic, each containing that topic's top n words:
    # [ [ word, ... ], ... ]
    def get_max_terms(n=3)
        topics = []

        each_topic { |t| topics << get_top_words_for_topic(t, n) }

        topics
    end

    alias cluster lda

end
|
242
|
+
end
|
243
|
+
end
|