sclust 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
@@ -0,0 +1,294 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'log4r'
|
27
|
+
require 'sclust/util/word'
|
28
|
+
|
29
|
+
module SClust
|
30
|
+
module KMean
|
31
|
+
# Cosine distance between two sparse term vectors.
class CosineDistance

    # Given two vectors (hash-like containers that default missing keys
    # to 0 and support #merge), return 1 - cosine_similarity(a, b).
    # Returns nil when the distance is exactly 1 (the vectors share no
    # weight), which callers treat as "no usable relation".
    def self.distance(a, b)

        dot    = 0.0
        norm_a = 0.0
        norm_b = 0.0

        # Walk the union of both key sets once, accumulating the dot
        # product and both squared norms.
        a.merge(b).keys.each do |term|
            dot    += a[term] * b[term]
            norm_a += a[term] * a[term]
            norm_b += b[term] * b[term]
        end

        d = 1 - (dot / (Math.sqrt(norm_a) * Math.sqrt(norm_b)))

        # Return nil if we detect no distance between documents.
        d == 1 ? nil : d
    end
end
|
52
|
+
|
53
|
+
# A point in term-vector space, optionally linked back to the source
# object (e.g. a document) that produced it.
class ClusterPoint

    attr_reader :values, :cluster, :source_object
    attr_writer :cluster, :source_object

    # Initialize the ClusterPoint with a SparseVector or SparseLabeledVector.
    def initialize(sparse_vector, source_object = nil)
        @values        = sparse_vector
        @cluster       = nil
        @source_object = source_object
    end

    # Cosine distance to another ClusterPoint (nil when undefined).
    def distance(clusterPoint)
        CosineDistance.distance(@values, clusterPoint.values)
    end

    # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
    # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
    # this cluster point while a weight value of 0 will have no effect.
    def add(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] * (1-weight) ) + (clusterPoint.values[i] * weight)}
    end

    # Similar to add, but subtract.
    def sub(clusterPoint, weight)
        @values.merge(clusterPoint.values).keys.each { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / ( 1 - weight ) }
    end

    # Return the top n terms (as SClust::Util::Word) ranked by weight.
    # Return ALL the terms, sorted, if n is 0.
    # BUG FIX: previously n == 0 returned an empty list because the final
    # slice was result[0...0]; the clamp below (which existed only as a
    # commented-out line) restores the documented behavior.
    def get_max_terms(n=3)

        values_to_terms = {}

        @values.each do |t, v|
            values_to_terms[v] ||= []
            values_to_terms[v] << SClust::Util::Word.new(t, v, {:stemmed_word => t})
        end

        # n == 0 means "everything"; also clamp n to the term count so the
        # early-exit check below can always be satisfied.
        n = @values.length if n == 0 || n > @values.length

        sorted_values = values_to_terms.keys.sort { |x,y| y <=> x }

        result = []

        catch(:haveEnough) do
            sorted_values.each do |value|
                result += values_to_terms[value]
                throw :haveEnough if result.length >= n
            end
        end

        # Trim our results to exactly the requested size.
        result[0...n]
    end

    # Raw weight stored for a single term.
    def get_term_value(term)
        @values[term]
    end
end
|
120
|
+
|
121
|
+
# A cluster: a center point plus a membership count. Points are folded
# into (or out of) the center with weight 1/size, so the center tracks
# the running mean of its members.
class Cluster

    attr_reader :center, :size

    def initialize(centerPoint)
        @fixed  = false
        # NOTE(review): #clone is shallow, so the cloned center still shares
        # its underlying sparse vector with centerPoint; adding points will
        # mutate that shared vector. Confirm this aliasing is intended.
        @center = centerPoint.clone
        @size   = 1
    end

    # Claim a point for this cluster and pull the center towards it.
    def +(point)
        point.cluster = self

        @size += 1

        @center.add(point, 1.0 / @size.to_f)
    end

    # Release a point and push the center back away from it.
    def -(point)
        point.cluster = nil

        @center.sub(point, 1.0 / @size.to_f)

        @size -= 1
    end

    # Top n terms of this cluster's center.
    def get_max_terms(n = 3)
        @center.get_max_terms(n)
    end
end
|
152
|
+
|
153
|
+
# K-mean style clusterer over a list of ClusterPoints.
class Clusterer

    attr_reader :clusters, :points, :cluster_count, :iterations, :logger
    attr_writer :clusters, :points, :cluster_count, :iterations, :logger

    # Optionally takes the initial list of ClusterPoints.
    def initialize(points=[])
        @iterations    = 3
        @cluster_count = 0
        @points        = points
        @clusters      = []
        @logger        = Log4r::Logger.new('Clusterer')
        @logger.add('default')
    end

    # Drop all existing clusters and recreate them using the given method.
    # If the given method is an integer, then that many clusters are created
    # and the centers are randomly chosen from the documents contained in the @points attribute.
    # If it is "crp", then the Chinese Restaurant Process is used, considering each document
    # and creating a cluster with that document as the center stochastically and proportionally
    # the number of documents already considered (document i seeds a new
    # cluster with probability 1/i).
    def topics=(process)

        @clusters = []

        if ( process.is_a?(Integer) )
            @logger.info("Building cluster of constant cluster count #{process}.")
            @cluster_count = process
            @cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }

        elsif ( process.is_a?(String) )
            if ( process == "crp" )

                @logger.info("Building clusters using CRP.")

                # BUG FIX: this reset previously sat INSIDE the loop below,
                # so @cluster_count was re-zeroed every iteration and never
                # reflected the real number of clusters built.
                @cluster_count = 0

                1.upto(@points.length) do |i|
                    if ( rand(i) == 0 )
                        @clusters << Cluster.new(@points[i-1])
                        @cluster_count += 1
                    end
                end

                @logger.info("Built #{@cluster_count} clusters.")
            end
        end
    end

    # Add a point to the data set. It is not assigned to any cluster
    # until assign_all_points / cluster runs.
    def +(point)
        @points << point
    end

    # Yield each cluster in turn.
    def each_cluster(&c)
        @clusters.each { |cluster| yield cluster }
    end

    # Assign every point to its nearest cluster center, moving points
    # between clusters (and updating both centers) as needed.
    def assign_all_points

        @points.each do |pt|

            # Randomize the first selection to ensure that in the case where there are
            # many centers that are close, each has a (statistically) equal chance of
            # getting the document, thus moving the center, changing the center,
            # and perhaps matching other documents better because of more terms.
            min_cluster = @clusters[rand(@clusters.length)]
            min_dst     = min_cluster.center.distance(pt)

            @clusters.each do |cluster|

                tmp_distance = cluster.center.distance(pt)

                # A nil distance means "no overlap" — never prefer that cluster.
                if tmp_distance.nil?
                    next

                elsif min_dst.nil?
                    min_dst     = tmp_distance
                    min_cluster = cluster

                elsif tmp_distance < min_dst
                    min_cluster = cluster
                    min_dst     = tmp_distance
                end
            end

            # Move the point only when its best cluster actually changed.
            if pt.cluster
                unless pt.cluster.equal? min_cluster
                    pt.cluster - pt
                    min_cluster + pt
                end
            else
                min_cluster + pt
            end
        end
    end

    # Run the clustering: seed clusters (via CRP) when none exist yet,
    # then iterate point assignment.
    def cluster

        # If we are not initialized, initialize the cluster! :)
        # BUG FIX: this previously called build_empty_clusters('crp'),
        # a method that no longer exists (it became the topics= setter),
        # which raised NoMethodError on first use.
        self.topics = 'crp' unless @clusters && @clusters.size > 0

        iterations.times do |i|
            @logger.info("Starting iteration #{i+1} of #{iterations}.")
            assign_all_points
        end
    end

    # Collect the top n terms from every cluster center; returns a list
    # of per-cluster term lists.
    def get_max_terms(n=3)
        r = []

        each_cluster do |cluster|
            r << cluster.get_max_terms(n)
        end

        r
    end

    # If you edit the document collection behind the scenes in and LDA clusterer, you need to run
    # this to avoid terms with 0 showing up. However, K-Mean has so little document-related
    # state that this method does nothing and is only here for API compatibility.
    # We would like LDA and KMean implementations that are drop-in replacements.
    def rebuild_document_collection()
    end
end
|
293
|
+
end
|
294
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'sclust/kmean/cluster'
|
26
|
+
require 'sclust/util/sparse_vector'
|
27
|
+
|
28
|
+
module SClust
|
29
|
+
|
30
|
+
module KMean
|
31
|
+
|
32
|
+
# A document clusterer that overrides the + operator
|
33
|
+
# to allow for adding Document objects.
|
34
|
+
# A document clusterer that overrides << to accept Document objects
# (anything else is stringified and wrapped in a Document) and converts
# the collection into cluster points on demand.
class DocumentClusterer < Clusterer

    attr_reader :document_collection

    def initialize()
        @document_collection = SClust::Util::DocumentCollection.new()
        super()
    end

    # Accept a Document, or wrap any other object's string form in one.
    def <<(d)
        if ( d.is_a?(SClust::Util::Document) )
            @document_collection << d
        else
            @document_collection << SClust::Util::Document.new(d.to_s)
        end
    end

    # This must be run to convert the document collection into
    # the points in a cluster.
    def initialize_points()

        point_list = @document_collection.doclist.map do |doc|

            doc_terms = SClust::Util::SparseVector.new(0)

            # Build one big term-weight vector for this document.
            # NOTE(review): this weights each term as tf - idf; classical
            # TF-IDF multiplies the two. Confirm the subtraction is intended.
            doc.terms.each_key do |term|
                doc_terms[term] = doc.tf(term) - @document_collection.idf(term)
            end

            ClusterPoint.new(doc_terms, doc)
        end

        self.points = point_list
    end

    # Lazily build the points, then delegate to Clusterer#topics=.
    def topics=(n)
        initialize_points unless ( self.points && self.points.size > 0 )
        super(n)
    end
end
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,243 @@
|
|
1
|
+
#
|
2
|
+
# The MIT License
|
3
|
+
#
|
4
|
+
# Copyright (c) 2010 Samuel R. Baskinger
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
#
|
24
|
+
require 'rubygems'
|
25
|
+
require 'sclust/util/word'
|
26
|
+
require 'log4r'
|
27
|
+
|
28
|
+
module SClust
|
29
|
+
module LDA
|
30
|
+
|
31
|
+
# Bookkeeping for one LDA topic: per-word assignment counts, the total
# number of word assignments, and per-document assignment counts.
class Topic

    attr_accessor :words, :wordcount, :docs

    def initialize()
        @words     = {}
        @wordcount = 0
        @docs      = {}
    end
end
|
42
|
+
|
43
|
+
# Latent Dirichlet Allocation via collapsed Gibbs sampling.
class LDA

    attr_reader :logger, :iterations, :doclist, :topics
    attr_writer :logger, :iterations, :doclist

    # Documents may be added after LDA is created, unlike k-mean clustering.
    def initialize(docCol=nil)
        @iterations = 3
        @wordlist   = []
        @doclist    = []
        @logger     = Log4r::Logger.new('Clusterer')

        # Array the same size as @wordlist but stores the document object at index i
        # that produced @wordlist[i].
        @word2doc = []

        self.topics = 10

        if ( docCol )
            docCol.each {|d| self << d}
        end
    end

    # Add a document; its words extend the flat word list and @word2doc.
    def <<(document)
        @doclist  << document
        @wordlist += document.words
        document.words.length.times { @word2doc << document }
    end

    # Replace the topic set with +count+ fresh, empty topics.
    # (A stray no-op "@topic2doc" expression was removed from this loop.)
    def topics=(count)
        @topics = []
        count.times { @topics << Topic.new() }
    end

    # Build a wordlist index array. This is an array that contains indexes into @wordlist.
    # However, instead of being simply {0,1,2,3...} this array is randomized so that
    # we index into @wordlist in a random order.
    def build_randomized_index_into_words()
        @randomized_word_index = []

        @wordlist.each_index { |i| @randomized_word_index << i }

        # One pass of random pairwise swaps to shuffle the index array.
        @wordlist.each_index do |i|
            new_home = (@wordlist.length * rand).to_i
            @randomized_word_index[i], @randomized_word_index[new_home] =
                @randomized_word_index[new_home], @randomized_word_index[i]
        end
    end

    # Compute P(z=j | z..._i, w). Or, the probability that
    # a topic z is the topic j represented by the given word given that word.
    # Returns 0 for words the topic has never seen. Note the value can be
    # negative for very small counts (the -1 "remove self" correction).
    def p_of_z(topic, word)
        return 0 unless topic.words[word]

        ((topic.words[word] - 1 + @beta) / (topic.wordcount - topic.words[word] - 1 + @beta * @wordlist.length)) *
        ((topic.docs.size - 1 + @alpha) / (@doclist.size - 1 + @alpha * @topics.size))
    end

    # Yield each randomized word index. (Method name kept, typo and all,
    # for API compatibility.)
    def each_radomized_word_index(&call)
        @randomized_word_index.each &call
    end

    # Initialize hyperparameters and randomly assign every word
    # occurrence to a topic.
    def lda_setup()
        @beta  = 0.01
        @alpha = 50.0 / @topics.length

        build_randomized_index_into_words()

        @word2topic = []
        @doc2topic  = []

        each_radomized_word_index do |i|
            topic = (@topics.size * rand).to_i

            @word2topic[i] = topic                    # Record that this word goes to this topic.
            @topics[topic].words[@wordlist[i]] ||= 0
            @topics[topic].docs[@word2doc[i]]  ||= 0

            @topics[topic].words[@wordlist[i]] += 1   # Record a new word in this topic
            @topics[topic].wordcount           += 1   # Total sum of words
            @topics[topic].docs[@word2doc[i]]  += 1   # Record this doc index in this topic
        end
    end

    # Perform 1 phase (one full Gibbs sweep) of LDA.
    def lda_once()
        each_radomized_word_index do |random_word_index|

            random_word = @wordlist[random_word_index]

            zdist  = []
            ztotal = 0.0 # Track actual total incase the sum of zdist isn't quite 1.0.

            # Compute distribution over z for word i.
            @topics.each do |topic|
                z = p_of_z(topic, random_word)
                ztotal += z
                zdist << z
            end

            r    = rand * ztotal # Random value to pick topic with.
            zacc = 0.0           # Accumulator of seen values of zdist[topici].

            # Fallback topic in case no bucket is selected below (e.g. ztotal == 0).
            # BUG FIX: the sampling loop previously used |topici| as its block
            # parameter, shadowing this variable; on Ruby >= 1.9 the sampled
            # topic was silently discarded and this random fallback always won.
            topici = (rand() * @topics.size).to_i

            # Pick a topic proportionally to zdist.
            catch(:picked_topic) do
                @topics.each_index do |candidate|
                    zacc += zdist[candidate]
                    if r < zacc
                        topici = candidate
                        throw :picked_topic
                    end
                end
            end

            topic = @topics[topici]

            previous_topic = @topics[@word2topic[random_word_index]]

            # Skip if src and dst topic are the same
            next if @word2topic[random_word_index] == topici

            # Remove word from previous topic.
            if ( previous_topic.words[@wordlist[random_word_index]] > 0 )
                previous_topic.words[@wordlist[random_word_index]] -= 1 # Remove a word from this topic
                previous_topic.wordcount -= 1                           # Reduce sum of words
                previous_topic.docs[@word2doc[random_word_index]] -= 1  # Remove this doc index in this topic

                previous_topic.docs.delete(@word2doc[random_word_index]) if previous_topic.docs[@word2doc[random_word_index]] <= 0
            end

            topic.words[@wordlist[random_word_index]] ||= 0 # If word was not in previous topic, add to this one.
            topic.docs[@word2doc[random_word_index]]  ||= 0 # If doc was not previously here.

            # Add word to chosen topic.
            @word2topic[random_word_index] = topici           # Record that this word goes to this topic.
            topic.words[@wordlist[random_word_index]] += 1    # Record a new word in this topic
            topic.wordcount += 1                              # Total sum of words
            topic.docs[@word2doc[random_word_index]] += 1     # Record this doc index in this topic
        end
    end

    # Run LDA. opts[:iterations] overrides @iterations; opts[:continue]
    # skips re-initialization so sampling resumes from the current state.
    def lda(opts={})
        opts[:iterations] ||= @iterations

        lda_setup() unless ( opts[:continue] )

        opts[:iterations].times do |i|
            lda_once()
        end
    end

    # Takes {|topic| ... }
    def each_topic(&topicproc)
        @topics.each &topicproc
    end

    # Return the topic's top-n words as SClust::Util::Word, ranked by p_of_z.
    def get_top_words_for_topic(topic, n = 3)

        tupleList = []

        topic.words.each_key do |word|
            tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic } )
        end

        # Yes, rev the comparison so the list sorts backwards.
        tupleList.sort! { |x, y| y.weight <=> x.weight }

        tupleList[0...n]
    end

    # Returns one top-words list per topic: [ [word, ...], ... ].
    def get_max_terms(n=3)
        topics = []

        each_topic { |t| topics << get_top_words_for_topic(t, n) }

        topics
    end

    alias cluster lda

end
|
242
|
+
end
|
243
|
+
end
|