sclust 1.0.0 → 2.1.0
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75

data/lib/sclust/lda/lda2.rb
@@ -0,0 +1,328 @@
#
# The MIT License
#
# Copyright (c) 2010 Samuel R. Baskinger
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
require 'rubygems'
require 'sclust/util/word'
require 'sclust/util/doccol'
require 'log4r'
require 'sclust/util/weightedmovingaverage'

module SClust

    # A second approach to using LDA on documents.
    # This uses the tf-idf value to scale the probability of words being included (the B value).
    module LDA2

        class Topic

            attr_reader :words, :wordcount, :docs
            attr_writer :words, :wordcount, :docs

            def initialize()
                @words = SClust::Util::SparseVector.new(0) # Count of each word in this topic. Keys are the words themselves.
                #@words = Hash.new(0)
                @wordcount = 0 # Sum of the values in @words.
                @docs = SClust::Util::SparseVector.new(0)
                #@docs = Hash.new(0) # Collection of documents. A hash is used to eliminate duplicates.
            end

            def has_word_and_doc?(word, doc)
                @words.member?(word) and @docs.member?(doc)
            end

            def add(word, doc)
                @words[word] += 1
                @wordcount += 1
                @docs[doc] += 1
            end

            def remove(word, doc)
                @words[word] -= 1
                @wordcount -= 1
                @docs.delete(doc) if (@docs[doc] -= 1) < 0 # NOTE: SparseVector deletes entries when @docs[doc] == 0.
            end
        end

        class LDA2

            attr_reader :document_collection

            attr_reader :logger, :iterations, :doclist, :topics
            attr_writer :logger, :iterations, :doclist

            # Documents may be added after the LDA2 object is created, unlike with k-means clustering.
            def initialize()
                @iterations = 3
                @wordlist = []
                @doclist = []
                @logger = Log4r::Logger.new(self.class.to_s)
                @logger.add('default')
                @topic_change_rate = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
                @word_prob_avg = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
                @doc_prob_avg = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)

                # Used for inverse document frequency values.
                @document_collection = SClust::Util::DocumentCollection.new()

                # Array the same size as @wordlist, storing at index i the document
                # object that produced @wordlist[i].
                @word2doc = []

                self.topics = 10
            end

            # Set the topic count and initialize the @topics array with empty SClust::LDA2::Topic instances.
            def topics=(count)
                @topics = []
                count.times do |t|
                    @topics << Topic.new()
                end
            end

            # Add a document to the collection backing this cluster. This must be a
            # SClust::Util::Document.
            def <<(document)
                @doclist << document

                @document_collection << document

                @wordlist += document.words

                document.words.size.times { @word2doc << document }
            end

            # If you edit the document collection behind the scenes, you need to run this
            # to avoid terms with a count of 0 showing up.
            def rebuild_document_collection()

                @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words." }
                @logger.info("Rebuilding document collection and word list.")

                dl = @document_collection.doclist

                @doclist = []

                @document_collection = SClust::Util::DocumentCollection.new()

                @wordlist = []

                @word2doc = []

                dl.each { |doc| self << doc }

                @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words." }

            end

            # Build a wordlist index array. This is an array that contains indexes into @wordlist.
            # However, instead of being simply {0,1,2,3...}, this array is shuffled so that
            # we index into @wordlist in a random order.
            def build_randomized_index_into_words()

                @logger.info("Randomizing words.")

                @randomized_word_index = []

                @wordlist.each_index { |i| @randomized_word_index << i }

                @wordlist.each_index do |i|
                    new_home = (@wordlist.length * rand).to_i
                    tmp = @randomized_word_index[i]
                    @randomized_word_index[i] = @randomized_word_index[new_home]
                    @randomized_word_index[new_home] = tmp
                end

            end

            #
            # Compute p(z_i|theta) * p(w|z_i,B).
            #
            def p_of_z(topic, word, doc=nil)

                beta = @beta

                words_from_doc_in_topic = (doc.nil?) ?
                    topic.docs.reduce(0.0) { |sum, pair| sum + pair[1] } :
                    topic.docs[doc]

                word_prob_avg = ((topic.words[word] - 1.0 + beta) / (topic.wordcount - 1.0 + beta))
                doc_prob_avg = ((words_from_doc_in_topic - 1.0 + @alpha) / (topic.wordcount - 1.0 + @alpha))

                # Stop-gap protection for when the denominator gets wonky.
                doc_prob_avg = 0.0 if doc_prob_avg.nan? || doc_prob_avg < 0.0
                word_prob_avg = 0.0 if word_prob_avg.nan? || word_prob_avg < 0.0

                @word_prob_avg.adjust(word_prob_avg)
                @doc_prob_avg.adjust(doc_prob_avg)

                # Final result.
                doc_prob_avg * word_prob_avg

                # Alternate formula with a different denominator:
                #((topic.words[word] - 1.0 + beta) / (topic.wordcount - 1.0 + beta)) *
                #((topic.docs.size - 1.0 + alpha) / (@doclist.size - topic.docs.size - 1.0 + alpha))

            end
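
            # A sketch of the quantity computed above (notation added here, not from
            # the original source): writing n_tw for topic.words[word], n_t for
            # topic.wordcount, and n_td for the number of this document's words
            # currently assigned to the topic, the return value is
            #
            #   ((n_tw - 1 + beta) / (n_t - 1 + beta)) * ((n_td - 1 + alpha) / (n_t - 1 + alpha))
            #
            # Both factors share the per-topic word count n_t in the denominator;
            # the commented-out alternate formula would instead normalize the
            # document factor by document counts.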

            def each_randomized_word_index(&call)
                @randomized_word_index.each(&call)
            end

            def lda_setup()
                @beta = 0.01
                @alpha = 1.0 #( @doclist.size / @topics.length ).to_f

                build_randomized_index_into_words()

                @word2topic = []
                @doc2topic = []

                each_randomized_word_index do |i|
                    topic = (@topics.size * rand).to_i

                    @word2topic[i] = topic # Record that this word goes to this topic.

                    @topics[topic].add(@wordlist[i], @word2doc[i])
                end

                @topic_change_rate.weight = 1.0 / @wordlist.size

            end
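
            # Assuming WeightedMovingAverage applies the usual exponential update
            # (value := (1 - weight) * value + weight * sample; inferred from the
            # name and constructor, not verified against weightedmovingaverage.rb),
            # setting weight = 1.0 / @wordlist.size means @topic_change_rate
            # averages the 0/1 "did this word move?" samples over roughly one full
            # sweep of the corpus, which is what lda_once logs below.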

            # Perform one pass of LDA over every word.
            def lda_once()
                each_randomized_word_index do |random_word_index|

                    random_word = @wordlist[random_word_index]
                    doc = @word2doc[random_word_index]

                    zdist = []
                    ztotal = 0.0 # Track the actual total in case the sum of zdist isn't quite 1.0.

                    # Compute the distribution over z for this word.
                    @topics.each do |topic|
                        z = p_of_z(topic, random_word, doc)
                        ztotal += z
                        zdist << z
                    end

                    r = rand * ztotal # Random value with which to pick a topic.
                    zacc = 0.0 # Accumulator of the values of zdist seen so far.
                    topic_i = (rand() * @topics.size).to_i

                    # Pick a topic, t. (The block parameter is deliberately not named
                    # topic_i: in Ruby 1.9+ a |topic_i| parameter would shadow the
                    # outer topic_i and the picked index would be lost on block exit.)
                    catch(:picked_topic) do
                        @topics.each_index do |i|
                            zacc += zdist[i]
                            if r < zacc
                                topic_i = i
                                throw :picked_topic
                            end
                        end
                    end

                    topic = @topics[topic_i]

                    previous_topic = @topics[@word2topic[random_word_index]]

                    # Skip the move if the source and destination topics are the same.
                    if @word2topic[random_word_index] == topic_i

                        @topic_change_rate.adjust(0.0)

                    else

                        # Adjust the topic change rate. This is how we track convergence:
                        # few topic moves (comparatively) and we're done.
                        @topic_change_rate.adjust(1.0)

                        # Remove the word from its previous topic.
                        previous_topic.remove(random_word, doc) if previous_topic.has_word_and_doc?(random_word, doc)

                        # Add the word to the chosen topic.
                        @word2topic[random_word_index] = topic_i # Record that this word goes to this topic.

                        topic.add(random_word, doc)

                    end
                end

                @logger.info { "Topic change rate: #{@topic_change_rate.value} Doc% #{@doc_prob_avg.value} Word% #{@word_prob_avg.value}" }
            end

            def lda(opts={})
                opts[:iterations] ||= @iterations

                unless (opts[:continue])
                    @logger.info("Setting up to run LDA.")
                    lda_setup()
                end

                opts[:iterations].times do |i|
                    @logger.info { "LDA Iteration #{i+1} / #{opts[:iterations]}" }
                    lda_once()
                end
            end

            # Takes {|topic| ... }
            def each_topic(&topicproc)
                @topics.each(&topicproc)
            end

            # Return a list of the top n SClust::Util::Word objects for the topic,
            # weighted by p_of_z.
            def get_top_words_for_topic(topic, n = 3)

                # List of Word objects carrying (weight, word, topic).
                tupleList = []

                topic.words.each_key do |word|
                    tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic => topic })
                end

                # The comparison is reversed so the list sorts in descending order.
                tupleList.sort! { |x, y| y.weight <=> x.weight }

                tupleList[0...n]

            end

            # Returns a list of lists: one list per topic, each holding that
            # topic's top n words.
            def get_max_terms(n=3)
                topics = []

                each_topic { |t| topics << get_top_words_for_topic(t, n) }

                topics
            end

            alias cluster lda

        end
    end
end
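
A minimal usage sketch of the LDA2 class above (the input strings are hypothetical,
and it assumes the SClust::Util::Word objects returned by get_max_terms expose a
#word accessor alongside the #weight used by the sort in get_top_words_for_topic):

    require 'rubygems'
    require 'sclust/lda/lda2'
    require 'sclust/util/doc'

    lda = SClust::LDA2::LDA2.new()
    lda.topics = 2

    [ "ruby makes scripting and text processing pleasant",
      "clustering groups similar documents by their terms" ].each do |text|
        lda << SClust::Util::Document.new(text, :ngrams => [1])
    end

    lda.lda(:iterations => 3) # or lda.cluster(...), via the alias

    lda.get_max_terms(3).each_with_index do |words, i|
        puts "Topic #{i}: " + words.map { |w| "#{w.word} (#{w.weight})" }.join(", ")
    end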

data/lib/sclust/util/doc.rb
@@ -0,0 +1,134 @@
#
# The MIT License
#
# Copyright (c) 2010 Samuel R. Baskinger
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

require 'sclust/util/filters'
require 'log4r'

module SClust
    module Util

        # A typical document representation that is backed by a body of text
        # but also breaks it up into a set of n-grams using a DocumentTokenizer
        # and a DocumentTermFilter.
        class Document

            # NOTE: at the class-body level self is already the Document class, so
            # the logger name comes from self.to_s (self.class.to_s would be "Class").
            @@logger = Log4r::Logger.new(self.to_s)
            @@logger.add('default')
            @@logger.level = Log4r::DEBUG

            attr_reader :terms, :userData, :filter, :word_count, :words, :text

            # Takes { :userData, :ngrams => [1,2,3], :filter => Filter, :term_limit => 100 }.
            # Also { :min_freq => minimum frequency, below which a term is removed from the document }
            # and { :max_freq => maximum frequency, above which a term is removed from the document }.
            def initialize(text, opts={})

                @text = text # The raw document. Never changed.
                @userData = opts[:userData]

                opts[:ngrams] ||= [ 1, 2, 3 ]
                opts[:filter] ||= DocumentTermFilter.new()
                opts[:tokenizer] ||= DocumentTokenizer.new()

                @words = opts[:tokenizer].apply(text).map { |word|
                    opts[:filter].apply(word) }.delete_if { |x| x.nil? or x =~ /^\s+$/ }

                @word_count = @words.size
                @terms = Hash.new(0)

                # Array of counts of grams built.
                builtGramCounts = []

                # Build a set of n-grams from our requested ngram range.
                opts[:ngrams].each do |n|

                    builtGramCounts[n] = 0

                    # For each word in our list...
                    @words.length.times do |j|

                        if ( n + j <= @words.length )

                            term = @words[j]

                            # Pick the number of appends based on how close to the end of the array we are.
                            (((@words.length > n + j) ? n : @words.length - j) - 1).times { |ngram| term += " #{@words[j + ngram + 1]}" }

                        end

                        @terms[term] += 1.0 if term

                        builtGramCounts[n] += 1

                    end
                end
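
                # A worked sketch of the loop above (example added here, not from
                # the original source): with @words = ["a", "b", "c"] and n = 2,
                # j = 0 builds the term "a b" and j = 1 builds "b c", while j = 2
                # builds nothing because n + j exceeds @words.length. With the
                # default :ngrams of [1, 2, 3], unigrams, bigrams, and trigrams
                # all accumulate in the same @terms hash.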

                if opts.key?(:min_freq) or opts.key?(:max_freq)
                    minwords = @words.size * ( opts[:min_freq] || 0.0 )
                    maxwords = @words.size * ( opts[:max_freq] || 1.0 )

                    #@@logger.debug { "Keeping terms between #{minwords} and #{maxwords} out of a total of #{@words.size}" }

                    @terms.delete_if do |term, freq|
                        if ( freq < minwords or freq > maxwords )
                            @words.delete_if { |x| term == x }
                            true
                        else
                            false
                        end
                    end

                    @word_count = @words.size
                end
            end

            # Delete any term for which the given block returns true.
            # Frequency information is never updated.
            def delete_term_if(&call)
                @terms.delete_if { |term, val| call.call(term) }
                @words.delete_if { |term| call.call(term) }
            end

            def term_count(term)
                @terms[term]
            end

            def term_frequency(term)
                @terms[term] / @words.size
            end

            alias tf term_frequency

            # Each term and its count are passed to the given block. Divide the
            # count by the total number of words to get the term frequency.
            def each_term(&call)
                terms.each { |k, v| yield(k, v) }
            end

            def has_term?(term)
                @terms.has_key?(term)
            end

        end
    end
end
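
A minimal usage sketch of Document (hypothetical input text; which terms survive
depends on the default DocumentTokenizer and DocumentTermFilter pulled in from
sclust/util/filters, e.g. stop-word removal):

    require 'rubygems'
    require 'sclust/util/doc'

    doc = SClust::Util::Document.new("the quick brown fox jumps over the lazy dog",
                                     :ngrams => [1, 2])

    doc.each_term { |term, count| puts "#{term}: #{count}" }

    puts doc.term_count("quick brown") # raw count of the bigram (0 if absent)
    puts doc.tf("fox")                 # term count divided by the word count
    puts doc.has_term?("fox")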