sclust 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
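The 1.x sources directly under lib/sclust/ are removed and reorganized into kmean/, lda/, and util/ subtrees, so 2.x require paths change. A minimal migration sketch, assuming the class names visible in the diffs below (SClust::Util::Document, SClust::LDA2::LDA2) and that the old top-level files mapped to same-named require paths; illustrative only, not the gem's documented API:

    require 'rubygems'
    require 'sclust/lda/lda2'   # 1.0.0 shipped flat paths such as sclust/cluster, sclust/doc
    require 'sclust/util/doc'

    lda = SClust::LDA2::LDA2.new()
    lda << SClust::Util::Document.new("text to model ...")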
data/lib/sclust/lda/lda2.rb
@@ -0,0 +1,328 @@
+#
+# The MIT License
+#
+# Copyright (c) 2010 Samuel R. Baskinger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+require 'rubygems'
+require 'sclust/util/word'
+require 'sclust/util/doccol'
+require 'log4r'
+require 'sclust/util/weightedmovingaverage'
+
+module SClust
+
+    # A second approach to using LDA on documents.
+    # This uses the tf-idf value to scale the probability of words being included (B value).
+    module LDA2
+
+        class Topic
+
+            attr_reader :words, :wordcount, :docs
+            attr_writer :words, :wordcount, :docs
+
+            def initialize()
+                @words = SClust::Util::SparseVector.new(0) # Hash count of words. Keys are indexes into @wordlist
+                #@words = Hash.new(0) # Hash count of words. Keys are indexes into @wordlist
+                @wordcount = 0 # Sum of values in @words.
+                @docs = SClust::Util::SparseVector.new(0)
+                #@docs = Hash.new(0) # Collection of documents. Hash is to eliminate duplicates.
+            end
+
+            def has_word_and_doc?(word, doc)
+                @words.member?(word) and @docs.member?(doc)
+            end
+
+            def add(word, doc)
+                @words[word] += 1
+                @wordcount += 1
+                @docs[doc] += 1
+            end
+
+            def remove(word, doc)
+                @words[word] -= 1
+                @wordcount -= 1
+                @docs.delete(doc) if (@docs[doc] -= 1) < 0 # NOTE: SparseVector deletes when @docs[doc] == 0.
+            end
+        end
+
+        class LDA2
+
+            attr_reader :document_collection
+
+            attr_reader :logger, :iterations, :doclist, :topics
+            attr_writer :logger, :iterations, :doclist
+
+            # Documents may be added after LDA is created, unlike k-mean clustering.
+            def initialize()
+                @iterations = 3
+                @wordlist = []
+                @doclist = []
+                @logger = Log4r::Logger.new(self.class.to_s)
+                @logger.add('default')
+                @topic_change_rate = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
+                @word_prob_avg = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
+                @doc_prob_avg = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
+
+                # Used for inverse document frequency values.
+                @document_collection = SClust::Util::DocumentCollection.new()
+
+                # Array the same size as @wordlist but stores the document object at index i
+                # that produced @wordlist[i].
+                @word2doc = []
+
+                self.topics = 10
+            end
+
+            # Set the topic count and initialize the @topics array with empty SClust::LDA2::Topic instances.
+            def topics=(count)
+                @topics = []
+                count.times do |t|
+                    @topics << Topic.new()
+                end
+            end
+
+            # Add a document to the collection backing this cluster. This must be a
+            # SClust::Util::Document.
+            def <<(document)
+                @doclist << document
+
+                @document_collection << document
+
+                @wordlist += document.words
+
+                document.words.size.times { @word2doc << document }
+            end
+
+            # If you edit the document collection behind the scenes, you need to run this to avoid
+            # terms with 0 showing up.
+            def rebuild_document_collection()
+
+                @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words." }
+                @logger.info("Rebuilding document collection and word list.")
+
+                dl = @document_collection.doclist
+
+                @doclist = []
+
+                @document_collection = SClust::Util::DocumentCollection.new()
+
+                @wordlist = []
+
+                @word2doc = []
+
+                dl.each { |doc| self << doc }
+
+                @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words." }
+
+            end
+
+            # Build a wordlist index array. This is an array that contains indexes into @wordlist.
+            # However, instead of being simply {0,1,2,3...} this array is randomized so that
+            # we index into @wordlist in a random order.
+            def build_randomized_index_into_words()
+
+                @logger.info("Randomizing words.")
+
+                @randomized_word_index = []
+
+                @wordlist.each_index { |i| @randomized_word_index << i }
+
+                @wordlist.each_index do |i|
+                    new_home = (@wordlist.length * rand).to_i
+                    tmp = @randomized_word_index[i]
+                    @randomized_word_index[i] = @randomized_word_index[new_home]
+                    @randomized_word_index[new_home] = tmp
+                end
+
+            end
+
+            #
+            # Compute p(z_i|theta) * p(w|z_i,B).
+            #
+            def p_of_z(topic, word, doc=nil)
+
+                beta = @beta
+
+                # Count of words from doc assigned to this topic (summed over all docs when doc is nil).
+                words_from_doc_in_topic = (doc.nil?) ?
+                    topic.docs.reduce(0.0) { |x, num| x + num[1] } :
+                    topic.docs[doc]
+
+                word_prob_avg = ((topic.words[word] - 1.0 + beta) / (topic.wordcount - 1.0 + beta))
+                doc_prob_avg = ((words_from_doc_in_topic - 1.0 + @alpha) / (topic.wordcount - 1.0 + @alpha))
+
+                # Stop-gap protection for when the denominator gets wonky.
+                doc_prob_avg = 0.0 if doc_prob_avg.nan? || doc_prob_avg < 0.0
+                word_prob_avg = 0.0 if word_prob_avg.nan? || word_prob_avg < 0.0
+
+                @word_prob_avg.adjust(word_prob_avg)
+                @doc_prob_avg.adjust(doc_prob_avg)
+
+                # Final result.
+                doc_prob_avg * word_prob_avg
+
+                # Alternate formula with a different denominator:
+                #((topic.words[word] - 1.0 + beta) / (topic.wordcount - 1.0 + beta)) *
+                #((topic.docs.size - 1.0 + alpha) / (@doclist.size - topic.docs.size - 1.0 + alpha))
+
+            end
+
+            def each_randomized_word_index(&call)
+                @randomized_word_index.each &call
+            end
+
+            def lda_setup()
+                @beta = 0.01
+                @alpha = 1.0 #( @doclist.size / @topics.length ).to_f
+
+                build_randomized_index_into_words()
+
+                @word2topic = []
+                @doc2topic = []
+
+                each_randomized_word_index do |i|
+                    topic = (@topics.size * rand).to_i
+
+                    @word2topic[i] = topic # Record that this word goes to this topic.
+
+                    @topics[topic].add(@wordlist[i], @word2doc[i])
+                end
+
+                @topic_change_rate.weight = 1.0 / @wordlist.size
+
+            end
+
+            # Perform one iteration of LDA.
+            def lda_once()
+                each_randomized_word_index do |random_word_index|
+
+                    random_word = @wordlist[random_word_index]
+                    doc = @word2doc[random_word_index]
+
+                    zdist = []
+                    ztotal = 0.0 # Track the actual total in case the sum of zdist isn't quite 1.0.
+
+                    # Compute the distribution over z for word i.
+                    @topics.each do |topic|
+                        z = p_of_z(topic, random_word, doc)
+                        ztotal += z
+                        zdist << z
+                    end
+
+                    r = rand * ztotal # Random value to pick a topic with.
+                    zacc = 0.0        # Accumulator of seen values of zdist[i].
+
+                    # Pick a topic, t. The throw returns the chosen index from the catch
+                    # block; fall back to a random topic if rounding error keeps r from
+                    # ever being reached by the accumulator.
+                    topic_i = catch(:picked_topic) do
+                        @topics.each_index do |i|
+                            zacc += zdist[i]
+                            throw(:picked_topic, i) if r < zacc
+                        end
+                        (rand() * @topics.size).to_i
+                    end
+
+                    topic = @topics[topic_i]
+
+                    previous_topic = @topics[@word2topic[random_word_index]]
+
+                    # Skip if the source and destination topics are the same.
+                    if @word2topic[random_word_index] == topic_i
+
+                        @topic_change_rate.adjust(0.0) # No move happened.
+
+                    else
+
+                        # Adjust the topic change rate. This is how we track convergence:
+                        # comparatively few topic moves means we are done.
+                        @topic_change_rate.adjust(1.0)
+
+                        # Remove the word from its previous topic.
+                        previous_topic.remove(random_word, doc) if previous_topic.has_word_and_doc?(random_word, doc)
+
+                        # Add the word to the chosen topic.
+                        @word2topic[random_word_index] = topic_i # Record that this word goes to this topic.
+
+                        topic.add(random_word, doc)
+
+                    end
+                end
+
+                @logger.info { "Topic change rate: #{@topic_change_rate.value} Doc% #{@doc_prob_avg.value} Word% #{@word_prob_avg.value}" }
+            end
+
+            def lda(opts={})
+                opts[:iterations] ||= @iterations
+
+                unless (opts[:continue])
+                    @logger.info("Setting up to run LDA.")
+                    lda_setup()
+                end
+
+                opts[:iterations].times do |i|
+                    @logger.info { "LDA Iteration #{i+1} / #{opts[:iterations]}" }
+                    lda_once()
+                end
+            end
+
+            # Takes {|topic| ... }
+            def each_topic(&topicproc)
+                @topics.each &topicproc
+            end
+
+            # Returns a list of the top n words for the topic, as SClust::Util::Word
+            # values carrying the word, its weight z, and the topic.
+            def get_top_words_for_topic(topic, n = 3)
+
+                # List of (z, topic, word).
+                tupleList = []
+
+                topic.words.each_key do |word|
+                    tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic })
+                end
+
+                # Reverse the comparison so the list sorts in descending weight order.
+                tupleList.sort! { |x, y| y.weight <=> x.weight }
+
+                tupleList[0...n]
+
+            end
+
+            # Returns a list of lists, one per topic.
+            # Each inner list holds the top n words for that topic:
+            # [ [ z, word, topic ], ... ]
+            def get_max_terms(n=3)
+                topics = []
+
+                each_topic { |t| topics << get_top_words_for_topic(t, n) }
+
+                topics
+            end
+
+            alias cluster lda
+
+        end
+    end
+end
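Taken together, the class above is driven in four steps: construct, set the topic count with topics=, shovel SClust::Util::Document instances in with <<, then run lda (aliased as cluster) and read results with get_max_terms. A hedged sketch of that flow; `corpus` stands for any enumerable of strings, and the #word/#weight readers on SClust::Util::Word are assumed from its constructor arguments and the sort in get_top_words_for_topic:

    require 'sclust/lda/lda2'
    require 'sclust/util/doc'

    lda = SClust::LDA2::LDA2.new()
    lda.topics = 3                                 # re-initializes the Topic list

    corpus.each { |text| lda << SClust::Util::Document.new(text) }

    lda.cluster(:iterations => 10)                 # cluster is an alias for lda()
    lda.lda(:iterations => 5, :continue => true)   # resume without re-running lda_setup

    # One inner list of up to 3 Word values per topic.
    lda.get_max_terms(3).each_with_index do |words, i|
        puts "Topic #{i}: " + words.map { |w| "#{w.word} (#{w.weight})" }.join(', ')
    end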
data/lib/sclust/util/doc.rb
@@ -0,0 +1,134 @@
+#
+# The MIT License
+#
+# Copyright (c) 2010 Samuel R. Baskinger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+
+require 'sclust/util/filters'
+require 'log4r'
+
+module SClust
+    module Util
+
+        # A typical document representation that
+        # is backed by a body of text but also breaks it up into
+        # a set of n-grams using a DocumentTokenizer and a DocumentTermFilter.
+        class Document
+
+            @@logger = Log4r::Logger.new(self.to_s) # self is the Document class here.
+            @@logger.add('default')
+            @@logger.level = Log4r::DEBUG
+
+            attr_reader :terms, :userData, :filter, :word_count, :words, :text
+
+            # Takes { :userData, :ngrams => [1,2,3], :filter => Filter, :term_limit => 100 }
+            # also { :min_freq => minimum frequency below which a term is removed from the document }
+            # also { :max_freq => maximum frequency above which a term is removed from the document }
+            def initialize(text, opts={})
+
+                @text = text # The raw document. Never changed.
+                @userData = opts[:userData]
+
+                opts[:ngrams] ||= [ 1, 2, 3 ]
+                opts[:filter] ||= DocumentTermFilter.new()
+                opts[:tokenizer] ||= DocumentTokenizer.new()
+
+                @words = opts[:tokenizer].apply(text).map { |word|
+                    opts[:filter].apply(word) }.delete_if { |x| x.nil? or x =~ /^\s+$/ }
+
+                @word_count = @words.size
+                @terms = Hash.new(0)
+
+                # Array of counts of grams built.
+                builtGramCounts = []
+
+                # Build a set of n-grams from our requested ngram range.
+                opts[:ngrams].each do |n|
+
+                    builtGramCounts[n] = 0
+
+                    # For each word in our list...
+                    @words.length.times do |j|
+
+                        # Only build an n-gram when n words remain from position j.
+                        next unless n + j <= @words.length
+
+                        term = @words[j]
+
+                        # Pick the number of joins based on how close to the end of the array we are.
+                        ((( @words.length > n+j ) ? n : @words.length-j)-1).times { |ngram| term += " #{@words[j+ngram+1]}" }
+
+                        @terms[term] += 1.0
+
+                        builtGramCounts[n] += 1
+
+                    end
+                end
+
+                if opts.key?(:min_freq) or opts.key?(:max_freq)
+                    minwords = @words.size * ( opts[:min_freq] || 0.0 )
+                    maxwords = @words.size * ( opts[:max_freq] || 1.0 )
+
+                    #@@logger.debug { "Keeping terms between #{minwords} and #{maxwords} out of a total of #{@words.size}" }
+
+                    @terms.delete_if do |term, freq|
+                        if ( freq < minwords or freq > maxwords )
+                            @words.delete_if { |x| term == x }
+                            true
+                        else
+                            false
+                        end
+                    end
+
+                    @word_count = @words.size
+                end
+            end
+
+            # Frequency information is never updated.
+            def delete_term_if(&call)
+                @terms.delete_if { |term, val| call.call(term) }
+                @words.delete_if { |term| call.call(term) }
+            end
+
+            def term_count(term)
+                @terms[term]
+            end
+
+            def term_frequency(term)
+                @terms[term] / @words.size
+            end
+
+            alias tf term_frequency
+
+            # Each term and its count are passed to the given block. Divide the count
+            # by the total number of words to get the term frequency.
+            def each_term(&call)
+                terms.each { |k,v| yield(k, v) }
+            end
+
+            def has_term?(term)
+                @terms.has_key?(term)
+            end
+
+        end
+    end
+end
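For completeness, a small sketch of Document on its own. The default DocumentTokenizer and DocumentTermFilter come from sclust/util/filters (also added in this release) and may stem words or drop stopwords, so the exact term keys depend on those filters; the option values shown are illustrative:

    require 'sclust/util/doc'

    doc = SClust::Util::Document.new(
        "the quick brown fox jumps over the lazy dog",
        :ngrams   => [1, 2],    # build unigrams and bigrams
        :min_freq => 0.0,       # keep every term at the low end
        :max_freq => 1.0)       # keep every term at the high end

    doc.each_term { |term, count| puts "#{term}: #{count}" }
    doc.tf("fox")                 # term_frequency: count / word count
    doc.has_term?("quick brown")  # n-gram keys are space-joined words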