sclust 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,328 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+ require 'rubygems'
25
+ require 'sclust/util/word'
26
+ require 'sclust/util/doccol'
27
+ require 'log4r'
28
+ require 'sclust/util/weightedmovingaverage'
29
+
30
+ module SClust
31
+
32
+ # A second approach to using LDA on documents.
33
+ # This uses the tf-idf value to scale the probability of words being included (B value).
34
+ module LDA2
35
+
36
# Bookkeeping for a single LDA topic: which words and which documents are
# currently assigned to it, and how many word assignments it holds in total.
class Topic

  # words     - per-word assignment counts (SparseVector with default 0).
  # wordcount - total number of assignments; the sum of the values in words.
  # docs      - per-document assignment counts (SparseVector with default 0).
  attr_accessor :words, :wordcount, :docs

  def initialize
    @wordcount = 0
    @words = SClust::Util::SparseVector.new(0)
    @docs = SClust::Util::SparseVector.new(0)
  end

  # True when both the word and the document are present in this topic.
  def has_word_and_doc?(word, doc)
    @words.member?(word) && @docs.member?(doc)
  end

  # Record one more occurrence of word (coming from doc) in this topic.
  def add(word, doc)
    @wordcount += 1
    @words[word] += 1
    @docs[doc] += 1
  end

  # Take one occurrence of word (coming from doc) out of this topic.
  def remove(word, doc)
    @words[word] -= 1
    @wordcount -= 1

    # Per the original note, the sparse vector itself drops an entry that
    # reaches 0; only a count that went negative is deleted explicitly here.
    remaining = (@docs[doc] -= 1)
    @docs.delete(doc) if remaining < 0
  end
end
65
+
66
+ class LDA2
67
+
68
# Read-only: the document collection used for inverse document frequency
# values, and the topic list (assigned through the custom topics= writer).
attr_reader :document_collection, :topics

# Read/write: logger, default iteration count and the list of added documents.
attr_accessor :logger, :iterations, :doclist
72
+
73
# Build an empty model with 10 topics and 3 default iterations.
# Documents may be added after LDA is created, unlike k-mean clustering.
def initialize
  @iterations = 3
  @doclist = []
  @wordlist = []

  # Array the same size as @wordlist; index i holds the document object
  # that produced @wordlist[i].
  @word2doc = []

  @logger = Log4r::Logger.new(self.class.to_s)
  @logger.add('default')

  # Used for inverse document frequency values.
  @document_collection = SClust::Util::DocumentCollection.new

  # Moving averages reported at the end of every LDA sweep.
  @topic_change_rate, @word_prob_avg, @doc_prob_avg =
    Array.new(3) { SClust::Util::WeightedMovingAverage.new(0.05, 0.0) }

  self.topics = 10
end
93
+
94
# Set the topic count: @topics is replaced by an array holding +count+
# fresh, empty SClust::LDA2::Topic instances.
def topics=(count)
  @topics = []
  count.times { @topics.push(Topic.new) }
end
101
+
102
# Add a document to the collection backing this cluster.
#
# document - must be a SClust::Util::Document (anything responding to #words
#            with an Array works for the bookkeeping done here).
#
# Returns the number of words the document contributed.
def <<(document)
  @doclist << document
  @document_collection << document

  words = document.words

  # Append in place: `@wordlist += words` would copy the whole accumulated
  # word list again for every document that is added.
  @wordlist.concat(words)

  # Keep @word2doc parallel to @wordlist: one entry per appended word.
  @word2doc.concat(Array.new(words.size, document))

  words.size
end
113
+
114
# If you edit the document collection behind the scenes, you need to run this
# to avoid terms with 0 showing up. Every document currently held by
# @document_collection is re-added through <<, which rebuilds @doclist,
# @wordlist, @word2doc and a fresh DocumentCollection.
def rebuild_document_collection
  @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words."}
  @logger.info("Rebuilding document collection and word list.")

  # Grab the documents before the collection is thrown away.
  documents = @document_collection.doclist

  @doclist, @wordlist, @word2doc = [], [], []
  @document_collection = SClust::Util::DocumentCollection.new

  documents.each { |doc| self << doc }

  @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words."}
end
136
+
137
# Build a wordlist index array. This is an array that contains every index
# into @wordlist exactly once, but in random order instead of {0,1,2,3...},
# so that @wordlist is visited in a random order.
#
# Array#shuffle is used so that every ordering is equally likely; swapping
# each slot with an arbitrary slot of the whole array (the previous approach)
# favours some orderings over others.
#
# Returns the new @randomized_word_index.
def build_randomized_index_into_words
  @logger.info("Randomizing words.")

  @randomized_word_index = (0...@wordlist.size).to_a.shuffle
end
156
+
157
#
# Compute p(z_i|theta) * p(w|z_i,B).
#
# topic - the topic being scored (reads its words, docs and wordcount).
# word  - the word being scored.
# doc   - the document the word came from; when nil, the counts of all
#         documents in the topic are summed instead.
#
# Both factors are also fed into the @word_prob_avg / @doc_prob_avg moving
# averages. Returns the product of the two factors.
#
def p_of_z(topic, word, doc=nil)
  doc_words_in_topic =
    if doc.nil?
      topic.docs.reduce(0.0) { |sum, entry| sum + entry[1] }
    else
      topic.docs[doc]
    end

  word_factor = (topic.words[word] - 1.0 + @beta) / (topic.wordcount - 1.0 + @beta)
  doc_factor = (doc_words_in_topic - 1.0 + @alpha) / (topic.wordcount - 1.0 + @alpha)

  # Stop-gap protection for when the denominator gets wonky.
  word_factor = 0.0 if word_factor.nan? || word_factor < 0.0
  doc_factor = 0.0 if doc_factor.nan? || doc_factor < 0.0

  @word_prob_avg.adjust(word_factor)
  @doc_prob_avg.adjust(doc_factor)

  # Final result.
  doc_factor * word_factor
end
190
+
191
# Yield every entry of @randomized_word_index (indexes into @wordlist) to the
# given block, in the stored order.
def each_radomized_word_index(&call)
  @randomized_word_index.each(&call)
end
194
+
195
# Prepare a run: set the alpha/beta constants, build the randomized visiting
# order and assign every word occurrence to a randomly chosen topic.
#
# NOTE(review): the topics are not emptied here, so running setup again on an
# already populated model appears to add every word a second time - confirm
# whether callers are expected to reset the topics first.
def lda_setup
  @beta = 0.01
  @alpha = 1.0 #( @doclist.size / @topics.length ).to_f

  build_randomized_index_into_words

  @word2topic = []
  @doc2topic = []

  each_radomized_word_index do |word_index|
    topic_index = (@topics.size * rand).to_i

    # Record that this word goes to this topic.
    @word2topic[word_index] = topic_index
    @topics[topic_index].add(@wordlist[word_index], @word2doc[word_index])
  end

  @topic_change_rate.weight = 1.0 / @wordlist.size
end
215
+
216
# Perform 1 phase of lda: visit every word occurrence in the randomized
# order, sample a topic for it from the distribution given by p_of_z and
# move the word to that topic when it differs from the current one.
#
# @topic_change_rate is fed 1.0 for every move and 0.0 otherwise; it is how
# convergence is tracked (few moves, comparatively, and we're done).
def lda_once
  each_radomized_word_index do |word_index|

    word = @wordlist[word_index]
    doc = @word2doc[word_index]

    # Unnormalised distribution over z for this word. The actual total is
    # tracked because the values need not sum to 1.0.
    zdist = @topics.map { |topic| p_of_z(topic, word, doc) }
    ztotal = zdist.inject(0.0) { |sum, z| sum + z }

    r = rand * ztotal # Random value to pick topic with.

    # Used only when no cumulative value exceeds r (e.g. every z is 0.0).
    fallback_i = (rand * @topics.size).to_i

    # Pick the first topic whose cumulative weight exceeds r. The result is
    # taken from the return value of #index: a block parameter is local to
    # its block, so assigning the choice through one would lose it.
    zacc = 0.0
    topic_i = zdist.index { |z| (zacc += z) > r } || fallback_i

    previous_i = @word2topic[word_index]

    if previous_i == topic_i
      # Source and destination topic are the same: nothing moves.
      @topic_change_rate.adjust(0.0)
    else
      @topic_change_rate.adjust(1.0)

      # Remove word from previous topic.
      previous_topic = @topics[previous_i]
      previous_topic.remove(word, doc) if previous_topic.has_word_and_doc?(word, doc)

      # Add word to chosen topic and record the new assignment.
      @word2topic[word_index] = topic_i
      @topics[topic_i].add(word, doc)
    end
  end

  @logger.info { "Topic change rate: #{@topic_change_rate.value} Doc% #{ @doc_prob_avg.value} Word% #{ @word_prob_avg.value}" }
end
275
+
276
# Run LDA.
#
# opts - :iterations => number of sweeps to run (defaults to @iterations).
#        :continue   => when truthy, skip lda_setup and keep the current
#                       word/topic assignments.
#
# The opts hash is only read, never modified.
# Returns the number of iterations that were run.
def lda(opts={})
  iterations = opts[:iterations] || @iterations

  unless opts[:continue]
    @logger.info("Setting up to run LDA.")
    lda_setup
  end

  iterations.times do |i|
    @logger.info { "LDA Iteration #{i+1} / #{iterations}"}
    lda_once
  end
end
289
+
290
# Yield every topic in @topics in order. Takes {|topic| ... }
def each_topic(&topicproc)
  @topics.each(&topicproc)
end
294
+
295
# Return the n highest-weighted words of a topic.
#
# Every word of the topic is wrapped in a SClust::Util::Word built from the
# word, its p_of_z value (computed without a document) and { :topic => topic }.
# The list is ordered by descending weight and cut to its first n entries.
def get_top_words_for_topic(topic, n = 3)
  ranked = []

  topic.words.each_key do |word|
    weight = p_of_z(topic, word)
    ranked.push(SClust::Util::Word.new(word, weight, { :topic=>topic }))
  end

  # Operands are swapped on purpose so that the list sorts backwards.
  ranked.sort! { |a, b| b.weight <=> a.weight }

  ranked[0...n]
end
311
+
312
# Returns one list per topic, in topic order. Each of those lists is the
# result of get_top_words_for_topic for that topic, i.e. its n top words.
def get_max_terms(n=3)
  @topics.map { |topic| get_top_words_for_topic(topic, n) }
end
323
+
324
+ alias cluster lda
325
+
326
+ end
327
+ end
328
+ end
@@ -0,0 +1,134 @@
1
+ #
2
+ # The MIT License
3
+ #
4
+ # Copyright (c) 2010 Samuel R. Baskinger
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+ #
24
+
25
+ require 'sclust/util/filters'
26
+ require 'log4r'
27
+
28
+ module SClust
29
+ module Util
30
+
31
+ # A typical document representation that
32
+ # is backed by a body of text but also breaks it up into
33
+ # a set of n-grams using a DocumentTokenizer and a DocumentTermFilter.
34
+ class Document
35
+
36
# Class-wide logger. Inside a class body `self` is the Document class itself,
# so its own name is used; `self.class` would be Class here.
@@logger = Log4r::Logger.new(self.to_s)
@@logger.add('default')
@@logger.level = Log4r::DEBUG

# userData is the reader matching the @userData value stored by initialize.
attr_reader :terms, :userData, :filter, :word_count, :words, :text

# Keeps the historical, misspelled reader name working for existing callers.
alias_method :userDate, :userData
41
+
42
# Build a document from raw text.
#
# text - the raw document text; stored unchanged in @text.
# opts - :userData   => arbitrary caller data, stored in @userData.
#        :ngrams     => n-gram sizes to build terms for (default [1, 2, 3]).
#        :filter     => object whose #apply(word) returns the cleaned word or
#                       nil (default DocumentTermFilter.new).
#        :tokenizer  => object whose #apply(text) returns the words
#                       (default DocumentTokenizer.new).
#        :min_freq   => minimum frequency below which a term is removed.
#        :max_freq   => maximum frequency above which a term is removed.
#
# Missing :ngrams, :filter and :tokenizer entries are filled into opts.
def initialize(text, opts={})

  @text = text # The raw document. Never changed.
  @userData = opts[:userData]

  opts[:ngrams] ||= [ 1, 2, 3 ]
  opts[:filter] ||= DocumentTermFilter.new()
  opts[:tokenizer] ||= DocumentTokenizer.new()

  # Expose the filter that was actually used through the :filter reader.
  @filter = opts[:filter]

  # Tokenize, filter every word, then drop rejected (nil) and blank words.
  @words = opts[:tokenizer].apply(text).
    map { |word| opts[:filter].apply(word) }.
    reject { |word| word.nil? || word =~ /^\s+$/ }

  @word_count = @words.size
  @terms = Hash.new(0)

  # Count every run of n consecutive words as one space-joined term.
  # A size of 0 is counted like unigrams, as it always has been.
  opts[:ngrams].each do |n|
    @words.each_cons([n, 1].max) { |gram| @terms[gram.join(' ')] += 1.0 }
  end

  if opts.key?(:min_freq) || opts.key?(:max_freq)
    # The thresholds are fractions of the word count before filtering.
    minwords = @words.size * ( opts[:min_freq] || 0.0 )
    maxwords = @words.size * ( opts[:max_freq] || 1.0 )

    @terms.delete_if do |term, freq|
      next false unless freq < minwords || freq > maxwords

      # A dropped single-word term is removed from the word list as well.
      @words.delete_if { |word| term == word }
      true
    end

    # Keep the public word_count in step with the filtered word list.
    @word_count = @words.size
  end
end
106
+
107
# Remove every term, and every word, for which the given block returns true.
# Frequency information is never updated.
# Returns the (possibly shortened) word list.
def delete_term_if(&call)
  @terms.reject! { |term, _count| call.call(term) }
  @words.reject! { |word| call.call(word) }
  @words
end
112
+
113
# Number of times the term was counted in this document, as stored in
# @terms; a term that was never counted yields that hash's default of 0.
def term_count(term)
  @terms[term]
end
116
+
117
# Relative frequency of a term: its count divided by the number of words in
# the document.
#
# Always returns a Float; 0.0 when the document has no words, since an
# Integer 0 / 0 would otherwise raise ZeroDivisionError.
def term_frequency(term)
  return 0.0 if @words.empty?

  @terms[term].to_f / @words.size
end
120
+
121
+ alias tf term_frequency
122
+
123
# Pass each term and its count to the given block. Divide the count by the
# total number of words to get the term frequency.
def each_term(&call)
  terms.each_pair { |term, count| yield term, count }
end
127
+
128
# True when the term has an entry in @terms.
def has_term?(term)
  @terms.key?(term)
end
131
+
132
+ end
133
+ end
134
+ end