sclust 1.0.0 → 2.1.0

@@ -0,0 +1,328 @@
+ #
+ # The MIT License
+ #
+ # Copyright (c) 2010 Samuel R. Baskinger
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ # THE SOFTWARE.
+ #
+ require 'rubygems'
+ require 'sclust/util/word'
+ require 'sclust/util/doccol'
+ require 'log4r'
+ require 'sclust/util/weightedmovingaverage'
+
+ module SClust
+
+ # A second approach to using LDA on documents.
+ # This uses the tf-idf value to scale the probability of words being included (the beta value).
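+ #
+ # Typical flow (a summary of the methods below): construct an LDA2 instance,
+ # add SClust::Util::Document objects with <<, call lda(), then read the top
+ # terms per topic with get_max_terms().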
+ module LDA2
+
+ class Topic
+
+ attr_reader :words, :wordcount, :docs
+ attr_writer :words, :wordcount, :docs
+
+ def initialize()
+ @words = SClust::Util::SparseVector.new(0) # Hash count of words. Keys are indexes into @wordlist
+ #@words = Hash.new(0) # Hash count of words. Keys are indexes into @wordlist
+ @wordcount = 0 # Sum of values in @words.
+ @docs = SClust::Util::SparseVector.new(0)
+ #@docs = Hash.new(0) # Collection of documents. Hash is to eliminate duplicates.
+ end
+
+ def has_word_and_doc?(word, doc)
+ @words.member?(word) and @docs.member?(doc)
+ end
+
+ def add(word, doc)
+ @words[word] += 1
+ @wordcount += 1
+ @docs[doc] += 1
+ end
+
+ def remove(word, doc)
+ @words[word] -= 1
+ @wordcount -= 1
+ @docs.delete(doc) if (@docs[doc] -= 1) < 0 # NOTE: SparseVector deletes the entry when @docs[doc] == 0.
+ end
+ end
+
+ class LDA2
+
+ attr_reader :document_collection
+
+ attr_reader :logger, :iterations, :doclist, :topics
+ attr_writer :logger, :iterations, :doclist
+
+ # Documents may be added after the LDA object is created, unlike with k-means clustering.
+ def initialize()
+ @iterations = 3
+ @wordlist = []
+ @doclist = []
+ @logger = Log4r::Logger.new(self.class.to_s)
+ @logger.add('default')
+ @topic_change_rate = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
+ @word_prob_avg = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
+ @doc_prob_avg = SClust::Util::WeightedMovingAverage.new(0.05, 0.0)
+
+ # Used for inverse document frequency values.
+ @document_collection = SClust::Util::DocumentCollection.new()
+
+ # Array the same size as @wordlist but stores the document object at index i
+ # that produced @wordlist[i].
+ @word2doc = []
+
+ self.topics = 10
+ end
+
+ # Set the topic count and initialize the @topics array with empty SClust::LDA2::Topic instances.
+ def topics=(count)
+ @topics = []
+ count.times do |t|
+ @topics << Topic.new()
+ end
+ end
+
+ # Add a document to the collection backing this cluster. This must be a
+ # SClust::Util::Document.
+ def <<(document)
+ @doclist << document
+
+ @document_collection << document
+
+ @wordlist += document.words
+
+ document.words.size.times { @word2doc << document }
+ end
+
+ # If you edit the document collection behind the scenes, you need to run this to avoid
+ # terms with zero counts showing up.
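+ # For example, run this after calling Document#delete_term_if on documents
+ # that are already members of the collection.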
+ def rebuild_document_collection()
+
+ @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words." }
+ @logger.info("Rebuilding document collection and word list.")
+
+ dl = @document_collection.doclist
+
+ @doclist = []
+
+ @document_collection = SClust::Util::DocumentCollection.new()
+
+ @wordlist = []
+
+ @word2doc = []
+
+ dl.each { |doc| self << doc }
+
+ @logger.debug { "Collection now has #{@doclist.size} documents, #{@wordlist.size} words." }
+
+ end
+
+ # Build a wordlist index array. This is an array that contains indexes into @wordlist.
+ # However, instead of being simply {0,1,2,3...} this array is randomized so that
+ # we index into @wordlist in a random order.
+ def build_randomized_index_into_words()
+
+ @logger.info("Randomizing words.")
+
+ # Array#shuffle produces an unbiased random permutation of the indexes.
+ @randomized_word_index = (0...@wordlist.length).to_a.shuffle
+
+ end
+
+
+ #
+ # Compute p(z_i|theta) * p(w|z_i,B).
+ #
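+ # As implemented below, with alpha and beta as smoothing constants, this
+ # works out to (a sketch of what the code computes, not a textbook
+ # collapsed-Gibbs estimator):
+ #
+ #   ((n_wt - 1 + beta)  / (N_t - 1 + beta)) *
+ #   ((n_dt - 1 + alpha) / (N_t - 1 + alpha))
+ #
+ # where n_wt is the count of word w in topic t, n_dt the count of words
+ # from doc d in topic t, and N_t the total word count of topic t.
+ #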
+ def p_of_z(topic, word, doc=nil)
+
+ beta = @beta
+
+ # With no document given, sum word counts over all documents in the topic.
+ words_from_doc_in_topic = (doc.nil?) ?
+ topic.docs.reduce(0.0) { |sum, pair| sum + pair[1] } :
+ topic.docs[doc]
+
+ word_prob_avg = ((topic.words[word] - 1.0 + beta) / (topic.wordcount - 1.0 + beta))
+ doc_prob_avg = ((words_from_doc_in_topic - 1.0 + @alpha) / (topic.wordcount - 1.0 + @alpha))
+
+ # Stop-gap protection for when a denominator goes non-positive and the
+ # ratio comes out NaN or negative.
+ doc_prob_avg = 0.0 if doc_prob_avg.nan? || doc_prob_avg < 0.0
+ word_prob_avg = 0.0 if word_prob_avg.nan? || word_prob_avg < 0.0
+
+ @word_prob_avg.adjust(word_prob_avg)
+ @doc_prob_avg.adjust(doc_prob_avg)
+
+ # Final result.
+ doc_prob_avg * word_prob_avg
+
+ # Alternate formula. Denominator changed.
+ #((topic.words[word] - 1.0 + beta) / (topic.wordcount - 1.0 + beta ) ) *
+ #((topic.docs.size - 1.0 + alpha) / (@doclist.size - topic.docs.size - 1.0 + alpha ))
+
+ end
+
+ def each_radomized_word_index(&call)
+ @randomized_word_index.each &call
+ end
+
+ def lda_setup()
+ @beta = 0.01
+ @alpha = 1.0 #( @doclist.size / @topics.length ).to_f
+
+ build_randomized_index_into_words()
+
+ @word2topic = []
+ @doc2topic = []
+
+ each_radomized_word_index do |i|
+ topic = (@topics.size * rand).to_i
+
+ @word2topic[i] = topic # Record that this word goes to this topic.
+
+ @topics[topic].add(@wordlist[i], @word2doc[i])
+ end
+
+ @topic_change_rate.weight = 1.0 / @wordlist.size
+
+ end
+
+ # Perform 1 phase of lda.
+ def lda_once()
+ each_radomized_word_index do |random_word_index|
+
+ random_word = @wordlist[random_word_index]
+ doc = @word2doc[random_word_index]
+
+ zdist = []
+ ztotal = 0.0 # Track the actual total in case the sum of zdist isn't quite 1.0.
+
+ # Compute distribution over z for word i.
+ @topics.each do |topic|
+ z = p_of_z(topic, random_word, doc)
+ ztotal += z
+ zdist << z
+ end
+
+ r = rand * ztotal # Random value to pick topic with.
+ zacc = 0.0 # Accumulator of seen values of zdist[topic_i].
+ topic_i = (rand() * @topics.size).to_i # Fallback topic if r is never reached.
+
+ # Pick a topic by sampling from zdist. Assign the outer topic_i
+ # explicitly; block parameters are block-local in Ruby 1.9+.
+ catch(:picked_topic) do
+ @topics.each_index do |i|
+ zacc += zdist[i]
+ if r < zacc
+ topic_i = i
+ throw :picked_topic
+ end
+ end
+ end
+
+ topic = @topics[topic_i]
+
+ previous_topic = @topics[@word2topic[random_word_index]]
+
+ # Skip if src and dst topic are the same.
+ if @word2topic[random_word_index] == topic_i
+
+ @topic_change_rate.adjust(0.0) # Record that no move happened.
+
+ else
+
+ # Adjust the topic change rate. This is how we will track convergence.
+ # Few topic moves (comparatively) and we're done.
+ @topic_change_rate.adjust(1.0)
+
+ # Remove word from previous topic.
+ previous_topic.remove(random_word, doc) if previous_topic.has_word_and_doc?(random_word, doc)
+
+ # Add word to chosen topic.
+ @word2topic[random_word_index] = topic_i # Record that this word goes to this topic.
+
+ topic.add(random_word, doc)
+
+ end
+ end
+
+ @logger.info { "Topic change rate: #{@topic_change_rate.value} Doc% #{@doc_prob_avg.value} Word% #{@word_prob_avg.value}" }
+ end
+
+ def lda(opts={})
+ opts[:iterations] ||= @iterations
+
+ unless (opts[:continue])
+ @logger.info("Setting up to run LDA.")
+ lda_setup()
+ end
+
+ opts[:iterations].times do |i|
+ @logger.info { "LDA Iteration #{i+1} / #{opts[:iterations]}" }
+ lda_once()
+ end
+ end
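+
+ # Example usage (a sketch; assumes texts have already been wrapped in
+ # SClust::Util::Document objects):
+ #
+ #   lda = SClust::LDA2::LDA2.new()
+ #   lda.topics = 10
+ #   documents.each { |doc| lda << doc }
+ #   lda.lda(:iterations => 5)
+ #   lda.get_max_terms(3).each { |words| words.each { |w| puts w.inspect } }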
+
+ # Takes {|topic| ... }
+ def each_topic(&topicproc)
+ @topics.each &topicproc
+ end
+
+ # Return a list of the top n words for the given topic, as
+ # SClust::Util::Word objects carrying the word and its weight.
+ def get_top_words_for_topic(topic, n = 3)
+
+ # List of Word objects (weight, word, topic).
+ tupleList = []
+
+ topic.words.each_key do |word|
+ tupleList << SClust::Util::Word.new(word, p_of_z(topic, word), { :topic=>topic } )
+ end
+
+ # Reverse the comparison so the list sorts in descending order of weight.
+ tupleList.sort! { |x, y| y.weight <=> x.weight }
+
+ tupleList[0...n]
+
+ end
+
+ # Returns a list of lists: one list per topic, each containing the
+ # top n words for that topic as SClust::Util::Word objects.
+ def get_max_terms(n=3)
+ topics = []
+
+ each_topic { |t| topics << get_top_words_for_topic(t, n) }
+
+ topics
+ end
+
+ alias cluster lda
+
+ end
+ end
+ end
@@ -0,0 +1,134 @@
+ #
+ # The MIT License
+ #
+ # Copyright (c) 2010 Samuel R. Baskinger
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ # THE SOFTWARE.
+ #
+
+ require 'sclust/util/filters'
+ require 'log4r'
+
+ module SClust
+ module Util
+
+ # A typical document representation that
+ # is backed by a body of text but also breaks it up into
+ # a set of n-grams using a DocumentTokenizer and a DocumentTermFilter.
+ class Document
+
+ @@logger = Log4r::Logger.new(self.to_s)
+ @@logger.add('default')
+ @@logger.level = Log4r::DEBUG
+
+ attr_reader :terms, :userData, :filter, :word_count, :words, :text
+
+ # Takes { :userData, :ngrams => [1,2,3], :filter => Filter, :term_limit => 100 }
+ # also { :min_freq => minimum frequency below which a term is removed from the document }
+ # and { :max_freq => maximum frequency above which a term is removed from the document }
+ def initialize(text, opts={})
+
+ @text = text # The raw document. Never changed.
+ @userData = opts[:userData] # Arbitrary user-supplied data carried with the document.
+
+ opts[:ngrams] ||= [ 1, 2, 3 ]
+ opts[:filter] ||= DocumentTermFilter.new()
+ opts[:tokenizer] ||= DocumentTokenizer.new()
+
+ @filter = opts[:filter] # Expose the filter via attr_reader :filter.
+
+ @words = opts[:tokenizer].apply(text).map { |word|
+ opts[:filter].apply(word) }.delete_if { |x| x.nil? or x =~ /^\s+$/ }
+
+ @word_count = @words.size
+ @terms = Hash.new(0)
+
+ # Array of counts of grams built.
+ builtGramCounts = []
+
+ # Build a set of n-grams from our requested ngram range.
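+ # For example, with @words = ["a", "b", "c"] and opts[:ngrams] = [1, 2],
+ # @terms collects the terms "a", "b", "c", "a b", and "b c".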
+ opts[:ngrams].each do |n|
+
+ builtGramCounts[n] = 0
+
+ # Slide a window of n words across the document, joining each window
+ # into a single n-gram term. Positions too near the end of the word
+ # list to supply n words are skipped.
+ @words.length.times do |j|
+
+ next if n + j > @words.length
+
+ term = @words[j, n].join(' ')
+
+ @terms[term] += 1.0
+
+ builtGramCounts[n] += 1
+
+ end
+ end
+
+
+ if opts.key?(:min_freq) or opts.key?(:max_freq)
+ minwords = @words.size * ( opts[:min_freq] || 0.0 )
+ maxwords = @words.size * ( opts[:max_freq] || 1.0 )
+
+ #@@logger.debug { "Keeping terms between #{minwords} and #{maxwords} out of a total of #{@words.size}" }
+
+ @terms.delete_if do |term, freq|
+ if ( freq < minwords or freq > maxwords )
+ @words.delete_if { |x| term == x }
+ true
+ else
+ false
+ end
+ end
+
+ @word_count = @words.size
+ end
+ end
+
+ # Note that term frequency information is not recalculated after terms are deleted.
+ def delete_term_if(&call)
+ @terms.delete_if { |term, val| call.call(term) }
+ @words.delete_if { |term| call.call(term) }
+ end
+
+ def term_count(term)
+ @terms[term]
+ end
+
+ def term_frequency(term)
+ @terms[term] / @words.size
+ end
+
+ alias tf term_frequency
+
+ # Each term and its count are passed to the given block. Divide the count
+ # by the total number of words to get the term frequency.
+ def each_term(&call)
+ @terms.each { |term, count| call.call(term, count) }
+ end
+
+ def has_term?(term)
+ @terms.has_key?(term)
+ end
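+
+ # Example usage (a sketch; exactly which words survive depends on the
+ # default DocumentTokenizer and DocumentTermFilter):
+ #
+ #   doc = SClust::Util::Document.new("a b c a b", :ngrams => [1, 2])
+ #   doc.has_term?("a b")   # => true (if "a" and "b" pass the filter)
+ #   doc.term_count("a b")  # => 2.0
+ #   doc.tf("a b")          # => term count divided by the document word count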
+
+ end
+ end
+ end