ealdent-lda-ruby 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'lda-ruby'
8
+
9
+ class Test::Unit::TestCase
10
+ end
metadata CHANGED
@@ -1,64 +1,74 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ealdent-lda-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
- - Jason M. Adams
8
- - David M. Blei
7
+ - David Blei
8
+ - Jason Adams
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2009-07-19 00:00:00 -07:00
13
+ date: 2009-07-24 00:00:00 -07:00
14
14
  default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
17
- name: stemmer
18
- type: :runtime
19
- version_requirement:
20
- version_requirements: !ruby/object:Gem::Requirement
21
- requirements:
22
- - - ">="
23
- - !ruby/object:Gem::Version
24
- version: "0"
25
- version:
26
- description:
15
+ dependencies: []
16
+
17
+ description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
27
18
  email: jasonmadams@gmail.com
28
19
  executables: []
29
20
 
30
21
  extensions:
31
- - lib/extconf.rb
22
+ - ext/lda-ruby/extconf.rb
32
23
  extra_rdoc_files:
33
24
  - README
34
25
  - README.markdown
35
26
  files:
27
+ - .gitignore
36
28
  - README
29
+ - README.markdown
30
+ - Rakefile
37
31
  - VERSION.yml
38
- - lib/cokus.c
39
- - lib/cokus.h
40
- - lib/extconf.rb
41
- - lib/lda-alpha.c
42
- - lib/lda-alpha.h
43
- - lib/lda-data.c
44
- - lib/lda-data.h
45
- - lib/lda-inference.c
46
- - lib/lda-inference.h
47
- - lib/lda-model.c
48
- - lib/lda-model.h
49
- - lib/lda.h
50
- - lib/lda.rb
51
- - lib/utils.c
52
- - lib/utils.h
32
+ - ext/lda-ruby/Makefile
33
+ - ext/lda-ruby/cokus.c
34
+ - ext/lda-ruby/cokus.h
35
+ - ext/lda-ruby/extconf.rb
36
+ - ext/lda-ruby/lda-alpha.c
37
+ - ext/lda-ruby/lda-alpha.h
38
+ - ext/lda-ruby/lda-data.c
39
+ - ext/lda-ruby/lda-data.h
40
+ - ext/lda-ruby/lda-inference.c
41
+ - ext/lda-ruby/lda-inference.h
42
+ - ext/lda-ruby/lda-model.c
43
+ - ext/lda-ruby/lda-model.h
44
+ - ext/lda-ruby/lda.h
45
+ - ext/lda-ruby/utils.c
46
+ - ext/lda-ruby/utils.h
47
+ - lda-ruby.gemspec
48
+ - lib/lda-ruby.rb
49
+ - lib/lda-ruby/corpus/corpus.rb
50
+ - lib/lda-ruby/corpus/data_corpus.rb
51
+ - lib/lda-ruby/corpus/directory_corpus.rb
52
+ - lib/lda-ruby/corpus/text_corpus.rb
53
+ - lib/lda-ruby/document/data_document.rb
54
+ - lib/lda-ruby/document/document.rb
55
+ - lib/lda-ruby/document/text_document.rb
56
+ - lib/lda-ruby/vocabulary.rb
53
57
  - license.txt
54
- - README.markdown
58
+ - test/data/.gitignore
59
+ - test/data/docs.dat
60
+ - test/data/wiki-test-docs.yml
61
+ - test/lda_ruby_test.rb
62
+ - test/test_helper.rb
55
63
  has_rdoc: false
56
64
  homepage: http://github.com/ealdent/lda-ruby
65
+ licenses:
57
66
  post_install_message:
58
67
  rdoc_options:
59
68
  - --charset=UTF-8
60
69
  require_paths:
61
70
  - lib
71
+ - ext
62
72
  required_ruby_version: !ruby/object:Gem::Requirement
63
73
  requirements:
64
74
  - - ">="
@@ -74,9 +84,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
74
84
  requirements: []
75
85
 
76
86
  rubyforge_project:
77
- rubygems_version: 1.2.0
87
+ rubygems_version: 1.3.5
78
88
  signing_key:
79
89
  specification_version: 3
80
90
  summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
81
- test_files: []
82
-
91
+ test_files:
92
+ - test/lda_ruby_test.rb
93
+ - test/test_helper.rb
@@ -1,7 +0,0 @@
1
- require 'mkmf'
2
-
3
- $CFLAGS << ' -Wall -ggdb -O0'
4
- $defs.push( "-D USE_RUBY" )
5
-
6
- dir_config("lda_ext")
7
- create_makefile("lda_ext")
data/lib/lda.rb DELETED
@@ -1,319 +0,0 @@
1
- require 'set'
2
-
3
- module Lda
4
-
5
- #
6
- # Corpus class handles the data passed to the LDA algorithm.
7
- #
8
- class Corpus
9
- attr_reader :documents, :num_docs, :num_terms
10
-
11
- #
12
- # Create a blank corpus object. Either add documents to it
13
- # using the +add_document+ method or load the data from a file
14
- # using +load_from_file+.
15
- #
16
- def initialize(filename=nil)
17
- @documents = Array.new
18
- @all_terms = Set.new
19
- @num_terms = 0
20
- @num_docs = 0
21
-
22
- if filename
23
- self.load_from_file(filename)
24
- end
25
- end
26
-
27
- # Add a new document to the corpus. This can either be
28
- # an svmlight-style formatted line with the first element
29
- # being the number of words, or it can be a Document object.
30
- def add_document(doc)
31
- if doc.is_a?(Document)
32
- @documents << doc
33
- @all_terms += doc.words
34
- elsif doc.is_a?(String)
35
- d = Document.new(doc)
36
- @all_terms += d.words
37
- @documents << d
38
- end
39
- @num_docs += 1
40
- @num_terms = @all_terms.size
41
- true
42
- end
43
-
44
- # Populate this corpus from the data in the file.
45
- def load_from_file(filename)
46
- File.open(filename, 'r') do |f|
47
- f.each do |line|
48
- self.add_document(line)
49
- end
50
- end
51
- true
52
- end
53
- end
54
-
55
- class BaseDocument
56
- def words
57
- raise NotSupportedError
58
- end
59
-
60
- def length
61
- raise NotSupportedError
62
- end
63
-
64
- def total
65
- raise NotSupportedError
66
- end
67
- end
68
-
69
- #
70
- # A single document.
71
- #
72
- class Document < BaseDocument
73
- attr_accessor :words, :counts
74
- attr_reader :length, :total
75
-
76
- # Create the Document using the svmlight-style text line:
77
- #
78
- # num_words w1:freq1 w2:freq2 ... w_n:freq_n
79
- #
80
- # Ex.
81
- # 5 1:2 3:1 4:2 7:3 12:1
82
- #
83
- # The value for the number of words should equal the number of pairs
84
- # following it, though this isn't strictly enforced. Order of word-pair
85
- # indices is not important.
86
- #
87
- def initialize(doc_line=nil)
88
- if doc_line.is_a?(String)
89
- tmp = doc_line.split
90
- @words = Array.new
91
- @counts = Array.new
92
- @total = 0
93
- tmp.slice(1,tmp.size).each do |pair|
94
- tmp2 = pair.split(":")
95
- @words << tmp2[0].to_i
96
- @counts << tmp2[1].to_i
97
- end
98
- @length = @words.size
99
- @total = @counts.inject(0) {|sum, i| sum + i}
100
- else # doc_line == nil
101
- @words = Array.new
102
- @counts = Array.new
103
- @total = 0
104
- @length = 0
105
- end
106
- end
107
-
108
-
109
- #
110
- # Recompute the total and length values if the document has been
111
- # altered externally. This probably won't happen, but might be useful
112
- # if you want to subclass +Document+.
113
- #
114
- def recompute
115
- @total = @counts.inject(0) {|sum, i| sum + i}
116
- @length = @words.size
117
- end
118
- end
119
-
120
-
121
- class Lda
122
- attr_reader :vocab, :corpus
123
-
124
- #
125
- # Create a new LDA instance with the default settings.
126
- #
127
- def initialize
128
- self.load_default_settings
129
- @corpus = nil
130
- @vocab = nil
131
- @phi = nil
132
- end
133
-
134
- #
135
- # Load the default settings.
136
- # * max_iter = 20
137
- # * convergence = 1e-6
138
- # * em_max_iter = 100
139
- # * em_convergence = 1e-4
140
- # * num_topics = 20
141
- # * init_alpha = 0.3
142
- # * est_alpha = 1
143
- #
144
- def load_default_settings
145
- self.max_iter = 20
146
- self.convergence = 1e-6
147
- self.em_max_iter = 100
148
- self.em_convergence = 1e-4
149
- self.num_topics = 20
150
- self.init_alpha = 0.3
151
- self.est_alpha = 1
152
- nil
153
- end
154
-
155
-
156
- #
157
- # Load the corpus from file. The corpus is in svmlight-style where the
158
- # first element of each line is the number of words in the document and
159
- # then each element is the pair word_idx:weight.
160
- #
161
- # num_words word1:wgt1 word2:wgt2 ... word_n:wgt_n
162
- #
163
- # The value for the number of words should equal the number of pairs
164
- # following it, though this isn't strictly enforced in this method.
165
- #
166
- def load_corpus(filename)
167
- @corpus = Corpus.new
168
- @corpus.load_from_file(filename)
169
-
170
- true
171
- end
172
-
173
-
174
- #
175
- # Load the vocabulary file which is a list of words, one per line
176
- # where the line number corresponds the word list index. This allows
177
- # the words to be extracted for topics later.
178
- #
179
- # +vocab+ can either be the filename of the vocabulary file or the
180
- # array itself.
181
- #
182
- def load_vocabulary(vocab)
183
- if vocab.is_a?(Array)
184
- @vocab = Marshal::load(Marshal::dump(vocab)) # deep clone array
185
- else
186
- @vocab = File.open(vocab, 'r') { |f| f.read.split(/[\n\r]+/) }
187
- end
188
-
189
- true
190
- end
191
-
192
-
193
- #
194
- # Visualization method for printing out the top +words_per_topic+ words
195
- # for each topic.
196
- #
197
- # See also +top_words+.
198
- #
199
- def print_topics(words_per_topic=10)
200
- unless @vocab
201
- puts "No vocabulary loaded."
202
- return nil
203
- end
204
-
205
- beta = self.beta
206
- indices = (0..(@vocab.size - 1)).to_a
207
- topic_num = 0
208
- beta.each do |topic|
209
- indices.sort! {|x, y| -(topic[x] <=> topic[y])}
210
- outp = []
211
- puts "Topic #{topic_num}"
212
- words_per_topic.times do |i|
213
- outp << @vocab[indices[i]]
214
- end
215
- puts "\t" + outp.join("\n\t")
216
- puts ""
217
- topic_num += 1
218
- end
219
-
220
- nil
221
- end
222
-
223
- #
224
- # After the model has been run and a vocabulary has been loaded, return the
225
- # +words_per_topic+ top words chosen by the model for each topic. This is
226
- # returned as a hash mapping the topic number to an array of top words
227
- # (in descending order of importance).
228
- #
229
- # topic_number => [w1, w2, ..., w_n]
230
- #
231
- # See also +print_topics+.
232
- #
233
- def top_words(words_per_topic=10)
234
- unless @vocab
235
- puts "No vocabulary loaded."
236
- return nil
237
- end
238
-
239
- # find the highest scoring words per topic
240
- topics = Hash.new
241
- indices = (0...@vocab.size).to_a
242
-
243
- begin
244
- beta.each_with_index do |topic, topic_idx|
245
- indices.sort! {|x, y| -(topic[x] <=> topic[y])}
246
- topics[topic_idx] = indices.first(words_per_topic).map { |i| @vocab[i] }
247
- end
248
- rescue NoMethodError
249
- puts "Error: model has not been run."
250
- topics = nil
251
- end
252
-
253
- topics
254
- end
255
-
256
-
257
- #
258
- # Get the phi matrix which can be used to assign probabilities to words
259
- # belonging to a specific topic in each document. The return value is a
260
- # 3D matrix: num_docs x doc_length x num_topics. The value is cached
261
- # after the first call, so if it needs to be recomputed, set the +recompute+
262
- # value to true.
263
- #
264
- def phi(recompute=false)
265
- if not @phi or recompute
266
- # either the phi variable has not been instantiated or the recompute flag has been set
267
- @phi = self.compute_phi
268
- end
269
- @phi
270
- end
271
-
272
-
273
-
274
- #
275
- # Compute the average log probability for each topic for each document in the corpus.
276
- # This method returns a matrix: num_docs x num_topics with the average log probability
277
- # for the topic in the document.
278
- #
279
- def compute_topic_document_probability
280
- outp = Array.new
281
- @corpus.documents.each_with_index do |doc, idx|
282
- tops = [0.0] * self.num_topics
283
- ttl = doc.counts.inject(0.0) {|sum, i| sum + i}
284
- self.phi[idx].each_with_index do |word_dist, word_idx|
285
- word_dist.each_with_index do |top_prob, top_idx|
286
- tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
287
- end
288
- end
289
- tops = tops.map {|i| i / ttl}
290
- outp << tops
291
- end
292
-
293
- outp
294
- end
295
-
296
-
297
-
298
-
299
- #
300
- # String representation displaying current settings.
301
- #
302
- def to_s
303
- outp = []
304
- outp << "LDA Settings:"
305
- outp << " Initial alpha: %0.6f" % self.init_alpha
306
- outp << " # of topics: %d" % self.num_topics
307
- outp << " Max iterations: %d" % self.max_iter
308
- outp << " Convergence: %0.6f" % self.convergence
309
- outp << "EM max iterations: %d" % self.em_max_iter
310
- outp << " EM convergence: %0.6f" % self.em_convergence
311
- outp << " Estimate alpha: %d" % self.est_alpha
312
-
313
- return outp.join("\n")
314
- end
315
- end
316
- end
317
-
318
- # load the c-side stuff
319
- require 'lda_ext'