ealdent-lda-ruby 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/README +5 -6
- data/README.markdown +8 -9
- data/Rakefile +58 -0
- data/VERSION.yml +2 -2
- data/ext/lda-ruby/Makefile +181 -0
- data/{lib → ext/lda-ruby}/cokus.c +0 -0
- data/{lib → ext/lda-ruby}/cokus.h +0 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/{lib → ext/lda-ruby}/lda-alpha.c +0 -0
- data/{lib → ext/lda-ruby}/lda-alpha.h +0 -0
- data/{lib → ext/lda-ruby}/lda-data.c +0 -0
- data/{lib → ext/lda-ruby}/lda-data.h +0 -0
- data/{lib → ext/lda-ruby}/lda-inference.c +43 -44
- data/{lib → ext/lda-ruby}/lda-inference.h +0 -0
- data/{lib → ext/lda-ruby}/lda-model.c +18 -3
- data/{lib → ext/lda-ruby}/lda-model.h +0 -0
- data/{lib → ext/lda-ruby}/lda.h +0 -0
- data/{lib → ext/lda-ruby}/utils.c +0 -0
- data/{lib → ext/lda-ruby}/utils.h +0 -0
- data/lda-ruby.gemspec +74 -0
- data/lib/lda-ruby.rb +157 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +32 -0
- data/lib/lda-ruby/vocabulary.rb +39 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +47 -36
- data/lib/extconf.rb +0 -7
- data/lib/lda.rb +0 -319
data/test/test_helper.rb
ADDED
metadata
CHANGED
@@ -1,64 +1,74 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ealdent-lda-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
8
|
-
-
|
7
|
+
- David Blei
|
8
|
+
- Jason Adams
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2009-07-
|
13
|
+
date: 2009-07-24 00:00:00 -07:00
|
14
14
|
default_executable:
|
15
|
-
dependencies:
|
16
|
-
|
17
|
-
|
18
|
-
type: :runtime
|
19
|
-
version_requirement:
|
20
|
-
version_requirements: !ruby/object:Gem::Requirement
|
21
|
-
requirements:
|
22
|
-
- - ">="
|
23
|
-
- !ruby/object:Gem::Version
|
24
|
-
version: "0"
|
25
|
-
version:
|
26
|
-
description:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
|
27
18
|
email: jasonmadams@gmail.com
|
28
19
|
executables: []
|
29
20
|
|
30
21
|
extensions:
|
31
|
-
-
|
22
|
+
- ext/lda-ruby/extconf.rb
|
32
23
|
extra_rdoc_files:
|
33
24
|
- README
|
34
25
|
- README.markdown
|
35
26
|
files:
|
27
|
+
- .gitignore
|
36
28
|
- README
|
29
|
+
- README.markdown
|
30
|
+
- Rakefile
|
37
31
|
- VERSION.yml
|
38
|
-
-
|
39
|
-
-
|
40
|
-
-
|
41
|
-
-
|
42
|
-
-
|
43
|
-
-
|
44
|
-
-
|
45
|
-
-
|
46
|
-
-
|
47
|
-
-
|
48
|
-
-
|
49
|
-
-
|
50
|
-
-
|
51
|
-
-
|
52
|
-
-
|
32
|
+
- ext/lda-ruby/Makefile
|
33
|
+
- ext/lda-ruby/cokus.c
|
34
|
+
- ext/lda-ruby/cokus.h
|
35
|
+
- ext/lda-ruby/extconf.rb
|
36
|
+
- ext/lda-ruby/lda-alpha.c
|
37
|
+
- ext/lda-ruby/lda-alpha.h
|
38
|
+
- ext/lda-ruby/lda-data.c
|
39
|
+
- ext/lda-ruby/lda-data.h
|
40
|
+
- ext/lda-ruby/lda-inference.c
|
41
|
+
- ext/lda-ruby/lda-inference.h
|
42
|
+
- ext/lda-ruby/lda-model.c
|
43
|
+
- ext/lda-ruby/lda-model.h
|
44
|
+
- ext/lda-ruby/lda.h
|
45
|
+
- ext/lda-ruby/utils.c
|
46
|
+
- ext/lda-ruby/utils.h
|
47
|
+
- lda-ruby.gemspec
|
48
|
+
- lib/lda-ruby.rb
|
49
|
+
- lib/lda-ruby/corpus/corpus.rb
|
50
|
+
- lib/lda-ruby/corpus/data_corpus.rb
|
51
|
+
- lib/lda-ruby/corpus/directory_corpus.rb
|
52
|
+
- lib/lda-ruby/corpus/text_corpus.rb
|
53
|
+
- lib/lda-ruby/document/data_document.rb
|
54
|
+
- lib/lda-ruby/document/document.rb
|
55
|
+
- lib/lda-ruby/document/text_document.rb
|
56
|
+
- lib/lda-ruby/vocabulary.rb
|
53
57
|
- license.txt
|
54
|
-
-
|
58
|
+
- test/data/.gitignore
|
59
|
+
- test/data/docs.dat
|
60
|
+
- test/data/wiki-test-docs.yml
|
61
|
+
- test/lda_ruby_test.rb
|
62
|
+
- test/test_helper.rb
|
55
63
|
has_rdoc: false
|
56
64
|
homepage: http://github.com/ealdent/lda-ruby
|
65
|
+
licenses:
|
57
66
|
post_install_message:
|
58
67
|
rdoc_options:
|
59
68
|
- --charset=UTF-8
|
60
69
|
require_paths:
|
61
70
|
- lib
|
71
|
+
- ext
|
62
72
|
required_ruby_version: !ruby/object:Gem::Requirement
|
63
73
|
requirements:
|
64
74
|
- - ">="
|
@@ -74,9 +84,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
74
84
|
requirements: []
|
75
85
|
|
76
86
|
rubyforge_project:
|
77
|
-
rubygems_version: 1.
|
87
|
+
rubygems_version: 1.3.5
|
78
88
|
signing_key:
|
79
89
|
specification_version: 3
|
80
90
|
summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
|
81
|
-
test_files:
|
82
|
-
|
91
|
+
test_files:
|
92
|
+
- test/lda_ruby_test.rb
|
93
|
+
- test/test_helper.rb
|
data/lib/extconf.rb
DELETED
data/lib/lda.rb
DELETED
@@ -1,319 +0,0 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
|
-
module Lda
|
4
|
-
|
5
|
-
#
|
6
|
-
# Corpus class handles the data passed to the LDA algorithm.
|
7
|
-
#
|
8
|
-
class Corpus
|
9
|
-
attr_reader :documents, :num_docs, :num_terms
|
10
|
-
|
11
|
-
#
|
12
|
-
# Create a blank corpus object. Either add documents to it
|
13
|
-
# using the +add_document+ method or load the data from a file
|
14
|
-
# using +load_from_file+.
|
15
|
-
#
|
16
|
-
def initialize(filename=nil)
|
17
|
-
@documents = Array.new
|
18
|
-
@all_terms = Set.new
|
19
|
-
@num_terms = 0
|
20
|
-
@num_docs = 0
|
21
|
-
|
22
|
-
if filename
|
23
|
-
self.load_from_file(filename)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# Add a new document to the corpus. This can either be
|
28
|
-
# an svmlight-style formatted line with the first element
|
29
|
-
# being the number of words, or it can be a Document object.
|
30
|
-
def add_document(doc)
|
31
|
-
if doc.is_a?(Document)
|
32
|
-
@documents << doc
|
33
|
-
@all_terms += doc.words
|
34
|
-
elsif doc.is_a?(String)
|
35
|
-
d = Document.new(doc)
|
36
|
-
@all_terms += d.words
|
37
|
-
@documents << d
|
38
|
-
end
|
39
|
-
@num_docs += 1
|
40
|
-
@num_terms = @all_terms.size
|
41
|
-
true
|
42
|
-
end
|
43
|
-
|
44
|
-
# Populate this corpus from the data in the file.
|
45
|
-
def load_from_file(filename)
|
46
|
-
File.open(filename, 'r') do |f|
|
47
|
-
f.each do |line|
|
48
|
-
self.add_document(line)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
true
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
class BaseDocument
|
56
|
-
def words
|
57
|
-
raise NotSupportedError
|
58
|
-
end
|
59
|
-
|
60
|
-
def length
|
61
|
-
raise NotSupportedError
|
62
|
-
end
|
63
|
-
|
64
|
-
def total
|
65
|
-
raise NotSupportedError
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
|
-
#
|
70
|
-
# A single document.
|
71
|
-
#
|
72
|
-
class Document < BaseDocument
|
73
|
-
attr_accessor :words, :counts
|
74
|
-
attr_reader :length, :total
|
75
|
-
|
76
|
-
# Create the Document using the svmlight-style text line:
|
77
|
-
#
|
78
|
-
# num_words w1:freq1 w2:freq2 ... w_n:freq_n
|
79
|
-
#
|
80
|
-
# Ex.
|
81
|
-
# 5 1:2 3:1 4:2 7:3 12:1
|
82
|
-
#
|
83
|
-
# The value for the number of words should equal the number of pairs
|
84
|
-
# following it, though this isn't strictly enforced. Order of word-pair
|
85
|
-
# indices is not important.
|
86
|
-
#
|
87
|
-
def initialize(doc_line=nil)
|
88
|
-
if doc_line.is_a?(String)
|
89
|
-
tmp = doc_line.split
|
90
|
-
@words = Array.new
|
91
|
-
@counts = Array.new
|
92
|
-
@total = 0
|
93
|
-
tmp.slice(1,tmp.size).each do |pair|
|
94
|
-
tmp2 = pair.split(":")
|
95
|
-
@words << tmp2[0].to_i
|
96
|
-
@counts << tmp2[1].to_i
|
97
|
-
end
|
98
|
-
@length = @words.size
|
99
|
-
@total = @counts.inject(0) {|sum, i| sum + i}
|
100
|
-
else # doc_line == nil
|
101
|
-
@words = Array.new
|
102
|
-
@counts = Array.new
|
103
|
-
@total = 0
|
104
|
-
@length = 0
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
|
109
|
-
#
|
110
|
-
# Recompute the total and length values if the document has been
|
111
|
-
# altered externally. This probably won't happen, but might be useful
|
112
|
-
# if you want to subclass +Document+.
|
113
|
-
#
|
114
|
-
def recompute
|
115
|
-
@total = @counts.inject(0) {|sum, i| sum + i}
|
116
|
-
@length = @words.size
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
|
121
|
-
class Lda
|
122
|
-
attr_reader :vocab, :corpus
|
123
|
-
|
124
|
-
#
|
125
|
-
# Create a new LDA instance with the default settings.
|
126
|
-
#
|
127
|
-
def initialize
|
128
|
-
self.load_default_settings
|
129
|
-
@corpus = nil
|
130
|
-
@vocab = nil
|
131
|
-
@phi = nil
|
132
|
-
end
|
133
|
-
|
134
|
-
#
|
135
|
-
# Load the default settings.
|
136
|
-
# * max_iter = 20
|
137
|
-
# * convergence = 1e-6
|
138
|
-
# * em_max_iter = 100
|
139
|
-
# * em_convergence = 1e-4
|
140
|
-
# * num_topics = 20
|
141
|
-
# * init_alpha = 0.3
|
142
|
-
# * est_alpha = 1
|
143
|
-
#
|
144
|
-
def load_default_settings
|
145
|
-
self.max_iter = 20
|
146
|
-
self.convergence = 1e-6
|
147
|
-
self.em_max_iter = 100
|
148
|
-
self.em_convergence = 1e-4
|
149
|
-
self.num_topics = 20
|
150
|
-
self.init_alpha = 0.3
|
151
|
-
self.est_alpha = 1
|
152
|
-
nil
|
153
|
-
end
|
154
|
-
|
155
|
-
|
156
|
-
#
|
157
|
-
# Load the corpus from file. The corpus is in svmlight-style where the
|
158
|
-
# first element of each line is the number of words in the document and
|
159
|
-
# then each element is the pair word_idx:weight.
|
160
|
-
#
|
161
|
-
# num_words word1:wgt1 word2:wgt2 ... word_n:wgt_n
|
162
|
-
#
|
163
|
-
# The value for the number of words should equal the number of pairs
|
164
|
-
# following it, though this isn't strictly enforced in this method.
|
165
|
-
#
|
166
|
-
def load_corpus(filename)
|
167
|
-
@corpus = Corpus.new
|
168
|
-
@corpus.load_from_file(filename)
|
169
|
-
|
170
|
-
true
|
171
|
-
end
|
172
|
-
|
173
|
-
|
174
|
-
#
|
175
|
-
# Load the vocabulary file which is a list of words, one per line
|
176
|
-
# where the line number corresponds the word list index. This allows
|
177
|
-
# the words to be extracted for topics later.
|
178
|
-
#
|
179
|
-
# +vocab+ can either be the filename of the vocabulary file or the
|
180
|
-
# array itself.
|
181
|
-
#
|
182
|
-
def load_vocabulary(vocab)
|
183
|
-
if vocab.is_a?(Array)
|
184
|
-
@vocab = Marshal::load(Marshal::dump(vocab)) # deep clone array
|
185
|
-
else
|
186
|
-
@vocab = File.open(vocab, 'r') { |f| f.read.split(/[\n\r]+/) }
|
187
|
-
end
|
188
|
-
|
189
|
-
true
|
190
|
-
end
|
191
|
-
|
192
|
-
|
193
|
-
#
|
194
|
-
# Visualization method for printing out the top +words_per_topic+ words
|
195
|
-
# for each topic.
|
196
|
-
#
|
197
|
-
# See also +top_words+.
|
198
|
-
#
|
199
|
-
def print_topics(words_per_topic=10)
|
200
|
-
unless @vocab
|
201
|
-
puts "No vocabulary loaded."
|
202
|
-
return nil
|
203
|
-
end
|
204
|
-
|
205
|
-
beta = self.beta
|
206
|
-
indices = (0..(@vocab.size - 1)).to_a
|
207
|
-
topic_num = 0
|
208
|
-
beta.each do |topic|
|
209
|
-
indices.sort! {|x, y| -(topic[x] <=> topic[y])}
|
210
|
-
outp = []
|
211
|
-
puts "Topic #{topic_num}"
|
212
|
-
words_per_topic.times do |i|
|
213
|
-
outp << @vocab[indices[i]]
|
214
|
-
end
|
215
|
-
puts "\t" + outp.join("\n\t")
|
216
|
-
puts ""
|
217
|
-
topic_num += 1
|
218
|
-
end
|
219
|
-
|
220
|
-
nil
|
221
|
-
end
|
222
|
-
|
223
|
-
#
|
224
|
-
# After the model has been run and a vocabulary has been loaded, return the
|
225
|
-
# +words_per_topic+ top words chosen by the model for each topic. This is
|
226
|
-
# returned as a hash mapping the topic number to an array of top words
|
227
|
-
# (in descending order of importance).
|
228
|
-
#
|
229
|
-
# topic_number => [w1, w2, ..., w_n]
|
230
|
-
#
|
231
|
-
# See also +print_topics+.
|
232
|
-
#
|
233
|
-
def top_words(words_per_topic=10)
|
234
|
-
unless @vocab
|
235
|
-
puts "No vocabulary loaded."
|
236
|
-
return nil
|
237
|
-
end
|
238
|
-
|
239
|
-
# find the highest scoring words per topic
|
240
|
-
topics = Hash.new
|
241
|
-
indices = (0...@vocab.size).to_a
|
242
|
-
|
243
|
-
begin
|
244
|
-
beta.each_with_index do |topic, topic_idx|
|
245
|
-
indices.sort! {|x, y| -(topic[x] <=> topic[y])}
|
246
|
-
topics[topic_idx] = indices.first(words_per_topic).map { |i| @vocab[i] }
|
247
|
-
end
|
248
|
-
rescue NoMethodError
|
249
|
-
puts "Error: model has not been run."
|
250
|
-
topics = nil
|
251
|
-
end
|
252
|
-
|
253
|
-
topics
|
254
|
-
end
|
255
|
-
|
256
|
-
|
257
|
-
#
|
258
|
-
# Get the phi matrix which can be used to assign probabilities to words
|
259
|
-
# belonging to a specific topic in each document. The return value is a
|
260
|
-
# 3D matrix: num_docs x doc_length x num_topics. The value is cached
|
261
|
-
# after the first call, so if it needs to be recomputed, set the +recompute+
|
262
|
-
# value to true.
|
263
|
-
#
|
264
|
-
def phi(recompute=false)
|
265
|
-
if not @phi or recompute
|
266
|
-
# either the phi variable has not been instantiated or the recompute flag has been set
|
267
|
-
@phi = self.compute_phi
|
268
|
-
end
|
269
|
-
@phi
|
270
|
-
end
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
#
|
275
|
-
# Compute the average log probability for each topic for each document in the corpus.
|
276
|
-
# This method returns a matrix: num_docs x num_topics with the average log probability
|
277
|
-
# for the topic in the document.
|
278
|
-
#
|
279
|
-
def compute_topic_document_probability
|
280
|
-
outp = Array.new
|
281
|
-
@corpus.documents.each_with_index do |doc, idx|
|
282
|
-
tops = [0.0] * self.num_topics
|
283
|
-
ttl = doc.counts.inject(0.0) {|sum, i| sum + i}
|
284
|
-
self.phi[idx].each_with_index do |word_dist, word_idx|
|
285
|
-
word_dist.each_with_index do |top_prob, top_idx|
|
286
|
-
tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
|
287
|
-
end
|
288
|
-
end
|
289
|
-
tops = tops.map {|i| i / ttl}
|
290
|
-
outp << tops
|
291
|
-
end
|
292
|
-
|
293
|
-
outp
|
294
|
-
end
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
#
|
300
|
-
# String representation displaying current settings.
|
301
|
-
#
|
302
|
-
def to_s
|
303
|
-
outp = []
|
304
|
-
outp << "LDA Settings:"
|
305
|
-
outp << " Initial alpha: %0.6f" % self.init_alpha
|
306
|
-
outp << " # of topics: %d" % self.num_topics
|
307
|
-
outp << " Max iterations: %d" % self.max_iter
|
308
|
-
outp << " Convergence: %0.6f" % self.convergence
|
309
|
-
outp << "EM max iterations: %d" % self.em_max_iter
|
310
|
-
outp << " EM convergence: %0.6f" % self.em_convergence
|
311
|
-
outp << " Estimate alpha: %d" % self.est_alpha
|
312
|
-
|
313
|
-
return outp.join("\n")
|
314
|
-
end
|
315
|
-
end
|
316
|
-
end
|
317
|
-
|
318
|
-
# load the c-side stuff
|
319
|
-
require 'lda_ext'
|