ealdent-lda-ruby 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +22 -0
- data/lib/lda-inference.c +27 -10
- data/lib/lda.rb +44 -0
- metadata +2 -2
data/README
CHANGED
@@ -0,0 +1,22 @@
|
|
1
|
+
Latent Dirichlet Allocation – Ruby Wrapper
|
2
|
+
|
3
|
+
This wrapper is based on C-code by David M. Blei. In a nutshell, it can be used to automatically cluster documents into topics. The number of topics is chosen beforehand and the topics found are usually fairly intuitive. Details of the implementation can be found in the paper by Blei, Ng, and Jordan.
|
4
|
+
|
5
|
+
The original C code relied on files for the input and output. We felt it was necessary to depart from that model and use Ruby objects for these steps instead. The only file necessary will be the data file (in a format similar to that used by SVMlight). Optionally you may need a vocabulary file to be able to extract the words belonging to topics.
|
6
|
+
|
7
|
+
Example usage:
|
8
|
+
|
9
|
+
require 'lda'
|
10
|
+
lda = Lda::Lda.new # create an Lda object for training
|
11
|
+
corpus = Lda::Corpus.new("data/data_file.dat")
|
12
|
+
lda.corpus = corpus
|
13
|
+
lda.em("random") # run EM algorithm using random starting points
|
14
|
+
lda.load_vocabulary("data/vocab.txt")
|
15
|
+
lda.print_topics(20)             # print the top 20 words per topic
|
16
|
+
|
17
|
+
See the rdocs for further information. If you have any questions, you can also check out the mailing list for this project or send mail to lda-ruby@groups.google.com [email link]. If you have general questions about Latent Dirichlet Allocation, I urge you to use the topic models mailing list, since the people who monitor that are very knowledgeable.
|
18
|
+
|
19
|
+
|
20
|
+
References
|
21
|
+
|
22
|
+
Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent Dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
data/lib/lda-inference.c
CHANGED
@@ -837,27 +837,44 @@ static VALUE wrap_get_gamma(VALUE self) {
|
|
837
837
|
return arr;
|
838
838
|
}
|
839
839
|
|
840
|
+
|
841
|
+
/*
|
842
|
+
* Compute the phi values by running inference after the initial EM run has been completed.
|
843
|
+
*
|
844
|
+
* Returns a 3D matrix: <tt>num_docs x length x num_topics</tt>.
|
845
|
+
*/
|
840
846
|
static VALUE wrap_get_phi(VALUE self) {
|
841
847
|
if (!model_loaded)
|
842
848
|
return Qnil;
|
843
849
|
|
844
|
-
VALUE arr;
|
845
|
-
int i = 0, j = 0;
|
846
|
-
int max_length = max_corpus_length(last_corpus);
|
850
|
+
VALUE arr = rb_ary_new2(last_corpus->num_docs);
|
851
|
+
int i = 0, j = 0, k = 0;
|
847
852
|
|
853
|
+
int max_length = max_corpus_length(last_corpus);
|
848
854
|
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
855
|
+
for (i = 0; i < last_corpus->num_docs; i++) {
|
856
|
+
VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
|
857
|
+
|
858
|
+
lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi);
|
859
|
+
|
860
|
+
for (j = 0; j < last_corpus->docs[i].length; j++) {
|
861
|
+
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
862
|
+
|
863
|
+
for (k = 0; k < last_model->num_topics; k++) {
|
864
|
+
rb_ary_store(arr2, k, rb_float_new(last_phi[j][k]));
|
865
|
+
}
|
866
|
+
|
867
|
+
rb_ary_store(arr1, j, arr2);
|
854
868
|
}
|
855
|
-
|
869
|
+
|
870
|
+
rb_ary_store(arr, i, arr1);
|
856
871
|
}
|
857
872
|
|
858
873
|
return arr;
|
859
874
|
}
|
860
875
|
|
876
|
+
|
877
|
+
|
861
878
|
/*
|
862
879
|
* Get the beta matrix after the model has been run.
|
863
880
|
*/
|
@@ -963,7 +980,7 @@ void Init_lda_ext() {
|
|
963
980
|
// retrieve model and gamma
|
964
981
|
rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
|
965
982
|
rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
|
966
|
-
rb_define_method(rb_cLda, "
|
983
|
+
rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0);
|
967
984
|
rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
|
968
985
|
}
|
969
986
|
|
data/lib/lda.rb
CHANGED
@@ -114,6 +114,7 @@ module Lda
|
|
114
114
|
self.load_default_settings
|
115
115
|
@corpus = nil
|
116
116
|
@vocab = nil
|
117
|
+
@phi = nil
|
117
118
|
end
|
118
119
|
|
119
120
|
#
|
@@ -248,6 +249,49 @@ module Lda
|
|
248
249
|
topics
|
249
250
|
end
|
250
251
|
|
252
|
+
|
253
|
+
#
|
254
|
+
# Get the phi matrix which can be used to assign probabilities to words
|
255
|
+
# belonging to a specific topic in each document. The return value is a
|
256
|
+
# 3D matrix: num_docs x doc_length x num_topics. The value is cached
|
257
|
+
# after the first call, so if it needs to be recomputed, set the +recompute+
|
258
|
+
# value to true.
|
259
|
+
#
|
260
|
+
def phi(recompute=false)
|
261
|
+
if not @phi or recompute
|
262
|
+
# either the phi variable has not been instantiated or the recompute flag has been set
|
263
|
+
@phi = self.compute_phi
|
264
|
+
end
|
265
|
+
@phi
|
266
|
+
end
|
267
|
+
|
268
|
+
|
269
|
+
|
270
|
+
#
|
271
|
+
# Compute the average log probability for each topic for each document in the corpus.
|
272
|
+
# This method returns a matrix: num_docs x num_topics with the average log probability
|
273
|
+
# for the topic in the document.
|
274
|
+
#
|
275
|
+
def compute_topic_document_probability
|
276
|
+
outp = Array.new
|
277
|
+
@corpus.documents.each_with_index do |doc, idx|
|
278
|
+
tops = [0.0] * self.num_topics
|
279
|
+
ttl = doc.counts.inject(0.0) {|sum, i| sum + i}
|
280
|
+
self.phi[idx].each_with_index do |word_dist, word_idx|
|
281
|
+
word_dist.each_with_index do |top_prob, top_idx|
|
282
|
+
tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
|
283
|
+
end
|
284
|
+
end
|
285
|
+
tops = tops.map {|i| i / ttl}
|
286
|
+
outp << tops
|
287
|
+
end
|
288
|
+
|
289
|
+
outp
|
290
|
+
end
|
291
|
+
|
292
|
+
|
293
|
+
|
294
|
+
|
251
295
|
#
|
252
296
|
# String representation displaying current settings.
|
253
297
|
#
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ealdent-lda-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jason M. Adams
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2008-11-
|
13
|
+
date: 2008-11-25 00:00:00 -08:00
|
14
14
|
default_executable:
|
15
15
|
dependencies: []
|
16
16
|
|