ealdent-lda-ruby 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/README +22 -0
  2. data/lib/lda-inference.c +27 -10
  3. data/lib/lda.rb +44 -0
  4. metadata +2 -2
data/README CHANGED
@@ -0,0 +1,22 @@
1
+ Latent Dirichlet Allocation – Ruby Wrapper
2
+
3
+ This wrapper is based on C-code by David M. Blei. In a nutshell, it can be used to automatically cluster documents into topics. The number of topics is chosen beforehand and the topics found are usually fairly intuitive. Details of the implementation can be found in the paper by Blei, Ng, and Jordan.
4
+
5
+ The original C code relied on files for the input and output. We felt it was necessary to depart from that model and use Ruby objects for these steps instead. The only file necessary will be the data file (in a format similar to that used by SVMlight). Optionally you may need a vocabulary file to be able to extract the words belonging to topics.
6
+
7
+ Example usage:
8
+
9
+ require 'lda'
10
+ lda = Lda::Lda.new # create an Lda object for training
11
+ corpus = Lda::Corpus.new("data/data_file.dat")
12
+ lda.corpus = corpus
13
+ lda.em("random") # run EM algorithm using random starting points
14
+ lda.load_vocabulary("data/vocab.txt")
15
+ lda.print_topics(20) # print the top 20 words per topic
16
+
17
+ See the rdocs for further information. If you have any questions, you can also check out the mailing list for this project or mail lda-ruby@groups.google.com [email link]. If you have general questions about Latent Dirichlet Allocation, I urge you to use the topic models mailing list, since the people who monitor it are very knowledgeable.
18
+
19
+
20
+ References
21
+
22
+ Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent Dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022.
data/lib/lda-inference.c CHANGED
@@ -837,27 +837,44 @@ static VALUE wrap_get_gamma(VALUE self) {
837
837
  return arr;
838
838
  }
839
839
 
840
+
841
+ /*
842
+ * Compute the phi values by running inference after the initial EM run has been completed.
843
+ *
844
+ * Returns a 3D matrix: <tt>num_docs x length x num_topics</tt>.
845
+ */
840
846
  static VALUE wrap_get_phi(VALUE self) {
841
847
  if (!model_loaded)
842
848
  return Qnil;
843
849
 
844
- VALUE arr;
845
- int i = 0, j = 0;
846
- int max_length = max_corpus_length(last_corpus);
850
+ VALUE arr = rb_ary_new2(last_corpus->num_docs);
851
+ int i = 0, j = 0, k = 0;
847
852
 
853
+ int max_length = max_corpus_length(last_corpus);
848
854
 
849
- arr = rb_ary_new2(max_length);
850
- for (i = 0; i < max_length; i++) {
851
- VALUE arr2 = rb_ary_new2(last_model->num_topics);
852
- for (j = 0; j < last_model->num_topics; j++) {
853
- rb_ary_store(arr2, j, rb_float_new(last_phi[i][j]));
855
+ for (i = 0; i < last_corpus->num_docs; i++) {
856
+ VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
857
+
858
+ lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi);
859
+
860
+ for (j = 0; j < last_corpus->docs[i].length; j++) {
861
+ VALUE arr2 = rb_ary_new2(last_model->num_topics);
862
+
863
+ for (k = 0; k < last_model->num_topics; k++) {
864
+ rb_ary_store(arr2, k, rb_float_new(last_phi[j][k]));
865
+ }
866
+
867
+ rb_ary_store(arr1, j, arr2);
854
868
  }
855
- rb_ary_store(arr, i, arr2);
869
+
870
+ rb_ary_store(arr, i, arr1);
856
871
  }
857
872
 
858
873
  return arr;
859
874
  }
860
875
 
876
+
877
+
861
878
  /*
862
879
  * Get the beta matrix after the model has been run.
863
880
  */
@@ -963,7 +980,7 @@ void Init_lda_ext() {
963
980
  // retrieve model and gamma
964
981
  rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
965
982
  rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
966
- rb_define_method(rb_cLda, "phi", wrap_get_phi, 0);
983
+ rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0);
967
984
  rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
968
985
  }
969
986
 
data/lib/lda.rb CHANGED
@@ -114,6 +114,7 @@ module Lda
114
114
  self.load_default_settings
115
115
  @corpus = nil
116
116
  @vocab = nil
117
+ @phi = nil
117
118
  end
118
119
 
119
120
  #
@@ -248,6 +249,49 @@ module Lda
248
249
  topics
249
250
  end
250
251
 
252
+
253
+ #
254
+ # Get the phi matrix which can be used to assign probabilities to words
255
+ # belonging to a specific topic in each document. The return value is a
256
+ # 3D matrix: num_docs x doc_length x num_topics. The value is cached
257
+ # after the first call, so if it needs to be recomputed, set the +recompute+
258
+ # value to true.
259
+ #
260
+ def phi(recompute=false)
261
+ if not @phi or recompute
262
+ # either the phi variable has not been instantiated or the recompute flag has been set
263
+ @phi = self.compute_phi
264
+ end
265
+ @phi
266
+ end
267
+
268
+
269
+
270
+ #
271
+ # Compute the average log probability for each topic for each document in the corpus.
272
+ # This method returns a matrix: num_docs x num_topics with the average log probability
273
+ # for the topic in the document.
274
+ #
275
+ def compute_topic_document_probability
276
+ outp = Array.new
277
+ @corpus.documents.each_with_index do |doc, idx|
278
+ tops = [0.0] * self.num_topics
279
+ ttl = doc.counts.inject(0.0) {|sum, i| sum + i}
280
+ self.phi[idx].each_with_index do |word_dist, word_idx|
281
+ word_dist.each_with_index do |top_prob, top_idx|
282
+ tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
283
+ end
284
+ end
285
+ tops = tops.map {|i| i / ttl}
286
+ outp << tops
287
+ end
288
+
289
+ outp
290
+ end
291
+
292
+
293
+
294
+
251
295
  #
252
296
  # String representation displaying current settings.
253
297
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ealdent-lda-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jason M. Adams
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2008-11-24 00:00:00 -08:00
13
+ date: 2008-11-25 00:00:00 -08:00
14
14
  default_executable:
15
15
  dependencies: []
16
16