ealdent-lda-ruby 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.markdown ADDED
@@ -0,0 +1,39 @@
+ # Latent Dirichlet Allocation – Ruby Wrapper
+
+ ## What is LDA-Ruby?
+
+ This wrapper is based on C code by David M. Blei. In a nutshell, it can be used to automatically cluster documents into topics. The number of topics is chosen beforehand, and the topics found are usually fairly intuitive. Details of the implementation can be found in the paper by Blei, Ng, and Jordan.
+
+ The original C code relied on files for input and output. We felt it was necessary to depart from that model and use Ruby objects for these steps instead. The only file required is the data file, in a format similar to that used by [SVMlight][svmlight] (a short sample appears after this diff). Optionally, you may supply a vocabulary file in order to extract the words belonging to each topic.
+
+ ### Example usage:
+
+     require 'lda'
+     lda = Lda::Lda.new                      # create an Lda object for training
+     corpus = Lda::Corpus.new("data/data_file.dat")
+     lda.corpus = corpus
+     lda.em("random")                        # run EM algorithm using random starting points
+     lda.load_vocabulary("data/vocab.txt")
+     lda.print_topics(20)                    # print the top 20 words per topic
+
+ See the rdocs for further information. You can also check out the mailing list for this project if you have any questions, or email lda-ruby@groups.google.com. If you have general questions about Latent Dirichlet Allocation, I urge you to use the topic models mailing list, since the people who monitor it are very knowledgeable.
+
+ ## Resources
+
+
+ + [Blog post about LDA-Ruby][lda-ruby]
+ + [David Blei's lda-c code][blei]
+ + [Wikipedia article on LDA][wikipedia]
+ + [Sample AP data][ap-data]
+
+
+ ## References
+
+ Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent Dirichlet allocation. Journal of Machine Learning Research 3 (Mar. 2003), 993-1022 [[pdf][pdf]].
+
+ [svmlight]: http://svmlight.joachims.org
+ [lda-ruby]: http://mendicantbug.com/2008/11/17/lda-in-ruby/
+ [blei]: http://www.cs.princeton.edu/~blei/lda-c/
+ [wikipedia]: http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
+ [ap-data]: http://www.cs.princeton.edu/~blei/lda-c/ap.tgz
+ [pdf]: http://www.cs.princeton.edu/picasso/mats/BleiNgJordan2003_blei.pdf
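
For reference, the corpus data file mentioned in the README above is a sparse bag-of-words listing. A minimal sketch, assuming the conventions of Blei's original lda-c (each line is one document: the number of unique terms, then space-separated `index:count` pairs, where `index` is a zero-based line number in the optional vocabulary file):

    3 0:2 4:1 7:3
    2 1:5 4:2

Here the first document contains term 0 twice, term 4 once, and term 7 three times.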
data/VERSION.yml CHANGED
@@ -1,4 +1,4 @@
  ---
- :minor: 2
- :patch: 2
+ :patch: 3
  :major: 0
+ :minor: 2
data/lib/extconf.rb CHANGED
@@ -1,4 +1,7 @@
  require 'mkmf'

+ $CFLAGS << ' -Wall -ggdb -O0'
+ $defs.push( "-D USE_RUBY" )
+
  dir_config("lda_ext")
  create_makefile("lda_ext")
data/lib/lda-inference.c CHANGED
@@ -17,10 +17,6 @@
  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  // USA

- #ifndef USE_RUBY
- #define USE_RUBY
- #endif
-
  #include <stdlib.h>
  #include <stdio.h>
  #include <math.h>
@@ -50,13 +46,17 @@ VALUE rb_cLdaDocument;
  * variational inference
  */

- double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) {
+ double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
  double converged = 1;
  double phisum = 0, likelihood = 0;
  double likelihood_old = 0, oldphi[model->num_topics];
- int k, n, var_iter;
+ int k = 0, n = 0, var_iter = 0, index = 0;
  double digamma_gam[model->num_topics];

+ /* zero'em out */
+ memset(digamma_gam,0.0,sizeof(digamma_gam));
+ memset(oldphi,0.0,sizeof(oldphi));
+
  // compute posterior dirichlet

  for (k = 0; k < model->num_topics; k++)
@@ -78,9 +78,16 @@ double lda_inference(document* doc, lda_model* model, double* var_gamma, double*
  for (k = 0; k < model->num_topics; k++)
  {
  oldphi[k] = phi[n][k];
- phi[n][k] =
- digamma_gam[k] +
- model->log_prob_w[k][doc->words[n]];
+ index = doc->words[n];
+ if( index < 0 || index > model->num_terms ) {
+ printf("phi for term: %d of %d\n", index, model->num_terms);
+ phi[n][k] = 0.0;
+ }
+ else {
+ phi[n][k] =
+ digamma_gam[k] +
+ model->log_prob_w[k][index];
+ }

  if (k > 0)
  phisum = log_sum(phisum, phi[n][k]);
@@ -100,7 +107,8 @@ double lda_inference(document* doc, lda_model* model, double* var_gamma, double*
  }

  likelihood = compute_likelihood(doc, model, phi, var_gamma);
- assert(!isnan(likelihood));
+ //assert(!isnan(likelihood));
+ if( isnan(likelihood) ) { *errors = 1; }
  converged = (likelihood_old - likelihood) / likelihood_old;
  likelihood_old = likelihood;

@@ -116,7 +124,8 @@ double lda_inference(document* doc, lda_model* model, double* var_gamma, double*

  double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
  double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics];
- int k, n;
+ int k = 0, n = 0, index = 0;
+ memset(dig,0.0,sizeof(dig));

  for (k = 0; k < model->num_topics; k++)
  {
@@ -125,7 +134,10 @@ double compute_likelihood(document* doc, lda_model* model, double** phi, double*
  }
  digsum = digamma(var_gamma_sum);

- likelihood = lgamma(model->alpha * model -> num_topics) - model -> num_topics * lgamma(model->alpha) - (lgamma(var_gamma_sum));
+ likelihood = lgamma(model->alpha * model->num_topics) -
+ model->num_topics *
+ lgamma(model->alpha) -
+ lgamma(var_gamma_sum);

  for (k = 0; k < model->num_topics; k++)
  {
@@ -135,9 +147,10 @@ double compute_likelihood(document* doc, lda_model* model, double** phi, double*
  {
  if (phi[n][k] > 0)
  {
+ index = doc->words[n];
  likelihood += doc->counts[n]*
  (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
- + model->log_prob_w[k][doc->words[n]]));
+ + model->log_prob_w[k][index]));
  }
  }
  }
@@ -148,10 +161,13 @@ double compute_likelihood(document* doc, lda_model* model, double** phi, double*
  double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
  double likelihood;
  int n, k;
+ short error = 0;
+
+ // posterior inference

- // posterior inference
+ likelihood = lda_inference(doc, model, gamma, phi, &error);
+ if (error) { likelihood = 0.0; }

- likelihood = lda_inference(doc, model, gamma, phi);

  // update sufficient statistics

@@ -221,6 +237,7 @@ void run_em(char* start, char* directory, corpus* corpus) {

  // allocate variational parameters

+
  var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
  for (d = 0; d < corpus->num_docs; d++)
  var_gamma[d] = malloc(sizeof(double) * NTOPICS);
@@ -279,23 +296,22 @@ void run_em(char* start, char* directory, corpus* corpus) {
  zero_initialize_ss(ss, model);

  // e-step
+ printf("e-step\n");

  for (d = 0; d < corpus->num_docs; d++) {
  if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
  likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
  }
+ printf("m-step\n");

  // m-step
-
- if (VERBOSE) {
- lda_mle(model, ss, ESTIMATE_ALPHA);
- } else {
- quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
- }
-
+ if (VERBOSE) {
+ lda_mle(model, ss, ESTIMATE_ALPHA);
+ } else {
+ quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+ }

  // check for convergence
-
  converged = (likelihood_old - likelihood) / (likelihood_old);
  if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
  likelihood_old = likelihood;
@@ -324,10 +340,15 @@ void run_em(char* start, char* directory, corpus* corpus) {

  sprintf(filename, "%s/word-assignments.dat", directory);
  FILE* w_asgn_file = fopen(filename, "w");
+ short error = 0;
+ double tl = 0.0;
  for (d = 0; d < corpus->num_docs; d++)
  {
  if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
- likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
+ error = 0;
+ tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error);
+ if( error ) { continue; }
+ likelihood += tl;
  write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
  }
  fclose(w_asgn_file);
@@ -388,7 +409,8 @@ void infer(char* model_root, char* save, corpus* corpus) {
  phi = (double**) malloc(sizeof(double*) * doc->length);
  for (n = 0; n < doc->length; n++)
  phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
- likelihood = lda_inference(doc, model, var_gamma[d], phi);
+ short error = 0;
+ likelihood = lda_inference(doc, model, var_gamma[d], phi, &error);

  fprintf(fileptr, "%5.5f\n", likelihood);
  }
@@ -448,58 +470,68 @@ int main(int argc, char* argv[]) {

  /* */
  void run_quiet_em(char* start, corpus* corpus) {
- int d, n;
+ int d = 0, n = 0;
  lda_model *model = NULL;
- double **var_gamma, **phi;
+ double **var_gamma = NULL, **phi = NULL;
+ // last_gamma is a double[num_docs][num_topics]

  // allocate variational parameters

- var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
- for (d = 0; d < corpus->num_docs; d++)
- var_gamma[d] = malloc(sizeof(double) * NTOPICS);
+
+ var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
+ memset(var_gamma, 0.0, corpus->num_docs);
+
+ for (d = 0; d < corpus->num_docs; ++d) {
+ var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
+ memset(var_gamma[d], 0.0, sizeof(double)*NTOPICS);
+ }

  int max_length = max_corpus_length(corpus);
- phi = malloc(sizeof(double*)*max_length);
- for (n = 0; n < max_length; n++)
- phi[n] = malloc(sizeof(double) * NTOPICS);
+
+ phi = (double**)malloc(sizeof(double*)*max_length);
+ memset(phi, 0.0, max_length);
+ for (n = 0; n < max_length; ++n) {
+ phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
+ memset(phi[n], 0.0, sizeof(double)*NTOPICS);
+ }

  // initialize model

  lda_suffstats* ss = NULL;
- if (strcmp(start, "seeded")==0) {
+ if (strncmp(start, "seeded",6)==0) {
  model = new_lda_model(corpus->num_terms, NTOPICS);
- ss = new_lda_suffstats(model);
- if (VERBOSE) {
- corpus_initialize_ss(ss, model, corpus);
- } else {
- quiet_corpus_initialize_ss(ss, model, corpus);
- }
- if (VERBOSE) {
- lda_mle(model, ss, 0);
- } else {
- quiet_lda_mle(model, ss, 0);
- }
  model->alpha = INITIAL_ALPHA;
- } else if (strcmp(start, "fixed")==0) {
- model = new_lda_model(corpus->num_terms, NTOPICS);
  ss = new_lda_suffstats(model);
- corpus_initialize_fixed_ss(ss, model, corpus);
  if (VERBOSE) {
- lda_mle(model, ss, 0);
+ corpus_initialize_ss(ss, model, corpus);
+ } else {
+ quiet_corpus_initialize_ss(ss, model, corpus);
+ }
+ if (VERBOSE) {
+ lda_mle(model, ss, 0);
  } else {
- quiet_lda_mle(model, ss, 0);
+ quiet_lda_mle(model, ss, 0);
  }
- model->alpha = INITIAL_ALPHA;
- } else if (strcmp(start, "random")==0) {
+ } else if (strncmp(start, "fixed",5)==0) {
+ model = new_lda_model(corpus->num_terms, NTOPICS);
+ model->alpha = INITIAL_ALPHA;
+ ss = new_lda_suffstats(model);
+ corpus_initialize_fixed_ss(ss, model, corpus);
+ if (VERBOSE) {
+ lda_mle(model, ss, 0);
+ } else {
+ quiet_lda_mle(model, ss, 0);
+ }
+ } else if (strncmp(start, "random",6)==0) {
  model = new_lda_model(corpus->num_terms, NTOPICS);
+ model->alpha = INITIAL_ALPHA;
  ss = new_lda_suffstats(model);
  random_initialize_ss(ss, model);
  if (VERBOSE) {
- lda_mle(model, ss, 0);
+ lda_mle(model, ss, 0);
  } else {
- quiet_lda_mle(model, ss, 0);
+ quiet_lda_mle(model, ss, 0);
  }
- model->alpha = INITIAL_ALPHA;
  } else {
  model = load_lda_model(start);
  ss = new_lda_suffstats(model);
@@ -512,12 +544,11 @@ void run_quiet_em(char* start, corpus* corpus) {
  // run expectation maximization

  int i = 0;
- double likelihood, likelihood_old = 0, converged = 1;
+ double likelihood = 0.0, likelihood_old = 0, converged = 1;

  while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
  i++;
- if (VERBOSE)
- printf("**** em iteration %d ****\n", i);
+ if (VERBOSE) printf("**** em iteration %d ****\n", i);
  likelihood = 0;
  zero_initialize_ss(ss, model);

@@ -529,12 +560,11 @@ void run_quiet_em(char* start, corpus* corpus) {
  }

  // m-step
-
- if (VERBOSE) {
- lda_mle(model, ss, ESTIMATE_ALPHA);
- } else {
- quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
- }
+ if (VERBOSE) {
+ lda_mle(model, ss, ESTIMATE_ALPHA);
+ } else {
+ quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+ }

  // check for convergence

@@ -546,14 +576,16 @@ void run_quiet_em(char* start, corpus* corpus) {

  last_model = model;
  last_gamma = var_gamma;
- last_phi = phi;
+ last_phi = phi;
  }

  // output the final model

  last_model = model;
  last_gamma = var_gamma;
- last_phi = phi;
+ last_phi = phi;
+
+ free_lda_suffstats(model,ss);

  // output the word assignments (for visualization)
  /*
@@ -585,6 +617,7 @@ void run_quiet_em(char* start, corpus* corpus) {
  static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
  INITIAL_ALPHA = NUM2DBL(init_alpha);
  NTOPICS = NUM2INT(num_topics);
+ if( NTOPICS < 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); }
  VAR_MAX_ITER = NUM2INT(max_iter);
  VAR_CONVERGED = (float)NUM2DBL(convergence);
  EM_MAX_ITER = NUM2INT(em_max_iter);
@@ -798,8 +831,11 @@ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
  c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
  c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
  for (j = 0; j < c->docs[i].length; j++) {
- VALUE one_word = NUM2INT(rb_ary_entry(words, j));
- VALUE one_count = NUM2INT(rb_ary_entry(counts, j));
+ int one_word = NUM2INT(rb_ary_entry(words, j));
+ int one_count = NUM2INT(rb_ary_entry(counts, j));
+ if( one_word > c->num_terms ) {
+ rb_raise(rb_eRuntimeError, "error term count(%d) less then word index(%d)", c->num_terms, one_word);
+ }
  c->docs[i].words[j] = one_word;
  c->docs[i].counts[j] = one_count;
  }
@@ -850,12 +886,13 @@ static VALUE wrap_get_phi(VALUE self) {
  VALUE arr = rb_ary_new2(last_corpus->num_docs);
  int i = 0, j = 0, k = 0;

- int max_length = max_corpus_length(last_corpus);
+ //int max_length = max_corpus_length(last_corpus);
+ short error = 0;

  for (i = 0; i < last_corpus->num_docs; i++) {
  VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);

- lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi);
+ lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);

  for (j = 0; j < last_corpus->docs[i].length; j++) {
  VALUE arr2 = rb_ary_new2(last_model->num_topics);
@@ -968,4 +1005,4 @@ void Init_lda_ext() {
  rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
  }

- #endif
+ #endif
data/lib/lda-inference.h CHANGED
@@ -4,7 +4,6 @@
  #include <stdlib.h>
  #include <math.h>
  #include <float.h>
- #include <assert.h>
  #include "lda.h"
  #include "utils.h"

@@ -31,7 +30,7 @@ enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE;



- double lda_inference(document*, lda_model*, double*, double**);
+ double lda_inference(document*, lda_model*, double*, double**, short*);
  double compute_likelihood(document*, lda_model*, double**, double*);


@@ -61,4 +60,4 @@ void infer(char* model_root,
  corpus* corpus);


- #endif
+ #endif
data/lib/lda-model.c CHANGED
@@ -18,6 +18,7 @@
  // USA

  #include "lda-model.h"
+ #include <string.h>


  /*
@@ -88,25 +89,38 @@ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
  */

  lda_suffstats* new_lda_suffstats(lda_model* model) {
+ register int i;
  int num_topics = model->num_topics;
  int num_terms = model->num_terms;
- int i,j;

- lda_suffstats* ss = malloc(sizeof(lda_suffstats));
- ss->class_total = malloc(sizeof(double)*num_topics);
- ss->class_word = malloc(sizeof(double*)*num_topics);
- for (i = 0; i < num_topics; i++)
- {
+ lda_suffstats* ss = (lda_suffstats*)malloc(sizeof(lda_suffstats));
+ memset(ss,0,sizeof(lda_suffstats));
+ ss->class_total = (double*)malloc(sizeof(double)*num_topics);
+ ss->class_word = (double**)malloc(sizeof(double*)*num_topics);
+
+ for (i = 0; i < num_topics; ++i) {
  ss->class_total[i] = 0;
- ss->class_word[i] = malloc(sizeof(double)*num_terms);
- for (j = 0; j < num_terms; j++)
- {
- ss->class_word[i][j] = 0;
- }
+ ss->class_word[i] = (double*)malloc(sizeof(double)*num_terms);
+ memset(ss->class_word[i],0.0,sizeof(double)*num_terms);
  }
+
  return(ss);
  }
+ /*
+ * deallocate new lda suffstats
+ *
+ */
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss) {
+ int i;
+ int num_topics = model->num_topics;

+ free(ss->class_total);
+ for (i = 0; i < num_topics; ++i) {
+ free(ss->class_word[i]);
+ }
+ free(ss->class_word);
+ free(ss);
+ }

  /*
  * various intializations for the sufficient statistics
@@ -132,6 +146,7 @@ void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
  int num_topics = model->num_topics;
  int num_terms = model->num_terms;
  int k, n;
+
  for (k = 0; k < num_topics; k++)
  {
  for (n = 0; n < num_terms; n++)
@@ -221,7 +236,7 @@ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c)
  */

  lda_model* new_lda_model(int num_terms, int num_topics) {
- int i,j;
+ int i;
  lda_model* model;

  model = malloc(sizeof(lda_model));
@@ -229,11 +244,11 @@ lda_model* new_lda_model(int num_terms, int num_topics) {
  model->num_terms = num_terms;
  model->alpha = 1.0;
  model->log_prob_w = malloc(sizeof(double*)*num_topics);
+ printf("new model with: %d topics and %d terms\n", num_topics, num_terms);
  for (i = 0; i < num_topics; i++)
  {
  model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
- for (j = 0; j < num_terms; j++)
- model->log_prob_w[i][j] = 0;
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
  }
  return(model);
  }
data/lib/lda-model.h CHANGED
@@ -16,6 +16,7 @@ void free_lda_model(lda_model*);
  void save_lda_model(lda_model*, char*);
  lda_model* new_lda_model(int, int);
  lda_suffstats* new_lda_suffstats(lda_model* model);
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
  void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
  void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
  void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
data/lib/lda.rb CHANGED
@@ -30,10 +30,10 @@ module Lda
  def add_document(doc)
  if doc.is_a?(Document)
  @documents << doc
- @all_terms = @all_terms + doc.words
+ @all_terms += doc.words
  elsif doc.is_a?(String)
  d = Document.new(doc)
- @all_terms = @all_terms + d.words
+ @all_terms += d.words
  @documents << d
  end
  @num_docs += 1
@@ -51,11 +51,25 @@ module Lda
  true
  end
  end
+
+ class BaseDocument
+ def words
+ raise NotSupportedError
+ end
+
+ def length
+ raise NotSupportedError
+ end
+
+ def total
+ raise NotSupportedError
+ end
+ end

  #
  # A single document.
  #
- class Document
+ class Document < BaseDocument
  attr_accessor :words, :counts
  attr_reader :length, :total
@@ -222,26 +236,20 @@ module Lda
  return nil
  end

- # Load the model
- beta = self.beta
- unless beta
- puts "Model has not been run."
- return nil
- end
-
  # find the highest scoring words per topic
  topics = Hash.new
- indices = (0..(@vocab.size - 1)).to_a
- topic_num = 0
- beta.each do |topic|
- topics[topic_num] = Array.new
- indices.sort! {|x, y| -(topic[x] <=> topic[y])}
- words_per_topic.times do |i|
- topics[topic_num] << @vocab[indices[i]]
+ indices = (0...@vocab.size).to_a
+
+ begin
+ beta.each_with_index do |topic, topic_idx|
+ indices.sort! {|x, y| -(topic[x] <=> topic[y])}
+ topics[topic_idx] = indices.first(words_per_topic).map { |i| @vocab[i] }
  end
- topic_num += 1
+ rescue NoMethodError
+ puts "Error: model has not been run."
+ topics = nil
  end
-
+
  topics
  end

@@ -308,4 +316,4 @@ module Lda
  end

  # load the c-side stuff
- require 'lda_ext'
+ require 'lda_ext'
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: ealdent-lda-ruby
  version: !ruby/object:Gem::Version
- version: 0.2.2
+ version: 0.2.3
  platform: ruby
  authors:
  - Jason M. Adams
@@ -10,21 +10,31 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2009-01-24 00:00:00 -08:00
+ date: 2009-07-19 00:00:00 -07:00
  default_executable:
- dependencies: []
-
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: stemmer
+ type: :runtime
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0"
+ version:
  description:
  email: jasonmadams@gmail.com
  executables: []

  extensions:
  - lib/extconf.rb
- extra_rdoc_files: []
-
+ extra_rdoc_files:
+ - README
+ - README.markdown
  files:
  - README
- - license.txt
+ - VERSION.yml
  - lib/cokus.c
  - lib/cokus.h
  - lib/extconf.rb
@@ -40,12 +50,12 @@ files:
  - lib/lda.rb
  - lib/utils.c
  - lib/utils.h
- - VERSION.yml
- has_rdoc: true
+ - license.txt
+ - README.markdown
+ has_rdoc: false
  homepage: http://github.com/ealdent/lda-ruby
  post_install_message:
  rdoc_options:
- - --inline-source
  - --charset=UTF-8
  require_paths:
  - lib
@@ -66,7 +76,7 @@ requirements: []
  rubyforge_project:
  rubygems_version: 1.2.0
  signing_key:
- specification_version: 2
+ specification_version: 3
  summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
  test_files: []