ealdent-lda-ruby 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +39 -0
- data/VERSION.yml +2 -2
- data/lib/extconf.rb +3 -0
- data/lib/lda-inference.c +108 -71
- data/lib/lda-inference.h +2 -3
- data/lib/lda-model.c +29 -14
- data/lib/lda-model.h +1 -0
- data/lib/lda.rb +28 -20
- metadata +21 -11
data/README.markdown ADDED
@@ -0,0 +1,39 @@
+# Latent Dirichlet Allocation – Ruby Wrapper
+
+## What is LDA-Ruby?
+
+This wrapper is based on C code by David M. Blei. In a nutshell, it can be used to automatically cluster documents into topics. The number of topics is chosen beforehand, and the topics found are usually fairly intuitive. Details of the implementation can be found in the paper by Blei, Ng, and Jordan.
+
+The original C code relied on files for input and output. We felt it was necessary to depart from that model and use Ruby objects for these steps instead. The only file necessary is the data file (in a format similar to that used by [SVMlight][svmlight]). Optionally you may need a vocabulary file to be able to extract the words belonging to topics.
+
+### Example usage:
+
+    require 'lda'
+    lda = Lda::Lda.new                              # create an Lda object for training
+    corpus = Lda::Corpus.new("data/data_file.dat")
+    lda.corpus = corpus
+    lda.em("random")                                # run EM algorithm using random starting points
+    lda.load_vocabulary("data/vocab.txt")
+    lda.print_topics(20)                            # print the top 20 words per topic
+
+See the rdocs for further information. You can also check out the mailing list for this project if you have any questions, or mail lda-ruby@groups.google.com. If you have general questions about Latent Dirichlet Allocation, I urge you to use the topic models mailing list, since the people who monitor it are very knowledgeable.
+
+## Resources
+
+
++ [Blog post about LDA-Ruby][lda-ruby]
++ [David Blei's lda-c code][blei]
++ [Wikipedia article on LDA][wikipedia]
++ [Sample AP data][ap-data]
+
+
+## References
+
+Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent Dirichlet allocation. Journal of Machine Learning Research 3 (Mar. 2003), 993-1022 [[pdf][pdf]].
+
+[svmlight]: http://svmlight.joachims.org
+[lda-ruby]: http://mendicantbug.com/2008/11/17/lda-in-ruby/
+[blei]: http://www.cs.princeton.edu/~blei/lda-c/
+[wikipedia]: http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
+[ap-data]: http://www.cs.princeton.edu/~blei/lda-c/ap.tgz
+[pdf]: http://www.cs.princeton.edu/picasso/mats/BleiNgJordan2003_blei.pdf
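The README leaves the data file format implicit. For reference, lda-c (which this gem wraps) documents each line as the number of unique terms in a document followed by `term_index:count` pairs, where the indices point into the vocabulary file. The values below are illustrative, not from the gem's sample data:

```
5 0:3 6:1 9:2 12:1 15:1
```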
data/VERSION.yml CHANGED
data/lib/extconf.rb CHANGED
data/lib/lda-inference.c CHANGED
@@ -17,10 +17,6 @@
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 // USA
 
-#ifndef USE_RUBY
-#define USE_RUBY
-#endif
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -50,13 +46,17 @@ VALUE rb_cLdaDocument;
  * variational inference
  */
 
-double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) {
+double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
     double converged = 1;
     double phisum = 0, likelihood = 0;
     double likelihood_old = 0, oldphi[model->num_topics];
-    int k, n, var_iter;
+    int k = 0, n = 0, var_iter = 0, index = 0;
     double digamma_gam[model->num_topics];
 
+    /* zero'em out */
+    memset(digamma_gam,0.0,sizeof(digamma_gam));
+    memset(oldphi,0.0,sizeof(oldphi));
+
     // compute posterior dirichlet
 
     for (k = 0; k < model->num_topics; k++)
@@ -78,9 +78,16 @@ double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
         for (k = 0; k < model->num_topics; k++)
         {
             oldphi[k] = phi[n][k];
-            phi[n][k] =
-                digamma_gam[k] +
-                model->log_prob_w[k][doc->words[n]];
+            index = doc->words[n];
+            if( index < 0 || index > model->num_terms ) {
+                printf("phi for term: %d of %d\n", index, model->num_terms);
+                phi[n][k] = 0.0;
+            }
+            else {
+                phi[n][k] =
+                    digamma_gam[k] +
+                    model->log_prob_w[k][index];
+            }
 
             if (k > 0)
                 phisum = log_sum(phisum, phi[n][k]);
@@ -100,7 +107,8 @@ double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
     }
 
     likelihood = compute_likelihood(doc, model, phi, var_gamma);
-    assert(!isnan(likelihood));
+    //assert(!isnan(likelihood));
+    if( isnan(likelihood) ) { *errors = 1; }
     converged = (likelihood_old - likelihood) / likelihood_old;
     likelihood_old = likelihood;
 
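The removed assert(!isnan(likelihood)) would call abort() on a NaN bound, killing the whole Ruby process; the new short* out-parameter reports the failure and lets each caller recover. A condensed sketch of the convention the call sites in this diff follow (assuming the gem's headers and the surrounding run_em variables are in scope):

```c
short error = 0;
double likelihood = lda_inference(doc, model, var_gamma, phi, &error);
if (error) {
    /* the variational bound came back NaN: drop this document's
       contribution instead of taking down the interpreter */
    likelihood = 0.0;
}
```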
@@ -116,7 +124,8 @@ double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
 
 double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
     double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics];
-    int k, n;
+    int k = 0, n = 0, index = 0;
+    memset(dig,0.0,sizeof(dig));
 
     for (k = 0; k < model->num_topics; k++)
     {
@@ -125,7 +134,10 @@ double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
     }
     digsum = digamma(var_gamma_sum);
 
-    likelihood = lgamma(model->alpha * model->num_topics) - model->num_topics * lgamma(model->alpha) - lgamma(var_gamma_sum);
+    likelihood = lgamma(model->alpha * model->num_topics) -
+        model->num_topics *
+        lgamma(model->alpha) -
+        lgamma(var_gamma_sum);
 
     for (k = 0; k < model->num_topics; k++)
     {
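For orientation, the rewrapped assignment is the closed-form Dirichlet portion of the variational bound from Blei, Ng, and Jordan (2003). With K topics, symmetric prior α, and variational Dirichlet parameters γ, it computes

```latex
\log\Gamma(K\alpha) \;-\; K\,\log\Gamma(\alpha) \;-\; \log\Gamma\!\Big(\sum_{k=1}^{K}\gamma_k\Big)
```

with the remaining per-topic and per-word expectation terms accumulated in the loops that follow.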
@@ -135,9 +147,10 @@ double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
         {
             if (phi[n][k] > 0)
             {
+                index = doc->words[n];
                 likelihood += doc->counts[n]*
                     (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
-                    + model->log_prob_w[k][doc->words[n]]));
+                    + model->log_prob_w[k][index]));
             }
         }
     }
@@ -148,10 +161,13 @@ double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
 double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
     double likelihood;
     int n, k;
+    short error = 0;
+
+    // posterior inference
 
-    // posterior inference
+    likelihood = lda_inference(doc, model, gamma, phi, &error);
+    if (error) { likelihood = 0.0; }
 
-    likelihood = lda_inference(doc, model, gamma, phi);
 
     // update sufficient statistics
 
@@ -221,6 +237,7 @@ void run_em(char* start, char* directory, corpus* corpus) {
 
     // allocate variational parameters
 
+
     var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
     for (d = 0; d < corpus->num_docs; d++)
         var_gamma[d] = malloc(sizeof(double) * NTOPICS);
@@ -279,23 +296,22 @@ void run_em(char* start, char* directory, corpus* corpus) {
         zero_initialize_ss(ss, model);
 
         // e-step
+        printf("e-step\n");
 
         for (d = 0; d < corpus->num_docs; d++) {
             if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
             likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
         }
+        printf("m-step\n");
 
         // m-step
-        if (VERBOSE) {
-            lda_mle(model, ss, ESTIMATE_ALPHA);
-        }
-        else {
-            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
-        }
-
+        if (VERBOSE) {
+            lda_mle(model, ss, ESTIMATE_ALPHA);
+        } else {
+            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+        }
 
         // check for convergence
-
         converged = (likelihood_old - likelihood) / (likelihood_old);
         if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
         likelihood_old = likelihood;
@@ -324,10 +340,15 @@ void run_em(char* start, char* directory, corpus* corpus) {
 
     sprintf(filename, "%s/word-assignments.dat", directory);
     FILE* w_asgn_file = fopen(filename, "w");
+    short error = 0;
+    double tl = 0.0;
     for (d = 0; d < corpus->num_docs; d++)
     {
         if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
-        likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
+        error = 0;
+        tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error);
+        if( error ) { continue; }
+        likelihood += tl;
         write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
     }
     fclose(w_asgn_file);
@@ -388,7 +409,8 @@ void infer(char* model_root, char* save, corpus* corpus) {
     phi = (double**) malloc(sizeof(double*) * doc->length);
     for (n = 0; n < doc->length; n++)
         phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
-    likelihood = lda_inference(doc, model, var_gamma[d], phi);
+    short error = 0;
+    likelihood = lda_inference(doc, model, var_gamma[d], phi, &error);
 
     fprintf(fileptr, "%5.5f\n", likelihood);
 }
@@ -448,58 +470,68 @@ int main(int argc, char* argv[]) {
 
 /* */
 void run_quiet_em(char* start, corpus* corpus) {
-    int d, n;
+    int d = 0, n = 0;
     lda_model *model = NULL;
-    double **var_gamma, **phi;
+    double **var_gamma = NULL, **phi = NULL;
+    // last_gamma is a double[num_docs][num_topics]
 
     // allocate variational parameters
 
-    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
-    for (d = 0; d < corpus->num_docs; d++)
-        var_gamma[d] = malloc(sizeof(double) * NTOPICS);
+
+    var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
+    memset(var_gamma, 0.0, corpus->num_docs);
+
+    for (d = 0; d < corpus->num_docs; ++d) {
+        var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
+        memset(var_gamma[d], 0.0, sizeof(double)*NTOPICS);
+    }
 
     int max_length = max_corpus_length(corpus);
-    phi = malloc(sizeof(double*)*max_length);
-    for (n = 0; n < max_length; n++)
-        phi[n] = malloc(sizeof(double) * NTOPICS);
+
+    phi = (double**)malloc(sizeof(double*)*max_length);
+    memset(phi, 0.0, max_length);
+    for (n = 0; n < max_length; ++n) {
+        phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
+        memset(phi[n], 0.0, sizeof(double)*NTOPICS);
+    }
 
     // initialize model
 
     lda_suffstats* ss = NULL;
-    if (strcmp(start, "seeded")==0) {
+    if (strncmp(start, "seeded",6)==0) {
         model = new_lda_model(corpus->num_terms, NTOPICS);
-        ss = new_lda_suffstats(model);
-        if (VERBOSE) {
-            corpus_initialize_ss(ss, model, corpus);
-        } else {
-            quiet_corpus_initialize_ss(ss, model, corpus);
-        }
-        if (VERBOSE) {
-            lda_mle(model, ss, 0);
-        } else {
-            quiet_lda_mle(model, ss, 0);
-        }
         model->alpha = INITIAL_ALPHA;
-    } else if (strcmp(start, "fixed")==0) {
-        model = new_lda_model(corpus->num_terms, NTOPICS);
         ss = new_lda_suffstats(model);
-        corpus_initialize_fixed_ss(ss, model, corpus);
         if (VERBOSE) {
-            lda_mle(model, ss, 0);
+            corpus_initialize_ss(ss, model, corpus);
+        } else {
+            quiet_corpus_initialize_ss(ss, model, corpus);
+        }
+        if (VERBOSE) {
+            lda_mle(model, ss, 0);
         } else {
-            quiet_lda_mle(model, ss, 0);
+            quiet_lda_mle(model, ss, 0);
         }
-        model->alpha = INITIAL_ALPHA;
-    } else if (strcmp(start, "random")==0) {
+    } else if (strncmp(start, "fixed",5)==0) {
+        model = new_lda_model(corpus->num_terms, NTOPICS);
+        model->alpha = INITIAL_ALPHA;
+        ss = new_lda_suffstats(model);
+        corpus_initialize_fixed_ss(ss, model, corpus);
+        if (VERBOSE) {
+            lda_mle(model, ss, 0);
+        } else {
+            quiet_lda_mle(model, ss, 0);
+        }
+    } else if (strncmp(start, "random",6)==0) {
         model = new_lda_model(corpus->num_terms, NTOPICS);
+        model->alpha = INITIAL_ALPHA;
         ss = new_lda_suffstats(model);
         random_initialize_ss(ss, model);
         if (VERBOSE) {
-            lda_mle(model, ss, 0);
+            lda_mle(model, ss, 0);
         } else {
-            quiet_lda_mle(model, ss, 0);
+            quiet_lda_mle(model, ss, 0);
         }
-        model->alpha = INITIAL_ALPHA;
     } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
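A side note on the allocate-then-zero pattern above: the per-row memsets clear sizeof(double)*NTOPICS bytes, while the memsets on the pointer tables pass an element count where memset expects a byte count. calloc expresses the same intent in one step; a minimal sketch reusing names from the diff (not the code the gem ships):

```c
/* sketch: num_docs zeroed row pointers, then NTOPICS zeroed doubles per row */
double **var_gamma = (double **)calloc(corpus->num_docs, sizeof(double *));
for (d = 0; d < corpus->num_docs; ++d)
    var_gamma[d] = (double *)calloc(NTOPICS, sizeof(double));
```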
@@ -512,12 +544,11 @@ void run_quiet_em(char* start, corpus* corpus) {
     // run expectation maximization
 
     int i = 0;
-    double likelihood, likelihood_old = 0, converged = 1;
+    double likelihood = 0.0, likelihood_old = 0, converged = 1;
 
     while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
         i++;
-        if (VERBOSE)
-            printf("**** em iteration %d ****\n", i);
+        if (VERBOSE) printf("**** em iteration %d ****\n", i);
         likelihood = 0;
         zero_initialize_ss(ss, model);
 
@@ -529,12 +560,11 @@ void run_quiet_em(char* start, corpus* corpus) {
         }
 
         // m-step
-        if (VERBOSE) {
-            lda_mle(model, ss, ESTIMATE_ALPHA);
-        }
-        else {
-            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
-        }
+        if (VERBOSE) {
+            lda_mle(model, ss, ESTIMATE_ALPHA);
+        } else {
+            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+        }
 
         // check for convergence
 
@@ -546,14 +576,16 @@ void run_quiet_em(char* start, corpus* corpus) {
 
         last_model = model;
         last_gamma = var_gamma;
-
+        last_phi = phi;
     }
 
     // output the final model
 
     last_model = model;
     last_gamma = var_gamma;
-
+    last_phi = phi;
+
+    free_lda_suffstats(model,ss);
 
     // output the word assignments (for visualization)
     /*
@@ -585,6 +617,7 @@ void run_quiet_em(char* start, corpus* corpus) {
 static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
     INITIAL_ALPHA = NUM2DBL(init_alpha);
     NTOPICS = NUM2INT(num_topics);
+    if( NTOPICS < 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); }
     VAR_MAX_ITER = NUM2INT(max_iter);
     VAR_CONVERGED = (float)NUM2DBL(convergence);
     EM_MAX_ITER = NUM2INT(em_max_iter);
@@ -798,8 +831,11 @@ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
         c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
         c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
         for (j = 0; j < c->docs[i].length; j++) {
-            int one_word = NUM2INT(rb_ary_entry(words, j));
-            int one_count = NUM2INT(rb_ary_entry(counts, j));
+            int one_word = NUM2INT(rb_ary_entry(words, j));
+            int one_count = NUM2INT(rb_ary_entry(counts, j));
+            if( one_word > c->num_terms ) {
+                rb_raise(rb_eRuntimeError, "error term count(%d) less then word index(%d)", c->num_terms, one_word);
+            }
             c->docs[i].words[j] = one_word;
             c->docs[i].counts[j] = one_count;
         }
@@ -850,12 +886,13 @@ static VALUE wrap_get_phi(VALUE self) {
     VALUE arr = rb_ary_new2(last_corpus->num_docs);
     int i = 0, j = 0, k = 0;
 
-    int max_length = max_corpus_length(last_corpus);
+    //int max_length = max_corpus_length(last_corpus);
+    short error = 0;
 
     for (i = 0; i < last_corpus->num_docs; i++) {
         VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
 
-        lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi);
+        lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);
 
         for (j = 0; j < last_corpus->docs[i].length; j++) {
             VALUE arr2 = rb_ary_new2(last_model->num_topics);
@@ -968,4 +1005,4 @@ void Init_lda_ext() {
     rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
 }
 
-#endif
+#endif
data/lib/lda-inference.h CHANGED
@@ -4,7 +4,6 @@
 #include <stdlib.h>
 #include <math.h>
 #include <float.h>
-#include <assert.h>
 #include "lda.h"
 #include "utils.h"
 
@@ -31,7 +30,7 @@ enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE;
 
 
 
-double lda_inference(document*, lda_model*, double*, double**);
+double lda_inference(document*, lda_model*, double*, double**, short*);
 double compute_likelihood(document*, lda_model*, double**, double*);
 
 
@@ -61,4 +60,4 @@ void infer(char* model_root,
            corpus* corpus);
 
 
-#endif
+#endif
data/lib/lda-model.c CHANGED
@@ -18,6 +18,7 @@
 // USA
 
 #include "lda-model.h"
+#include <string.h>
 
 
 /*
@@ -88,25 +89,38 @@ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
 */
 
 lda_suffstats* new_lda_suffstats(lda_model* model) {
+    register int i;
     int num_topics = model->num_topics;
     int num_terms = model->num_terms;
-    int i,j;
 
-    lda_suffstats* ss = malloc(sizeof(lda_suffstats));
-    ss->class_total = malloc(sizeof(double)*num_topics);
-    ss->class_word = malloc(sizeof(double*)*num_topics);
-    for (i = 0; i < num_topics; i++)
-    {
+    lda_suffstats* ss = (lda_suffstats*)malloc(sizeof(lda_suffstats));
+    memset(ss,0,sizeof(lda_suffstats));
+    ss->class_total = (double*)malloc(sizeof(double)*num_topics);
+    ss->class_word = (double**)malloc(sizeof(double*)*num_topics);
+
+    for (i = 0; i < num_topics; ++i) {
         ss->class_total[i] = 0;
-        ss->class_word[i] = malloc(sizeof(double)*num_terms);
-        for (j = 0; j < num_terms; j++)
-        {
-            ss->class_word[i][j] = 0;
-        }
+        ss->class_word[i] = (double*)malloc(sizeof(double)*num_terms);
+        memset(ss->class_word[i],0.0,sizeof(double)*num_terms);
     }
+
     return(ss);
 }
+/*
+ * deallocate new lda suffstats
+ *
+ */
+void free_lda_suffstats(lda_model* model, lda_suffstats* ss) {
+    int i;
+    int num_topics = model->num_topics;
 
+    free(ss->class_total);
+    for (i = 0; i < num_topics; ++i) {
+        free(ss->class_word[i]);
+    }
+    free(ss->class_word);
+    free(ss);
+}
 
 /*
  * various intializations for the sufficient statistics
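new_lda_suffstats now has a matching destructor, so the statistics can be released between runs instead of leaking. A minimal sketch of the paired lifetime, with num_terms and num_topics standing in for real values:

```c
lda_model* model = new_lda_model(num_terms, num_topics);
lda_suffstats* ss = new_lda_suffstats(model);

/* ... the E-step accumulates class_word/class_total counts into ss ... */

free_lda_suffstats(model, ss);  /* rows first, then the row table, then ss */
free_lda_model(model);
```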
@@ -132,6 +146,7 @@ void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
     int num_topics = model->num_topics;
     int num_terms = model->num_terms;
     int k, n;
+
     for (k = 0; k < num_topics; k++)
     {
         for (n = 0; n < num_terms; n++)
@@ -221,7 +236,7 @@ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c)
 */
 
 lda_model* new_lda_model(int num_terms, int num_topics) {
-    int i,j;
+    int i;
     lda_model* model;
 
     model = malloc(sizeof(lda_model));
@@ -229,11 +244,11 @@ lda_model* new_lda_model(int num_terms, int num_topics) {
     model->num_terms = num_terms;
     model->alpha = 1.0;
     model->log_prob_w = malloc(sizeof(double*)*num_topics);
+    printf("new model with: %d topics and %d terms\n", num_topics, num_terms);
     for (i = 0; i < num_topics; i++)
     {
         model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
-        for (j = 0; j < num_terms; j++)
-            model->log_prob_w[i][j] = 0;
+        memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
     }
     return(model);
 }
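Replacing the element-by-element zeroing loop with memset relies on 0.0 having an all-bits-zero representation, which holds on the IEEE-754 platforms this extension targets in practice. A self-contained check of that assumption:

```c
#include <assert.h>
#include <string.h>

int main(void) {
    double row[8];
    memset(row, 0, sizeof(row));   /* all-bits-zero reads back as 0.0 on IEEE-754 */
    for (int j = 0; j < 8; j++)
        assert(row[j] == 0.0);
    return 0;
}
```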
data/lib/lda-model.h CHANGED
@@ -16,6 +16,7 @@ void free_lda_model(lda_model*);
 void save_lda_model(lda_model*, char*);
 lda_model* new_lda_model(int, int);
 lda_suffstats* new_lda_suffstats(lda_model* model);
+void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
 void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
 void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
 void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
data/lib/lda.rb CHANGED
@@ -30,10 +30,10 @@ module Lda
     def add_document(doc)
       if doc.is_a?(Document)
         @documents << doc
-        @all_terms = @all_terms + doc.words
+        @all_terms += doc.words
       elsif doc.is_a?(String)
         d = Document.new(doc)
-        @all_terms = @all_terms + d.words
+        @all_terms += d.words
         @documents << d
       end
       @num_docs += 1
@@ -51,11 +51,25 @@ module Lda
       true
     end
   end
+
+  class BaseDocument
+    def words
+      raise NotSupportedError
+    end
+
+    def length
+      raise NotSupportedError
+    end
+
+    def total
+      raise NotSupportedError
+    end
+  end
 
   #
   # A single document.
   #
-  class Document
+  class Document < BaseDocument
     attr_accessor :words, :counts
     attr_reader :length, :total
 
@@ -222,26 +236,20 @@ module Lda
         return nil
       end
 
-      # Load the model
-      beta = self.beta
-      unless beta
-        puts "Model has not been run."
-        return nil
-      end
-
       # find the highest scoring words per topic
       topics = Hash.new
-      indices = (0...@vocab.size).to_a
-      topic_num = 0
-      beta.each do |topic|
-        indices.sort! {|x, y| -(topic[x] <=> topic[y])}
-        topics[topic_num] = Array.new
-        words_per_topic.times do |i|
-          topics[topic_num] << @vocab[indices[i]]
+      indices = (0...@vocab.size).to_a
+
+      begin
+        beta.each_with_index do |topic, topic_idx|
+          indices.sort! {|x, y| -(topic[x] <=> topic[y])}
+          topics[topic_idx] = indices.first(words_per_topic).map { |i| @vocab[i] }
         end
-        topic_num += 1
+      rescue NoMethodError
+        puts "Error: model has not been run."
+        topics = nil
       end
-
+
       topics
     end
@@ -308,4 +316,4 @@ module Lda
 end
 
 # load the c-side stuff
-require 'lda_ext'
+require 'lda_ext'
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ealdent-lda-ruby
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.3
 platform: ruby
 authors:
 - Jason M. Adams
@@ -10,21 +10,31 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-
+date: 2009-07-19 00:00:00 -07:00
 default_executable:
-dependencies: []
-
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stemmer
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
 description:
 email: jasonmadams@gmail.com
 executables: []
 
 extensions:
 - lib/extconf.rb
-extra_rdoc_files: []
-
+extra_rdoc_files:
+- README
+- README.markdown
 files:
 - README
--
+- VERSION.yml
 - lib/cokus.c
 - lib/cokus.h
 - lib/extconf.rb
@@ -40,12 +50,12 @@ files:
 - lib/lda.rb
 - lib/utils.c
 - lib/utils.h
--
-has_rdoc: true
+- license.txt
+- README.markdown
+has_rdoc: false
 homepage: http://github.com/ealdent/lda-ruby
 post_install_message:
 rdoc_options:
-- --inline-source
 - --charset=UTF-8
 require_paths:
 - lib
@@ -66,7 +76,7 @@ requirements: []
 rubyforge_project:
 rubygems_version: 1.2.0
 signing_key:
-specification_version: 2
+specification_version: 3
 summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
 test_files: []
 