lda-ruby 0.3.1

cokus.h (new file)
@@ -0,0 +1,27 @@
1
+ #ifndef COKUS_H
2
+ #define COKUS_H
3
+
4
+ #include <stdio.h>
5
+ #include <stdlib.h>
6
+
7
+ //
8
+ // uint32 must be an unsigned integer type capable of holding at least 32
9
+ // bits; exactly 32 should be fastest, but 64 is better on an Alpha with
10
+ // GCC at -O3 optimization so try your options and see what's best for you
11
+ //
12
+
13
+ typedef unsigned long uint32;
14
+
15
+ #define N (624) // length of state vector
16
+ #define M (397) // a period parameter
17
+ #define K (0x9908B0DFU) // a magic constant
18
+ #define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u
19
+ #define loBit(u) ((u) & 0x00000001U) // mask all but lowest bit of u
20
+ #define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u
21
+ #define mixBits(u, v) (hiBit(u)|loBits(v)) // move hi bit of u to hi bit of v
22
+
23
+ void seedMT(uint32 seed);
24
+ uint32 reloadMT(void);
25
+ uint32 randomMT(void);
26
+
27
+ #endif
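
For context, these three functions are the entire random-number interface used by the estimator below (its main() seeds the generator from the current time). A minimal usage sketch, assuming the header is paired with the usual cokus.c Mersenne Twister implementation, which this diff does not include:

    #include <stdio.h>
    #include <time.h>
    #include "cokus.h"

    int main(void)
    {
        /* seed once, e.g. from the clock, just as the LDA estimator does */
        seedMT((uint32) time(NULL));

        /* randomMT() returns uniformly distributed 32-bit unsigned values */
        int i;
        for (i = 0; i < 5; i++)
            printf("%lu\n", (unsigned long) randomMT());

        return 0;
    }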
extconf.rb (new file)
@@ -0,0 +1,9 @@
1
+ ENV["ARCHFLAGS"] = "-arch #{`uname -p` =~ /powerpc/ ? 'ppc' : 'i386'}"
2
+
3
+ require 'mkmf'
4
+
5
+ $CFLAGS << ' -Wall -ggdb -O0'
6
+ $defs.push( "-D USE_RUBY" )
7
+
8
+ dir_config('lda-ruby/lda')
9
+ create_makefile("lda-ruby/lda")
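
This is a standard mkmf extension config: running "ruby extconf.rb" followed by "make" builds the lda-ruby/lda extension, and the -D USE_RUBY define pushed onto $defs is what enables the Ruby bindings guarded by #ifdef USE_RUBY in the C sources below.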
lda-alpha.c (new file)
@@ -0,0 +1,96 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-alpha.h"
21
+
22
+ /*
23
+ * objective function and its derivatives
24
+ *
25
+ */
26
+
27
+ double alhood(double a, double ss, int D, int K)
28
+ { return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); }
29
+
30
+ double d_alhood(double a, double ss, int D, int K)
31
+ { return(D * (K * digamma(K * a) - K * digamma(a)) + ss); }
32
+
33
+ double d2_alhood(double a, int D, int K)
34
+ { return(D * (K * K * trigamma(K * a) - K * trigamma(a))); }
35
+
36
+
37
+ /*
38
+ * newtons method
39
+ *
40
+ */
41
+
42
+ double opt_alpha(double ss, int D, int K)
43
+ {
44
+ double a, log_a, init_a = 100;
45
+ double f, df, d2f;
46
+ int iter = 0;
47
+
48
+ log_a = log(init_a);
49
+ do
50
+ {
51
+ iter++;
52
+ a = exp(log_a);
53
+ if (isnan(a))
54
+ {
55
+ init_a = init_a * 10;
56
+ printf("warning : alpha is nan; new init = %5.5f\n", init_a);
57
+ a = init_a;
58
+ log_a = log(a);
59
+ }
60
+ f = alhood(a, ss, D, K);
61
+ df = d_alhood(a, ss, D, K);
62
+ d2f = d2_alhood(a, D, K);
63
+ log_a = log_a - df/(d2f * a + df);
64
+ printf("alpha maximization : %5.5f %5.5f\n", f, df);
65
+ }
66
+ while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
67
+ return(exp(log_a));
68
+ }
69
+
70
+ double quiet_opt_alpha(double ss, int D, int K)
71
+ {
72
+ double a, log_a, init_a = 100;
73
+ double f, df, d2f;
74
+ int iter = 0;
75
+
76
+ log_a = log(init_a);
77
+ do
78
+ {
79
+ iter++;
80
+ a = exp(log_a);
81
+ if (isnan(a))
82
+ {
83
+ init_a = init_a * 10;
84
+ //printf("warning : alpha is nan; new init = %5.5f\n", init_a);
85
+ a = init_a;
86
+ log_a = log(a);
87
+ }
88
+ f = alhood(a, ss, D, K);
89
+ df = d_alhood(a, ss, D, K);
90
+ d2f = d2_alhood(a, D, K);
91
+ log_a = log_a - df/(d2f * a + df);
92
+ //printf("alpha maximization : %5.5f %5.5f\n", f, df);
93
+ }
94
+ while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
95
+ return(exp(log_a));
96
+ }
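
Both optimizers take the Newton step on log alpha rather than on alpha itself, which keeps alpha positive throughout. Writing f for alhood, the update line log_a = log_a - df/(d2f * a + df) is the ordinary Newton step expressed in log space:

    \[ \frac{df}{d\log a} = a\,f'(a), \qquad \frac{d^{2}f}{d(\log a)^{2}} = a^{2}f''(a) + a\,f'(a), \]
    \[ \log a \;\leftarrow\; \log a - \frac{a\,f'(a)}{a^{2}f''(a) + a\,f'(a)} \;=\; \log a - \frac{f'(a)}{a\,f''(a) + f'(a)}. \]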
lda-alpha.h (new file)
@@ -0,0 +1,21 @@
1
+ #ifndef LDA_ALPHA_H
2
+ #define LDA_ALPHA_H
3
+
4
+ #include <stdlib.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+
8
+ #include "lda.h"
9
+ #include "utils.h"
10
+
11
+ #define NEWTON_THRESH 1e-5
12
+ #define MAX_ALPHA_ITER 1000
13
+
14
+ double alhood(double a, double ss, int D, int K);
15
+ double d_alhood(double a, double ss, int D, int K);
16
+ double d2_alhood(double a, int D, int K);
17
+ double opt_alpha(double ss, int D, int K);
18
+ double quiet_opt_alpha(double ss, int D, int K);
19
+ //void maximize_alpha(double** gamma, lda_model* model, int num_docs);
20
+
21
+ #endif
lda-data.c (new file)
@@ -0,0 +1,67 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-data.h"
21
+
22
+ corpus* read_data(char* data_filename)
23
+ {
24
+ FILE *fileptr;
25
+ int length, count, word, n, nd, nw;
26
+ corpus* c;
27
+
28
+ printf("reading data from %s\n", data_filename);
29
+ c = malloc(sizeof(corpus));
30
+ c->docs = 0;
31
+ c->num_terms = 0;
32
+ c->num_docs = 0;
33
+ fileptr = fopen(data_filename, "r");
34
+ nd = 0; nw = 0;
35
+ while ((fscanf(fileptr, "%10d", &length) != EOF))
36
+ {
37
+ c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1));
38
+ c->docs[nd].length = length;
39
+ c->docs[nd].total = 0;
40
+ c->docs[nd].words = malloc(sizeof(int)*length);
41
+ c->docs[nd].counts = malloc(sizeof(int)*length);
42
+ for (n = 0; n < length; n++)
43
+ {
44
+ fscanf(fileptr, "%10d:%10d", &word, &count);
45
+ word = word - OFFSET;
46
+ c->docs[nd].words[n] = word;
47
+ c->docs[nd].counts[n] = count;
48
+ c->docs[nd].total += count;
49
+ if (word >= nw) { nw = word + 1; }
50
+ }
51
+ nd++;
52
+ }
53
+ fclose(fileptr);
54
+ c->num_docs = nd;
55
+ c->num_terms = nw;
56
+ printf("number of docs : %d\n", nd);
57
+ printf("number of terms : %d\n", nw);
58
+ return(c);
59
+ }
60
+
61
+ int max_corpus_length(corpus* c)
62
+ {
63
+ int n, max = 0;
64
+ for (n = 0; n < c->num_docs; n++)
65
+ if (c->docs[n].length > max) max = c->docs[n].length;
66
+ return(max);
67
+ }
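
For reference, the fscanf calls above define the input layout: each document is one line in the standard LDA-C bag-of-words format, [M] [term_1]:[count_1] ... [term_M]:[count_M], where M is the number of distinct terms in the document and term indices are zero-based (OFFSET is 0). A hypothetical three-document corpus file would look like:

    5 0:2 6:1 13:4 20:1 31:2
    2 4:1 13:3
    4 1:1 2:2 20:5 30:1

read_data grows the docs array one document per line and sets num_terms to one past the largest term index it sees.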
lda-data.h (new file)
@@ -0,0 +1,14 @@
1
+ #ifndef LDA_DATA_H
2
+ #define LDA_DATA_H
3
+
4
+ #include <stdio.h>
5
+ #include <stdlib.h>
6
+
7
+ #include "lda.h"
8
+
9
+ #define OFFSET 0 // offset for reading data
10
+
11
+ corpus* read_data(char* data_filename);
12
+ int max_corpus_length(corpus* c);
13
+
14
+ #endif
lda.c (new file)
@@ -0,0 +1,1007 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include <stdlib.h>
21
+ #include <stdio.h>
22
+ #include <math.h>
23
+ #include <float.h>
24
+ #include <string.h>
25
+ #include <time.h>
26
+
27
+ #include "lda.h"
28
+ #include "lda-data.h"
29
+ #include "lda-inference.h"
30
+ #include "lda-model.h"
31
+ #include "utils.h"
32
+ #include "cokus.h"
33
+
34
+ #ifdef USE_RUBY
35
+ #include "ruby.h"
36
+
37
+ VALUE rb_cLdaModule;
38
+ VALUE rb_cLda;
39
+ VALUE rb_cLdaCorpus;
40
+ VALUE rb_cLdaDocument;
41
+ #endif
42
+
43
+
44
+
45
+ /*
46
+ * variational inference
47
+ */
48
+
49
+ double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
50
+ double converged = 1;
51
+ double phisum = 0, likelihood = 0;
52
+ double likelihood_old = 0, oldphi[model->num_topics];
53
+ int k = 0, n = 0, var_iter = 0, index = 0;
54
+ double digamma_gam[model->num_topics];
55
+
56
+ /* zero'em out */
57
+ memset(digamma_gam,0.0,sizeof(digamma_gam));
58
+ memset(oldphi,0.0,sizeof(oldphi));
59
+
60
+ // compute posterior dirichlet
61
+
62
+ for (k = 0; k < model->num_topics; k++)
63
+ {
64
+ var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
65
+ digamma_gam[k] = digamma(var_gamma[k]);
66
+ for (n = 0; n < doc->length; n++)
67
+ phi[n][k] = 1.0/model->num_topics;
68
+ }
69
+ var_iter = 0;
70
+
71
+ while ((converged > VAR_CONVERGED) &&
72
+ ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
73
+ {
74
+ var_iter++;
75
+ for (n = 0; n < doc->length; n++)
76
+ {
77
+ phisum = 0;
78
+ for (k = 0; k < model->num_topics; k++)
79
+ {
80
+ oldphi[k] = phi[n][k];
81
+ index = doc->words[n];
82
+ if( index < 0 || index >= model->num_terms ) {
83
+ printf("phi for term: %d of %d\n", index, model->num_terms);
84
+ phi[n][k] = 0.0;
85
+ }
86
+ else {
87
+ phi[n][k] =
88
+ digamma_gam[k] +
89
+ model->log_prob_w[k][index];
90
+ }
91
+
92
+ if (k > 0)
93
+ phisum = log_sum(phisum, phi[n][k]);
94
+ else
95
+ phisum = phi[n][k]; // note, phi is in log space
96
+ }
97
+
98
+ for (k = 0; k < model->num_topics; k++)
99
+ {
100
+ phi[n][k] = exp(phi[n][k] - phisum);
101
+ var_gamma[k] =
102
+ var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
103
+ // !!! a lot of extra digammas here because of how we're computing it
105
+ // !!! but it's more automatically updated too.
105
+ digamma_gam[k] = digamma(var_gamma[k]);
106
+ }
107
+ }
108
+
109
+ likelihood = compute_likelihood(doc, model, phi, var_gamma);
110
+ //assert(!isnan(likelihood));
111
+ if( isnan(likelihood) ) { *errors = 1; }
112
+ converged = (likelihood_old - likelihood) / likelihood_old;
113
+ likelihood_old = likelihood;
114
+
115
+ // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
116
+ }
117
+ return(likelihood);
118
+ }
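
In the notation of variational LDA, with c_n the count of word w_n and beta_{k,w} = exp(log_prob_w[k][w]), the loop above is the usual coordinate-ascent update pair:

    \[ \phi_{nk} \;\propto\; \exp\!\big(\Psi(\gamma_k)\big)\,\beta_{k,w_n}, \qquad \gamma_k \;=\; \alpha + \sum_{n} c_n\,\phi_{nk}, \]

with phi normalized over k in log space via log_sum; the incremental var_gamma[k] += counts[n]*(phi[n][k] - oldphi[k]) is this gamma update applied one word at a time.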
119
+
120
+
121
+ /*
122
+ * compute likelihood bound
123
+ */
124
+
125
+ double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
126
+ double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics];
127
+ int k = 0, n = 0, index = 0;
128
+ memset(dig,0.0,sizeof(dig));
129
+
130
+ for (k = 0; k < model->num_topics; k++)
131
+ {
132
+ dig[k] = digamma(var_gamma[k]);
133
+ var_gamma_sum += var_gamma[k];
134
+ }
135
+ digsum = digamma(var_gamma_sum);
136
+
137
+ likelihood = lgamma(model->alpha * model->num_topics) -
138
+ model->num_topics *
139
+ lgamma(model->alpha) -
140
+ lgamma(var_gamma_sum);
141
+
142
+ for (k = 0; k < model->num_topics; k++)
143
+ {
144
+ likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum);
145
+
146
+ for (n = 0; n < doc->length; n++)
147
+ {
148
+ if (phi[n][k] > 0)
149
+ {
150
+ index = doc->words[n];
151
+ likelihood += doc->counts[n]*
152
+ (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
153
+ + model->log_prob_w[k][index]));
154
+ }
155
+ }
156
+ }
157
+ return(likelihood);
158
+ }
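
Spelled out, the quantity accumulated above is the per-document variational lower bound, with Psi the digamma function and Delta_k = Psi(gamma_k) - Psi(sum_j gamma_j):

    \[ \mathcal{L}(\gamma,\phi) = \log\Gamma(K\alpha) - K\log\Gamma(\alpha) - \log\Gamma\!\Big(\sum_k \gamma_k\Big) + \sum_{k}\Big[(\alpha-1)\,\Delta_k + \log\Gamma(\gamma_k) - (\gamma_k-1)\,\Delta_k\Big] + \sum_{n}\sum_{k} c_n\,\phi_{nk}\Big[\Delta_k - \log\phi_{nk} + \log\beta_{k,w_n}\Big]. \]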
159
+
160
+
161
+ double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
162
+ double likelihood;
163
+ int n, k;
164
+ short error = 0;
165
+
166
+ // posterior inference
167
+
168
+ likelihood = lda_inference(doc, model, gamma, phi, &error);
169
+ if (error) { likelihood = 0.0; }
170
+
171
+
172
+ // update sufficient statistics
173
+
174
+ double gamma_sum = 0;
175
+ for (k = 0; k < model->num_topics; k++)
176
+ {
177
+ gamma_sum += gamma[k];
178
+ ss->alpha_suffstats += digamma(gamma[k]);
179
+ }
180
+ ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);
181
+
182
+ for (n = 0; n < doc->length; n++)
183
+ {
184
+ for (k = 0; k < model->num_topics; k++)
185
+ {
186
+ ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
187
+ ss->class_total[k] += doc->counts[n]*phi[n][k];
188
+ }
189
+ }
190
+
191
+ ss->num_docs = ss->num_docs + 1;
192
+
193
+ return(likelihood);
194
+ }
195
+
196
+
197
+ /*
198
+ * writes the word assignments line for a document to a file
199
+ */
200
+
201
+ void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model) {
202
+ int n;
203
+
204
+ fprintf(f, "%03d", doc->length);
205
+ for (n = 0; n < doc->length; n++) {
206
+ fprintf(f, " %04d:%02d", doc->words[n], argmax(phi[n], model->num_topics));
207
+ }
208
+ fprintf(f, "\n");
209
+ fflush(f);
210
+ }
211
+
212
+
213
+ /*
214
+ * saves the gamma parameters of the current dataset
215
+ */
216
+
217
+ void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) {
218
+ FILE* fileptr;
219
+ int d, k;
220
+ fileptr = fopen(filename, "w");
221
+
222
+ for (d = 0; d < num_docs; d++) {
223
+ fprintf(fileptr, "%5.10f", gamma[d][0]);
224
+ for (k = 1; k < num_topics; k++) {
225
+ fprintf(fileptr, " %5.10f", gamma[d][k]);
226
+ }
227
+ fprintf(fileptr, "\n");
228
+ }
229
+ fclose(fileptr);
230
+ }
231
+
232
+
233
+ void run_em(char* start, char* directory, corpus* corpus) {
234
+ int d, n;
235
+ lda_model *model = NULL;
236
+ double **var_gamma, **phi;
237
+
238
+ // allocate variational parameters
239
+
240
+
241
+ var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
242
+ for (d = 0; d < corpus->num_docs; d++)
243
+ var_gamma[d] = malloc(sizeof(double) * NTOPICS);
244
+
245
+ int max_length = max_corpus_length(corpus);
246
+ phi = malloc(sizeof(double*)*max_length);
247
+ for (n = 0; n < max_length; n++)
248
+ phi[n] = malloc(sizeof(double) * NTOPICS);
249
+
250
+ // initialize model
251
+
252
+ char filename[100];
253
+
254
+ lda_suffstats* ss = NULL;
255
+ if (strcmp(start, "seeded")==0) {
256
+ model = new_lda_model(corpus->num_terms, NTOPICS);
257
+ ss = new_lda_suffstats(model);
258
+ corpus_initialize_ss(ss, model, corpus);
259
+ if (VERBOSE) {
260
+ lda_mle(model, ss, 0);
261
+ } else {
262
+ quiet_lda_mle(model, ss, 0);
263
+ }
264
+
265
+ model->alpha = INITIAL_ALPHA;
266
+ } else if (strcmp(start, "random")==0) {
267
+ model = new_lda_model(corpus->num_terms, NTOPICS);
268
+ ss = new_lda_suffstats(model);
269
+ random_initialize_ss(ss, model);
270
+ if (VERBOSE) {
271
+ lda_mle(model, ss, 0);
272
+ } else {
273
+ quiet_lda_mle(model, ss, 0);
274
+ }
275
+ model->alpha = INITIAL_ALPHA;
276
+ } else {
277
+ model = load_lda_model(start);
278
+ ss = new_lda_suffstats(model);
279
+ }
280
+
281
+ sprintf(filename,"%s/000",directory);
282
+ save_lda_model(model, filename);
283
+
284
+ // run expectation maximization
285
+
286
+ int i = 0;
287
+ double likelihood, likelihood_old = 0, converged = 1;
288
+ sprintf(filename, "%s/likelihood.dat", directory);
289
+ FILE* likelihood_file = fopen(filename, "w");
290
+
291
+ while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
292
+ i++;
293
+ if (VERBOSE)
294
+ printf("**** em iteration %d ****\n", i);
295
+ likelihood = 0;
296
+ zero_initialize_ss(ss, model);
297
+
298
+ // e-step
299
+ printf("e-step\n");
300
+
301
+ for (d = 0; d < corpus->num_docs; d++) {
302
+ if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
303
+ likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
304
+ }
305
+ printf("m-step\n");
306
+
307
+ // m-step
308
+ if (VERBOSE) {
309
+ lda_mle(model, ss, ESTIMATE_ALPHA);
310
+ } else {
311
+ quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
312
+ }
313
+
314
+ // check for convergence
315
+ converged = (likelihood_old - likelihood) / (likelihood_old);
316
+ if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
317
+ likelihood_old = likelihood;
318
+
319
+ // output model and likelihood
320
+
321
+ fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
322
+ fflush(likelihood_file);
323
+ if ((i % LAG) == 0)
324
+ {
325
+ sprintf(filename,"%s/%03d",directory, i);
326
+ save_lda_model(model, filename);
327
+ sprintf(filename,"%s/%03d.gamma",directory, i);
328
+ save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
329
+ }
330
+ }
331
+
332
+ // output the final model
333
+
334
+ sprintf(filename,"%s/final",directory);
335
+ save_lda_model(model, filename);
336
+ sprintf(filename,"%s/final.gamma",directory);
337
+ save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
338
+
339
+ // output the word assignments (for visualization)
340
+
341
+ sprintf(filename, "%s/word-assignments.dat", directory);
342
+ FILE* w_asgn_file = fopen(filename, "w");
343
+ short error = 0;
344
+ double tl = 0.0;
345
+ for (d = 0; d < corpus->num_docs; d++)
346
+ {
347
+ if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
348
+ error = 0;
349
+ tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error);
350
+ if( error ) { continue; }
351
+ likelihood += tl;
352
+ write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
353
+ }
354
+ fclose(w_asgn_file);
355
+ fclose(likelihood_file);
356
+ }
357
+
358
+
359
+ /*
360
+ * read settings.
361
+ */
362
+
363
+ void read_settings(char* filename) {
364
+ FILE* fileptr;
365
+ char alpha_action[100];
366
+ fileptr = fopen(filename, "r");
367
+ fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
368
+ fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
369
+ fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
370
+ fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
371
+ fscanf(fileptr, "alpha %s", alpha_action);
372
+ if (strcmp(alpha_action, "fixed")==0)
373
+ {
374
+ ESTIMATE_ALPHA = 0;
375
+ }
376
+ else
377
+ {
378
+ ESTIMATE_ALPHA = 1;
379
+ }
380
+ fclose(fileptr);
381
+ }
382
+
383
+
384
+
385
+
386
+ /*
387
+ * inference only
388
+ *
389
+ */
390
+
391
+ void infer(char* model_root, char* save, corpus* corpus) {
392
+ FILE* fileptr;
393
+ char filename[100];
394
+ int i, d, n;
395
+ lda_model *model;
396
+ double **var_gamma, likelihood, **phi;
397
+ document* doc;
398
+
399
+ model = load_lda_model(model_root);
400
+ var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
401
+ for (i = 0; i < corpus->num_docs; i++)
402
+ var_gamma[i] = malloc(sizeof(double)*model->num_topics);
403
+ sprintf(filename, "%s-lda-lhood.dat", save);
404
+ fileptr = fopen(filename, "w");
405
+ for (d = 0; d < corpus->num_docs; d++) {
406
+ if (((d % 100) == 0) && (d>0) && VERBOSE) printf("document %d\n",d);
407
+
408
+ doc = &(corpus->docs[d]);
409
+ phi = (double**) malloc(sizeof(double*) * doc->length);
410
+ for (n = 0; n < doc->length; n++)
411
+ phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
412
+ short error = 0;
413
+ likelihood = lda_inference(doc, model, var_gamma[d], phi, &error);
414
+
415
+ fprintf(fileptr, "%5.5f\n", likelihood);
416
+ }
417
+ fclose(fileptr);
418
+ sprintf(filename, "%s-gamma.dat", save);
419
+ save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
420
+ }
421
+
422
+
423
+ /*
424
+ * update sufficient statistics
425
+ *
426
+ */
427
+
428
+
429
+
430
+ /*
431
+ * main
432
+ *
433
+ */
434
+
435
+ int main(int argc, char* argv[]) {
436
+ corpus* corpus;
437
+
438
+ long t1;
439
+ (void) time(&t1);
440
+ seedMT(t1);
441
+ // seedMT(4357U);
442
+
443
+ if (argc > 1)
444
+ {
445
+ if (strcmp(argv[1], "est")==0)
446
+ {
447
+ INITIAL_ALPHA = atof(argv[2]);
448
+ NTOPICS = atoi(argv[3]);
449
+ read_settings(argv[4]);
450
+ corpus = read_data(argv[5]);
451
+ make_directory(argv[7]);
452
+ run_em(argv[6], argv[7], corpus);
453
+ }
454
+ if (strcmp(argv[1], "inf")==0)
455
+ {
456
+ read_settings(argv[2]);
457
+ corpus = read_data(argv[4]);
458
+ infer(argv[3], argv[5], corpus);
459
+ }
460
+ }
461
+ else
462
+ {
463
+ printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/*] [directory]\n");
464
+ printf(" lda inf [settings] [model] [data] [name]\n");
465
+ }
466
+ return(0);
467
+ }
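
Matching those usage strings, a hypothetical estimation run would be invoked as "lda est 0.1 20 settings.txt corpus.dat random output_dir", and inference against the resulting model as "lda inf settings.txt output_dir/final test.dat test" (all file and directory names here are placeholders; run_em writes the final model under <directory>/final).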
468
+
469
+ #ifdef USE_RUBY
470
+
471
+ /* Run EM without writing output files; results are kept in the last_model, last_gamma and last_phi globals. */
472
+ void run_quiet_em(char* start, corpus* corpus) {
473
+ int d = 0, n = 0;
474
+ lda_model *model = NULL;
475
+ double **var_gamma = NULL, **phi = NULL;
476
+ // last_gamma is a double[num_docs][num_topics]
477
+
478
+ // allocate variational parameters
479
+
480
+
481
+ var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
482
+ memset(var_gamma, 0, sizeof(double*) * (corpus->num_docs));
483
+
484
+ for (d = 0; d < corpus->num_docs; ++d) {
485
+ var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
486
+ memset(var_gamma[d], 0.0, sizeof(double)*NTOPICS);
487
+ }
488
+
489
+ int max_length = max_corpus_length(corpus);
490
+
491
+ phi = (double**)malloc(sizeof(double*)*max_length);
492
+ memset(phi, 0, sizeof(double*) * max_length);
493
+ for (n = 0; n < max_length; ++n) {
494
+ phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
495
+ memset(phi[n], 0.0, sizeof(double)*NTOPICS);
496
+ }
497
+
498
+ // initialize model
499
+
500
+ lda_suffstats* ss = NULL;
501
+ if (strncmp(start, "seeded",6)==0) {
502
+ model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
503
+ model->alpha = INITIAL_ALPHA;
504
+ ss = new_lda_suffstats(model);
505
+ if (VERBOSE) {
506
+ corpus_initialize_ss(ss, model, corpus);
507
+ } else {
508
+ quiet_corpus_initialize_ss(ss, model, corpus);
509
+ }
510
+ if (VERBOSE) {
511
+ lda_mle(model, ss, 0);
512
+ } else {
513
+ quiet_lda_mle(model, ss, 0);
514
+ }
515
+ } else if (strncmp(start, "fixed",5)==0) {
516
+ model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
517
+ model->alpha = INITIAL_ALPHA;
518
+ ss = new_lda_suffstats(model);
519
+ corpus_initialize_fixed_ss(ss, model, corpus);
520
+ if (VERBOSE) {
521
+ lda_mle(model, ss, 0);
522
+ } else {
523
+ quiet_lda_mle(model, ss, 0);
524
+ }
525
+ } else if (strncmp(start, "random",6)==0) {
526
+ model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
527
+ model->alpha = INITIAL_ALPHA;
528
+ ss = new_lda_suffstats(model);
529
+ random_initialize_ss(ss, model);
530
+ if (VERBOSE) {
531
+ lda_mle(model, ss, 0);
532
+ } else {
533
+ quiet_lda_mle(model, ss, 0);
534
+ }
535
+ } else {
536
+ model = load_lda_model(start);
537
+ ss = new_lda_suffstats(model);
538
+ }
539
+
540
+ // save the model in the last_model global
541
+ last_model = model;
542
+ model_loaded = TRUE;
543
+
544
+ // run expectation maximization
545
+
546
+ int i = 0;
547
+ double likelihood = 0.0, likelihood_old = 0, converged = 1;
548
+
549
+ while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
550
+ i++;
551
+ if (VERBOSE) printf("**** em iteration %d ****\n", i);
552
+ likelihood = 0;
553
+ zero_initialize_ss(ss, model);
554
+
555
+ // e-step
556
+
557
+ for (d = 0; d < corpus->num_docs; d++) {
558
+ if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
559
+ likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
560
+ }
561
+
562
+ // m-step
563
+ if (VERBOSE) {
564
+ lda_mle(model, ss, ESTIMATE_ALPHA);
565
+ } else {
566
+ quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
567
+ }
568
+
569
+ // check for convergence
570
+
571
+ converged = (likelihood_old - likelihood) / (likelihood_old);
572
+ if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
573
+ likelihood_old = likelihood;
574
+
575
+ // store model and likelihood
576
+
577
+ last_model = model;
578
+ last_gamma = var_gamma;
579
+ last_phi = phi;
580
+ }
581
+
582
+ // output the final model
583
+
584
+ last_model = model;
585
+ last_gamma = var_gamma;
586
+ last_phi = phi;
587
+
588
+ free_lda_suffstats(model,ss);
589
+
590
+ // output the word assignments (for visualization)
591
+ /*
592
+ char filename[100];
593
+ sprintf(filename, "%s/word-assignments.dat", directory);
594
+ FILE* w_asgn_file = fopen(filename, "w");
595
+ for (d = 0; d < corpus->num_docs; d++) {
596
+ if ((d % 100) == 0)
597
+ printf("final e step document %d\n",d);
598
+ likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
599
+ write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
600
+ }
601
+ fclose(w_asgn_file);
602
+ */
603
+ }
604
+
605
+
606
+ /*
607
+ * Set all of the settings in one command:
608
+ *
609
+ * * init_alpha
610
+ * * num_topics
611
+ * * max_iter
612
+ * * convergence
613
+ * * em_max_iter
614
+ * * em_convergence
615
+ * * est_alpha
616
+ */
617
+ static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
618
+ INITIAL_ALPHA = NUM2DBL(init_alpha);
619
+ NTOPICS = NUM2INT(num_topics);
620
+ if( NTOPICS <= 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); }
621
+ VAR_MAX_ITER = NUM2INT(max_iter);
622
+ VAR_CONVERGED = (float)NUM2DBL(convergence);
623
+ EM_MAX_ITER = NUM2INT(em_max_iter);
624
+ EM_CONVERGED = (float)NUM2DBL(em_convergence);
625
+ ESTIMATE_ALPHA = NUM2INT(est_alpha);
626
+
627
+ return Qtrue;
628
+ }
629
+
630
+ /*
631
+ * Get the maximum iterations.
632
+ */
633
+ static VALUE wrap_get_max_iter(VALUE self) {
634
+ return rb_int_new(VAR_MAX_ITER);
635
+ }
636
+
637
+ /*
638
+ * Set the maximum iterations.
639
+ */
640
+ static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
641
+ VAR_MAX_ITER = NUM2INT(max_iter);
642
+
643
+ return max_iter;
644
+ }
645
+
646
+ /*
647
+ * Get the convergence setting.
648
+ */
649
+ static VALUE wrap_get_converged(VALUE self) {
650
+ return rb_float_new(VAR_CONVERGED);
651
+ }
652
+
653
+ /*
654
+ * Set the convergence setting.
655
+ */
656
+ static VALUE wrap_set_converged(VALUE self, VALUE converged) {
657
+ VAR_CONVERGED = (float)NUM2DBL(converged);
658
+
659
+ return converged;
660
+ }
661
+
662
+ /*
663
+ * Get the max iterations for the EM algorithm.
664
+ */
665
+ static VALUE wrap_get_em_max_iter(VALUE self) {
666
+ return rb_int_new(EM_MAX_ITER);
667
+ }
668
+
669
+ /*
670
+ * Set the max iterations for the EM algorithm.
671
+ */
672
+ static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
673
+ EM_MAX_ITER = NUM2INT(em_max_iter);
674
+
675
+ return em_max_iter;
676
+ }
677
+
678
+ /*
679
+ * Get the convergence value for EM.
680
+ */
681
+ static VALUE wrap_get_em_converged(VALUE self) {
682
+ return rb_float_new(EM_CONVERGED);
683
+ }
684
+
685
+ /*
686
+ * Set the convergence value for EM.
687
+ */
688
+ static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
689
+ EM_CONVERGED = (float)NUM2DBL(em_converged);
690
+
691
+ return em_converged;
692
+ }
693
+
694
+ /*
695
+ * Get the initial alpha value.
696
+ */
697
+ static VALUE wrap_get_initial_alpha(VALUE self) {
698
+ return rb_float_new(INITIAL_ALPHA);
699
+ }
700
+
701
+ /*
702
+ * Get the number of topics being clustered.
703
+ */
704
+ static VALUE wrap_get_num_topics(VALUE self) {
705
+ return rb_int_new(NTOPICS);
706
+ }
707
+
708
+ /*
709
+ * Set the initial value of alpha.
710
+ */
711
+ static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
712
+ INITIAL_ALPHA = (float)NUM2DBL(initial_alpha);
713
+
714
+ return initial_alpha;
715
+ }
716
+
717
+ /*
718
+ * Set the number of topics to be clustered.
719
+ */
720
+ static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
721
+ NTOPICS = NUM2INT(ntopics);
722
+
723
+ return ntopics;
724
+ }
725
+
726
+ /*
727
+ * Get the estimate alpha value (fixed = 0).
728
+ */
729
+ static VALUE wrap_get_estimate_alpha(VALUE self) {
730
+ return rb_int_new(ESTIMATE_ALPHA);
731
+ }
732
+
733
+ /*
734
+ * Set the estimate alpha value (fixed = 0).
735
+ */
736
+ static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
737
+ ESTIMATE_ALPHA = NUM2INT(est_alpha);
738
+
739
+ return est_alpha;
740
+ }
741
+
742
+ /*
743
+ * Get the verbosity setting.
744
+ */
745
+ static VALUE wrap_get_verbosity(VALUE self) {
746
+ if (VERBOSE) {
747
+ return Qtrue;
748
+ } else {
749
+ return Qfalse;
750
+ }
751
+ }
752
+
753
+
754
+ /*
755
+ * Set the verbosity level (true, false).
756
+ */
757
+ static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
758
+ if (verbosity == Qtrue) {
759
+ VERBOSE = TRUE;
760
+ } else {
761
+ VERBOSE = FALSE;
762
+ }
763
+
764
+ return verbosity;
765
+ }
766
+
767
+
768
+
769
+ /*
770
+ * Run the EM algorithm on the loaded corpus using the current
771
+ * configuration settings. The +start+ parameter can take the following
772
+ * values:
773
+ * * random - the topic sufficient statistics are initialized randomly
774
+ * * seeded - the topics are seeded from documents in the corpus
775
+ * * <filename> - path to the file containing the model
776
+ */
777
+ static VALUE wrap_em(VALUE self, VALUE start) {
778
+ if (!corpus_loaded)
779
+ return Qnil;
780
+
781
+ run_quiet_em(STR2CSTR(start), last_corpus);
782
+
783
+ return Qnil;
784
+ }
785
+
786
+
787
+ /*
788
+ * Load settings from the given file.
789
+ */
790
+ static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
791
+ read_settings(STR2CSTR(settings_file));
792
+
793
+ return Qtrue;
794
+ }
795
+
796
+ /*
797
+ * Load the corpus from the given file. This will not create
798
+ * a +Corpus+ object that is accessible, but it will load the corpus
799
+ * much faster.
800
+ */
801
+ static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
802
+ if (!corpus_loaded) {
803
+ last_corpus = read_data(STR2CSTR(filename));
804
+ corpus_loaded = TRUE;
805
+ return Qtrue;
806
+ } else {
807
+ return Qtrue;
808
+ }
809
+ }
810
+
811
+ /*
812
+ * Set the corpus.
813
+ */
814
+ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
815
+ corpus* c;
816
+ int i = 0;
817
+ int j = 0;
818
+
819
+ c = malloc(sizeof(corpus));
820
+ c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
821
+ c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
822
+ c->docs = (document*) malloc(sizeof(document) * c->num_docs);
823
+ VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
824
+ for (i = 0; i < c->num_docs; i++) {
825
+ VALUE one_doc = rb_ary_entry(doc_ary, i);
826
+ VALUE words = rb_iv_get(one_doc, "@words");
827
+ VALUE counts = rb_iv_get(one_doc, "@counts");
828
+
829
+ c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
830
+ c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
831
+ c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
832
+ c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
833
+ for (j = 0; j < c->docs[i].length; j++) {
834
+ int one_word = NUM2INT(rb_ary_entry(words, j));
835
+ int one_count = NUM2INT(rb_ary_entry(counts, j));
836
+ if( one_word >= c->num_terms ) {
838
+ rb_raise(rb_eRuntimeError, "word index (%d) is out of range for a vocabulary of %d terms", one_word, c->num_terms);
838
+ }
839
+ c->docs[i].words[j] = one_word;
840
+ c->docs[i].counts[j] = one_count;
841
+ }
842
+ }
843
+
844
+ last_corpus = c;
845
+ corpus_loaded = TRUE;
846
+
847
+ rb_iv_set(self, "@corpus", rcorpus);
848
+
849
+ return Qtrue;
850
+ }
851
+
852
+
853
+ /*
854
+ * Get the gamma values after the model has been run.
855
+ */
856
+ static VALUE wrap_get_gamma(VALUE self) {
857
+ if (!model_loaded)
858
+ return Qnil;
859
+
860
+ // last_gamma is a double[num_docs][num_topics]
861
+ VALUE arr;
862
+ int i = 0, j = 0;
863
+
864
+ arr = rb_ary_new2(last_corpus->num_docs);
865
+ for (i = 0; i < last_corpus->num_docs; i++) {
866
+ VALUE arr2 = rb_ary_new2(last_model->num_topics);
867
+ for (j = 0; j < last_model->num_topics; j++) {
868
+ rb_ary_store(arr2, j, rb_float_new(last_gamma[i][j]));
869
+ }
870
+ rb_ary_store(arr, i, arr2);
871
+ }
872
+
873
+ return arr;
874
+ }
875
+
876
+
877
+ /*
878
+ * Compute the phi values by running inference after the initial EM run has been completed.
879
+ *
880
+ * Returns a 3D matrix: <tt>num_docs x length x num_topics</tt>.
881
+ */
882
+ static VALUE wrap_get_phi(VALUE self) {
883
+ if (!model_loaded)
884
+ return Qnil;
885
+
886
+ VALUE arr = rb_ary_new2(last_corpus->num_docs);
887
+ int i = 0, j = 0, k = 0;
888
+
889
+ //int max_length = max_corpus_length(last_corpus);
890
+ short error = 0;
891
+
892
+ for (i = 0; i < last_corpus->num_docs; i++) {
893
+ VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
894
+
895
+ lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);
896
+
897
+ for (j = 0; j < last_corpus->docs[i].length; j++) {
898
+ VALUE arr2 = rb_ary_new2(last_model->num_topics);
899
+
900
+ for (k = 0; k < last_model->num_topics; k++) {
901
+ rb_ary_store(arr2, k, rb_float_new(last_phi[j][k]));
902
+ }
903
+
904
+ rb_ary_store(arr1, j, arr2);
905
+ }
906
+
907
+ rb_ary_store(arr, i, arr1);
908
+ }
909
+
910
+ return arr;
911
+ }
912
+
913
+
914
+
915
+ /*
916
+ * Get the beta matrix after the model has been run.
917
+ */
918
+ static VALUE wrap_get_model_beta(VALUE self) {
919
+ if (!model_loaded)
920
+ return Qnil;
921
+
922
+ // beta is a double[num_topics][num_terms]
923
+ VALUE arr;
924
+ int i = 0, j = 0;
925
+
926
+ arr = rb_ary_new2(last_model->num_topics);
927
+ for (i = 0; i < last_model->num_topics; i++) {
928
+ VALUE arr2 = rb_ary_new2(last_model->num_terms);
929
+ for (j = 0; j < last_model->num_terms; j++) {
930
+ rb_ary_store(arr2, j, rb_float_new(last_model->log_prob_w[i][j]));
931
+ }
932
+ rb_ary_store(arr, i, arr2);
933
+ }
934
+
935
+ return arr;
936
+ }
937
+
938
+
939
+ /*
940
+ * Get the settings used for the model.
941
+ */
942
+ static VALUE wrap_get_model_settings(VALUE self) {
943
+ if (!model_loaded)
944
+ return Qnil;
945
+
946
+ VALUE arr;
947
+
948
+ arr = rb_ary_new();
949
+ rb_ary_push(arr, rb_int_new(last_model->num_topics));
950
+ rb_ary_push(arr, rb_int_new(last_model->num_terms));
951
+ rb_ary_push(arr, rb_float_new(last_model->alpha));
952
+
953
+ return arr; // [num_topics, num_terms, alpha]
954
+ }
955
+
956
+
957
+ void Init_lda() {
958
+ corpus_loaded = FALSE;
959
+ model_loaded = FALSE;
960
+ VERBOSE = TRUE;
961
+
962
+ rb_require("lda-ruby");
963
+
964
+ rb_cLdaModule = rb_define_module("Lda");
965
+ rb_cLda = rb_define_class_under(rb_cLdaModule, "Lda", rb_cObject);
966
+ rb_cLdaCorpus = rb_define_class_under(rb_cLdaModule, "Corpus", rb_cObject);
967
+ rb_cLdaDocument = rb_define_class_under(rb_cLdaModule, "Document", rb_cObject);
968
+
969
+ // method to load the corpus
970
+ rb_define_method(rb_cLda, "fast_load_corpus_from_file", wrap_load_corpus, 1);
971
+ rb_define_method(rb_cLda, "corpus=", wrap_ruby_corpus, 1);
972
+
973
+ // method to run em
974
+ rb_define_method(rb_cLda, "em", wrap_em, 1);
975
+
976
+ // method to load settings from file
977
+ rb_define_method(rb_cLda, "load_settings", wrap_load_settings, 1);
978
+
979
+ // method to set all the config options at once
980
+ rb_define_method(rb_cLda, "set_config", wrap_set_config, 5);
981
+
982
+ // accessor stuff for main settings
983
+ rb_define_method(rb_cLda, "max_iter", wrap_get_max_iter, 0);
984
+ rb_define_method(rb_cLda, "max_iter=", wrap_set_max_iter, 1);
985
+ rb_define_method(rb_cLda, "convergence", wrap_get_converged, 0);
986
+ rb_define_method(rb_cLda, "convergence=", wrap_set_converged, 1);
987
+ rb_define_method(rb_cLda, "em_max_iter", wrap_get_em_max_iter, 0);
988
+ rb_define_method(rb_cLda, "em_max_iter=", wrap_set_em_max_iter, 1);
989
+ rb_define_method(rb_cLda, "em_convergence", wrap_get_em_converged, 0);
990
+ rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1);
991
+ rb_define_method(rb_cLda, "init_alpha=", wrap_set_initial_alpha, 1);
992
+ rb_define_method(rb_cLda, "init_alpha", wrap_get_initial_alpha, 0);
993
+ rb_define_method(rb_cLda, "est_alpha=", wrap_set_estimate_alpha, 1);
994
+ rb_define_method(rb_cLda, "est_alpha", wrap_get_estimate_alpha, 0);
995
+ rb_define_method(rb_cLda, "num_topics", wrap_get_num_topics, 0);
996
+ rb_define_method(rb_cLda, "num_topics=", wrap_set_num_topics, 1);
997
+ rb_define_method(rb_cLda, "verbose", wrap_get_verbosity, 0);
998
+ rb_define_method(rb_cLda, "verbose=", wrap_set_verbosity, 1);
999
+
1000
+ // retrieve model and gamma
1001
+ rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
1002
+ rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
1003
+ rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0);
1004
+ rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
1005
+ }
1006
+
1007
+ #endif