lda-ruby 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,63 @@
1
+ #ifndef LDA_INFERENCE_H
2
+ #define LDA_INFERENCE_H
3
+
4
+ #include <stdlib.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+ #include "lda.h"
8
+ #include "utils.h"
9
+
10
+
11
+
12
+ int LAG = 5;
13
+
14
+ float EM_CONVERGED;
15
+ int EM_MAX_ITER;
16
+ int ESTIMATE_ALPHA;
17
+ double INITIAL_ALPHA;
18
+ int NTOPICS;
19
+ float VAR_CONVERGED;
20
+ int VAR_MAX_ITER;
21
+
22
+ #ifdef USE_RUBY
23
+ corpus *last_corpus;
24
+ lda_model *last_model;
25
+ double **last_gamma;
26
+ double **last_phi;
27
+
28
+ enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE;
29
+ #endif
30
+
31
+
32
+
33
+ double lda_inference(document*, lda_model*, double*, double**, short*);
34
+ double compute_likelihood(document*, lda_model*, double**, double*);
35
+
36
+
37
+ double doc_e_step(document* doc,
38
+ double* gamma,
39
+ double** phi,
40
+ lda_model* model,
41
+ lda_suffstats* ss);
42
+
43
+ void save_gamma(char* filename,
44
+ double** gamma,
45
+ int num_docs,
46
+ int num_topics);
47
+
48
+ void run_em(char* start,
49
+ char* directory,
50
+ corpus* corpus);
51
+
52
+ #ifdef USE_RUBY
53
+ void run_quiet_em(char* start, corpus* corpus);
54
+ #endif
55
+
56
+ void read_settings(char* filename);
57
+
58
+ void infer(char* model_root,
59
+ char* save,
60
+ corpus* corpus);
61
+
62
+
63
+ #endif
@@ -0,0 +1,345 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-model.h"
21
+ #include <string.h>
22
+
23
+
24
+ /*
25
+ * compute MLE lda model from sufficient statistics
26
+ *
27
+ */
28
+
29
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
30
+ int k; int w;
31
+
32
+ for (k = 0; k < model->num_topics; k++)
33
+ {
34
+ for (w = 0; w < model->num_terms; w++)
35
+ {
36
+ if (ss->class_word[k][w] > 0)
37
+ {
38
+ model->log_prob_w[k][w] =
39
+ log(ss->class_word[k][w]) -
40
+ log(ss->class_total[k]);
41
+ }
42
+ else
43
+ model->log_prob_w[k][w] = -100;
44
+ }
45
+ }
46
+ if (estimate_alpha == 1)
47
+ {
48
+ model->alpha = opt_alpha(ss->alpha_suffstats,
49
+ ss->num_docs,
50
+ model->num_topics);
51
+
52
+ printf("new alpha = %5.5f\n", model->alpha);
53
+ }
54
+ }
55
+
56
+ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
57
+ int k; int w;
58
+
59
+ for (k = 0; k < model->num_topics; k++)
60
+ {
61
+ for (w = 0; w < model->num_terms; w++)
62
+ {
63
+ if (ss->class_word[k][w] > 0)
64
+ {
65
+ model->log_prob_w[k][w] =
66
+ log(ss->class_word[k][w]) -
67
+ log(ss->class_total[k]);
68
+ }
69
+ else
70
+ model->log_prob_w[k][w] = -100;
71
+ }
72
+ }
73
+ if (estimate_alpha == 1)
74
+ {
75
+ model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
76
+ ss->num_docs,
77
+ model->num_topics);
78
+ }
79
+ }
80
+
81
+
82
+
83
+
84
+ /*
85
+ * allocate sufficient statistics
86
+ *
87
+ */
88
+
89
+ lda_suffstats* new_lda_suffstats(lda_model* model) {
90
+ register int i;
91
+ int num_topics = model->num_topics;
92
+ int num_terms = model->num_terms;
93
+
94
+ lda_suffstats* ss = (lda_suffstats*)malloc(sizeof(lda_suffstats));
95
+ memset(ss,0,sizeof(lda_suffstats));
96
+ ss->class_total = (double*)malloc(sizeof(double)*num_topics);
97
+ ss->class_word = (double**)malloc(sizeof(double*)*num_topics);
98
+
99
+ for (i = 0; i < num_topics; ++i) {
100
+ ss->class_total[i] = 0;
101
+ ss->class_word[i] = (double*)malloc(sizeof(double)*num_terms);
102
+ memset(ss->class_word[i],0.0,sizeof(double)*num_terms);
103
+ }
104
+
105
+ return(ss);
106
+ }
107
+ /*
108
+ * deallocate new lda suffstats
109
+ *
110
+ */
111
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss) {
112
+ int i;
113
+ int num_topics = model->num_topics;
114
+
115
+ free(ss->class_total);
116
+ for (i = 0; i < num_topics; ++i) {
117
+ free(ss->class_word[i]);
118
+ }
119
+ free(ss->class_word);
120
+ free(ss);
121
+ }
122
+
123
+ /*
124
+ * various initializations for the sufficient statistics
125
+ *
126
+ */
127
+
128
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model) {
129
+ int k, w;
130
+ for (k = 0; k < model->num_topics; k++)
131
+ {
132
+ ss->class_total[k] = 0;
133
+ for (w = 0; w < model->num_terms; w++)
134
+ {
135
+ ss->class_word[k][w] = 0;
136
+ }
137
+ }
138
+ ss->num_docs = 0;
139
+ ss->alpha_suffstats = 0;
140
+ }
141
+
142
+
143
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
144
+ int num_topics = model->num_topics;
145
+ int num_terms = model->num_terms;
146
+ int k, n;
147
+
148
+ for (k = 0; k < num_topics; k++)
149
+ {
150
+ for (n = 0; n < num_terms; n++)
151
+ {
152
+ ss->class_word[k][n] += 1.0/num_terms + myrand();
153
+ ss->class_total[k] += ss->class_word[k][n];
154
+ }
155
+ }
156
+ }
157
+
158
+
159
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
160
+ {
161
+ int num_topics = model->num_topics;
162
+ int i, k, d, n;
163
+ document* doc;
164
+
165
+ for (k = 0; k < num_topics; k++)
166
+ {
167
+ for (i = 0; i < NUM_INIT; i++)
168
+ {
169
+ d = floor(myrand() * c->num_docs);
170
+ printf("initialized with document %d\n", d);
171
+ doc = &(c->docs[d]);
172
+ for (n = 0; n < doc->length; n++)
173
+ {
174
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
175
+ }
176
+ }
177
+ for (n = 0; n < model->num_terms; n++)
178
+ {
179
+ ss->class_word[k][n] += 1.0;
180
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
181
+ }
182
+ }
183
+ }
184
+
185
+ void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
186
+ {
187
+ int num_topics = model->num_topics;
188
+ int i, k, d, n;
189
+ document* doc;
190
+
191
+ for (k = 0; k < num_topics; k++)
192
+ {
193
+ for (i = 0; i < NUM_INIT; i++)
194
+ {
195
+ d = floor(myrand() * c->num_docs);
196
+ doc = &(c->docs[d]);
197
+ for (n = 0; n < doc->length; n++)
198
+ {
199
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
200
+ }
201
+ }
202
+ for (n = 0; n < model->num_terms; n++)
203
+ {
204
+ ss->class_word[k][n] += 1.0;
205
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
206
+ }
207
+ }
208
+ }
209
+
210
+
211
+ /*
212
+ * Use the first num_topics documents of the corpus as the seeds. If num_topics > num_docs, results might be hairy.
213
+ */
214
+ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c) {
215
+ int num_topics = MIN(model->num_topics, c->num_docs);
216
+ int k, n;
217
+ document* doc;
218
+
219
+ for (k = 0; k < num_topics; k++) {
220
+ doc = &(c->docs[k]);
221
+ for (n = 0; n < doc->length; n++) {
222
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
223
+ }
224
+ for (n = 0; n < model->num_terms; n++) {
225
+ ss->class_word[k][n] += 1.0;
226
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
227
+ }
228
+ }
229
+ }
230
+
231
+ /*
232
+ * allocate new lda model
233
+ *
234
+ */
235
+
236
+ lda_model* new_lda_model(int num_terms, int num_topics) {
237
+ int i;
238
+ lda_model* model;
239
+
240
+ model = malloc(sizeof(lda_model));
241
+ model->num_topics = num_topics;
242
+ model->num_terms = num_terms;
243
+ model->alpha = 1.0;
244
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
245
+ printf("new model with: %d topics and %d terms\n", num_topics, num_terms);
246
+ for (i = 0; i < num_topics; i++)
247
+ {
248
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
249
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
250
+ }
251
+ return(model);
252
+ }
253
+
254
+ lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
255
+ int i;
256
+ lda_model* model;
257
+
258
+ model = malloc(sizeof(lda_model));
259
+ model->num_topics = num_topics;
260
+ model->num_terms = num_terms;
261
+ model->alpha = 1.0;
262
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
263
+ for (i = 0; i < num_topics; i++)
264
+ {
265
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
266
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
267
+ }
268
+ return(model);
269
+ }
270
+
271
+
272
+ /*
273
+ * deallocate new lda model
274
+ *
275
+ */
276
+ void free_lda_model(lda_model* model) {
277
+ int i;
278
+
279
+ for (i = 0; i < model->num_topics; i++)
280
+ {
281
+ free(model->log_prob_w[i]);
282
+ }
283
+ free(model->log_prob_w);
284
+ }
285
+
286
+
287
+ /*
288
+ * save an lda model
289
+ *
290
+ */
291
+ void save_lda_model(lda_model* model, char* model_root) {
292
+ char filename[100];
293
+ FILE* fileptr;
294
+ int i, j;
295
+
296
+ sprintf(filename, "%s.beta", model_root);
297
+ fileptr = fopen(filename, "w");
298
+ for (i = 0; i < model->num_topics; i++) {
299
+ for (j = 0; j < model->num_terms; j++) {
300
+ fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
301
+ }
302
+ fprintf(fileptr, "\n");
303
+ }
304
+ fclose(fileptr);
305
+
306
+ sprintf(filename, "%s.other", model_root);
307
+ fileptr = fopen(filename, "w");
308
+ fprintf(fileptr, "num_topics %d\n", model->num_topics);
309
+ fprintf(fileptr, "num_terms %d\n", model->num_terms);
310
+ fprintf(fileptr, "alpha %5.10f\n", model->alpha);
311
+ fclose(fileptr);
312
+ }
313
+
314
+
315
+ lda_model* load_lda_model(char* model_root) {
316
+ char filename[100];
317
+ FILE* fileptr;
318
+ int i, j, num_terms, num_topics;
319
+ float x, alpha;
320
+
321
+ sprintf(filename, "%s.other", model_root);
322
+ printf("loading %s\n", filename);
323
+ fileptr = fopen(filename, "r");
324
+ fscanf(fileptr, "num_topics %d\n", &num_topics);
325
+ fscanf(fileptr, "num_terms %d\n", &num_terms);
326
+ fscanf(fileptr, "alpha %f\n", &alpha);
327
+ fclose(fileptr);
328
+
329
+ lda_model* model = new_lda_model(num_terms, num_topics);
330
+ model->alpha = alpha;
331
+
332
+ sprintf(filename, "%s.beta", model_root);
333
+ printf("loading %s\n", filename);
334
+ fileptr = fopen(filename, "r");
335
+ for (i = 0; i < num_topics; i++)
336
+ {
337
+ for (j = 0; j < num_terms; j++)
338
+ {
339
+ fscanf(fileptr, "%f", &x);
340
+ model->log_prob_w[i][j] = x;
341
+ }
342
+ }
343
+ fclose(fileptr);
344
+ return(model);
345
+ }
@@ -0,0 +1,29 @@
1
+ #ifndef LDA_MODEL_H
2
+ #define LDA_MODEL
3
+
4
+ #include <stdlib.h>
5
+ #include <stdio.h>
6
+ #include <math.h>
7
+ #include "lda.h"
8
+ #include "lda-alpha.h"
9
+ #include "cokus.h"
10
+
11
+ #define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
12
+ #define NUM_INIT 1
13
+ #define MIN(A,B) (int)((A > B) ? (B) : (A))
14
+
15
+ void free_lda_model(lda_model*);
16
+ void save_lda_model(lda_model*, char*);
17
+ lda_model* new_lda_model(int, int);
18
+ lda_suffstats* new_lda_suffstats(lda_model* model);
19
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
20
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
21
+ void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
22
+ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
23
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model);
24
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
25
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
26
+ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
27
+ lda_model* load_lda_model(char* model_root);
28
+
29
+ #endif
@@ -0,0 +1,54 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
#ifndef LDA_H
#define LDA_H

/*
 * One document in sparse form: parallel arrays of `length` entries.
 * The model code uses words[n] as a column index into the per-topic term
 * tables and adds counts[n] to that cell.
 */
typedef struct {
    int *words;
    int *counts;
    int  length;
    int  total;    /* not referenced by the model code; presumably the sum
                      of counts -- TODO confirm against the corpus reader */
} document;

/* A collection of num_docs documents, indexed through docs[]. */
typedef struct {
    document *docs;
    int       num_terms;
    int       num_docs;
} corpus;

/*
 * Model parameters: alpha plus a num_topics x num_terms table of values,
 * stored as an array of row pointers (log_prob_w[topic][term]).
 */
typedef struct {
    double   alpha;
    double **log_prob_w;
    int      num_topics;
    int      num_terms;
} lda_model;

/*
 * Sufficient statistics: class_word[topic][term] with the matching
 * per-topic totals in class_total[topic], plus the alpha statistic and a
 * document counter.
 */
typedef struct {
    double **class_word;
    double  *class_total;
    double   alpha_suffstats;
    int      num_docs;
} lda_suffstats;

#endif