lda-ruby 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ #ifndef LDA_INFERENCE_H
2
+ #define LDA_INFERENCE_H
3
+
4
+ #include <stdlib.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+ #include "lda.h"
8
+ #include "utils.h"
9
+
10
+
11
+
12
/*
 * Global settings shared by the inference code.
 *
 * NOTE(review): these are definitions rather than extern declarations, so
 * this header presumably has to be included from exactly one translation
 * unit -- confirm against the build.
 */

/* The only setting with an explicit initial value in this header. */
int LAG = 5;

/* Names suggest EM-level settings; their values are assigned elsewhere. */
float EM_CONVERGED;
int EM_MAX_ITER, ESTIMATE_ALPHA, NTOPICS;
double INITIAL_ALPHA;

/* Names suggest variational-inference settings; values assigned elsewhere. */
float VAR_CONVERGED;
int VAR_MAX_ITER;
21
+
22
#ifdef USE_RUBY
/* Two-valued flag type: FALSE is 0, TRUE is 1. */
enum BOOL { FALSE, TRUE };

/* Flags available only in USE_RUBY builds. */
enum BOOL corpus_loaded;
enum BOOL model_loaded;
enum BOOL VERBOSE;

/* Presumably the most recently used corpus/model and the gamma/phi arrays
 * from the last run -- confirm against the code that assigns them. */
corpus *last_corpus;
lda_model *last_model;
double **last_gamma;
double **last_phi;
#endif
30
+
31
+
32
+
33
+ double lda_inference(document*, lda_model*, double*, double**, short*);
34
+ double compute_likelihood(document*, lda_model*, double**, double*);
35
+
36
+
37
+ double doc_e_step(document* doc,
38
+ double* gamma,
39
+ double** phi,
40
+ lda_model* model,
41
+ lda_suffstats* ss);
42
+
43
+ void save_gamma(char* filename,
44
+ double** gamma,
45
+ int num_docs,
46
+ int num_topics);
47
+
48
+ void run_em(char* start,
49
+ char* directory,
50
+ corpus* corpus);
51
+
52
+ #ifdef USE_RUBY
53
+ void run_quiet_em(char* start, corpus* corpus);
54
+ #endif
55
+
56
+ void read_settings(char* filename);
57
+
58
+ void infer(char* model_root,
59
+ char* save,
60
+ corpus* corpus);
61
+
62
+
63
+ #endif
@@ -0,0 +1,345 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-model.h"
21
+ #include <string.h>
22
+
23
+
24
+ /*
25
+ * compute MLE lda model from sufficient statistics
26
+ *
27
+ */
28
+
29
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
30
+ int k; int w;
31
+
32
+ for (k = 0; k < model->num_topics; k++)
33
+ {
34
+ for (w = 0; w < model->num_terms; w++)
35
+ {
36
+ if (ss->class_word[k][w] > 0)
37
+ {
38
+ model->log_prob_w[k][w] =
39
+ log(ss->class_word[k][w]) -
40
+ log(ss->class_total[k]);
41
+ }
42
+ else
43
+ model->log_prob_w[k][w] = -100;
44
+ }
45
+ }
46
+ if (estimate_alpha == 1)
47
+ {
48
+ model->alpha = opt_alpha(ss->alpha_suffstats,
49
+ ss->num_docs,
50
+ model->num_topics);
51
+
52
+ printf("new alpha = %5.5f\n", model->alpha);
53
+ }
54
+ }
55
+
56
+ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
57
+ int k; int w;
58
+
59
+ for (k = 0; k < model->num_topics; k++)
60
+ {
61
+ for (w = 0; w < model->num_terms; w++)
62
+ {
63
+ if (ss->class_word[k][w] > 0)
64
+ {
65
+ model->log_prob_w[k][w] =
66
+ log(ss->class_word[k][w]) -
67
+ log(ss->class_total[k]);
68
+ }
69
+ else
70
+ model->log_prob_w[k][w] = -100;
71
+ }
72
+ }
73
+ if (estimate_alpha == 1)
74
+ {
75
+ model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
76
+ ss->num_docs,
77
+ model->num_topics);
78
+ }
79
+ }
80
+
81
+
82
+
83
+
84
+ /*
85
+ * allocate sufficient statistics
86
+ *
87
+ */
88
+
89
+ lda_suffstats* new_lda_suffstats(lda_model* model) {
90
+ register int i;
91
+ int num_topics = model->num_topics;
92
+ int num_terms = model->num_terms;
93
+
94
+ lda_suffstats* ss = (lda_suffstats*)malloc(sizeof(lda_suffstats));
95
+ memset(ss,0,sizeof(lda_suffstats));
96
+ ss->class_total = (double*)malloc(sizeof(double)*num_topics);
97
+ ss->class_word = (double**)malloc(sizeof(double*)*num_topics);
98
+
99
+ for (i = 0; i < num_topics; ++i) {
100
+ ss->class_total[i] = 0;
101
+ ss->class_word[i] = (double*)malloc(sizeof(double)*num_terms);
102
+ memset(ss->class_word[i],0.0,sizeof(double)*num_terms);
103
+ }
104
+
105
+ return(ss);
106
+ }
107
+ /*
108
+ * deallocate new lda suffstats
109
+ *
110
+ */
111
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss) {
112
+ int i;
113
+ int num_topics = model->num_topics;
114
+
115
+ free(ss->class_total);
116
+ for (i = 0; i < num_topics; ++i) {
117
+ free(ss->class_word[i]);
118
+ }
119
+ free(ss->class_word);
120
+ free(ss);
121
+ }
122
+
123
+ /*
124
+ * various initializations for the sufficient statistics
125
+ *
126
+ */
127
+
128
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model) {
129
+ int k, w;
130
+ for (k = 0; k < model->num_topics; k++)
131
+ {
132
+ ss->class_total[k] = 0;
133
+ for (w = 0; w < model->num_terms; w++)
134
+ {
135
+ ss->class_word[k][w] = 0;
136
+ }
137
+ }
138
+ ss->num_docs = 0;
139
+ ss->alpha_suffstats = 0;
140
+ }
141
+
142
+
143
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
144
+ int num_topics = model->num_topics;
145
+ int num_terms = model->num_terms;
146
+ int k, n;
147
+
148
+ for (k = 0; k < num_topics; k++)
149
+ {
150
+ for (n = 0; n < num_terms; n++)
151
+ {
152
+ ss->class_word[k][n] += 1.0/num_terms + myrand();
153
+ ss->class_total[k] += ss->class_word[k][n];
154
+ }
155
+ }
156
+ }
157
+
158
+
159
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
160
+ {
161
+ int num_topics = model->num_topics;
162
+ int i, k, d, n;
163
+ document* doc;
164
+
165
+ for (k = 0; k < num_topics; k++)
166
+ {
167
+ for (i = 0; i < NUM_INIT; i++)
168
+ {
169
+ d = floor(myrand() * c->num_docs);
170
+ printf("initialized with document %d\n", d);
171
+ doc = &(c->docs[d]);
172
+ for (n = 0; n < doc->length; n++)
173
+ {
174
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
175
+ }
176
+ }
177
+ for (n = 0; n < model->num_terms; n++)
178
+ {
179
+ ss->class_word[k][n] += 1.0;
180
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
181
+ }
182
+ }
183
+ }
184
+
185
+ void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
186
+ {
187
+ int num_topics = model->num_topics;
188
+ int i, k, d, n;
189
+ document* doc;
190
+
191
+ for (k = 0; k < num_topics; k++)
192
+ {
193
+ for (i = 0; i < NUM_INIT; i++)
194
+ {
195
+ d = floor(myrand() * c->num_docs);
196
+ doc = &(c->docs[d]);
197
+ for (n = 0; n < doc->length; n++)
198
+ {
199
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
200
+ }
201
+ }
202
+ for (n = 0; n < model->num_terms; n++)
203
+ {
204
+ ss->class_word[k][n] += 1.0;
205
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
206
+ }
207
+ }
208
+ }
209
+
210
+
211
+ /*
212
+ * Use the first num_topics documents of the corpus as the seeds. If num_topics > num_docs, only the first num_docs topics are seeded; the remaining topics are left untouched.
213
+ */
214
+ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c) {
215
+ int num_topics = MIN(model->num_topics, c->num_docs);
216
+ int k, n;
217
+ document* doc;
218
+
219
+ for (k = 0; k < num_topics; k++) {
220
+ doc = &(c->docs[k]);
221
+ for (n = 0; n < doc->length; n++) {
222
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
223
+ }
224
+ for (n = 0; n < model->num_terms; n++) {
225
+ ss->class_word[k][n] += 1.0;
226
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
227
+ }
228
+ }
229
+ }
230
+
231
+ /*
232
+ * allocate new lda model
233
+ *
234
+ */
235
+
236
+ lda_model* new_lda_model(int num_terms, int num_topics) {
237
+ int i;
238
+ lda_model* model;
239
+
240
+ model = malloc(sizeof(lda_model));
241
+ model->num_topics = num_topics;
242
+ model->num_terms = num_terms;
243
+ model->alpha = 1.0;
244
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
245
+ printf("new model with: %d topics and %d terms\n", num_topics, num_terms);
246
+ for (i = 0; i < num_topics; i++)
247
+ {
248
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
249
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
250
+ }
251
+ return(model);
252
+ }
253
+
254
+ lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
255
+ int i;
256
+ lda_model* model;
257
+
258
+ model = malloc(sizeof(lda_model));
259
+ model->num_topics = num_topics;
260
+ model->num_terms = num_terms;
261
+ model->alpha = 1.0;
262
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
263
+ for (i = 0; i < num_topics; i++)
264
+ {
265
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
266
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
267
+ }
268
+ return(model);
269
+ }
270
+
271
+
272
+ /*
273
+ * deallocate new lda model
274
+ *
275
+ */
276
+ void free_lda_model(lda_model* model) {
277
+ int i;
278
+
279
+ for (i = 0; i < model->num_topics; i++)
280
+ {
281
+ free(model->log_prob_w[i]);
282
+ }
283
+ free(model->log_prob_w);
284
+ }
285
+
286
+
287
+ /*
288
+ * save an lda model
289
+ *
290
+ */
291
+ void save_lda_model(lda_model* model, char* model_root) {
292
+ char filename[100];
293
+ FILE* fileptr;
294
+ int i, j;
295
+
296
+ sprintf(filename, "%s.beta", model_root);
297
+ fileptr = fopen(filename, "w");
298
+ for (i = 0; i < model->num_topics; i++) {
299
+ for (j = 0; j < model->num_terms; j++) {
300
+ fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
301
+ }
302
+ fprintf(fileptr, "\n");
303
+ }
304
+ fclose(fileptr);
305
+
306
+ sprintf(filename, "%s.other", model_root);
307
+ fileptr = fopen(filename, "w");
308
+ fprintf(fileptr, "num_topics %d\n", model->num_topics);
309
+ fprintf(fileptr, "num_terms %d\n", model->num_terms);
310
+ fprintf(fileptr, "alpha %5.10f\n", model->alpha);
311
+ fclose(fileptr);
312
+ }
313
+
314
+
315
+ lda_model* load_lda_model(char* model_root) {
316
+ char filename[100];
317
+ FILE* fileptr;
318
+ int i, j, num_terms, num_topics;
319
+ float x, alpha;
320
+
321
+ sprintf(filename, "%s.other", model_root);
322
+ printf("loading %s\n", filename);
323
+ fileptr = fopen(filename, "r");
324
+ fscanf(fileptr, "num_topics %d\n", &num_topics);
325
+ fscanf(fileptr, "num_terms %d\n", &num_terms);
326
+ fscanf(fileptr, "alpha %f\n", &alpha);
327
+ fclose(fileptr);
328
+
329
+ lda_model* model = new_lda_model(num_terms, num_topics);
330
+ model->alpha = alpha;
331
+
332
+ sprintf(filename, "%s.beta", model_root);
333
+ printf("loading %s\n", filename);
334
+ fileptr = fopen(filename, "r");
335
+ for (i = 0; i < num_topics; i++)
336
+ {
337
+ for (j = 0; j < num_terms; j++)
338
+ {
339
+ fscanf(fileptr, "%f", &x);
340
+ model->log_prob_w[i][j] = x;
341
+ }
342
+ }
343
+ fclose(fileptr);
344
+ return(model);
345
+ }
@@ -0,0 +1,29 @@
1
+ #ifndef LDA_MODEL_H
2
/* Include guard: defines the same macro that the preceding #ifndef tests,
 * so repeated inclusion is actually suppressed. */
#define LDA_MODEL_H
/* Previously defined name, kept in case anything tests for it. */
#define LDA_MODEL
3
+
4
+ #include <stdlib.h>
5
+ #include <stdio.h>
6
+ #include <math.h>
7
+ #include "lda.h"
8
+ #include "lda-alpha.h"
9
+ #include "cokus.h"
10
+
11
/* One randomMT() draw divided by 2^32; the result lies in [0, 1) assuming
 * randomMT() yields a 32-bit value -- confirm against cokus.h. */
#define myrand() ((double) (((unsigned long) randomMT()) / 4294967296.))
/* Number of seed documents drawn per topic by the corpus initializers. */
#define NUM_INIT 1
/* Smaller of A and B, converted to int. Arguments are parenthesized so
 * compound expressions work, but each may still be evaluated twice. */
#define MIN(A,B) ((int)(((A) > (B)) ? (B) : (A)))
14
+
15
+ void free_lda_model(lda_model*);
16
+ void save_lda_model(lda_model*, char*);
17
+ lda_model* new_lda_model(int, int);
18
+ lda_suffstats* new_lda_suffstats(lda_model* model);
19
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
20
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
21
+ void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
22
+ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
23
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model);
24
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
25
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
26
+ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
27
+ lda_model* load_lda_model(char* model_root);
28
+
29
+ #endif
@@ -0,0 +1,54 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #ifndef LDA_H
21
+ #define LDA_H
22
+
23
+
24
/*
 * One document stored as parallel arrays: words[n] is a term index and
 * counts[n] its count, for n in [0, length).
 */
typedef struct {
    int *words;
    int *counts;
    int length;   /* number of entries in words[] and counts[] */
    int total;    /* presumably the sum of counts[] -- confirm where it is set */
} document;
30
+
31
+
32
+ typedef struct {
33
+ document* docs;
34
+ int num_terms;
35
+ int num_docs;
36
+ } corpus;
37
+
38
+
39
/*
 * Model parameters: alpha plus a num_topics x num_terms table,
 * log_prob_w[topic][term].
 */
typedef struct {
    double alpha;
    double **log_prob_w;
    int num_topics;
    int num_terms;
} lda_model;
45
+
46
+
47
/*
 * Sufficient statistics: class_word[topic][term] with a per-topic
 * class_total, plus the values used when re-estimating alpha.
 */
typedef struct {
    double **class_word;
    double *class_total;
    double alpha_suffstats;
    int num_docs;
} lda_suffstats;
53
+
54
+ #endif