ealdent-lda-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
File without changes
data/cokus.c ADDED
@@ -0,0 +1,145 @@
1
+ // This is the ``Mersenne Twister'' random number generator MT19937, which
2
+ // generates pseudorandom integers uniformly distributed in 0..(2^32 - 1)
3
+ // starting from any odd seed in 0..(2^32 - 1). This version is a recode
4
+ // by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by
5
+ // Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in
6
+ // July-August 1997).
7
+ //
8
+ // Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha
9
+ // running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to
10
+ // generate 300 million random numbers; after recoding: 24.0 sec. for the same
11
+ // (i.e., 46.5% of original time), so speed is now about 12.5 million random
12
+ // number generations per second on this machine.
13
+ //
14
+ // According to the URL <http://www.math.keio.ac.jp/~matumoto/emt.html>
15
+ // (and paraphrasing a bit in places), the Mersenne Twister is ``designed
16
+ // with consideration of the flaws of various existing generators,'' has
17
+ // a period of 2^19937 - 1, gives a sequence that is 623-dimensionally
18
+ // equidistributed, and ``has passed many stringent tests, including the
19
+ // die-hard test of G. Marsaglia and the load test of P. Hellekalek and
20
+ // S. Wegenkittl.'' It is efficient in memory usage (typically using 2506
21
+ // to 5012 bytes of static data, depending on data type sizes, and the code
22
+ // is quite short as well). It generates random numbers in batches of 624
23
+ // at a time, so the caching and pipelining of modern systems is exploited.
24
+ // It is also divide- and mod-free.
25
+ //
26
+ // This library is free software; you can redistribute it and/or modify it
27
+ // under the terms of the GNU Library General Public License as published by
28
+ // the Free Software Foundation (either version 2 of the License or, at your
29
+ // option, any later version). This library is distributed in the hope that
30
+ // it will be useful, but WITHOUT ANY WARRANTY, without even the implied
31
+ // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32
+ // the GNU Library General Public License for more details. You should have
33
+ // received a copy of the GNU Library General Public License along with this
34
+ // library; if not, write to the Free Software Foundation, Inc., 59 Temple
35
+ // Place, Suite 330, Boston, MA 02111-1307, USA.
36
+ //
37
+ // The code as Shawn received it included the following notice:
38
+ //
39
+ // Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. When
40
+ // you use this, send an e-mail to <matumoto@math.keio.ac.jp> with
41
+ // an appropriate reference to your work.
42
+ //
43
+ // It would be nice to CC: <Cokus@math.washington.edu> when you write.
44
+ //
45
+
46
+ #include "cokus.h"
47
+
48
+ static uint32 state[N+1]; // state vector + 1 extra to not violate ANSI C
49
+ static uint32 *next; // next random value is computed from here
50
+ static int left = -1; // can *next++ this many times before reloading
51
+
52
+ void seedMT(uint32 seed)
53
+ {
54
+ //
55
+ // We initialize state[0..(N-1)] via the generator
56
+ //
57
+ // x_new = (69069 * x_old) mod 2^32
58
+ //
59
+ // from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's
60
+ // _The Art of Computer Programming_, Volume 2, 3rd ed.
61
+ //
62
+ // Notes (SJC): I do not know what the initial state requirements
63
+ // of the Mersenne Twister are, but it seems this seeding generator
64
+ // could be better. It achieves the maximum period for its modulus
65
+ // (2^30) iff x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth); if
66
+ // x_initial can be even, you have sequences like 0, 0, 0, ...;
67
+ // 2^31, 2^31, 2^31, ...; 2^30, 2^30, 2^30, ...; 2^29, 2^29 + 2^31,
68
+ // 2^29, 2^29 + 2^31, ..., etc. so I force seed to be odd below.
69
+ //
70
+ // Even if x_initial is odd, if x_initial is 1 mod 4 then
71
+ //
72
+ // the lowest bit of x is always 1,
73
+ // the next-to-lowest bit of x is always 0,
74
+ // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... ,
75
+ // the 3rd-from-lowest bit of x 4-cycles ... 0 1 1 0 0 1 1 0 ... ,
76
+ // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 0 1 1 1 1 0 ... ,
77
+ // ...
78
+ //
79
+ // and if x_initial is 3 mod 4 then
80
+ //
81
+ // the lowest bit of x is always 1,
82
+ // the next-to-lowest bit of x is always 1,
83
+ // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... ,
84
+ // the 3rd-from-lowest bit of x 4-cycles ... 0 0 1 1 0 0 1 1 ... ,
85
+ // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 1 1 1 1 0 0 ... ,
86
+ // ...
87
+ //
88
+ // The generator's potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is
89
+ // 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth. It
90
+ // also does well in the dimension 2..5 spectral tests, but it could be
91
+ // better in dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth).
92
+ //
93
+ // Note that the random number user does not see the values generated
94
+ // here directly since reloadMT() will always munge them first, so maybe
95
+ // none of all of this matters. In fact, the seed values made here could
96
+ // even be extra-special desirable if the Mersenne Twister theory says
97
+ // so-- that's why the only change I made is to restrict to odd seeds.
98
+ //
99
+
100
+ register uint32 x = (seed | 1U) & 0xFFFFFFFFU, *s = state;
101
+ register int j;
102
+
103
+ for(left=0, *s++=x, j=N; --j;
104
+ *s++ = (x*=69069U) & 0xFFFFFFFFU);
105
+ }
106
+
107
+
108
//
// reloadMT: regenerate the next batch of N state words (the MT19937
// "twist") and return the first value of the new batch, tempered.
//
// The recurrence implemented is, for each j:
//   state[j] = state[j+M] ^ (mixBits(state[j], state[j+1]) >> 1)
//                         ^ (K if state[j+1] is odd, else 0)
// split into two loops so the index j+M never has to wrap: the second
// loop simply reads pM from the start of the array instead.
//
// If seedMT() was never called, left still holds its initial -1, so the
// first randomMT() decrements it to -2 and arrives here with left < -1;
// in that case the generator seeds itself with the default 4357U.
//
uint32 reloadMT(void)
{
    register uint32 *p0=state, *p2=state+2, *pM=state+M, s0, s1;
    register int j;

    if(left < -1)
        seedMT(4357U);

    // New batch of N values: one is consumed right now, the remaining
    // N-1 are handed out by randomMT() starting at state[1].
    left=N-1, next=state+1;

    // First N-M iterations: the partner word state[j+M] is in range.
    for(s0=state[0], s1=state[1], j=N-M+1; --j; s0=s1, s1=*p2++)
        *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);

    // Remaining M-1 iterations: the partner index wraps to the start.
    for(pM=state, j=M; --j; s0=s1, s1=*p2++)
        *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);

    // Final word pairs with state[0]; then temper and return it.
    s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
    s1 ^= (s1 >> 11);
    s1 ^= (s1 << 7) & 0x9D2C5680U;
    s1 ^= (s1 << 15) & 0xEFC60000U;
    return(s1 ^ (s1 >> 18));
}
130
+
131
+ uint32 randomMT(void)
132
+ {
133
+ uint32 y;
134
+
135
+ if(--left < 0)
136
+ return(reloadMT());
137
+
138
+ y = *next++;
139
+ y ^= (y >> 11);
140
+ y ^= (y << 7) & 0x9D2C5680U;
141
+ y ^= (y << 15) & 0xEFC60000U;
142
+ y ^= (y >> 18);
143
+ return(y);
144
+ }
145
+
data/cokus.h ADDED
@@ -0,0 +1,27 @@
1
#ifndef COKUS_H
#define COKUS_H

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

//
// uint32 must be an unsigned integer type capable of holding at least 32
// bits. The original code used `unsigned long`, which is 64 bits on
// modern LP64 platforms (Linux/macOS x86-64): the generator still works
// there because every operation masks to 32 bits, but the state vector
// doubles in size. <stdint.h> gives the exact-width type directly.
//
typedef uint32_t uint32;

#define N (624)                // length of state vector
#define M (397)                // a period parameter
#define K (0x9908B0DFU)        // a magic constant (the twist XOR mask)
#define hiBit(u)       ((u) & 0x80000000U)  // mask all but highest bit of u
#define loBit(u)       ((u) & 0x00000001U)  // mask all but lowest bit of u
#define loBits(u)      ((u) & 0x7FFFFFFFU)  // mask the highest bit of u
#define mixBits(u, v)  (hiBit(u)|loBits(v)) // move hi bit of u to hi bit of v

void seedMT(uint32 seed);      // seed the generator (seed is forced odd)
uint32 reloadMT(void);         // twist: regenerate the next batch of N words
uint32 randomMT(void);         // next pseudorandom value in 0..2^32-1

#endif
@@ -0,0 +1,4 @@
1
# Standard mkmf build script: generates the Makefile used to compile the
# lda_ext C extension when the gem is installed.
require 'mkmf'

# Honors --with-lda_ext-dir / --with-lda_ext-include / --with-lda_ext-lib
# overrides for non-standard header/library locations.
dir_config("lda_ext")
create_makefile("lda_ext")
@@ -0,0 +1,68 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-alpha.h"
21
+
22
+ /*
23
+ * objective function and its derivatives
24
+ *
25
+ */
26
+
27
+ double alhood(double a, double ss, int D, int K)
28
+ { return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); }
29
+
30
/*
 * First derivative of alhood with respect to a.
 */
double d_alhood(double a, double ss, int D, int K)
{
    double d_log_norm = K * digamma(K * a) - K * digamma(a);
    return D * d_log_norm + ss;
}
32
+
33
/*
 * Second derivative of alhood with respect to a (ss drops out).
 */
double d2_alhood(double a, int D, int K)
{
    double d2_log_norm = K * K * trigamma(K * a) - K * trigamma(a);
    return D * d2_log_norm;
}
35
+
36
+
37
+ /*
38
+ * newtons method
39
+ *
40
+ */
41
+
42
/*
 * Maximize the alpha objective with Newton's method in log space.
 *
 * The iteration runs on log(a) rather than a so that a stays positive.
 * With f = alhood(a), the chain rule gives
 *     df/dlog(a)      = a * f'(a)
 *     d2f/dlog(a)^2   = a * f'(a) + a^2 * f''(a)
 * so the Newton step on log(a) is f' / (a*f'' + f') after dividing
 * numerator and denominator by a -- exactly the update below.
 *
 * ss is the alpha sufficient statistic, D the number of documents,
 * K the number of topics. Returns the optimized alpha. Iterates until
 * |df| < NEWTON_THRESH or MAX_ALPHA_ITER iterations.
 */
double opt_alpha(double ss, int D, int K)
{
    double a, log_a, init_a = 100;
    double f, df, d2f;
    int iter = 0;

    log_a = log(init_a);
    do
    {
        iter++;
        a = exp(log_a);
        // If the previous step diverged (a overflowed to nan), restart
        // the search from a 10x larger initial value.
        if (isnan(a))
        {
            init_a = init_a * 10;
            printf("warning : alpha is nan; new init = %5.5f\n", init_a);
            a = init_a;
            log_a = log(a);
        }
        f = alhood(a, ss, D, K);
        df = d_alhood(a, ss, D, K);
        d2f = d2_alhood(a, D, K);
        // Newton step on log(a); see derivation in the header comment.
        log_a = log_a - df/(d2f * a + df);
        printf("alpha maximization : %5.5f %5.5f\n", f, df);
    }
    while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
    return(exp(log_a));
}
@@ -0,0 +1,20 @@
1
#ifndef LDA_ALPHA_H
#define LDA_ALPHA_H

#include <stdlib.h>
#include <math.h>
#include <float.h>

#include "lda.h"
#include "utils.h"

#define NEWTON_THRESH 1e-5     // stop when |df| falls below this
#define MAX_ALPHA_ITER 1000    // hard cap on Newton iterations

// Objective for the symmetric Dirichlet parameter a and its first and
// second derivatives. ss is the alpha sufficient statistic, D the
// number of documents, K the number of topics.
double alhood(double a, double ss, int D, int K);
double d_alhood(double a, double ss, int D, int K);
double d2_alhood(double a, int D, int K);

// Newton optimization of alpha given the sufficient statistic.
double opt_alpha(double ss, int D, int K);

// NOTE(review): declared here but no definition appears in lda-alpha.c
// as shipped -- confirm a definition exists elsewhere before calling.
void maximize_alpha(double** gamma, lda_model* model, int num_docs);

#endif
@@ -0,0 +1,67 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-data.h"
21
+
22
+ corpus* read_data(char* data_filename)
23
+ {
24
+ FILE *fileptr;
25
+ int length, count, word, n, nd, nw;
26
+ corpus* c;
27
+
28
+ printf("reading data from %s\n", data_filename);
29
+ c = malloc(sizeof(corpus));
30
+ c->docs = 0;
31
+ c->num_terms = 0;
32
+ c->num_docs = 0;
33
+ fileptr = fopen(data_filename, "r");
34
+ nd = 0; nw = 0;
35
+ while ((fscanf(fileptr, "%10d", &length) != EOF))
36
+ {
37
+ c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1));
38
+ c->docs[nd].length = length;
39
+ c->docs[nd].total = 0;
40
+ c->docs[nd].words = malloc(sizeof(int)*length);
41
+ c->docs[nd].counts = malloc(sizeof(int)*length);
42
+ for (n = 0; n < length; n++)
43
+ {
44
+ fscanf(fileptr, "%10d:%10d", &word, &count);
45
+ word = word - OFFSET;
46
+ c->docs[nd].words[n] = word;
47
+ c->docs[nd].counts[n] = count;
48
+ c->docs[nd].total += count;
49
+ if (word >= nw) { nw = word + 1; }
50
+ }
51
+ nd++;
52
+ }
53
+ fclose(fileptr);
54
+ c->num_docs = nd;
55
+ c->num_terms = nw;
56
+ printf("number of docs : %d\n", nd);
57
+ printf("number of terms : %d\n", nw);
58
+ return(c);
59
+ }
60
+
61
+ int max_corpus_length(corpus* c)
62
+ {
63
+ int n, max = 0;
64
+ for (n = 0; n < c->num_docs; n++)
65
+ if (c->docs[n].length > max) max = c->docs[n].length;
66
+ return(max);
67
+ }
@@ -0,0 +1,14 @@
1
+ #ifndef LDA_DATA_H
2
+ #define LDA_DATA_H
3
+
4
+ #include <stdio.h>
5
+ #include <stdlib.h>
6
+
7
+ #include "lda.h"
8
+
9
+ #define OFFSET 0; // offset for reading data
10
+
11
+ corpus* read_data(char* data_filename);
12
+ int max_corpus_length(corpus* c);
13
+
14
+ #endif
@@ -0,0 +1,875 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #ifndef USE_RUBY
21
+ #define USE_RUBY
22
+ #endif
23
+
24
+ #include <stdlib.h>
25
+ #include <stdio.h>
26
+ #include <math.h>
27
+ #include <float.h>
28
+ #include <string.h>
29
+ #include <time.h>
30
+
31
+ #include "lda.h"
32
+ #include "lda-data.h"
33
+ #include "lda-inference.h"
34
+ #include "lda-model.h"
35
+ #include "lda-alpha.h"
36
+ #include "utils.h"
37
+ #include "cokus.h"
38
+
39
+ #ifdef USE_RUBY
40
+ #include "ruby.h"
41
+
42
+ VALUE rb_mLda;
43
+ VALUE rb_cLda;
44
+ VALUE rb_cLdaCorpus;
45
+ VALUE rb_cLdaDocument;
46
+ #endif
47
+
48
+ /*
49
+ * variational inference
50
+ */
51
+
52
/*
 * Variational inference for a single document.
 *
 * Iteratively updates the document's topic posterior var_gamma (length
 * num_topics) and the per-word topic responsibilities phi (doc->length
 * x num_topics) until the relative change in the likelihood bound
 * falls below VAR_CONVERGED or VAR_MAX_ITER iterations have run
 * (VAR_MAX_ITER == -1 means no iteration cap). Returns the final bound.
 *
 * NOTE(review): on the first pass likelihood_old is 0, so `converged`
 * is an infinity; that keeps the loop running, which appears to be the
 * intended (if fragile) behavior -- confirm before touching the loop.
 */
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) {
    double converged = 1;
    double phisum = 0, likelihood = 0;
    double likelihood_old = 0, oldphi[model->num_topics];
    int k, n, var_iter;
    double digamma_gam[model->num_topics];

    // compute posterior dirichlet
    // Initialize gamma uniformly (alpha + total/K) and phi to 1/K.

    for (k = 0; k < model->num_topics; k++)
    {
        var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
        digamma_gam[k] = digamma(var_gamma[k]);
        for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;

    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
        var_iter++;
        for (n = 0; n < doc->length; n++)
        {
            phisum = 0;
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
                // Unnormalized log phi: E[log theta_k] + log p(w|k).
                phi[n][k] =
                    digamma_gam[k] +
                    model->log_prob_w[k][doc->words[n]];

                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]);
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }

            for (k = 0; k < model->num_topics; k++)
            {
                // Normalize phi out of log space, then fold the change
                // into gamma incrementally (no full recomputation).
                phi[n][k] = exp(phi[n][k] - phisum);
                var_gamma[k] =
                    var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                // !!! a lot of extra digamma's here because of how we're computing it
                // !!! but its more automatically updated too.
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }

        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        assert(!isnan(likelihood));
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;

        // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
    }
    return(likelihood);
}
110
+
111
+
112
+ /*
113
+ * compute likelihood bound
114
+ */
115
+
116
/*
 * Compute the variational likelihood bound (ELBO) for one document
 * under the current model, phi and var_gamma.
 *
 * The terms are: the symmetric-Dirichlet normalizer for alpha, the
 * entropy of q(theta) (the gamma terms), and for each word the
 * expected log joint minus the entropy of q(z). phi entries equal to
 * zero are skipped because phi*log(phi) -> 0 there (and log(0) would
 * otherwise produce -inf).
 */
double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
    double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics];
    int k, n;

    // Cache digamma(gamma_k); digsum is digamma of the gamma total.
    for (k = 0; k < model->num_topics; k++)
    {
        dig[k] = digamma(var_gamma[k]);
        var_gamma_sum += var_gamma[k];
    }
    digsum = digamma(var_gamma_sum);

    likelihood = lgamma(model->alpha * model -> num_topics) - model -> num_topics * lgamma(model->alpha) - (lgamma(var_gamma_sum));

    for (k = 0; k < model->num_topics; k++)
    {
        likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum);

        for (n = 0; n < doc->length; n++)
        {
            if (phi[n][k] > 0)
            {
                likelihood += doc->counts[n]*
                    (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
                                + model->log_prob_w[k][doc->words[n]]));
            }
        }
    }
    return(likelihood);
}
145
+
146
+
147
/*
 * E-step for one document: run variational inference, then fold the
 * result into the model sufficient statistics.
 *
 * Accumulates into ss:
 *   - alpha_suffstats: sum_k digamma(gamma_k) - K * digamma(sum_k gamma_k)
 *   - class_word[k][w], class_total[k]: expected topic-word counts
 *     (count * phi), used by lda_mle for the beta update
 *   - num_docs: incremented by one
 *
 * Returns the document's likelihood bound from lda_inference.
 */
double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
    double likelihood;
    int n, k;

    // posterior inference

    likelihood = lda_inference(doc, model, gamma, phi);

    // update sufficient statistics

    double gamma_sum = 0;
    for (k = 0; k < model->num_topics; k++)
    {
        gamma_sum += gamma[k];
        ss->alpha_suffstats += digamma(gamma[k]);
    }
    ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);

    for (n = 0; n < doc->length; n++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
            ss->class_total[k] += doc->counts[n]*phi[n][k];
        }
    }

    ss->num_docs = ss->num_docs + 1;

    return(likelihood);
}
178
+
179
+
180
+ /*
181
+ * writes the word assignments line for a document to a file
182
+ */
183
+
184
/*
 * Write one line for a document to f: the document length, followed
 * by a "word:topic" pair per word slot, where the topic is the argmax
 * of that word's variational distribution phi[n]. Flushes f so partial
 * output survives a crash mid-run.
 */
void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model) {
    int n;

    fprintf(f, "%03d", doc->length);
    for (n = 0; n < doc->length; n++) {
        fprintf(f, " %04d:%02d", doc->words[n], argmax(phi[n], model->num_topics));
    }
    fprintf(f, "\n");
    fflush(f);
}
194
+
195
+
196
+ /*
197
+ * saves the gamma parameters of the current dataset
198
+ */
199
+
200
/*
 * Save the per-document gamma parameters: one line per document,
 * num_topics space-separated "%5.10f" values.
 *
 * The original dereferenced the fopen result unconditionally; if the
 * output directory was missing the program crashed. Now a failure is
 * reported and the function returns without writing.
 */
void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) {
    FILE* fileptr;
    int d, k;

    fileptr = fopen(filename, "w");
    if (fileptr == NULL) {
        printf("error : cannot write %s\n", filename);
        return;
    }

    for (d = 0; d < num_docs; d++) {
        fprintf(fileptr, "%5.10f", gamma[d][0]);
        for (k = 1; k < num_topics; k++) {
            fprintf(fileptr, " %5.10f", gamma[d][k]);
        }
        fprintf(fileptr, "\n");
    }
    fclose(fileptr);
}
214
+
215
+
216
/*
 * Estimate an LDA model with variational EM.
 *
 * start:     "seeded" (initialize topics from corpus documents),
 *            "random", or a path to a saved model to continue from.
 * directory: output directory. Receives checkpoints every LAG
 *            iterations (model + gammas), likelihood.dat, the final
 *            model/gammas, and word-assignments.dat.
 *
 * EM runs until the relative likelihood change drops below
 * EM_CONVERGED (with at least 3 iterations) or EM_MAX_ITER is hit.
 * A negative change doubles VAR_MAX_ITER to tighten the E-step.
 *
 * NOTE(review): fopen()/malloc() results are unchecked and var_gamma/
 * phi are never freed -- tolerable in a run-to-completion CLI tool,
 * but worth confirming if this is ever called in-process.
 */
void run_em(char* start, char* directory, corpus* corpus) {
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters

    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    // phi is shared across documents; size it for the longest one.
    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model

    char filename[100];

    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        lda_mle(model, ss, 0);
        model->alpha = INITIAL_ALPHA;
    } else if (strcmp(start, "random")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        lda_mle(model, ss, 0);
        model->alpha = INITIAL_ALPHA;
    } else {
        // Anything else is treated as a model path to warm-start from.
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    sprintf(filename,"%s/000",directory);
    save_lda_model(model, filename);

    // run expectation maximization

    int i = 0;
    double likelihood, likelihood_old = 0, converged = 1;
    sprintf(filename, "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step

        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }

        // m-step

        lda_mle(model, ss, ESTIMATE_ALPHA);

        // check for convergence

        converged = (likelihood_old - likelihood) / (likelihood_old);
        // Bound went down: the E-step was too loose; double its budget.
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood

        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0)
        {
            sprintf(filename,"%s/%03d",directory, i);
            save_lda_model(model, filename);
            sprintf(filename,"%s/%03d.gamma",directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }

    // output the final model

    sprintf(filename,"%s/final",directory);
    save_lda_model(model, filename);
    sprintf(filename,"%s/final.gamma",directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)

    sprintf(filename, "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    for (d = 0; d < corpus->num_docs; d++)
    {
        if ((d % 100) == 0) printf("final e step document %d\n",d);
        likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
        write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    fclose(w_asgn_file);
    fclose(likelihood_file);
}
320
+
321
+
322
+ /*
323
+ * read settings.
324
+ */
325
+
326
+ void read_settings(char* filename) {
327
+ FILE* fileptr;
328
+ char alpha_action[100];
329
+ fileptr = fopen(filename, "r");
330
+ fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
331
+ fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
332
+ fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
333
+ fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
334
+ fscanf(fileptr, "alpha %s", alpha_action);
335
+ if (strcmp(alpha_action, "fixed")==0)
336
+ {
337
+ ESTIMATE_ALPHA = 0;
338
+ }
339
+ else
340
+ {
341
+ ESTIMATE_ALPHA = 1;
342
+ }
343
+ fclose(fileptr);
344
+ }
345
+
346
+
347
+
348
+
349
+ /*
350
+ * inference only
351
+ *
352
+ */
353
+
354
+ void infer(char* model_root, char* save, corpus* corpus) {
355
+ FILE* fileptr;
356
+ char filename[100];
357
+ int i, d, n;
358
+ lda_model *model;
359
+ double **var_gamma, likelihood, **phi;
360
+ document* doc;
361
+
362
+ model = load_lda_model(model_root);
363
+ var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
364
+ for (i = 0; i < corpus->num_docs; i++)
365
+ var_gamma[i] = malloc(sizeof(double)*model->num_topics);
366
+ sprintf(filename, "%s-lda-lhood.dat", save);
367
+ fileptr = fopen(filename, "w");
368
+ for (d = 0; d < corpus->num_docs; d++) {
369
+ if (((d % 100) == 0) && (d>0)) printf("document %d\n",d);
370
+
371
+ doc = &(corpus->docs[d]);
372
+ phi = (double**) malloc(sizeof(double*) * doc->length);
373
+ for (n = 0; n < doc->length; n++)
374
+ phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
375
+ likelihood = lda_inference(doc, model, var_gamma[d], phi);
376
+
377
+ fprintf(fileptr, "%5.5f\n", likelihood);
378
+ }
379
+ fclose(fileptr);
380
+ sprintf(filename, "%s-gamma.dat", save);
381
+ save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
382
+ }
383
+
384
+
385
+ /*
386
+ * update sufficient statistics
387
+ *
388
+ */
389
+
390
+
391
+
392
+ /*
393
+ * main
394
+ *
395
+ */
396
+
397
+ int main(int argc, char* argv[]) {
398
+ corpus* corpus;
399
+
400
+ long t1;
401
+ (void) time(&t1);
402
+ seedMT(t1);
403
+ // seedMT(4357U);
404
+
405
+ if (argc > 1)
406
+ {
407
+ if (strcmp(argv[1], "est")==0)
408
+ {
409
+ INITIAL_ALPHA = atof(argv[2]);
410
+ NTOPICS = atoi(argv[3]);
411
+ read_settings(argv[4]);
412
+ corpus = read_data(argv[5]);
413
+ make_directory(argv[7]);
414
+ run_em(argv[6], argv[7], corpus);
415
+ }
416
+ if (strcmp(argv[1], "inf")==0)
417
+ {
418
+ read_settings(argv[2]);
419
+ corpus = read_data(argv[4]);
420
+ infer(argv[3], argv[5], corpus);
421
+ }
422
+ }
423
+ else
424
+ {
425
+ printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/*] [directory]\n");
426
+ printf(" lda inf [settings] [model] [data] [name]\n");
427
+ }
428
+ return(0);
429
+ }
430
+
431
+ #ifdef USE_RUBY
432
+
433
+ /* */
434
/*
 * In-process variant of run_em for the Ruby binding: identical EM loop,
 * but instead of writing files it stores the results into the
 * last_model / last_gamma globals (and marks model_loaded) so the Ruby
 * side can read them back.
 *
 * start: "seeded", "random", or a path to a saved model (see run_em).
 *
 * NOTE(review): "quiet" refers to not writing output files -- the
 * per-iteration printf progress messages are still emitted.
 */
void run_quiet_em(char* start, corpus* corpus) {
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters

    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    // phi is shared across documents; size it for the longest one.
    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model

    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        lda_mle(model, ss, 0);
        model->alpha = INITIAL_ALPHA;
    } else if (strcmp(start, "random")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        lda_mle(model, ss, 0);
        model->alpha = INITIAL_ALPHA;
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // save the model in the last_model global
    last_model = model;
    model_loaded = TRUE;

    // run expectation maximization

    int i = 0;
    double likelihood, likelihood_old = 0, converged = 1;

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step

        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }

        // m-step

        lda_mle(model, ss, ESTIMATE_ALPHA);

        // check for convergence

        converged = (likelihood_old - likelihood) / (likelihood_old);
        // Bound went down: tighten the E-step by doubling its budget.
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // store model and likelihood

        last_model = model;
        last_gamma = var_gamma;
    }

    // output the final model

    last_model = model;
    last_gamma = var_gamma;

    // output the word assignments (for visualization)
    /*
    char filename[100];
    sprintf(filename, "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    for (d = 0; d < corpus->num_docs; d++) {
        if ((d % 100) == 0)
            printf("final e step document %d\n",d);
        likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
        write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    fclose(w_asgn_file);
    */
}
527
+
528
+
529
+ /*
530
+ * Set all of the settings in one command:
531
+ *
532
+ * * init_alpha
533
+ * * num_topics
534
+ * * max_iter
535
+ * * convergence
536
+ * * em_max_iter
537
+ * * em_convergence
538
+ * * est_alpha
539
+ */
540
/*
 * Set all of the settings in one command:
 *
 *  * init_alpha
 *  * num_topics
 *  * max_iter
 *  * convergence
 *  * em_max_iter
 *  * em_convergence
 *  * est_alpha
 *
 * Each Ruby value is converted and copied into the corresponding C
 * global used by the EM/inference code. Always returns true.
 */
static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
  INITIAL_ALPHA = NUM2DBL(init_alpha);
  NTOPICS = NUM2INT(num_topics);
  VAR_MAX_ITER = NUM2INT(max_iter);
  VAR_CONVERGED = (float)NUM2DBL(convergence);
  EM_MAX_ITER = NUM2INT(em_max_iter);
  EM_CONVERGED = (float)NUM2DBL(em_convergence);
  ESTIMATE_ALPHA = NUM2INT(est_alpha);

  return Qtrue;
}
551
+
552
+ /*
553
+ * Get the maximum iterations.
554
+ */
555
/*
 * Get the maximum number of variational inference iterations
 * (the VAR_MAX_ITER global) as a Ruby Integer.
 */
static VALUE wrap_get_max_iter(VALUE self) {
  return rb_int_new(VAR_MAX_ITER);
}
558
+
559
+ /*
560
+ * Set the maximum iterations.
561
+ */
562
/*
 * Set the maximum number of variational inference iterations
 * (VAR_MAX_ITER). Returns the value that was passed in.
 */
static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
  VAR_MAX_ITER = NUM2INT(max_iter);

  return max_iter;
}
567
+
568
+ /*
569
+ * Get the convergence setting.
570
+ */
571
/*
 * Get the variational inference convergence threshold (VAR_CONVERGED)
 * as a Ruby Float.
 */
static VALUE wrap_get_converged(VALUE self) {
  return rb_float_new(VAR_CONVERGED);
}
574
+
575
+ /*
576
+ * Set the convergence setting.
577
+ */
578
/*
 * Set the variational inference convergence threshold (VAR_CONVERGED).
 * Returns the value that was passed in.
 */
static VALUE wrap_set_converged(VALUE self, VALUE converged) {
  VAR_CONVERGED = (float)NUM2DBL(converged);

  return converged;
}
583
+
584
+ /*
585
+ * Get the max iterations for the EM algorithm.
586
+ */
587
/*
 * Get the maximum number of EM iterations (EM_MAX_ITER) as a Ruby
 * Integer.
 */
static VALUE wrap_get_em_max_iter(VALUE self) {
  return rb_int_new(EM_MAX_ITER);
}
590
+
591
+ /*
592
+ * Set the max iterations for the EM algorithm.
593
+ */
594
/*
 * Set the maximum number of EM iterations (EM_MAX_ITER). Returns the
 * value that was passed in.
 */
static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
  EM_MAX_ITER = NUM2INT(em_max_iter);

  return em_max_iter;
}
599
+
600
+ /*
601
+ * Get the convergence value for EM.
602
+ */
603
/*
 * Get the EM convergence threshold (EM_CONVERGED) as a Ruby Float.
 */
static VALUE wrap_get_em_converged(VALUE self) {
  return rb_float_new(EM_CONVERGED);
}
606
+
607
+ /*
608
+ * Set the convergence value for EM.
609
+ */
610
/*
 * Set the EM convergence threshold (EM_CONVERGED). Returns the value
 * that was passed in.
 */
static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
  EM_CONVERGED = (float)NUM2DBL(em_converged);

  return em_converged;
}
615
+
616
+ /*
617
+ * Get the initial alpha value.
618
+ */
619
/*
 * Get the initial alpha value (INITIAL_ALPHA) as a Ruby Float.
 */
static VALUE wrap_get_initial_alpha(VALUE self) {
  return rb_float_new(INITIAL_ALPHA);
}
622
+
623
+ /*
624
+ * Get the number of topics being clustered.
625
+ */
626
/*
 * Get the number of topics being clustered (NTOPICS) as a Ruby
 * Integer.
 */
static VALUE wrap_get_num_topics(VALUE self) {
  return rb_int_new(NTOPICS);
}
629
+
630
+ /*
631
+ * Set the initial value of alpha.
632
+ */
633
/*
 * Set the initial value of alpha (INITIAL_ALPHA). Returns the value
 * that was passed in.
 */
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
  INITIAL_ALPHA = (float)NUM2DBL(initial_alpha);

  return initial_alpha;
}
638
+
639
+ /*
640
+ * Set the number of topics to be clustered.
641
+ */
642
/*
 * Set the number of topics to be clustered (NTOPICS). Returns the
 * value that was passed in.
 */
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
  NTOPICS = NUM2INT(ntopics);

  return ntopics;
}
647
+
648
+ /*
649
+ * Get the estimate alpha value (fixed = 0).
650
+ */
651
/*
 * Get the estimate-alpha flag (ESTIMATE_ALPHA; 0 means alpha is held
 * fixed) as a Ruby Integer.
 */
static VALUE wrap_get_estimate_alpha(VALUE self) {
  return rb_int_new(ESTIMATE_ALPHA);
}
654
+
655
+ /*
656
+ * Set the estimate alpha value (fixed = 0).
657
+ */
658
+ static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
659
+ ESTIMATE_ALPHA = NUM2INT(est_alpha);
660
+
661
+ return est_alpha;
662
+ }
663
+
664
+
665
+
666
+ /*
667
+ * Run the EM algorithm with the loaded corpus and using the current
668
+ * configuration settings. The +start+ parameter can take the following
669
+ * values:
670
+ * * random - starting alpha are randomized
671
+ * * seeded - loaded based on the corpus values
672
+ * * <filename> - path to the file containing the model
673
+ */
674
+ static VALUE wrap_em(VALUE self, VALUE start) {
675
+ if (!corpus_loaded)
676
+ return Qnil;
677
+
678
+ run_quiet_em(STR2CSTR(start), last_corpus);
679
+
680
+ return Qnil;
681
+ }
682
+
683
+
684
+ /*
685
+ * Load settings from the given file.
686
+ */
687
+ static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
688
+ read_settings(STR2CSTR(settings_file));
689
+
690
+ return Qtrue;
691
+ }
692
+
693
+ /*
694
+ * Load the corpus from the given file. This will not create
695
+ * a +Corpus+ object that is accessible, but it will load the corpus
696
+ * much faster.
697
+ */
698
+ static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
699
+ if (!corpus_loaded) {
700
+ last_corpus = read_data(STR2CSTR(filename));
701
+ corpus_loaded = TRUE;
702
+ return Qtrue;
703
+ } else {
704
+ return Qtrue;
705
+ }
706
+ }
707
+
708
+ /*
709
+ * Set the corpus.
710
+ */
711
+ static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
712
+ corpus* c;
713
+ int i = 0;
714
+ int j = 0;
715
+
716
+ c = malloc(sizeof(corpus));
717
+ c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
718
+ c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
719
+ c->docs = (document*) malloc(sizeof(document) * c->num_docs);
720
+ VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
721
+ for (i = 0; i < c->num_docs; i++) {
722
+ VALUE one_doc = rb_ary_entry(doc_ary, i);
723
+ VALUE words = rb_iv_get(one_doc, "@words");
724
+ VALUE counts = rb_iv_get(one_doc, "@counts");
725
+
726
+ c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
727
+ c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
728
+ c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
729
+ c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
730
+ for (j = 0; j < c->docs[i].length; j++) {
731
+ VALUE one_word = NUM2INT(rb_ary_entry(words, j));
732
+ VALUE one_count = NUM2INT(rb_ary_entry(counts, j));
733
+ c->docs[i].words[j] = one_word;
734
+ c->docs[i].counts[j] = one_count;
735
+ }
736
+ }
737
+
738
+ last_corpus = c;
739
+ corpus_loaded = TRUE;
740
+
741
+ rb_iv_set(self, "@corpus", rcorpus);
742
+
743
+ return Qtrue;
744
+ }
745
+
746
+
747
+ /*
748
+ * Get the gamma values after the model has been run.
749
+ */
750
+ static VALUE wrap_get_gamma(VALUE self) {
751
+ if (!model_loaded)
752
+ return Qnil;
753
+
754
+ // last_gamma is a double[num_docs][num_topics]
755
+ VALUE arr;
756
+ int i = 0, j = 0;
757
+
758
+ arr = rb_ary_new2(last_corpus->num_docs);
759
+ for (i = 0; i < last_corpus->num_docs; i++) {
760
+ VALUE arr2 = rb_ary_new2(last_model->num_topics);
761
+ for (j = 0; j < last_model->num_topics; j++) {
762
+ rb_ary_store(arr2, j, rb_float_new(last_gamma[i][j]));
763
+ }
764
+ rb_ary_store(arr, i, arr2);
765
+ }
766
+
767
+ return arr;
768
+ }
769
+
770
+ /*
771
+ * Get the beta matrix after the model has been run.
772
+ */
773
+ static VALUE wrap_get_model_beta(VALUE self) {
774
+ if (!model_loaded)
775
+ return Qnil;
776
+
777
+ // beta is a double[num_topics][num_terms]
778
+ VALUE arr;
779
+ int i = 0, j = 0;
780
+
781
+ arr = rb_ary_new2(last_model->num_topics);
782
+ for (i = 0; i < last_model->num_topics; i++) {
783
+ VALUE arr2 = rb_ary_new2(last_model->num_terms);
784
+ for (j = 0; j < last_model->num_terms; j++) {
785
+ rb_ary_store(arr2, j, rb_float_new(last_model->log_prob_w[i][j]));
786
+ }
787
+ rb_ary_store(arr, i, arr2);
788
+ }
789
+
790
+ return arr;
791
+ }
792
+
793
+
794
+ /*
795
+ * Get the settings used for the model.
796
+ */
797
+ static VALUE wrap_get_model_settings(VALUE self) {
798
+ if (!model_loaded)
799
+ return Qnil;
800
+
801
+ VALUE arr;
802
+
803
+ arr = rb_ary_new();
804
+ rb_ary_push(arr, rb_int_new(last_model->num_topics));
805
+ rb_ary_push(arr, rb_int_new(last_model->num_terms));
806
+ rb_ary_push(arr, rb_float_new(last_model->alpha));
807
+
808
+ return arr; // [num_topics, num_terms, alpha]
809
+ }
810
+
811
+
812
+ void Init_lda_ext() {
813
+ corpus_loaded = FALSE;
814
+ model_loaded = FALSE;
815
+
816
+ rb_require("lda");
817
+
818
+ /*
819
+ * The Latent Dirichlet Allocation algorithm by Blei et al (2003). Ruby wrapper based on
820
+ * lda-c code by David Blei (available at http://www.cs.princeton.edu/~blei/lda-c).
821
+ */
822
+ rb_mLda = rb_define_module("Lda");
823
+
824
+ /*
825
+ * Class that handles most of the functionality of LDA.
826
+ */
827
+ rb_cLda = rb_define_class_under(rb_mLda, "Lda", rb_cObject);
828
+
829
+ /*
830
+ * Class that represents a corpus.
831
+ */
832
+ rb_cLdaCorpus = rb_define_class_under(rb_mLda, "Corpus", rb_cObject);
833
+
834
+ /*
835
+ * Class that represents a single document.
836
+ */
837
+ rb_cLdaDocument = rb_define_class_under(rb_mLda, "Document", rb_cObject);
838
+
839
+
840
+ // method to load the corpus
841
+ rb_define_method(rb_cLda, "fast_load_corpus_from_file", wrap_load_corpus, 1);
842
+ rb_define_method(rb_cLda, "corpus=", wrap_ruby_corpus, 1);
843
+
844
+ // method to run em
845
+ rb_define_method(rb_cLda, "em", wrap_em, 1);
846
+
847
+ // method to load settings from file
848
+ rb_define_method(rb_cLda, "load_settings", wrap_load_settings, 1);
849
+
850
+ // method to set all the config options at once
851
+ rb_define_method(rb_cLda, "set_config", wrap_set_config, 5);
852
+
853
+ // accessor stuff for main settings
854
+ rb_define_method(rb_cLda, "max_iter", wrap_get_max_iter, 0);
855
+ rb_define_method(rb_cLda, "max_iter=", wrap_set_max_iter, 1);
856
+ rb_define_method(rb_cLda, "convergence", wrap_get_converged, 0);
857
+ rb_define_method(rb_cLda, "convergence=", wrap_set_converged, 1);
858
+ rb_define_method(rb_cLda, "em_max_iter", wrap_get_em_max_iter, 0);
859
+ rb_define_method(rb_cLda, "em_max_iter=", wrap_set_em_max_iter, 1);
860
+ rb_define_method(rb_cLda, "em_convergence", wrap_get_em_converged, 0);
861
+ rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1);
862
+ rb_define_method(rb_cLda, "init_alpha=", wrap_set_initial_alpha, 1);
863
+ rb_define_method(rb_cLda, "init_alpha", wrap_get_initial_alpha, 0);
864
+ rb_define_method(rb_cLda, "est_alpha=", wrap_set_estimate_alpha, 1);
865
+ rb_define_method(rb_cLda, "est_alpha", wrap_get_estimate_alpha, 0);
866
+ rb_define_method(rb_cLda, "num_topics", wrap_get_num_topics, 0);
867
+ rb_define_method(rb_cLda, "num_topics=", wrap_set_num_topics, 1);
868
+
869
+ // retrieve model and gamma
870
+ rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
871
+ rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
872
+ rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
873
+ }
874
+
875
+ #endif