lda-ruby 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/CHANGELOG +22 -0
- data/README +21 -0
- data/README.markdown +38 -0
- data/Rakefile +58 -0
- data/VERSION.yml +4 -0
- data/ext/lda-ruby/Makefile +181 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1007 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +29 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby.rb +168 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +37 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/license.txt +504 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +95 -0
@@ -0,0 +1,27 @@
|
|
1
|
+
/*
 * cokus.h -- interface to Shawn Cokus's random number generator
 * (the state-vector length 624, period parameter 397 and constant
 * 0x9908B0DF are the MT19937 Mersenne Twister parameters).
 */
#ifndef COKUS_H
#define COKUS_H

#include <stdio.h>
#include <stdlib.h>

//
// uint32 must be an unsigned integer type capable of holding at least 32
// bits; exactly 32 should be fastest, but 64 is better on an Alpha with
// GCC at -O3 optimization so try your options and see what's best for you
//

typedef unsigned long uint32;

// NOTE(review): these one-letter macro names (N, M, K) leak into every
// translation unit that includes this header and can collide with local
// identifiers -- confirm no file includes cokus.h alongside code that
// uses N/M/K as variable or parameter names.
#define N (624) // length of state vector
#define M (397) // a period parameter
#define K (0x9908B0DFU) // a magic constant
#define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u
#define loBit(u) ((u) & 0x00000001U) // mask all but lowest bit of u
#define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u
#define mixBits(u, v) (hiBit(u)|loBits(v)) // move hi bit of u to hi bit of v

// seed the generator state from a single 32-bit value
void seedMT(uint32 seed);
// refill the internal state vector and return the next raw value
uint32 reloadMT(void);
// return the next 32-bit pseudo-random value
uint32 randomMT(void);

#endif
|
@@ -0,0 +1,96 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include "lda-alpha.h"
|
21
|
+
|
22
|
+
/*
|
23
|
+
* objective function and its derivatives
|
24
|
+
*
|
25
|
+
*/
|
26
|
+
|
27
|
+
double alhood(double a, double ss, int D, int K)
|
28
|
+
{ return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); }
|
29
|
+
|
30
|
+
/* First derivative of alhood with respect to a. */
double d_alhood(double a, double ss, int D, int K)
{
    double per_doc = K * digamma(K * a) - K * digamma(a);
    return D * per_doc + ss;
}
|
32
|
+
|
33
|
+
/* Second derivative of alhood with respect to a (ss term vanishes). */
double d2_alhood(double a, int D, int K)
{
    double per_doc = K * K * trigamma(K * a) - K * trigamma(a);
    return D * per_doc;
}
|
35
|
+
|
36
|
+
|
37
|
+
/*
|
38
|
+
* newtons method
|
39
|
+
*
|
40
|
+
*/
|
41
|
+
|
42
|
+
double opt_alpha(double ss, int D, int K)
|
43
|
+
{
|
44
|
+
double a, log_a, init_a = 100;
|
45
|
+
double f, df, d2f;
|
46
|
+
int iter = 0;
|
47
|
+
|
48
|
+
log_a = log(init_a);
|
49
|
+
do
|
50
|
+
{
|
51
|
+
iter++;
|
52
|
+
a = exp(log_a);
|
53
|
+
if (isnan(a))
|
54
|
+
{
|
55
|
+
init_a = init_a * 10;
|
56
|
+
printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
57
|
+
a = init_a;
|
58
|
+
log_a = log(a);
|
59
|
+
}
|
60
|
+
f = alhood(a, ss, D, K);
|
61
|
+
df = d_alhood(a, ss, D, K);
|
62
|
+
d2f = d2_alhood(a, D, K);
|
63
|
+
log_a = log_a - df/(d2f * a + df);
|
64
|
+
printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
65
|
+
}
|
66
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
67
|
+
return(exp(log_a));
|
68
|
+
}
|
69
|
+
|
70
|
+
double quiet_opt_alpha(double ss, int D, int K)
|
71
|
+
{
|
72
|
+
double a, log_a, init_a = 100;
|
73
|
+
double f, df, d2f;
|
74
|
+
int iter = 0;
|
75
|
+
|
76
|
+
log_a = log(init_a);
|
77
|
+
do
|
78
|
+
{
|
79
|
+
iter++;
|
80
|
+
a = exp(log_a);
|
81
|
+
if (isnan(a))
|
82
|
+
{
|
83
|
+
init_a = init_a * 10;
|
84
|
+
//printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
85
|
+
a = init_a;
|
86
|
+
log_a = log(a);
|
87
|
+
}
|
88
|
+
f = alhood(a, ss, D, K);
|
89
|
+
df = d_alhood(a, ss, D, K);
|
90
|
+
d2f = d2_alhood(a, D, K);
|
91
|
+
log_a = log_a - df/(d2f * a + df);
|
92
|
+
//printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
93
|
+
}
|
94
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
95
|
+
return(exp(log_a));
|
96
|
+
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
/*
 * lda-alpha.h -- Newton-method optimization of the LDA alpha
 * (Dirichlet) hyperparameter.  Implemented in lda-alpha.c.
 */
#ifndef LDA_ALPHA_H
#define LDA_ALPHA_H

#include <stdlib.h>
#include <math.h>
#include <float.h>

#include "lda.h"
#include "utils.h"

#define NEWTON_THRESH 1e-5      /* stop when |df| drops below this */
#define MAX_ALPHA_ITER 1000     /* hard cap on Newton iterations */

/* objective, first and second derivative of the alpha likelihood;
 * ss = alpha sufficient statistic, D = #documents, K = #topics */
double alhood(double a, double ss, int D, int K);
double d_alhood(double a, double ss, int D, int K);
double d2_alhood(double a, int D, int K);
/* maximize the objective; opt_alpha logs progress, quiet_opt_alpha doesn't */
double opt_alpha(double ss, int D, int K);
double quiet_opt_alpha(double ss, int D, int K);
//void maximize_alpha(double** gamma, lda_model* model, int num_docs);

#endif
|
@@ -0,0 +1,67 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include "lda-data.h"
|
21
|
+
|
22
|
+
corpus* read_data(char* data_filename)
|
23
|
+
{
|
24
|
+
FILE *fileptr;
|
25
|
+
int length, count, word, n, nd, nw;
|
26
|
+
corpus* c;
|
27
|
+
|
28
|
+
printf("reading data from %s\n", data_filename);
|
29
|
+
c = malloc(sizeof(corpus));
|
30
|
+
c->docs = 0;
|
31
|
+
c->num_terms = 0;
|
32
|
+
c->num_docs = 0;
|
33
|
+
fileptr = fopen(data_filename, "r");
|
34
|
+
nd = 0; nw = 0;
|
35
|
+
while ((fscanf(fileptr, "%10d", &length) != EOF))
|
36
|
+
{
|
37
|
+
c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1));
|
38
|
+
c->docs[nd].length = length;
|
39
|
+
c->docs[nd].total = 0;
|
40
|
+
c->docs[nd].words = malloc(sizeof(int)*length);
|
41
|
+
c->docs[nd].counts = malloc(sizeof(int)*length);
|
42
|
+
for (n = 0; n < length; n++)
|
43
|
+
{
|
44
|
+
fscanf(fileptr, "%10d:%10d", &word, &count);
|
45
|
+
word = word - OFFSET;
|
46
|
+
c->docs[nd].words[n] = word;
|
47
|
+
c->docs[nd].counts[n] = count;
|
48
|
+
c->docs[nd].total += count;
|
49
|
+
if (word >= nw) { nw = word + 1; }
|
50
|
+
}
|
51
|
+
nd++;
|
52
|
+
}
|
53
|
+
fclose(fileptr);
|
54
|
+
c->num_docs = nd;
|
55
|
+
c->num_terms = nw;
|
56
|
+
printf("number of docs : %d\n", nd);
|
57
|
+
printf("number of terms : %d\n", nw);
|
58
|
+
return(c);
|
59
|
+
}
|
60
|
+
|
61
|
+
/* Length (number of unique-term entries) of the longest document. */
int max_corpus_length(corpus* c)
{
    int longest = 0;
    int d;
    for (d = 0; d < c->num_docs; d++)
    {
        if (c->docs[d].length > longest)
            longest = c->docs[d].length;
    }
    return longest;
}
|
@@ -0,0 +1,1007 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <math.h>
|
23
|
+
#include <float.h>
|
24
|
+
#include <string.h>
|
25
|
+
#include <time.h>
|
26
|
+
|
27
|
+
#include "lda.h"
|
28
|
+
#include "lda-data.h"
|
29
|
+
#include "lda-inference.h"
|
30
|
+
#include "lda-model.h"
|
31
|
+
#include "utils.h"
|
32
|
+
#include "cokus.h"
|
33
|
+
|
34
|
+
#ifdef USE_RUBY
|
35
|
+
#include "ruby.h"
|
36
|
+
|
37
|
+
VALUE rb_cLdaModule;
|
38
|
+
VALUE rb_cLda;
|
39
|
+
VALUE rb_cLdaCorpus;
|
40
|
+
VALUE rb_cLdaDocument;
|
41
|
+
#endif
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
/*
|
46
|
+
* variational inference
|
47
|
+
*/
|
48
|
+
|
49
|
+
/*
 * Variational inference for a single document: iteratively update the
 * topic assignments phi (doc->length x num_topics, maintained in log
 * space until normalization) and the Dirichlet posterior var_gamma
 * (num_topics), until the likelihood bound changes by less than
 * VAR_CONVERGED or VAR_MAX_ITER passes elapse (-1 = unlimited).
 * Sets *errors = 1 if the bound ever becomes NaN.
 * Returns the final likelihood bound.
 */
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
    double converged = 1;
    double phisum = 0, likelihood = 0;
    double likelihood_old = 0, oldphi[model->num_topics];
    int k = 0, n = 0, var_iter = 0, index = 0;
    double digamma_gam[model->num_topics];

    /* zero'em out */
    memset(digamma_gam,0.0,sizeof(digamma_gam));
    memset(oldphi,0.0,sizeof(oldphi));

    // compute posterior dirichlet: uniform phi, gamma = alpha + total/K

    for (k = 0; k < model->num_topics; k++)
    {
        var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
        digamma_gam[k] = digamma(var_gamma[k]);
        for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;

    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
        var_iter++;
        for (n = 0; n < doc->length; n++)
        {
            phisum = 0;
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
                index = doc->words[n];
                /* FIX: was `index > model->num_terms`, which let
                 * index == num_terms read one past the end of
                 * log_prob_w[k] (valid ids are 0..num_terms-1). */
                if( index < 0 || index >= model->num_terms ) {
                    printf("phi for term: %d of %d\n", index, model->num_terms);
                    phi[n][k] = 0.0;
                }
                else {
                    phi[n][k] =
                        digamma_gam[k] +
                        model->log_prob_w[k][index];
                }

                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]);
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }

            for (k = 0; k < model->num_topics; k++)
            {
                phi[n][k] = exp(phi[n][k] - phisum);
                var_gamma[k] =
                    var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                // !!! a lot of extra digamma's here because of how we're computing it
                // !!! but its more automatically updated too.
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }

        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        if( isnan(likelihood) ) { *errors = 1; }
        // NOTE(review): on the first pass likelihood_old is 0, so this
        // divides by zero and relies on IEEE inf to keep iterating
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;

        // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
    }
    return(likelihood);
}
|
119
|
+
|
120
|
+
|
121
|
+
/*
|
122
|
+
* compute likelihood bound
|
123
|
+
*/
|
124
|
+
|
125
|
+
/*
 * Evaluate the variational likelihood bound for one document given the
 * current phi (doc->length x num_topics, probability space) and
 * var_gamma (num_topics).  Pure function of its arguments; terms with
 * phi[n][k] == 0 are skipped so log(phi) stays finite.
 */
double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
    double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics];
    int k = 0, n = 0, index = 0;
    memset(dig,0.0,sizeof(dig));

    /* digamma of each gamma component and of their sum */
    for (k = 0; k < model->num_topics; k++)
    {
        dig[k] = digamma(var_gamma[k]);
        var_gamma_sum += var_gamma[k];
    }
    digsum = digamma(var_gamma_sum);

    /* alpha-prior and gamma-entropy normalization terms */
    likelihood = lgamma(model->alpha * model->num_topics) -
        model->num_topics *
        lgamma(model->alpha) -
        lgamma(var_gamma_sum);

    for (k = 0; k < model->num_topics; k++)
    {
        likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum);

        for (n = 0; n < doc->length; n++)
        {
            if (phi[n][k] > 0)
            {
                index = doc->words[n];
                /* expected log word probability minus phi entropy */
                likelihood += doc->counts[n]*
                    (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
                                + model->log_prob_w[k][index]));
            }
        }
    }
    return(likelihood);
}
|
159
|
+
|
160
|
+
|
161
|
+
/*
 * Variational E-step for one document: run posterior inference to fit
 * gamma/phi, then fold the document's expected counts into the
 * sufficient statistics consumed by the M-step.
 * Returns the document's likelihood bound (0.0 if inference saw NaN).
 */
double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
    double likelihood;
    int n, k;
    short error = 0;

    // posterior inference

    likelihood = lda_inference(doc, model, gamma, phi, &error);
    if (error) { likelihood = 0.0; }


    // update sufficient statistics

    double gamma_sum = 0;
    for (k = 0; k < model->num_topics; k++)
    {
        gamma_sum += gamma[k];
        ss->alpha_suffstats += digamma(gamma[k]);
    }
    ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);

    for (n = 0; n < doc->length; n++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
            /* expected count of word n assigned to topic k */
            ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
            ss->class_total[k] += doc->counts[n]*phi[n][k];
        }
    }

    ss->num_docs = ss->num_docs + 1;

    return(likelihood);
}
|
195
|
+
|
196
|
+
|
197
|
+
/*
|
198
|
+
* writes the word assignments line for a document to a file
|
199
|
+
*/
|
200
|
+
|
201
|
+
/*
 * Write one line for the document: its length followed by one
 * "word:topic" pair per entry, topic being the argmax of that word's
 * phi row.  Flushes the stream after the line.
 */
void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model) {
    int w;

    fprintf(f, "%03d", doc->length);
    for (w = 0; w < doc->length; w++) {
        int best_topic = argmax(phi[w], model->num_topics);
        fprintf(f, " %04d:%02d", doc->words[w], best_topic);
    }
    fprintf(f, "\n");
    fflush(f);
}
|
211
|
+
|
212
|
+
|
213
|
+
/*
|
214
|
+
* saves the gamma parameters of the current dataset
|
215
|
+
*/
|
216
|
+
|
217
|
+
/*
 * Save the per-document variational Dirichlet parameters: one row per
 * document, num_topics space-separated "%5.10f" values per row.
 * Logs an error and returns without writing if the file can't be opened.
 */
void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) {
    FILE* fileptr;
    int d, k;
    fileptr = fopen(filename, "w");
    if (fileptr == NULL) {
        /* FIX: an unopenable path used to crash on the first fprintf */
        printf("error : cannot write %s\n", filename);
        return;
    }

    for (d = 0; d < num_docs; d++) {
        fprintf(fileptr, "%5.10f", gamma[d][0]);
        for (k = 1; k < num_topics; k++) {
            fprintf(fileptr, " %5.10f", gamma[d][k]);
        }
        fprintf(fileptr, "\n");
    }
    fclose(fileptr);
}
|
231
|
+
|
232
|
+
|
233
|
+
/*
 * Full EM training loop with on-disk output.
 * start     - "seeded" / "random" initialization, or a model prefix to load
 * directory - output directory; receives numbered model snapshots (every
 *             LAG iterations), likelihood.dat, final model/gamma and
 *             word-assignments.dat
 * Globals read: NTOPICS, INITIAL_ALPHA, VERBOSE, ESTIMATE_ALPHA,
 * EM_CONVERGED, EM_MAX_ITER, LAG; VAR_MAX_ITER is doubled whenever the
 * bound decreases.
 * NOTE(review): var_gamma/phi/ss are never freed here and fopen results
 * are unchecked -- acceptable for a run-to-exit CLI, verify before reuse.
 */
void run_em(char* start, char* directory, corpus* corpus) {
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters


    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    // phi is shared across documents, sized for the longest one
    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model

    char filename[100];

    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }

        model->alpha = INITIAL_ALPHA;
    } else if (strcmp(start, "random")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    } else {
        // anything else is treated as a path prefix of a saved model
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    sprintf(filename,"%s/000",directory);
    save_lda_model(model, filename);

    // run expectation maximization

    int i = 0;
    double likelihood, likelihood_old = 0, converged = 1;
    sprintf(filename, "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");

    // always run at least 3 iterations; stop on convergence or EM_MAX_ITER
    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE)
            printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        printf("e-step\n");

        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }
        printf("m-step\n");

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; a decreasing bound means inference was
        // cut off too early, so allow more variational iterations
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood

        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0)
        {
            sprintf(filename,"%s/%03d",directory, i);
            save_lda_model(model, filename);
            sprintf(filename,"%s/%03d.gamma",directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }

    // output the final model

    sprintf(filename,"%s/final",directory);
    save_lda_model(model, filename);
    sprintf(filename,"%s/final.gamma",directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)

    sprintf(filename, "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    short error = 0;
    double tl = 0.0;
    for (d = 0; d < corpus->num_docs; d++)
    {
        if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
        error = 0;
        tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error);
        // documents whose inference produced NaN are skipped entirely
        if( error ) { continue; }
        likelihood += tl;
        write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
    }
    fclose(w_asgn_file);
    fclose(likelihood_file);
}
|
357
|
+
|
358
|
+
|
359
|
+
/*
|
360
|
+
* read settings.
|
361
|
+
*/
|
362
|
+
|
363
|
+
void read_settings(char* filename) {
|
364
|
+
FILE* fileptr;
|
365
|
+
char alpha_action[100];
|
366
|
+
fileptr = fopen(filename, "r");
|
367
|
+
fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
|
368
|
+
fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
|
369
|
+
fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
|
370
|
+
fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
|
371
|
+
fscanf(fileptr, "alpha %s", alpha_action);
|
372
|
+
if (strcmp(alpha_action, "fixed")==0)
|
373
|
+
{
|
374
|
+
ESTIMATE_ALPHA = 0;
|
375
|
+
}
|
376
|
+
else
|
377
|
+
{
|
378
|
+
ESTIMATE_ALPHA = 1;
|
379
|
+
}
|
380
|
+
fclose(fileptr);
|
381
|
+
}
|
382
|
+
|
383
|
+
|
384
|
+
|
385
|
+
|
386
|
+
/*
|
387
|
+
* inference only
|
388
|
+
*
|
389
|
+
*/
|
390
|
+
|
391
|
+
/*
 * Inference-only mode: load a trained model, run variational inference
 * on every document, and write per-document likelihoods
 * ("<save>-lda-lhood.dat") and gamma parameters ("<save>-gamma.dat").
 */
void infer(char* model_root, char* save, corpus* corpus) {
    FILE* fileptr;
    char filename[100];
    int i, d, n;
    lda_model *model;
    double **var_gamma, likelihood, **phi;
    document* doc;

    model = load_lda_model(model_root);
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (i = 0; i < corpus->num_docs; i++)
        var_gamma[i] = malloc(sizeof(double)*model->num_topics);
    sprintf(filename, "%s-lda-lhood.dat", save);
    fileptr = fopen(filename, "w");
    for (d = 0; d < corpus->num_docs; d++) {
        if (((d % 100) == 0) && (d>0) && VERBOSE) printf("document %d\n",d);

        doc = &(corpus->docs[d]);
        phi = (double**) malloc(sizeof(double*) * doc->length);
        for (n = 0; n < doc->length; n++)
            phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
        short error = 0;
        /* NOTE(review): the error flag is ignored here, unlike run_em --
         * a NaN likelihood is still written to the file */
        likelihood = lda_inference(doc, model, var_gamma[d], phi, &error);

        fprintf(fileptr, "%5.5f\n", likelihood);

        /* FIX: phi was reallocated for every document and never
         * released, leaking O(corpus size) memory */
        for (n = 0; n < doc->length; n++)
            free(phi[n]);
        free(phi);
    }
    fclose(fileptr);
    sprintf(filename, "%s-gamma.dat", save);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
}
|
421
|
+
|
422
|
+
|
423
|
+
/*
|
424
|
+
* update sufficient statistics
|
425
|
+
*
|
426
|
+
*/
|
427
|
+
|
428
|
+
|
429
|
+
|
430
|
+
/*
|
431
|
+
* main
|
432
|
+
*
|
433
|
+
*/
|
434
|
+
|
435
|
+
int main(int argc, char* argv[]) {
|
436
|
+
corpus* corpus;
|
437
|
+
|
438
|
+
long t1;
|
439
|
+
(void) time(&t1);
|
440
|
+
seedMT(t1);
|
441
|
+
// seedMT(4357U);
|
442
|
+
|
443
|
+
if (argc > 1)
|
444
|
+
{
|
445
|
+
if (strcmp(argv[1], "est")==0)
|
446
|
+
{
|
447
|
+
INITIAL_ALPHA = atof(argv[2]);
|
448
|
+
NTOPICS = atoi(argv[3]);
|
449
|
+
read_settings(argv[4]);
|
450
|
+
corpus = read_data(argv[5]);
|
451
|
+
make_directory(argv[7]);
|
452
|
+
run_em(argv[6], argv[7], corpus);
|
453
|
+
}
|
454
|
+
if (strcmp(argv[1], "inf")==0)
|
455
|
+
{
|
456
|
+
read_settings(argv[2]);
|
457
|
+
corpus = read_data(argv[4]);
|
458
|
+
infer(argv[3], argv[5], corpus);
|
459
|
+
}
|
460
|
+
}
|
461
|
+
else
|
462
|
+
{
|
463
|
+
printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/*] [directory]\n");
|
464
|
+
printf(" lda inf [settings] [model] [data] [name]\n");
|
465
|
+
}
|
466
|
+
return(0);
|
467
|
+
}
|
468
|
+
|
469
|
+
#ifdef USE_RUBY
|
470
|
+
|
471
|
+
/* */
|
472
|
+
/*
 * EM training used by the Ruby extension: same loop as run_em but writes
 * nothing to disk; the fitted model and variational parameters are left
 * in the globals last_model / last_gamma / last_phi for the wrappers.
 * start selects initialization: "seeded", "fixed", "random", or a path
 * prefix of a saved model.
 */
void run_quiet_em(char* start, corpus* corpus) {
    int d = 0, n = 0;
    lda_model *model = NULL;
    double **var_gamma = NULL, **phi = NULL;
    // last_gamma is a double[num_docs][num_topics]

    // allocate variational parameters


    var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
    /* FIX: the old memset zeroed only num_docs BYTES (and passed a double
     * as the int fill value); zero the whole pointer array */
    memset(var_gamma, 0, sizeof(double*)*corpus->num_docs);

    for (d = 0; d < corpus->num_docs; ++d) {
        var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(var_gamma[d], 0, sizeof(double)*NTOPICS);
    }

    int max_length = max_corpus_length(corpus);

    phi = (double**)malloc(sizeof(double*)*max_length);
    /* FIX: same wrong byte count as var_gamma above */
    memset(phi, 0, sizeof(double*)*max_length);
    for (n = 0; n < max_length; ++n) {
        phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(phi[n], 0, sizeof(double)*NTOPICS);
    }

    // initialize model

    lda_suffstats* ss = NULL;
    if (strncmp(start, "seeded",6)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        if (VERBOSE) {
            corpus_initialize_ss(ss, model, corpus);
        } else {
            quiet_corpus_initialize_ss(ss, model, corpus);
        }
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "fixed",5)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        corpus_initialize_fixed_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "random",6)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // save the model in the last_model global
    last_model = model;
    model_loaded = TRUE;

    // run expectation maximization

    int i = 0;
    double likelihood = 0.0, likelihood_old = 0, converged = 1;

    // always run at least 3 iterations; stop on convergence or EM_MAX_ITER
    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE) printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step

        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; a decreasing bound means inference was
        // cut off too early, so allow more variational iterations

        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // store model and likelihood

        last_model = model;
        last_gamma = var_gamma;
        last_phi = phi;
    }

    // output the final model

    last_model = model;
    last_gamma = var_gamma;
    last_phi = phi;

    free_lda_suffstats(model,ss);
}
|
604
|
+
|
605
|
+
|
606
|
+
/*
|
607
|
+
* Set all of the settings in one command:
|
608
|
+
*
|
609
|
+
* * init_alpha
|
610
|
+
* * num_topics
|
611
|
+
* * max_iter
|
612
|
+
* * convergence
|
613
|
+
* * em_max_iter
|
614
|
+
* * em_convergence
|
615
|
+
* * est_alpha
|
616
|
+
*/
|
617
|
+
/*
 * Set all of the settings in one command:
 *
 * * init_alpha
 * * num_topics   (must be > 0; raises otherwise)
 * * max_iter
 * * convergence
 * * em_max_iter
 * * em_convergence
 * * est_alpha
 */
static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
    INITIAL_ALPHA = NUM2DBL(init_alpha);
    NTOPICS = NUM2INT(num_topics);
    /* FIX: the message promises "> 0" but the old check (< 0) admitted
     * NTOPICS == 0, which later divides by zero in lda_inference */
    if( NTOPICS <= 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); }
    VAR_MAX_ITER = NUM2INT(max_iter);
    VAR_CONVERGED = (float)NUM2DBL(convergence);
    EM_MAX_ITER = NUM2INT(em_max_iter);
    EM_CONVERGED = (float)NUM2DBL(em_convergence);
    ESTIMATE_ALPHA = NUM2INT(est_alpha);

    return Qtrue;
}
|
629
|
+
|
630
|
+
/*
|
631
|
+
* Get the maximum iterations.
|
632
|
+
*/
|
633
|
+
/*
 * Ruby reader for the variational-inference iteration cap.
 */
static VALUE wrap_get_max_iter(VALUE self) {
    int current = VAR_MAX_ITER;
    return rb_int_new(current);
}
|
636
|
+
|
637
|
+
/*
|
638
|
+
* Set the maximum iterations.
|
639
|
+
*/
|
640
|
+
/*
 * Ruby writer for the variational-inference iteration cap.
 */
static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
    int requested = NUM2INT(max_iter);
    VAR_MAX_ITER = requested;
    return max_iter;
}
|
645
|
+
|
646
|
+
/*
|
647
|
+
* Get the convergence setting.
|
648
|
+
*/
|
649
|
+
/*
 * Ruby reader for the variational convergence threshold.
 */
static VALUE wrap_get_converged(VALUE self) {
    double current = VAR_CONVERGED;
    return rb_float_new(current);
}
|
652
|
+
|
653
|
+
/*
|
654
|
+
* Set the convergence setting.
|
655
|
+
*/
|
656
|
+
/*
 * Ruby writer for the variational convergence threshold.
 */
static VALUE wrap_set_converged(VALUE self, VALUE converged) {
    float requested = (float)NUM2DBL(converged);
    VAR_CONVERGED = requested;
    return converged;
}
|
661
|
+
|
662
|
+
/*
|
663
|
+
* Get the max iterations for the EM algorithm.
|
664
|
+
*/
|
665
|
+
/*
 * Ruby reader for the EM iteration cap.
 */
static VALUE wrap_get_em_max_iter(VALUE self) {
    int current = EM_MAX_ITER;
    return rb_int_new(current);
}
|
668
|
+
|
669
|
+
/*
|
670
|
+
* Set the max iterations for the EM algorithm.
|
671
|
+
*/
|
672
|
+
/*
 * Ruby writer for the EM iteration cap.
 */
static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
    int requested = NUM2INT(em_max_iter);
    EM_MAX_ITER = requested;
    return em_max_iter;
}
|
677
|
+
|
678
|
+
/*
|
679
|
+
* Get the convergence value for EM.
|
680
|
+
*/
|
681
|
+
/*
 * Ruby reader for the EM convergence threshold.
 */
static VALUE wrap_get_em_converged(VALUE self) {
    double current = EM_CONVERGED;
    return rb_float_new(current);
}
|
684
|
+
|
685
|
+
/*
|
686
|
+
* Set the convergence value for EM.
|
687
|
+
*/
|
688
|
+
/*
 * Ruby writer for the EM convergence threshold.
 */
static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
    float requested = (float)NUM2DBL(em_converged);
    EM_CONVERGED = requested;
    return em_converged;
}
|
693
|
+
|
694
|
+
/*
|
695
|
+
* Get the initial alpha value.
|
696
|
+
*/
|
697
|
+
/*
 * Ruby reader for the initial alpha hyperparameter.
 */
static VALUE wrap_get_initial_alpha(VALUE self) {
    double current = INITIAL_ALPHA;
    return rb_float_new(current);
}
|
700
|
+
|
701
|
+
/*
|
702
|
+
* Get the number of topics being clustered.
|
703
|
+
*/
|
704
|
+
static VALUE wrap_get_num_topics(VALUE self) {
|
705
|
+
return rb_int_new(NTOPICS);
|
706
|
+
}
|
707
|
+
|
708
|
+
/*
|
709
|
+
* Set the initial value of alpha.
|
710
|
+
*/
|
711
|
+
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
|
712
|
+
INITIAL_ALPHA = (float)NUM2DBL(initial_alpha);
|
713
|
+
|
714
|
+
return initial_alpha;
|
715
|
+
}
|
716
|
+
|
717
|
+
/*
|
718
|
+
* Set the number of topics to be clustered.
|
719
|
+
*/
|
720
|
+
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
|
721
|
+
NTOPICS = NUM2INT(ntopics);
|
722
|
+
|
723
|
+
return ntopics;
|
724
|
+
}
|
725
|
+
|
726
|
+
/*
|
727
|
+
* Get the estimate alpha value (fixed = 0).
|
728
|
+
*/
|
729
|
+
static VALUE wrap_get_estimate_alpha(VALUE self) {
|
730
|
+
return rb_int_new(ESTIMATE_ALPHA);
|
731
|
+
}
|
732
|
+
|
733
|
+
/*
|
734
|
+
* Set the estimate alpha value (fixed = 0).
|
735
|
+
*/
|
736
|
+
static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
|
737
|
+
ESTIMATE_ALPHA = NUM2INT(est_alpha);
|
738
|
+
|
739
|
+
return est_alpha;
|
740
|
+
}
|
741
|
+
|
742
|
+
/*
|
743
|
+
* Get the verbosity setting.
|
744
|
+
*/
|
745
|
+
static VALUE wrap_get_verbosity(VALUE self) {
|
746
|
+
if (VERBOSE) {
|
747
|
+
return Qtrue;
|
748
|
+
} else {
|
749
|
+
return Qfalse;
|
750
|
+
}
|
751
|
+
}
|
752
|
+
|
753
|
+
|
754
|
+
/*
|
755
|
+
* Set the verbosity level (true, false).
|
756
|
+
*/
|
757
|
+
static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
|
758
|
+
if (verbosity == Qtrue) {
|
759
|
+
VERBOSE = TRUE;
|
760
|
+
} else {
|
761
|
+
VERBOSE = FALSE;
|
762
|
+
}
|
763
|
+
|
764
|
+
return verbosity;
|
765
|
+
}
|
766
|
+
|
767
|
+
|
768
|
+
|
769
|
+
/*
|
770
|
+
* Run the EM algorithm with the loaded corpus and using the current
|
771
|
+
* configuration settings. The +start+ parameter can take the following
|
772
|
+
* values:
|
773
|
+
* * random - starting alpha are randomized
|
774
|
+
* * seeded - loaded based on the corpus values
|
775
|
+
* * <filename> - path to the file containing the model
|
776
|
+
*/
|
777
|
+
static VALUE wrap_em(VALUE self, VALUE start) {
|
778
|
+
if (!corpus_loaded)
|
779
|
+
return Qnil;
|
780
|
+
|
781
|
+
run_quiet_em(STR2CSTR(start), last_corpus);
|
782
|
+
|
783
|
+
return Qnil;
|
784
|
+
}
|
785
|
+
|
786
|
+
|
787
|
+
/*
|
788
|
+
* Load settings from the given file.
|
789
|
+
*/
|
790
|
+
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
|
791
|
+
read_settings(STR2CSTR(settings_file));
|
792
|
+
|
793
|
+
return Qtrue;
|
794
|
+
}
|
795
|
+
|
796
|
+
/*
|
797
|
+
* Load the corpus from the given file. This will not create
|
798
|
+
* a +Corpus+ object that is accessible, but it will load the corpus
|
799
|
+
* much faster.
|
800
|
+
*/
|
801
|
+
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
|
802
|
+
if (!corpus_loaded) {
|
803
|
+
last_corpus = read_data(STR2CSTR(filename));
|
804
|
+
corpus_loaded = TRUE;
|
805
|
+
return Qtrue;
|
806
|
+
} else {
|
807
|
+
return Qtrue;
|
808
|
+
}
|
809
|
+
}
|
810
|
+
|
811
|
+
/*
|
812
|
+
* Set the corpus.
|
813
|
+
*/
|
814
|
+
static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
|
815
|
+
corpus* c;
|
816
|
+
int i = 0;
|
817
|
+
int j = 0;
|
818
|
+
|
819
|
+
c = malloc(sizeof(corpus));
|
820
|
+
c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
|
821
|
+
c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
|
822
|
+
c->docs = (document*) malloc(sizeof(document) * c->num_docs);
|
823
|
+
VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
|
824
|
+
for (i = 0; i < c->num_docs; i++) {
|
825
|
+
VALUE one_doc = rb_ary_entry(doc_ary, i);
|
826
|
+
VALUE words = rb_iv_get(one_doc, "@words");
|
827
|
+
VALUE counts = rb_iv_get(one_doc, "@counts");
|
828
|
+
|
829
|
+
c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
|
830
|
+
c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
|
831
|
+
c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
|
832
|
+
c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
|
833
|
+
for (j = 0; j < c->docs[i].length; j++) {
|
834
|
+
int one_word = NUM2INT(rb_ary_entry(words, j));
|
835
|
+
int one_count = NUM2INT(rb_ary_entry(counts, j));
|
836
|
+
if( one_word > c->num_terms ) {
|
837
|
+
rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word);
|
838
|
+
}
|
839
|
+
c->docs[i].words[j] = one_word;
|
840
|
+
c->docs[i].counts[j] = one_count;
|
841
|
+
}
|
842
|
+
}
|
843
|
+
|
844
|
+
last_corpus = c;
|
845
|
+
corpus_loaded = TRUE;
|
846
|
+
|
847
|
+
rb_iv_set(self, "@corpus", rcorpus);
|
848
|
+
|
849
|
+
return Qtrue;
|
850
|
+
}
|
851
|
+
|
852
|
+
|
853
|
+
/*
|
854
|
+
* Get the gamma values after the model has been run.
|
855
|
+
*/
|
856
|
+
static VALUE wrap_get_gamma(VALUE self) {
|
857
|
+
if (!model_loaded)
|
858
|
+
return Qnil;
|
859
|
+
|
860
|
+
// last_gamma is a double[num_docs][num_topics]
|
861
|
+
VALUE arr;
|
862
|
+
int i = 0, j = 0;
|
863
|
+
|
864
|
+
arr = rb_ary_new2(last_corpus->num_docs);
|
865
|
+
for (i = 0; i < last_corpus->num_docs; i++) {
|
866
|
+
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
867
|
+
for (j = 0; j < last_model->num_topics; j++) {
|
868
|
+
rb_ary_store(arr2, j, rb_float_new(last_gamma[i][j]));
|
869
|
+
}
|
870
|
+
rb_ary_store(arr, i, arr2);
|
871
|
+
}
|
872
|
+
|
873
|
+
return arr;
|
874
|
+
}
|
875
|
+
|
876
|
+
|
877
|
+
/*
 * Compute the phi values by running inference after the initial EM run has been completed.
 *
 * Returns a 3D matrix: <tt>num_docs x length x num_topics</tt>, or nil if
 * no model has been estimated yet.
 */
static VALUE wrap_get_phi(VALUE self) {
  if (!model_loaded)
    return Qnil;

  VALUE arr = rb_ary_new2(last_corpus->num_docs);
  int i = 0, j = 0, k = 0;

  //int max_length = max_corpus_length(last_corpus);
  /* NOTE(review): `error` is written by lda_inference but never checked
   * here — inference failures are silently ignored. Confirm whether an
   * exception should be raised instead. */
  short error = 0;

  for (i = 0; i < last_corpus->num_docs; i++) {
    VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);

    /* Re-run variational inference for this document to populate last_phi;
     * presumably last_phi is sized max_corpus_length x num_topics by the
     * EM setup — TODO confirm against the allocation site. */
    lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);

    /* Copy this document's word-by-topic matrix out before the next
     * iteration overwrites last_phi. */
    for (j = 0; j < last_corpus->docs[i].length; j++) {
      VALUE arr2 = rb_ary_new2(last_model->num_topics);

      for (k = 0; k < last_model->num_topics; k++) {
        rb_ary_store(arr2, k, rb_float_new(last_phi[j][k]));
      }

      rb_ary_store(arr1, j, arr2);
    }

    rb_ary_store(arr, i, arr1);
  }

  return arr;
}
|
912
|
+
|
913
|
+
|
914
|
+
|
915
|
+
/*
|
916
|
+
* Get the beta matrix after the model has been run.
|
917
|
+
*/
|
918
|
+
static VALUE wrap_get_model_beta(VALUE self) {
|
919
|
+
if (!model_loaded)
|
920
|
+
return Qnil;
|
921
|
+
|
922
|
+
// beta is a double[num_topics][num_terms]
|
923
|
+
VALUE arr;
|
924
|
+
int i = 0, j = 0;
|
925
|
+
|
926
|
+
arr = rb_ary_new2(last_model->num_topics);
|
927
|
+
for (i = 0; i < last_model->num_topics; i++) {
|
928
|
+
VALUE arr2 = rb_ary_new2(last_model->num_terms);
|
929
|
+
for (j = 0; j < last_model->num_terms; j++) {
|
930
|
+
rb_ary_store(arr2, j, rb_float_new(last_model->log_prob_w[i][j]));
|
931
|
+
}
|
932
|
+
rb_ary_store(arr, i, arr2);
|
933
|
+
}
|
934
|
+
|
935
|
+
return arr;
|
936
|
+
}
|
937
|
+
|
938
|
+
|
939
|
+
/*
|
940
|
+
* Get the settings used for the model.
|
941
|
+
*/
|
942
|
+
static VALUE wrap_get_model_settings(VALUE self) {
|
943
|
+
if (!model_loaded)
|
944
|
+
return Qnil;
|
945
|
+
|
946
|
+
VALUE arr;
|
947
|
+
|
948
|
+
arr = rb_ary_new();
|
949
|
+
rb_ary_push(arr, rb_int_new(last_model->num_topics));
|
950
|
+
rb_ary_push(arr, rb_int_new(last_model->num_terms));
|
951
|
+
rb_ary_push(arr, rb_float_new(last_model->alpha));
|
952
|
+
|
953
|
+
return arr; // [num_topics, num_terms, alpha]
|
954
|
+
}
|
955
|
+
|
956
|
+
|
957
|
+
/*
 * Extension entry point: Ruby calls this when `require`-ing the compiled
 * library. Initializes the C globals and registers the Lda module, its
 * classes, and all instance methods on Lda::Lda.
 */
void Init_lda() {
  /* Reset module state: nothing loaded yet; verbose output on by default. */
  corpus_loaded = FALSE;
  model_loaded = FALSE;
  VERBOSE = TRUE;

  /* Pull in the pure-Ruby half of the gem before defining the classes. */
  rb_require("lda-ruby");

  rb_cLdaModule = rb_define_module("Lda");
  rb_cLda = rb_define_class_under(rb_cLdaModule, "Lda", rb_cObject);
  rb_cLdaCorpus = rb_define_class_under(rb_cLdaModule, "Corpus", rb_cObject);
  rb_cLdaDocument = rb_define_class_under(rb_cLdaModule, "Document", rb_cObject);

  // method to load the corpus
  rb_define_method(rb_cLda, "fast_load_corpus_from_file", wrap_load_corpus, 1);
  rb_define_method(rb_cLda, "corpus=", wrap_ruby_corpus, 1);

  // method to run em
  rb_define_method(rb_cLda, "em", wrap_em, 1);

  // method to load settings from file
  rb_define_method(rb_cLda, "load_settings", wrap_load_settings, 1);

  // method to set all the config options at once
  rb_define_method(rb_cLda, "set_config", wrap_set_config, 5);

  // accessor stuff for main settings
  rb_define_method(rb_cLda, "max_iter", wrap_get_max_iter, 0);
  rb_define_method(rb_cLda, "max_iter=", wrap_set_max_iter, 1);
  rb_define_method(rb_cLda, "convergence", wrap_get_converged, 0);
  rb_define_method(rb_cLda, "convergence=", wrap_set_converged, 1);
  rb_define_method(rb_cLda, "em_max_iter", wrap_get_em_max_iter, 0);
  rb_define_method(rb_cLda, "em_max_iter=", wrap_set_em_max_iter, 1);
  rb_define_method(rb_cLda, "em_convergence", wrap_get_em_converged, 0);
  rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1);
  rb_define_method(rb_cLda, "init_alpha=", wrap_set_initial_alpha, 1);
  rb_define_method(rb_cLda, "init_alpha", wrap_get_initial_alpha, 0);
  rb_define_method(rb_cLda, "est_alpha=", wrap_set_estimate_alpha, 1);
  rb_define_method(rb_cLda, "est_alpha", wrap_get_estimate_alpha, 0);
  rb_define_method(rb_cLda, "num_topics", wrap_get_num_topics, 0);
  rb_define_method(rb_cLda, "num_topics=", wrap_set_num_topics, 1);
  rb_define_method(rb_cLda, "verbose", wrap_get_verbosity, 0);
  rb_define_method(rb_cLda, "verbose=", wrap_set_verbosity, 1);

  // retrieve model and gamma
  rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
  rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
  rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0);
  rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
}
|
1006
|
+
|
1007
|
+
#endif
|