lda-ruby 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/CHANGELOG +22 -0
- data/README +21 -0
- data/README.markdown +38 -0
- data/Rakefile +58 -0
- data/VERSION.yml +4 -0
- data/ext/lda-ruby/Makefile +181 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +9 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1007 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +29 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby.rb +168 -0
- data/lib/lda-ruby/corpus/corpus.rb +34 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +22 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +36 -0
- data/lib/lda-ruby/document/text_document.rb +37 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/license.txt +504 -0
- data/test/data/.gitignore +2 -0
- data/test/data/docs.dat +46 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/lda_ruby_test.rb +274 -0
- data/test/test_helper.rb +10 -0
- metadata +95 -0
@@ -0,0 +1,27 @@
#ifndef COKUS_H
#define COKUS_H

#include <stdio.h>
#include <stdlib.h>

//
// uint32 must be an unsigned integer type capable of holding at least 32
// bits; exactly 32 should be fastest, but 64 is better on an Alpha with
// GCC at -O3 optimization so try your options and see what's best for you
//
typedef unsigned long uint32;

#define N (624)                /* length of state vector */
#define M (397)                /* a period parameter */
#define K (0x9908B0DFU)        /* a magic constant */

/* bit helpers used by the Mersenne Twister recurrence */
#define hiBit(u)      ((u) & 0x80000000U)   /* mask all but highest bit of u */
#define loBit(u)      ((u) & 0x00000001U)   /* mask all but lowest bit of u */
#define loBits(u)     ((u) & 0x7FFFFFFFU)   /* mask the highest bit of u */
#define mixBits(u, v) (hiBit(u)|loBits(v))  /* move hi bit of u to hi bit of v */

void   seedMT(uint32 seed);
uint32 reloadMT(void);
uint32 randomMT(void);

#endif
@@ -0,0 +1,96 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include "lda-alpha.h"
|
21
|
+
|
22
|
+
/*
|
23
|
+
* objective function and its derivatives
|
24
|
+
*
|
25
|
+
*/
|
26
|
+
|
27
|
+
double alhood(double a, double ss, int D, int K)
|
28
|
+
{ return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); }
|
29
|
+
|
30
|
+
/*
 * First derivative of alhood with respect to `a`.
 */
double d_alhood(double a, double ss, int D, int K)
{
    double per_doc = K * digamma(K * a) - K * digamma(a);
    return D * per_doc + ss;
}
|
32
|
+
|
33
|
+
/*
 * Second derivative of alhood with respect to `a`.
 */
double d2_alhood(double a, int D, int K)
{
    double t_ka = trigamma(K * a);
    double t_a = trigamma(a);
    return D * (K * K * t_ka - K * t_a);
}
|
35
|
+
|
36
|
+
|
37
|
+
/*
|
38
|
+
* newtons method
|
39
|
+
*
|
40
|
+
*/
|
41
|
+
|
42
|
+
double opt_alpha(double ss, int D, int K)
|
43
|
+
{
|
44
|
+
double a, log_a, init_a = 100;
|
45
|
+
double f, df, d2f;
|
46
|
+
int iter = 0;
|
47
|
+
|
48
|
+
log_a = log(init_a);
|
49
|
+
do
|
50
|
+
{
|
51
|
+
iter++;
|
52
|
+
a = exp(log_a);
|
53
|
+
if (isnan(a))
|
54
|
+
{
|
55
|
+
init_a = init_a * 10;
|
56
|
+
printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
57
|
+
a = init_a;
|
58
|
+
log_a = log(a);
|
59
|
+
}
|
60
|
+
f = alhood(a, ss, D, K);
|
61
|
+
df = d_alhood(a, ss, D, K);
|
62
|
+
d2f = d2_alhood(a, D, K);
|
63
|
+
log_a = log_a - df/(d2f * a + df);
|
64
|
+
printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
65
|
+
}
|
66
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
67
|
+
return(exp(log_a));
|
68
|
+
}
|
69
|
+
|
70
|
+
double quiet_opt_alpha(double ss, int D, int K)
|
71
|
+
{
|
72
|
+
double a, log_a, init_a = 100;
|
73
|
+
double f, df, d2f;
|
74
|
+
int iter = 0;
|
75
|
+
|
76
|
+
log_a = log(init_a);
|
77
|
+
do
|
78
|
+
{
|
79
|
+
iter++;
|
80
|
+
a = exp(log_a);
|
81
|
+
if (isnan(a))
|
82
|
+
{
|
83
|
+
init_a = init_a * 10;
|
84
|
+
//printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
85
|
+
a = init_a;
|
86
|
+
log_a = log(a);
|
87
|
+
}
|
88
|
+
f = alhood(a, ss, D, K);
|
89
|
+
df = d_alhood(a, ss, D, K);
|
90
|
+
d2f = d2_alhood(a, D, K);
|
91
|
+
log_a = log_a - df/(d2f * a + df);
|
92
|
+
//printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
93
|
+
}
|
94
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
95
|
+
return(exp(log_a));
|
96
|
+
}
|
@@ -0,0 +1,21 @@
#ifndef LDA_ALPHA_H
#define LDA_ALPHA_H

#include <stdlib.h>
#include <math.h>
#include <float.h>

#include "lda.h"
#include "utils.h"

/* Newton iteration controls for alpha optimisation */
#define NEWTON_THRESH 1e-5
#define MAX_ALPHA_ITER 1000

/* objective and its first and second derivatives */
double alhood(double a, double ss, int D, int K);
double d_alhood(double a, double ss, int D, int K);
double d2_alhood(double a, int D, int K);

/* noisy / quiet Newton solvers for the optimal alpha */
double opt_alpha(double ss, int D, int K);
double quiet_opt_alpha(double ss, int D, int K);
//void maximize_alpha(double** gamma, lda_model* model, int num_docs);

#endif
|
@@ -0,0 +1,67 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include "lda-data.h"
|
21
|
+
|
22
|
+
corpus* read_data(char* data_filename)
|
23
|
+
{
|
24
|
+
FILE *fileptr;
|
25
|
+
int length, count, word, n, nd, nw;
|
26
|
+
corpus* c;
|
27
|
+
|
28
|
+
printf("reading data from %s\n", data_filename);
|
29
|
+
c = malloc(sizeof(corpus));
|
30
|
+
c->docs = 0;
|
31
|
+
c->num_terms = 0;
|
32
|
+
c->num_docs = 0;
|
33
|
+
fileptr = fopen(data_filename, "r");
|
34
|
+
nd = 0; nw = 0;
|
35
|
+
while ((fscanf(fileptr, "%10d", &length) != EOF))
|
36
|
+
{
|
37
|
+
c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1));
|
38
|
+
c->docs[nd].length = length;
|
39
|
+
c->docs[nd].total = 0;
|
40
|
+
c->docs[nd].words = malloc(sizeof(int)*length);
|
41
|
+
c->docs[nd].counts = malloc(sizeof(int)*length);
|
42
|
+
for (n = 0; n < length; n++)
|
43
|
+
{
|
44
|
+
fscanf(fileptr, "%10d:%10d", &word, &count);
|
45
|
+
word = word - OFFSET;
|
46
|
+
c->docs[nd].words[n] = word;
|
47
|
+
c->docs[nd].counts[n] = count;
|
48
|
+
c->docs[nd].total += count;
|
49
|
+
if (word >= nw) { nw = word + 1; }
|
50
|
+
}
|
51
|
+
nd++;
|
52
|
+
}
|
53
|
+
fclose(fileptr);
|
54
|
+
c->num_docs = nd;
|
55
|
+
c->num_terms = nw;
|
56
|
+
printf("number of docs : %d\n", nd);
|
57
|
+
printf("number of terms : %d\n", nw);
|
58
|
+
return(c);
|
59
|
+
}
|
60
|
+
|
61
|
+
/*
 * Returns the length (word-slot count) of the longest document in `c`,
 * or 0 for an empty corpus.
 */
int max_corpus_length(corpus* c)
{
    int d;
    int longest = 0;
    for (d = 0; d < c->num_docs; d++)
    {
        if (c->docs[d].length > longest)
        {
            longest = c->docs[d].length;
        }
    }
    return longest;
}
|
@@ -0,0 +1,1007 @@
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
2
|
+
|
3
|
+
// This file is part of LDA-C.
|
4
|
+
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
8
|
+
// option) any later version.
|
9
|
+
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
13
|
+
// for more details.
|
14
|
+
|
15
|
+
// You should have received a copy of the GNU General Public License
|
16
|
+
// along with this program; if not, write to the Free Software
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
18
|
+
// USA
|
19
|
+
|
20
|
+
#include <stdlib.h>
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <math.h>
|
23
|
+
#include <float.h>
|
24
|
+
#include <string.h>
|
25
|
+
#include <time.h>
|
26
|
+
|
27
|
+
#include "lda.h"
|
28
|
+
#include "lda-data.h"
|
29
|
+
#include "lda-inference.h"
|
30
|
+
#include "lda-model.h"
|
31
|
+
#include "utils.h"
|
32
|
+
#include "cokus.h"
|
33
|
+
|
34
|
+
#ifdef USE_RUBY
|
35
|
+
#include "ruby.h"
|
36
|
+
|
37
|
+
VALUE rb_cLdaModule;
|
38
|
+
VALUE rb_cLda;
|
39
|
+
VALUE rb_cLdaCorpus;
|
40
|
+
VALUE rb_cLdaDocument;
|
41
|
+
#endif
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
/*
|
46
|
+
* variational inference
|
47
|
+
*/
|
48
|
+
|
49
|
+
/*
 * Variational inference for a single document: coordinate ascent on the
 * per-word topic distributions phi (kept in log space during the inner
 * update) and the Dirichlet parameters var_gamma, iterated until the
 * relative change in the likelihood bound drops below VAR_CONVERGED or
 * VAR_MAX_ITER is reached (VAR_MAX_ITER == -1 means unbounded).
 * Sets *errors = 1 if the bound becomes NaN.  Returns the bound.
 */
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) {
    double converged = 1;
    double phisum = 0, likelihood = 0;
    double likelihood_old = 0, oldphi[model->num_topics];
    int k = 0, n = 0, var_iter = 0, index = 0;
    double digamma_gam[model->num_topics];

    /* zero'em out */
    memset(digamma_gam, 0, sizeof(digamma_gam));
    memset(oldphi, 0, sizeof(oldphi));

    // compute posterior dirichlet: uniform phi, gamma = alpha + total/K
    for (k = 0; k < model->num_topics; k++)
    {
        var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
        digamma_gam[k] = digamma(var_gamma[k]);
        for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;

    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
        var_iter++;
        for (n = 0; n < doc->length; n++)
        {
            phisum = 0;
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
                index = doc->words[n];
                // fix: valid columns of log_prob_w are 0..num_terms-1; the
                // original used `index > num_terms`, letting index ==
                // num_terms read one element past the row
                if (index < 0 || index >= model->num_terms) {
                    printf("phi for term: %d of %d\n", index, model->num_terms);
                    phi[n][k] = 0.0;
                }
                else {
                    phi[n][k] = digamma_gam[k] + model->log_prob_w[k][index];
                }

                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]);
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }

            for (k = 0; k < model->num_topics; k++)
            {
                phi[n][k] = exp(phi[n][k] - phisum);   /* normalise out of log space */
                var_gamma[k] = var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                // !!! a lot of extra digamma's here because of how we're computing it
                // !!! but its more automatically updated too.
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }

        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        if (isnan(likelihood)) { *errors = 1; }
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;
    }
    return(likelihood);
}
|
119
|
+
|
120
|
+
|
121
|
+
/*
|
122
|
+
* compute likelihood bound
|
123
|
+
*/
|
124
|
+
|
125
|
+
/*
 * Evaluates the variational likelihood bound for one document given the
 * current phi and gamma.  Words whose phi entry is zero (flagged as
 * out-of-range during inference) are skipped.
 */
double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) {
    int k, n, w;
    double dig[model->num_topics];
    double gamma_total = 0;
    double digsum;
    double bound;

    memset(dig, 0, sizeof(dig));

    for (k = 0; k < model->num_topics; k++) {
        dig[k] = digamma(var_gamma[k]);
        gamma_total += var_gamma[k];
    }
    digsum = digamma(gamma_total);

    bound = lgamma(model->alpha * model->num_topics)
          - model->num_topics * lgamma(model->alpha)
          - lgamma(gamma_total);

    for (k = 0; k < model->num_topics; k++) {
        bound += (model->alpha - 1)*(dig[k] - digsum)
               + lgamma(var_gamma[k])
               - (var_gamma[k] - 1)*(dig[k] - digsum);

        for (n = 0; n < doc->length; n++) {
            if (phi[n][k] > 0) {
                w = doc->words[n];
                bound += doc->counts[n] *
                    (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
                                + model->log_prob_w[k][w]));
            }
        }
    }
    return bound;
}
|
159
|
+
|
160
|
+
|
161
|
+
/*
 * E-step for one document: runs variational inference, then folds the
 * resulting gamma and phi into the sufficient statistics `ss`.
 * Returns the document's likelihood bound (0.0 on inference error).
 */
double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) {
    int n, k;
    short error = 0;
    double likelihood;

    // posterior inference for this document
    likelihood = lda_inference(doc, model, gamma, phi, &error);
    if (error) { likelihood = 0.0; }

    // fold this document's gamma into the alpha sufficient statistic
    double gamma_sum = 0;
    for (k = 0; k < model->num_topics; k++) {
        gamma_sum += gamma[k];
        ss->alpha_suffstats += digamma(gamma[k]);
    }
    ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);

    // accumulate expected word-topic counts
    for (n = 0; n < doc->length; n++) {
        for (k = 0; k < model->num_topics; k++) {
            double weight = doc->counts[n] * phi[n][k];
            ss->class_word[k][doc->words[n]] += weight;
            ss->class_total[k] += weight;
        }
    }

    ss->num_docs += 1;

    return likelihood;
}
|
195
|
+
|
196
|
+
|
197
|
+
/*
|
198
|
+
* writes the word assignments line for a document to a file
|
199
|
+
*/
|
200
|
+
|
201
|
+
/*
 * Writes one line for a document: its length followed by word:topic
 * pairs, where the topic is the argmax of that word's phi row.
 */
void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model) {
    int i;

    fprintf(f, "%03d", doc->length);
    for (i = 0; i < doc->length; i++) {
        int best_topic = argmax(phi[i], model->num_topics);
        fprintf(f, " %04d:%02d", doc->words[i], best_topic);
    }
    fprintf(f, "\n");
    fflush(f);
}
|
211
|
+
|
212
|
+
|
213
|
+
/*
|
214
|
+
* saves the gamma parameters of the current dataset
|
215
|
+
*/
|
216
|
+
|
217
|
+
/*
 * Writes the variational gamma matrix (num_docs x num_topics) to
 * `filename`, one whitespace-separated row per document.  Logs to
 * stderr and returns without writing if the file cannot be opened.
 */
void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) {
    FILE* fileptr;
    int d, k;

    fileptr = fopen(filename, "w");
    if (fileptr == NULL) {
        // fix: the original wrote through a NULL stream when the open failed
        fprintf(stderr, "save_gamma: cannot open %s for writing\n", filename);
        return;
    }
    for (d = 0; d < num_docs; d++) {
        fprintf(fileptr, "%5.10f", gamma[d][0]);
        for (k = 1; k < num_topics; k++) {
            fprintf(fileptr, " %5.10f", gamma[d][k]);
        }
        fprintf(fileptr, "\n");
    }
    fclose(fileptr);
}
|
231
|
+
|
232
|
+
|
233
|
+
/*
 * Runs variational EM on `corpus`, initialising the model according to
 * `start` ("seeded", "random", or a path to a saved model) and writing
 * checkpoints, likelihoods, and final word assignments under
 * `directory`.
 */
void run_em(char* start, char* directory, corpus* corpus) {
    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;

    // allocate variational parameters
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
        var_gamma[d] = malloc(sizeof(double) * NTOPICS);

    int max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
        phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model
    char filename[100];

    lda_suffstats* ss = NULL;
    if (strcmp(start, "seeded")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    } else if (strcmp(start, "random")==0) {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
        model->alpha = INITIAL_ALPHA;
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // fix: bounded formatting (directory comes from the command line)
    snprintf(filename, sizeof(filename), "%s/000", directory);
    save_lda_model(model, filename);

    // run expectation maximization
    int i = 0;
    double likelihood = 0, likelihood_old = 0, converged = 1;
    snprintf(filename, sizeof(filename), "%s/likelihood.dat", directory);
    FILE* likelihood_file = fopen(filename, "w");
    if (likelihood_file == NULL) {
        // fix: the original wrote through a NULL stream when the open failed
        fprintf(stderr, "run_em: cannot open %s\n", filename);
        return;
    }

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE)
            printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        printf("e-step\n");
        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }
        printf("m-step\n");

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; widen inference effort if the bound regressed
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood
        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0)
        {
            snprintf(filename, sizeof(filename), "%s/%03d", directory, i);
            save_lda_model(model, filename);
            snprintf(filename, sizeof(filename), "%s/%03d.gamma", directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }

    // output the final model
    snprintf(filename, sizeof(filename), "%s/final", directory);
    save_lda_model(model, filename);
    snprintf(filename, sizeof(filename), "%s/final.gamma", directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // output the word assignments (for visualization)
    snprintf(filename, sizeof(filename), "%s/word-assignments.dat", directory);
    FILE* w_asgn_file = fopen(filename, "w");
    short error = 0;
    double tl = 0.0;
    if (w_asgn_file != NULL) {   // fix: skip the report rather than crash on open failure
        for (d = 0; d < corpus->num_docs; d++)
        {
            if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
            error = 0;
            tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi, &error);
            if (error) { continue; }
            likelihood += tl;
            write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
        }
        fclose(w_asgn_file);
    } else {
        fprintf(stderr, "run_em: cannot open %s\n", filename);
    }
    fclose(likelihood_file);

    // fix: the original leaked both variational parameter arrays
    for (d = 0; d < corpus->num_docs; d++) free(var_gamma[d]);
    free(var_gamma);
    for (n = 0; n < max_length; n++) free(phi[n]);
    free(phi);
}
|
357
|
+
|
358
|
+
|
359
|
+
/*
|
360
|
+
* read settings.
|
361
|
+
*/
|
362
|
+
|
363
|
+
void read_settings(char* filename) {
|
364
|
+
FILE* fileptr;
|
365
|
+
char alpha_action[100];
|
366
|
+
fileptr = fopen(filename, "r");
|
367
|
+
fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
|
368
|
+
fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
|
369
|
+
fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
|
370
|
+
fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
|
371
|
+
fscanf(fileptr, "alpha %s", alpha_action);
|
372
|
+
if (strcmp(alpha_action, "fixed")==0)
|
373
|
+
{
|
374
|
+
ESTIMATE_ALPHA = 0;
|
375
|
+
}
|
376
|
+
else
|
377
|
+
{
|
378
|
+
ESTIMATE_ALPHA = 1;
|
379
|
+
}
|
380
|
+
fclose(fileptr);
|
381
|
+
}
|
382
|
+
|
383
|
+
|
384
|
+
|
385
|
+
|
386
|
+
/*
|
387
|
+
* inference only
|
388
|
+
*
|
389
|
+
*/
|
390
|
+
|
391
|
+
/*
 * Inference-only mode: loads a trained model from `model_root`, runs
 * variational inference on every document of `corpus`, and writes the
 * per-document likelihoods plus the final gamma matrix under the
 * `save` prefix.
 */
void infer(char* model_root, char* save, corpus* corpus) {
    FILE* fileptr;
    char filename[100];
    int i, d, n;
    lda_model *model;
    double **var_gamma, likelihood, **phi;
    document* doc;

    model = load_lda_model(model_root);
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (i = 0; i < corpus->num_docs; i++)
        var_gamma[i] = malloc(sizeof(double)*model->num_topics);
    snprintf(filename, sizeof(filename), "%s-lda-lhood.dat", save);
    fileptr = fopen(filename, "w");
    if (fileptr == NULL) {
        // fix: the original wrote through a NULL stream when the open failed
        fprintf(stderr, "infer: cannot open %s\n", filename);
        return;
    }
    for (d = 0; d < corpus->num_docs; d++) {
        if (((d % 100) == 0) && (d>0) && VERBOSE) printf("document %d\n",d);

        doc = &(corpus->docs[d]);
        phi = (double**) malloc(sizeof(double*) * doc->length);
        for (n = 0; n < doc->length; n++)
            phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
        short error = 0;
        likelihood = lda_inference(doc, model, var_gamma[d], phi, &error);

        fprintf(fileptr, "%5.5f\n", likelihood);

        // fix: the original leaked phi for every document
        for (n = 0; n < doc->length; n++) free(phi[n]);
        free(phi);
    }
    fclose(fileptr);
    snprintf(filename, sizeof(filename), "%s-gamma.dat", save);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

    // fix: release the gamma matrix (leaked in the original)
    for (i = 0; i < corpus->num_docs; i++) free(var_gamma[i]);
    free(var_gamma);
}
|
421
|
+
|
422
|
+
|
423
|
+
/*
|
424
|
+
* update sufficient statistics
|
425
|
+
*
|
426
|
+
*/
|
427
|
+
|
428
|
+
|
429
|
+
|
430
|
+
/*
|
431
|
+
* main
|
432
|
+
*
|
433
|
+
*/
|
434
|
+
|
435
|
+
int main(int argc, char* argv[]) {
|
436
|
+
corpus* corpus;
|
437
|
+
|
438
|
+
long t1;
|
439
|
+
(void) time(&t1);
|
440
|
+
seedMT(t1);
|
441
|
+
// seedMT(4357U);
|
442
|
+
|
443
|
+
if (argc > 1)
|
444
|
+
{
|
445
|
+
if (strcmp(argv[1], "est")==0)
|
446
|
+
{
|
447
|
+
INITIAL_ALPHA = atof(argv[2]);
|
448
|
+
NTOPICS = atoi(argv[3]);
|
449
|
+
read_settings(argv[4]);
|
450
|
+
corpus = read_data(argv[5]);
|
451
|
+
make_directory(argv[7]);
|
452
|
+
run_em(argv[6], argv[7], corpus);
|
453
|
+
}
|
454
|
+
if (strcmp(argv[1], "inf")==0)
|
455
|
+
{
|
456
|
+
read_settings(argv[2]);
|
457
|
+
corpus = read_data(argv[4]);
|
458
|
+
infer(argv[3], argv[5], corpus);
|
459
|
+
}
|
460
|
+
}
|
461
|
+
else
|
462
|
+
{
|
463
|
+
printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/*] [directory]\n");
|
464
|
+
printf(" lda inf [settings] [model] [data] [name]\n");
|
465
|
+
}
|
466
|
+
return(0);
|
467
|
+
}
|
468
|
+
|
469
|
+
#ifdef USE_RUBY
|
470
|
+
|
471
|
+
/* */
|
472
|
+
/*
 * EM training entry point used by the Ruby extension: same algorithm as
 * run_em but with no filesystem output; the trained model and the final
 * variational parameters are left in the last_model / last_gamma /
 * last_phi globals.  `start` selects initialisation: "seeded", "fixed",
 * "random", or a path to a saved model.
 */
void run_quiet_em(char* start, corpus* corpus) {
    int d = 0, n = 0;
    lda_model *model = NULL;
    double **var_gamma = NULL, **phi = NULL;
    // last_gamma is a double[num_docs][num_topics]

    // allocate variational parameters
    var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
    // fix: the original memset covered only num_docs BYTES, leaving most
    // of the pointer array uninitialised
    memset(var_gamma, 0, sizeof(double*) * corpus->num_docs);

    for (d = 0; d < corpus->num_docs; ++d) {
        var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(var_gamma[d], 0, sizeof(double)*NTOPICS);
    }

    int max_length = max_corpus_length(corpus);

    phi = (double**)malloc(sizeof(double*)*max_length);
    // fix: same undersized memset as above (max_length bytes vs pointers)
    memset(phi, 0, sizeof(double*) * max_length);
    for (n = 0; n < max_length; ++n) {
        phi[n] = (double*)malloc(sizeof(double) * NTOPICS);
        memset(phi[n], 0, sizeof(double)*NTOPICS);
    }

    // initialize model
    lda_suffstats* ss = NULL;
    if (strncmp(start, "seeded",6)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        if (VERBOSE) {
            corpus_initialize_ss(ss, model, corpus);
        } else {
            quiet_corpus_initialize_ss(ss, model, corpus);
        }
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "fixed",5)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        corpus_initialize_fixed_ss(ss, model, corpus);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else if (strncmp(start, "random",6)==0) {
        model = quiet_new_lda_model(corpus->num_terms, NTOPICS);
        model->alpha = INITIAL_ALPHA;
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        if (VERBOSE) {
            lda_mle(model, ss, 0);
        } else {
            quiet_lda_mle(model, ss, 0);
        }
    } else {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    // save the model in the last_model global
    last_model = model;
    model_loaded = TRUE;

    // run expectation maximization
    int i = 0;
    double likelihood = 0.0, likelihood_old = 0, converged = 1;

    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
        i++;
        if (VERBOSE) printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model);

        // e-step
        for (d = 0; d < corpus->num_docs; d++) {
            if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
            likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
        }

        // m-step
        if (VERBOSE) {
            lda_mle(model, ss, ESTIMATE_ALPHA);
        } else {
            quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
        }

        // check for convergence; widen inference effort if the bound regressed
        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // store model and likelihood
        last_model = model;
        last_gamma = var_gamma;
        last_phi = phi;
    }

    // expose the final state through the globals
    last_model = model;
    last_gamma = var_gamma;
    last_phi = phi;

    free_lda_suffstats(model,ss);
}
|
604
|
+
|
605
|
+
|
606
|
+
/*
|
607
|
+
* Set all of the settings in one command:
|
608
|
+
*
|
609
|
+
* * init_alpha
|
610
|
+
* * num_topics
|
611
|
+
* * max_iter
|
612
|
+
* * convergence
|
613
|
+
* * em_max_iter
|
614
|
+
* * em_convergence
|
615
|
+
* * est_alpha
|
616
|
+
*/
|
617
|
+
/*
 * Set all of the settings in one command:
 *
 * * init_alpha
 * * num_topics
 * * max_iter
 * * convergence
 * * em_max_iter
 * * em_convergence
 * * est_alpha
 */
static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) {
    INITIAL_ALPHA = NUM2DBL(init_alpha);
    NTOPICS = NUM2INT(num_topics);
    // fix: the original checked `NTOPICS < 0`, letting a zero topic count
    // through even though the message (and the later division by
    // num_topics during inference) requires a positive count
    if( NTOPICS <= 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); }
    VAR_MAX_ITER = NUM2INT(max_iter);
    VAR_CONVERGED = (float)NUM2DBL(convergence);
    EM_MAX_ITER = NUM2INT(em_max_iter);
    EM_CONVERGED = (float)NUM2DBL(em_convergence);
    ESTIMATE_ALPHA = NUM2INT(est_alpha);

    return Qtrue;
}
|
629
|
+
|
630
|
+
/*
|
631
|
+
* Get the maximum iterations.
|
632
|
+
*/
|
633
|
+
/*
 * Get the maximum number of variational-inference iterations.
 */
static VALUE wrap_get_max_iter(VALUE self) {
    int iters = VAR_MAX_ITER;
    return rb_int_new(iters);
}
|
636
|
+
|
637
|
+
/*
|
638
|
+
* Set the maximum iterations.
|
639
|
+
*/
|
640
|
+
/*
 * Set the maximum number of variational-inference iterations.
 */
static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) {
    int iters = NUM2INT(max_iter);   /* coerce before storing */
    VAR_MAX_ITER = iters;
    return max_iter;
}
|
645
|
+
|
646
|
+
/*
|
647
|
+
* Get the convergence setting.
|
648
|
+
*/
|
649
|
+
/*
 * Get the variational-inference convergence threshold.
 */
static VALUE wrap_get_converged(VALUE self) {
    double thresh = VAR_CONVERGED;
    return rb_float_new(thresh);
}
|
652
|
+
|
653
|
+
/*
|
654
|
+
* Set the convergence setting.
|
655
|
+
*/
|
656
|
+
/*
 * Set the variational-inference convergence threshold.
 */
static VALUE wrap_set_converged(VALUE self, VALUE converged) {
    float thresh = (float)NUM2DBL(converged);
    VAR_CONVERGED = thresh;
    return converged;
}
|
661
|
+
|
662
|
+
/*
|
663
|
+
* Get the max iterations for the EM algorithm.
|
664
|
+
*/
|
665
|
+
/*
 * Get the maximum number of EM iterations.
 */
static VALUE wrap_get_em_max_iter(VALUE self) {
    int iters = EM_MAX_ITER;
    return rb_int_new(iters);
}
|
668
|
+
|
669
|
+
/*
|
670
|
+
* Set the max iterations for the EM algorithm.
|
671
|
+
*/
|
672
|
+
/*
 * Set the maximum number of EM iterations.
 */
static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) {
    int iters = NUM2INT(em_max_iter);   /* coerce before storing */
    EM_MAX_ITER = iters;
    return em_max_iter;
}
|
677
|
+
|
678
|
+
/*
|
679
|
+
* Get the convergence value for EM.
|
680
|
+
*/
|
681
|
+
/*
 * Get the EM convergence threshold.
 */
static VALUE wrap_get_em_converged(VALUE self) {
    double thresh = EM_CONVERGED;
    return rb_float_new(thresh);
}
|
684
|
+
|
685
|
+
/*
|
686
|
+
* Set the convergence value for EM.
|
687
|
+
*/
|
688
|
+
/*
 * Set the EM convergence threshold.
 */
static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) {
    float thresh = (float)NUM2DBL(em_converged);
    EM_CONVERGED = thresh;
    return em_converged;
}
|
693
|
+
|
694
|
+
/*
|
695
|
+
* Get the initial alpha value.
|
696
|
+
*/
|
697
|
+
static VALUE wrap_get_initial_alpha(VALUE self) {
|
698
|
+
return rb_float_new(INITIAL_ALPHA);
|
699
|
+
}
|
700
|
+
|
701
|
+
/*
|
702
|
+
* Get the number of topics being clustered.
|
703
|
+
*/
|
704
|
+
static VALUE wrap_get_num_topics(VALUE self) {
|
705
|
+
return rb_int_new(NTOPICS);
|
706
|
+
}
|
707
|
+
|
708
|
+
/*
|
709
|
+
* Set the initial value of alpha.
|
710
|
+
*/
|
711
|
+
static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) {
|
712
|
+
INITIAL_ALPHA = (float)NUM2DBL(initial_alpha);
|
713
|
+
|
714
|
+
return initial_alpha;
|
715
|
+
}
|
716
|
+
|
717
|
+
/*
|
718
|
+
* Set the number of topics to be clustered.
|
719
|
+
*/
|
720
|
+
static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) {
|
721
|
+
NTOPICS = NUM2INT(ntopics);
|
722
|
+
|
723
|
+
return ntopics;
|
724
|
+
}
|
725
|
+
|
726
|
+
/*
|
727
|
+
* Get the estimate alpha value (fixed = 0).
|
728
|
+
*/
|
729
|
+
static VALUE wrap_get_estimate_alpha(VALUE self) {
|
730
|
+
return rb_int_new(ESTIMATE_ALPHA);
|
731
|
+
}
|
732
|
+
|
733
|
+
/*
|
734
|
+
* Set the estimate alpha value (fixed = 0).
|
735
|
+
*/
|
736
|
+
static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
|
737
|
+
ESTIMATE_ALPHA = NUM2INT(est_alpha);
|
738
|
+
|
739
|
+
return est_alpha;
|
740
|
+
}
|
741
|
+
|
742
|
+
/*
|
743
|
+
* Get the verbosity setting.
|
744
|
+
*/
|
745
|
+
static VALUE wrap_get_verbosity(VALUE self) {
|
746
|
+
if (VERBOSE) {
|
747
|
+
return Qtrue;
|
748
|
+
} else {
|
749
|
+
return Qfalse;
|
750
|
+
}
|
751
|
+
}
|
752
|
+
|
753
|
+
|
754
|
+
/*
|
755
|
+
* Set the verbosity level (true, false).
|
756
|
+
*/
|
757
|
+
static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
|
758
|
+
if (verbosity == Qtrue) {
|
759
|
+
VERBOSE = TRUE;
|
760
|
+
} else {
|
761
|
+
VERBOSE = FALSE;
|
762
|
+
}
|
763
|
+
|
764
|
+
return verbosity;
|
765
|
+
}
|
766
|
+
|
767
|
+
|
768
|
+
|
769
|
+
/*
|
770
|
+
* Run the EM algorithm with the loaded corpus and using the current
|
771
|
+
* configuration settings. The +start+ parameter can take the following
|
772
|
+
* values:
|
773
|
+
* * random - starting alpha are randomized
|
774
|
+
* * seeded - loaded based on the corpus values
|
775
|
+
* * <filename> - path to the file containing the model
|
776
|
+
*/
|
777
|
+
static VALUE wrap_em(VALUE self, VALUE start) {
|
778
|
+
if (!corpus_loaded)
|
779
|
+
return Qnil;
|
780
|
+
|
781
|
+
run_quiet_em(STR2CSTR(start), last_corpus);
|
782
|
+
|
783
|
+
return Qnil;
|
784
|
+
}
|
785
|
+
|
786
|
+
|
787
|
+
/*
|
788
|
+
* Load settings from the given file.
|
789
|
+
*/
|
790
|
+
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
|
791
|
+
read_settings(STR2CSTR(settings_file));
|
792
|
+
|
793
|
+
return Qtrue;
|
794
|
+
}
|
795
|
+
|
796
|
+
/*
|
797
|
+
* Load the corpus from the given file. This will not create
|
798
|
+
* a +Corpus+ object that is accessible, but it will load the corpus
|
799
|
+
* much faster.
|
800
|
+
*/
|
801
|
+
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
|
802
|
+
if (!corpus_loaded) {
|
803
|
+
last_corpus = read_data(STR2CSTR(filename));
|
804
|
+
corpus_loaded = TRUE;
|
805
|
+
return Qtrue;
|
806
|
+
} else {
|
807
|
+
return Qtrue;
|
808
|
+
}
|
809
|
+
}
|
810
|
+
|
811
|
+
/*
|
812
|
+
* Set the corpus.
|
813
|
+
*/
|
814
|
+
static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) {
|
815
|
+
corpus* c;
|
816
|
+
int i = 0;
|
817
|
+
int j = 0;
|
818
|
+
|
819
|
+
c = malloc(sizeof(corpus));
|
820
|
+
c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms"));
|
821
|
+
c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs"));
|
822
|
+
c->docs = (document*) malloc(sizeof(document) * c->num_docs);
|
823
|
+
VALUE doc_ary = rb_iv_get(rcorpus, "@documents");
|
824
|
+
for (i = 0; i < c->num_docs; i++) {
|
825
|
+
VALUE one_doc = rb_ary_entry(doc_ary, i);
|
826
|
+
VALUE words = rb_iv_get(one_doc, "@words");
|
827
|
+
VALUE counts = rb_iv_get(one_doc, "@counts");
|
828
|
+
|
829
|
+
c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length"));
|
830
|
+
c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total"));
|
831
|
+
c->docs[i].words = malloc(sizeof(int) * c->docs[i].length);
|
832
|
+
c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length);
|
833
|
+
for (j = 0; j < c->docs[i].length; j++) {
|
834
|
+
int one_word = NUM2INT(rb_ary_entry(words, j));
|
835
|
+
int one_count = NUM2INT(rb_ary_entry(counts, j));
|
836
|
+
if( one_word > c->num_terms ) {
|
837
|
+
rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word);
|
838
|
+
}
|
839
|
+
c->docs[i].words[j] = one_word;
|
840
|
+
c->docs[i].counts[j] = one_count;
|
841
|
+
}
|
842
|
+
}
|
843
|
+
|
844
|
+
last_corpus = c;
|
845
|
+
corpus_loaded = TRUE;
|
846
|
+
|
847
|
+
rb_iv_set(self, "@corpus", rcorpus);
|
848
|
+
|
849
|
+
return Qtrue;
|
850
|
+
}
|
851
|
+
|
852
|
+
|
853
|
+
/*
|
854
|
+
* Get the gamma values after the model has been run.
|
855
|
+
*/
|
856
|
+
static VALUE wrap_get_gamma(VALUE self) {
|
857
|
+
if (!model_loaded)
|
858
|
+
return Qnil;
|
859
|
+
|
860
|
+
// last_gamma is a double[num_docs][num_topics]
|
861
|
+
VALUE arr;
|
862
|
+
int i = 0, j = 0;
|
863
|
+
|
864
|
+
arr = rb_ary_new2(last_corpus->num_docs);
|
865
|
+
for (i = 0; i < last_corpus->num_docs; i++) {
|
866
|
+
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
867
|
+
for (j = 0; j < last_model->num_topics; j++) {
|
868
|
+
rb_ary_store(arr2, j, rb_float_new(last_gamma[i][j]));
|
869
|
+
}
|
870
|
+
rb_ary_store(arr, i, arr2);
|
871
|
+
}
|
872
|
+
|
873
|
+
return arr;
|
874
|
+
}
|
875
|
+
|
876
|
+
|
877
|
+
/*
|
878
|
+
* Compute the phi values by running inference after the initial EM run has been completed.
|
879
|
+
*
|
880
|
+
* Returns a 3D matrix: <tt>num_docs x length x num_topics</tt>.
|
881
|
+
*/
|
882
|
+
static VALUE wrap_get_phi(VALUE self) {
|
883
|
+
if (!model_loaded)
|
884
|
+
return Qnil;
|
885
|
+
|
886
|
+
VALUE arr = rb_ary_new2(last_corpus->num_docs);
|
887
|
+
int i = 0, j = 0, k = 0;
|
888
|
+
|
889
|
+
//int max_length = max_corpus_length(last_corpus);
|
890
|
+
short error = 0;
|
891
|
+
|
892
|
+
for (i = 0; i < last_corpus->num_docs; i++) {
|
893
|
+
VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length);
|
894
|
+
|
895
|
+
lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error);
|
896
|
+
|
897
|
+
for (j = 0; j < last_corpus->docs[i].length; j++) {
|
898
|
+
VALUE arr2 = rb_ary_new2(last_model->num_topics);
|
899
|
+
|
900
|
+
for (k = 0; k < last_model->num_topics; k++) {
|
901
|
+
rb_ary_store(arr2, k, rb_float_new(last_phi[j][k]));
|
902
|
+
}
|
903
|
+
|
904
|
+
rb_ary_store(arr1, j, arr2);
|
905
|
+
}
|
906
|
+
|
907
|
+
rb_ary_store(arr, i, arr1);
|
908
|
+
}
|
909
|
+
|
910
|
+
return arr;
|
911
|
+
}
|
912
|
+
|
913
|
+
|
914
|
+
|
915
|
+
/*
|
916
|
+
* Get the beta matrix after the model has been run.
|
917
|
+
*/
|
918
|
+
static VALUE wrap_get_model_beta(VALUE self) {
|
919
|
+
if (!model_loaded)
|
920
|
+
return Qnil;
|
921
|
+
|
922
|
+
// beta is a double[num_topics][num_terms]
|
923
|
+
VALUE arr;
|
924
|
+
int i = 0, j = 0;
|
925
|
+
|
926
|
+
arr = rb_ary_new2(last_model->num_topics);
|
927
|
+
for (i = 0; i < last_model->num_topics; i++) {
|
928
|
+
VALUE arr2 = rb_ary_new2(last_model->num_terms);
|
929
|
+
for (j = 0; j < last_model->num_terms; j++) {
|
930
|
+
rb_ary_store(arr2, j, rb_float_new(last_model->log_prob_w[i][j]));
|
931
|
+
}
|
932
|
+
rb_ary_store(arr, i, arr2);
|
933
|
+
}
|
934
|
+
|
935
|
+
return arr;
|
936
|
+
}
|
937
|
+
|
938
|
+
|
939
|
+
/*
|
940
|
+
* Get the settings used for the model.
|
941
|
+
*/
|
942
|
+
static VALUE wrap_get_model_settings(VALUE self) {
|
943
|
+
if (!model_loaded)
|
944
|
+
return Qnil;
|
945
|
+
|
946
|
+
VALUE arr;
|
947
|
+
|
948
|
+
arr = rb_ary_new();
|
949
|
+
rb_ary_push(arr, rb_int_new(last_model->num_topics));
|
950
|
+
rb_ary_push(arr, rb_int_new(last_model->num_terms));
|
951
|
+
rb_ary_push(arr, rb_float_new(last_model->alpha));
|
952
|
+
|
953
|
+
return arr; // [num_topics, num_terms, alpha]
|
954
|
+
}
|
955
|
+
|
956
|
+
|
957
|
+
/*
 * Extension entry point, invoked by the Ruby runtime when the compiled
 * library is required. Resets the module-level state flags, loads the
 * pure-Ruby half of the gem, and registers the Lda module, its classes,
 * and every wrapper method defined above on Lda::Lda.
 */
void Init_lda() {
  // fresh state: nothing loaded yet, verbose output on by default
  corpus_loaded = FALSE;
  model_loaded = FALSE;
  VERBOSE = TRUE;

  // pull in the Ruby-side classes (Corpus, Document, ...) first
  rb_require("lda-ruby");

  rb_cLdaModule = rb_define_module("Lda");
  rb_cLda = rb_define_class_under(rb_cLdaModule, "Lda", rb_cObject);
  rb_cLdaCorpus = rb_define_class_under(rb_cLdaModule, "Corpus", rb_cObject);
  rb_cLdaDocument = rb_define_class_under(rb_cLdaModule, "Document", rb_cObject);

  // method to load the corpus
  rb_define_method(rb_cLda, "fast_load_corpus_from_file", wrap_load_corpus, 1);
  rb_define_method(rb_cLda, "corpus=", wrap_ruby_corpus, 1);

  // method to run em
  rb_define_method(rb_cLda, "em", wrap_em, 1);

  // method to load settings from file
  rb_define_method(rb_cLda, "load_settings", wrap_load_settings, 1);

  // method to set all the config options at once
  rb_define_method(rb_cLda, "set_config", wrap_set_config, 5);

  // accessor stuff for main settings
  rb_define_method(rb_cLda, "max_iter", wrap_get_max_iter, 0);
  rb_define_method(rb_cLda, "max_iter=", wrap_set_max_iter, 1);
  rb_define_method(rb_cLda, "convergence", wrap_get_converged, 0);
  rb_define_method(rb_cLda, "convergence=", wrap_set_converged, 1);
  rb_define_method(rb_cLda, "em_max_iter", wrap_get_em_max_iter, 0);
  rb_define_method(rb_cLda, "em_max_iter=", wrap_set_em_max_iter, 1);
  rb_define_method(rb_cLda, "em_convergence", wrap_get_em_converged, 0);
  rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1);
  rb_define_method(rb_cLda, "init_alpha=", wrap_set_initial_alpha, 1);
  rb_define_method(rb_cLda, "init_alpha", wrap_get_initial_alpha, 0);
  rb_define_method(rb_cLda, "est_alpha=", wrap_set_estimate_alpha, 1);
  rb_define_method(rb_cLda, "est_alpha", wrap_get_estimate_alpha, 0);
  rb_define_method(rb_cLda, "num_topics", wrap_get_num_topics, 0);
  rb_define_method(rb_cLda, "num_topics=", wrap_set_num_topics, 1);
  rb_define_method(rb_cLda, "verbose", wrap_get_verbosity, 0);
  rb_define_method(rb_cLda, "verbose=", wrap_set_verbosity, 1);

  // retrieve model and gamma
  rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
  rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0);
  rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0);
  rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0);
}
|
1006
|
+
|
1007
|
+
#endif
|