ealdent-lda-ruby 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{cokus.c → lib/cokus.c} +0 -0
- data/{cokus.h → lib/cokus.h} +0 -0
- data/{extconf.rb → lib/extconf.rb} +0 -0
- data/{lda-alpha.c → lib/lda-alpha.c} +28 -0
- data/{lda-alpha.h → lib/lda-alpha.h} +2 -1
- data/{lda-data.c → lib/lda-data.c} +0 -0
- data/{lda-data.h → lib/lda-data.h} +0 -0
- data/{lda-inference.c → lib/lda-inference.c} +70 -13
- data/{lda-inference.h → lib/lda-inference.h} +8 -2
- data/{lda-model.c → lib/lda-model.c} +30 -0
- data/{lda-model.h → lib/lda-model.h} +1 -1
- data/{lda.h → lib/lda.h} +0 -0
- data/{lda.rb → lib/lda.rb} +0 -0
- data/{utils.c → lib/utils.c} +0 -0
- data/{utils.h → lib/utils.h} +0 -0
- metadata +19 -19

data/{cokus.c → lib/cokus.c}
RENAMED
File without changes

data/{cokus.h → lib/cokus.h}
RENAMED
File without changes

data/{extconf.rb → lib/extconf.rb}
RENAMED
File without changes

data/{lda-alpha.c → lib/lda-alpha.c}
RENAMED

@@ -66,3 +66,31 @@ double opt_alpha(double ss, int D, int K)
     while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
     return(exp(log_a));
 }
+
+double quiet_opt_alpha(double ss, int D, int K)
+{
+    double a, log_a, init_a = 100;
+    double f, df, d2f;
+    int iter = 0;
+
+    log_a = log(init_a);
+    do
+    {
+        iter++;
+        a = exp(log_a);
+        if (isnan(a))
+        {
+            init_a = init_a * 10;
+            //printf("warning : alpha is nan; new init = %5.5f\n", init_a);
+            a = init_a;
+            log_a = log(a);
+        }
+        f = alhood(a, ss, D, K);
+        df = d_alhood(a, ss, D, K);
+        d2f = d2_alhood(a, D, K);
+        log_a = log_a - df/(d2f * a + df);
+        //printf("alpha maximization : %5.5f %5.5f\n", f, df);
+    }
+    while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
+    return(exp(log_a));
+}
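
The Newton update in quiet_opt_alpha (identical to the existing opt_alpha apart from the commented-out printf calls) is easier to follow with the change of variables it implements; the derivation below is standard and is implied by, not stated in, the diff. Writing $\lambda = \log a$, and with `df` $= f'(a)$ and `d2f` $= f''(a)$ for the alpha objective $f$:

$$\frac{df}{d\lambda} = a\,f'(a), \qquad \frac{d^2 f}{d\lambda^2} = a^2 f''(a) + a\,f'(a),$$

so one Newton step on $\lambda$ is

$$\lambda \leftarrow \lambda - \frac{a\,f'(a)}{a^2 f''(a) + a\,f'(a)} = \lambda - \frac{f'(a)}{a\,f''(a) + f'(a)},$$

which is exactly the line log_a = log_a - df/(d2f * a + df); iterating in log space also keeps the alpha estimate positive.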

data/{lda-alpha.h → lib/lda-alpha.h}
RENAMED

@@ -15,6 +15,7 @@ double alhood(double a, double ss, int D, int K);
 double d_alhood(double a, double ss, int D, int K);
 double d2_alhood(double a, int D, int K);
 double opt_alpha(double ss, int D, int K);
-void maximize_alpha(double** gamma, lda_model* model, int num_docs);
+double quiet_opt_alpha(double ss, int D, int K);
+//void maximize_alpha(double** gamma, lda_model* model, int num_docs);
 
 #endif

data/{lda-data.c → lib/lda-data.c}
RENAMED
File without changes

data/{lda-data.h → lib/lda-data.h}
RENAMED
File without changes

data/{lda-inference.c → lib/lda-inference.c}
RENAMED

@@ -32,7 +32,6 @@
 #include "lda-data.h"
 #include "lda-inference.h"
 #include "lda-model.h"
-#include "lda-alpha.h"
 #include "utils.h"
 #include "cokus.h"
 
@@ -45,6 +44,8 @@ VALUE rb_cLdaCorpus;
 VALUE rb_cLdaDocument;
 #endif
 
+
+
 /*
  * variational inference
  */
@@ -238,13 +239,22 @@ void run_em(char* start, char* directory, corpus* corpus) {
     model = new_lda_model(corpus->num_terms, NTOPICS);
     ss = new_lda_suffstats(model);
     corpus_initialize_ss(ss, model, corpus);
-    lda_mle(model, ss, 0);
+    if (VERBOSE) {
+      lda_mle(model, ss, 0);
+    } else {
+      quiet_lda_mle(model, ss, 0);
+    }
+
     model->alpha = INITIAL_ALPHA;
   } else if (strcmp(start, "random")==0) {
     model = new_lda_model(corpus->num_terms, NTOPICS);
     ss = new_lda_suffstats(model);
     random_initialize_ss(ss, model);
-    lda_mle(model, ss, 0);
+    if (VERBOSE) {
+      lda_mle(model, ss, 0);
+    } else {
+      quiet_lda_mle(model, ss, 0);
+    }
     model->alpha = INITIAL_ALPHA;
   } else {
     model = load_lda_model(start);
@@ -263,20 +273,26 @@ void run_em(char* start, char* directory, corpus* corpus) {
 
   while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
     i++;
-    printf("**** em iteration %d ****\n", i);
+    if (VERBOSE)
+      printf("**** em iteration %d ****\n", i);
     likelihood = 0;
     zero_initialize_ss(ss, model);
 
     // e-step
 
     for (d = 0; d < corpus->num_docs; d++) {
-      if ((d % 1000) == 0) printf("document %d\n",d);
+      if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
       likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
     }
 
     // m-step
 
-    lda_mle(model, ss, ESTIMATE_ALPHA);
+    if (VERBOSE) {
+      lda_mle(model, ss, ESTIMATE_ALPHA);
+    } else {
+      quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+    }
+
 
     // check for convergence
 
@@ -310,7 +326,7 @@ void run_em(char* start, char* directory, corpus* corpus) {
     FILE* w_asgn_file = fopen(filename, "w");
     for (d = 0; d < corpus->num_docs; d++)
     {
-      if ((d % 100) == 0) printf("final e step document %d\n",d);
+      if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d);
       likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
       write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model);
     }
@@ -366,7 +382,7 @@ void infer(char* model_root, char* save, corpus* corpus) {
   sprintf(filename, "%s-lda-lhood.dat", save);
   fileptr = fopen(filename, "w");
   for (d = 0; d < corpus->num_docs; d++) {
-    if (((d % 100) == 0) && (d>0)) printf("document %d\n",d);
+    if (((d % 100) == 0) && (d>0) && VERBOSE) printf("document %d\n",d);
 
     doc = &(corpus->docs[d]);
     phi = (double**) malloc(sizeof(double*) * doc->length);
@@ -454,13 +470,21 @@ void run_quiet_em(char* start, corpus* corpus) {
     model = new_lda_model(corpus->num_terms, NTOPICS);
     ss = new_lda_suffstats(model);
     corpus_initialize_ss(ss, model, corpus);
-    lda_mle(model, ss, 0);
+    if (VERBOSE) {
+      lda_mle(model, ss, 0);
+    } else {
+      quiet_lda_mle(model, ss, 0);
+    }
     model->alpha = INITIAL_ALPHA;
   } else if (strcmp(start, "random")==0) {
     model = new_lda_model(corpus->num_terms, NTOPICS);
     ss = new_lda_suffstats(model);
     random_initialize_ss(ss, model);
-    lda_mle(model, ss, 0);
+    if (VERBOSE) {
+      lda_mle(model, ss, 0);
+    } else {
+      quiet_lda_mle(model, ss, 0);
+    }
     model->alpha = INITIAL_ALPHA;
   } else {
     model = load_lda_model(start);
@@ -478,20 +502,25 @@ void run_quiet_em(char* start, corpus* corpus) {
 
   while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) {
     i++;
-    printf("**** em iteration %d ****\n", i);
+    if (VERBOSE)
+      printf("**** em iteration %d ****\n", i);
     likelihood = 0;
     zero_initialize_ss(ss, model);
 
     // e-step
 
     for (d = 0; d < corpus->num_docs; d++) {
-      if ((d % 1000) == 0) printf("document %d\n",d);
+      if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d);
       likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss);
     }
 
     // m-step
 
-    lda_mle(model, ss, ESTIMATE_ALPHA);
+    if (VERBOSE) {
+      lda_mle(model, ss, ESTIMATE_ALPHA);
+    } else {
+      quiet_lda_mle(model, ss, ESTIMATE_ALPHA);
+    }
 
     // check for convergence
 
@@ -661,6 +690,31 @@ static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) {
   return est_alpha;
 }
 
+/*
+ * Get the verbosity setting.
+ */
+static VALUE wrap_get_verbosity(VALUE self) {
+  if (VERBOSE) {
+    return Qtrue;
+  } else {
+    return Qfalse;
+  }
+}
+
+
+/*
+ * Set the verbosity level (true, false).
+ */
+static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) {
+  if (verbosity == Qtrue) {
+    VERBOSE = TRUE;
+  } else {
+    VERBOSE = FALSE;
+  }
+
+  return verbosity;
+}
+
 
 
 /*
@@ -812,6 +866,7 @@ static VALUE wrap_get_model_settings(VALUE self) {
 void Init_lda_ext() {
   corpus_loaded = FALSE;
   model_loaded = FALSE;
+  VERBOSE = TRUE;
 
   rb_require("lda");
 
@@ -865,6 +920,8 @@ void Init_lda_ext() {
   rb_define_method(rb_cLda, "est_alpha", wrap_get_estimate_alpha, 0);
   rb_define_method(rb_cLda, "num_topics", wrap_get_num_topics, 0);
   rb_define_method(rb_cLda, "num_topics=", wrap_set_num_topics, 1);
+  rb_define_method(rb_cLda, "verbose", wrap_get_verbosity, 0);
+  rb_define_method(rb_cLda, "verbose=", wrap_set_verbosity, 1);
 
   // retrieve model and gamma
   rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0);
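
The two rb_define_method calls registered in Init_lda_ext expose the new VERBOSE flag to Ruby as verbose / verbose=. A minimal usage sketch follows; the Lda::Lda class path and its constructor are assumptions about how the gem is driven and are not part of this diff — only verbose and verbose= are confirmed here:

    require 'lda'

    lda = Lda::Lda.new       # assumed constructor; not shown in this diff
    lda.verbose              # => true, since Init_lda_ext now sets VERBOSE = TRUE
    lda.verbose = false      # later EM runs take the quiet_* code paths above
    lda.verbose              # => false

With verbosity off, the per-iteration "**** em iteration ****" and "document N" printf output in run_em and run_quiet_em is skipped.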

data/{lda-inference.h → lib/lda-inference.h}
RENAMED

@@ -1,12 +1,15 @@
 #ifndef LDA_INFERENCE_H
 #define LDA_INFERENCE_H
 
+#include <stdlib.h>
 #include <math.h>
 #include <float.h>
 #include <assert.h>
 #include "lda.h"
 #include "utils.h"
 
+
+
 int LAG = 5;
 
 float EM_CONVERGED;
@@ -22,9 +25,11 @@ corpus *last_corpus;
 lda_model *last_model;
 double **last_gamma;
 
-enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded;
+enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE;
 #endif
 
+
+
 double lda_inference(document*, lda_model*, double*, double**);
 double compute_likelihood(document*, lda_model*, double**, double*);
 
@@ -53,5 +58,6 @@ void read_settings(char* filename);
 void infer(char* model_root,
            char* save,
            corpus* corpus);
+
 
-#endif
+#endif

data/{lda-model.c → lib/lda-model.c}
RENAMED

@@ -51,6 +51,36 @@ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
     }
 }
 
+void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
+    int k; int w;
+
+    for (k = 0; k < model->num_topics; k++)
+    {
+        for (w = 0; w < model->num_terms; w++)
+        {
+            if (ss->class_word[k][w] > 0)
+            {
+                model->log_prob_w[k][w] =
+                    log(ss->class_word[k][w]) -
+                    log(ss->class_total[k]);
+            }
+            else
+                model->log_prob_w[k][w] = -100;
+        }
+    }
+    if (estimate_alpha == 1)
+    {
+        model->alpha = opt_alpha(ss->alpha_suffstats,
+                                 ss->num_docs,
+                                 model->num_topics);
+
+        printf("new alpha = %5.5f\n", model->alpha);
+    }
+}
+
+
+
+
 /*
  * allocate sufficient statistics
 *
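
quiet_lda_mle duplicates the M-step of lda_mle. The assignment to log_prob_w is the usual maximum-likelihood topic–word estimate (standard LDA, implied by the code rather than stated in the diff): with $n_{kw}$ = ss->class_word[k][w], the expected count of word $w$ in topic $k$, and $n_k$ = ss->class_total[k] its total,

$$\log\beta_{kw} = \log n_{kw} - \log n_k,$$

while words with zero expected count are floored at $\log\beta_{kw} = -100$ so the log probabilities stay finite.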

data/{lda-model.h → lib/lda-model.h}
RENAMED

@@ -5,7 +5,6 @@
 #include <stdio.h>
 #include <math.h>
 #include "lda.h"
-#include "lda-alpha.h"
 #include "cokus.h"
 
 #define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
@@ -19,6 +18,7 @@ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
 void random_initialize_ss(lda_suffstats* ss, lda_model* model);
 void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
 void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
+void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
 lda_model* load_lda_model(char* model_root);
 
 #endif

data/{lda.h → lib/lda.h}
RENAMED
File without changes

data/{lda.rb → lib/lda.rb}
RENAMED
File without changes

data/{utils.c → lib/utils.c}
RENAMED
File without changes

data/{utils.h → lib/utils.h}
RENAMED
File without changes

metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ealdent-lda-ruby
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Jason M. Adams
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2008-11-
+date: 2008-11-19 00:00:00 -08:00
 default_executable:
 dependencies: []
 
@@ -18,28 +18,28 @@ description:
 email: jasonmadams@gmail.com
 executables: []
 
-extensions:
-
+extensions:
+- lib/extconf.rb
 extra_rdoc_files: []
 
 files:
 - README
 - license.txt
-- cokus.c
-- cokus.h
-- extconf.rb
-- lda-alpha.c
-- lda-alpha.h
-- lda-data.c
-- lda-data.h
-- lda-inference.c
-- lda-inference.h
-- lda-model.c
-- lda-model.h
-- lda.h
-- lda.rb
-- utils.c
-- utils.h
+- lib/cokus.c
+- lib/cokus.h
+- lib/extconf.rb
+- lib/lda-alpha.c
+- lib/lda-alpha.h
+- lib/lda-data.c
+- lib/lda-data.h
+- lib/lda-inference.c
+- lib/lda-inference.h
+- lib/lda-model.c
+- lib/lda-model.h
+- lib/lda.h
+- lib/lda.rb
+- lib/utils.c
+- lib/utils.h
 has_rdoc: true
 homepage: http://github.com/ealdent/lda-ruby
 post_install_message: