ealdent-lda-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
#ifndef LDA_INFERENCE_H
#define LDA_INFERENCE_H

#include <math.h>
#include <float.h>
#include <assert.h>
#include "lda.h"
#include "utils.h"

// NOTE(review): the identifiers below are variable *definitions*, not
// declarations.  If this header is included by more than one translation
// unit the linker will see duplicate symbols; the conventional fix is
// `extern` declarations here plus exactly one definition in a .c file.
int LAG = 5;

// Global run parameters — presumably populated by read_settings(); verify
// against the settings-file parser (not visible in this file).
float EM_CONVERGED;    // EM loop convergence threshold
int EM_MAX_ITER;       // maximum number of EM iterations
int ESTIMATE_ALPHA;    // nonzero => re-estimate alpha during EM
double INITIAL_ALPHA;  // starting value of the Dirichlet parameter
int NTOPICS;           // number of topics K
float VAR_CONVERGED;   // variational inference convergence threshold
int VAR_MAX_ITER;      // maximum variational iterations per document

#ifdef USE_RUBY
// State shared with the Ruby extension: last corpus/model handled and the
// per-document variational gamma matrix produced for it.
corpus *last_corpus;
lda_model *last_model;
double **last_gamma;

enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded;
#endif

// Variational inference for one document; returns a likelihood value.
double lda_inference(document*, lda_model*, double*, double**);
double compute_likelihood(document*, lda_model*, double**, double*);


// E-step for a single document: updates gamma/phi and accumulates the
// sufficient statistics in ss; returns the document likelihood.
double doc_e_step(document* doc,
                  double* gamma,
                  double** phi,
                  lda_model* model,
                  lda_suffstats* ss);

// Write the (num_docs x num_topics) gamma matrix to filename.
void save_gamma(char* filename,
                double** gamma,
                int num_docs,
                int num_topics);

// Full EM estimation; "start" selects the initialization scheme and
// results are written under "directory".
void run_em(char* start,
            char* directory,
            corpus* corpus);

#ifdef USE_RUBY
// EM variant used by the Ruby bindings (no directory output).
void run_quiet_em(char* start, corpus* corpus);
#endif

// Read the global EM_*/VAR_* parameters from a settings file.
void read_settings(char* filename);

// Load the model at model_root and run inference over corpus, saving
// results under the "save" prefix.
void infer(char* model_root,
           char* save,
           corpus* corpus);

#endif
@@ -0,0 +1,238 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-model.h"
21
+
22
+ /*
23
+ * compute MLE lda model from sufficient statistics
24
+ *
25
+ */
26
+
27
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
28
+ int k; int w;
29
+
30
+ for (k = 0; k < model->num_topics; k++)
31
+ {
32
+ for (w = 0; w < model->num_terms; w++)
33
+ {
34
+ if (ss->class_word[k][w] > 0)
35
+ {
36
+ model->log_prob_w[k][w] =
37
+ log(ss->class_word[k][w]) -
38
+ log(ss->class_total[k]);
39
+ }
40
+ else
41
+ model->log_prob_w[k][w] = -100;
42
+ }
43
+ }
44
+ if (estimate_alpha == 1)
45
+ {
46
+ model->alpha = opt_alpha(ss->alpha_suffstats,
47
+ ss->num_docs,
48
+ model->num_topics);
49
+
50
+ printf("new alpha = %5.5f\n", model->alpha);
51
+ }
52
+ }
53
+
54
+ /*
55
+ * allocate sufficient statistics
56
+ *
57
+ */
58
+
59
+ lda_suffstats* new_lda_suffstats(lda_model* model) {
60
+ int num_topics = model->num_topics;
61
+ int num_terms = model->num_terms;
62
+ int i,j;
63
+
64
+ lda_suffstats* ss = malloc(sizeof(lda_suffstats));
65
+ ss->class_total = malloc(sizeof(double)*num_topics);
66
+ ss->class_word = malloc(sizeof(double*)*num_topics);
67
+ for (i = 0; i < num_topics; i++)
68
+ {
69
+ ss->class_total[i] = 0;
70
+ ss->class_word[i] = malloc(sizeof(double)*num_terms);
71
+ for (j = 0; j < num_terms; j++)
72
+ {
73
+ ss->class_word[i][j] = 0;
74
+ }
75
+ }
76
+ return(ss);
77
+ }
78
+
79
+
80
+ /*
81
+ * various intializations for the sufficient statistics
82
+ *
83
+ */
84
+
85
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model) {
86
+ int k, w;
87
+ for (k = 0; k < model->num_topics; k++)
88
+ {
89
+ ss->class_total[k] = 0;
90
+ for (w = 0; w < model->num_terms; w++)
91
+ {
92
+ ss->class_word[k][w] = 0;
93
+ }
94
+ }
95
+ ss->num_docs = 0;
96
+ ss->alpha_suffstats = 0;
97
+ }
98
+
99
+
100
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
101
+ int num_topics = model->num_topics;
102
+ int num_terms = model->num_terms;
103
+ int k, n;
104
+ for (k = 0; k < num_topics; k++)
105
+ {
106
+ for (n = 0; n < num_terms; n++)
107
+ {
108
+ ss->class_word[k][n] += 1.0/num_terms + myrand();
109
+ ss->class_total[k] += ss->class_word[k][n];
110
+ }
111
+ }
112
+ }
113
+
114
+
115
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
116
+ {
117
+ int num_topics = model->num_topics;
118
+ int i, k, d, n;
119
+ document* doc;
120
+
121
+ for (k = 0; k < num_topics; k++)
122
+ {
123
+ for (i = 0; i < NUM_INIT; i++)
124
+ {
125
+ d = floor(myrand() * c->num_docs);
126
+ printf("initialized with document %d\n", d);
127
+ doc = &(c->docs[d]);
128
+ for (n = 0; n < doc->length; n++)
129
+ {
130
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
131
+ }
132
+ }
133
+ for (n = 0; n < model->num_terms; n++)
134
+ {
135
+ ss->class_word[k][n] += 1.0;
136
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
137
+ }
138
+ }
139
+ }
140
+
141
+ /*
142
+ * allocate new lda model
143
+ *
144
+ */
145
+
146
+ lda_model* new_lda_model(int num_terms, int num_topics) {
147
+ int i,j;
148
+ lda_model* model;
149
+
150
+ model = malloc(sizeof(lda_model));
151
+ model->num_topics = num_topics;
152
+ model->num_terms = num_terms;
153
+ model->alpha = 1.0;
154
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
155
+ for (i = 0; i < num_topics; i++)
156
+ {
157
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
158
+ for (j = 0; j < num_terms; j++)
159
+ model->log_prob_w[i][j] = 0;
160
+ }
161
+ return(model);
162
+ }
163
+
164
+
165
+ /*
166
+ * deallocate new lda model
167
+ *
168
+ */
169
+ void free_lda_model(lda_model* model) {
170
+ int i;
171
+
172
+ for (i = 0; i < model->num_topics; i++)
173
+ {
174
+ free(model->log_prob_w[i]);
175
+ }
176
+ free(model->log_prob_w);
177
+ }
178
+
179
+
180
+ /*
181
+ * save an lda model
182
+ *
183
+ */
184
+ void save_lda_model(lda_model* model, char* model_root) {
185
+ char filename[100];
186
+ FILE* fileptr;
187
+ int i, j;
188
+
189
+ sprintf(filename, "%s.beta", model_root);
190
+ fileptr = fopen(filename, "w");
191
+ for (i = 0; i < model->num_topics; i++) {
192
+ for (j = 0; j < model->num_terms; j++) {
193
+ fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
194
+ }
195
+ fprintf(fileptr, "\n");
196
+ }
197
+ fclose(fileptr);
198
+
199
+ sprintf(filename, "%s.other", model_root);
200
+ fileptr = fopen(filename, "w");
201
+ fprintf(fileptr, "num_topics %d\n", model->num_topics);
202
+ fprintf(fileptr, "num_terms %d\n", model->num_terms);
203
+ fprintf(fileptr, "alpha %5.10f\n", model->alpha);
204
+ fclose(fileptr);
205
+ }
206
+
207
+
208
+ lda_model* load_lda_model(char* model_root) {
209
+ char filename[100];
210
+ FILE* fileptr;
211
+ int i, j, num_terms, num_topics;
212
+ float x, alpha;
213
+
214
+ sprintf(filename, "%s.other", model_root);
215
+ printf("loading %s\n", filename);
216
+ fileptr = fopen(filename, "r");
217
+ fscanf(fileptr, "num_topics %d\n", &num_topics);
218
+ fscanf(fileptr, "num_terms %d\n", &num_terms);
219
+ fscanf(fileptr, "alpha %f\n", &alpha);
220
+ fclose(fileptr);
221
+
222
+ lda_model* model = new_lda_model(num_terms, num_topics);
223
+ model->alpha = alpha;
224
+
225
+ sprintf(filename, "%s.beta", model_root);
226
+ printf("loading %s\n", filename);
227
+ fileptr = fopen(filename, "r");
228
+ for (i = 0; i < num_topics; i++)
229
+ {
230
+ for (j = 0; j < num_terms; j++)
231
+ {
232
+ fscanf(fileptr, "%f", &x);
233
+ model->log_prob_w[i][j] = x;
234
+ }
235
+ }
236
+ fclose(fileptr);
237
+ return(model);
238
+ }
@@ -0,0 +1,24 @@
1
+ #ifndef LDA_MODEL_H
2
+ #define LDA_MODEL
3
+
4
+ #include <stdlib.h>
5
+ #include <stdio.h>
6
+ #include <math.h>
7
+ #include "lda.h"
8
+ #include "lda-alpha.h"
9
+ #include "cokus.h"
10
+
11
+ #define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
12
+ #define NUM_INIT 1
13
+
14
+ void free_lda_model(lda_model*);
15
+ void save_lda_model(lda_model*, char*);
16
+ lda_model* new_lda_model(int, int);
17
+ lda_suffstats* new_lda_suffstats(lda_model* model);
18
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
19
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model);
20
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
21
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
22
+ lda_model* load_lda_model(char* model_root);
23
+
24
+ #endif
data/lda.h ADDED
@@ -0,0 +1,54 @@
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)

// This file is part of LDA-C.

// LDA-C is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or (at your
// option) any later version.

// LDA-C is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA

#ifndef LDA_H
#define LDA_H


// A document in sparse bag-of-words form (mirrors Lda::Document in lda.rb).
typedef struct {
    int* words;   // vocabulary indices of the distinct terms
    int* counts;  // counts[i] = occurrences of words[i]
    int length;   // number of distinct terms (size of words/counts)
    int total;    // total occurrences (sum of counts)
} document;


// A collection of documents over a shared vocabulary.
typedef struct {
    document* docs;
    int num_terms;  // vocabulary size
    int num_docs;   // number of entries in docs
} corpus;


// An estimated LDA model.
typedef struct {
    double alpha;         // Dirichlet prior parameter
    double** log_prob_w;  // [num_topics][num_terms]: log p(word | topic), see lda_mle
    int num_topics;
    int num_terms;
} lda_model;


// Sufficient statistics accumulated during the EM E-step.
typedef struct {
    double** class_word;    // [num_topics][num_terms] expected word counts
    double* class_total;    // per-topic totals (row sums of class_word)
    double alpha_suffstats; // statistic consumed by opt_alpha in lda_mle
    int num_docs;           // documents accumulated so far
} lda_suffstats;

#endif
data/lda.rb ADDED
@@ -0,0 +1,252 @@
1
+ require 'set'
2
+
3
+ module Lda
4
+
5
+ #
6
+ # Corpus class handles the data passed to the LDA algorithm.
7
+ #
8
+ class Corpus
9
+ attr_reader :documents, :num_docs, :num_terms
10
+
11
+ def initialize
12
+ @documents = Array.new
13
+ @all_terms = Set.new
14
+ @num_terms = 0
15
+ @num_docs = 0
16
+ end
17
+
18
+ # Add a new document to the corpus. This can either be
19
+ # an svmlight-style formatted line with the first element
20
+ # being the number of words, or it can be a Document object.
21
+ def add_document(doc)
22
+ if doc.is_a?(Document)
23
+ @documents << doc
24
+ @all_terms = @all_terms + doc.words
25
+ elsif doc.is_a?(String)
26
+ d = Document.new(doc)
27
+ @all_terms = @all_terms + d.words
28
+ @documents << d
29
+ end
30
+ @num_docs += 1
31
+ @num_terms = @all_terms.size
32
+ true
33
+ end
34
+
35
+ # Populate this corpus from the data in the file.
36
+ def load_from_file(filename)
37
+ File.open(filename, 'r') do |f|
38
+ f.each do |line|
39
+ self.add_document(line)
40
+ end
41
+ end
42
+ true
43
+ end
44
+ end
45
+
46
+ # A single document.
47
+ class Document
48
+ attr_accessor :words, :counts, :length, :total
49
+
50
+ # Create the Document using the svmlight-style text line:
51
+ #
52
+ # num_words w1:freq1 w2:freq2 ... w_n:freq_n
53
+ #
54
+ # Ex.
55
+ # 5 1:2 3:1 4:2 7:3 12:1
56
+ #
57
+ # The value for the number of words should equal the number of pairs
58
+ # following it, though this isn't strictly enforced. Order of word-pair
59
+ # indices is not important.
60
+ #
61
+ def initialize(doc_line=nil)
62
+ if doc_line.is_a?(String)
63
+ tmp = doc_line.split
64
+ @words = Array.new
65
+ @counts = Array.new
66
+ @total = 0
67
+ tmp.slice(1,tmp.size).each do |pair|
68
+ tmp2 = pair.split(":")
69
+ @words << tmp2[0].to_i
70
+ @counts << tmp2[1].to_i
71
+ end
72
+ @length = @words.size
73
+ @total = @counts.inject(0) {|sum, i| sum + i}
74
+ else # doc_line == nil
75
+ @words = Array.new
76
+ @counts = Array.new
77
+ @total = 0
78
+ @length = 0
79
+ end
80
+ end
81
+
82
+ def recompute
83
+ @total = @counts.inject(0) {|sum, i| sum + i}
84
+ @length = @words.size
85
+ end
86
+ end
87
+
88
+ class Lda
89
+ attr_reader :vocab, :corpus
90
+
91
+ #
92
+ # Create a new LDA instance with the default settings.
93
+ #
94
+ def initialize
95
+ self.load_default_settings
96
+ @corpus = nil
97
+ @vocab = nil
98
+ end
99
+
100
+ #
101
+ # Load the default settings.
102
+ # * max_iter = 20
103
+ # * convergence = 1e-6
104
+ # * em_max_iter = 100
105
+ # * em_convergence = 1e-4
106
+ # * num_topics = 20
107
+ # * init_alpha = 0.3
108
+ # * est_alpha = 1
109
+ #
110
+ def load_default_settings
111
+ self.max_iter = 20
112
+ self.convergence = 1e-6
113
+ self.em_max_iter = 100
114
+ self.em_convergence = 1e-4
115
+ self.num_topics = 20
116
+ self.init_alpha = 0.3
117
+ self.est_alpha = 1
118
+ nil
119
+ end
120
+
121
+
122
+ #
123
+ # Load the corpus from file. The corpus is in svmlight-style where the
124
+ # first element of each line is the number of words in the document and
125
+ # then each element is the pair word_idx:weight.
126
+ #
127
+ # num_words word1:wgt1 word2:wgt2 ... word_n:wgt_n
128
+ #
129
+ # The value for the number of words should equal the number of pairs
130
+ # following it, though this isn't strictly enforced in this method.
131
+ #
132
+ def load_corpus(filename)
133
+ c = Corpus.new
134
+ c.load_from_file(filename)
135
+ self.corpus = c
136
+ @corpus = c
137
+
138
+ true
139
+ end
140
+
141
+
142
+ #
143
+ # Load the vocabulary file which is a list of words, one per line
144
+ # where the line number corresponds the word list index. This allows
145
+ # the words to be extracted for topics later.
146
+ #
147
+ # +vocab+ can either be the filename of the vocabulary file or the
148
+ # array itself.
149
+ #
150
+ def load_vocabulary(vocab)
151
+ @vocab = Array.new
152
+
153
+ File.open(filename, 'r') do |f|
154
+ f.each do |line|
155
+ @vocab << line.strip
156
+ end
157
+ end
158
+
159
+ true
160
+ end
161
+
162
+
163
+ #
164
+ # Visualization method for printing out the top +words_per_topic+ words
165
+ # for each topic.
166
+ #
167
+ # See also +top_words+.
168
+ #
169
+ def print_topics(words_per_topic=10)
170
+ unless @vocab
171
+ puts "No vocabulary loaded."
172
+ return nil
173
+ end
174
+
175
+ beta = self.beta
176
+ indices = (0..(@vocab.size - 1)).to_a
177
+ topic_num = 0
178
+ beta.each do |topic|
179
+ indices.sort! {|x, y| -(topic[x] <=> topic[y])}
180
+ outp = []
181
+ puts "Topic #{topic_num}"
182
+ words_per_topic.times do |i|
183
+ outp << @vocab[indices[i]]
184
+ end
185
+ puts "\t" + outp.join("\n\t")
186
+ puts ""
187
+ topic_num += 1
188
+ end
189
+
190
+ nil
191
+ end
192
+
193
+ #
194
+ # After the model has been run and a vocabulary has been loaded, return the
195
+ # +words_per_topic+ top words chosen by the model for each topic. This is
196
+ # returned as a hash mapping the topic number to an array of top words
197
+ # (in descending order of importance).
198
+ #
199
+ # topic_number => [w1, w2, ..., w_n]
200
+ #
201
+ # See also +print_topics+.
202
+ #
203
+ def top_words(words_per_topic=10)
204
+ unless @vocab
205
+ puts "No vocabulary loaded."
206
+ return nil
207
+ end
208
+
209
+ # Load the model
210
+ beta = self.beta
211
+ unless beta
212
+ puts "Model has not been run."
213
+ return nil
214
+ end
215
+
216
+ # find the highest scoring words per topic
217
+ topics = Hash.new
218
+ indices = (0..(@vocab.size - 1)).to_a
219
+ topic_num = 0
220
+ beta.each do |topic|
221
+ topics[topic_num] = Array.new
222
+ indices.sort! {|x, y| -(topic[x] <=> topic[y])}
223
+ words_per_topic.times do |i|
224
+ topics[topic_num] << @vocab[indices[i]]
225
+ end
226
+ topic_num += 1
227
+ end
228
+
229
+ topics
230
+ end
231
+
232
+ #
233
+ # String representation displaying current settings.
234
+ #
235
+ def to_s
236
+ outp = []
237
+ outp << "LDA Settings:"
238
+ outp << " Initial alpha: %0.6f" % self.init_alpha
239
+ outp << " # of topics: %d" % self.num_topics
240
+ outp << " Max iterations: %d" % self.max_iter
241
+ outp << " Convergence: %0.6f" % self.convergence
242
+ outp << "EM max iterations: %d" % self.em_max_iter
243
+ outp << " EM convergence: %0.6f" % self.em_convergence
244
+ outp << " Estimate alpha: %d" % self.est_alpha
245
+
246
+ return outp.join("\n")
247
+ end
248
+ end
249
+ end
250
+
251
+ # load the c-side stuff
252
+ require 'lda_ext'