lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +61 -0
  3. data/Gemfile +9 -0
  4. data/README.md +157 -0
  5. data/VERSION.yml +5 -0
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/cokus.c +145 -0
  11. data/ext/lda-ruby/cokus.h +27 -0
  12. data/ext/lda-ruby/extconf.rb +13 -0
  13. data/ext/lda-ruby/lda-alpha.c +96 -0
  14. data/ext/lda-ruby/lda-alpha.h +21 -0
  15. data/ext/lda-ruby/lda-data.c +67 -0
  16. data/ext/lda-ruby/lda-data.h +14 -0
  17. data/ext/lda-ruby/lda-inference.c +1023 -0
  18. data/ext/lda-ruby/lda-inference.h +63 -0
  19. data/ext/lda-ruby/lda-model.c +345 -0
  20. data/ext/lda-ruby/lda-model.h +31 -0
  21. data/ext/lda-ruby/lda.h +54 -0
  22. data/ext/lda-ruby/utils.c +111 -0
  23. data/ext/lda-ruby/utils.h +18 -0
  24. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  25. data/ext/lda-ruby-rust/README.md +48 -0
  26. data/ext/lda-ruby-rust/extconf.rb +123 -0
  27. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  28. data/lda-ruby.gemspec +78 -0
  29. data/lib/lda-ruby/backends/base.rb +129 -0
  30. data/lib/lda-ruby/backends/native.rb +158 -0
  31. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  32. data/lib/lda-ruby/backends/rust.rb +226 -0
  33. data/lib/lda-ruby/backends.rb +58 -0
  34. data/lib/lda-ruby/config/stopwords.yml +571 -0
  35. data/lib/lda-ruby/corpus/corpus.rb +45 -0
  36. data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
  37. data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
  38. data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
  39. data/lib/lda-ruby/document/data_document.rb +30 -0
  40. data/lib/lda-ruby/document/document.rb +40 -0
  41. data/lib/lda-ruby/document/text_document.rb +39 -0
  42. data/lib/lda-ruby/lda.so +0 -0
  43. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  44. data/lib/lda-ruby/version.rb +5 -0
  45. data/lib/lda-ruby/vocabulary.rb +46 -0
  46. data/lib/lda-ruby.rb +413 -0
  47. data/lib/lda_ruby_rust.so +0 -0
  48. data/license.txt +504 -0
  49. data/test/backend_compatibility_test.rb +146 -0
  50. data/test/backends_selection_test.rb +100 -0
  51. data/test/data/docs.dat +46 -0
  52. data/test/data/sample.rb +20 -0
  53. data/test/data/wiki-test-docs.yml +123 -0
  54. data/test/gemspec_test.rb +27 -0
  55. data/test/lda_ruby_test.rb +319 -0
  56. data/test/packaged_gem_smoke_test.rb +33 -0
  57. data/test/release_scripts_test.rb +54 -0
  58. data/test/rust_build_policy_test.rb +23 -0
  59. data/test/simple_pipeline_test.rb +22 -0
  60. data/test/simple_yaml.rb +17 -0
  61. data/test/test_helper.rb +10 -0
  62. metadata +111 -0
@@ -0,0 +1,63 @@
1
+ #ifndef LDA_INFERENCE_H
2
+ #define LDA_INFERENCE_H
3
+ /* Global settings/state and prototypes for the inference and EM routines; their implementations are not in this header. */
4
+ #include <stdlib.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+ #include "lda.h" /* document, corpus, lda_model, lda_suffstats */
8
+ #include "utils.h"
9
+
10
+
11
+
12
+ int LAG = 5; /* NOTE(review): an initialised definition in a header; every translation unit that includes this file defines LAG again. Its use is not visible here. */
13
+ /* The variables below are file-scope definitions without initialisers; presumably filled in by read_settings() or the Ruby bindings -- TODO confirm. */
14
+ float EM_CONVERGED; /* presumably the EM convergence threshold -- confirm */
15
+ int EM_MAX_ITER; /* presumably the EM iteration cap -- confirm */
16
+ int ESTIMATE_ALPHA; /* presumably forwarded as estimate_alpha to lda_mle() -- confirm */
17
+ double INITIAL_ALPHA;
18
+ int NTOPICS;
19
+ float VAR_CONVERGED; /* presumably the variational-inference convergence threshold -- confirm */
20
+ int VAR_MAX_ITER; /* presumably the variational-inference iteration cap -- confirm */
21
+
22
+ #ifdef USE_RUBY /* the state below exists only when USE_RUBY is defined */
23
+ corpus *last_corpus;
24
+ lda_model *last_model;
25
+ double **last_gamma;
26
+ double **last_phi;
27
+
28
+ enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE; /* FALSE == 0, TRUE == 1; also defines three flag variables of this enum type */
29
+ #endif
30
+
31
+
32
+ /* Prototypes only: parameter meanings below are inferred from names and should be confirmed against lda-inference.c. */
33
+ double lda_inference(document*, lda_model*, double*, double**, short*);
34
+ double compute_likelihood(document*, lda_model*, double**, double*);
35
+
36
+
37
+ double doc_e_step(document* doc,
38
+ double* gamma,
39
+ double** phi,
40
+ lda_model* model,
41
+ lda_suffstats* ss);
42
+
43
+ void save_gamma(char* filename,
44
+ double** gamma,
45
+ int num_docs,
46
+ int num_topics);
47
+
48
+ void run_em(char* start,
49
+ char* directory,
50
+ corpus* corpus); /* the parameter name shadows the typedef name `corpus` inside this prototype */
51
+
52
+ #ifdef USE_RUBY /* quiet EM entry point is only declared when USE_RUBY is defined */
53
+ void run_quiet_em(char* start, corpus* corpus);
54
+ #endif
55
+
56
+ void read_settings(char* filename);
57
+
58
+ void infer(char* model_root,
59
+ char* save,
60
+ corpus* corpus);
61
+
62
+
63
+ #endif
@@ -0,0 +1,345 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #include "lda-model.h"
21
+ #include <string.h>
22
+
23
+
24
+ /*
25
+ * compute MLE lda model from sufficient statistics
26
+ *
27
+ */
28
+
29
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
30
+ int k; int w;
31
+
32
+ for (k = 0; k < model->num_topics; k++)
33
+ {
34
+ for (w = 0; w < model->num_terms; w++)
35
+ {
36
+ if (ss->class_word[k][w] > 0)
37
+ {
38
+ model->log_prob_w[k][w] =
39
+ log(ss->class_word[k][w]) -
40
+ log(ss->class_total[k]);
41
+ }
42
+ else
43
+ model->log_prob_w[k][w] = -100;
44
+ }
45
+ }
46
+ if (estimate_alpha == 1)
47
+ {
48
+ model->alpha = opt_alpha(ss->alpha_suffstats,
49
+ ss->num_docs,
50
+ model->num_topics);
51
+
52
+ printf("new alpha = %5.5f\n", model->alpha);
53
+ }
54
+ }
55
+
56
+ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) {
57
+ int k; int w;
58
+
59
+ for (k = 0; k < model->num_topics; k++)
60
+ {
61
+ for (w = 0; w < model->num_terms; w++)
62
+ {
63
+ if (ss->class_word[k][w] > 0)
64
+ {
65
+ model->log_prob_w[k][w] =
66
+ log(ss->class_word[k][w]) -
67
+ log(ss->class_total[k]);
68
+ }
69
+ else
70
+ model->log_prob_w[k][w] = -100;
71
+ }
72
+ }
73
+ if (estimate_alpha == 1)
74
+ {
75
+ model->alpha = quiet_opt_alpha(ss->alpha_suffstats,
76
+ ss->num_docs,
77
+ model->num_topics);
78
+ }
79
+ }
80
+
81
+
82
+
83
+
84
+ /*
85
+ * allocate sufficient statistics
86
+ *
87
+ */
88
+
89
+ lda_suffstats* new_lda_suffstats(lda_model* model) {
90
+ register int i;
91
+ int num_topics = model->num_topics;
92
+ int num_terms = model->num_terms;
93
+
94
+ lda_suffstats* ss = (lda_suffstats*)malloc(sizeof(lda_suffstats));
95
+ memset(ss,0,sizeof(lda_suffstats));
96
+ ss->class_total = (double*)malloc(sizeof(double)*num_topics);
97
+ ss->class_word = (double**)malloc(sizeof(double*)*num_topics);
98
+
99
+ for (i = 0; i < num_topics; ++i) {
100
+ ss->class_total[i] = 0;
101
+ ss->class_word[i] = (double*)malloc(sizeof(double)*num_terms);
102
+ memset(ss->class_word[i],0.0,sizeof(double)*num_terms);
103
+ }
104
+
105
+ return(ss);
106
+ }
107
+ /*
108
+ * deallocate new lda suffstats
109
+ *
110
+ */
111
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss) {
112
+ int i;
113
+ int num_topics = model->num_topics;
114
+
115
+ free(ss->class_total);
116
+ for (i = 0; i < num_topics; ++i) {
117
+ free(ss->class_word[i]);
118
+ }
119
+ free(ss->class_word);
120
+ free(ss);
121
+ }
122
+
123
+ /*
124
+ * various intializations for the sufficient statistics
125
+ *
126
+ */
127
+
128
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model) {
129
+ int k, w;
130
+ for (k = 0; k < model->num_topics; k++)
131
+ {
132
+ ss->class_total[k] = 0;
133
+ for (w = 0; w < model->num_terms; w++)
134
+ {
135
+ ss->class_word[k][w] = 0;
136
+ }
137
+ }
138
+ ss->num_docs = 0;
139
+ ss->alpha_suffstats = 0;
140
+ }
141
+
142
+
143
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model) {
144
+ int num_topics = model->num_topics;
145
+ int num_terms = model->num_terms;
146
+ int k, n;
147
+
148
+ for (k = 0; k < num_topics; k++)
149
+ {
150
+ for (n = 0; n < num_terms; n++)
151
+ {
152
+ ss->class_word[k][n] += 1.0/num_terms + myrand();
153
+ ss->class_total[k] += ss->class_word[k][n];
154
+ }
155
+ }
156
+ }
157
+
158
+
159
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
160
+ {
161
+ int num_topics = model->num_topics;
162
+ int i, k, d, n;
163
+ document* doc;
164
+
165
+ for (k = 0; k < num_topics; k++)
166
+ {
167
+ for (i = 0; i < NUM_INIT; i++)
168
+ {
169
+ d = floor(myrand() * c->num_docs);
170
+ printf("initialized with document %d\n", d);
171
+ doc = &(c->docs[d]);
172
+ for (n = 0; n < doc->length; n++)
173
+ {
174
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
175
+ }
176
+ }
177
+ for (n = 0; n < model->num_terms; n++)
178
+ {
179
+ ss->class_word[k][n] += 1.0;
180
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
181
+ }
182
+ }
183
+ }
184
+
185
+ void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
186
+ {
187
+ int num_topics = model->num_topics;
188
+ int i, k, d, n;
189
+ document* doc;
190
+
191
+ for (k = 0; k < num_topics; k++)
192
+ {
193
+ for (i = 0; i < NUM_INIT; i++)
194
+ {
195
+ d = floor(myrand() * c->num_docs);
196
+ doc = &(c->docs[d]);
197
+ for (n = 0; n < doc->length; n++)
198
+ {
199
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
200
+ }
201
+ }
202
+ for (n = 0; n < model->num_terms; n++)
203
+ {
204
+ ss->class_word[k][n] += 1.0;
205
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
206
+ }
207
+ }
208
+ }
209
+
210
+
211
+ /*
212
+ * Use the first num_topics documents of the corpus as the seeds. If num_topics > num_docs, results might be hairy.
213
+ */
214
+ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c) {
215
+ int num_topics = MIN(model->num_topics, c->num_docs);
216
+ int k, n;
217
+ document* doc;
218
+
219
+ for (k = 0; k < num_topics; k++) {
220
+ doc = &(c->docs[k]);
221
+ for (n = 0; n < doc->length; n++) {
222
+ ss->class_word[k][doc->words[n]] += doc->counts[n];
223
+ }
224
+ for (n = 0; n < model->num_terms; n++) {
225
+ ss->class_word[k][n] += 1.0;
226
+ ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
227
+ }
228
+ }
229
+ }
230
+
231
+ /*
232
+ * allocate new lda model
233
+ *
234
+ */
235
+
236
+ lda_model* new_lda_model(int num_terms, int num_topics) {
237
+ int i;
238
+ lda_model* model;
239
+
240
+ model = malloc(sizeof(lda_model));
241
+ model->num_topics = num_topics;
242
+ model->num_terms = num_terms;
243
+ model->alpha = 1.0;
244
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
245
+ printf("new model with: %d topics and %d terms\n", num_topics, num_terms);
246
+ for (i = 0; i < num_topics; i++)
247
+ {
248
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
249
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
250
+ }
251
+ return(model);
252
+ }
253
+
254
+ lda_model* quiet_new_lda_model(int num_terms, int num_topics) {
255
+ int i;
256
+ lda_model* model;
257
+
258
+ model = malloc(sizeof(lda_model));
259
+ model->num_topics = num_topics;
260
+ model->num_terms = num_terms;
261
+ model->alpha = 1.0;
262
+ model->log_prob_w = malloc(sizeof(double*)*num_topics);
263
+ for (i = 0; i < num_topics; i++)
264
+ {
265
+ model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
266
+ memset(model->log_prob_w[i],0,sizeof(double)*num_terms);
267
+ }
268
+ return(model);
269
+ }
270
+
271
+
272
+ /*
273
+ * deallocate new lda model
274
+ *
275
+ */
276
+ void free_lda_model(lda_model* model) {
277
+ int i;
278
+
279
+ for (i = 0; i < model->num_topics; i++)
280
+ {
281
+ free(model->log_prob_w[i]);
282
+ }
283
+ free(model->log_prob_w);
284
+ }
285
+
286
+
287
+ /*
288
+ * save an lda model
289
+ *
290
+ */
291
+ void save_lda_model(lda_model* model, char* model_root) {
292
+ char filename[100];
293
+ FILE* fileptr;
294
+ int i, j;
295
+
296
+ sprintf(filename, "%s.beta", model_root);
297
+ fileptr = fopen(filename, "w");
298
+ for (i = 0; i < model->num_topics; i++) {
299
+ for (j = 0; j < model->num_terms; j++) {
300
+ fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
301
+ }
302
+ fprintf(fileptr, "\n");
303
+ }
304
+ fclose(fileptr);
305
+
306
+ sprintf(filename, "%s.other", model_root);
307
+ fileptr = fopen(filename, "w");
308
+ fprintf(fileptr, "num_topics %d\n", model->num_topics);
309
+ fprintf(fileptr, "num_terms %d\n", model->num_terms);
310
+ fprintf(fileptr, "alpha %5.10f\n", model->alpha);
311
+ fclose(fileptr);
312
+ }
313
+
314
+
315
+ lda_model* load_lda_model(char* model_root) {
316
+ char filename[100];
317
+ FILE* fileptr;
318
+ int i, j, num_terms, num_topics;
319
+ float x, alpha;
320
+
321
+ sprintf(filename, "%s.other", model_root);
322
+ printf("loading %s\n", filename);
323
+ fileptr = fopen(filename, "r");
324
+ fscanf(fileptr, "num_topics %d\n", &num_topics);
325
+ fscanf(fileptr, "num_terms %d\n", &num_terms);
326
+ fscanf(fileptr, "alpha %f\n", &alpha);
327
+ fclose(fileptr);
328
+
329
+ lda_model* model = new_lda_model(num_terms, num_topics);
330
+ model->alpha = alpha;
331
+
332
+ sprintf(filename, "%s.beta", model_root);
333
+ printf("loading %s\n", filename);
334
+ fileptr = fopen(filename, "r");
335
+ for (i = 0; i < num_topics; i++)
336
+ {
337
+ for (j = 0; j < num_terms; j++)
338
+ {
339
+ fscanf(fileptr, "%f", &x);
340
+ model->log_prob_w[i][j] = x;
341
+ }
342
+ }
343
+ fclose(fileptr);
344
+ return(model);
345
+ }
@@ -0,0 +1,31 @@
1
+ #ifndef LDA_MODEL_H
2
+ #define LDA_MODEL
3
+
4
+ #include <stdlib.h>
5
+ #include <stdio.h>
6
+ #include <math.h>
7
+ #include "lda.h"
8
+ #include "lda-alpha.h"
9
+ #include "cokus.h"
10
+
11
+ #define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
12
+ #define NUM_INIT 1
13
+ #define MIN(A,B) (int)((A > B) ? (B) : (A))
14
+
15
+ void free_lda_model(lda_model*);
16
+ void save_lda_model(lda_model*, char*);
17
+ lda_model* new_lda_model(int, int);
18
+ lda_model* quiet_new_lda_model(int num_terms, int num_topics);
19
+ lda_model* new_lda_model(int num_terms, int num_topics);
20
+ lda_suffstats* new_lda_suffstats(lda_model* model);
21
+ void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
22
+ void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
23
+ void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
24
+ void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
25
+ void random_initialize_ss(lda_suffstats* ss, lda_model* model);
26
+ void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
27
+ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
28
+ void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
29
+ lda_model* load_lda_model(char* model_root);
30
+
31
+ #endif
@@ -0,0 +1,54 @@
1
+ // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2
+
3
+ // This file is part of LDA-C.
4
+
5
+ // LDA-C is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 2 of the License, or (at your
8
+ // option) any later version.
9
+
10
+ // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11
+ // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with this program; if not, write to the Free Software
17
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18
+ // USA
19
+
20
+ #ifndef LDA_H
21
+ #define LDA_H
22
+
23
+
24
+ typedef struct { /* one document as parallel (word, count) arrays */
25
+ int* words; /* words[n] is used as a term (column) index into class_word rows in lda-model.c */
26
+ int* counts; /* counts[n] is the amount added for words[n] */
27
+ int length; /* number of entries iterated in words/counts */
28
+ int total; /* not referenced in the code visible here; presumably the sum of counts -- confirm */
29
+ } document;
30
+
31
+
32
+ typedef struct { /* a collection of documents */
33
+ document* docs; /* indexed with values in [0, num_docs) by the initialisers in lda-model.c */
34
+ int num_terms; /* presumably the vocabulary size -- confirm */
35
+ int num_docs; /* number of documents in docs */
36
+ } corpus;
37
+
38
+
39
+ typedef struct { /* model parameters */
40
+ double alpha; /* set to 1.0 by new_lda_model(); re-estimated by lda_mle() when estimate_alpha == 1 */
41
+ double** log_prob_w; /* num_topics rows of num_terms log probabilities */
42
+ int num_topics;
43
+ int num_terms;
44
+ } lda_model;
45
+
46
+
47
+ typedef struct { /* sufficient statistics consumed by lda_mle() */
48
+ double** class_word; /* num_topics rows of num_terms counts */
49
+ double* class_total; /* one total per topic; the initialisers accumulate class_word rows into it */
50
+ double alpha_suffstats; /* passed to opt_alpha()/quiet_opt_alpha() */
51
+ int num_docs; /* passed to opt_alpha()/quiet_opt_alpha() */
52
+ } lda_suffstats;
53
+
54
+ #endif
@@ -0,0 +1,111 @@
1
+ #include "utils.h"
2
+
3
+ /*
4
+ * given log(a) and log(b), return log(a + b)
5
+ *
6
+ */
7
+
8
+ double log_sum(double log_a, double log_b)
9
+ {
10
+ double v;
11
+
12
+ if (log_a < log_b)
13
+ {
14
+ v = log_b+log(1 + exp(log_a-log_b));
15
+ }
16
+ else
17
+ {
18
+ v = log_a+log(1 + exp(log_b-log_a));
19
+ }
20
+ return(v);
21
+ }
22
+
23
/**
 * Approximate the trigamma function, the second derivative of the
 * log-gamma function, for a scalar x.
 *
 * The argument is shifted up by 6, an asymptotic series in 1/x^2 is
 * evaluated there, and the recurrence is applied six times to step back
 * down to the original argument.  Per the original note, the formulas come
 * from Abramowitz and Stegun (6.4.11 and 6.4.12 with recurrence 6.4.6).
 **/
double trigamma(double x)
{
    double shifted = x + 6;
    const double inv_sq = 1/(shifted*shifted);
    double acc;
    int steps_left = 6;

    acc = (((((0.075757575757576*inv_sq-0.033333333333333)*inv_sq+0.0238095238095238)
             *inv_sq-0.033333333333333)*inv_sq+0.166666666666667)*inv_sq+1)/shifted+0.5*inv_sq;

    while (steps_left-- > 0) {
        shifted = shifted - 1;
        acc = 1/(shifted*shifted) + acc;
    }
    return acc;
}
48
+
49
+
50
+ /*
51
+ * taylor approximation of first derivative of the log gamma function
52
+ *
53
+ */
54
+
55
+ double digamma(double x)
56
+ {
57
+ double p;
58
+ x=x+6;
59
+ p=1/(x*x);
60
+ p=(((0.004166666666667*p-0.003968253986254)*p+
61
+ 0.008333333333333)*p-0.083333333333333)*p;
62
+ p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
63
+ return p;
64
+ }
65
+
66
+
67
+ double log_gamma(double x)
68
+ {
69
+ double z=1/(x*x);
70
+
71
+ x=x+6;
72
+ z=(((-0.000595238095238*z+0.000793650793651)
73
+ *z-0.002777777777778)*z+0.083333333333333)/x;
74
+ z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)-
75
+ log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6);
76
+ return z;
77
+ }
78
+
79
+
80
+
81
/*
 * Create directory `name` with read, write and execute permission for the
 * owner only.  The result of mkdir() is not checked, so failure (including
 * an already existing directory) is silently ignored.
 */
void make_directory(char* name)
{
    const mode_t owner_rwx = S_IRUSR|S_IWUSR|S_IXUSR;

    mkdir(name, owner_rwx);
}
90
+
91
+
92
/*
 * Return the index of the largest of x[0] .. x[n-1].
 *
 * On ties the lowest index wins, because only a strictly greater value
 * replaces the current best.  x[0] is read unconditionally, so x must hold
 * at least one element.
 */
int argmax(double* x, int n)
{
    int best_index = 0;
    double best_value = x[0];
    int i = 1;

    while (i < n) {
        if (x[i] > best_value) {
            best_value = x[i];
            best_index = i;
        }
        i++;
    }
    return best_index;
}
@@ -0,0 +1,18 @@
1
+ #ifndef UTILS_H
2
+ #define UTILS_H
3
+ /* Numeric helpers (log-sum, gamma-family approximations), argmax and a mkdir wrapper; defined in utils.c. */
4
+ #include <stdio.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+ #include <stdlib.h>
8
+ #include <sys/stat.h> /* mkdir() and the S_I* mode bits used by make_directory() */
9
+ #include <sys/types.h>
10
+
11
+ double log_sum(double log_a, double log_b); /* given log(a) and log(b), returns log(a + b) */
12
+ double trigamma(double x); /* approximation of the second derivative of log-gamma */
13
+ double digamma(double x); /* approximation of the first derivative of log-gamma */
14
+ double log_gamma(double x); /* approximation of log(Gamma(x)) */
15
+ void make_directory(char* name); /* mkdir(name) with owner read/write/execute bits; the result is not checked */
16
+ int argmax(double* x, int n); /* index of the first largest element of x[0..n-1]; x[0] is always read */
17
+
18
+ #endif
@@ -0,0 +1,12 @@
1
+ [package]
2
+ name = "lda_ruby_rust"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ rust-version = "1.74"
6
+
7
+ [lib]
8
+ name = "lda_ruby_rust"
9
+ crate-type = ["cdylib"]
10
+
11
+ [dependencies]
12
+ magnus = "0.7"
@@ -0,0 +1,48 @@
1
+ # Experimental Rust Extension Scaffold
2
+
3
+ This directory contains an experimental Rust extension scaffold built with `magnus`.
4
+
5
+ Current scope:
6
+
7
+ - Defines `Lda::RustBackend` module in Ruby.
8
+ - Exposes capability hooks:
9
+ - `Lda::RustBackend.available?`
10
+ - `Lda::RustBackend.abi_version`
11
+ - `Lda::RustBackend.before_em(start, num_docs, num_terms)`
12
+ - `Lda::RustBackend.topic_weights_for_word(beta, gamma, word_index, min_probability)`
13
+ - `Lda::RustBackend.accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)`
14
+ - `Lda::RustBackend.infer_document(beta, gamma_initial, words, counts, max_iter, convergence, min_probability, init_alpha)`
15
+ - `Lda::RustBackend.infer_corpus_iteration(beta, document_words, document_counts, max_iter, convergence, min_probability, init_alpha)`
16
+ - `Lda::RustBackend.normalize_topic_term_counts(topic_term_counts, min_probability)`
17
+ - `Lda::RustBackend.average_gamma_shift(previous_gamma, current_gamma)`
18
+ - `Lda::RustBackend.topic_document_probability(phi_tensor, document_counts, num_topics, min_probability)`
19
+ - `Lda::RustBackend.seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)`
20
+
21
+ Hot-path kernels currently executed in Rust when `backend: :rust` is active:
22
+ - topic weights for a word across topics
23
+ - topic-term count accumulation from per-document `phi`
24
+ - full per-document inference loop (batched inner EM updates)
25
+ - full per-iteration corpus inference (batched document processing)
26
+ - topic-term normalization and log-probability finalization for EM beta updates
27
+ - gamma convergence shift reduction between EM iterations
28
+ - topic-document average log-probability computation
29
+ - seeded topic-term initialization
30
+
31
+ Remaining numeric LDA kernels are still provided by the pure Ruby backend and will move incrementally.
32
+
33
+ ## Local build (optional)
34
+
35
+ ```bash
36
+ cd ext/lda-ruby-rust
37
+ cargo build --release
38
+ ```
39
+
40
+ Then run Ruby with the built extension available on the load path so that `require "lda_ruby_rust"` succeeds.
41
+
42
+ ## Install-time policy
43
+
44
+ During source gem installs, `ext/lda-ruby-rust/extconf.rb` can optionally build this extension.
45
+
46
+ - `LDA_RUBY_RUST_BUILD=auto` (default): build when `cargo` is available.
47
+ - `LDA_RUBY_RUST_BUILD=always`: require a successful Rust build or fail installation.
48
+ - `LDA_RUBY_RUST_BUILD=never`: always skip Rust build.