lda-ruby 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +5 -13
  2. data/CHANGELOG.md +8 -0
  3. data/Gemfile +9 -0
  4. data/README.md +123 -3
  5. data/VERSION.yml +3 -3
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/extconf.rb +10 -6
  11. data/ext/lda-ruby/lda-inference.c +21 -5
  12. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  13. data/ext/lda-ruby-rust/README.md +48 -0
  14. data/ext/lda-ruby-rust/extconf.rb +123 -0
  15. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  16. data/lda-ruby.gemspec +0 -0
  17. data/lib/lda-ruby/backends/base.rb +129 -0
  18. data/lib/lda-ruby/backends/native.rb +158 -0
  19. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  20. data/lib/lda-ruby/backends/rust.rb +226 -0
  21. data/lib/lda-ruby/backends.rb +58 -0
  22. data/lib/lda-ruby/corpus/corpus.rb +17 -15
  23. data/lib/lda-ruby/corpus/data_corpus.rb +2 -2
  24. data/lib/lda-ruby/corpus/directory_corpus.rb +2 -2
  25. data/lib/lda-ruby/corpus/text_corpus.rb +2 -2
  26. data/lib/lda-ruby/document/document.rb +6 -6
  27. data/lib/lda-ruby/document/text_document.rb +5 -4
  28. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  29. data/lib/lda-ruby/version.rb +5 -0
  30. data/lib/lda-ruby.rb +293 -48
  31. data/test/backend_compatibility_test.rb +146 -0
  32. data/test/backends_selection_test.rb +100 -0
  33. data/test/gemspec_test.rb +27 -0
  34. data/test/lda_ruby_test.rb +49 -11
  35. data/test/packaged_gem_smoke_test.rb +33 -0
  36. data/test/release_scripts_test.rb +54 -0
  37. data/test/rust_build_policy_test.rb +23 -0
  38. data/test/simple_pipeline_test.rb +22 -0
  39. data/test/simple_yaml.rb +1 -7
  40. data/test/test_helper.rb +5 -6
  41. metadata +48 -38
  42. data/Rakefile +0 -61
  43. data/ext/lda-ruby/Makefile +0 -181
  44. data/test/data/.gitignore +0 -2
  45. data/test/simple_test.rb +0 -26
@@ -0,0 +1,456 @@
1
+ use magnus::{define_module, function, Error, Module, Object};
2
+
3
/// Reports whether the Rust backend is usable; always true once this
/// extension has been compiled and loaded.
fn available() -> bool {
    true
}
6
+
7
/// Version number of the interface this library exposes to the Ruby side.
fn abi_version() -> i64 {
    1
}
10
+
11
/// Hook invoked before an EM run; currently a no-op that always reports
/// success. The arguments (start mode, document count, term count) are
/// accepted for interface compatibility but unused.
fn before_em(_start: String, _num_docs: i64, _num_terms: i64) -> bool {
    true
}
14
+
15
/// Sanitize the probability floor used to keep values strictly positive.
///
/// Returns `min_probability` when it is finite and strictly positive;
/// otherwise falls back to the default floor of 1.0e-12.
fn floor_value(min_probability: f64) -> f64 {
    match min_probability {
        p if p.is_finite() && p > 0.0 => p,
        _ => 1.0e-12,
    }
}
22
+
23
/// Normalize `weights` so they sum to 1.0, in place.
///
/// A degenerate total (non-finite or non-positive) is replaced by the
/// uniform distribution; an empty slice is left untouched.
fn normalize_in_place(weights: &mut [f64]) {
    let total: f64 = weights.iter().sum();

    if total.is_finite() && total > 0.0 {
        weights.iter_mut().for_each(|w| *w /= total);
    } else if !weights.is_empty() {
        let uniform = 1.0 / weights.len() as f64;
        weights.iter_mut().for_each(|w| *w = uniform);
    }
}
42
+
43
/// Normalized per-topic responsibilities for a single word.
///
/// For each topic t (over the shorter of `gamma` / `beta_probabilities`):
/// weight = max(beta[t][word_index], floor) * max(gamma[t], floor), then the
/// weights are normalized to sum to 1.0. Returns an empty vector when there
/// are no usable topics.
fn compute_topic_weights(
    beta_probabilities: &[Vec<f64>],
    gamma: &[f64],
    word_index: usize,
    floor: f64,
) -> Vec<f64> {
    let mut weights: Vec<f64> = beta_probabilities
        .iter()
        .zip(gamma.iter())
        .map(|(topic_row, &gamma_value)| {
            let beta_value = topic_row.get(word_index).copied().unwrap_or(floor).max(floor);
            beta_value * gamma_value.max(floor)
        })
        .collect();

    // Inline normalization (same contract as normalize_in_place): a
    // degenerate total falls back to the uniform distribution.
    let total: f64 = weights.iter().sum();
    if total.is_finite() && total > 0.0 {
        for weight in weights.iter_mut() {
            *weight /= total;
        }
    } else if !weights.is_empty() {
        let uniform = 1.0 / weights.len() as f64;
        weights.fill(uniform);
    }

    weights
}
68
+
69
+ fn topic_weights_for_word(
70
+ beta_probabilities: Vec<Vec<f64>>,
71
+ gamma: Vec<f64>,
72
+ word_index: usize,
73
+ min_probability: f64,
74
+ ) -> Vec<f64> {
75
+ let floor = floor_value(min_probability);
76
+ compute_topic_weights(&beta_probabilities, &gamma, word_index, floor)
77
+ }
78
+
79
/// Accumulate expected topic–term counts for one document, in place.
///
/// For each word w with count c and responsibilities phi_d[w], adds
/// c * phi_d[w][t] into topic_term_counts[t][word]. Missing counts, zero
/// counts, missing phi rows, and word indices outside a topic row are all
/// skipped. Idiom/perf fix over the original: iterate the topic rows with
/// `iter_mut()` instead of re-doing `get_mut(topic_index)` plus a manual
/// length check on every (word, topic) pair.
fn accumulate_topic_term_counts_in_place(
    topic_term_counts: &mut [Vec<f64>],
    phi_d: &[Vec<f64>],
    words: &[usize],
    counts: &[f64],
) {
    if topic_term_counts.is_empty() {
        return;
    }

    for (word_offset, &word_index) in words.iter().enumerate() {
        let count = counts.get(word_offset).copied().unwrap_or(0.0);
        if count == 0.0 {
            continue;
        }

        let Some(phi_row) = phi_d.get(word_offset) else {
            continue;
        };

        // Topics beyond phi_row's length contribute phi = 0.0, i.e. a no-op
        // add — same behavior as the original's unwrap_or(0.0).
        for (topic_index, topic_terms) in topic_term_counts.iter_mut().enumerate() {
            let phi_value = phi_row.get(topic_index).copied().unwrap_or(0.0);
            if let Some(cell) = topic_terms.get_mut(word_index) {
                *cell += count * phi_value;
            }
        }
    }
}
110
+
111
+ fn accumulate_topic_term_counts(
112
+ mut topic_term_counts: Vec<Vec<f64>>,
113
+ phi_d: Vec<Vec<f64>>,
114
+ words: Vec<usize>,
115
+ counts: Vec<f64>,
116
+ ) -> Vec<Vec<f64>> {
117
+ accumulate_topic_term_counts_in_place(
118
+ topic_term_counts.as_mut_slice(),
119
+ phi_d.as_slice(),
120
+ words.as_slice(),
121
+ counts.as_slice(),
122
+ );
123
+ topic_term_counts
124
+ }
125
+
126
/// Convert raw topic–term counts into per-topic probability rows plus their
/// elementwise logs.
///
/// Each row is floored (non-finite entries replaced by the floor), normalized
/// to sum to 1.0, and paired with `ln(max(value, floor))` per entry.
/// Returns `(beta_probabilities, beta_log)`.
fn normalize_topic_term_counts(
    topic_term_counts: Vec<Vec<f64>>,
    min_probability: f64,
) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
    // Same sanitization as floor_value().
    let floor = if min_probability.is_finite() && min_probability > 0.0 {
        min_probability
    } else {
        1.0e-12
    };

    let mut beta_probabilities = Vec::with_capacity(topic_term_counts.len());
    let mut beta_log = Vec::with_capacity(topic_term_counts.len());

    for topic_counts in &topic_term_counts {
        let mut normalized: Vec<f64> = topic_counts
            .iter()
            .map(|&value| if value.is_finite() { value.max(floor) } else { floor })
            .collect();

        // Same contract as normalize_in_place(): degenerate totals fall back
        // to the uniform distribution.
        let total: f64 = normalized.iter().sum();
        if total.is_finite() && total > 0.0 {
            normalized.iter_mut().for_each(|v| *v /= total);
        } else if !normalized.is_empty() {
            let uniform = 1.0 / normalized.len() as f64;
            normalized.fill(uniform);
        }

        beta_log.push(normalized.iter().map(|&v| v.max(floor).ln()).collect());
        beta_probabilities.push(normalized);
    }

    (beta_probabilities, beta_log)
}
160
+
161
/// Mean absolute elementwise change between two gamma matrices.
///
/// Cells are enumerated from `previous_gamma`; a cell missing from
/// `current_gamma` contributes a shift of 0 but still counts toward the
/// mean's denominator. Returns 0.0 when `previous_gamma` has no cells.
fn average_gamma_shift(previous_gamma: Vec<Vec<f64>>, current_gamma: Vec<Vec<f64>>) -> f64 {
    let cells: usize = previous_gamma.iter().map(Vec::len).sum();
    if cells == 0 {
        return 0.0;
    }

    let total_shift: f64 = previous_gamma
        .iter()
        .enumerate()
        .flat_map(|(row_index, previous_row)| {
            let current_row = current_gamma.get(row_index);
            previous_row
                .iter()
                .enumerate()
                .map(move |(col_index, &previous_value)| {
                    let current_value = current_row
                        .and_then(|row| row.get(col_index))
                        .copied()
                        .unwrap_or(previous_value);
                    (previous_value - current_value).abs()
                })
        })
        .sum();

    total_shift / cells as f64
}
185
+
186
/// Per-document, per-topic log-probability mass.
///
/// For document d: tops[t] = sum over words w of
/// counts[w] * ln(max(phi[d][w][t], floor)), divided by the document's total
/// count when that total is finite and positive. Documents without a phi
/// entry yield all-zero rows; zero-count words are skipped.
fn topic_document_probability(
    phi_tensor: Vec<Vec<Vec<f64>>>,
    document_counts: Vec<Vec<f64>>,
    num_topics: usize,
    min_probability: f64,
) -> Vec<Vec<f64>> {
    // Same sanitization as floor_value().
    let floor = if min_probability.is_finite() && min_probability > 0.0 {
        min_probability
    } else {
        1.0e-12
    };

    document_counts
        .iter()
        .enumerate()
        .map(|(doc_index, counts)| {
            let mut tops = vec![0.0_f64; num_topics];
            let total: f64 = counts.iter().sum();

            if let Some(doc_phi) = phi_tensor.get(doc_index) {
                for (word_index, word_dist) in doc_phi.iter().enumerate() {
                    let count = counts.get(word_index).copied().unwrap_or(0.0);
                    if count == 0.0 {
                        continue;
                    }
                    for (topic_index, slot) in tops.iter_mut().enumerate() {
                        let prob = word_dist
                            .get(topic_index)
                            .copied()
                            .unwrap_or(floor)
                            .max(floor);
                        *slot += prob.ln() * count;
                    }
                }
            }

            if total.is_finite() && total > 0.0 {
                tops.iter_mut().for_each(|value| *value /= total);
            }
            tops
        })
        .collect()
}
224
+
225
/// Deterministic seed for the topic–term matrix: document d's word counts are
/// credited to topic d % topics, then each topic row is normalized.
///
/// Every cell starts at the floor so all probabilities stay strictly
/// positive after normalization. Out-of-range word indices and
/// missing/zero/non-finite counts are skipped. Returns an empty matrix when
/// either dimension is zero.
fn seeded_topic_term_probabilities(
    document_words: Vec<Vec<usize>>,
    document_counts: Vec<Vec<f64>>,
    topics: usize,
    terms: usize,
    min_probability: f64,
) -> Vec<Vec<f64>> {
    if topics == 0 || terms == 0 {
        return Vec::new();
    }

    // Same sanitization as floor_value().
    let floor = if min_probability.is_finite() && min_probability > 0.0 {
        min_probability
    } else {
        1.0e-12
    };
    let mut topic_term_counts = vec![vec![floor; terms]; topics];

    for (doc_index, words) in document_words.iter().enumerate() {
        // Round-robin topic assignment; hoist the target row out of the
        // inner word loop.
        let target_row = &mut topic_term_counts[doc_index % topics];
        let counts = document_counts.get(doc_index);

        for (word_offset, &word_index) in words.iter().enumerate() {
            if word_index >= terms {
                continue;
            }
            let count = counts
                .and_then(|row| row.get(word_offset))
                .copied()
                .unwrap_or(0.0);
            if count.is_finite() && count != 0.0 {
                target_row[word_index] += count;
            }
        }
    }

    // Same contract as normalize_in_place(), applied per row (rows are
    // non-empty because terms > 0).
    for row in topic_term_counts.iter_mut() {
        let total: f64 = row.iter().sum();
        if total.is_finite() && total > 0.0 {
            row.iter_mut().for_each(|v| *v /= total);
        } else {
            let uniform = 1.0 / row.len() as f64;
            row.fill(uniform);
        }
    }

    topic_term_counts
}
266
+
267
+ fn infer_document_internal(
268
+ beta_probabilities: &[Vec<f64>],
269
+ gamma_initial: &[f64],
270
+ words: &[usize],
271
+ counts: &[f64],
272
+ max_iter: i64,
273
+ convergence: f64,
274
+ min_probability: f64,
275
+ init_alpha: f64,
276
+ ) -> (Vec<f64>, Vec<Vec<f64>>) {
277
+ let topics = gamma_initial.len().min(beta_probabilities.len());
278
+ if topics == 0 {
279
+ return (Vec::new(), Vec::new());
280
+ }
281
+
282
+ let floor = floor_value(min_probability);
283
+ let init_alpha_value = if init_alpha.is_finite() {
284
+ init_alpha
285
+ } else {
286
+ 0.3
287
+ };
288
+ let convergence_value = if convergence.is_finite() && convergence >= 0.0 {
289
+ convergence
290
+ } else {
291
+ 1.0e-6
292
+ };
293
+ let max_iter_value = if max_iter <= 0 { 1 } else { max_iter as usize };
294
+
295
+ let mut gamma_d = gamma_initial.iter().copied().take(topics).collect::<Vec<_>>();
296
+ if gamma_d.len() < topics {
297
+ gamma_d.resize(topics, init_alpha_value);
298
+ }
299
+
300
+ let mut phi_d = vec![vec![1.0 / topics as f64; topics]; words.len()];
301
+
302
+ for _ in 0..max_iter_value {
303
+ let mut gamma_next = vec![init_alpha_value; topics];
304
+
305
+ for (word_offset, &word_index) in words.iter().enumerate() {
306
+ let topic_weights = compute_topic_weights(beta_probabilities, &gamma_d, word_index, floor);
307
+ phi_d[word_offset] = topic_weights.clone();
308
+
309
+ let count = counts.get(word_offset).copied().unwrap_or(0.0);
310
+ if count == 0.0 {
311
+ continue;
312
+ }
313
+
314
+ for topic_index in 0..topics {
315
+ gamma_next[topic_index] += count * topic_weights[topic_index];
316
+ }
317
+ }
318
+
319
+ let mut gamma_shift = 0.0_f64;
320
+ for topic_index in 0..topics {
321
+ let delta = (gamma_d[topic_index] - gamma_next[topic_index]).abs();
322
+ if delta > gamma_shift {
323
+ gamma_shift = delta;
324
+ }
325
+ }
326
+
327
+ gamma_d = gamma_next;
328
+ if gamma_shift <= convergence_value {
329
+ break;
330
+ }
331
+ }
332
+
333
+ (gamma_d, phi_d)
334
+ }
335
+
336
+ fn infer_document(
337
+ beta_probabilities: Vec<Vec<f64>>,
338
+ gamma_initial: Vec<f64>,
339
+ words: Vec<usize>,
340
+ counts: Vec<f64>,
341
+ max_iter: i64,
342
+ convergence: f64,
343
+ min_probability: f64,
344
+ init_alpha: f64,
345
+ ) -> Vec<Vec<f64>> {
346
+ let (gamma_d, phi_d) = infer_document_internal(
347
+ beta_probabilities.as_slice(),
348
+ gamma_initial.as_slice(),
349
+ words.as_slice(),
350
+ counts.as_slice(),
351
+ max_iter,
352
+ convergence,
353
+ min_probability,
354
+ init_alpha,
355
+ );
356
+
357
+ let mut output = Vec::with_capacity(phi_d.len() + 1);
358
+ output.push(gamma_d);
359
+ output.extend(phi_d);
360
+ output
361
+ }
362
+
363
+ fn infer_corpus_iteration(
364
+ beta_probabilities: Vec<Vec<f64>>,
365
+ document_words: Vec<Vec<usize>>,
366
+ document_counts: Vec<Vec<f64>>,
367
+ max_iter: i64,
368
+ convergence: f64,
369
+ min_probability: f64,
370
+ init_alpha: f64,
371
+ ) -> (Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>, Vec<Vec<f64>>) {
372
+ let topics = beta_probabilities.len();
373
+ if topics == 0 {
374
+ return (Vec::new(), Vec::new(), Vec::new());
375
+ }
376
+
377
+ let terms = beta_probabilities
378
+ .iter()
379
+ .map(|row| row.len())
380
+ .max()
381
+ .unwrap_or(0);
382
+ let floor = floor_value(min_probability);
383
+ let init_alpha_value = if init_alpha.is_finite() { init_alpha } else { 0.3 };
384
+
385
+ let mut topic_term_counts = vec![vec![floor; terms]; topics];
386
+ let mut gamma_matrix = Vec::with_capacity(document_words.len());
387
+ let mut phi_tensor = Vec::with_capacity(document_words.len());
388
+
389
+ for (doc_index, words) in document_words.iter().enumerate() {
390
+ let counts = document_counts.get(doc_index).cloned().unwrap_or_else(|| vec![0.0; words.len()]);
391
+ let total: f64 = counts.iter().sum();
392
+ let gamma_initial = vec![init_alpha_value + (total / topics as f64); topics];
393
+
394
+ let (gamma_d, phi_d) = infer_document_internal(
395
+ beta_probabilities.as_slice(),
396
+ gamma_initial.as_slice(),
397
+ words.as_slice(),
398
+ counts.as_slice(),
399
+ max_iter,
400
+ convergence,
401
+ min_probability,
402
+ init_alpha,
403
+ );
404
+
405
+ accumulate_topic_term_counts_in_place(
406
+ topic_term_counts.as_mut_slice(),
407
+ phi_d.as_slice(),
408
+ words.as_slice(),
409
+ counts.as_slice(),
410
+ );
411
+
412
+ gamma_matrix.push(gamma_d);
413
+ phi_tensor.push(phi_d);
414
+ }
415
+
416
+ (gamma_matrix, phi_tensor, topic_term_counts)
417
+ }
418
+
419
+ #[magnus::init]
420
+ fn init() -> Result<(), Error> {
421
+ let lda_module = define_module("Lda")?;
422
+ let rust_backend_module = lda_module.define_module("RustBackend")?;
423
+
424
+ rust_backend_module.define_singleton_method("available?", function!(available, 0))?;
425
+ rust_backend_module.define_singleton_method("abi_version", function!(abi_version, 0))?;
426
+ rust_backend_module.define_singleton_method("before_em", function!(before_em, 3))?;
427
+ rust_backend_module.define_singleton_method(
428
+ "topic_weights_for_word",
429
+ function!(topic_weights_for_word, 4),
430
+ )?;
431
+ rust_backend_module.define_singleton_method(
432
+ "accumulate_topic_term_counts",
433
+ function!(accumulate_topic_term_counts, 4),
434
+ )?;
435
+ rust_backend_module.define_singleton_method("infer_document", function!(infer_document, 8))?;
436
+ rust_backend_module.define_singleton_method(
437
+ "infer_corpus_iteration",
438
+ function!(infer_corpus_iteration, 7),
439
+ )?;
440
+ rust_backend_module.define_singleton_method(
441
+ "normalize_topic_term_counts",
442
+ function!(normalize_topic_term_counts, 2),
443
+ )?;
444
+ rust_backend_module
445
+ .define_singleton_method("average_gamma_shift", function!(average_gamma_shift, 2))?;
446
+ rust_backend_module.define_singleton_method(
447
+ "topic_document_probability",
448
+ function!(topic_document_probability, 4),
449
+ )?;
450
+ rust_backend_module.define_singleton_method(
451
+ "seeded_topic_term_probabilities",
452
+ function!(seeded_topic_term_probabilities, 5),
453
+ )?;
454
+
455
+ Ok(())
456
+ }
data/lda-ruby.gemspec CHANGED
Binary file
@@ -0,0 +1,129 @@
1
# frozen_string_literal: true

module Lda
  module Backends
    # Shared configuration and contract for every LDA backend implementation.
    # Subclasses must implement #em, #beta, #gamma, #compute_phi, and #model.
    class Base
      attr_reader :corpus

      attr_accessor :max_iter, :convergence, :em_max_iter, :em_convergence,
                    :num_topics, :init_alpha, :est_alpha, :verbose

      # random_seed: optional seed for deterministic runs; a fresh Random is
      # created when no seed is given.
      def initialize(random_seed: nil)
        @random = random_seed.nil? ? Random.new : Random.new(random_seed)

        # Historical lda-ruby defaults.
        @max_iter       = 20
        @convergence    = 1e-6
        @em_max_iter    = 100
        @em_convergence = 1e-4
        @num_topics     = 20
        @init_alpha     = 0.3
        @est_alpha      = 1
        @verbose        = true

        @corpus = nil
      end

      # Short identifier derived from the class name, e.g. "base".
      def name
        self.class.name.split("::").last.downcase
      end

      # Attach a corpus; returns true for interface compatibility.
      def corpus=(corpus)
        @corpus = corpus
        true
      end

      def fast_load_corpus_from_file(filename)
        self.corpus = Lda::DataCorpus.new(filename)
      end

      # Parse a whitespace-delimited "key value" settings file. Blank lines
      # and lines starting with "#" are skipped; unknown keys are ignored.
      # Always returns true.
      def load_settings(settings_file)
        File.readlines(settings_file).each do |line|
          stripped = line.strip
          next if stripped.empty? || stripped.start_with?("#")

          # NOTE: the raw (un-stripped) line is split, matching the original
          # parser's behavior for indented lines.
          key, value = line.split(/\s+/, 2)
          next if value.nil?

          case key.downcase
          when "max_iter", "var_max_iter"             then self.max_iter = value.to_i
          when "convergence", "var_converged"         then self.convergence = value.to_f
          when "em_max_iter"                          then self.em_max_iter = value.to_i
          when "em_convergence", "em_converged"       then self.em_convergence = value.to_f
          when "num_topics", "ntopics"                then self.num_topics = value.to_i
          when "init_alpha", "initial_alpha", "alpha" then self.init_alpha = value.to_f
          when "est_alpha", "estimate_alpha"          then self.est_alpha = value.to_i
          when "verbose"                              then self.verbose = value.to_i != 0
          end
        end

        true
      end

      # Bulk setter mirroring the historical C-extension interface.
      # Always returns true.
      def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
        self.init_alpha     = init_alpha
        self.num_topics     = num_topics
        self.max_iter       = max_iter
        self.convergence    = convergence
        self.em_max_iter    = em_max_iter
        self.em_convergence = em_convergence
        self.est_alpha      = est_alpha
        true
      end

      def em(_start)
        raise NotImplementedError, "#{self.class} must implement #em"
      end

      def beta
        raise NotImplementedError, "#{self.class} must implement #beta"
      end

      def gamma
        raise NotImplementedError, "#{self.class} must implement #gamma"
      end

      def compute_phi
        raise NotImplementedError, "#{self.class} must implement #compute_phi"
      end

      def model
        raise NotImplementedError, "#{self.class} must implement #model"
      end

      # Optional hook; backends without a native implementation return nil.
      def topic_document_probability(_phi_matrix, _document_counts)
        nil
      end

      private

      # Scale +weights+ in place so they sum to 1.0; a non-positive total
      # falls back to the uniform distribution. Returns the mutated array.
      def normalize!(weights)
        total = weights.sum.to_f

        if total <= 0.0
          uniform = 1.0 / weights.size
          weights.map! { uniform }
        else
          weights.map! { |w| w / total }
        end
      end

      # Deep copy via Marshal round-trip.
      def clone_matrix(matrix)
        Marshal.load(Marshal.dump(matrix))
      end
    end
  end
end