lda-ruby 0.3.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +5 -13
  2. data/CHANGELOG.md +16 -0
  3. data/Gemfile +9 -0
  4. data/README.md +126 -3
  5. data/VERSION.yml +3 -3
  6. data/docs/modernization-handoff.md +233 -0
  7. data/docs/porting-strategy.md +148 -0
  8. data/docs/precompiled-platform-policy.md +81 -0
  9. data/docs/precompiled-target-evaluation.md +67 -0
  10. data/docs/release-runbook.md +192 -0
  11. data/docs/rust-orchestration-guardrails.md +50 -0
  12. data/ext/lda-ruby/cokus.c +10 -11
  13. data/ext/lda-ruby/cokus.h +3 -3
  14. data/ext/lda-ruby/extconf.rb +10 -6
  15. data/ext/lda-ruby/lda-inference.c +23 -7
  16. data/ext/lda-ruby/utils.c +8 -0
  17. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  18. data/ext/lda-ruby-rust/README.md +73 -0
  19. data/ext/lda-ruby-rust/extconf.rb +135 -0
  20. data/ext/lda-ruby-rust/include/strings.h +35 -0
  21. data/ext/lda-ruby-rust/src/lib.rs +1263 -0
  22. data/lda-ruby.gemspec +0 -0
  23. data/lib/lda-ruby/backends/base.rb +133 -0
  24. data/lib/lda-ruby/backends/native.rb +158 -0
  25. data/lib/lda-ruby/backends/pure_ruby.rb +675 -0
  26. data/lib/lda-ruby/backends/rust.rb +607 -0
  27. data/lib/lda-ruby/backends.rb +58 -0
  28. data/lib/lda-ruby/corpus/corpus.rb +17 -15
  29. data/lib/lda-ruby/corpus/data_corpus.rb +2 -2
  30. data/lib/lda-ruby/corpus/directory_corpus.rb +2 -2
  31. data/lib/lda-ruby/corpus/text_corpus.rb +2 -2
  32. data/lib/lda-ruby/document/document.rb +6 -6
  33. data/lib/lda-ruby/document/text_document.rb +5 -4
  34. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  35. data/lib/lda-ruby/version.rb +5 -0
  36. data/lib/lda-ruby.rb +293 -48
  37. data/test/backend_compatibility_test.rb +146 -0
  38. data/test/backends_selection_test.rb +100 -0
  39. data/test/benchmark_scripts_test.rb +23 -0
  40. data/test/gemspec_test.rb +27 -0
  41. data/test/lda_ruby_test.rb +49 -11
  42. data/test/packaged_gem_smoke_test.rb +33 -0
  43. data/test/pure_ruby_orchestration_test.rb +109 -0
  44. data/test/release_scripts_test.rb +93 -0
  45. data/test/rust_build_policy_test.rb +23 -0
  46. data/test/rust_orchestration_test.rb +911 -0
  47. data/test/simple_pipeline_test.rb +22 -0
  48. data/test/simple_yaml.rb +1 -7
  49. data/test/test_helper.rb +5 -6
  50. metadata +54 -38
  51. data/Rakefile +0 -61
  52. data/ext/lda-ruby/Makefile +0 -181
  53. data/test/data/.gitignore +0 -2
  54. data/test/simple_test.rb +0 -26
@@ -0,0 +1,1263 @@
1
+ use magnus::{define_module, function, Error, Module, Object};
2
+ use std::collections::HashMap;
3
+ use std::sync::atomic::{AtomicU64, Ordering};
4
+ use std::sync::{Arc, Mutex, OnceLock};
5
+
6
/// Reports whether this Rust backend can be used; always true once the
/// extension has loaded.
fn available() -> bool {
    true
}
9
+
10
/// ABI revision this backend implements; callers compare it against the
/// revision they were built for.
fn abi_version() -> i64 {
    1
}
13
+
14
/// Pre-EM hook. The arguments are accepted for interface compatibility but
/// are currently unused; the hook always signals "proceed".
fn before_em(_start: String, _num_docs: i64, _num_terms: i64) -> bool {
    true
}
17
+
18
/// Resolve the probability floor: `min_probability` when it is a positive
/// finite number, otherwise the default floor 1.0e-12.
fn floor_value(min_probability: f64) -> f64 {
    match min_probability {
        p if p.is_finite() && p > 0.0 => p,
        _ => 1.0e-12,
    }
}
25
+
26
/// Rescale `weights` in place so they sum to 1.0.
///
/// When the current sum is non-finite or non-positive the slice is reset to
/// a uniform distribution instead; an empty slice is left untouched.
fn normalize_in_place(weights: &mut [f64]) {
    let sum: f64 = weights.iter().sum();

    if sum.is_finite() && sum > 0.0 {
        weights.iter_mut().for_each(|w| *w /= sum);
    } else {
        let n = weights.len();
        let uniform = if n == 0 { 0.0 } else { 1.0 / n as f64 };
        weights.iter_mut().for_each(|w| *w = uniform);
    }
}
45
+
46
/// Hyper-parameters cached per corpus session so repeat EM runs can detect
/// whether the caller changed any setting (via `PartialEq`).
#[derive(Clone, PartialEq)]
struct SessionConfig {
    /// Number of topics to fit.
    topics: usize,
    /// Per-document inference iteration cap.
    max_iter: i64,
    /// Per-document convergence threshold.
    convergence: f64,
    /// Outer EM iteration cap.
    em_max_iter: i64,
    /// Outer EM convergence threshold (average gamma shift).
    em_convergence: f64,
    /// Base value used when seeding gamma.
    init_alpha: f64,
    /// Probability floor applied during normalization.
    min_probability: f64,
}
56
+
57
/// Immutable snapshot of a corpus: per-document word ids, the matching
/// per-document counts, and the vocabulary size.
struct CorpusSessionData {
    /// For each document, the word indices it contains.
    document_words: Vec<Vec<usize>>,
    /// For each document, counts aligned with `document_words`.
    document_counts: Vec<Vec<f64>>,
    /// Vocabulary size (number of distinct terms).
    terms: usize,
}
62
+
63
+ struct CorpusSession {
64
+ data: Arc<CorpusSessionData>,
65
+ config: Option<SessionConfig>,
66
+ }
67
+
68
+ static CORPUS_SESSIONS: OnceLock<Mutex<HashMap<u64, CorpusSession>>> = OnceLock::new();
69
+ static NEXT_CORPUS_SESSION_ID: AtomicU64 = AtomicU64::new(1);
70
+
71
+ fn corpus_sessions() -> &'static Mutex<HashMap<u64, CorpusSession>> {
72
+ CORPUS_SESSIONS.get_or_init(|| Mutex::new(HashMap::new()))
73
+ }
74
+
75
+ fn corpus_session_count() -> i64 {
76
+ match corpus_sessions().lock() {
77
+ Ok(sessions) => sessions.len() as i64,
78
+ Err(_) => 0,
79
+ }
80
+ }
81
+
82
+ fn corpus_session_exists(session_id: i64) -> bool {
83
+ if session_id <= 0 {
84
+ return false;
85
+ }
86
+
87
+ let session_key = session_id as u64;
88
+ match corpus_sessions().lock() {
89
+ Ok(sessions) => sessions.contains_key(&session_key),
90
+ Err(_) => false,
91
+ }
92
+ }
93
+
94
/// The all-empty `(beta, beta_log, gamma, phi)` tuple used to signal a
/// failed or impossible EM run.
fn empty_em_output() -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
    Default::default()
}
97
+
98
/// Failure value for managed-session EM calls: session id 0 (invalid) plus
/// the all-empty EM outputs.
fn empty_managed_session_em_output(
) -> (
    i64,
    Vec<Vec<f64>>,
    Vec<Vec<f64>>,
    Vec<Vec<f64>>,
    Vec<Vec<Vec<f64>>>,
) {
    Default::default()
}
108
+
109
/// Minimal xorshift64* PRNG used for reproducible "random" initialization.
struct XorShift64 {
    state: u64,
}

impl XorShift64 {
    /// Seed the generator. A zero seed — which xorshift cannot use, since
    /// zero is a fixed point — is remapped to a fixed non-zero constant.
    fn new(seed: i64) -> Self {
        let raw = seed as u64;
        Self {
            state: if raw == 0 { 0x9E37_79B9_7F4A_7C15 } else { raw },
        }
    }

    /// Advance the state and return the next 64-bit output (xorshift64*).
    fn next_u64(&mut self) -> u64 {
        let mut word = self.state;
        word ^= word >> 12;
        word ^= word << 25;
        word ^= word >> 27;
        self.state = word;
        word.wrapping_mul(0x2545_F491_4F6C_DD1D)
    }

    /// Uniform draw in [0, 1) built from the top 53 random bits (the full
    /// precision of an f64 mantissa).
    fn next_f64_unit(&mut self) -> f64 {
        (self.next_u64() >> 11) as f64 / (1_u64 << 53) as f64
    }
}
138
+
139
+ fn compute_topic_weights(
140
+ beta_probabilities: &[Vec<f64>],
141
+ gamma: &[f64],
142
+ word_index: usize,
143
+ floor: f64,
144
+ ) -> Vec<f64> {
145
+ let topics = gamma.len().min(beta_probabilities.len());
146
+ if topics == 0 {
147
+ return Vec::new();
148
+ }
149
+
150
+ let mut weights = Vec::with_capacity(topics);
151
+ for topic_index in 0..topics {
152
+ let beta_value = beta_probabilities[topic_index]
153
+ .get(word_index)
154
+ .copied()
155
+ .unwrap_or(floor)
156
+ .max(floor);
157
+ let gamma_value = gamma[topic_index].max(floor);
158
+ weights.push(beta_value * gamma_value);
159
+ }
160
+
161
+ normalize_in_place(&mut weights);
162
+ weights
163
+ }
164
+
165
+ fn topic_weights_for_word(
166
+ beta_probabilities: Vec<Vec<f64>>,
167
+ gamma: Vec<f64>,
168
+ word_index: usize,
169
+ min_probability: f64,
170
+ ) -> Vec<f64> {
171
+ let floor = floor_value(min_probability);
172
+ compute_topic_weights(&beta_probabilities, &gamma, word_index, floor)
173
+ }
174
+
175
/// Fold one document's phi matrix into running topic-term counts:
/// `topic_term_counts[k][word] += count * phi_d[offset][k]`.
///
/// Zero-count words, missing phi rows, missing phi entries (treated as 0.0),
/// and out-of-range word indices are all skipped safely.
fn accumulate_topic_term_counts_in_place(
    topic_term_counts: &mut [Vec<f64>],
    phi_d: &[Vec<f64>],
    words: &[usize],
    counts: &[f64],
) {
    if topic_term_counts.is_empty() {
        return;
    }

    for (word_offset, &word_index) in words.iter().enumerate() {
        let count = counts.get(word_offset).copied().unwrap_or(0.0);
        if count == 0.0 {
            continue;
        }

        let phi_row = match phi_d.get(word_offset) {
            Some(row) => row,
            None => continue,
        };

        for (topic_index, topic_terms) in topic_term_counts.iter_mut().enumerate() {
            let phi_value = phi_row.get(topic_index).copied().unwrap_or(0.0);
            if let Some(cell) = topic_terms.get_mut(word_index) {
                *cell += count * phi_value;
            }
        }
    }
}
206
+
207
+ fn accumulate_topic_term_counts(
208
+ mut topic_term_counts: Vec<Vec<f64>>,
209
+ phi_d: Vec<Vec<f64>>,
210
+ words: Vec<usize>,
211
+ counts: Vec<f64>,
212
+ ) -> Vec<Vec<f64>> {
213
+ accumulate_topic_term_counts_in_place(
214
+ topic_term_counts.as_mut_slice(),
215
+ phi_d.as_slice(),
216
+ words.as_slice(),
217
+ counts.as_slice(),
218
+ );
219
+ topic_term_counts
220
+ }
221
+
222
+ fn normalize_topic_term_counts(
223
+ topic_term_counts: Vec<Vec<f64>>,
224
+ min_probability: f64,
225
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
226
+ let floor = floor_value(min_probability);
227
+
228
+ let mut beta_probabilities = Vec::with_capacity(topic_term_counts.len());
229
+ let mut beta_log = Vec::with_capacity(topic_term_counts.len());
230
+
231
+ for topic_counts in topic_term_counts.iter() {
232
+ let mut normalized = topic_counts
233
+ .iter()
234
+ .map(|value| {
235
+ if value.is_finite() {
236
+ value.max(floor)
237
+ } else {
238
+ floor
239
+ }
240
+ })
241
+ .collect::<Vec<_>>();
242
+
243
+ normalize_in_place(&mut normalized);
244
+
245
+ let topic_log = normalized
246
+ .iter()
247
+ .map(|value| value.max(floor).ln())
248
+ .collect::<Vec<_>>();
249
+
250
+ beta_probabilities.push(normalized);
251
+ beta_log.push(topic_log);
252
+ }
253
+
254
+ (beta_probabilities, beta_log)
255
+ }
256
+
257
/// Mean absolute element-wise difference between two gamma matrices.
///
/// Iteration is driven by `previous_gamma`; any entry missing from
/// `current_gamma` contributes zero shift (the previous value is reused).
/// An empty previous matrix yields 0.0.
fn average_gamma_shift_internal(previous_gamma: &[Vec<f64>], current_gamma: &[Vec<f64>]) -> f64 {
    let entries: usize = previous_gamma.iter().map(Vec::len).sum();
    if entries == 0 {
        return 0.0;
    }

    let mut total_shift = 0.0_f64;
    for (row_index, previous_row) in previous_gamma.iter().enumerate() {
        let current_row = current_gamma.get(row_index);
        for (col_index, &previous_value) in previous_row.iter().enumerate() {
            let current_value = current_row
                .and_then(|row| row.get(col_index))
                .copied()
                .unwrap_or(previous_value);
            total_shift += (previous_value - current_value).abs();
        }
    }

    total_shift / entries as f64
}
281
+
282
+ fn average_gamma_shift(previous_gamma: Vec<Vec<f64>>, current_gamma: Vec<Vec<f64>>) -> f64 {
283
+ average_gamma_shift_internal(previous_gamma.as_slice(), current_gamma.as_slice())
284
+ }
285
+
286
+ fn topic_document_probability(
287
+ phi_tensor: Vec<Vec<Vec<f64>>>,
288
+ document_counts: Vec<Vec<f64>>,
289
+ num_topics: usize,
290
+ min_probability: f64,
291
+ ) -> Vec<Vec<f64>> {
292
+ let floor = floor_value(min_probability);
293
+ let mut output = Vec::with_capacity(document_counts.len());
294
+
295
+ for (doc_index, counts) in document_counts.iter().enumerate() {
296
+ let mut tops = vec![0.0_f64; num_topics];
297
+ let ttl: f64 = counts.iter().copied().sum();
298
+
299
+ if let Some(doc_phi) = phi_tensor.get(doc_index) {
300
+ for (word_index, word_dist) in doc_phi.iter().enumerate() {
301
+ let count = counts.get(word_index).copied().unwrap_or(0.0);
302
+ if count == 0.0 {
303
+ continue;
304
+ }
305
+
306
+ for topic_index in 0..num_topics {
307
+ let top_prob = word_dist.get(topic_index).copied().unwrap_or(floor).max(floor);
308
+ tops[topic_index] += top_prob.ln() * count;
309
+ }
310
+ }
311
+ }
312
+
313
+ if ttl.is_finite() && ttl > 0.0 {
314
+ for value in tops.iter_mut() {
315
+ *value /= ttl;
316
+ }
317
+ }
318
+
319
+ output.push(tops);
320
+ }
321
+
322
+ output
323
+ }
324
+
325
+ fn seeded_topic_term_probabilities_internal(
326
+ document_words: &[Vec<usize>],
327
+ document_counts: &[Vec<f64>],
328
+ topics: usize,
329
+ terms: usize,
330
+ min_probability: f64,
331
+ ) -> Vec<Vec<f64>> {
332
+ if topics == 0 || terms == 0 {
333
+ return Vec::new();
334
+ }
335
+
336
+ let floor = floor_value(min_probability);
337
+ let mut topic_term_counts = vec![vec![floor; terms]; topics];
338
+
339
+ for (doc_index, words) in document_words.iter().enumerate() {
340
+ let topic_index = doc_index % topics;
341
+ let counts = document_counts.get(doc_index);
342
+
343
+ for (word_offset, &word_index) in words.iter().enumerate() {
344
+ if word_index >= terms {
345
+ continue;
346
+ }
347
+
348
+ let count = counts
349
+ .and_then(|row| row.get(word_offset))
350
+ .copied()
351
+ .unwrap_or(0.0);
352
+ if !count.is_finite() || count == 0.0 {
353
+ continue;
354
+ }
355
+
356
+ topic_term_counts[topic_index][word_index] += count;
357
+ }
358
+ }
359
+
360
+ for row in topic_term_counts.iter_mut() {
361
+ normalize_in_place(row);
362
+ }
363
+
364
+ topic_term_counts
365
+ }
366
+
367
+ fn seeded_topic_term_probabilities(
368
+ document_words: Vec<Vec<usize>>,
369
+ document_counts: Vec<Vec<f64>>,
370
+ topics: usize,
371
+ terms: usize,
372
+ min_probability: f64,
373
+ ) -> Vec<Vec<f64>> {
374
+ seeded_topic_term_probabilities_internal(
375
+ document_words.as_slice(),
376
+ document_counts.as_slice(),
377
+ topics,
378
+ terms,
379
+ min_probability,
380
+ )
381
+ }
382
+
383
+ fn random_topic_term_probabilities(
384
+ topics: usize,
385
+ terms: usize,
386
+ min_probability: f64,
387
+ random_seed: i64,
388
+ ) -> Vec<Vec<f64>> {
389
+ if topics == 0 || terms == 0 {
390
+ return Vec::new();
391
+ }
392
+
393
+ let floor = floor_value(min_probability);
394
+ let mut rng = XorShift64::new(random_seed);
395
+ let mut matrix = Vec::with_capacity(topics);
396
+
397
+ for _ in 0..topics {
398
+ let mut weights = Vec::with_capacity(terms);
399
+ for _ in 0..terms {
400
+ weights.push(rng.next_f64_unit() + floor);
401
+ }
402
+ normalize_in_place(&mut weights);
403
+ matrix.push(weights);
404
+ }
405
+
406
+ matrix
407
+ }
408
+
409
+ fn corpus_session_data(
410
+ document_words: &[Vec<usize>],
411
+ document_counts: &[Vec<f64>],
412
+ terms: usize,
413
+ ) -> Arc<CorpusSessionData> {
414
+ Arc::new(CorpusSessionData {
415
+ document_words: document_words.to_vec(),
416
+ document_counts: document_counts.to_vec(),
417
+ terms,
418
+ })
419
+ }
420
+
421
+ fn create_corpus_session_internal(
422
+ document_words: &[Vec<usize>],
423
+ document_counts: &[Vec<f64>],
424
+ terms: usize,
425
+ ) -> i64 {
426
+ let session_id = NEXT_CORPUS_SESSION_ID.fetch_add(1, Ordering::Relaxed);
427
+ let session = CorpusSession {
428
+ data: corpus_session_data(document_words, document_counts, terms),
429
+ config: None,
430
+ };
431
+
432
+ match corpus_sessions().lock() {
433
+ Ok(mut sessions) => {
434
+ sessions.insert(session_id, session);
435
+ session_id as i64
436
+ }
437
+ Err(_) => 0,
438
+ }
439
+ }
440
+
441
+ fn create_corpus_session(
442
+ document_words: Vec<Vec<usize>>,
443
+ document_counts: Vec<Vec<f64>>,
444
+ terms: usize,
445
+ ) -> i64 {
446
+ create_corpus_session_internal(document_words.as_slice(), document_counts.as_slice(), terms)
447
+ }
448
+
449
+ fn replace_corpus_session_internal(
450
+ session_id: i64,
451
+ document_words: &[Vec<usize>],
452
+ document_counts: &[Vec<f64>],
453
+ terms: usize,
454
+ ) -> i64 {
455
+ if terms == 0 {
456
+ return 0;
457
+ }
458
+
459
+ let replacement_data = corpus_session_data(document_words, document_counts, terms);
460
+ match corpus_sessions().lock() {
461
+ Ok(mut sessions) => {
462
+ if session_id > 0 {
463
+ let session_key = session_id as u64;
464
+ if let Some(session) = sessions.get_mut(&session_key) {
465
+ session.data = replacement_data;
466
+ session.config = None;
467
+ return session_id;
468
+ }
469
+ }
470
+
471
+ let new_session_id = NEXT_CORPUS_SESSION_ID.fetch_add(1, Ordering::Relaxed);
472
+ sessions.insert(
473
+ new_session_id,
474
+ CorpusSession {
475
+ data: replacement_data,
476
+ config: None,
477
+ },
478
+ );
479
+ new_session_id as i64
480
+ }
481
+ Err(_) => 0,
482
+ }
483
+ }
484
+
485
+ fn replace_corpus_session(
486
+ session_id: i64,
487
+ document_words: Vec<Vec<usize>>,
488
+ document_counts: Vec<Vec<f64>>,
489
+ terms: usize,
490
+ ) -> i64 {
491
+ replace_corpus_session_internal(
492
+ session_id,
493
+ document_words.as_slice(),
494
+ document_counts.as_slice(),
495
+ terms,
496
+ )
497
+ }
498
+
499
+ fn ensure_corpus_session(
500
+ session_id: i64,
501
+ document_words: &[Vec<usize>],
502
+ document_counts: &[Vec<f64>],
503
+ terms: usize,
504
+ ) -> i64 {
505
+ if terms == 0 {
506
+ return 0;
507
+ }
508
+
509
+ if session_id > 0 && corpus_session_exists(session_id) {
510
+ return session_id;
511
+ }
512
+
513
+ create_corpus_session_internal(document_words, document_counts, terms)
514
+ }
515
+
516
+ fn drop_corpus_session(session_id: i64) -> bool {
517
+ if session_id <= 0 {
518
+ return false;
519
+ }
520
+
521
+ let session_key = session_id as u64;
522
+ match corpus_sessions().lock() {
523
+ Ok(mut sessions) => sessions.remove(&session_key).is_some(),
524
+ Err(_) => false,
525
+ }
526
+ }
527
+
528
+ fn configure_corpus_session(
529
+ session_id: i64,
530
+ topics: usize,
531
+ max_iter: i64,
532
+ convergence: f64,
533
+ em_max_iter: i64,
534
+ em_convergence: f64,
535
+ init_alpha: f64,
536
+ min_probability: f64,
537
+ ) -> bool {
538
+ if session_id <= 0 || topics == 0 {
539
+ return false;
540
+ }
541
+
542
+ let session_key = session_id as u64;
543
+ match corpus_sessions().lock() {
544
+ Ok(mut sessions) => {
545
+ let Some(session) = sessions.get_mut(&session_key) else {
546
+ return false;
547
+ };
548
+
549
+ session.config = Some(SessionConfig {
550
+ topics,
551
+ max_iter,
552
+ convergence,
553
+ em_max_iter,
554
+ em_convergence,
555
+ init_alpha,
556
+ min_probability,
557
+ });
558
+
559
+ true
560
+ }
561
+ Err(_) => false,
562
+ }
563
+ }
564
+
565
+ fn run_em_on_session_with_start_seed(
566
+ session_id: i64,
567
+ start: String,
568
+ topics: usize,
569
+ max_iter: i64,
570
+ convergence: f64,
571
+ em_max_iter: i64,
572
+ em_convergence: f64,
573
+ init_alpha: f64,
574
+ min_probability: f64,
575
+ random_seed: i64,
576
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
577
+ if session_id <= 0 {
578
+ return empty_em_output();
579
+ }
580
+
581
+ let session_key = session_id as u64;
582
+ let session_data = match corpus_sessions().lock() {
583
+ Ok(sessions) => sessions
584
+ .get(&session_key)
585
+ .map(|session| Arc::clone(&session.data)),
586
+ Err(_) => None,
587
+ };
588
+
589
+ let Some(session_data) = session_data else {
590
+ return empty_em_output();
591
+ };
592
+
593
+ run_em_with_start_seed_internal(
594
+ start.as_str(),
595
+ session_data.document_words.as_slice(),
596
+ session_data.document_counts.as_slice(),
597
+ topics,
598
+ session_data.terms,
599
+ max_iter,
600
+ convergence,
601
+ em_max_iter,
602
+ em_convergence,
603
+ init_alpha,
604
+ min_probability,
605
+ random_seed,
606
+ )
607
+ }
608
+
609
+ fn run_em_on_session(
610
+ session_id: i64,
611
+ start: String,
612
+ topics: usize,
613
+ max_iter: i64,
614
+ convergence: f64,
615
+ em_max_iter: i64,
616
+ em_convergence: f64,
617
+ init_alpha: f64,
618
+ min_probability: f64,
619
+ random_seed: i64,
620
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
621
+ if session_id <= 0 || topics == 0 {
622
+ return empty_em_output();
623
+ }
624
+
625
+ let desired_config = SessionConfig {
626
+ topics,
627
+ max_iter,
628
+ convergence,
629
+ em_max_iter,
630
+ em_convergence,
631
+ init_alpha,
632
+ min_probability,
633
+ };
634
+
635
+ let session_key = session_id as u64;
636
+ let session_data = match corpus_sessions().lock() {
637
+ Ok(mut sessions) => {
638
+ let Some(session) = sessions.get_mut(&session_key) else {
639
+ return empty_em_output();
640
+ };
641
+
642
+ if session.config.as_ref() != Some(&desired_config) {
643
+ session.config = Some(desired_config.clone());
644
+ }
645
+
646
+ Arc::clone(&session.data)
647
+ }
648
+ Err(_) => return empty_em_output(),
649
+ };
650
+
651
+ run_em_with_start_seed_internal(
652
+ start.as_str(),
653
+ session_data.document_words.as_slice(),
654
+ session_data.document_counts.as_slice(),
655
+ desired_config.topics,
656
+ session_data.terms,
657
+ desired_config.max_iter,
658
+ desired_config.convergence,
659
+ desired_config.em_max_iter,
660
+ desired_config.em_convergence,
661
+ desired_config.init_alpha,
662
+ desired_config.min_probability,
663
+ random_seed,
664
+ )
665
+ }
666
+
667
+ fn run_em_on_session_with_corpus(
668
+ session_id: i64,
669
+ document_words: Vec<Vec<usize>>,
670
+ document_counts: Vec<Vec<f64>>,
671
+ terms: usize,
672
+ start: String,
673
+ topics: usize,
674
+ max_iter: i64,
675
+ convergence: f64,
676
+ em_max_iter: i64,
677
+ em_convergence: f64,
678
+ init_alpha: f64,
679
+ min_probability: f64,
680
+ random_seed: i64,
681
+ ) -> (
682
+ i64,
683
+ Vec<Vec<f64>>,
684
+ Vec<Vec<f64>>,
685
+ Vec<Vec<f64>>,
686
+ Vec<Vec<Vec<f64>>>,
687
+ ) {
688
+ if topics == 0 || terms == 0 {
689
+ return empty_managed_session_em_output();
690
+ }
691
+
692
+ let active_session_id = ensure_corpus_session(
693
+ session_id,
694
+ document_words.as_slice(),
695
+ document_counts.as_slice(),
696
+ terms,
697
+ );
698
+
699
+ if active_session_id > 0 {
700
+ let (beta_probabilities, beta_log, gamma, phi) = run_em_on_session(
701
+ active_session_id,
702
+ start.clone(),
703
+ topics,
704
+ max_iter,
705
+ convergence,
706
+ em_max_iter,
707
+ em_convergence,
708
+ init_alpha,
709
+ min_probability,
710
+ random_seed,
711
+ );
712
+
713
+ if !(beta_probabilities.is_empty()
714
+ && beta_log.is_empty()
715
+ && gamma.is_empty()
716
+ && phi.is_empty())
717
+ {
718
+ return (active_session_id, beta_probabilities, beta_log, gamma, phi);
719
+ }
720
+ }
721
+
722
+ let (beta_probabilities, beta_log, gamma, phi) = run_em_with_start_seed_internal(
723
+ start.as_str(),
724
+ document_words.as_slice(),
725
+ document_counts.as_slice(),
726
+ topics,
727
+ terms,
728
+ max_iter,
729
+ convergence,
730
+ em_max_iter,
731
+ em_convergence,
732
+ init_alpha,
733
+ min_probability,
734
+ random_seed,
735
+ );
736
+
737
+ if beta_probabilities.is_empty() && beta_log.is_empty() && gamma.is_empty() && phi.is_empty() {
738
+ return empty_managed_session_em_output();
739
+ }
740
+
741
+ (active_session_id, beta_probabilities, beta_log, gamma, phi)
742
+ }
743
+
744
+ fn run_em_on_session_start(
745
+ session_id: i64,
746
+ start: String,
747
+ random_seed: i64,
748
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
749
+ if session_id <= 0 {
750
+ return empty_em_output();
751
+ }
752
+
753
+ let session_key = session_id as u64;
754
+ let session_data = match corpus_sessions().lock() {
755
+ Ok(sessions) => sessions.get(&session_key).map(|session| {
756
+ (
757
+ Arc::clone(&session.data),
758
+ session.config.clone(),
759
+ )
760
+ }),
761
+ Err(_) => None,
762
+ };
763
+
764
+ let Some((session_data, config)) = session_data else {
765
+ return empty_em_output();
766
+ };
767
+
768
+ let Some(config) = config else {
769
+ return empty_em_output();
770
+ };
771
+
772
+ run_em_with_start_seed_internal(
773
+ start.as_str(),
774
+ session_data.document_words.as_slice(),
775
+ session_data.document_counts.as_slice(),
776
+ config.topics,
777
+ session_data.terms,
778
+ config.max_iter,
779
+ config.convergence,
780
+ config.em_max_iter,
781
+ config.em_convergence,
782
+ config.init_alpha,
783
+ config.min_probability,
784
+ random_seed,
785
+ )
786
+ }
787
+
788
+ fn infer_document_internal(
789
+ beta_probabilities: &[Vec<f64>],
790
+ gamma_initial: &[f64],
791
+ words: &[usize],
792
+ counts: &[f64],
793
+ max_iter: i64,
794
+ convergence: f64,
795
+ min_probability: f64,
796
+ init_alpha: f64,
797
+ ) -> (Vec<f64>, Vec<Vec<f64>>) {
798
+ let topics = gamma_initial.len().min(beta_probabilities.len());
799
+ if topics == 0 {
800
+ return (Vec::new(), Vec::new());
801
+ }
802
+
803
+ let floor = floor_value(min_probability);
804
+ let init_alpha_value = if init_alpha.is_finite() {
805
+ init_alpha
806
+ } else {
807
+ 0.3
808
+ };
809
+ let convergence_value = if convergence.is_finite() && convergence >= 0.0 {
810
+ convergence
811
+ } else {
812
+ 1.0e-6
813
+ };
814
+ let max_iter_value = if max_iter <= 0 { 1 } else { max_iter as usize };
815
+
816
+ let mut gamma_d = gamma_initial.iter().copied().take(topics).collect::<Vec<_>>();
817
+ if gamma_d.len() < topics {
818
+ gamma_d.resize(topics, init_alpha_value);
819
+ }
820
+
821
+ let mut phi_d = vec![vec![1.0 / topics as f64; topics]; words.len()];
822
+
823
+ for _ in 0..max_iter_value {
824
+ let mut gamma_next = vec![init_alpha_value; topics];
825
+
826
+ for (word_offset, &word_index) in words.iter().enumerate() {
827
+ let topic_weights = compute_topic_weights(beta_probabilities, &gamma_d, word_index, floor);
828
+ phi_d[word_offset] = topic_weights.clone();
829
+
830
+ let count = counts.get(word_offset).copied().unwrap_or(0.0);
831
+ if count == 0.0 {
832
+ continue;
833
+ }
834
+
835
+ for topic_index in 0..topics {
836
+ gamma_next[topic_index] += count * topic_weights[topic_index];
837
+ }
838
+ }
839
+
840
+ let mut gamma_shift = 0.0_f64;
841
+ for topic_index in 0..topics {
842
+ let delta = (gamma_d[topic_index] - gamma_next[topic_index]).abs();
843
+ if delta > gamma_shift {
844
+ gamma_shift = delta;
845
+ }
846
+ }
847
+
848
+ gamma_d = gamma_next;
849
+ if gamma_shift <= convergence_value {
850
+ break;
851
+ }
852
+ }
853
+
854
+ (gamma_d, phi_d)
855
+ }
856
+
857
+ fn infer_document(
858
+ beta_probabilities: Vec<Vec<f64>>,
859
+ gamma_initial: Vec<f64>,
860
+ words: Vec<usize>,
861
+ counts: Vec<f64>,
862
+ max_iter: i64,
863
+ convergence: f64,
864
+ min_probability: f64,
865
+ init_alpha: f64,
866
+ ) -> Vec<Vec<f64>> {
867
+ let (gamma_d, phi_d) = infer_document_internal(
868
+ beta_probabilities.as_slice(),
869
+ gamma_initial.as_slice(),
870
+ words.as_slice(),
871
+ counts.as_slice(),
872
+ max_iter,
873
+ convergence,
874
+ min_probability,
875
+ init_alpha,
876
+ );
877
+
878
+ let mut output = Vec::with_capacity(phi_d.len() + 1);
879
+ output.push(gamma_d);
880
+ output.extend(phi_d);
881
+ output
882
+ }
883
+
884
+ fn infer_corpus_iteration_internal(
885
+ beta_probabilities: &[Vec<f64>],
886
+ document_words: &[Vec<usize>],
887
+ document_counts: &[Vec<f64>],
888
+ max_iter: i64,
889
+ convergence: f64,
890
+ min_probability: f64,
891
+ init_alpha: f64,
892
+ ) -> (Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>, Vec<Vec<f64>>) {
893
+ let topics = beta_probabilities.len();
894
+ if topics == 0 {
895
+ return (Vec::new(), Vec::new(), Vec::new());
896
+ }
897
+
898
+ let terms = beta_probabilities
899
+ .iter()
900
+ .map(|row| row.len())
901
+ .max()
902
+ .unwrap_or(0);
903
+ let floor = floor_value(min_probability);
904
+ let init_alpha_value = if init_alpha.is_finite() { init_alpha } else { 0.3 };
905
+
906
+ let mut topic_term_counts = vec![vec![floor; terms]; topics];
907
+ let mut gamma_matrix = Vec::with_capacity(document_words.len());
908
+ let mut phi_tensor = Vec::with_capacity(document_words.len());
909
+
910
+ for (doc_index, words) in document_words.iter().enumerate() {
911
+ let counts = document_counts.get(doc_index).cloned().unwrap_or_else(|| vec![0.0; words.len()]);
912
+ let total: f64 = counts.iter().sum();
913
+ let gamma_initial = vec![init_alpha_value + (total / topics as f64); topics];
914
+
915
+ let (gamma_d, phi_d) = infer_document_internal(
916
+ beta_probabilities,
917
+ gamma_initial.as_slice(),
918
+ words.as_slice(),
919
+ counts.as_slice(),
920
+ max_iter,
921
+ convergence,
922
+ min_probability,
923
+ init_alpha,
924
+ );
925
+
926
+ accumulate_topic_term_counts_in_place(
927
+ topic_term_counts.as_mut_slice(),
928
+ phi_d.as_slice(),
929
+ words.as_slice(),
930
+ counts.as_slice(),
931
+ );
932
+
933
+ gamma_matrix.push(gamma_d);
934
+ phi_tensor.push(phi_d);
935
+ }
936
+
937
+ (gamma_matrix, phi_tensor, topic_term_counts)
938
+ }
939
+
940
+ fn infer_corpus_iteration(
941
+ beta_probabilities: Vec<Vec<f64>>,
942
+ document_words: Vec<Vec<usize>>,
943
+ document_counts: Vec<Vec<f64>>,
944
+ max_iter: i64,
945
+ convergence: f64,
946
+ min_probability: f64,
947
+ init_alpha: f64,
948
+ ) -> (Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>, Vec<Vec<f64>>) {
949
+ infer_corpus_iteration_internal(
950
+ beta_probabilities.as_slice(),
951
+ document_words.as_slice(),
952
+ document_counts.as_slice(),
953
+ max_iter,
954
+ convergence,
955
+ min_probability,
956
+ init_alpha,
957
+ )
958
+ }
959
+
960
/// True for the "seeded" / "deterministic" start modes
/// (case-insensitive, surrounding whitespace ignored).
fn start_uses_seeded_initialization(start: &str) -> bool {
    matches!(
        start.trim().to_ascii_lowercase().as_str(),
        "seeded" | "deterministic"
    )
}
964
+
965
/// True for the "random" start mode (case-insensitive, surrounding
/// whitespace ignored).
fn start_uses_random_initialization(start: &str) -> bool {
    start.trim().to_ascii_lowercase() == "random"
}
968
+
969
+ fn run_em_internal(
970
+ mut beta_probabilities: Vec<Vec<f64>>,
971
+ document_words: &[Vec<usize>],
972
+ document_counts: &[Vec<f64>],
973
+ max_iter: i64,
974
+ convergence: f64,
975
+ em_max_iter: i64,
976
+ em_convergence: f64,
977
+ init_alpha: f64,
978
+ min_probability: f64,
979
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
980
+ let em_max_iter_value = if em_max_iter <= 0 { 0 } else { em_max_iter as usize };
981
+ let em_convergence_value = if em_convergence.is_finite() && em_convergence >= 0.0 {
982
+ em_convergence
983
+ } else {
984
+ 1.0e-4
985
+ };
986
+
987
+ let mut previous_gamma: Option<Vec<Vec<f64>>> = None;
988
+ let mut beta_log: Vec<Vec<f64>> = Vec::new();
989
+ let mut gamma: Vec<Vec<f64>> = Vec::new();
990
+ let mut phi: Vec<Vec<Vec<f64>>> = Vec::new();
991
+
992
+ for _ in 0..em_max_iter_value {
993
+ let (current_gamma, current_phi, topic_term_counts) = infer_corpus_iteration_internal(
994
+ beta_probabilities.as_slice(),
995
+ document_words,
996
+ document_counts,
997
+ max_iter,
998
+ convergence,
999
+ min_probability,
1000
+ init_alpha,
1001
+ );
1002
+
1003
+ let (next_beta_probabilities, next_beta_log) =
1004
+ normalize_topic_term_counts(topic_term_counts, min_probability);
1005
+ let should_stop = previous_gamma
1006
+ .as_ref()
1007
+ .map(|prev| {
1008
+ average_gamma_shift_internal(prev.as_slice(), current_gamma.as_slice())
1009
+ <= em_convergence_value
1010
+ })
1011
+ .unwrap_or(false);
1012
+
1013
+ beta_probabilities = next_beta_probabilities;
1014
+ beta_log = next_beta_log;
1015
+ gamma = current_gamma;
1016
+ phi = current_phi;
1017
+
1018
+ if should_stop {
1019
+ break;
1020
+ }
1021
+
1022
+ previous_gamma = Some(gamma.clone());
1023
+ }
1024
+
1025
+ (beta_probabilities, beta_log, gamma, phi)
1026
+ }
1027
+
1028
+ fn run_em(
1029
+ beta_probabilities: Vec<Vec<f64>>,
1030
+ document_words: Vec<Vec<usize>>,
1031
+ document_counts: Vec<Vec<f64>>,
1032
+ max_iter: i64,
1033
+ convergence: f64,
1034
+ em_max_iter: i64,
1035
+ em_convergence: f64,
1036
+ init_alpha: f64,
1037
+ min_probability: f64,
1038
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
1039
+ run_em_internal(
1040
+ beta_probabilities,
1041
+ document_words.as_slice(),
1042
+ document_counts.as_slice(),
1043
+ max_iter,
1044
+ convergence,
1045
+ em_max_iter,
1046
+ em_convergence,
1047
+ init_alpha,
1048
+ min_probability,
1049
+ )
1050
+ }
1051
+
1052
+ fn run_em_with_start_internal(
1053
+ start: &str,
1054
+ document_words: &[Vec<usize>],
1055
+ document_counts: &[Vec<f64>],
1056
+ topics: usize,
1057
+ terms: usize,
1058
+ max_iter: i64,
1059
+ convergence: f64,
1060
+ em_max_iter: i64,
1061
+ em_convergence: f64,
1062
+ init_alpha: f64,
1063
+ min_probability: f64,
1064
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
1065
+ let initial_beta =
1066
+ if start_uses_seeded_initialization(start) || start_uses_random_initialization(start) {
1067
+ seeded_topic_term_probabilities_internal(
1068
+ document_words,
1069
+ document_counts,
1070
+ topics,
1071
+ terms,
1072
+ min_probability,
1073
+ )
1074
+ } else {
1075
+ // Unknown start modes default to seeded initialization for a stable fallback.
1076
+ seeded_topic_term_probabilities_internal(
1077
+ document_words,
1078
+ document_counts,
1079
+ topics,
1080
+ terms,
1081
+ min_probability,
1082
+ )
1083
+ };
1084
+
1085
+ run_em_internal(
1086
+ initial_beta,
1087
+ document_words,
1088
+ document_counts,
1089
+ max_iter,
1090
+ convergence,
1091
+ em_max_iter,
1092
+ em_convergence,
1093
+ init_alpha,
1094
+ min_probability,
1095
+ )
1096
+ }
1097
+
1098
+ fn run_em_with_start(
1099
+ start: String,
1100
+ document_words: Vec<Vec<usize>>,
1101
+ document_counts: Vec<Vec<f64>>,
1102
+ topics: usize,
1103
+ terms: usize,
1104
+ max_iter: i64,
1105
+ convergence: f64,
1106
+ em_max_iter: i64,
1107
+ em_convergence: f64,
1108
+ init_alpha: f64,
1109
+ min_probability: f64,
1110
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
1111
+ run_em_with_start_internal(
1112
+ start.as_str(),
1113
+ document_words.as_slice(),
1114
+ document_counts.as_slice(),
1115
+ topics,
1116
+ terms,
1117
+ max_iter,
1118
+ convergence,
1119
+ em_max_iter,
1120
+ em_convergence,
1121
+ init_alpha,
1122
+ min_probability,
1123
+ )
1124
+ }
1125
+
1126
+ fn run_em_with_start_seed_internal(
1127
+ start: &str,
1128
+ document_words: &[Vec<usize>],
1129
+ document_counts: &[Vec<f64>],
1130
+ topics: usize,
1131
+ terms: usize,
1132
+ max_iter: i64,
1133
+ convergence: f64,
1134
+ em_max_iter: i64,
1135
+ em_convergence: f64,
1136
+ init_alpha: f64,
1137
+ min_probability: f64,
1138
+ random_seed: i64,
1139
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
1140
+ let initial_beta = if start_uses_seeded_initialization(start) {
1141
+ seeded_topic_term_probabilities_internal(
1142
+ document_words,
1143
+ document_counts,
1144
+ topics,
1145
+ terms,
1146
+ min_probability,
1147
+ )
1148
+ } else if start_uses_random_initialization(start) {
1149
+ random_topic_term_probabilities(topics, terms, min_probability, random_seed)
1150
+ } else {
1151
+ // Unknown start modes follow Ruby's non-seeded fallback behavior.
1152
+ random_topic_term_probabilities(topics, terms, min_probability, random_seed)
1153
+ };
1154
+
1155
+ run_em_internal(
1156
+ initial_beta,
1157
+ document_words,
1158
+ document_counts,
1159
+ max_iter,
1160
+ convergence,
1161
+ em_max_iter,
1162
+ em_convergence,
1163
+ init_alpha,
1164
+ min_probability,
1165
+ )
1166
+ }
1167
+
1168
+ fn run_em_with_start_seed(
1169
+ start: String,
1170
+ document_words: Vec<Vec<usize>>,
1171
+ document_counts: Vec<Vec<f64>>,
1172
+ topics: usize,
1173
+ terms: usize,
1174
+ max_iter: i64,
1175
+ convergence: f64,
1176
+ em_max_iter: i64,
1177
+ em_convergence: f64,
1178
+ init_alpha: f64,
1179
+ min_probability: f64,
1180
+ random_seed: i64,
1181
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>) {
1182
+ run_em_with_start_seed_internal(
1183
+ start.as_str(),
1184
+ document_words.as_slice(),
1185
+ document_counts.as_slice(),
1186
+ topics,
1187
+ terms,
1188
+ max_iter,
1189
+ convergence,
1190
+ em_max_iter,
1191
+ em_convergence,
1192
+ init_alpha,
1193
+ min_probability,
1194
+ random_seed,
1195
+ )
1196
+ }
1197
+
1198
+ #[magnus::init]
1199
+ fn init() -> Result<(), Error> {
1200
+ let lda_module = define_module("Lda")?;
1201
+ let rust_backend_module = lda_module.define_module("RustBackend")?;
1202
+
1203
+ rust_backend_module.define_singleton_method("available?", function!(available, 0))?;
1204
+ rust_backend_module.define_singleton_method("abi_version", function!(abi_version, 0))?;
1205
+ rust_backend_module.define_singleton_method("corpus_session_count", function!(corpus_session_count, 0))?;
1206
+ rust_backend_module.define_singleton_method("corpus_session_exists", function!(corpus_session_exists, 1))?;
1207
+ rust_backend_module.define_singleton_method("before_em", function!(before_em, 3))?;
1208
+ rust_backend_module.define_singleton_method(
1209
+ "topic_weights_for_word",
1210
+ function!(topic_weights_for_word, 4),
1211
+ )?;
1212
+ rust_backend_module.define_singleton_method(
1213
+ "accumulate_topic_term_counts",
1214
+ function!(accumulate_topic_term_counts, 4),
1215
+ )?;
1216
+ rust_backend_module.define_singleton_method("infer_document", function!(infer_document, 8))?;
1217
+ rust_backend_module.define_singleton_method(
1218
+ "infer_corpus_iteration",
1219
+ function!(infer_corpus_iteration, 7),
1220
+ )?;
1221
+ rust_backend_module.define_singleton_method(
1222
+ "normalize_topic_term_counts",
1223
+ function!(normalize_topic_term_counts, 2),
1224
+ )?;
1225
+ rust_backend_module
1226
+ .define_singleton_method("average_gamma_shift", function!(average_gamma_shift, 2))?;
1227
+ rust_backend_module.define_singleton_method(
1228
+ "topic_document_probability",
1229
+ function!(topic_document_probability, 4),
1230
+ )?;
1231
+ rust_backend_module.define_singleton_method(
1232
+ "seeded_topic_term_probabilities",
1233
+ function!(seeded_topic_term_probabilities, 5),
1234
+ )?;
1235
+ rust_backend_module.define_singleton_method(
1236
+ "random_topic_term_probabilities",
1237
+ function!(random_topic_term_probabilities, 4),
1238
+ )?;
1239
+ rust_backend_module
1240
+ .define_singleton_method("create_corpus_session", function!(create_corpus_session, 3))?;
1241
+ rust_backend_module
1242
+ .define_singleton_method("replace_corpus_session", function!(replace_corpus_session, 4))?;
1243
+ rust_backend_module
1244
+ .define_singleton_method("drop_corpus_session", function!(drop_corpus_session, 1))?;
1245
+ rust_backend_module
1246
+ .define_singleton_method("configure_corpus_session", function!(configure_corpus_session, 8))?;
1247
+ rust_backend_module.define_singleton_method("run_em", function!(run_em, 9))?;
1248
+ rust_backend_module
1249
+ .define_singleton_method("run_em_with_start", function!(run_em_with_start, 11))?;
1250
+ rust_backend_module
1251
+ .define_singleton_method("run_em_with_start_seed", function!(run_em_with_start_seed, 12))?;
1252
+ rust_backend_module.define_singleton_method(
1253
+ "run_em_on_session_with_start_seed",
1254
+ function!(run_em_on_session_with_start_seed, 10),
1255
+ )?;
1256
+ rust_backend_module.define_singleton_method("run_em_on_session", function!(run_em_on_session, 10))?;
1257
+ rust_backend_module
1258
+ .define_singleton_method("run_em_on_session_with_corpus", function!(run_em_on_session_with_corpus, 13))?;
1259
+ rust_backend_module
1260
+ .define_singleton_method("run_em_on_session_start", function!(run_em_on_session_start, 3))?;
1261
+
1262
+ Ok(())
1263
+ }