lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +61 -0
  3. data/Gemfile +9 -0
  4. data/README.md +157 -0
  5. data/VERSION.yml +5 -0
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/cokus.c +145 -0
  11. data/ext/lda-ruby/cokus.h +27 -0
  12. data/ext/lda-ruby/extconf.rb +13 -0
  13. data/ext/lda-ruby/lda-alpha.c +96 -0
  14. data/ext/lda-ruby/lda-alpha.h +21 -0
  15. data/ext/lda-ruby/lda-data.c +67 -0
  16. data/ext/lda-ruby/lda-data.h +14 -0
  17. data/ext/lda-ruby/lda-inference.c +1023 -0
  18. data/ext/lda-ruby/lda-inference.h +63 -0
  19. data/ext/lda-ruby/lda-model.c +345 -0
  20. data/ext/lda-ruby/lda-model.h +31 -0
  21. data/ext/lda-ruby/lda.h +54 -0
  22. data/ext/lda-ruby/utils.c +111 -0
  23. data/ext/lda-ruby/utils.h +18 -0
  24. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  25. data/ext/lda-ruby-rust/README.md +48 -0
  26. data/ext/lda-ruby-rust/extconf.rb +123 -0
  27. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  28. data/lda-ruby.gemspec +78 -0
  29. data/lib/lda-ruby/backends/base.rb +129 -0
  30. data/lib/lda-ruby/backends/native.rb +158 -0
  31. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  32. data/lib/lda-ruby/backends/rust.rb +226 -0
  33. data/lib/lda-ruby/backends.rb +58 -0
  34. data/lib/lda-ruby/config/stopwords.yml +571 -0
  35. data/lib/lda-ruby/corpus/corpus.rb +45 -0
  36. data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
  37. data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
  38. data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
  39. data/lib/lda-ruby/document/data_document.rb +30 -0
  40. data/lib/lda-ruby/document/document.rb +40 -0
  41. data/lib/lda-ruby/document/text_document.rb +39 -0
  42. data/lib/lda-ruby/lda.so +0 -0
  43. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  44. data/lib/lda-ruby/version.rb +5 -0
  45. data/lib/lda-ruby/vocabulary.rb +46 -0
  46. data/lib/lda-ruby.rb +413 -0
  47. data/lib/lda_ruby_rust.so +0 -0
  48. data/license.txt +504 -0
  49. data/test/backend_compatibility_test.rb +146 -0
  50. data/test/backends_selection_test.rb +100 -0
  51. data/test/data/docs.dat +46 -0
  52. data/test/data/sample.rb +20 -0
  53. data/test/data/wiki-test-docs.yml +123 -0
  54. data/test/gemspec_test.rb +27 -0
  55. data/test/lda_ruby_test.rb +319 -0
  56. data/test/packaged_gem_smoke_test.rb +33 -0
  57. data/test/release_scripts_test.rb +54 -0
  58. data/test/rust_build_policy_test.rb +23 -0
  59. data/test/simple_pipeline_test.rb +22 -0
  60. data/test/simple_yaml.rb +17 -0
  61. data/test/test_helper.rb +10 -0
  62. metadata +111 -0
@@ -0,0 +1,123 @@
1
# frozen_string_literal: true

require "fileutils"
require "rbconfig"

require_relative "../../lib/lda-ruby/rust_build_policy"

module Lda
  # Builds the optional Rust extension according to the resolved build
  # policy (never / always / auto) and stages the resulting cdylib under
  # lib/. Always writes a no-op Makefile so RubyGems' `make` step succeeds.
  module RustExtensionBuild
    module_function

    # Entry point invoked at the bottom of this extconf. In auto mode any
    # failure is downgraded to a warning (a no-op Makefile is still
    # written); with policy=always a failure aborts the install.
    def run
      policy = RustBuildPolicy.resolve
      puts("Rust extension build policy: #{policy} (#{RustBuildPolicy::ENV_KEY})")

      case policy
      when RustBuildPolicy::NEVER
        puts("Skipping Rust extension build (policy=#{RustBuildPolicy::NEVER}).")
      when RustBuildPolicy::ALWAYS
        ensure_cargo_available!
        build_and_stage!
      else
        if cargo_available?
          build_and_stage!
        else
          puts("cargo not found; skipping Rust extension build (policy=#{RustBuildPolicy::AUTO}).")
        end
      end

      write_noop_makefile
    rescue StandardError => e
      if policy == RustBuildPolicy::ALWAYS
        abort("Rust extension build failed with #{RustBuildPolicy::ENV_KEY}=#{RustBuildPolicy::ALWAYS}: #{e.message}")
      end

      warn("Rust extension build skipped after error in auto mode: #{e.message}")
      write_noop_makefile
    end

    # Aborts unless cargo is runnable; only meaningful when policy=always.
    def ensure_cargo_available!
      return if cargo_available?

      abort("cargo not found in PATH but #{RustBuildPolicy::ENV_KEY}=#{RustBuildPolicy::ALWAYS} was requested.")
    end

    # True when `cargo --version` (or the $CARGO override) can be executed.
    def cargo_available?
      system(cargo_command, "--version", out: File::NULL, err: File::NULL)
    end

    # The cargo binary to invoke; overridable via the CARGO env var.
    def cargo_command
      ENV.fetch("CARGO", "cargo")
    end

    # Runs `cargo build --release` in this directory and copies the built
    # cdylib next to the pure-Ruby sources as lib/lda_ruby_rust.<DLEXT>.
    # Raises if the build fails or the artifact is missing.
    def build_and_stage!
      Dir.chdir(__dir__) do
        env = rust_build_env
        success =
          if env.empty?
            system(cargo_command, "build", "--release")
          else
            system(env, cargo_command, "build", "--release")
          end
        raise "cargo build --release failed" unless success
      end

      source = File.join(__dir__, "target", "release", rust_cdylib_filename)
      raise "Rust extension artifact not found at #{source}" unless File.exist?(source)

      destination = File.expand_path("../../lib/lda_ruby_rust.#{RbConfig::CONFIG.fetch('DLEXT')}", __dir__)
      FileUtils.cp(source, destination)
      puts("Staged Rust extension to #{destination}")
    end

    # Name of the cdylib cargo produces for this crate on the host OS.
    # BUGFIX: on Windows targets cargo emits `<crate>.dll` WITHOUT the
    # `lib` prefix, so looking for liblda_ruby_rust.dll would never match.
    # Unix-like targets keep the conventional lib<crate>.{so,dylib} name.
    def rust_cdylib_filename
      case RbConfig::CONFIG.fetch("host_os")
      when /darwin/
        "liblda_ruby_rust.dylib"
      when /mswin|mingw|cygwin/
        "lda_ruby_rust.dll"
      else
        "liblda_ruby_rust.so"
      end
    end

    # On macOS the extension must be linked with `-undefined dynamic_lookup`
    # so Ruby symbols resolve at load time; merge the flag into RUSTFLAGS
    # exactly once, preserving any flags the caller already set.
    def rust_build_env
      return {} unless RbConfig::CONFIG.fetch("host_os").match?(/darwin/)

      dynamic_lookup_flag = "-C link-arg=-Wl,-undefined,dynamic_lookup"
      existing = ENV.fetch("RUSTFLAGS", "")
      merged =
        if existing.include?(dynamic_lookup_flag)
          existing
        else
          [existing, dynamic_lookup_flag].reject(&:empty?).join(" ")
        end

      { "RUSTFLAGS" => merged }
    end

    # mkmf normally generates the Makefile; since we bypass mkmf entirely,
    # emit a stub so the `make && make install` steps run cleanly.
    def write_noop_makefile
      File.write(
        File.join(__dir__, "Makefile"),
        <<~MAKEFILE
          all:
          \t@echo "Rust extension handled by extconf.rb"

          install:
          \t@echo "Rust extension handled by extconf.rb"

          clean:
          \t@true

          distclean: clean
        MAKEFILE
      )
    end
  end
end

Lda::RustExtensionBuild.run
@@ -0,0 +1,456 @@
1
+ use magnus::{define_module, function, Error, Module, Object};
2
+
3
/// Reports that the Rust backend was compiled in and can be used.
fn available() -> bool {
    true
}
6
+
7
/// ABI contract version shared with the Ruby adapter; bump on any
/// breaking change to the exported function signatures.
fn abi_version() -> i64 {
    1
}
10
+
11
/// Hook called before the EM loop begins; the arguments exist only for
/// interface compatibility and are intentionally unused.
fn before_em(_start: String, _num_docs: i64, _num_terms: i64) -> bool {
    true
}
14
+
15
/// Clamp a requested minimum probability to a safe positive floor.
/// Non-finite (NaN/inf) or non-positive inputs fall back to 1.0e-12.
fn floor_value(min_probability: f64) -> f64 {
    match min_probability {
        p if p.is_finite() && p > 0.0 => p,
        _ => 1.0e-12,
    }
}
22
+
23
/// Rescale `weights` so they sum to 1.0.
/// When the total is degenerate (non-finite or <= 0) every entry is reset
/// to the uniform value 1/len; an empty slice is left untouched.
fn normalize_in_place(weights: &mut [f64]) {
    let total: f64 = weights.iter().copied().sum();

    if total.is_finite() && total > 0.0 {
        weights.iter_mut().for_each(|w| *w /= total);
    } else {
        let uniform = match weights.len() {
            0 => 0.0,
            n => 1.0 / n as f64,
        };
        weights.iter_mut().for_each(|w| *w = uniform);
    }
}
42
+
43
+ fn compute_topic_weights(
44
+ beta_probabilities: &[Vec<f64>],
45
+ gamma: &[f64],
46
+ word_index: usize,
47
+ floor: f64,
48
+ ) -> Vec<f64> {
49
+ let topics = gamma.len().min(beta_probabilities.len());
50
+ if topics == 0 {
51
+ return Vec::new();
52
+ }
53
+
54
+ let mut weights = Vec::with_capacity(topics);
55
+ for topic_index in 0..topics {
56
+ let beta_value = beta_probabilities[topic_index]
57
+ .get(word_index)
58
+ .copied()
59
+ .unwrap_or(floor)
60
+ .max(floor);
61
+ let gamma_value = gamma[topic_index].max(floor);
62
+ weights.push(beta_value * gamma_value);
63
+ }
64
+
65
+ normalize_in_place(&mut weights);
66
+ weights
67
+ }
68
+
69
+ fn topic_weights_for_word(
70
+ beta_probabilities: Vec<Vec<f64>>,
71
+ gamma: Vec<f64>,
72
+ word_index: usize,
73
+ min_probability: f64,
74
+ ) -> Vec<f64> {
75
+ let floor = floor_value(min_probability);
76
+ compute_topic_weights(&beta_probabilities, &gamma, word_index, floor)
77
+ }
78
+
79
/// Fold count-weighted phi mass into the running topic/term count matrix:
/// topic_term_counts[k][w] += count(w) * phi_d[w][k].
/// Missing phi rows, missing counts, and out-of-range word indices are
/// silently skipped so ragged input cannot panic.
fn accumulate_topic_term_counts_in_place(
    topic_term_counts: &mut [Vec<f64>],
    phi_d: &[Vec<f64>],
    words: &[usize],
    counts: &[f64],
) {
    if topic_term_counts.is_empty() {
        return;
    }

    for (offset, &word_index) in words.iter().enumerate() {
        let count = counts.get(offset).copied().unwrap_or(0.0);
        if count == 0.0 {
            continue;
        }

        let phi_row = match phi_d.get(offset) {
            Some(row) => row,
            None => continue,
        };

        for (topic_index, topic_row) in topic_term_counts.iter_mut().enumerate() {
            if let Some(slot) = topic_row.get_mut(word_index) {
                *slot += count * phi_row.get(topic_index).copied().unwrap_or(0.0);
            }
        }
    }
}
110
+
111
+ fn accumulate_topic_term_counts(
112
+ mut topic_term_counts: Vec<Vec<f64>>,
113
+ phi_d: Vec<Vec<f64>>,
114
+ words: Vec<usize>,
115
+ counts: Vec<f64>,
116
+ ) -> Vec<Vec<f64>> {
117
+ accumulate_topic_term_counts_in_place(
118
+ topic_term_counts.as_mut_slice(),
119
+ phi_d.as_slice(),
120
+ words.as_slice(),
121
+ counts.as_slice(),
122
+ );
123
+ topic_term_counts
124
+ }
125
+
126
+ fn normalize_topic_term_counts(
127
+ topic_term_counts: Vec<Vec<f64>>,
128
+ min_probability: f64,
129
+ ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
130
+ let floor = floor_value(min_probability);
131
+
132
+ let mut beta_probabilities = Vec::with_capacity(topic_term_counts.len());
133
+ let mut beta_log = Vec::with_capacity(topic_term_counts.len());
134
+
135
+ for topic_counts in topic_term_counts.iter() {
136
+ let mut normalized = topic_counts
137
+ .iter()
138
+ .map(|value| {
139
+ if value.is_finite() {
140
+ value.max(floor)
141
+ } else {
142
+ floor
143
+ }
144
+ })
145
+ .collect::<Vec<_>>();
146
+
147
+ normalize_in_place(&mut normalized);
148
+
149
+ let topic_log = normalized
150
+ .iter()
151
+ .map(|value| value.max(floor).ln())
152
+ .collect::<Vec<_>>();
153
+
154
+ beta_probabilities.push(normalized);
155
+ beta_log.push(topic_log);
156
+ }
157
+
158
+ (beta_probabilities, beta_log)
159
+ }
160
+
161
/// Mean absolute element-wise difference between two gamma matrices,
/// iterated over the cells of `previous_gamma`. Cells absent from
/// `current_gamma` fall back to the previous value (zero shift); an empty
/// previous matrix yields 0.0.
fn average_gamma_shift(previous_gamma: Vec<Vec<f64>>, current_gamma: Vec<Vec<f64>>) -> f64 {
    let mut total_shift = 0.0_f64;
    let mut cells = 0_usize;

    for (row_index, prev_row) in previous_gamma.iter().enumerate() {
        let cur_row = current_gamma.get(row_index);

        for (col_index, &prev_value) in prev_row.iter().enumerate() {
            let cur_value = cur_row
                .and_then(|row| row.get(col_index))
                .copied()
                .unwrap_or(prev_value);

            total_shift += (prev_value - cur_value).abs();
            cells += 1;
        }
    }

    if cells == 0 {
        0.0
    } else {
        total_shift / cells as f64
    }
}
185
+
186
+ fn topic_document_probability(
187
+ phi_tensor: Vec<Vec<Vec<f64>>>,
188
+ document_counts: Vec<Vec<f64>>,
189
+ num_topics: usize,
190
+ min_probability: f64,
191
+ ) -> Vec<Vec<f64>> {
192
+ let floor = floor_value(min_probability);
193
+ let mut output = Vec::with_capacity(document_counts.len());
194
+
195
+ for (doc_index, counts) in document_counts.iter().enumerate() {
196
+ let mut tops = vec![0.0_f64; num_topics];
197
+ let ttl: f64 = counts.iter().copied().sum();
198
+
199
+ if let Some(doc_phi) = phi_tensor.get(doc_index) {
200
+ for (word_index, word_dist) in doc_phi.iter().enumerate() {
201
+ let count = counts.get(word_index).copied().unwrap_or(0.0);
202
+ if count == 0.0 {
203
+ continue;
204
+ }
205
+
206
+ for topic_index in 0..num_topics {
207
+ let top_prob = word_dist.get(topic_index).copied().unwrap_or(floor).max(floor);
208
+ tops[topic_index] += top_prob.ln() * count;
209
+ }
210
+ }
211
+ }
212
+
213
+ if ttl.is_finite() && ttl > 0.0 {
214
+ for value in tops.iter_mut() {
215
+ *value /= ttl;
216
+ }
217
+ }
218
+
219
+ output.push(tops);
220
+ }
221
+
222
+ output
223
+ }
224
+
225
+ fn seeded_topic_term_probabilities(
226
+ document_words: Vec<Vec<usize>>,
227
+ document_counts: Vec<Vec<f64>>,
228
+ topics: usize,
229
+ terms: usize,
230
+ min_probability: f64,
231
+ ) -> Vec<Vec<f64>> {
232
+ if topics == 0 || terms == 0 {
233
+ return Vec::new();
234
+ }
235
+
236
+ let floor = floor_value(min_probability);
237
+ let mut topic_term_counts = vec![vec![floor; terms]; topics];
238
+
239
+ for (doc_index, words) in document_words.iter().enumerate() {
240
+ let topic_index = doc_index % topics;
241
+ let counts = document_counts.get(doc_index);
242
+
243
+ for (word_offset, &word_index) in words.iter().enumerate() {
244
+ if word_index >= terms {
245
+ continue;
246
+ }
247
+
248
+ let count = counts
249
+ .and_then(|row| row.get(word_offset))
250
+ .copied()
251
+ .unwrap_or(0.0);
252
+ if !count.is_finite() || count == 0.0 {
253
+ continue;
254
+ }
255
+
256
+ topic_term_counts[topic_index][word_index] += count;
257
+ }
258
+ }
259
+
260
+ for row in topic_term_counts.iter_mut() {
261
+ normalize_in_place(row);
262
+ }
263
+
264
+ topic_term_counts
265
+ }
266
+
267
+ fn infer_document_internal(
268
+ beta_probabilities: &[Vec<f64>],
269
+ gamma_initial: &[f64],
270
+ words: &[usize],
271
+ counts: &[f64],
272
+ max_iter: i64,
273
+ convergence: f64,
274
+ min_probability: f64,
275
+ init_alpha: f64,
276
+ ) -> (Vec<f64>, Vec<Vec<f64>>) {
277
+ let topics = gamma_initial.len().min(beta_probabilities.len());
278
+ if topics == 0 {
279
+ return (Vec::new(), Vec::new());
280
+ }
281
+
282
+ let floor = floor_value(min_probability);
283
+ let init_alpha_value = if init_alpha.is_finite() {
284
+ init_alpha
285
+ } else {
286
+ 0.3
287
+ };
288
+ let convergence_value = if convergence.is_finite() && convergence >= 0.0 {
289
+ convergence
290
+ } else {
291
+ 1.0e-6
292
+ };
293
+ let max_iter_value = if max_iter <= 0 { 1 } else { max_iter as usize };
294
+
295
+ let mut gamma_d = gamma_initial.iter().copied().take(topics).collect::<Vec<_>>();
296
+ if gamma_d.len() < topics {
297
+ gamma_d.resize(topics, init_alpha_value);
298
+ }
299
+
300
+ let mut phi_d = vec![vec![1.0 / topics as f64; topics]; words.len()];
301
+
302
+ for _ in 0..max_iter_value {
303
+ let mut gamma_next = vec![init_alpha_value; topics];
304
+
305
+ for (word_offset, &word_index) in words.iter().enumerate() {
306
+ let topic_weights = compute_topic_weights(beta_probabilities, &gamma_d, word_index, floor);
307
+ phi_d[word_offset] = topic_weights.clone();
308
+
309
+ let count = counts.get(word_offset).copied().unwrap_or(0.0);
310
+ if count == 0.0 {
311
+ continue;
312
+ }
313
+
314
+ for topic_index in 0..topics {
315
+ gamma_next[topic_index] += count * topic_weights[topic_index];
316
+ }
317
+ }
318
+
319
+ let mut gamma_shift = 0.0_f64;
320
+ for topic_index in 0..topics {
321
+ let delta = (gamma_d[topic_index] - gamma_next[topic_index]).abs();
322
+ if delta > gamma_shift {
323
+ gamma_shift = delta;
324
+ }
325
+ }
326
+
327
+ gamma_d = gamma_next;
328
+ if gamma_shift <= convergence_value {
329
+ break;
330
+ }
331
+ }
332
+
333
+ (gamma_d, phi_d)
334
+ }
335
+
336
+ fn infer_document(
337
+ beta_probabilities: Vec<Vec<f64>>,
338
+ gamma_initial: Vec<f64>,
339
+ words: Vec<usize>,
340
+ counts: Vec<f64>,
341
+ max_iter: i64,
342
+ convergence: f64,
343
+ min_probability: f64,
344
+ init_alpha: f64,
345
+ ) -> Vec<Vec<f64>> {
346
+ let (gamma_d, phi_d) = infer_document_internal(
347
+ beta_probabilities.as_slice(),
348
+ gamma_initial.as_slice(),
349
+ words.as_slice(),
350
+ counts.as_slice(),
351
+ max_iter,
352
+ convergence,
353
+ min_probability,
354
+ init_alpha,
355
+ );
356
+
357
+ let mut output = Vec::with_capacity(phi_d.len() + 1);
358
+ output.push(gamma_d);
359
+ output.extend(phi_d);
360
+ output
361
+ }
362
+
363
+ fn infer_corpus_iteration(
364
+ beta_probabilities: Vec<Vec<f64>>,
365
+ document_words: Vec<Vec<usize>>,
366
+ document_counts: Vec<Vec<f64>>,
367
+ max_iter: i64,
368
+ convergence: f64,
369
+ min_probability: f64,
370
+ init_alpha: f64,
371
+ ) -> (Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>, Vec<Vec<f64>>) {
372
+ let topics = beta_probabilities.len();
373
+ if topics == 0 {
374
+ return (Vec::new(), Vec::new(), Vec::new());
375
+ }
376
+
377
+ let terms = beta_probabilities
378
+ .iter()
379
+ .map(|row| row.len())
380
+ .max()
381
+ .unwrap_or(0);
382
+ let floor = floor_value(min_probability);
383
+ let init_alpha_value = if init_alpha.is_finite() { init_alpha } else { 0.3 };
384
+
385
+ let mut topic_term_counts = vec![vec![floor; terms]; topics];
386
+ let mut gamma_matrix = Vec::with_capacity(document_words.len());
387
+ let mut phi_tensor = Vec::with_capacity(document_words.len());
388
+
389
+ for (doc_index, words) in document_words.iter().enumerate() {
390
+ let counts = document_counts.get(doc_index).cloned().unwrap_or_else(|| vec![0.0; words.len()]);
391
+ let total: f64 = counts.iter().sum();
392
+ let gamma_initial = vec![init_alpha_value + (total / topics as f64); topics];
393
+
394
+ let (gamma_d, phi_d) = infer_document_internal(
395
+ beta_probabilities.as_slice(),
396
+ gamma_initial.as_slice(),
397
+ words.as_slice(),
398
+ counts.as_slice(),
399
+ max_iter,
400
+ convergence,
401
+ min_probability,
402
+ init_alpha,
403
+ );
404
+
405
+ accumulate_topic_term_counts_in_place(
406
+ topic_term_counts.as_mut_slice(),
407
+ phi_d.as_slice(),
408
+ words.as_slice(),
409
+ counts.as_slice(),
410
+ );
411
+
412
+ gamma_matrix.push(gamma_d);
413
+ phi_tensor.push(phi_d);
414
+ }
415
+
416
+ (gamma_matrix, phi_tensor, topic_term_counts)
417
+ }
418
+
419
+ #[magnus::init]
420
+ fn init() -> Result<(), Error> {
421
+ let lda_module = define_module("Lda")?;
422
+ let rust_backend_module = lda_module.define_module("RustBackend")?;
423
+
424
+ rust_backend_module.define_singleton_method("available?", function!(available, 0))?;
425
+ rust_backend_module.define_singleton_method("abi_version", function!(abi_version, 0))?;
426
+ rust_backend_module.define_singleton_method("before_em", function!(before_em, 3))?;
427
+ rust_backend_module.define_singleton_method(
428
+ "topic_weights_for_word",
429
+ function!(topic_weights_for_word, 4),
430
+ )?;
431
+ rust_backend_module.define_singleton_method(
432
+ "accumulate_topic_term_counts",
433
+ function!(accumulate_topic_term_counts, 4),
434
+ )?;
435
+ rust_backend_module.define_singleton_method("infer_document", function!(infer_document, 8))?;
436
+ rust_backend_module.define_singleton_method(
437
+ "infer_corpus_iteration",
438
+ function!(infer_corpus_iteration, 7),
439
+ )?;
440
+ rust_backend_module.define_singleton_method(
441
+ "normalize_topic_term_counts",
442
+ function!(normalize_topic_term_counts, 2),
443
+ )?;
444
+ rust_backend_module
445
+ .define_singleton_method("average_gamma_shift", function!(average_gamma_shift, 2))?;
446
+ rust_backend_module.define_singleton_method(
447
+ "topic_document_probability",
448
+ function!(topic_document_probability, 4),
449
+ )?;
450
+ rust_backend_module.define_singleton_method(
451
+ "seeded_topic_term_probabilities",
452
+ function!(seeded_topic_term_probabilities, 5),
453
+ )?;
454
+
455
+ Ok(())
456
+ }
data/lda-ruby.gemspec ADDED
@@ -0,0 +1,78 @@
1
# frozen_string_literal: true

require_relative "lib/lda-ruby/version"

# Two publishing flavours, selected via LDA_RUBY_GEM_VARIANT:
#   source      - default; compiles the C and Rust extensions on install
#   precompiled - platform gem that ships prebuilt binaries under lib/
variant = ENV.fetch("LDA_RUBY_GEM_VARIANT", "source")
valid_variants = %w[source precompiled].freeze
unless valid_variants.include?(variant)
  raise ArgumentError, "Unsupported LDA_RUBY_GEM_VARIANT=#{variant.inspect}. Expected one of: #{valid_variants.join(', ')}"
end

precompiled_variant = variant == "precompiled"

Gem::Specification.new do |spec|
  spec.name = "lda-ruby"
  spec.version = Lda::VERSION
  spec.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
  spec.email = ["jasonmadams@gmail.com"]

  spec.summary = "Ruby implementation of Latent Dirichlet Allocation (LDA)."
  spec.description = "Ruby wrapper and toolkit for Latent Dirichlet Allocation based on the original lda-c implementation by David M. Blei."
  spec.homepage = "https://github.com/ealdent/lda-ruby"
  spec.license = "GPL-2.0-or-later"
  spec.required_ruby_version = ">= 3.2"

  spec.metadata = {
    "homepage_uri" => spec.homepage,
    "source_code_uri" => spec.homepage,
    "changelog_uri" => "#{spec.homepage}/blob/master/CHANGELOG.md",
    "lda_ruby_gem_variant" => variant
  }

  if precompiled_variant
    # The platform can be forced (e.g. when cross-building) through
    # LDA_RUBY_GEM_PLATFORM; otherwise use the local platform.
    # Precompiled gems ship no extensions to compile.
    platform_override = ENV.fetch("LDA_RUBY_GEM_PLATFORM", "").strip
    platform_value = platform_override.empty? ? Gem::Platform.local.to_s : platform_override

    spec.platform = Gem::Platform.new(platform_value)
    spec.metadata["lda_ruby_platform"] = spec.platform.to_s
    spec.extensions = []
  else
    spec.extensions = ["ext/lda-ruby/extconf.rb", "ext/lda-ruby-rust/extconf.rb"]
  end

  spec.require_paths = ["lib"]

  # Candidate file list: top-level docs plus everything under docs/, ext/,
  # lib/ and test/ — build artifacts are stripped below.
  candidates = %w[CHANGELOG.md Gemfile README.md VERSION.yml lda-ruby.gemspec license.txt]
  candidates += Dir.glob("docs/**/*")
  candidates += Dir.glob("ext/**/*")
  candidates += Dir.glob("lib/**/*")
  candidates += Dir.glob("test/**/*")

  # Compiled binaries are normally excluded; the precompiled variant keeps
  # exactly these two staged artifacts under lib/.
  allowed_precompiled_binary_patterns = [
    %r{\Alib/lda-ruby/lda\.(so|bundle|dylib|dll)\z},
    %r{\Alib/lda_ruby_rust\.(so|bundle|dylib|dll)\z}
  ]

  spec.files = candidates
    .reject { |path| File.directory?(path) }
    .reject { |path| path.start_with?("ext/lda-ruby-rust/target/") }
    .reject { |path| path == "ext/lda-ruby-rust/Cargo.lock" }
    .reject do |path|
      next false if precompiled_variant && allowed_precompiled_binary_patterns.any? { |pattern| pattern.match?(path) }

      path.end_with?(".o", ".so", ".bundle", ".dylib", ".dll", ".rlib", ".rmeta")
    end
    .reject do |path|
      ["Makefile", "ext/lda-ruby/Makefile", "ext/lda-ruby/mkmf.log", "ext/lda-ruby-rust/Makefile"].include?(path)
    end
    .uniq
    .sort

  if precompiled_variant
    # Fail fast when the release pipeline forgot to stage the binaries.
    missing_binaries = allowed_precompiled_binary_patterns.reject do |pattern|
      spec.files.any? { |path| pattern.match?(path) }
    end
    unless missing_binaries.empty?
      raise "Precompiled variant requires staged binaries under lib/: #{missing_binaries.map(&:source).join(', ')}"
    end
  end
end