lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +61 -0
  3. data/Gemfile +9 -0
  4. data/README.md +157 -0
  5. data/VERSION.yml +5 -0
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/cokus.c +145 -0
  11. data/ext/lda-ruby/cokus.h +27 -0
  12. data/ext/lda-ruby/extconf.rb +13 -0
  13. data/ext/lda-ruby/lda-alpha.c +96 -0
  14. data/ext/lda-ruby/lda-alpha.h +21 -0
  15. data/ext/lda-ruby/lda-data.c +67 -0
  16. data/ext/lda-ruby/lda-data.h +14 -0
  17. data/ext/lda-ruby/lda-inference.c +1023 -0
  18. data/ext/lda-ruby/lda-inference.h +63 -0
  19. data/ext/lda-ruby/lda-model.c +345 -0
  20. data/ext/lda-ruby/lda-model.h +31 -0
  21. data/ext/lda-ruby/lda.h +54 -0
  22. data/ext/lda-ruby/utils.c +111 -0
  23. data/ext/lda-ruby/utils.h +18 -0
  24. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  25. data/ext/lda-ruby-rust/README.md +48 -0
  26. data/ext/lda-ruby-rust/extconf.rb +123 -0
  27. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  28. data/lda-ruby.gemspec +78 -0
  29. data/lib/lda-ruby/backends/base.rb +129 -0
  30. data/lib/lda-ruby/backends/native.rb +158 -0
  31. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  32. data/lib/lda-ruby/backends/rust.rb +226 -0
  33. data/lib/lda-ruby/backends.rb +58 -0
  34. data/lib/lda-ruby/config/stopwords.yml +571 -0
  35. data/lib/lda-ruby/corpus/corpus.rb +45 -0
  36. data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
  37. data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
  38. data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
  39. data/lib/lda-ruby/document/data_document.rb +30 -0
  40. data/lib/lda-ruby/document/document.rb +40 -0
  41. data/lib/lda-ruby/document/text_document.rb +39 -0
  42. data/lib/lda-ruby/lda.so +0 -0
  43. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  44. data/lib/lda-ruby/version.rb +5 -0
  45. data/lib/lda-ruby/vocabulary.rb +46 -0
  46. data/lib/lda-ruby.rb +413 -0
  47. data/lib/lda_ruby_rust.so +0 -0
  48. data/license.txt +504 -0
  49. data/test/backend_compatibility_test.rb +146 -0
  50. data/test/backends_selection_test.rb +100 -0
  51. data/test/data/docs.dat +46 -0
  52. data/test/data/sample.rb +20 -0
  53. data/test/data/wiki-test-docs.yml +123 -0
  54. data/test/gemspec_test.rb +27 -0
  55. data/test/lda_ruby_test.rb +319 -0
  56. data/test/packaged_gem_smoke_test.rb +33 -0
  57. data/test/release_scripts_test.rb +54 -0
  58. data/test/rust_build_policy_test.rb +23 -0
  59. data/test/simple_pipeline_test.rb +22 -0
  60. data/test/simple_yaml.rb +17 -0
  61. data/test/test_helper.rb +10 -0
  62. metadata +111 -0
@@ -0,0 +1,226 @@
1
+ # frozen_string_literal: true
2
+
3
module Lda
  module Backends
    # Backend that keeps all model state in a wrapped PureRuby backend
    # (+@fallback+) and registers callbacks into ::Lda::RustBackend as that
    # backend's kernels.
    #
    # Every kernel callback returns nil when the Rust module is missing, does
    # not implement the requested function, or raises a StandardError.
    # NOTE(review): presumably PureRuby treats a nil kernel result as "use the
    # Ruby implementation" -- confirm in pure_ruby.rb.
    class Rust < Base
      # Settings whose readers and writers are forwarded to the wrapped backend.
      SETTINGS = %i[max_iter convergence em_max_iter em_convergence num_topics init_alpha est_alpha verbose].freeze

      # Reports whether the Rust extension can be used in this process.
      #
      # Requires ::Lda::RUST_EXTENSION_LOADED to be defined and truthy and
      # ::Lda::RustBackend to be defined. When RustBackend exposes its own
      # +available?+, that answer is returned; otherwise true.
      #
      # @return [Boolean, Object] false on any StandardError raised while probing
      def self.available?
        return false unless defined?(::Lda::RUST_EXTENSION_LOADED) && ::Lda::RUST_EXTENSION_LOADED
        return false unless defined?(::Lda::RustBackend)

        if ::Lda::RustBackend.respond_to?(:available?)
          ::Lda::RustBackend.available?
        else
          true
        end
      rescue StandardError
        false
      end

      # Forward each setting's reader and writer to the wrapped backend so the
      # values live in exactly one place.
      SETTINGS.each do |setting_name|
        define_method(setting_name) do
          @fallback.public_send(setting_name)
        end

        define_method("#{setting_name}=") do |value|
          @fallback.public_send("#{setting_name}=", value)
        end
      end

      # Builds the wrapped PureRuby backend and installs the Rust kernels on it.
      #
      # @param random_seed [Object, nil] passed to both Base and PureRuby
      # @raise [LoadError] when {.available?} is false
      def initialize(random_seed: nil)
        super(random_seed: random_seed)
        raise LoadError, "Rust backend is unavailable for this environment" unless self.class.available?

        @fallback = PureRuby.new(random_seed: random_seed)
        @fallback.topic_weights_kernel = method(:rust_topic_weights_for_word)
        @fallback.topic_term_accumulator_kernel = method(:rust_accumulate_topic_term_counts)
        @fallback.document_inference_kernel = method(:rust_infer_document)
        @fallback.corpus_iteration_kernel = method(:rust_infer_corpus_iteration)
        @fallback.topic_term_finalizer_kernel = method(:rust_finalize_topic_term_counts)
        @fallback.gamma_shift_kernel = method(:rust_average_gamma_shift)
        @fallback.topic_document_probability_kernel = method(:rust_topic_document_probability)
        @fallback.topic_term_seed_kernel = method(:rust_seeded_topic_term_probabilities)
        @fallback.trusted_kernel_outputs = true
      end

      # @return [String] the backend identifier
      def name
        "rust"
      end

      # Stores the corpus locally and on the wrapped backend.
      def corpus=(corpus)
        @corpus = corpus
        @fallback.corpus = corpus
        true
      end

      # Loads a corpus through the wrapped backend, then mirrors the wrapped
      # backend's corpus into +@corpus+.
      #
      # @return [Object] whatever the wrapped backend's loader returned
      def fast_load_corpus_from_file(filename)
        loaded = @fallback.fast_load_corpus_from_file(filename)
        @corpus = @fallback.corpus
        loaded
      end

      # Loads settings through the wrapped backend and re-syncs +@corpus+ from it.
      #
      # @return [Object] whatever the wrapped backend's loader returned
      def load_settings(settings_file)
        loaded = @fallback.load_settings(settings_file)
        @corpus = @fallback.corpus
        loaded
      end

      # Forwards the full configuration tuple to the wrapped backend unchanged.
      def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
        @fallback.set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
      end

      # Notifies the Rust side (best effort) and then runs EM on the wrapped backend.
      def em(start)
        rust_before_em(start)
        @fallback.em(start)
      end

      # The readers below all return the wrapped backend's value.

      def beta
        @fallback.beta
      end

      def gamma
        @fallback.gamma
      end

      def compute_phi
        @fallback.compute_phi
      end

      def model
        @fallback.model
      end

      private

      # Shared guard for every call into the Rust extension.
      #
      # Yields ::Lda::RustBackend only when that constant is defined and
      # responds to +kernel+. Argument coercions belong inside the block so a
      # failed Integer()/Float() conversion is rescued like any other error.
      #
      # @param kernel [Symbol] name of the RustBackend function about to be called
      # @return [Object, nil] the block's value, or nil when the kernel is
      #   unavailable or a StandardError was raised
      def rust_dispatch(kernel)
        return nil unless defined?(::Lda::RustBackend)
        return nil unless ::Lda::RustBackend.respond_to?(kernel)

        yield ::Lda::RustBackend
      rescue StandardError
        nil
      end

      # Optional pre-EM hook; passes the start mode as a String plus the corpus
      # dimensions (0 when no corpus has been set).
      def rust_before_em(start)
        rust_dispatch(:before_em) do |backend|
          backend.before_em(start.to_s, @corpus&.num_docs.to_i, @corpus&.num_terms.to_i)
        end
      end

      # Kernel callback: RustBackend.topic_weights_for_word with +word_index+
      # coerced to Integer and +min_probability+ to Float.
      def rust_topic_weights_for_word(beta_probabilities, gamma, word_index, min_probability)
        rust_dispatch(:topic_weights_for_word) do |backend|
          backend.topic_weights_for_word(
            beta_probabilities,
            gamma,
            Integer(word_index),
            Float(min_probability)
          )
        end
      end

      # Kernel callback: RustBackend.accumulate_topic_term_counts, arguments passed through.
      def rust_accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
        rust_dispatch(:accumulate_topic_term_counts) do |backend|
          backend.accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
        end
      end

      # Kernel callback: RustBackend.infer_document.
      #
      # The extension's result is used only when it is a non-empty Array; its
      # first element is returned as gamma and the remaining elements as the
      # phi rows.
      #
      # @return [Array(Object, Array), nil] +[gamma, phi_rows]+, or nil
      def rust_infer_document(beta_probabilities, gamma_initial, words, counts, max_iter, convergence, min_probability, init_alpha)
        rust_dispatch(:infer_document) do |backend|
          output = backend.infer_document(
            beta_probabilities,
            gamma_initial,
            words,
            counts,
            Integer(max_iter),
            Float(convergence),
            Float(min_probability),
            Float(init_alpha)
          )

          if output.is_a?(Array) && !output.empty?
            gamma, *phi_rows = output
            [gamma, phi_rows]
          end
        end
      end

      # Kernel callback: RustBackend.infer_corpus_iteration with numeric
      # settings coerced to Integer/Float.
      def rust_infer_corpus_iteration(beta_probabilities, document_words, document_counts, max_iter, convergence, min_probability, init_alpha)
        rust_dispatch(:infer_corpus_iteration) do |backend|
          backend.infer_corpus_iteration(
            beta_probabilities,
            document_words,
            document_counts,
            Integer(max_iter),
            Float(convergence),
            Float(min_probability),
            Float(init_alpha)
          )
        end
      end

      # Kernel callback: note the Rust-side function is named
      # +normalize_topic_term_counts+, not +finalize_...+.
      def rust_finalize_topic_term_counts(topic_term_counts, min_probability)
        rust_dispatch(:normalize_topic_term_counts) do |backend|
          backend.normalize_topic_term_counts(topic_term_counts, Float(min_probability))
        end
      end

      # Kernel callback: RustBackend.average_gamma_shift, arguments passed through.
      def rust_average_gamma_shift(previous_gamma, current_gamma)
        rust_dispatch(:average_gamma_shift) do |backend|
          backend.average_gamma_shift(previous_gamma, current_gamma)
        end
      end

      # Kernel callback: RustBackend.topic_document_probability with
      # +num_topics+ coerced to Integer and +min_probability+ to Float.
      def rust_topic_document_probability(phi_matrix, document_counts, num_topics, min_probability)
        rust_dispatch(:topic_document_probability) do |backend|
          backend.topic_document_probability(
            phi_matrix,
            document_counts,
            Integer(num_topics),
            Float(min_probability)
          )
        end
      end

      # Kernel callback: RustBackend.seeded_topic_term_probabilities with
      # +topics+/+terms+ coerced to Integer and +min_probability+ to Float.
      def rust_seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)
        rust_dispatch(:seeded_topic_term_probabilities) do |backend|
          backend.seeded_topic_term_probabilities(
            document_words,
            document_counts,
            Integer(topics),
            Integer(terms),
            Float(min_probability)
          )
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lda-ruby/backends/base"
4
+ require "lda-ruby/backends/rust"
5
+ require "lda-ruby/backends/native"
6
+ require "lda-ruby/backends/pure_ruby"
7
+
8
module Lda
  module Backends
    class << self
      # Instantiates the backend selected by +requested+, or by the
      # LDA_RUBY_BACKEND environment variable when +requested+ is nil.
      #
      # In auto mode the first available backend wins, in the order
      # Rust, Native, PureRuby.
      #
      # @param host [Object] passed to Native.available? and Native.new
      # @param requested [String, Symbol, nil] backend name or alias (see #normalize_mode)
      # @param random_seed [Object, nil] forwarded to the backend constructor
      # @return [Object] a Rust, Native or PureRuby backend instance
      # @raise [LoadError] when an explicitly requested rust/native backend is unavailable
      # @raise [ArgumentError] when the mode is not a recognised name
      def build(host:, requested: nil, random_seed: nil)
        mode = normalize_mode(requested)

        case mode
        when :auto
          if Rust.available?
            Rust.new(random_seed: random_seed)
          elsif Native.available?(host)
            Native.new(host, random_seed: random_seed)
          else
            PureRuby.new(random_seed: random_seed)
          end
        when :rust
          raise LoadError, "Rust backend is unavailable for this environment" unless Rust.available?

          Rust.new(random_seed: random_seed)
        when :native
          raise LoadError, "Native backend is unavailable for this environment" unless Native.available?(host)

          Native.new(host, random_seed: random_seed)
        when :pure
          PureRuby.new(random_seed: random_seed)
        else
          # normalize_mode hands back the unrecognised raw value, which may have
          # come from LDA_RUBY_BACKEND rather than +requested+. Report that
          # value so the message names the actual offender instead of nil.
          raise ArgumentError, "Unknown backend mode: #{mode.inspect}"
        end
      end

      private

      # Maps a user-supplied backend name onto :auto, :native, :rust or :pure.
      #
      # Matching ignores case and surrounding whitespace; an empty string
      # means auto. A nil +requested+ falls back to LDA_RUBY_BACKEND, and to
      # "auto" when that variable is unset.
      #
      # @return [Symbol, Object] the canonical mode, or the raw value unchanged
      #   when it is not recognised (so the caller can report it)
      def normalize_mode(requested)
        raw_mode = requested || ENV.fetch("LDA_RUBY_BACKEND", "auto")

        case raw_mode.to_s.strip.downcase
        when "", "auto"
          :auto
        when "native", "c"
          :native
        when "rust", "rust_native"
          :rust
        when "pure", "ruby", "pure_ruby"
          :pure
        else
          raw_mode
        end
      end
    end
  end
end