lda-ruby 0.4.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +9 -0
- data/README.md +157 -0
- data/VERSION.yml +5 -0
- data/docs/modernization-handoff.md +190 -0
- data/docs/porting-strategy.md +127 -0
- data/docs/precompiled-platform-policy.md +68 -0
- data/docs/release-runbook.md +157 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +13 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1023 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +31 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/ext/lda-ruby-rust/Cargo.toml +12 -0
- data/ext/lda-ruby-rust/README.md +48 -0
- data/ext/lda-ruby-rust/extconf.rb +123 -0
- data/ext/lda-ruby-rust/src/lib.rs +456 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby/backends/base.rb +129 -0
- data/lib/lda-ruby/backends/native.rb +158 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
- data/lib/lda-ruby/backends/rust.rb +226 -0
- data/lib/lda-ruby/backends.rb +58 -0
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +45 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +40 -0
- data/lib/lda-ruby/document/text_document.rb +39 -0
- data/lib/lda-ruby/lda.so +0 -0
- data/lib/lda-ruby/rust_build_policy.rb +21 -0
- data/lib/lda-ruby/version.rb +5 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/lib/lda-ruby.rb +413 -0
- data/lib/lda_ruby_rust.so +0 -0
- data/license.txt +504 -0
- data/test/backend_compatibility_test.rb +146 -0
- data/test/backends_selection_test.rb +100 -0
- data/test/data/docs.dat +46 -0
- data/test/data/sample.rb +20 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/gemspec_test.rb +27 -0
- data/test/lda_ruby_test.rb +319 -0
- data/test/packaged_gem_smoke_test.rb +33 -0
- data/test/release_scripts_test.rb +54 -0
- data/test/rust_build_policy_test.rb +23 -0
- data/test/simple_pipeline_test.rb +22 -0
- data/test/simple_yaml.rb +17 -0
- data/test/test_helper.rb +10 -0
- metadata +111 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Lda
  module Backends
    # Backend that wraps a PureRuby backend and registers methods calling into
    # ::Lda::RustBackend as that backend's kernels.
    #
    # Settings, corpus handling, EM and model accessors are all delegated to
    # the wrapped PureRuby instance held in @fallback.
    class Rust < Base
      SETTINGS = %i[max_iter convergence em_max_iter em_convergence num_topics init_alpha est_alpha verbose].freeze

      # Reports whether the Rust extension can be used.
      #
      # Requires ::Lda::RUST_EXTENSION_LOADED to be defined and truthy and
      # ::Lda::RustBackend to be defined. When the extension exposes its own
      # +available?+ its answer is returned; otherwise the result is true.
      # Any StandardError raised along the way yields false.
      def self.available?
        return false unless defined?(::Lda::RUST_EXTENSION_LOADED) && ::Lda::RUST_EXTENSION_LOADED
        return false unless defined?(::Lda::RustBackend)

        extension = ::Lda::RustBackend
        extension.respond_to?(:available?) ? extension.available? : true
      rescue StandardError
        false
      end

      # Every setting gets a reader and a writer that forward to @fallback.
      SETTINGS.each do |setting_name|
        writer_name = "#{setting_name}="

        define_method(setting_name) { @fallback.public_send(setting_name) }
        define_method(writer_name) { |value| @fallback.public_send(writer_name, value) }
      end

      # Creates the wrapped PureRuby backend and installs the Rust-backed
      # kernel methods on it.
      #
      # @param random_seed [Object, nil] forwarded to Base and PureRuby
      # @raise [LoadError] when the Rust extension is not available
      def initialize(random_seed: nil)
        super(random_seed: random_seed)
        raise LoadError, "Rust backend is unavailable for this environment" unless self.class.available?

        @fallback = PureRuby.new(random_seed: random_seed)

        # Kernel slot on the PureRuby backend => private method of this class
        # bound into it. Hash order keeps the original assignment order.
        {
          topic_weights_kernel: :rust_topic_weights_for_word,
          topic_term_accumulator_kernel: :rust_accumulate_topic_term_counts,
          document_inference_kernel: :rust_infer_document,
          corpus_iteration_kernel: :rust_infer_corpus_iteration,
          topic_term_finalizer_kernel: :rust_finalize_topic_term_counts,
          gamma_shift_kernel: :rust_average_gamma_shift,
          topic_document_probability_kernel: :rust_topic_document_probability,
          topic_term_seed_kernel: :rust_seeded_topic_term_probabilities
        }.each do |kernel_slot, handler_name|
          @fallback.public_send("#{kernel_slot}=", method(handler_name))
        end

        @fallback.trusted_kernel_outputs = true
      end

      # @return [String] the backend identifier
      def name
        "rust"
      end

      # Stores the corpus locally and hands it to the wrapped backend.
      def corpus=(corpus)
        @corpus = corpus
        @fallback.corpus = corpus
        true
      end

      # Loads a corpus through the wrapped backend, then mirrors the wrapped
      # backend's corpus into @corpus. Returns the wrapped backend's result.
      def fast_load_corpus_from_file(filename)
        @fallback.fast_load_corpus_from_file(filename).tap { @corpus = @fallback.corpus }
      end

      # Loads settings through the wrapped backend, then mirrors the wrapped
      # backend's corpus into @corpus. Returns the wrapped backend's result.
      def load_settings(settings_file)
        @fallback.load_settings(settings_file).tap { @corpus = @fallback.corpus }
      end

      # Forwards the full configuration tuple to the wrapped backend unchanged.
      def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
        @fallback.set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
      end

      # Notifies the extension (best effort) and runs EM on the wrapped backend.
      def em(start)
        rust_before_em(start)
        @fallback.em(start)
      end

      # @return the wrapped backend's beta
      def beta
        @fallback.beta
      end

      # @return the wrapped backend's gamma
      def gamma
        @fallback.gamma
      end

      # @return the wrapped backend's phi computation result
      def compute_phi
        @fallback.compute_phi
      end

      # @return the wrapped backend's model
      def model
        @fallback.model
      end

      private

      # Yields ::Lda::RustBackend when it is defined and responds to
      # +entry_point+, returning the block's value.
      #
      # Returns nil when the extension is missing, lacks the entry point, or
      # when anything inside the block raises a StandardError.
      def with_rust_extension(entry_point)
        return nil unless defined?(::Lda::RustBackend)
        return nil unless ::Lda::RustBackend.respond_to?(entry_point)

        yield ::Lda::RustBackend
      rescue StandardError
        nil
      end

      # Calls the extension's optional +before_em+ hook with the start mode as
      # a string and the corpus dimensions (0 when no corpus is set).
      def rust_before_em(start)
        with_rust_extension(:before_em) do |extension|
          extension.before_em(start.to_s, @corpus&.num_docs.to_i, @corpus&.num_terms.to_i)
        end
      end

      # Kernel: forwards to the extension's +topic_weights_for_word+ after
      # coercing the index to Integer and the probability floor to Float.
      def rust_topic_weights_for_word(beta_probabilities, gamma, word_index, min_probability)
        with_rust_extension(:topic_weights_for_word) do |extension|
          extension.topic_weights_for_word(
            beta_probabilities,
            gamma,
            Integer(word_index),
            Float(min_probability)
          )
        end
      end

      # Kernel: forwards to the extension's +accumulate_topic_term_counts+.
      def rust_accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
        with_rust_extension(:accumulate_topic_term_counts) do |extension|
          extension.accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
        end
      end

      # Kernel: forwards to the extension's +infer_document+ and splits its
      # array result into [first element, remaining elements].
      #
      # Returns nil when the extension result is not a non-empty Array.
      def rust_infer_document(beta_probabilities, gamma_initial, words, counts, max_iter, convergence, min_probability, init_alpha)
        with_rust_extension(:infer_document) do |extension|
          output = extension.infer_document(
            beta_probabilities,
            gamma_initial,
            words,
            counts,
            Integer(max_iter),
            Float(convergence),
            Float(min_probability),
            Float(init_alpha)
          )

          next nil unless output.is_a?(Array) && !output.empty?

          gamma, *phi_rows = output
          [gamma, phi_rows]
        end
      end

      # Kernel: forwards to the extension's +infer_corpus_iteration+ with
      # numeric arguments coerced to Integer/Float.
      def rust_infer_corpus_iteration(beta_probabilities, document_words, document_counts, max_iter, convergence, min_probability, init_alpha)
        with_rust_extension(:infer_corpus_iteration) do |extension|
          extension.infer_corpus_iteration(
            beta_probabilities,
            document_words,
            document_counts,
            Integer(max_iter),
            Float(convergence),
            Float(min_probability),
            Float(init_alpha)
          )
        end
      end

      # Kernel: forwards to the extension's +normalize_topic_term_counts+.
      def rust_finalize_topic_term_counts(topic_term_counts, min_probability)
        with_rust_extension(:normalize_topic_term_counts) do |extension|
          extension.normalize_topic_term_counts(topic_term_counts, Float(min_probability))
        end
      end

      # Kernel: forwards to the extension's +average_gamma_shift+.
      def rust_average_gamma_shift(previous_gamma, current_gamma)
        with_rust_extension(:average_gamma_shift) do |extension|
          extension.average_gamma_shift(previous_gamma, current_gamma)
        end
      end

      # Kernel: forwards to the extension's +topic_document_probability+.
      def rust_topic_document_probability(phi_matrix, document_counts, num_topics, min_probability)
        with_rust_extension(:topic_document_probability) do |extension|
          extension.topic_document_probability(
            phi_matrix,
            document_counts,
            Integer(num_topics),
            Float(min_probability)
          )
        end
      end

      # Kernel: forwards to the extension's +seeded_topic_term_probabilities+.
      def rust_seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)
        with_rust_extension(:seeded_topic_term_probabilities) do |extension|
          extension.seeded_topic_term_probabilities(
            document_words,
            document_counts,
            Integer(topics),
            Integer(terms),
            Float(min_probability)
          )
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lda-ruby/backends/base"
|
|
4
|
+
require "lda-ruby/backends/rust"
|
|
5
|
+
require "lda-ruby/backends/native"
|
|
6
|
+
require "lda-ruby/backends/pure_ruby"
|
|
7
|
+
|
|
8
|
+
module Lda
  # Factory that picks and instantiates an LDA backend (Rust, Native or
  # PureRuby) from an explicit request or the LDA_RUBY_BACKEND environment
  # variable.
  module Backends
    class << self
      # Builds a backend instance for the resolved mode.
      #
      # In :auto mode the first available backend wins, tried in the order
      # Rust, Native, PureRuby.
      #
      # @param host [Object] passed to Native.available? and Native.new
      # @param requested [String, Symbol, nil] backend mode; when nil (or
      #   false) LDA_RUBY_BACKEND is consulted, defaulting to "auto"
      # @param random_seed [Object, nil] forwarded to the backend constructor
      # @return [Object] the backend instance
      # @raise [LoadError] when :rust or :native is requested but unavailable
      # @raise [ArgumentError] when the mode is not recognised
      def build(host:, requested: nil, random_seed: nil)
        mode = normalize_mode(requested)

        case mode
        when :auto
          if Rust.available?
            Rust.new(random_seed: random_seed)
          elsif Native.available?(host)
            Native.new(host, random_seed: random_seed)
          else
            PureRuby.new(random_seed: random_seed)
          end
        when :rust
          raise LoadError, "Rust backend is unavailable for this environment" unless Rust.available?

          Rust.new(random_seed: random_seed)
        when :native
          raise LoadError, "Native backend is unavailable for this environment" unless Native.available?(host)

          Native.new(host, random_seed: random_seed)
        when :pure
          PureRuby.new(random_seed: random_seed)
        else
          # normalize_mode returns the unrecognised raw value untouched, so
          # report that value: it is the explicit request when one was given
          # and the LDA_RUBY_BACKEND content otherwise (previously the latter
          # case was reported as "nil").
          raise ArgumentError, "Unknown backend mode: #{mode.inspect}"
        end
      end

      private

      # Maps a requested mode (or the LDA_RUBY_BACKEND value) to one of
      # :auto, :native, :rust or :pure. Matching ignores case and surrounding
      # whitespace. An unrecognised value is returned unchanged so the caller
      # can report it.
      def normalize_mode(requested)
        raw_mode = requested || ENV.fetch("LDA_RUBY_BACKEND", "auto")

        case raw_mode.to_s.strip.downcase
        when "", "auto"
          :auto
        when "native", "c"
          :native
        when "rust", "rust_native"
          :rust
        when "pure", "ruby", "pure_ruby"
          :pure
        else
          raw_mode
        end
      end
    end
  end
end
|