lda-ruby 0.4.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +9 -0
- data/README.md +157 -0
- data/VERSION.yml +5 -0
- data/docs/modernization-handoff.md +190 -0
- data/docs/porting-strategy.md +127 -0
- data/docs/precompiled-platform-policy.md +68 -0
- data/docs/release-runbook.md +157 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +13 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1023 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +31 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/ext/lda-ruby-rust/Cargo.toml +12 -0
- data/ext/lda-ruby-rust/README.md +48 -0
- data/ext/lda-ruby-rust/extconf.rb +123 -0
- data/ext/lda-ruby-rust/src/lib.rs +456 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby/backends/base.rb +129 -0
- data/lib/lda-ruby/backends/native.rb +158 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
- data/lib/lda-ruby/backends/rust.rb +226 -0
- data/lib/lda-ruby/backends.rb +58 -0
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +45 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +40 -0
- data/lib/lda-ruby/document/text_document.rb +39 -0
- data/lib/lda-ruby/lda.so +0 -0
- data/lib/lda-ruby/rust_build_policy.rb +21 -0
- data/lib/lda-ruby/version.rb +5 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/lib/lda-ruby.rb +413 -0
- data/lib/lda_ruby_rust.so +0 -0
- data/license.txt +504 -0
- data/test/backend_compatibility_test.rb +146 -0
- data/test/backends_selection_test.rb +100 -0
- data/test/data/docs.dat +46 -0
- data/test/data/sample.rb +20 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/gemspec_test.rb +27 -0
- data/test/lda_ruby_test.rb +319 -0
- data/test/packaged_gem_smoke_test.rb +33 -0
- data/test/release_scripts_test.rb +54 -0
- data/test/rust_build_policy_test.rb +23 -0
- data/test/simple_pipeline_test.rb +22 -0
- data/test/simple_yaml.rb +17 -0
- data/test/test_helper.rb +10 -0
- metadata +111 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "rbconfig"
|
|
5
|
+
|
|
6
|
+
require_relative "../../lib/lda-ruby/rust_build_policy"
|
|
7
|
+
|
|
8
|
+
module Lda
  # Build helper driven by ext/lda-ruby-rust/extconf.rb.
  #
  # Resolves the Rust build policy (never / always / auto) via RustBuildPolicy,
  # optionally runs `cargo build --release`, stages the produced cdylib into
  # the gem's lib/ directory, and always writes a no-op Makefile so RubyGems'
  # `make && make install` step succeeds even when the Rust build is skipped.
  module RustExtensionBuild
    module_function

    # Entry point. In "always" mode any failure aborts the install; in "auto"
    # mode failures are downgraded to a warning and the no-op Makefile is
    # still written so gem installation can proceed without the Rust backend.
    def run
      policy = RustBuildPolicy.resolve
      puts("Rust extension build policy: #{policy} (#{RustBuildPolicy::ENV_KEY})")

      case policy
      when RustBuildPolicy::NEVER
        puts("Skipping Rust extension build (policy=#{RustBuildPolicy::NEVER}).")
      when RustBuildPolicy::ALWAYS
        ensure_cargo_available!
        build_and_stage!
      else
        if cargo_available?
          build_and_stage!
        else
          puts("cargo not found; skipping Rust extension build (policy=#{RustBuildPolicy::AUTO}).")
        end
      end

      write_noop_makefile
    rescue StandardError => e
      # `abort` exits the process, so the warning below only runs in auto mode.
      if policy == RustBuildPolicy::ALWAYS
        abort("Rust extension build failed with #{RustBuildPolicy::ENV_KEY}=#{RustBuildPolicy::ALWAYS}: #{e.message}")
      end

      warn("Rust extension build skipped after error in auto mode: #{e.message}")
      write_noop_makefile
    end

    # Aborts the install when cargo is missing but the policy demands a build.
    def ensure_cargo_available!
      return if cargo_available?

      abort("cargo not found in PATH but #{RustBuildPolicy::ENV_KEY}=#{RustBuildPolicy::ALWAYS} was requested.")
    end

    # The cargo executable to invoke; overridable via the CARGO env var.
    # (Previously duplicated inline in cargo_available? and build_and_stage!.)
    def cargo_command
      ENV.fetch("CARGO", "cargo")
    end

    # True when the cargo executable can be run. Coerced to a real boolean:
    # Kernel#system returns nil (not false) when the command cannot be
    # executed at all, which would leak a non-boolean from this predicate.
    def cargo_available?
      !!system(cargo_command, "--version", out: File::NULL, err: File::NULL)
    end

    # Runs `cargo build --release` in the extension directory and copies the
    # resulting cdylib to lib/lda_ruby_rust.<DLEXT>. Raises when the build
    # fails or the expected artifact is missing.
    def build_and_stage!
      Dir.chdir(__dir__) do
        env = rust_build_env
        success =
          if env.empty?
            system(cargo_command, "build", "--release")
          else
            system(env, cargo_command, "build", "--release")
          end
        success or raise "cargo build --release failed"
      end

      source = File.join(__dir__, "target", "release", rust_cdylib_filename)
      raise "Rust extension artifact not found at #{source}" unless File.exist?(source)

      # Ruby loads extensions by DLEXT (e.g. "bundle" on macOS), so the cargo
      # artifact is renamed while being copied into lib/.
      destination = File.expand_path("../../lib/lda_ruby_rust.#{RbConfig::CONFIG.fetch('DLEXT')}", __dir__)
      FileUtils.cp(source, destination)
      puts("Staged Rust extension to #{destination}")
    end

    # Name of the cdylib cargo produces for the current platform.
    def rust_cdylib_filename
      extension =
        case RbConfig::CONFIG.fetch("host_os")
        when /darwin/ then "dylib"
        when /mswin|mingw|cygwin/ then "dll"
        else "so"
        end

      "liblda_ruby_rust.#{extension}"
    end

    # Extra environment for cargo. On macOS the linker needs
    # `-undefined dynamic_lookup` so symbols left unresolved at link time are
    # looked up when the extension is loaded; the flag is merged into any
    # RUSTFLAGS the user already set (and not duplicated if already present).
    def rust_build_env
      return {} unless RbConfig::CONFIG.fetch("host_os").match?(/darwin/)

      dynamic_lookup_flag = "-C link-arg=-Wl,-undefined,dynamic_lookup"
      existing = ENV.fetch("RUSTFLAGS", "")
      merged =
        if existing.include?(dynamic_lookup_flag)
          existing
        else
          [existing, dynamic_lookup_flag].reject(&:empty?).join(" ")
        end

      { "RUSTFLAGS" => merged }
    end

    # Writes a Makefile whose targets all succeed without doing anything, so
    # RubyGems' make step is a no-op for this extension.
    def write_noop_makefile
      File.write(
        File.join(__dir__, "Makefile"),
        <<~MAKEFILE
          all:
          \t@echo "Rust extension handled by extconf.rb"

          install:
          \t@echo "Rust extension handled by extconf.rb"

          clean:
          \t@true

          distclean: clean
        MAKEFILE
      )
    end
  end
end

Lda::RustExtensionBuild.run
|
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
use magnus::{define_module, function, Error, Module, Object};
|
|
2
|
+
|
|
3
|
+
/// Reports whether the Rust backend is usable. Always `true` here: if this
/// function can be called at all, the extension loaded successfully.
fn available() -> bool {
    true
}
|
|
6
|
+
|
|
7
|
+
/// Version number of the Ruby<->Rust calling contract exposed by this
/// extension. Presumably checked by the Ruby-side backend wrapper before it
/// trusts these method signatures — confirm against lib/lda-ruby/backends/rust.rb.
fn abi_version() -> i64 {
    1
}
|
|
10
|
+
|
|
11
|
+
/// Hook invoked before an EM run starts. Currently a no-op that ignores its
/// arguments and reports success; kept so the Ruby side can call it
/// unconditionally across backends.
fn before_em(_start: String, _num_docs: i64, _num_terms: i64) -> bool {
    true
}
|
|
14
|
+
|
|
15
|
+
/// Sanitize `min_probability` into a usable positive floor.
///
/// Returns the value unchanged when it is finite and strictly positive;
/// otherwise (zero, negative, NaN, infinite) falls back to `1.0e-12`.
fn floor_value(min_probability: f64) -> f64 {
    let usable = min_probability.is_finite() && min_probability > 0.0;
    if usable { min_probability } else { 1.0e-12 }
}
|
|
22
|
+
|
|
23
|
+
/// Normalize `weights` in place so they sum to 1.0.
///
/// When the sum is non-finite or non-positive, the slice is instead filled
/// with a uniform distribution (an empty slice is left untouched).
fn normalize_in_place(weights: &mut [f64]) {
    let total: f64 = weights.iter().sum();

    if total.is_finite() && total > 0.0 {
        weights.iter_mut().for_each(|w| *w /= total);
        return;
    }

    let len = weights.len();
    let uniform = if len == 0 { 0.0 } else { 1.0 / len as f64 };
    weights.iter_mut().for_each(|w| *w = uniform);
}
|
|
42
|
+
|
|
43
|
+
fn compute_topic_weights(
|
|
44
|
+
beta_probabilities: &[Vec<f64>],
|
|
45
|
+
gamma: &[f64],
|
|
46
|
+
word_index: usize,
|
|
47
|
+
floor: f64,
|
|
48
|
+
) -> Vec<f64> {
|
|
49
|
+
let topics = gamma.len().min(beta_probabilities.len());
|
|
50
|
+
if topics == 0 {
|
|
51
|
+
return Vec::new();
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
let mut weights = Vec::with_capacity(topics);
|
|
55
|
+
for topic_index in 0..topics {
|
|
56
|
+
let beta_value = beta_probabilities[topic_index]
|
|
57
|
+
.get(word_index)
|
|
58
|
+
.copied()
|
|
59
|
+
.unwrap_or(floor)
|
|
60
|
+
.max(floor);
|
|
61
|
+
let gamma_value = gamma[topic_index].max(floor);
|
|
62
|
+
weights.push(beta_value * gamma_value);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
normalize_in_place(&mut weights);
|
|
66
|
+
weights
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
fn topic_weights_for_word(
|
|
70
|
+
beta_probabilities: Vec<Vec<f64>>,
|
|
71
|
+
gamma: Vec<f64>,
|
|
72
|
+
word_index: usize,
|
|
73
|
+
min_probability: f64,
|
|
74
|
+
) -> Vec<f64> {
|
|
75
|
+
let floor = floor_value(min_probability);
|
|
76
|
+
compute_topic_weights(&beta_probabilities, &gamma, word_index, floor)
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/// Add one document's expected counts (count * phi) into `topic_term_counts`.
///
/// Words with a zero/missing count, a missing phi row, or a term index beyond
/// a topic row's length are silently skipped rather than treated as errors.
fn accumulate_topic_term_counts_in_place(
    topic_term_counts: &mut [Vec<f64>],
    phi_d: &[Vec<f64>],
    words: &[usize],
    counts: &[f64],
) {
    if topic_term_counts.is_empty() {
        return;
    }

    for (offset, &term) in words.iter().enumerate() {
        let weight = counts.get(offset).copied().unwrap_or(0.0);
        if weight == 0.0 {
            continue;
        }

        let phi_row = match phi_d.get(offset) {
            Some(row) => row,
            None => continue,
        };

        for (topic, row) in topic_term_counts.iter_mut().enumerate() {
            // Missing phi entries contribute zero; out-of-range terms are skipped.
            if let Some(cell) = row.get_mut(term) {
                *cell += weight * phi_row.get(topic).copied().unwrap_or(0.0);
            }
        }
    }
}
|
|
110
|
+
|
|
111
|
+
fn accumulate_topic_term_counts(
|
|
112
|
+
mut topic_term_counts: Vec<Vec<f64>>,
|
|
113
|
+
phi_d: Vec<Vec<f64>>,
|
|
114
|
+
words: Vec<usize>,
|
|
115
|
+
counts: Vec<f64>,
|
|
116
|
+
) -> Vec<Vec<f64>> {
|
|
117
|
+
accumulate_topic_term_counts_in_place(
|
|
118
|
+
topic_term_counts.as_mut_slice(),
|
|
119
|
+
phi_d.as_slice(),
|
|
120
|
+
words.as_slice(),
|
|
121
|
+
counts.as_slice(),
|
|
122
|
+
);
|
|
123
|
+
topic_term_counts
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
fn normalize_topic_term_counts(
|
|
127
|
+
topic_term_counts: Vec<Vec<f64>>,
|
|
128
|
+
min_probability: f64,
|
|
129
|
+
) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
|
|
130
|
+
let floor = floor_value(min_probability);
|
|
131
|
+
|
|
132
|
+
let mut beta_probabilities = Vec::with_capacity(topic_term_counts.len());
|
|
133
|
+
let mut beta_log = Vec::with_capacity(topic_term_counts.len());
|
|
134
|
+
|
|
135
|
+
for topic_counts in topic_term_counts.iter() {
|
|
136
|
+
let mut normalized = topic_counts
|
|
137
|
+
.iter()
|
|
138
|
+
.map(|value| {
|
|
139
|
+
if value.is_finite() {
|
|
140
|
+
value.max(floor)
|
|
141
|
+
} else {
|
|
142
|
+
floor
|
|
143
|
+
}
|
|
144
|
+
})
|
|
145
|
+
.collect::<Vec<_>>();
|
|
146
|
+
|
|
147
|
+
normalize_in_place(&mut normalized);
|
|
148
|
+
|
|
149
|
+
let topic_log = normalized
|
|
150
|
+
.iter()
|
|
151
|
+
.map(|value| value.max(floor).ln())
|
|
152
|
+
.collect::<Vec<_>>();
|
|
153
|
+
|
|
154
|
+
beta_probabilities.push(normalized);
|
|
155
|
+
beta_log.push(topic_log);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
(beta_probabilities, beta_log)
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/// Mean absolute change between two gamma matrices.
///
/// Iterates every cell of `previous_gamma`; a cell missing from
/// `current_gamma` contributes zero shift (it defaults to the previous
/// value). Returns 0.0 when `previous_gamma` has no cells at all.
fn average_gamma_shift(previous_gamma: Vec<Vec<f64>>, current_gamma: Vec<Vec<f64>>) -> f64 {
    let mut total_shift = 0.0_f64;
    let mut cells = 0_usize;

    for (row_idx, prev_row) in previous_gamma.iter().enumerate() {
        for (col_idx, &prev) in prev_row.iter().enumerate() {
            let curr = current_gamma
                .get(row_idx)
                .and_then(|row| row.get(col_idx))
                .copied()
                .unwrap_or(prev);
            total_shift += (prev - curr).abs();
            cells += 1;
        }
    }

    match cells {
        0 => 0.0,
        n => total_shift / n as f64,
    }
}
|
|
185
|
+
|
|
186
|
+
fn topic_document_probability(
|
|
187
|
+
phi_tensor: Vec<Vec<Vec<f64>>>,
|
|
188
|
+
document_counts: Vec<Vec<f64>>,
|
|
189
|
+
num_topics: usize,
|
|
190
|
+
min_probability: f64,
|
|
191
|
+
) -> Vec<Vec<f64>> {
|
|
192
|
+
let floor = floor_value(min_probability);
|
|
193
|
+
let mut output = Vec::with_capacity(document_counts.len());
|
|
194
|
+
|
|
195
|
+
for (doc_index, counts) in document_counts.iter().enumerate() {
|
|
196
|
+
let mut tops = vec![0.0_f64; num_topics];
|
|
197
|
+
let ttl: f64 = counts.iter().copied().sum();
|
|
198
|
+
|
|
199
|
+
if let Some(doc_phi) = phi_tensor.get(doc_index) {
|
|
200
|
+
for (word_index, word_dist) in doc_phi.iter().enumerate() {
|
|
201
|
+
let count = counts.get(word_index).copied().unwrap_or(0.0);
|
|
202
|
+
if count == 0.0 {
|
|
203
|
+
continue;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
for topic_index in 0..num_topics {
|
|
207
|
+
let top_prob = word_dist.get(topic_index).copied().unwrap_or(floor).max(floor);
|
|
208
|
+
tops[topic_index] += top_prob.ln() * count;
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
if ttl.is_finite() && ttl > 0.0 {
|
|
214
|
+
for value in tops.iter_mut() {
|
|
215
|
+
*value /= ttl;
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
output.push(tops);
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
output
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
fn seeded_topic_term_probabilities(
|
|
226
|
+
document_words: Vec<Vec<usize>>,
|
|
227
|
+
document_counts: Vec<Vec<f64>>,
|
|
228
|
+
topics: usize,
|
|
229
|
+
terms: usize,
|
|
230
|
+
min_probability: f64,
|
|
231
|
+
) -> Vec<Vec<f64>> {
|
|
232
|
+
if topics == 0 || terms == 0 {
|
|
233
|
+
return Vec::new();
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
let floor = floor_value(min_probability);
|
|
237
|
+
let mut topic_term_counts = vec![vec![floor; terms]; topics];
|
|
238
|
+
|
|
239
|
+
for (doc_index, words) in document_words.iter().enumerate() {
|
|
240
|
+
let topic_index = doc_index % topics;
|
|
241
|
+
let counts = document_counts.get(doc_index);
|
|
242
|
+
|
|
243
|
+
for (word_offset, &word_index) in words.iter().enumerate() {
|
|
244
|
+
if word_index >= terms {
|
|
245
|
+
continue;
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
let count = counts
|
|
249
|
+
.and_then(|row| row.get(word_offset))
|
|
250
|
+
.copied()
|
|
251
|
+
.unwrap_or(0.0);
|
|
252
|
+
if !count.is_finite() || count == 0.0 {
|
|
253
|
+
continue;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
topic_term_counts[topic_index][word_index] += count;
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
for row in topic_term_counts.iter_mut() {
|
|
261
|
+
normalize_in_place(row);
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
topic_term_counts
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
/// Variational inference loop for a single document.
///
/// Repeatedly recomputes per-word topic distributions (phi) from the current
/// gamma, then rebuilds gamma from the phi-weighted word counts, until the
/// largest per-topic gamma change drops to `convergence` or `max_iter`
/// iterations have run. Non-finite tuning parameters are replaced with
/// defaults (alpha 0.3, convergence 1e-6); `max_iter <= 0` runs one pass.
///
/// Returns `(gamma, phi)` where `phi` has one row per word in `words`.
fn infer_document_internal(
    beta_probabilities: &[Vec<f64>],
    gamma_initial: &[f64],
    words: &[usize],
    counts: &[f64],
    max_iter: i64,
    convergence: f64,
    min_probability: f64,
    init_alpha: f64,
) -> (Vec<f64>, Vec<Vec<f64>>) {
    // Only topics present in BOTH gamma and beta participate.
    let topics = gamma_initial.len().min(beta_probabilities.len());
    if topics == 0 {
        return (Vec::new(), Vec::new());
    }

    let floor = floor_value(min_probability);
    let init_alpha_value = if init_alpha.is_finite() {
        init_alpha
    } else {
        0.3
    };
    let convergence_value = if convergence.is_finite() && convergence >= 0.0 {
        convergence
    } else {
        1.0e-6
    };
    let max_iter_value = if max_iter <= 0 { 1 } else { max_iter as usize };

    let mut gamma_d = gamma_initial.iter().copied().take(topics).collect::<Vec<_>>();
    // Defensive: take(topics) already bounds the length, so this resize
    // appears unreachable, but it guards against future changes to `topics`.
    if gamma_d.len() < topics {
        gamma_d.resize(topics, init_alpha_value);
    }

    // phi starts uniform over topics for every word.
    let mut phi_d = vec![vec![1.0 / topics as f64; topics]; words.len()];

    for _ in 0..max_iter_value {
        // gamma_next starts at the prior alpha; word evidence is added below.
        let mut gamma_next = vec![init_alpha_value; topics];

        for (word_offset, &word_index) in words.iter().enumerate() {
            // phi for this word is computed from the PREVIOUS iteration's
            // gamma (gamma_d is only replaced after the full pass).
            let topic_weights = compute_topic_weights(beta_probabilities, &gamma_d, word_index, floor);
            phi_d[word_offset] = topic_weights.clone();

            let count = counts.get(word_offset).copied().unwrap_or(0.0);
            if count == 0.0 {
                continue;
            }

            for topic_index in 0..topics {
                gamma_next[topic_index] += count * topic_weights[topic_index];
            }
        }

        // Convergence uses the MAX per-topic change, not the mean.
        let mut gamma_shift = 0.0_f64;
        for topic_index in 0..topics {
            let delta = (gamma_d[topic_index] - gamma_next[topic_index]).abs();
            if delta > gamma_shift {
                gamma_shift = delta;
            }
        }

        gamma_d = gamma_next;
        if gamma_shift <= convergence_value {
            break;
        }
    }

    (gamma_d, phi_d)
}
|
|
335
|
+
|
|
336
|
+
fn infer_document(
|
|
337
|
+
beta_probabilities: Vec<Vec<f64>>,
|
|
338
|
+
gamma_initial: Vec<f64>,
|
|
339
|
+
words: Vec<usize>,
|
|
340
|
+
counts: Vec<f64>,
|
|
341
|
+
max_iter: i64,
|
|
342
|
+
convergence: f64,
|
|
343
|
+
min_probability: f64,
|
|
344
|
+
init_alpha: f64,
|
|
345
|
+
) -> Vec<Vec<f64>> {
|
|
346
|
+
let (gamma_d, phi_d) = infer_document_internal(
|
|
347
|
+
beta_probabilities.as_slice(),
|
|
348
|
+
gamma_initial.as_slice(),
|
|
349
|
+
words.as_slice(),
|
|
350
|
+
counts.as_slice(),
|
|
351
|
+
max_iter,
|
|
352
|
+
convergence,
|
|
353
|
+
min_probability,
|
|
354
|
+
init_alpha,
|
|
355
|
+
);
|
|
356
|
+
|
|
357
|
+
let mut output = Vec::with_capacity(phi_d.len() + 1);
|
|
358
|
+
output.push(gamma_d);
|
|
359
|
+
output.extend(phi_d);
|
|
360
|
+
output
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
/// One E-step pass over the whole corpus.
///
/// For each document: builds a flat initial gamma of
/// `init_alpha + total_count / topics`, runs `infer_document_internal`, and
/// accumulates the document's expected counts into a shared topic-term
/// matrix (initialized at the floor). Documents without a counts row are
/// treated as all-zero counts.
///
/// Returns `(gamma_matrix, phi_tensor, topic_term_counts)`, one
/// gamma row and one phi matrix per document. Empty beta yields all-empty
/// results.
fn infer_corpus_iteration(
    beta_probabilities: Vec<Vec<f64>>,
    document_words: Vec<Vec<usize>>,
    document_counts: Vec<Vec<f64>>,
    max_iter: i64,
    convergence: f64,
    min_probability: f64,
    init_alpha: f64,
) -> (Vec<Vec<f64>>, Vec<Vec<Vec<f64>>>, Vec<Vec<f64>>) {
    let topics = beta_probabilities.len();
    if topics == 0 {
        return (Vec::new(), Vec::new(), Vec::new());
    }

    // Vocabulary size is taken from the widest beta row.
    let terms = beta_probabilities
        .iter()
        .map(|row| row.len())
        .max()
        .unwrap_or(0);
    let floor = floor_value(min_probability);
    let init_alpha_value = if init_alpha.is_finite() { init_alpha } else { 0.3 };

    // Seed all counts at the floor so no term ends up with zero mass.
    let mut topic_term_counts = vec![vec![floor; terms]; topics];
    let mut gamma_matrix = Vec::with_capacity(document_words.len());
    let mut phi_tensor = Vec::with_capacity(document_words.len());

    for (doc_index, words) in document_words.iter().enumerate() {
        let counts = document_counts.get(doc_index).cloned().unwrap_or_else(|| vec![0.0; words.len()]);
        let total: f64 = counts.iter().sum();
        let gamma_initial = vec![init_alpha_value + (total / topics as f64); topics];

        let (gamma_d, phi_d) = infer_document_internal(
            beta_probabilities.as_slice(),
            gamma_initial.as_slice(),
            words.as_slice(),
            counts.as_slice(),
            max_iter,
            convergence,
            min_probability,
            init_alpha,
        );

        accumulate_topic_term_counts_in_place(
            topic_term_counts.as_mut_slice(),
            phi_d.as_slice(),
            words.as_slice(),
            counts.as_slice(),
        );

        gamma_matrix.push(gamma_d);
        phi_tensor.push(phi_d);
    }

    (gamma_matrix, phi_tensor, topic_term_counts)
}
|
|
418
|
+
|
|
419
|
+
// Extension entry point, run by magnus when Ruby requires lda_ruby_rust.
// Defines Lda::RustBackend and registers every exported function as a
// singleton method. The method names and arities here are the Ruby-visible
// API, so they must stay in sync with the Ruby-side backend wrapper.
#[magnus::init]
fn init() -> Result<(), Error> {
    let lda_module = define_module("Lda")?;
    let rust_backend_module = lda_module.define_module("RustBackend")?;

    rust_backend_module.define_singleton_method("available?", function!(available, 0))?;
    rust_backend_module.define_singleton_method("abi_version", function!(abi_version, 0))?;
    rust_backend_module.define_singleton_method("before_em", function!(before_em, 3))?;
    rust_backend_module.define_singleton_method(
        "topic_weights_for_word",
        function!(topic_weights_for_word, 4),
    )?;
    rust_backend_module.define_singleton_method(
        "accumulate_topic_term_counts",
        function!(accumulate_topic_term_counts, 4),
    )?;
    rust_backend_module.define_singleton_method("infer_document", function!(infer_document, 8))?;
    rust_backend_module.define_singleton_method(
        "infer_corpus_iteration",
        function!(infer_corpus_iteration, 7),
    )?;
    rust_backend_module.define_singleton_method(
        "normalize_topic_term_counts",
        function!(normalize_topic_term_counts, 2),
    )?;
    rust_backend_module
        .define_singleton_method("average_gamma_shift", function!(average_gamma_shift, 2))?;
    rust_backend_module.define_singleton_method(
        "topic_document_probability",
        function!(topic_document_probability, 4),
    )?;
    rust_backend_module.define_singleton_method(
        "seeded_topic_term_probabilities",
        function!(seeded_topic_term_probabilities, 5),
    )?;

    Ok(())
}
|
data/lda-ruby.gemspec
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true

require_relative "lib/lda-ruby/version"

# The gem is built in one of two variants, selected at build time:
#   "source"      - ships C/Rust sources; extensions compile at install time.
#   "precompiled" - ships prebuilt binaries for one platform; no extensions.
variant = ENV.fetch("LDA_RUBY_GEM_VARIANT", "source")
valid_variants = %w[source precompiled].freeze
unless valid_variants.include?(variant)
  raise ArgumentError, "Unsupported LDA_RUBY_GEM_VARIANT=#{variant.inspect}. Expected one of: #{valid_variants.join(', ')}"
end

precompiled_variant = variant == "precompiled"

Gem::Specification.new do |spec|
  spec.name = "lda-ruby"
  spec.version = Lda::VERSION
  spec.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
  spec.email = ["jasonmadams@gmail.com"]

  spec.summary = "Ruby implementation of Latent Dirichlet Allocation (LDA)."
  spec.description = "Ruby wrapper and toolkit for Latent Dirichlet Allocation based on the original lda-c implementation by David M. Blei."
  spec.homepage = "https://github.com/ealdent/lda-ruby"
  spec.license = "GPL-2.0-or-later"
  spec.required_ruby_version = ">= 3.2"

  # The variant is recorded in metadata so tooling can tell the two apart.
  spec.metadata = {
    "homepage_uri" => spec.homepage,
    "source_code_uri" => spec.homepage,
    "changelog_uri" => "#{spec.homepage}/blob/master/CHANGELOG.md",
    "lda_ruby_gem_variant" => variant
  }

  if precompiled_variant
    # Target platform defaults to the build machine; LDA_RUBY_GEM_PLATFORM
    # overrides it (e.g. for cross-packaging).
    platform_override = ENV.fetch("LDA_RUBY_GEM_PLATFORM", "").strip
    platform_value = platform_override.empty? ? Gem::Platform.local.to_s : platform_override

    spec.platform = Gem::Platform.new(platform_value)
    spec.metadata["lda_ruby_platform"] = spec.platform.to_s
    # Precompiled gems ship binaries, so no install-time extension build.
    spec.extensions = []
  else
    spec.extensions = ["ext/lda-ruby/extconf.rb", "ext/lda-ruby-rust/extconf.rb"]
  end

  spec.require_paths = ["lib"]

  included = %w[CHANGELOG.md Gemfile README.md VERSION.yml lda-ruby.gemspec license.txt]
  included += Dir.glob("docs/**/*")
  included += Dir.glob("ext/**/*")
  included += Dir.glob("lib/**/*")
  included += Dir.glob("test/**/*")
  # The only compiled artifacts a precompiled gem may ship: the staged C and
  # Rust extensions directly under lib/.
  allowed_precompiled_binary_patterns = [
    %r{\Alib/lda-ruby/lda\.(so|bundle|dylib|dll)\z},
    %r{\Alib/lda_ruby_rust\.(so|bundle|dylib|dll)\z}
  ]

  # Filter build residue out of the file list: cargo output, lockfile,
  # compiled objects (unless explicitly allowed above), and mkmf leftovers.
  spec.files = included
    .reject { |path| File.directory?(path) }
    .reject { |path| path.start_with?("ext/lda-ruby-rust/target/") }
    .reject { |path| path == "ext/lda-ruby-rust/Cargo.lock" }
    .reject do |path|
      next false if precompiled_variant && allowed_precompiled_binary_patterns.any? { |pattern| pattern.match?(path) }

      path.end_with?(".o", ".so", ".bundle", ".dylib", ".dll", ".rlib", ".rmeta")
    end
    .reject do |path|
      ["Makefile", "ext/lda-ruby/Makefile", "ext/lda-ruby/mkmf.log", "ext/lda-ruby-rust/Makefile"].include?(path)
    end
    .uniq
    .sort

  # A precompiled gem without its binaries would be broken on install — fail
  # the build instead of shipping it.
  if precompiled_variant
    missing_binaries = allowed_precompiled_binary_patterns.reject do |pattern|
      spec.files.any? { |path| pattern.match?(path) }
    end
    unless missing_binaries.empty?
      raise "Precompiled variant requires staged binaries under lib/: #{missing_binaries.map(&:source).join(', ')}"
    end
  end
end
|