lda-ruby 0.5.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +69 -0
- data/Gemfile +9 -0
- data/README.md +160 -0
- data/VERSION.yml +5 -0
- data/docs/modernization-handoff.md +233 -0
- data/docs/porting-strategy.md +148 -0
- data/docs/precompiled-platform-policy.md +81 -0
- data/docs/precompiled-target-evaluation.md +67 -0
- data/docs/release-runbook.md +192 -0
- data/docs/rust-orchestration-guardrails.md +50 -0
- data/ext/lda-ruby/cokus.c +144 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +13 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1023 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +31 -0
- data/ext/lda-ruby/lda-x64-mingw-ucrt.def +2 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +119 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/ext/lda-ruby-rust/Cargo.toml +12 -0
- data/ext/lda-ruby-rust/README.md +73 -0
- data/ext/lda-ruby-rust/extconf.rb +135 -0
- data/ext/lda-ruby-rust/include/strings.h +35 -0
- data/ext/lda-ruby-rust/src/lib.rs +1263 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby/backends/base.rb +133 -0
- data/lib/lda-ruby/backends/native.rb +158 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +675 -0
- data/lib/lda-ruby/backends/rust.rb +607 -0
- data/lib/lda-ruby/backends.rb +58 -0
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +45 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +40 -0
- data/lib/lda-ruby/document/text_document.rb +39 -0
- data/lib/lda-ruby/lda.so +0 -0
- data/lib/lda-ruby/rust_build_policy.rb +21 -0
- data/lib/lda-ruby/version.rb +5 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/lib/lda-ruby.rb +413 -0
- data/lib/lda_ruby_rust.so +0 -0
- data/license.txt +504 -0
- data/test/backend_compatibility_test.rb +146 -0
- data/test/backends_selection_test.rb +100 -0
- data/test/benchmark_scripts_test.rb +23 -0
- data/test/data/docs.dat +46 -0
- data/test/data/sample.rb +20 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/gemspec_test.rb +27 -0
- data/test/lda_ruby_test.rb +319 -0
- data/test/packaged_gem_smoke_test.rb +33 -0
- data/test/pure_ruby_orchestration_test.rb +109 -0
- data/test/release_scripts_test.rb +93 -0
- data/test/rust_build_policy_test.rb +23 -0
- data/test/rust_orchestration_test.rb +911 -0
- data/test/simple_pipeline_test.rb +22 -0
- data/test/simple_yaml.rb +17 -0
- data/test/test_helper.rb +10 -0
- metadata +118 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Experimental Rust Extension Scaffold
|
|
2
|
+
|
|
3
|
+
This directory contains an experimental Rust extension scaffold built with `magnus`.
|
|
4
|
+
|
|
5
|
+
Current scope:
|
|
6
|
+
|
|
7
|
+
- Defines the `Lda::RustBackend` module in Ruby.
|
|
8
|
+
- Exposes capability hooks:
|
|
9
|
+
- `Lda::RustBackend.available?`
|
|
10
|
+
- `Lda::RustBackend.abi_version`
|
|
11
|
+
- `Lda::RustBackend.corpus_session_count`
|
|
12
|
+
- `Lda::RustBackend.corpus_session_exists(session_id)`
|
|
13
|
+
- `Lda::RustBackend.before_em(start, num_docs, num_terms)`
|
|
14
|
+
- `Lda::RustBackend.topic_weights_for_word(beta, gamma, word_index, min_probability)`
|
|
15
|
+
- `Lda::RustBackend.accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)`
|
|
16
|
+
- `Lda::RustBackend.infer_document(beta, gamma_initial, words, counts, max_iter, convergence, min_probability, init_alpha)`
|
|
17
|
+
- `Lda::RustBackend.infer_corpus_iteration(beta, document_words, document_counts, max_iter, convergence, min_probability, init_alpha)`
|
|
18
|
+
- `Lda::RustBackend.normalize_topic_term_counts(topic_term_counts, min_probability)`
|
|
19
|
+
- `Lda::RustBackend.average_gamma_shift(previous_gamma, current_gamma)`
|
|
20
|
+
- `Lda::RustBackend.topic_document_probability(phi_tensor, document_counts, num_topics, min_probability)`
|
|
21
|
+
- `Lda::RustBackend.seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)`
|
|
22
|
+
- `Lda::RustBackend.random_topic_term_probabilities(topics, terms, min_probability, random_seed)`
|
|
23
|
+
- `Lda::RustBackend.create_corpus_session(document_words, document_counts, terms)`
|
|
24
|
+
- `Lda::RustBackend.replace_corpus_session(session_id, document_words, document_counts, terms)`
|
|
25
|
+
- `Lda::RustBackend.drop_corpus_session(session_id)`
|
|
26
|
+
- `Lda::RustBackend.configure_corpus_session(session_id, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
|
|
27
|
+
- `Lda::RustBackend.run_em(initial_beta, document_words, document_counts, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
|
|
28
|
+
- `Lda::RustBackend.run_em_with_start(start, document_words, document_counts, topics, terms, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
|
|
29
|
+
- `Lda::RustBackend.run_em_with_start_seed(start, document_words, document_counts, topics, terms, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
30
|
+
- `Lda::RustBackend.run_em_on_session(session_id, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
31
|
+
- `Lda::RustBackend.run_em_on_session_start(session_id, start, random_seed)`
|
|
32
|
+
- `Lda::RustBackend.run_em_on_session_with_start_seed(session_id, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
33
|
+
- `Lda::RustBackend.run_em_on_session_with_corpus(session_id, document_words, document_counts, terms, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
34
|
+
|
|
35
|
+
Hot-path kernels currently executed in Rust when `backend: :rust` is active:
|
|
36
|
+
- topic weights for a word across topics
|
|
37
|
+
- topic-term count accumulation from per-document `phi`
|
|
38
|
+
- full per-document inference loop (batched inner EM updates)
|
|
39
|
+
- full per-iteration corpus inference (batched document processing)
|
|
40
|
+
- topic-term normalization and log-probability finalization for EM beta updates
|
|
41
|
+
- gamma convergence shift reduction between EM iterations
|
|
42
|
+
- topic-document average log-probability computation
|
|
43
|
+
- seeded topic-term initialization
|
|
44
|
+
- random topic-term initialization with explicit seed control
|
|
45
|
+
- EM outer-loop orchestration with convergence checks (`run_em`)
|
|
46
|
+
- start-aware deterministic EM orchestration (`run_em_with_start` for `seeded`/`deterministic`)
|
|
47
|
+
- start-aware seeded and random EM orchestration with explicit seed control (`run_em_with_start_seed`)
|
|
48
|
+
- unified session-settings orchestration (`run_em_on_session`) that applies settings and executes EM in one call
|
|
49
|
+
- session-based EM orchestration against Rust-managed corpus lifecycle (`create_corpus_session` + `run_em_on_session_with_start_seed`)
|
|
50
|
+
- settings-aware session orchestration (`configure_corpus_session` + `run_em_on_session_start`)
|
|
51
|
+
- managed corpus orchestration (`run_em_on_session_with_corpus`) that can recreate missing sessions and, if session-backed execution cannot be used, falls back internally to direct start-aware execution inside Rust
|
|
52
|
+
- `Lda::Backends::Rust` prefers `run_em_on_session_with_corpus` whenever a cached Rust corpus snapshot is available, even if no session id is currently cached locally
|
|
53
|
+
- direct and legacy beta-input compatibility fallbacks both reuse the backend's cached Rust corpus snapshot instead of rebuilding corpus arrays in Ruby
|
|
54
|
+
- unknown EM start modes in seed-aware orchestration follow Ruby's non-seeded fallback behavior (seeded by explicit `random_seed`)
|
|
55
|
+
|
|
56
|
+
The remaining numeric LDA kernels are still provided by the pure Ruby backend and will move to Rust incrementally.
|
|
57
|
+
|
|
58
|
+
## Local build (optional)
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
cd ext/lda-ruby-rust
|
|
62
|
+
cargo build --release
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Then run Ruby with the built library on the load path so that `require "lda_ruby_rust"` succeeds.
|
|
66
|
+
|
|
67
|
+
## Install-time policy
|
|
68
|
+
|
|
69
|
+
During source gem installs, `ext/lda-ruby-rust/extconf.rb` can optionally build this extension.
|
|
70
|
+
|
|
71
|
+
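- `LDA_RUBY_RUST_BUILD=auto` (default): build when `cargo` is available; if the build fails, the Rust extension is skipped with a warning instead of failing the installation.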
|
|
72
|
+
- `LDA_RUBY_RUST_BUILD=always`: require a successful Rust build or fail installation.
|
|
73
|
+
- `LDA_RUBY_RUST_BUILD=never`: always skip Rust build.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "rbconfig"
|
|
5
|
+
|
|
6
|
+
require_relative "../../lib/lda-ruby/rust_build_policy"
|
|
7
|
+
|
|
8
|
+
module Lda
  # Install-time driver for the optional Rust extension.
  #
  # Depending on the resolved RustBuildPolicy it either skips the Rust build,
  # attempts it opportunistically, or requires it to succeed. It always leaves
  # behind a no-op Makefile so that the `make` / `make install` steps that
  # follow an extconf.rb run have something to execute.
  module RustExtensionBuild
    module_function

    # Entry point: resolves the build policy, builds and stages the extension
    # when the policy asks for it, and writes the no-op Makefile.
    #
    # With the ALWAYS policy any failure aborts the process; otherwise the
    # error is reported with a warning and the build is skipped.
    #
    # @return [void]
    def run
      policy = RustBuildPolicy.resolve
      puts("Rust extension build policy: #{policy} (#{RustBuildPolicy::ENV_KEY})")

      case policy
      when RustBuildPolicy::NEVER
        puts("Skipping Rust extension build (policy=#{RustBuildPolicy::NEVER}).")
      when RustBuildPolicy::ALWAYS
        ensure_cargo_available!
        build_and_stage!
      else
        if cargo_available?
          build_and_stage!
        else
          puts("cargo not found; skipping Rust extension build (policy=#{RustBuildPolicy::AUTO}).")
        end
      end

      write_noop_makefile
    rescue StandardError => e
      if policy == RustBuildPolicy::ALWAYS
        abort("Rust extension build failed with #{RustBuildPolicy::ENV_KEY}=#{RustBuildPolicy::ALWAYS}: #{e.message}")
      end

      warn("Rust extension build skipped after error in auto mode: #{e.message}")
      write_noop_makefile
    end

    # Aborts the process unless the cargo executable can be run.
    #
    # @return [nil] when cargo is available
    def ensure_cargo_available!
      return if cargo_available?

      abort("cargo not found in PATH but #{RustBuildPolicy::ENV_KEY}=#{RustBuildPolicy::ALWAYS} was requested.")
    end

    # Name or path of the cargo executable; the CARGO environment variable
    # overrides the default of "cargo".
    #
    # @return [String]
    def cargo_command
      ENV.fetch("CARGO", "cargo")
    end

    # Whether `cargo --version` runs successfully (output is discarded).
    #
    # Kernel#system returns nil when the command cannot be executed at all;
    # that is normalised to false so the predicate always yields a boolean.
    #
    # @return [Boolean]
    def cargo_available?
      system(cargo_command, "--version", out: File::NULL, err: File::NULL) ? true : false
    end

    # Runs `cargo build --release` inside this directory and copies the
    # resulting shared library to lib/lda_ruby_rust.<DLEXT>.
    #
    # @raise [RuntimeError] when cargo fails or no build artifact is found
    # @return [void]
    def build_and_stage!
      Dir.chdir(__dir__) do
        # Kernel#system accepts an empty environment hash, so the result of
        # rust_build_env can be passed through without branching.
        built = system(rust_build_env, cargo_command, "build", "--release")
        raise "cargo build --release failed" unless built
      end

      source = rust_cdylib_source
      raise "Rust extension artifact not found at #{rust_cdylib_candidates.join(', ')}" unless source

      destination = File.expand_path("../../lib/lda_ruby_rust.#{RbConfig::CONFIG.fetch('DLEXT')}", __dir__)
      FileUtils.mkdir_p(File.dirname(destination))
      FileUtils.cp(source, destination)
      puts("Staged Rust extension to #{destination}")
    end

    # First candidate artifact path that exists on disk.
    #
    # @return [String, nil]
    def rust_cdylib_source
      rust_cdylib_candidates.find { |path| File.exist?(path) }
    end

    # Absolute paths at which cargo may have written the shared library.
    #
    # @return [Array<String>]
    def rust_cdylib_candidates
      release_dir = rust_release_dir
      rust_cdylib_filenames.map { |filename| File.join(release_dir, filename) }
    end

    # Directory holding cargo's release artifacts.
    #
    # Defaults to "<this dir>/target/release". CARGO_TARGET_DIR relocates the
    # target directory (a relative value is resolved against this directory,
    # which is where cargo is invoked), and CARGO_BUILD_TARGET inserts a
    # "<triple>" path segment before "release". Empty values are treated as
    # unset.
    #
    # @return [String]
    def rust_release_dir
      target_dir = ENV.fetch("CARGO_TARGET_DIR", "")
      target_dir = "target" if target_dir.empty?
      triple = ENV.fetch("CARGO_BUILD_TARGET", "")

      segments = [File.expand_path(target_dir, __dir__)]
      segments << triple unless triple.empty?
      File.join(*segments, "release")
    end

    # File names cargo may use for the cdylib on the host operating system.
    #
    # @return [Array<String>]
    def rust_cdylib_filenames
      case RbConfig::CONFIG.fetch("host_os")
      when /mswin|mingw|cygwin/
        # On Windows cargo may emit either prefixed or unprefixed DLL names.
        ["lda_ruby_rust.dll", "liblda_ruby_rust.dll"]
      when /darwin/
        ["liblda_ruby_rust.dylib"]
      else
        ["liblda_ruby_rust.so"]
      end
    end

    # Extra environment variables for the cargo invocation.
    #
    # On macOS hosts the dynamic_lookup linker argument is appended to any
    # existing RUSTFLAGS (unless it is already present); on every other host
    # the result is an empty hash.
    #
    # @return [Hash{String => String}]
    def rust_build_env
      return {} unless RbConfig::CONFIG.fetch("host_os").match?(/darwin/)

      dynamic_lookup_flag = "-C link-arg=-Wl,-undefined,dynamic_lookup"
      existing = ENV.fetch("RUSTFLAGS", "")
      return { "RUSTFLAGS" => existing } if existing.include?(dynamic_lookup_flag)

      { "RUSTFLAGS" => [existing, dynamic_lookup_flag].reject(&:empty?).join(" ") }
    end

    # Writes a Makefile next to this file whose targets only print a notice,
    # because the real build work has already been done by this script.
    #
    # @return [Integer] number of bytes written
    def write_noop_makefile
      File.write(
        File.join(__dir__, "Makefile"),
        <<~MAKEFILE
          all:
          \t@echo "Rust extension handled by extconf.rb"

          install:
          \t@echo "Rust extension handled by extconf.rb"

          clean:
          \t@true

          distclean: clean
        MAKEFILE
      )
    end
  end
end
|
|
134
|
+
|
|
135
|
+
# Script entry point: executing this extconf.rb immediately resolves the build
# policy, optionally builds and stages the Rust extension, and writes the
# no-op Makefile.
Lda::RustExtensionBuild.run
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#if !defined(LDA_RUBY_BINDGEN_STRINGS_H)
#define LDA_RUBY_BINDGEN_STRINGS_H

#include <string.h>

/*
 * Stand-in for <strings.h> used while bindgen preprocesses the Ruby headers.
 *
 * RubyInstaller headers may include <strings.h> on Windows, but Clang-based
 * bindgen runs can fail to locate that header in this environment. Each legacy
 * BSD name below is therefore mapped onto a <string.h> / CRT equivalent,
 * unless something has already defined it as a macro.
 *
 * The aliases are only installed for non-MinGW Windows builds
 * (NOTE(review): presumably because MinGW provides its own <strings.h> --
 * confirm).
 */
#if defined(_WIN32) && !defined(__MINGW32__)

/* Memory helpers. */
#  if !defined(bzero)
#    define bzero(ptr, size) memset((ptr), 0, (size))
#  endif
#  if !defined(bcmp)
#    define bcmp(a, b, n) memcmp((a), (b), (n))
#  endif
#  if !defined(bcopy)
     /* bcopy takes (src, dst); memmove takes (dst, src). */
#    define bcopy(src, dst, n) memmove((dst), (src), (n))
#  endif

/* Character search. */
#  if !defined(index)
#    define index strchr
#  endif
#  if !defined(rindex)
#    define rindex strrchr
#  endif

/* Case-insensitive comparison. */
#  if !defined(strcasecmp)
#    define strcasecmp _stricmp
#  endif
#  if !defined(strncasecmp)
#    define strncasecmp _strnicmp
#  endif

#endif /* _WIN32 && !__MINGW32__ */

#endif /* LDA_RUBY_BINDGEN_STRINGS_H */
|