lda-ruby 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +4 -1
- data/VERSION.yml +1 -1
- data/docs/modernization-handoff.md +68 -25
- data/docs/porting-strategy.md +23 -2
- data/docs/precompiled-platform-policy.md +15 -2
- data/docs/precompiled-target-evaluation.md +67 -0
- data/docs/release-runbook.md +41 -6
- data/docs/rust-orchestration-guardrails.md +50 -0
- data/ext/lda-ruby/cokus.c +10 -11
- data/ext/lda-ruby/cokus.h +3 -3
- data/ext/lda-ruby/lda-inference.c +2 -2
- data/ext/lda-ruby/utils.c +8 -0
- data/ext/lda-ruby-rust/README.md +25 -0
- data/ext/lda-ruby-rust/extconf.rb +25 -13
- data/ext/lda-ruby-rust/include/strings.h +35 -0
- data/ext/lda-ruby-rust/src/lib.rs +816 -9
- data/lib/lda-ruby/backends/base.rb +4 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +110 -48
- data/lib/lda-ruby/backends/rust.rb +384 -3
- data/lib/lda-ruby/version.rb +1 -1
- data/test/benchmark_scripts_test.rb +23 -0
- data/test/pure_ruby_orchestration_test.rb +109 -0
- data/test/release_scripts_test.rb +39 -0
- data/test/rust_orchestration_test.rb +911 -0
- metadata +8 -2
data/ext/lda-ruby/cokus.h
CHANGED
|
@@ -12,9 +12,9 @@
|
|
|
12
12
|
|
|
13
13
|
typedef unsigned long uint32;
|
|
14
14
|
|
|
15
|
-
#define N (624) // length of state vector
|
|
16
|
-
#define M (397) // a period parameter
|
|
17
|
-
#define K (0x9908B0DFU) // a magic constant
|
|
15
|
+
#define COKUS_N (624) // length of state vector
|
|
16
|
+
#define COKUS_M (397) // a period parameter
|
|
17
|
+
#define COKUS_K (0x9908B0DFU) // a magic constant
|
|
18
18
|
#define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u
|
|
19
19
|
#define loBit(u) ((u) & 0x00000001U) // mask all but lowest bit of u
|
|
20
20
|
#define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u
|
|
data/ext/lda-ruby/lda-inference.c
CHANGED
|
@@ -435,9 +435,9 @@ void infer(char* model_root, char* save, corpus* corpus) {
|
|
|
435
435
|
int main(int argc, char* argv[]) {
|
|
436
436
|
corpus* corpus;
|
|
437
437
|
|
|
438
|
-
|
|
438
|
+
time_t t1;
|
|
439
439
|
(void) time(&t1);
|
|
440
|
-
seedMT(t1);
|
|
440
|
+
seedMT((uint32) t1);
|
|
441
441
|
// seedMT(4357U);
|
|
442
442
|
|
|
443
443
|
if (argc > 1)
|
data/ext/lda-ruby/utils.c
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
#include "utils.h"
|
|
2
2
|
|
|
3
|
+
#ifdef _WIN32
|
|
4
|
+
#include <direct.h>
|
|
5
|
+
#endif
|
|
6
|
+
|
|
3
7
|
/*
|
|
4
8
|
* given log(a) and log(b), return log(a + b)
|
|
5
9
|
*
|
|
@@ -85,7 +89,11 @@ double log_gamma(double x)
|
|
|
85
89
|
|
|
86
90
|
void make_directory(char* name)
|
|
87
91
|
{
|
|
92
|
+
#ifdef _WIN32
|
|
93
|
+
_mkdir(name);
|
|
94
|
+
#else
|
|
88
95
|
mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR);
|
|
96
|
+
#endif
|
|
89
97
|
}
|
|
90
98
|
|
|
91
99
|
|
data/ext/lda-ruby-rust/README.md
CHANGED
|
@@ -8,6 +8,8 @@ Current scope:
|
|
|
8
8
|
- Exposes capability hooks:
|
|
9
9
|
- `Lda::RustBackend.available?`
|
|
10
10
|
- `Lda::RustBackend.abi_version`
|
|
11
|
+
- `Lda::RustBackend.corpus_session_count`
|
|
12
|
+
- `Lda::RustBackend.corpus_session_exists(session_id)`
|
|
11
13
|
- `Lda::RustBackend.before_em(start, num_docs, num_terms)`
|
|
12
14
|
- `Lda::RustBackend.topic_weights_for_word(beta, gamma, word_index, min_probability)`
|
|
13
15
|
- `Lda::RustBackend.accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)`
|
|
@@ -17,6 +19,18 @@ Current scope:
|
|
|
17
19
|
- `Lda::RustBackend.average_gamma_shift(previous_gamma, current_gamma)`
|
|
18
20
|
- `Lda::RustBackend.topic_document_probability(phi_tensor, document_counts, num_topics, min_probability)`
|
|
19
21
|
- `Lda::RustBackend.seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)`
|
|
22
|
+
- `Lda::RustBackend.random_topic_term_probabilities(topics, terms, min_probability, random_seed)`
|
|
23
|
+
- `Lda::RustBackend.create_corpus_session(document_words, document_counts, terms)`
|
|
24
|
+
- `Lda::RustBackend.replace_corpus_session(session_id, document_words, document_counts, terms)`
|
|
25
|
+
- `Lda::RustBackend.drop_corpus_session(session_id)`
|
|
26
|
+
- `Lda::RustBackend.configure_corpus_session(session_id, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
|
|
27
|
+
- `Lda::RustBackend.run_em(initial_beta, document_words, document_counts, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
|
|
28
|
+
- `Lda::RustBackend.run_em_with_start(start, document_words, document_counts, topics, terms, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
|
|
29
|
+
- `Lda::RustBackend.run_em_with_start_seed(start, document_words, document_counts, topics, terms, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
30
|
+
- `Lda::RustBackend.run_em_on_session(session_id, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
31
|
+
- `Lda::RustBackend.run_em_on_session_start(session_id, start, random_seed)`
|
|
32
|
+
- `Lda::RustBackend.run_em_on_session_with_start_seed(session_id, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
33
|
+
- `Lda::RustBackend.run_em_on_session_with_corpus(session_id, document_words, document_counts, terms, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
|
|
20
34
|
|
|
21
35
|
Hot-path kernels currently executed in Rust when `backend: :rust` is active:
|
|
22
36
|
- topic weights for a word across topics
|
|
@@ -27,6 +41,17 @@ Hot-path kernels currently executed in Rust when `backend: :rust` is active:
|
|
|
27
41
|
- gamma convergence shift reduction between EM iterations
|
|
28
42
|
- topic-document average log-probability computation
|
|
29
43
|
- seeded topic-term initialization
|
|
44
|
+
- random topic-term initialization with explicit seed control
|
|
45
|
+
- EM outer-loop orchestration with convergence checks (`run_em`)
|
|
46
|
+
- start-aware deterministic EM orchestration (`run_em_with_start` for `seeded`/`deterministic`)
|
|
47
|
+
- start-aware seeded and random EM orchestration with explicit seed control (`run_em_with_start_seed`)
|
|
48
|
+
- unified session-settings orchestration (`run_em_on_session`) that applies settings and executes EM in one call
|
|
49
|
+
- session-based EM orchestration against Rust-managed corpus lifecycle (`create_corpus_session` + `run_em_on_session_with_start_seed`)
|
|
50
|
+
- settings-aware session orchestration (`configure_corpus_session` + `run_em_on_session_start`)
|
|
51
|
+
- managed corpus orchestration (`run_em_on_session_with_corpus`) that can recreate missing sessions and, if session-backed execution cannot be used, falls back internally to direct start-aware execution inside Rust
|
|
52
|
+
- `Lda::Backends::Rust` prefers `run_em_on_session_with_corpus` whenever a cached Rust corpus snapshot is available, even if no session id is currently cached locally
|
|
53
|
+
- direct and legacy beta-input compatibility fallbacks both reuse the backend's cached Rust corpus snapshot instead of rebuilding corpus arrays in Ruby
|
|
54
|
+
- unknown EM start modes in seed-aware orchestration follow Ruby's non-seeded fallback behavior (seeded by explicit `random_seed`)
|
|
30
55
|
|
|
31
56
|
Remaining numeric LDA kernels are still provided by the pure Ruby backend and will move incrementally.
|
|
32
57
|
|
|
data/ext/lda-ruby-rust/extconf.rb
CHANGED
|
@@ -61,27 +61,39 @@ module Lda
|
|
|
61
61
|
success or raise "cargo build --release failed"
|
|
62
62
|
end
|
|
63
63
|
|
|
64
|
-
source =
|
|
65
|
-
raise "Rust extension artifact not found at #{
|
|
64
|
+
source = rust_cdylib_source
|
|
65
|
+
raise "Rust extension artifact not found at #{rust_cdylib_candidates.join(', ')}" unless source
|
|
66
66
|
|
|
67
67
|
destination = File.expand_path("../../lib/lda_ruby_rust.#{RbConfig::CONFIG.fetch('DLEXT')}", __dir__)
|
|
68
68
|
FileUtils.cp(source, destination)
|
|
69
69
|
puts("Staged Rust extension to #{destination}")
|
|
70
70
|
end
|
|
71
71
|
|
|
72
|
-
def
|
|
72
|
+
def rust_cdylib_source
|
|
73
|
+
rust_cdylib_candidates.find { |path| File.exist?(path) }
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
def rust_cdylib_candidates
|
|
77
|
+
rust_cdylib_filenames.map { |filename| File.join(__dir__, "target", "release", filename) }
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def rust_cdylib_filenames
|
|
73
81
|
host_os = RbConfig::CONFIG.fetch("host_os")
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
82
|
+
case host_os
|
|
83
|
+
when /mswin|mingw|cygwin/
|
|
84
|
+
# On Windows cargo may emit either prefixed or unprefixed DLL names.
|
|
85
|
+
["lda_ruby_rust.dll", "liblda_ruby_rust.dll"]
|
|
86
|
+
else
|
|
87
|
+
extension =
|
|
88
|
+
case host_os
|
|
89
|
+
when /darwin/
|
|
90
|
+
"dylib"
|
|
91
|
+
else
|
|
92
|
+
"so"
|
|
93
|
+
end
|
|
83
94
|
|
|
84
|
-
|
|
95
|
+
["liblda_ruby_rust.#{extension}"]
|
|
96
|
+
end
|
|
85
97
|
end
|
|
86
98
|
|
|
87
99
|
def rust_build_env
|
|
data/ext/lda-ruby-rust/include/strings.h
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#ifndef LDA_RUBY_BINDGEN_STRINGS_H
|
|
2
|
+
#define LDA_RUBY_BINDGEN_STRINGS_H
|
|
3
|
+
|
|
4
|
+
#include <string.h>
|
|
5
|
+
|
|
6
|
+
/*
|
|
7
|
+
* RubyInstaller headers may include <strings.h> on Windows, but Clang-based
|
|
8
|
+
* bindgen runs can miss that header in this environment. Provide compatibility
|
|
9
|
+
* aliases for bindgen preprocessing.
|
|
10
|
+
*/
|
|
11
|
+
#if defined(_WIN32) && !defined(__MINGW32__)
|
|
12
|
+
#ifndef bzero
|
|
13
|
+
#define bzero(ptr, size) memset((ptr), 0, (size))
|
|
14
|
+
#endif
|
|
15
|
+
#ifndef bcmp
|
|
16
|
+
#define bcmp(a, b, n) memcmp((a), (b), (n))
|
|
17
|
+
#endif
|
|
18
|
+
#ifndef bcopy
|
|
19
|
+
#define bcopy(src, dst, n) memmove((dst), (src), (n))
|
|
20
|
+
#endif
|
|
21
|
+
#ifndef index
|
|
22
|
+
#define index strchr
|
|
23
|
+
#endif
|
|
24
|
+
#ifndef rindex
|
|
25
|
+
#define rindex strrchr
|
|
26
|
+
#endif
|
|
27
|
+
#ifndef strcasecmp
|
|
28
|
+
#define strcasecmp _stricmp
|
|
29
|
+
#endif
|
|
30
|
+
#ifndef strncasecmp
|
|
31
|
+
#define strncasecmp _strnicmp
|
|
32
|
+
#endif
|
|
33
|
+
#endif
|
|
34
|
+
|
|
35
|
+
#endif
|