lda-ruby 0.4.0 → 0.5.0

This diff shows the changes between two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
data/ext/lda-ruby/cokus.h CHANGED
@@ -12,9 +12,9 @@
12
12
 
13
13
  typedef unsigned long uint32;
14
14
 
15
- #define N (624) // length of state vector
16
- #define M (397) // a period parameter
17
- #define K (0x9908B0DFU) // a magic constant
15
+ #define COKUS_N (624) // length of state vector
16
+ #define COKUS_M (397) // a period parameter
17
+ #define COKUS_K (0x9908B0DFU) // a magic constant
18
18
  #define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u
19
19
  #define loBit(u) ((u) & 0x00000001U) // mask all but lowest bit of u
20
20
  #define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u
@@ -435,9 +435,9 @@ void infer(char* model_root, char* save, corpus* corpus) {
435
435
  int main(int argc, char* argv[]) {
436
436
  corpus* corpus;
437
437
 
438
- long t1;
438
+ time_t t1;
439
439
  (void) time(&t1);
440
- seedMT(t1);
440
+ seedMT((uint32) t1);
441
441
  // seedMT(4357U);
442
442
 
443
443
  if (argc > 1)
data/ext/lda-ruby/utils.c CHANGED
@@ -1,5 +1,9 @@
1
1
  #include "utils.h"
2
2
 
3
+ #ifdef _WIN32
4
+ #include <direct.h>
5
+ #endif
6
+
3
7
  /*
4
8
  * given log(a) and log(b), return log(a + b)
5
9
  *
@@ -85,7 +89,11 @@ double log_gamma(double x)
85
89
 
86
90
  void make_directory(char* name)
87
91
  {
92
+ #ifdef _WIN32
93
+ _mkdir(name);
94
+ #else
88
95
  mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR);
96
+ #endif
89
97
  }
90
98
 
91
99
 
@@ -8,6 +8,8 @@ Current scope:
8
8
  - Exposes capability hooks:
9
9
  - `Lda::RustBackend.available?`
10
10
  - `Lda::RustBackend.abi_version`
11
+ - `Lda::RustBackend.corpus_session_count`
12
+ - `Lda::RustBackend.corpus_session_exists(session_id)`
11
13
  - `Lda::RustBackend.before_em(start, num_docs, num_terms)`
12
14
  - `Lda::RustBackend.topic_weights_for_word(beta, gamma, word_index, min_probability)`
13
15
  - `Lda::RustBackend.accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)`
@@ -17,6 +19,18 @@ Current scope:
17
19
  - `Lda::RustBackend.average_gamma_shift(previous_gamma, current_gamma)`
18
20
  - `Lda::RustBackend.topic_document_probability(phi_tensor, document_counts, num_topics, min_probability)`
19
21
  - `Lda::RustBackend.seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)`
22
+ - `Lda::RustBackend.random_topic_term_probabilities(topics, terms, min_probability, random_seed)`
23
+ - `Lda::RustBackend.create_corpus_session(document_words, document_counts, terms)`
24
+ - `Lda::RustBackend.replace_corpus_session(session_id, document_words, document_counts, terms)`
25
+ - `Lda::RustBackend.drop_corpus_session(session_id)`
26
+ - `Lda::RustBackend.configure_corpus_session(session_id, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
27
+ - `Lda::RustBackend.run_em(initial_beta, document_words, document_counts, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
28
+ - `Lda::RustBackend.run_em_with_start(start, document_words, document_counts, topics, terms, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability)`
29
+ - `Lda::RustBackend.run_em_with_start_seed(start, document_words, document_counts, topics, terms, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
30
+ - `Lda::RustBackend.run_em_on_session(session_id, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
31
+ - `Lda::RustBackend.run_em_on_session_start(session_id, start, random_seed)`
32
+ - `Lda::RustBackend.run_em_on_session_with_start_seed(session_id, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
33
+ - `Lda::RustBackend.run_em_on_session_with_corpus(session_id, document_words, document_counts, terms, start, topics, max_iter, convergence, em_max_iter, em_convergence, init_alpha, min_probability, random_seed)`
20
34
 
21
35
  Hot-path kernels currently executed in Rust when `backend: :rust` is active:
22
36
  - topic weights for a word across topics
@@ -27,6 +41,17 @@ Hot-path kernels currently executed in Rust when `backend: :rust` is active:
27
41
  - gamma convergence shift reduction between EM iterations
28
42
  - topic-document average log-probability computation
29
43
  - seeded topic-term initialization
44
+ - random topic-term initialization with explicit seed control
45
+ - EM outer-loop orchestration with convergence checks (`run_em`)
46
+ - start-aware deterministic EM orchestration (`run_em_with_start` for `seeded`/`deterministic`)
47
+ - start-aware seeded and random EM orchestration with explicit seed control (`run_em_with_start_seed`)
48
+ - unified session-settings orchestration (`run_em_on_session`) that applies settings and executes EM in one call
49
+ - session-based EM orchestration against Rust-managed corpus lifecycle (`create_corpus_session` + `run_em_on_session_with_start_seed`)
50
+ - settings-aware session orchestration (`configure_corpus_session` + `run_em_on_session_start`)
51
+ - managed corpus orchestration (`run_em_on_session_with_corpus`) that can recreate missing sessions and, if session-backed execution cannot be used, falls back internally to direct start-aware execution inside Rust
52
+ - `Lda::Backends::Rust` prefers `run_em_on_session_with_corpus` whenever a cached Rust corpus snapshot is available, even if no session id is currently cached locally
53
+ - direct and legacy beta-input compatibility fallbacks both reuse the backend's cached Rust corpus snapshot instead of rebuilding corpus arrays in Ruby
54
+ - unknown EM start modes in seed-aware orchestration follow Ruby's non-seeded fallback behavior (seeded by explicit `random_seed`)
30
55
 
31
56
  Remaining numeric LDA kernels are still provided by the pure Ruby backend and will move incrementally.
32
57
 
@@ -61,27 +61,39 @@ module Lda
61
61
  success or raise "cargo build --release failed"
62
62
  end
63
63
 
64
- source = File.join(__dir__, "target", "release", rust_cdylib_filename)
65
- raise "Rust extension artifact not found at #{source}" unless File.exist?(source)
64
+ source = rust_cdylib_source
65
+ raise "Rust extension artifact not found at #{rust_cdylib_candidates.join(', ')}" unless source
66
66
 
67
67
  destination = File.expand_path("../../lib/lda_ruby_rust.#{RbConfig::CONFIG.fetch('DLEXT')}", __dir__)
68
68
  FileUtils.cp(source, destination)
69
69
  puts("Staged Rust extension to #{destination}")
70
70
  end
71
71
 
72
- def rust_cdylib_filename
72
+ def rust_cdylib_source
73
+ rust_cdylib_candidates.find { |path| File.exist?(path) }
74
+ end
75
+
76
+ def rust_cdylib_candidates
77
+ rust_cdylib_filenames.map { |filename| File.join(__dir__, "target", "release", filename) }
78
+ end
79
+
80
+ def rust_cdylib_filenames
73
81
  host_os = RbConfig::CONFIG.fetch("host_os")
74
- extension =
75
- case host_os
76
- when /darwin/
77
- "dylib"
78
- when /mswin|mingw|cygwin/
79
- "dll"
80
- else
81
- "so"
82
- end
82
+ case host_os
83
+ when /mswin|mingw|cygwin/
84
+ # On Windows cargo may emit either prefixed or unprefixed DLL names.
85
+ ["lda_ruby_rust.dll", "liblda_ruby_rust.dll"]
86
+ else
87
+ extension =
88
+ case host_os
89
+ when /darwin/
90
+ "dylib"
91
+ else
92
+ "so"
93
+ end
83
94
 
84
- "liblda_ruby_rust.#{extension}"
95
+ ["liblda_ruby_rust.#{extension}"]
96
+ end
85
97
  end
86
98
 
87
99
  def rust_build_env
@@ -0,0 +1,35 @@
1
+ #ifndef LDA_RUBY_BINDGEN_STRINGS_H
2
+ #define LDA_RUBY_BINDGEN_STRINGS_H
3
+
4
+ #include <string.h>
5
+
6
+ /*
7
+ * RubyInstaller headers may include <strings.h> on Windows, but Clang-based
8
+ * bindgen runs can miss that header in this environment. Provide compatibility
9
+ * aliases for bindgen preprocessing.
10
+ */
11
+ #if defined(_WIN32) && !defined(__MINGW32__)
12
+ #ifndef bzero
13
+ #define bzero(ptr, size) memset((ptr), 0, (size))
14
+ #endif
15
+ #ifndef bcmp
16
+ #define bcmp(a, b, n) memcmp((a), (b), (n))
17
+ #endif
18
+ #ifndef bcopy
19
+ #define bcopy(src, dst, n) memmove((dst), (src), (n))
20
+ #endif
21
+ #ifndef index
22
+ #define index strchr
23
+ #endif
24
+ #ifndef rindex
25
+ #define rindex strrchr
26
+ #endif
27
+ #ifndef strcasecmp
28
+ #define strcasecmp _stricmp
29
+ #endif
30
+ #ifndef strncasecmp
31
+ #define strncasecmp _strnicmp
32
+ #endif
33
+ #endif
34
+
35
+ #endif