lda-ruby 0.5.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +69 -0
- data/Gemfile +9 -0
- data/README.md +160 -0
- data/VERSION.yml +5 -0
- data/docs/modernization-handoff.md +233 -0
- data/docs/porting-strategy.md +148 -0
- data/docs/precompiled-platform-policy.md +81 -0
- data/docs/precompiled-target-evaluation.md +67 -0
- data/docs/release-runbook.md +192 -0
- data/docs/rust-orchestration-guardrails.md +50 -0
- data/ext/lda-ruby/cokus.c +144 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +13 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1023 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +31 -0
- data/ext/lda-ruby/lda-x64-mingw-ucrt.def +2 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +119 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/ext/lda-ruby-rust/Cargo.toml +12 -0
- data/ext/lda-ruby-rust/README.md +73 -0
- data/ext/lda-ruby-rust/extconf.rb +135 -0
- data/ext/lda-ruby-rust/include/strings.h +35 -0
- data/ext/lda-ruby-rust/src/lib.rs +1263 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby/backends/base.rb +133 -0
- data/lib/lda-ruby/backends/native.rb +158 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +675 -0
- data/lib/lda-ruby/backends/rust.rb +607 -0
- data/lib/lda-ruby/backends.rb +58 -0
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +45 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +40 -0
- data/lib/lda-ruby/document/text_document.rb +39 -0
- data/lib/lda-ruby/lda.so +0 -0
- data/lib/lda-ruby/rust_build_policy.rb +21 -0
- data/lib/lda-ruby/version.rb +5 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/lib/lda-ruby.rb +413 -0
- data/lib/lda_ruby_rust.so +0 -0
- data/license.txt +504 -0
- data/test/backend_compatibility_test.rb +146 -0
- data/test/backends_selection_test.rb +100 -0
- data/test/benchmark_scripts_test.rb +23 -0
- data/test/data/docs.dat +46 -0
- data/test/data/sample.rb +20 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/gemspec_test.rb +27 -0
- data/test/lda_ruby_test.rb +319 -0
- data/test/packaged_gem_smoke_test.rb +33 -0
- data/test/pure_ruby_orchestration_test.rb +109 -0
- data/test/release_scripts_test.rb +93 -0
- data/test/rust_build_policy_test.rb +23 -0
- data/test/rust_orchestration_test.rb +911 -0
- data/test/simple_pipeline_test.rb +22 -0
- data/test/simple_yaml.rb +17 -0
- data/test/test_helper.rb +10 -0
- metadata +118 -0
data/lda-ruby.gemspec
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/lda-ruby/version"
|
|
4
|
+
|
|
5
|
+
# Packaging flavour, chosen through LDA_RUBY_GEM_VARIANT (defaults to "source").
# "source" lists the extconf.rb files as extensions; "precompiled" lists no
# extensions and instead requires already-built binaries to be present under lib/.
supported_variants = %w[source precompiled].freeze
gem_variant = ENV.fetch("LDA_RUBY_GEM_VARIANT", "source")

unless supported_variants.include?(gem_variant)
  raise ArgumentError, "Unsupported LDA_RUBY_GEM_VARIANT=#{gem_variant.inspect}. Expected one of: #{supported_variants.join(', ')}"
end

ship_binaries = gem_variant == "precompiled"

Gem::Specification.new do |spec|
  spec.name = "lda-ruby"
  spec.version = Lda::VERSION
  spec.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
  spec.email = ["jasonmadams@gmail.com"]

  spec.summary = "Ruby implementation of Latent Dirichlet Allocation (LDA)."
  spec.description = "Ruby wrapper and toolkit for Latent Dirichlet Allocation based on the original lda-c implementation by David M. Blei."
  spec.homepage = "https://github.com/ealdent/lda-ruby"
  spec.license = "GPL-2.0-or-later"
  spec.required_ruby_version = ">= 3.2"

  spec.metadata = {
    "homepage_uri" => spec.homepage,
    "source_code_uri" => spec.homepage,
    "changelog_uri" => "#{spec.homepage}/blob/master/CHANGELOG.md",
    "lda_ruby_gem_variant" => gem_variant
  }

  if ship_binaries
    # A blank or unset LDA_RUBY_GEM_PLATFORM falls back to the platform of the
    # machine that is building the gem.
    requested_platform = ENV.fetch("LDA_RUBY_GEM_PLATFORM", "").strip
    requested_platform = Gem::Platform.local.to_s if requested_platform.empty?

    spec.platform = Gem::Platform.new(requested_platform)
    spec.metadata["lda_ruby_platform"] = spec.platform.to_s
    spec.extensions = []
  else
    spec.extensions = ["ext/lda-ruby/extconf.rb", "ext/lda-ruby-rust/extconf.rb"]
  end

  spec.require_paths = ["lib"]

  # The only compiled artifacts that may be packaged, and only in the
  # precompiled variant.
  staged_binary_patterns = [
    %r{\Alib/lda-ruby/lda\.(so|bundle|dylib|dll)\z},
    %r{\Alib/lda_ruby_rust\.(so|bundle|dylib|dll)\z}
  ]
  build_artifact_suffixes = %w[.o .so .bundle .dylib .dll .rlib .rmeta]
  generated_build_files = ["Makefile", "ext/lda-ruby/Makefile", "ext/lda-ruby/mkmf.log", "ext/lda-ruby-rust/Makefile"]

  candidates = %w[CHANGELOG.md Gemfile README.md VERSION.yml lda-ruby.gemspec license.txt] +
               %w[docs/**/* ext/**/* lib/**/* test/**/*].flat_map { |glob| Dir.glob(glob) }

  # Keep regular files only, and drop Cargo/mkmf build output and compiled
  # artifacts other than the whitelisted precompiled binaries.
  spec.files = candidates.uniq.sort.reject do |path|
    next true if File.directory?(path)
    next true if path.start_with?("ext/lda-ruby-rust/target/") || path == "ext/lda-ruby-rust/Cargo.lock"
    next true if generated_build_files.include?(path)
    next false if ship_binaries && staged_binary_patterns.any? { |pattern| pattern.match?(path) }

    path.end_with?(*build_artifact_suffixes)
  end

  if ship_binaries
    # Refuse to build a precompiled gem when either binary has not been staged.
    unstaged = staged_binary_patterns.select do |pattern|
      spec.files.none? { |path| pattern.match?(path) }
    end

    unless unstaged.empty?
      raise "Precompiled variant requires staged binaries under lib/: #{unstaged.map(&:source).join(', ')}"
    end
  end
end
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Lda
  module Backends
    # Shared state and configuration handling for LDA backends.
    #
    # Holds the tunable settings with their defaults, a Random instance used
    # to derive seeds, and the current corpus. The computational entry points
    # (#em, #beta, #gamma, #compute_phi, #model) raise NotImplementedError
    # here and have to be provided by subclasses.
    class Base
      attr_reader :corpus

      attr_accessor :max_iter,
                    :convergence,
                    :em_max_iter,
                    :em_convergence,
                    :num_topics,
                    :init_alpha,
                    :est_alpha,
                    :verbose

      # @param random_seed [Integer, nil] seed for the internal Random; when
      #   nil, an unseeded Random.new is used.
      def initialize(random_seed: nil)
        @random = random_seed.nil? ? Random.new : Random.new(random_seed)

        @max_iter = 20
        @convergence = 1e-6
        @em_max_iter = 100
        @em_convergence = 1e-4
        @num_topics = 20
        @init_alpha = 0.3
        @est_alpha = 1
        @verbose = true

        @corpus = nil
      end

      # @return [String] last segment of the class name, lowercased.
      def name
        self.class.name.split("::").last.downcase
      end

      # Stores the corpus. The explicit +true+ is only observable when the
      # writer is invoked through +send+/+public_send+; plain assignment
      # syntax always evaluates to the assigned value.
      def corpus=(corpus)
        @corpus = corpus
        true
      end

      # Builds an Lda::DataCorpus from +filename+ and installs it as the corpus.
      def fast_load_corpus_from_file(filename)
        self.corpus = Lda::DataCorpus.new(filename)
      end

      # Reads settings from a text file with one whitespace-separated
      # "key value" pair per line.
      #
      # Blank lines, lines whose first non-blank character is "#", lines
      # without a value and unrecognised keys are ignored. Keys are matched
      # case-insensitively. Values are converted with String#to_i / #to_f,
      # so non-numeric text becomes 0 rather than raising.
      #
      # @param settings_file [String] path of the settings file.
      # @return [true]
      def load_settings(settings_file)
        File.foreach(settings_file) do |line|
          entry = line.strip
          next if entry.empty? || entry.start_with?("#")

          # Split the stripped text, not the raw line: leading indentation
          # would otherwise produce an empty key, and a key followed only by
          # trailing whitespace would produce an empty value that is coerced
          # to 0 and silently overwrites the default.
          key, value = entry.split(/\s+/, 2)
          next if value.nil?

          case key.downcase
          when "max_iter", "var_max_iter"
            self.max_iter = value.to_i
          when "convergence", "var_converged"
            self.convergence = value.to_f
          when "em_max_iter"
            self.em_max_iter = value.to_i
          when "em_convergence", "em_converged"
            self.em_convergence = value.to_f
          when "num_topics", "ntopics"
            self.num_topics = value.to_i
          when "init_alpha", "initial_alpha", "alpha"
            self.init_alpha = value.to_f
          when "est_alpha", "estimate_alpha"
            self.est_alpha = value.to_i
          when "verbose"
            # Any integer other than 0 enables verbose output.
            self.verbose = value.to_i != 0
          end
        end

        true
      end

      # Assigns all seven numeric settings in one call (verbose is untouched).
      #
      # @return [true]
      def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
        self.init_alpha = init_alpha
        self.num_topics = num_topics
        self.max_iter = max_iter
        self.convergence = convergence
        self.em_max_iter = em_max_iter
        self.em_convergence = em_convergence
        self.est_alpha = est_alpha
        true
      end

      # Must be implemented by subclasses.
      #
      # @raise [NotImplementedError] always, in this base class.
      def em(_start)
        raise NotImplementedError, "#{self.class} must implement #em"
      end

      # Must be implemented by subclasses.
      #
      # @raise [NotImplementedError] always, in this base class.
      def beta
        raise NotImplementedError, "#{self.class} must implement #beta"
      end

      # Must be implemented by subclasses.
      #
      # @raise [NotImplementedError] always, in this base class.
      def gamma
        raise NotImplementedError, "#{self.class} must implement #gamma"
      end

      # Must be implemented by subclasses.
      #
      # @raise [NotImplementedError] always, in this base class.
      def compute_phi
        raise NotImplementedError, "#{self.class} must implement #compute_phi"
      end

      # Must be implemented by subclasses.
      #
      # @raise [NotImplementedError] always, in this base class.
      def model
        raise NotImplementedError, "#{self.class} must implement #model"
      end

      # Default implementation: ignores both arguments and returns nil.
      # NOTE(review): presumably an optional hook that backends may override;
      # confirm against the callers.
      def topic_document_probability(_phi_matrix, _document_counts)
        nil
      end

      private

      # Draws an integer in 0..2**63-1 from the internal Random.
      def next_random_seed
        @random.rand(0..9_223_372_036_854_775_807)
      end

      # Rescales +weights+ in place so the entries sum to 1 and returns the
      # same array. When the sum is not positive, every entry is replaced by
      # 1.0 / weights.size.
      def normalize!(weights)
        total = weights.sum.to_f

        if total <= 0.0
          uniform = 1.0 / weights.size
          weights.map! { uniform }
          return weights
        end

        weights.map! { |w| w / total }
      end

      # Deep copy through a Marshal round trip, so nested rows are not shared
      # with the original.
      def clone_matrix(matrix)
        Marshal.load(Marshal.dump(matrix))
      end
    end
  end
end
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Lda
  module Backends
    # Backend that forwards every operation to `__native_*` methods on a
    # host object. The hooks are invoked with `__send__`, so they may be
    # private on the host.
    class Native < Base
      # Hooks the host has to respond to for this backend to be usable.
      REQUIRED_NATIVE_METHODS = %i[
        __native_fast_load_corpus_from_file
        __native_load_settings
        __native_set_config
        __native_em
        __native_beta
        __native_gamma
        __native_compute_phi
        __native_model
        __native_set_corpus
        __native_max_iter
        __native_set_max_iter
        __native_convergence
        __native_set_convergence
        __native_em_max_iter
        __native_set_em_max_iter
        __native_em_convergence
        __native_set_em_convergence
        __native_init_alpha
        __native_set_init_alpha
        __native_num_topics
        __native_set_num_topics
        __native_est_alpha
        __native_set_est_alpha
        __native_verbose
        __native_set_verbose
      ].freeze

      # @return [Boolean] whether +host+ responds to every required hook,
      #   private methods included.
      def self.available?(host)
        REQUIRED_NATIVE_METHODS.all? { |hook| host.respond_to?(hook, true) }
      end

      # @param host [Object] object exposing the `__native_*` hooks.
      # @param random_seed [Integer, nil] passed through to Base#initialize.
      def initialize(host, random_seed: nil)
        super(random_seed: random_seed)
        @host = host
      end

      def name
        "native"
      end

      # Remembers the corpus locally, then hands it to the host.
      def corpus=(corpus)
        @corpus = corpus
        call_native(:__native_set_corpus, corpus)
      end

      def fast_load_corpus_from_file(filename)
        call_native(:__native_fast_load_corpus_from_file, filename)
      end

      def load_settings(settings_file)
        call_native(:__native_load_settings, settings_file)
      end

      def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
        call_native(
          :__native_set_config,
          init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha
        )
      end

      # -- Settings -------------------------------------------------------
      # Readers return whatever the host reports. Writers coerce the value
      # with Integer(), Float() or double negation before forwarding it, so
      # input that cannot be converted raises before the host is called.

      def max_iter
        call_native(:__native_max_iter)
      end

      def max_iter=(value)
        call_native(:__native_set_max_iter, Integer(value))
      end

      def convergence
        call_native(:__native_convergence)
      end

      def convergence=(value)
        call_native(:__native_set_convergence, Float(value))
      end

      def em_max_iter
        call_native(:__native_em_max_iter)
      end

      def em_max_iter=(value)
        call_native(:__native_set_em_max_iter, Integer(value))
      end

      def em_convergence
        call_native(:__native_em_convergence)
      end

      def em_convergence=(value)
        call_native(:__native_set_em_convergence, Float(value))
      end

      def init_alpha
        call_native(:__native_init_alpha)
      end

      def init_alpha=(value)
        call_native(:__native_set_init_alpha, Float(value))
      end

      def num_topics
        call_native(:__native_num_topics)
      end

      def num_topics=(value)
        call_native(:__native_set_num_topics, Integer(value))
      end

      def est_alpha
        call_native(:__native_est_alpha)
      end

      def est_alpha=(value)
        call_native(:__native_set_est_alpha, Integer(value))
      end

      def verbose
        call_native(:__native_verbose)
      end

      def verbose=(value)
        call_native(:__native_set_verbose, !!value)
      end

      # -- Computation ----------------------------------------------------

      def em(start)
        call_native(:__native_em, start)
      end

      def beta
        call_native(:__native_beta)
      end

      def gamma
        call_native(:__native_gamma)
      end

      def compute_phi
        call_native(:__native_compute_phi)
      end

      def model
        call_native(:__native_model)
      end

      private

      # Single forwarding point: invokes +hook+ on the host with the given
      # arguments and returns the host's result unchanged.
      def call_native(hook, *arguments)
        @host.__send__(hook, *arguments)
      end
    end
  end
end
|