lda-ruby 0.4.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +9 -0
- data/README.md +157 -0
- data/VERSION.yml +5 -0
- data/docs/modernization-handoff.md +190 -0
- data/docs/porting-strategy.md +127 -0
- data/docs/precompiled-platform-policy.md +68 -0
- data/docs/release-runbook.md +157 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +13 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1023 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +31 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/ext/lda-ruby-rust/Cargo.toml +12 -0
- data/ext/lda-ruby-rust/README.md +48 -0
- data/ext/lda-ruby-rust/extconf.rb +123 -0
- data/ext/lda-ruby-rust/src/lib.rs +456 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby/backends/base.rb +129 -0
- data/lib/lda-ruby/backends/native.rb +158 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
- data/lib/lda-ruby/backends/rust.rb +226 -0
- data/lib/lda-ruby/backends.rb +58 -0
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +45 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +40 -0
- data/lib/lda-ruby/document/text_document.rb +39 -0
- data/lib/lda-ruby/lda.so +0 -0
- data/lib/lda-ruby/rust_build_policy.rb +21 -0
- data/lib/lda-ruby/version.rb +5 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/lib/lda-ruby.rb +413 -0
- data/lib/lda_ruby_rust.so +0 -0
- data/license.txt +504 -0
- data/test/backend_compatibility_test.rb +146 -0
- data/test/backends_selection_test.rb +100 -0
- data/test/data/docs.dat +46 -0
- data/test/data/sample.rb +20 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/gemspec_test.rb +27 -0
- data/test/lda_ruby_test.rb +319 -0
- data/test/packaged_gem_smoke_test.rb +33 -0
- data/test/release_scripts_test.rb +54 -0
- data/test/rust_build_policy_test.rb +23 -0
- data/test/simple_pipeline_test.rb +22 -0
- data/test/simple_yaml.rb +17 -0
- data/test/test_helper.rb +10 -0
- metadata +111 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Lda
|
|
4
|
+
module Backends
|
|
5
|
+
# Abstract base class for LDA backends.
#
# Stores the tunable settings and the current corpus, provides settings
# loading helpers, and declares the operations (#em, #beta, #gamma,
# #compute_phi, #model) that concrete backends must implement.
class Base
  attr_reader :corpus

  attr_accessor :max_iter,
                :convergence,
                :em_max_iter,
                :em_convergence,
                :num_topics,
                :init_alpha,
                :est_alpha,
                :verbose

  # @param random_seed [Integer, nil] seed for this backend's own Random
  #   instance; an unseeded Random is created when nil.
  def initialize(random_seed: nil)
    @random = random_seed.nil? ? Random.new : Random.new(random_seed)

    @max_iter = 20
    @convergence = 1e-6
    @em_max_iter = 100
    @em_convergence = 1e-4
    @num_topics = 20
    @init_alpha = 0.3
    @est_alpha = 1
    @verbose = true

    @corpus = nil
  end

  # @return [String] the unqualified, lower-cased class name (e.g. "base").
  def name
    self.class.name.split("::").last.downcase
  end

  # Stores the corpus this backend will operate on.
  #
  # @param corpus [Object] corpus object; kept as-is.
  def corpus=(corpus)
    @corpus = corpus
    true
  end

  # Builds an Lda::DataCorpus from +filename+ and makes it the current corpus.
  #
  # @param filename [String] path handed to Lda::DataCorpus.new.
  def fast_load_corpus_from_file(filename)
    self.corpus = Lda::DataCorpus.new(filename)
  end

  # Reads whitespace-separated "key value" settings from a text file.
  #
  # Blank lines and lines whose first non-blank character is "#" are skipped.
  # Keys are matched case-insensitively; unknown keys are ignored. Values are
  # converted with to_i / to_f, so non-numeric text becomes 0 / 0.0.
  #
  # Each line is stripped before it is split, so indented settings are
  # honoured and a key with no value leaves the current setting untouched
  # instead of resetting it to zero.
  #
  # @param settings_file [String] path of the settings file.
  # @return [true]
  def load_settings(settings_file)
    File.foreach(settings_file) do |raw_line|
      line = raw_line.strip
      next if line.empty? || line.start_with?("#")

      # After stripping, a missing value shows up as nil rather than "".
      key, value = line.split(/\s+/, 2)
      next if value.nil?

      case key.downcase
      when "max_iter", "var_max_iter"
        self.max_iter = value.to_i
      when "convergence", "var_converged"
        self.convergence = value.to_f
      when "em_max_iter"
        self.em_max_iter = value.to_i
      when "em_convergence", "em_converged"
        self.em_convergence = value.to_f
      when "num_topics", "ntopics"
        self.num_topics = value.to_i
      when "init_alpha", "initial_alpha", "alpha"
        self.init_alpha = value.to_f
      when "est_alpha", "estimate_alpha"
        self.est_alpha = value.to_i
      when "verbose"
        # Any non-zero integer enables verbose output.
        self.verbose = value.to_i != 0
      end
    end

    true
  end

  # Assigns all core settings in one call (verbose is not affected).
  #
  # @return [true]
  def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
    self.init_alpha = init_alpha
    self.num_topics = num_topics
    self.max_iter = max_iter
    self.convergence = convergence
    self.em_max_iter = em_max_iter
    self.em_convergence = em_convergence
    self.est_alpha = est_alpha
    true
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def em(_start)
    raise NotImplementedError, "#{self.class} must implement #em"
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def beta
    raise NotImplementedError, "#{self.class} must implement #beta"
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def gamma
    raise NotImplementedError, "#{self.class} must implement #gamma"
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def compute_phi
    raise NotImplementedError, "#{self.class} must implement #compute_phi"
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def model
    raise NotImplementedError, "#{self.class} must implement #model"
  end

  # Optional hook; the base implementation ignores its arguments.
  #
  # @return [nil]
  def topic_document_probability(_phi_matrix, _document_counts)
    nil
  end

  private

  # Rescales +weights+ in place so the entries sum to 1.0.
  #
  # When the total is zero or negative every entry is replaced by the
  # uniform value 1.0 / weights.size.
  #
  # @param weights [Array<Numeric>] mutated in place.
  # @return [Array<Float>] the same array.
  def normalize!(weights)
    total = weights.sum.to_f

    if total <= 0.0
      uniform = 1.0 / weights.size
      weights.map! { uniform }
      return weights
    end

    weights.map! { |w| w / total }
  end

  # Returns a deep copy of +matrix+ via a Marshal round-trip, so nested rows
  # are duplicated as well.
  def clone_matrix(matrix)
    Marshal.load(Marshal.dump(matrix))
  end
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Lda
|
|
4
|
+
module Backends
|
|
5
|
+
# Backend that delegates every operation and every setting to natively
# implemented `__native_*` methods on a host object.
class Native < Base
  # Methods the host must respond to (private ones included) for this
  # backend to be usable.
  REQUIRED_NATIVE_METHODS = %i[
    __native_fast_load_corpus_from_file
    __native_load_settings
    __native_set_config
    __native_em
    __native_beta
    __native_gamma
    __native_compute_phi
    __native_model
    __native_set_corpus
    __native_max_iter __native_set_max_iter
    __native_convergence __native_set_convergence
    __native_em_max_iter __native_set_em_max_iter
    __native_em_convergence __native_set_em_convergence
    __native_init_alpha __native_set_init_alpha
    __native_num_topics __native_set_num_topics
    __native_est_alpha __native_set_est_alpha
    __native_verbose __native_set_verbose
  ].freeze

  # @param host [Object] candidate host object.
  # @return [Boolean] true when the host responds to every required
  #   method, counting private methods too.
  def self.available?(host)
    REQUIRED_NATIVE_METHODS.all? do |required|
      host.respond_to?(required, true)
    end
  end

  # @param host [Object] object providing the `__native_*` methods.
  # @param random_seed [Integer, nil] forwarded to Base#initialize.
  def initialize(host, random_seed: nil)
    super(random_seed: random_seed)
    @host = host
  end

  # @return [String] always "native".
  def name
    "native"
  end

  # Remembers the corpus locally, then hands it to the host.
  def corpus=(corpus)
    @corpus = corpus
    forward_to_host(:__native_set_corpus, corpus)
  end

  def fast_load_corpus_from_file(filename)
    forward_to_host(:__native_fast_load_corpus_from_file, filename)
  end

  def load_settings(settings_file)
    forward_to_host(:__native_load_settings, settings_file)
  end

  def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
    settings = [init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha]
    forward_to_host(:__native_set_config, *settings)
  end

  # --- settings: readers ask the host, writers coerce and then forward ---

  def max_iter
    forward_to_host(:__native_max_iter)
  end

  def max_iter=(value)
    forward_to_host(:__native_set_max_iter, Integer(value))
  end

  def convergence
    forward_to_host(:__native_convergence)
  end

  def convergence=(value)
    forward_to_host(:__native_set_convergence, Float(value))
  end

  def em_max_iter
    forward_to_host(:__native_em_max_iter)
  end

  def em_max_iter=(value)
    forward_to_host(:__native_set_em_max_iter, Integer(value))
  end

  def em_convergence
    forward_to_host(:__native_em_convergence)
  end

  def em_convergence=(value)
    forward_to_host(:__native_set_em_convergence, Float(value))
  end

  def init_alpha
    forward_to_host(:__native_init_alpha)
  end

  def init_alpha=(value)
    forward_to_host(:__native_set_init_alpha, Float(value))
  end

  def num_topics
    forward_to_host(:__native_num_topics)
  end

  def num_topics=(value)
    forward_to_host(:__native_set_num_topics, Integer(value))
  end

  def est_alpha
    forward_to_host(:__native_est_alpha)
  end

  def est_alpha=(value)
    forward_to_host(:__native_set_est_alpha, Integer(value))
  end

  def verbose
    forward_to_host(:__native_verbose)
  end

  # Any truthy value is passed to the host as true, anything else as false.
  def verbose=(value)
    forward_to_host(:__native_set_verbose, value ? true : false)
  end

  # --- model operations ---

  def em(start)
    forward_to_host(:__native_em, start)
  end

  def beta
    forward_to_host(:__native_beta)
  end

  def gamma
    forward_to_host(:__native_gamma)
  end

  def compute_phi
    forward_to_host(:__native_compute_phi)
  end

  def model
    forward_to_host(:__native_model)
  end

  private

  # Invokes +method_name+ on the host with +args+; __send__ is used so
  # private host methods are reachable.
  def forward_to_host(method_name, *args)
    @host.__send__(method_name, *args)
  end
end
|
|
157
|
+
end
|
|
158
|
+
end
|