lda-ruby 0.4.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +9 -0
- data/README.md +157 -0
- data/VERSION.yml +5 -0
- data/docs/modernization-handoff.md +190 -0
- data/docs/porting-strategy.md +127 -0
- data/docs/precompiled-platform-policy.md +68 -0
- data/docs/release-runbook.md +157 -0
- data/ext/lda-ruby/cokus.c +145 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +13 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1023 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +31 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +111 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/ext/lda-ruby-rust/Cargo.toml +12 -0
- data/ext/lda-ruby-rust/README.md +48 -0
- data/ext/lda-ruby-rust/extconf.rb +123 -0
- data/ext/lda-ruby-rust/src/lib.rs +456 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby/backends/base.rb +129 -0
- data/lib/lda-ruby/backends/native.rb +158 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
- data/lib/lda-ruby/backends/rust.rb +226 -0
- data/lib/lda-ruby/backends.rb +58 -0
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +45 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +40 -0
- data/lib/lda-ruby/document/text_document.rb +39 -0
- data/lib/lda-ruby/lda.so +0 -0
- data/lib/lda-ruby/rust_build_policy.rb +21 -0
- data/lib/lda-ruby/version.rb +5 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/lib/lda-ruby.rb +413 -0
- data/lib/lda_ruby_rust.so +0 -0
- data/license.txt +504 -0
- data/test/backend_compatibility_test.rb +146 -0
- data/test/backends_selection_test.rb +100 -0
- data/test/data/docs.dat +46 -0
- data/test/data/sample.rb +20 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/gemspec_test.rb +27 -0
- data/test/lda_ruby_test.rb +319 -0
- data/test/packaged_gem_smoke_test.rb +33 -0
- data/test/release_scripts_test.rb +54 -0
- data/test/rust_build_policy_test.rb +23 -0
- data/test/simple_pipeline_test.rb +22 -0
- data/test/simple_yaml.rb +17 -0
- data/test/test_helper.rb +10 -0
- metadata +111 -0
|
require 'yaml'

module Lda
  # Base representation of a document in a corpus: a bag of vocabulary
  # word ids (+words+) with parallel occurrence counts (+counts+).
  # Subclasses decide how the raw tokens are produced.
  class Document
    attr_reader :corpus, :words, :counts, :length, :total, :tokens

    # Create an empty document attached to +corpus+.
    def initialize(corpus)
      @corpus = corpus

      @words  = []
      @counts = []
      @tokens = []
      @total  = 0
      @length = 0
    end

    #
    # Recompute the cached +total+ (sum of counts) and +length+
    # (number of distinct words) values.
    #
    def recompute
      @length = @words.size
      @total  = @counts.inject(0) { |acc, count| acc + count }
    end

    # Base documents are not text-backed; TextDocument overrides this.
    def text?
      false
    end

    # Hook for subclasses to post-process the token list; the default
    # implementation returns the tokens unchanged.
    def handle(tokens)
      tokens
    end

    # Lower-case +text+, reduce it to letters (including German umlauts),
    # apostrophes and hyphens separated by single spaces, and store the
    # resulting token list. Returns nil.
    def tokenize(text)
      normalized = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase
      @tokens = handle(normalized.split(' '))
      nil
    end
  end
end
module Lda
  # A document built from raw text: the text is tokenized, corpus
  # stopwords are removed, and each remaining token is registered with
  # the corpus vocabulary.
  class TextDocument < Document
    attr_reader :filename

    def initialize(corpus, text)
      super(corpus)
      @filename = nil

      tokenize(text)
      @corpus.stopwords.each { |stopword| @tokens.delete(stopword) }
      build_from_tokens
    end

    # Text-backed documents report true.
    def text?
      true
    end

    # Construct a TextDocument from the contents of +filename+ and
    # remember the (frozen copy of the) path it was loaded from.
    def self.build_from_file(corpus, filename)
      contents = File.read(filename)
      doc = new(corpus, contents)
      doc.instance_variable_set(:@filename, filename.dup.freeze)
      doc
    end

    protected

    # Tally the token list, then translate each distinct token into a
    # zero-based vocabulary word id (check_word ids are 1-based) with
    # its occurrence count, and refresh the cached totals.
    def build_from_tokens
      tally = Hash.new(0)
      @tokens.each { |token| tally[token] += 1 }

      tally.each_pair do |word, count|
        @words  << @corpus.vocabulary.check_word(word) - 1
        @counts << count
      end

      recompute
    end
  end
end
data/lib/lda-ruby/lda.so
ADDED
|
Binary file
|
|
# frozen_string_literal: true

module Lda
  # Decides whether the Rust extension should be built, driven by the
  # LDA_RUBY_RUST_BUILD environment variable.
  module RustBuildPolicy
    ENV_KEY = "LDA_RUBY_RUST_BUILD"
    AUTO = "auto"
    ALWAYS = "always"
    NEVER = "never"
    VALID_VALUES = [AUTO, ALWAYS, NEVER].freeze

    module_function

    # Normalize +raw_value+ (whitespace-trimmed, case-insensitive) and
    # return one of VALID_VALUES. Anything unrecognized — including nil
    # or a blank string — falls back to AUTO.
    def resolve(raw_value = ENV[ENV_KEY])
      normalized = raw_value.to_s.strip.downcase
      VALID_VALUES.include?(normalized) ? normalized : AUTO
    end
  end
end
|
module Lda
  # Bidirectional word <-> index mapping with 1-based indices. The
  # running index counter is stored inside @words itself under the
  # reserved :MAX_VALUE key.
  class Vocabulary
    attr_reader :words, :indexes

    def initialize(words = nil)
      # Unseen words get the next index on first access; the counter
      # lives in the hash under :MAX_VALUE (checked with member? so the
      # default proc is not re-entered).
      @words = Hash.new do |hash, key|
        hash[:MAX_VALUE] = hash.member?(:MAX_VALUE) ? hash[:MAX_VALUE] + 1 : 1
        hash[key] = hash[:MAX_VALUE]
      end

      words.each { |w| @words[w] } if words
      @indexes = Hash.new

      @words.each_pair { |word, index| @indexes[index] = word }
    end

    # Return the index for +word+, assigning a fresh one if it is
    # unseen, and record the reverse mapping.
    def check_word(word)
      index = @words[word.dup]
      @indexes[index] = word.dup
      index
    end

    # Register every newline-separated word found in +filename+.
    def load_file(filename)
      File.read(filename).split(/[\n\r]+/).each { |word| check_word(word) }
    end

    # Register every word listed in the YAML file +filename+.
    def load_yaml(filename)
      YAML.load_file(filename).each { |word| check_word(word) }
    end

    # Number of real words (the :MAX_VALUE counter entry is excluded).
    def num_words
      @words.empty? ? 0 : @words.size - 1
    end

    # All words ordered by index, without the counter entry.
    def to_a
      @words.sort_by { |_word, index| index }.map(&:first).reject { |w| w == :MAX_VALUE }
    end
  end
end
data/lib/lda-ruby.rb
ADDED
|
# frozen_string_literal: true

require "lda-ruby/version"
require "rbconfig"

# Shared-library extension used by this Ruby build (e.g. "so", "bundle").
rust_dlext = RbConfig::CONFIG.fetch("DLEXT")

# Candidate locations for the Rust extension: the packaged library first,
# then in-tree cargo build output (release before debug). The first
# candidate that loads wins.
rust_candidates = [
  "lda_ruby_rust",
  "../ext/lda-ruby-rust/target/release/lda_ruby_rust",
  "../ext/lda-ruby-rust/target/release/lda_ruby_rust.#{rust_dlext}",
  "../ext/lda-ruby-rust/target/debug/lda_ruby_rust",
  "../ext/lda-ruby-rust/target/debug/lda_ruby_rust.#{rust_dlext}"
]

rust_extension_loaded = rust_candidates.any? do |candidate|
  begin
    candidate.start_with?("../") ? require_relative(candidate) : require(candidate)
    true
  rescue LoadError
    false
  end
end

# The C extension: prefer the installed location, fall back to an
# in-tree build, and tolerate its absence entirely.
native_extension_loaded =
  begin
    require "lda-ruby/lda"
    true
  rescue LoadError
    begin
      require_relative "../ext/lda-ruby/lda"
      true
    rescue LoadError
      false
    end
  end

# Record the load outcomes once; guarded so a re-require of this file
# does not warn about constant redefinition.
LDA_RUBY_NATIVE_EXTENSION_LOADED = native_extension_loaded unless defined?(LDA_RUBY_NATIVE_EXTENSION_LOADED)
LDA_RUBY_RUST_EXTENSION_LOADED = rust_extension_loaded unless defined?(LDA_RUBY_RUST_EXTENSION_LOADED)

require "lda-ruby/document/document"
require "lda-ruby/document/data_document"
require "lda-ruby/document/text_document"
require "lda-ruby/corpus/corpus"
require "lda-ruby/corpus/data_corpus"
require "lda-ruby/corpus/text_corpus"
require "lda-ruby/corpus/directory_corpus"
require "lda-ruby/vocabulary"
require "lda-ruby/backends"
module Lda
  # Mirror the top-level extension-load flags into the Lda namespace
  # (guarded so a re-require does not redefine the constants).
  RUST_EXTENSION_LOADED = LDA_RUBY_RUST_EXTENSION_LOADED unless const_defined?(:RUST_EXTENSION_LOADED)
  NATIVE_EXTENSION_LOADED = LDA_RUBY_NATIVE_EXTENSION_LOADED unless const_defined?(:NATIVE_EXTENSION_LOADED)

  # Facade over the pluggable inference backends. Holds the corpus plus
  # all estimation settings and delegates the actual work to the backend
  # instance produced by Backends.build.
  class Lda
    # Public method names that a loaded native extension may have defined
    # directly on this class, mapped to private __native_* alias names.
    # Aliasing lets the pure-Ruby wrappers below own the public names
    # while the original native implementations stay reachable.
    NATIVE_ALIAS_MAP = {
      fast_load_corpus_from_file: :__native_fast_load_corpus_from_file,
      "corpus=": :__native_set_corpus,
      em: :__native_em,
      load_settings: :__native_load_settings,
      set_config: :__native_set_config,
      max_iter: :__native_max_iter,
      "max_iter=": :__native_set_max_iter,
      convergence: :__native_convergence,
      "convergence=": :__native_set_convergence,
      em_max_iter: :__native_em_max_iter,
      "em_max_iter=": :__native_set_em_max_iter,
      em_convergence: :__native_em_convergence,
      "em_convergence=": :__native_set_em_convergence,
      init_alpha: :__native_init_alpha,
      "init_alpha=": :__native_set_init_alpha,
      est_alpha: :__native_est_alpha,
      "est_alpha=": :__native_set_est_alpha,
      num_topics: :__native_num_topics,
      "num_topics=": :__native_set_num_topics,
      verbose: :__native_verbose,
      "verbose=": :__native_set_verbose,
      beta: :__native_beta,
      gamma: :__native_gamma,
      compute_phi: :__native_compute_phi,
      model: :__native_model
    }.freeze

    # Alias only the methods the loaded extension actually defined; when
    # no native extension is present this loop is a no-op.
    NATIVE_ALIAS_MAP.each do |native_name, alias_name|
      next unless method_defined?(native_name)

      alias_method alias_name, native_name
      private alias_name
    end

    attr_reader :vocab, :corpus, :backend

    # Build an Lda instance around +corpus+. +backend+ optionally names
    # the backend to use and +random_seed+ seeds it; both are passed
    # through to Backends.build unchanged. Settings start at the
    # defaults from load_default_settings, and the vocabulary is pulled
    # from the corpus when it exposes one.
    def initialize(corpus, backend: nil, random_seed: nil)
      @backend = Backends.build(host: self, requested: backend, random_seed: random_seed)

      load_default_settings

      @vocab = nil
      self.corpus = corpus # writer also hands the corpus to the backend
      @vocab = corpus.vocabulary.to_a if corpus.respond_to?(:vocabulary) && corpus.vocabulary

      @phi = nil # lazily computed/cached by #phi
    end

    # Name of the active backend (e.g. "native", "rust").
    def backend_name
      @backend.name
    end

    # True when the C-extension backend is active.
    def native_backend?
      backend_name == "native"
    end

    # True when the Rust-extension backend is active.
    def rust_backend?
      backend_name == "rust"
    end

    # Reset every estimation setting to its default. Returns the default
    # values as an array (in the same order they are assigned).
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    # Push a full configuration to the backend in one call, coercing each
    # value to its expected numeric type (Float/Integer raise on invalid
    # input). The last two parameters default to the current settings.
    def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence = self.em_convergence, est_alpha = self.est_alpha)
      @backend.set_config(
        Float(init_alpha),
        Integer(num_topics),
        Integer(max_iter),
        Float(convergence),
        Integer(em_max_iter),
        Float(em_convergence),
        Integer(est_alpha)
      )
    end

    # Maximum variational inference iterations per document.
    def max_iter
      @backend.max_iter
    end

    def max_iter=(value)
      @backend.max_iter = Integer(value)
    end

    # Convergence threshold for per-document inference.
    def convergence
      @backend.convergence
    end

    def convergence=(value)
      @backend.convergence = Float(value)
    end

    # Maximum EM iterations over the whole corpus.
    def em_max_iter
      @backend.em_max_iter
    end

    def em_max_iter=(value)
      @backend.em_max_iter = Integer(value)
    end

    # Convergence threshold for the outer EM loop.
    def em_convergence
      @backend.em_convergence
    end

    def em_convergence=(value)
      @backend.em_convergence = Float(value)
    end

    # Number of topics to estimate.
    def num_topics
      @backend.num_topics
    end

    def num_topics=(value)
      @backend.num_topics = Integer(value)
    end

    # Initial Dirichlet alpha hyperparameter.
    def init_alpha
      @backend.init_alpha
    end

    def init_alpha=(value)
      @backend.init_alpha = Float(value)
    end

    # Whether alpha is re-estimated during EM (integer flag, 1/0).
    def est_alpha
      @backend.est_alpha
    end

    def est_alpha=(value)
      @backend.est_alpha = Integer(value)
    end

    # Verbose-output flag; the writer normalizes any value to a boolean.
    def verbose
      @backend.verbose
    end

    def verbose=(value)
      @backend.verbose = !!value
    end

    # Assign a new corpus to both this facade and the backend.
    # Always returns true.
    def corpus=(corpus)
      @corpus = corpus
      @backend.corpus = corpus
      true
    end

    # Convenience alias for fast_load_corpus_from_file.
    def load_corpus(filename)
      fast_load_corpus_from_file(filename)
    end

    # Load a corpus in LDA-C data format via the backend. If the backend
    # exposes the corpus it built, adopt it (and its vocabulary); if the
    # backend left it unset and we have none, parse the file ourselves
    # with DataCorpus. Returns a boolean reflecting the backend result.
    def fast_load_corpus_from_file(filename)
      loaded = @backend.fast_load_corpus_from_file(filename)

      if @backend.corpus
        @corpus = @backend.corpus
        @vocab = @corpus.vocabulary.to_a if @corpus.respond_to?(:vocabulary) && @corpus.vocabulary
      elsif @corpus.nil?
        @corpus = DataCorpus.new(filename)
      end

      !!loaded
    end

    # Load estimation settings from +settings_file+ (format is defined
    # by the backend).
    def load_settings(settings_file)
      @backend.load_settings(settings_file)
    end

    # Set the vocabulary used for word lookups. Accepts an Array (deep
    # cloned), a Vocabulary, or anything else treated as a filename of
    # whitespace-separated words. Always returns true.
    def load_vocabulary(vocab)
      if vocab.is_a?(Array)
        @vocab = Marshal.load(Marshal.dump(vocab)) # deep clone array
      elsif vocab.is_a?(Vocabulary)
        @vocab = vocab.to_a
      else
        @vocab = File.read(vocab).split(/\s+/)
      end

      true
    end

    # Run EM estimation with the given start mode (stringified; default
    # "random"). Invalidates the cached phi matrix first.
    def em(start = "random")
      @phi = nil
      @backend.em(start.to_s)
    end

    # Topic-word distribution matrix from the backend.
    def beta
      @backend.beta
    end

    # Document-topic distribution matrix from the backend.
    def gamma
      @backend.gamma
    end

    # The backend's trained model object.
    def model
      @backend.model
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise "No vocabulary loaded." unless @vocab

      beta.each_with_index do |topic, topic_num|
        # Indices of the highest-scoring words, best first.
        indices = topic
          .each_with_index
          .sort_by { |score, _index| score }
          .reverse
          .first(words_per_topic)
          .map { |_score, index| index }

        puts "Topic #{topic_num}"
        puts "\t#{indices.map { |i| @vocab[i] }.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # After the model has been run and a vocabulary has been loaded, return the
    # +words_per_topic+ top words chosen by the model for each topic. This is
    # returned as a hash mapping the topic number to an array of top words
    # (in descending order of importance).
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    # See also +print_topics+.
    #
    def top_word_indices(words_per_topic = 10)
      raise "No vocabulary loaded." unless @vocab

      topics = {}

      beta.each_with_index do |topic, topic_num|
        topics[topic_num] = topic
          .each_with_index
          .sort_by { |score, _index| score }
          .reverse
          .first(words_per_topic)
          .map { |_score, index| index }
      end

      topics
    end

    # Like +top_word_indices+, but maps the indices through the loaded
    # vocabulary so the hash values are word strings.
    def top_words(words_per_topic = 10)
      output = {}

      topics = top_word_indices(words_per_topic)
      topics.each_pair do |topic_num, words|
        output[topic_num] = words.map { |w| @vocab[w] }
      end

      output
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document. The return value is a
    # 3D matrix: num_docs x doc_length x num_topics. The value is cached
    # after the first call, so if it needs to be recomputed, set the +recompute+
    # value to true.
    #
    def phi(recompute = false)
      @phi = compute_phi if @phi.nil? || recompute

      @phi
    end

    # Compute the phi matrix via the backend (uncached; see +phi+).
    def compute_phi
      @backend.compute_phi
    end

    #
    # Compute the average log probability for each topic for each document in the corpus.
    # This method returns a matrix: num_docs x num_topics with the average log probability
    # for the topic in the document.
    #
    def compute_topic_document_probability
      phi_matrix = phi
      document_counts = @corpus.documents.map(&:counts)

      # Prefer the backend's implementation, but only if its output has
      # the expected num_docs x num_topics shape of finite numbers.
      backend_output = @backend.topic_document_probability(phi_matrix, document_counts)
      if valid_topic_document_probability_output?(backend_output, document_counts.size, num_topics)
        return backend_output
      end

      # Fallback: count-weighted average of log word-topic probabilities.
      outp = []

      @corpus.documents.each_with_index do |doc, idx|
        tops = [0.0] * num_topics
        ttl = doc.counts.inject(0.0) { |sum, i| sum + i }

        phi_matrix[idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |top_prob, top_idx|
            # Clamp at 1e-300 so log(0) cannot produce -Infinity.
            tops[top_idx] += Math.log([top_prob, 1e-300].max) * doc.counts[word_idx]
          end
        end

        tops = tops.map { |i| i / ttl }
        outp << tops
      end

      outp
    end

    # Validate that +output+ is an Array of +expected_docs+ rows, each an
    # Array of +expected_topics+ finite Numerics. Used to decide whether
    # a backend's topic_document_probability result is trustworthy.
    def valid_topic_document_probability_output?(output, expected_docs, expected_topics)
      return false unless output.is_a?(Array)
      return false unless output.size == expected_docs

      output.each do |row|
        return false unless row.is_a?(Array)
        return false unless row.size == expected_topics
        row.each do |value|
          return false unless value.is_a?(Numeric)
          return false unless value.finite?
        end
      end

      true
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << format("    Initial alpha: %0.6f", init_alpha)
      outp << format("      # of topics: %d", num_topics)
      outp << format("   Max iterations: %d", max_iter)
      outp << format("      Convergence: %0.6f", convergence)
      outp << format("EM max iterations: %d", em_max_iter)
      outp << format("   EM convergence: %0.6f", em_convergence)
      outp << format("   Estimate alpha: %d", est_alpha)
      outp << format("          Backend: %s", backend_name)

      outp.join("\n")
    end
  end
end
|
Binary file
|