lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +61 -0
  3. data/Gemfile +9 -0
  4. data/README.md +157 -0
  5. data/VERSION.yml +5 -0
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/cokus.c +145 -0
  11. data/ext/lda-ruby/cokus.h +27 -0
  12. data/ext/lda-ruby/extconf.rb +13 -0
  13. data/ext/lda-ruby/lda-alpha.c +96 -0
  14. data/ext/lda-ruby/lda-alpha.h +21 -0
  15. data/ext/lda-ruby/lda-data.c +67 -0
  16. data/ext/lda-ruby/lda-data.h +14 -0
  17. data/ext/lda-ruby/lda-inference.c +1023 -0
  18. data/ext/lda-ruby/lda-inference.h +63 -0
  19. data/ext/lda-ruby/lda-model.c +345 -0
  20. data/ext/lda-ruby/lda-model.h +31 -0
  21. data/ext/lda-ruby/lda.h +54 -0
  22. data/ext/lda-ruby/utils.c +111 -0
  23. data/ext/lda-ruby/utils.h +18 -0
  24. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  25. data/ext/lda-ruby-rust/README.md +48 -0
  26. data/ext/lda-ruby-rust/extconf.rb +123 -0
  27. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  28. data/lda-ruby.gemspec +78 -0
  29. data/lib/lda-ruby/backends/base.rb +129 -0
  30. data/lib/lda-ruby/backends/native.rb +158 -0
  31. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  32. data/lib/lda-ruby/backends/rust.rb +226 -0
  33. data/lib/lda-ruby/backends.rb +58 -0
  34. data/lib/lda-ruby/config/stopwords.yml +571 -0
  35. data/lib/lda-ruby/corpus/corpus.rb +45 -0
  36. data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
  37. data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
  38. data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
  39. data/lib/lda-ruby/document/data_document.rb +30 -0
  40. data/lib/lda-ruby/document/document.rb +40 -0
  41. data/lib/lda-ruby/document/text_document.rb +39 -0
  42. data/lib/lda-ruby/lda.so +0 -0
  43. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  44. data/lib/lda-ruby/version.rb +5 -0
  45. data/lib/lda-ruby/vocabulary.rb +46 -0
  46. data/lib/lda-ruby.rb +413 -0
  47. data/lib/lda_ruby_rust.so +0 -0
  48. data/license.txt +504 -0
  49. data/test/backend_compatibility_test.rb +146 -0
  50. data/test/backends_selection_test.rb +100 -0
  51. data/test/data/docs.dat +46 -0
  52. data/test/data/sample.rb +20 -0
  53. data/test/data/wiki-test-docs.yml +123 -0
  54. data/test/gemspec_test.rb +27 -0
  55. data/test/lda_ruby_test.rb +319 -0
  56. data/test/packaged_gem_smoke_test.rb +33 -0
  57. data/test/release_scripts_test.rb +54 -0
  58. data/test/rust_build_policy_test.rb +23 -0
  59. data/test/simple_pipeline_test.rb +22 -0
  60. data/test/simple_yaml.rb +17 -0
  61. data/test/test_helper.rb +10 -0
  62. metadata +111 -0
@@ -0,0 +1,40 @@
1
require 'yaml'

module Lda
  # Base document type: holds the bag-of-words representation
  # (parallel @words / @counts arrays) plus the raw token list.
  class Document
    attr_reader :corpus, :words, :counts, :length, :total, :tokens

    # Creates an empty document belonging to +corpus+.
    def initialize(corpus)
      @corpus = corpus

      @words  = []
      @counts = []
      @tokens = []
      @length = 0
      @total  = 0
    end

    #
    # Recompute the total and length values.
    #
    def recompute
      @total  = @counts.inject(0) { |acc, count| acc + count }
      @length = @words.size
    end

    # Base documents are not text-backed; subclasses may override.
    def text?
      false
    end

    # Hook for subclasses to post-process the token list; the default
    # implementation passes tokens through untouched.
    def handle(tokens)
      tokens
    end

    # Populate @tokens from raw +text+: keep only letters (incl. German
    # umlauts), apostrophes and hyphens, collapse whitespace, lower-case.
    def tokenize(text)
      # remove everything but letters and ' and leave only single spaces
      cleaned = text.gsub(/[^a-zäöüß'-]+/i, ' ')
      cleaned = cleaned.gsub(/\s+/, ' ')
      @tokens = handle(cleaned.downcase.split(' '))
      nil
    end
  end
end
@@ -0,0 +1,39 @@
1
module Lda
  # A document built from free text: tokenizes the input, removes the
  # corpus stopwords, then tallies tokens into the word/count vectors.
  class TextDocument < Document
    attr_reader :filename

    def initialize(corpus, text)
      super(corpus)
      @filename = nil

      tokenize(text)
      @corpus.stopwords.each { |stopword| @tokens.delete(stopword) }
      build_from_tokens
    end

    # Text-backed, unlike the base Document.
    def text?
      true
    end

    # Convenience constructor: reads +filename+ and remembers where the
    # text came from (the stored name is frozen).
    def self.build_from_file(corpus, filename)
      contents = File.read(filename)
      doc = new(corpus, contents)
      doc.instance_variable_set(:@filename, filename.dup.freeze)
      doc
    end

    protected

    # Tally the token list and register each distinct token with the
    # corpus vocabulary; vocabulary indices are 1-based, so they are
    # shifted to 0-based before being stored in @words.
    def build_from_tokens
      tally = Hash.new(0)
      @tokens.each { |token| tally[token] += 1 }

      tally.each_pair do |word, count|
        @words  << @corpus.vocabulary.check_word(word) - 1
        @counts << count
      end

      recompute
    end
  end
end
Binary file
@@ -0,0 +1,21 @@
1
# frozen_string_literal: true

module Lda
  # Interprets the LDA_RUBY_RUST_BUILD environment switch, which controls
  # whether the Rust extension should be built. Recognised modes are
  # "auto", "always" and "never"; anything else falls back to "auto".
  module RustBuildPolicy
    ENV_KEY = "LDA_RUBY_RUST_BUILD"
    AUTO = "auto"
    ALWAYS = "always"
    NEVER = "never"
    VALID_VALUES = [AUTO, ALWAYS, NEVER].freeze

    module_function

    # Normalises +raw_value+ (nil-safe, whitespace- and case-insensitive)
    # and returns it when it is a recognised mode, otherwise AUTO.
    def resolve(raw_value = ENV[ENV_KEY])
      normalized = raw_value.to_s.strip.downcase
      VALID_VALUES.include?(normalized) ? normalized : AUTO
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

module Lda
  # Gem version string.
  # NOTE(review): the package also ships data/VERSION.yml — presumably this
  # constant must stay in sync with it; confirm against the release process.
  VERSION = "0.4.0"
end
@@ -0,0 +1,46 @@
1
module Lda
  # Bidirectional word <-> index mapping. Indices are 1-based and are
  # handed out in registration order; the @words hash keeps its own
  # counter under the reserved :MAX_VALUE key (a deliberate sentinel,
  # excluded from num_words and to_a).
  class Vocabulary
    attr_reader :words, :indexes

    # Builds a vocabulary, optionally pre-registering +words+.
    def initialize(words = nil)
      @words = Hash.new do |hash, key|
        # Bump the sentinel counter and assign the new index to the key.
        hash[:MAX_VALUE] = hash.fetch(:MAX_VALUE, 0) + 1
        hash[key] = hash[:MAX_VALUE]
      end

      words&.each { |word| @words[word] }
      @indexes = {}

      @words.each_pair { |word, index| @indexes[index] = word }
    end

    # Returns the 1-based index for +word+, registering it on first sight.
    def check_word(word)
      index = @words[word.dup]
      @indexes[index] = word.dup
      index
    end

    # Registers one word per line from a plain-text file.
    def load_file(filename)
      contents = File.read(filename)
      contents.split(/[\n\r]+/).each { |word| check_word(word) }
    end

    # Registers every entry of a YAML array file.
    def load_yaml(filename)
      YAML::load_file(filename).each { |word| check_word(word) }
    end

    # Number of registered words (the :MAX_VALUE sentinel is excluded).
    def num_words
      @words.empty? ? 0 : @words.size - 1
    end

    # Words ordered by index, without the sentinel entry.
    def to_a
      @words.sort_by { |_word, index| index }.map(&:first).reject { |w| w == :MAX_VALUE }
    end
  end
end
data/lib/lda-ruby.rb ADDED
@@ -0,0 +1,413 @@
1
# frozen_string_literal: true

require "lda-ruby/version"
require "rbconfig"

# --- Optional Rust extension --------------------------------------------
# Try each candidate location in order: the installed-gem layout first
# (a bare name resolved via $LOAD_PATH), then in-repo cargo build
# artifacts (release before debug), each both without and with the
# platform's shared-library suffix (RbConfig DLEXT).
rust_extension_loaded = false
rust_dlext = RbConfig::CONFIG.fetch("DLEXT")

[
  "lda_ruby_rust",
  "../ext/lda-ruby-rust/target/release/lda_ruby_rust",
  "../ext/lda-ruby-rust/target/release/lda_ruby_rust.#{rust_dlext}",
  "../ext/lda-ruby-rust/target/debug/lda_ruby_rust",
  "../ext/lda-ruby-rust/target/debug/lda_ruby_rust.#{rust_dlext}"
].each do |rust_extension_candidate|
  begin
    # Relative candidates resolve against this file's directory; bare
    # names go through the regular require mechanism.
    if rust_extension_candidate.start_with?("../")
      require_relative rust_extension_candidate
    else
      require rust_extension_candidate
    end

    rust_extension_loaded = true
    break
  rescue LoadError
    next # this candidate is absent; try the next location
  end
end

# --- Optional C extension ------------------------------------------------
# Prefer the packaged location, fall back to an in-repo build; if both
# fail the flag stays false and only non-native backends are usable.
native_extension_loaded = false

begin
  require "lda-ruby/lda"
  native_extension_loaded = true
rescue LoadError
  begin
    require_relative "../ext/lda-ruby/lda"
    native_extension_loaded = true
  rescue LoadError
    native_extension_loaded = false
  end
end

# Top-level flags recording which extensions loaded. Guarded with
# defined?() so re-loading this file does not warn about redefinition.
LDA_RUBY_NATIVE_EXTENSION_LOADED = native_extension_loaded unless defined?(LDA_RUBY_NATIVE_EXTENSION_LOADED)
LDA_RUBY_RUST_EXTENSION_LOADED = rust_extension_loaded unless defined?(LDA_RUBY_RUST_EXTENSION_LOADED)

# Ruby-level library files. Document must load before its subclasses;
# document.rb also pulls in 'yaml' for the vocabulary loaders below.
require "lda-ruby/document/document"
require "lda-ruby/document/data_document"
require "lda-ruby/document/text_document"
require "lda-ruby/corpus/corpus"
require "lda-ruby/corpus/data_corpus"
require "lda-ruby/corpus/text_corpus"
require "lda-ruby/corpus/directory_corpus"
require "lda-ruby/vocabulary"
require "lda-ruby/backends"
56
+
57
module Lda
  # Mirror the top-level extension flags inside the namespace (guarded so
  # repeated loads do not warn).
  RUST_EXTENSION_LOADED = LDA_RUBY_RUST_EXTENSION_LOADED unless const_defined?(:RUST_EXTENSION_LOADED)
  NATIVE_EXTENSION_LOADED = LDA_RUBY_NATIVE_EXTENSION_LOADED unless const_defined?(:NATIVE_EXTENSION_LOADED)

  # Public facade for LDA estimation. All heavy lifting is delegated to a
  # backend object (see Backends.build); the methods below only coerce
  # arguments and cache results.
  class Lda
    # Methods the C extension may have defined directly on this class,
    # mapped to private __native_* alias names. Aliasing moves the native
    # implementations out of the way so the plain Ruby methods below can
    # own the public names while backends can still call the natives.
    NATIVE_ALIAS_MAP = {
      fast_load_corpus_from_file: :__native_fast_load_corpus_from_file,
      "corpus=": :__native_set_corpus,
      em: :__native_em,
      load_settings: :__native_load_settings,
      set_config: :__native_set_config,
      max_iter: :__native_max_iter,
      "max_iter=": :__native_set_max_iter,
      convergence: :__native_convergence,
      "convergence=": :__native_set_convergence,
      em_max_iter: :__native_em_max_iter,
      "em_max_iter=": :__native_set_em_max_iter,
      em_convergence: :__native_em_convergence,
      "em_convergence=": :__native_set_em_convergence,
      init_alpha: :__native_init_alpha,
      "init_alpha=": :__native_set_init_alpha,
      est_alpha: :__native_est_alpha,
      "est_alpha=": :__native_set_est_alpha,
      num_topics: :__native_num_topics,
      "num_topics=": :__native_set_num_topics,
      verbose: :__native_verbose,
      "verbose=": :__native_set_verbose,
      beta: :__native_beta,
      gamma: :__native_gamma,
      compute_phi: :__native_compute_phi,
      model: :__native_model
    }.freeze

    NATIVE_ALIAS_MAP.each do |native_name, alias_name|
      # Skip entries the extension did not define (e.g. extension absent).
      next unless method_defined?(native_name)

      alias_method alias_name, native_name
      private alias_name
    end

    attr_reader :vocab, :corpus, :backend

    # Builds an estimator for +corpus+. +backend+ optionally names a
    # specific backend; +random_seed+ is forwarded to the backend builder.
    def initialize(corpus, backend: nil, random_seed: nil)
      @backend = Backends.build(host: self, requested: backend, random_seed: random_seed)

      load_default_settings

      @vocab = nil
      self.corpus = corpus
      @vocab = corpus.vocabulary.to_a if corpus.respond_to?(:vocabulary) && corpus.vocabulary

      @phi = nil
    end

    # Name of the active backend (e.g. "native", "rust").
    def backend_name
      @backend.name
    end

    def native_backend?
      backend_name == "native"
    end

    def rust_backend?
      backend_name == "rust"
    end

    # Applies the library's default hyper-parameters and returns them as
    # an array (in setter order) for convenience.
    def load_default_settings
      self.max_iter = 20
      self.convergence = 1e-6
      self.em_max_iter = 100
      self.em_convergence = 1e-4
      self.num_topics = 20
      self.init_alpha = 0.3
      self.est_alpha = 1

      [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
    end

    # Sets all hyper-parameters at once, coercing each to its expected
    # numeric type before handing them to the backend.
    def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence = self.em_convergence, est_alpha = self.est_alpha)
      @backend.set_config(
        Float(init_alpha),
        Integer(num_topics),
        Integer(max_iter),
        Float(convergence),
        Integer(em_max_iter),
        Float(em_convergence),
        Integer(est_alpha)
      )
    end

    # --- Hyper-parameter accessors: coerce on write, delegate to backend ---

    def max_iter
      @backend.max_iter
    end

    def max_iter=(value)
      @backend.max_iter = Integer(value)
    end

    def convergence
      @backend.convergence
    end

    def convergence=(value)
      @backend.convergence = Float(value)
    end

    def em_max_iter
      @backend.em_max_iter
    end

    def em_max_iter=(value)
      @backend.em_max_iter = Integer(value)
    end

    def em_convergence
      @backend.em_convergence
    end

    def em_convergence=(value)
      @backend.em_convergence = Float(value)
    end

    def num_topics
      @backend.num_topics
    end

    def num_topics=(value)
      @backend.num_topics = Integer(value)
    end

    def init_alpha
      @backend.init_alpha
    end

    def init_alpha=(value)
      @backend.init_alpha = Float(value)
    end

    def est_alpha
      @backend.est_alpha
    end

    def est_alpha=(value)
      @backend.est_alpha = Integer(value)
    end

    def verbose
      @backend.verbose
    end

    def verbose=(value)
      @backend.verbose = !!value # normalise any truthy/falsy input to a boolean
    end

    # Assigns the corpus on both the facade and the backend; returns true.
    def corpus=(corpus)
      @corpus = corpus
      @backend.corpus = corpus
      true
    end

    # Alias-style entry point for fast_load_corpus_from_file.
    def load_corpus(filename)
      fast_load_corpus_from_file(filename)
    end

    # Loads a corpus from a data file via the backend. If the backend
    # materialises a corpus object, adopt it (and its vocabulary); if it
    # does not and we have none yet, fall back to parsing the file as a
    # DataCorpus. Returns true/false for the backend's load result.
    def fast_load_corpus_from_file(filename)
      loaded = @backend.fast_load_corpus_from_file(filename)

      if @backend.corpus
        @corpus = @backend.corpus
        @vocab = @corpus.vocabulary.to_a if @corpus.respond_to?(:vocabulary) && @corpus.vocabulary
      elsif @corpus.nil?
        @corpus = DataCorpus.new(filename)
      end

      !!loaded
    end

    # Delegates settings-file parsing to the backend.
    def load_settings(settings_file)
      @backend.load_settings(settings_file)
    end

    # Accepts a vocabulary as an Array (deep-cloned), a Vocabulary object,
    # or anything else treated as a whitespace-separated file path.
    def load_vocabulary(vocab)
      if vocab.is_a?(Array)
        @vocab = Marshal.load(Marshal.dump(vocab)) # deep clone array
      elsif vocab.is_a?(Vocabulary)
        @vocab = vocab.to_a
      else
        @vocab = File.read(vocab).split(/\s+/)
      end

      true
    end

    # Runs EM estimation; +start+ selects the initialisation strategy
    # (passed to the backend as a string). Invalidates the cached phi.
    def em(start = "random")
      @phi = nil
      @backend.em(start.to_s)
    end

    def beta
      @backend.beta
    end

    def gamma
      @backend.gamma
    end

    def model
      @backend.model
    end

    #
    # Visualization method for printing out the top +words_per_topic+ words
    # for each topic.
    #
    # See also +top_words+.
    #
    def print_topics(words_per_topic = 10)
      raise "No vocabulary loaded." unless @vocab

      beta.each_with_index do |topic, topic_num|
        # Pick the indices of the highest-scoring words in this topic.
        indices = topic
          .each_with_index
          .sort_by { |score, _index| score }
          .reverse
          .first(words_per_topic)
          .map { |_score, index| index }

        puts "Topic #{topic_num}"
        puts "\t#{indices.map { |i| @vocab[i] }.join("\n\t")}"
        puts ""
      end

      nil
    end

    #
    # After the model has been run and a vocabulary has been loaded, return the
    # +words_per_topic+ top words chosen by the model for each topic. This is
    # returned as a hash mapping the topic number to an array of top words
    # (in descending order of importance).
    #
    #   topic_number => [w1, w2, ..., w_n]
    #
    # See also +print_topics+.
    #
    def top_word_indices(words_per_topic = 10)
      raise "No vocabulary loaded." unless @vocab

      topics = {}

      beta.each_with_index do |topic, topic_num|
        topics[topic_num] = topic
          .each_with_index
          .sort_by { |score, _index| score }
          .reverse
          .first(words_per_topic)
          .map { |_score, index| index }
      end

      topics
    end

    # Like +top_word_indices+, but maps the indices through the loaded
    # vocabulary to return actual word strings per topic.
    def top_words(words_per_topic = 10)
      output = {}

      topics = top_word_indices(words_per_topic)
      topics.each_pair do |topic_num, words|
        output[topic_num] = words.map { |w| @vocab[w] }
      end

      output
    end

    #
    # Get the phi matrix which can be used to assign probabilities to words
    # belonging to a specific topic in each document. The return value is a
    # 3D matrix: num_docs x doc_length x num_topics. The value is cached
    # after the first call, so if it needs to be recomputed, set the +recompute+
    # value to true.
    #
    def phi(recompute = false)
      @phi = compute_phi if @phi.nil? || recompute

      @phi
    end

    def compute_phi
      @backend.compute_phi
    end

    #
    # Compute the average log probability for each topic for each document in the corpus.
    # This method returns a matrix: num_docs x num_topics with the average log probability
    # for the topic in the document.
    #
    def compute_topic_document_probability
      phi_matrix = phi
      document_counts = @corpus.documents.map(&:counts)

      # Prefer the backend's implementation when it returns a well-formed
      # matrix; otherwise fall back to the pure-Ruby computation below.
      backend_output = @backend.topic_document_probability(phi_matrix, document_counts)
      if valid_topic_document_probability_output?(backend_output, document_counts.size, num_topics)
        return backend_output
      end

      outp = []

      @corpus.documents.each_with_index do |doc, idx|
        tops = [0.0] * num_topics
        ttl = doc.counts.inject(0.0) { |sum, i| sum + i }

        phi_matrix[idx].each_with_index do |word_dist, word_idx|
          word_dist.each_with_index do |top_prob, top_idx|
            # Clamp to 1e-300 so log never sees zero; weight by word count.
            tops[top_idx] += Math.log([top_prob, 1e-300].max) * doc.counts[word_idx]
          end
        end

        tops = tops.map { |i| i / ttl }
        outp << tops
      end

      outp
    end

    # Validates that +output+ is a num_docs x num_topics matrix of finite
    # numbers — the shape contract for topic_document_probability.
    def valid_topic_document_probability_output?(output, expected_docs, expected_topics)
      return false unless output.is_a?(Array)
      return false unless output.size == expected_docs

      output.each do |row|
        return false unless row.is_a?(Array)
        return false unless row.size == expected_topics
        row.each do |value|
          return false unless value.is_a?(Numeric)
          return false unless value.finite?
        end
      end

      true
    end

    #
    # String representation displaying current settings.
    #
    def to_s
      outp = ["LDA Settings:"]
      outp << format("    Initial alpha: %0.6f", init_alpha)
      outp << format("      # of topics: %d", num_topics)
      outp << format("   Max iterations: %d", max_iter)
      outp << format("      Convergence: %0.6f", convergence)
      outp << format("EM max iterations: %d", em_max_iter)
      outp << format("   EM convergence: %0.6f", em_convergence)
      outp << format("   Estimate alpha: %d", est_alpha)
      outp << format("          Backend: %s", backend_name)

      outp.join("\n")
    end
  end
end
Binary file