lda-ruby 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +5 -13
  2. data/CHANGELOG.md +8 -0
  3. data/Gemfile +9 -0
  4. data/README.md +123 -3
  5. data/VERSION.yml +3 -3
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/extconf.rb +10 -6
  11. data/ext/lda-ruby/lda-inference.c +21 -5
  12. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  13. data/ext/lda-ruby-rust/README.md +48 -0
  14. data/ext/lda-ruby-rust/extconf.rb +123 -0
  15. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  16. data/lda-ruby.gemspec +0 -0
  17. data/lib/lda-ruby/backends/base.rb +129 -0
  18. data/lib/lda-ruby/backends/native.rb +158 -0
  19. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  20. data/lib/lda-ruby/backends/rust.rb +226 -0
  21. data/lib/lda-ruby/backends.rb +58 -0
  22. data/lib/lda-ruby/corpus/corpus.rb +17 -15
  23. data/lib/lda-ruby/corpus/data_corpus.rb +2 -2
  24. data/lib/lda-ruby/corpus/directory_corpus.rb +2 -2
  25. data/lib/lda-ruby/corpus/text_corpus.rb +2 -2
  26. data/lib/lda-ruby/document/document.rb +6 -6
  27. data/lib/lda-ruby/document/text_document.rb +5 -4
  28. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  29. data/lib/lda-ruby/version.rb +5 -0
  30. data/lib/lda-ruby.rb +293 -48
  31. data/test/backend_compatibility_test.rb +146 -0
  32. data/test/backends_selection_test.rb +100 -0
  33. data/test/gemspec_test.rb +27 -0
  34. data/test/lda_ruby_test.rb +49 -11
  35. data/test/packaged_gem_smoke_test.rb +33 -0
  36. data/test/release_scripts_test.rb +54 -0
  37. data/test/rust_build_policy_test.rb +23 -0
  38. data/test/simple_pipeline_test.rb +22 -0
  39. data/test/simple_yaml.rb +1 -7
  40. data/test/test_helper.rb +5 -6
  41. metadata +48 -38
  42. data/Rakefile +0 -61
  43. data/ext/lda-ruby/Makefile +0 -181
  44. data/test/data/.gitignore +0 -2
  45. data/test/simple_test.rb +0 -26
@@ -0,0 +1,226 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ module Backends
5
+ class Rust < Base
6
+ SETTINGS = %i[max_iter convergence em_max_iter em_convergence num_topics init_alpha est_alpha verbose].freeze
7
+
8
+ def self.available?
9
+ return false unless defined?(::Lda::RUST_EXTENSION_LOADED) && ::Lda::RUST_EXTENSION_LOADED
10
+ return false unless defined?(::Lda::RustBackend)
11
+
12
+ if ::Lda::RustBackend.respond_to?(:available?)
13
+ ::Lda::RustBackend.available?
14
+ else
15
+ true
16
+ end
17
+ rescue StandardError
18
+ false
19
+ end
20
+
21
+ SETTINGS.each do |setting_name|
22
+ define_method(setting_name) do
23
+ @fallback.public_send(setting_name)
24
+ end
25
+
26
+ define_method("#{setting_name}=") do |value|
27
+ @fallback.public_send("#{setting_name}=", value)
28
+ end
29
+ end
30
+
31
+ def initialize(random_seed: nil)
32
+ super(random_seed: random_seed)
33
+ raise LoadError, "Rust backend is unavailable for this environment" unless self.class.available?
34
+
35
+ @fallback = PureRuby.new(random_seed: random_seed)
36
+ @fallback.topic_weights_kernel = method(:rust_topic_weights_for_word)
37
+ @fallback.topic_term_accumulator_kernel = method(:rust_accumulate_topic_term_counts)
38
+ @fallback.document_inference_kernel = method(:rust_infer_document)
39
+ @fallback.corpus_iteration_kernel = method(:rust_infer_corpus_iteration)
40
+ @fallback.topic_term_finalizer_kernel = method(:rust_finalize_topic_term_counts)
41
+ @fallback.gamma_shift_kernel = method(:rust_average_gamma_shift)
42
+ @fallback.topic_document_probability_kernel = method(:rust_topic_document_probability)
43
+ @fallback.topic_term_seed_kernel = method(:rust_seeded_topic_term_probabilities)
44
+ @fallback.trusted_kernel_outputs = true
45
+ end
46
+
47
+ def name
48
+ "rust"
49
+ end
50
+
51
+ def corpus=(corpus)
52
+ @corpus = corpus
53
+ @fallback.corpus = corpus
54
+ true
55
+ end
56
+
57
+ def fast_load_corpus_from_file(filename)
58
+ loaded = @fallback.fast_load_corpus_from_file(filename)
59
+ @corpus = @fallback.corpus
60
+ loaded
61
+ end
62
+
63
+ def load_settings(settings_file)
64
+ loaded = @fallback.load_settings(settings_file)
65
+ @corpus = @fallback.corpus
66
+ loaded
67
+ end
68
+
69
+ def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
70
+ @fallback.set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence, est_alpha)
71
+ end
72
+
73
+ def em(start)
74
+ rust_before_em(start)
75
+ @fallback.em(start)
76
+ end
77
+
78
+ def beta
79
+ @fallback.beta
80
+ end
81
+
82
+ def gamma
83
+ @fallback.gamma
84
+ end
85
+
86
+ def compute_phi
87
+ @fallback.compute_phi
88
+ end
89
+
90
+ def model
91
+ @fallback.model
92
+ end
93
+
94
+ private
95
+
96
+ def rust_before_em(start)
97
+ return unless defined?(::Lda::RustBackend)
98
+ return unless ::Lda::RustBackend.respond_to?(:before_em)
99
+
100
+ ::Lda::RustBackend.before_em(start.to_s, @corpus&.num_docs.to_i, @corpus&.num_terms.to_i)
101
+ rescue StandardError
102
+ nil
103
+ end
104
+
105
+ def rust_topic_weights_for_word(beta_probabilities, gamma, word_index, min_probability)
106
+ return nil unless defined?(::Lda::RustBackend)
107
+ return nil unless ::Lda::RustBackend.respond_to?(:topic_weights_for_word)
108
+
109
+ ::Lda::RustBackend.topic_weights_for_word(
110
+ beta_probabilities,
111
+ gamma,
112
+ Integer(word_index),
113
+ Float(min_probability)
114
+ )
115
+ rescue StandardError
116
+ nil
117
+ end
118
+
119
+ def rust_accumulate_topic_term_counts(topic_term_counts, phi_d, words, counts)
120
+ return nil unless defined?(::Lda::RustBackend)
121
+ return nil unless ::Lda::RustBackend.respond_to?(:accumulate_topic_term_counts)
122
+
123
+ ::Lda::RustBackend.accumulate_topic_term_counts(
124
+ topic_term_counts,
125
+ phi_d,
126
+ words,
127
+ counts
128
+ )
129
+ rescue StandardError
130
+ nil
131
+ end
132
+
133
+ def rust_infer_document(beta_probabilities, gamma_initial, words, counts, max_iter, convergence, min_probability, init_alpha)
134
+ return nil unless defined?(::Lda::RustBackend)
135
+ return nil unless ::Lda::RustBackend.respond_to?(:infer_document)
136
+
137
+ output = ::Lda::RustBackend.infer_document(
138
+ beta_probabilities,
139
+ gamma_initial,
140
+ words,
141
+ counts,
142
+ Integer(max_iter),
143
+ Float(convergence),
144
+ Float(min_probability),
145
+ Float(init_alpha)
146
+ )
147
+
148
+ return nil unless output.is_a?(Array)
149
+ return nil if output.empty?
150
+
151
+ gamma = output.first
152
+ phi_rows = output[1..] || []
153
+ [gamma, phi_rows]
154
+ rescue StandardError
155
+ nil
156
+ end
157
+
158
+ def rust_infer_corpus_iteration(beta_probabilities, document_words, document_counts, max_iter, convergence, min_probability, init_alpha)
159
+ return nil unless defined?(::Lda::RustBackend)
160
+ return nil unless ::Lda::RustBackend.respond_to?(:infer_corpus_iteration)
161
+
162
+ ::Lda::RustBackend.infer_corpus_iteration(
163
+ beta_probabilities,
164
+ document_words,
165
+ document_counts,
166
+ Integer(max_iter),
167
+ Float(convergence),
168
+ Float(min_probability),
169
+ Float(init_alpha)
170
+ )
171
+ rescue StandardError
172
+ nil
173
+ end
174
+
175
+ def rust_finalize_topic_term_counts(topic_term_counts, min_probability)
176
+ return nil unless defined?(::Lda::RustBackend)
177
+ return nil unless ::Lda::RustBackend.respond_to?(:normalize_topic_term_counts)
178
+
179
+ ::Lda::RustBackend.normalize_topic_term_counts(
180
+ topic_term_counts,
181
+ Float(min_probability)
182
+ )
183
+ rescue StandardError
184
+ nil
185
+ end
186
+
187
+ def rust_average_gamma_shift(previous_gamma, current_gamma)
188
+ return nil unless defined?(::Lda::RustBackend)
189
+ return nil unless ::Lda::RustBackend.respond_to?(:average_gamma_shift)
190
+
191
+ ::Lda::RustBackend.average_gamma_shift(previous_gamma, current_gamma)
192
+ rescue StandardError
193
+ nil
194
+ end
195
+
196
+ def rust_topic_document_probability(phi_matrix, document_counts, num_topics, min_probability)
197
+ return nil unless defined?(::Lda::RustBackend)
198
+ return nil unless ::Lda::RustBackend.respond_to?(:topic_document_probability)
199
+
200
+ ::Lda::RustBackend.topic_document_probability(
201
+ phi_matrix,
202
+ document_counts,
203
+ Integer(num_topics),
204
+ Float(min_probability)
205
+ )
206
+ rescue StandardError
207
+ nil
208
+ end
209
+
210
+ def rust_seeded_topic_term_probabilities(document_words, document_counts, topics, terms, min_probability)
211
+ return nil unless defined?(::Lda::RustBackend)
212
+ return nil unless ::Lda::RustBackend.respond_to?(:seeded_topic_term_probabilities)
213
+
214
+ ::Lda::RustBackend.seeded_topic_term_probabilities(
215
+ document_words,
216
+ document_counts,
217
+ Integer(topics),
218
+ Integer(terms),
219
+ Float(min_probability)
220
+ )
221
+ rescue StandardError
222
+ nil
223
+ end
224
+ end
225
+ end
226
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "lda-ruby/backends/base"
4
+ require "lda-ruby/backends/rust"
5
+ require "lda-ruby/backends/native"
6
+ require "lda-ruby/backends/pure_ruby"
7
+
8
+ module Lda
9
+ module Backends
10
+ class << self
11
+ def build(host:, requested: nil, random_seed: nil)
12
+ mode = normalize_mode(requested)
13
+
14
+ case mode
15
+ when :auto
16
+ if Rust.available?
17
+ Rust.new(random_seed: random_seed)
18
+ elsif Native.available?(host)
19
+ Native.new(host, random_seed: random_seed)
20
+ else
21
+ PureRuby.new(random_seed: random_seed)
22
+ end
23
+ when :rust
24
+ raise LoadError, "Rust backend is unavailable for this environment" unless Rust.available?
25
+
26
+ Rust.new(random_seed: random_seed)
27
+ when :native
28
+ raise LoadError, "Native backend is unavailable for this environment" unless Native.available?(host)
29
+
30
+ Native.new(host, random_seed: random_seed)
31
+ when :pure
32
+ PureRuby.new(random_seed: random_seed)
33
+ else
34
+ raise ArgumentError, "Unknown backend mode: #{requested.inspect}"
35
+ end
36
+ end
37
+
38
+ private
39
+
40
+ def normalize_mode(requested)
41
+ raw_mode = requested || ENV.fetch("LDA_RUBY_BACKEND", "auto")
42
+
43
+ case raw_mode.to_s.strip.downcase
44
+ when "", "auto"
45
+ :auto
46
+ when "native", "c"
47
+ :native
48
+ when "rust", "rust_native"
49
+ :rust
50
+ when "pure", "ruby", "pure_ruby"
51
+ :pure
52
+ else
53
+ raw_mode
54
+ end
55
+ end
56
+ end
57
+ end
58
+ end
@@ -1,24 +1,26 @@
1
- require 'set'
1
+ require "set"
2
+ require "yaml"
2
3
 
3
4
  module Lda
4
5
  class Corpus
5
6
  attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords
6
7
 
7
8
  def initialize(stop_word_list = nil)
8
- @documents = Array.new
9
+ @documents = []
9
10
  @all_terms = Set.new
10
11
  @num_terms = @num_docs = 0
11
12
  @vocabulary = Vocabulary.new
12
- if stop_word_list.nil?
13
- @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
14
- else
15
- @stopwords = YAML.load_file(stop_word_list)
16
- end
17
- @stopwords.map! { |w| w.strip }
13
+ @stopwords = if stop_word_list.nil?
14
+ File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml')
15
+ else
16
+ stop_word_list
17
+ end
18
+ @stopwords = YAML.load_file(@stopwords)
19
+ @stopwords.map!(&:strip)
18
20
  end
19
-
21
+
20
22
  def add_document(doc)
21
- raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)
23
+ raise 'Parameter +doc+ must be of type Document' unless doc.is_a?(Document)
22
24
 
23
25
  @documents << doc
24
26
 
@@ -29,11 +31,11 @@ module Lda
29
31
  update_vocabulary(doc)
30
32
  nil
31
33
  end
32
-
33
- def remove_word(word)
34
- @vocabulary.words.delete word
35
- end
36
-
34
+
35
+ def remove_word(word)
36
+ @vocabulary.words.delete word
37
+ end
38
+
37
39
  protected
38
40
 
39
41
  def update_vocabulary(doc)
@@ -12,11 +12,11 @@ module Lda
12
12
  protected
13
13
 
14
14
  def load_from_file
15
- txt = File.open(@filename, 'r') { |f| f.read }
15
+ txt = File.open(@filename, 'r', &:read)
16
16
  lines = txt.split(/[\r\n]+/)
17
17
  lines.each do |line|
18
18
  add_document(DataDocument.new(self, line))
19
19
  end
20
20
  end
21
21
  end
22
- end
22
+ end
@@ -15,11 +15,11 @@ module Lda
15
15
  protected
16
16
 
17
17
  def load_from_directory
18
- dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))
18
+ dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : '*'))
19
19
 
20
20
  Dir.glob(dir_glob).each do |filename|
21
21
  add_document(TextDocument.build_from_file(self, filename))
22
22
  end
23
23
  end
24
24
  end
25
- end
25
+ end
@@ -6,7 +6,7 @@ module Lda
6
6
  def initialize(input_data)
7
7
  super()
8
8
 
9
- docs = if input_data.is_a?(String) && File.exists?(input_data)
9
+ docs = if input_data.is_a?(String) && File.exist?(input_data)
10
10
  # yaml file containing an array of strings representing each document
11
11
  YAML.load_file(input_data)
12
12
  elsif input_data.is_a?(Array)
@@ -16,7 +16,7 @@ module Lda
16
16
  # a single string representing one document
17
17
  [input_data]
18
18
  else
19
- raise "Unknown input type: please pass in a valid filename or an array of strings."
19
+ raise 'Unknown input type: please pass in a valid filename or an array of strings.'
20
20
  end
21
21
 
22
22
  docs.each do |doc|
@@ -1,4 +1,3 @@
1
- # coding: utf-8
2
1
  require 'yaml'
3
2
 
4
3
  module Lda
@@ -8,9 +7,9 @@ module Lda
8
7
  def initialize(corpus)
9
8
  @corpus = corpus
10
9
 
11
- @words = Array.new
12
- @counts = Array.new
13
- @tokens = Array.new
10
+ @words = []
11
+ @counts = []
12
+ @tokens = []
14
13
  @length = 0
15
14
  @total = 0
16
15
  end
@@ -23,7 +22,7 @@ module Lda
23
22
  @length = @words.size
24
23
  end
25
24
 
26
- def has_text?
25
+ def text?
27
26
  false
28
27
  end
29
28
 
@@ -32,7 +31,8 @@ module Lda
32
31
  end
33
32
 
34
33
  def tokenize(text)
35
- clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
34
+ # remove everything but letters and ' and leave only single spaces
35
+ clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase
36
36
  @tokens = handle(clean_text.split(' '))
37
37
  nil
38
38
  end
@@ -11,14 +11,15 @@ module Lda
11
11
  build_from_tokens
12
12
  end
13
13
 
14
- def has_text?
14
+ def text?
15
15
  true
16
16
  end
17
17
 
18
18
  def self.build_from_file(corpus, filename)
19
- @filename = filename.dup.freeze
20
- text = File.open(@filename, 'r') { |f| f.read }
21
- self.new(corpus, text)
19
+ text = File.read(filename)
20
+ document = new(corpus, text)
21
+ document.instance_variable_set(:@filename, filename.dup.freeze)
22
+ document
22
23
  end
23
24
 
24
25
  protected
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ module RustBuildPolicy
5
+ ENV_KEY = "LDA_RUBY_RUST_BUILD"
6
+ AUTO = "auto"
7
+ ALWAYS = "always"
8
+ NEVER = "never"
9
+ VALID_VALUES = [AUTO, ALWAYS, NEVER].freeze
10
+
11
+ module_function
12
+
13
+ def resolve(raw_value = ENV[ENV_KEY])
14
+ value = raw_value.to_s.strip.downcase
15
+ return AUTO if value.empty?
16
+ return value if VALID_VALUES.include?(value)
17
+
18
+ AUTO
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ VERSION = "0.4.0"
5
+ end