lda-ruby 0.3.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +5 -13
  2. data/CHANGELOG.md +16 -0
  3. data/Gemfile +9 -0
  4. data/README.md +126 -3
  5. data/VERSION.yml +3 -3
  6. data/docs/modernization-handoff.md +233 -0
  7. data/docs/porting-strategy.md +148 -0
  8. data/docs/precompiled-platform-policy.md +81 -0
  9. data/docs/precompiled-target-evaluation.md +67 -0
  10. data/docs/release-runbook.md +192 -0
  11. data/docs/rust-orchestration-guardrails.md +50 -0
  12. data/ext/lda-ruby/cokus.c +10 -11
  13. data/ext/lda-ruby/cokus.h +3 -3
  14. data/ext/lda-ruby/extconf.rb +10 -6
  15. data/ext/lda-ruby/lda-inference.c +23 -7
  16. data/ext/lda-ruby/utils.c +8 -0
  17. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  18. data/ext/lda-ruby-rust/README.md +73 -0
  19. data/ext/lda-ruby-rust/extconf.rb +135 -0
  20. data/ext/lda-ruby-rust/include/strings.h +35 -0
  21. data/ext/lda-ruby-rust/src/lib.rs +1263 -0
  22. data/lda-ruby.gemspec +0 -0
  23. data/lib/lda-ruby/backends/base.rb +133 -0
  24. data/lib/lda-ruby/backends/native.rb +158 -0
  25. data/lib/lda-ruby/backends/pure_ruby.rb +675 -0
  26. data/lib/lda-ruby/backends/rust.rb +607 -0
  27. data/lib/lda-ruby/backends.rb +58 -0
  28. data/lib/lda-ruby/corpus/corpus.rb +17 -15
  29. data/lib/lda-ruby/corpus/data_corpus.rb +2 -2
  30. data/lib/lda-ruby/corpus/directory_corpus.rb +2 -2
  31. data/lib/lda-ruby/corpus/text_corpus.rb +2 -2
  32. data/lib/lda-ruby/document/document.rb +6 -6
  33. data/lib/lda-ruby/document/text_document.rb +5 -4
  34. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  35. data/lib/lda-ruby/version.rb +5 -0
  36. data/lib/lda-ruby.rb +293 -48
  37. data/test/backend_compatibility_test.rb +146 -0
  38. data/test/backends_selection_test.rb +100 -0
  39. data/test/benchmark_scripts_test.rb +23 -0
  40. data/test/gemspec_test.rb +27 -0
  41. data/test/lda_ruby_test.rb +49 -11
  42. data/test/packaged_gem_smoke_test.rb +33 -0
  43. data/test/pure_ruby_orchestration_test.rb +109 -0
  44. data/test/release_scripts_test.rb +93 -0
  45. data/test/rust_build_policy_test.rb +23 -0
  46. data/test/rust_orchestration_test.rb +911 -0
  47. data/test/simple_pipeline_test.rb +22 -0
  48. data/test/simple_yaml.rb +1 -7
  49. data/test/test_helper.rb +5 -6
  50. metadata +54 -38
  51. data/Rakefile +0 -61
  52. data/ext/lda-ruby/Makefile +0 -181
  53. data/test/data/.gitignore +0 -2
  54. data/test/simple_test.rb +0 -26
@@ -1,24 +1,26 @@
1
- require 'set'
1
+ require "set"
2
+ require "yaml"
2
3
 
3
4
  module Lda
4
5
  class Corpus
5
6
  attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords
6
7
 
7
8
  def initialize(stop_word_list = nil)
8
- @documents = Array.new
9
+ @documents = []
9
10
  @all_terms = Set.new
10
11
  @num_terms = @num_docs = 0
11
12
  @vocabulary = Vocabulary.new
12
- if stop_word_list.nil?
13
- @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
14
- else
15
- @stopwords = YAML.load_file(stop_word_list)
16
- end
17
- @stopwords.map! { |w| w.strip }
13
+ @stopwords = if stop_word_list.nil?
14
+ File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml')
15
+ else
16
+ stop_word_list
17
+ end
18
+ @stopwords = YAML.load_file(@stopwords)
19
+ @stopwords.map!(&:strip)
18
20
  end
19
-
21
+
20
22
  def add_document(doc)
21
- raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)
23
+ raise 'Parameter +doc+ must be of type Document' unless doc.is_a?(Document)
22
24
 
23
25
  @documents << doc
24
26
 
@@ -29,11 +31,11 @@ module Lda
29
31
  update_vocabulary(doc)
30
32
  nil
31
33
  end
32
-
33
- def remove_word(word)
34
- @vocabulary.words.delete word
35
- end
36
-
34
+
35
+ def remove_word(word)
36
+ @vocabulary.words.delete word
37
+ end
38
+
37
39
  protected
38
40
 
39
41
  def update_vocabulary(doc)
@@ -12,11 +12,11 @@ module Lda
12
12
  protected
13
13
 
14
14
  def load_from_file
15
- txt = File.open(@filename, 'r') { |f| f.read }
15
+ txt = File.open(@filename, 'r', &:read)
16
16
  lines = txt.split(/[\r\n]+/)
17
17
  lines.each do |line|
18
18
  add_document(DataDocument.new(self, line))
19
19
  end
20
20
  end
21
21
  end
22
- end
22
+ end
@@ -15,11 +15,11 @@ module Lda
15
15
  protected
16
16
 
17
17
  def load_from_directory
18
- dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : "*"))
18
+ dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : '*'))
19
19
 
20
20
  Dir.glob(dir_glob).each do |filename|
21
21
  add_document(TextDocument.build_from_file(self, filename))
22
22
  end
23
23
  end
24
24
  end
25
- end
25
+ end
@@ -6,7 +6,7 @@ module Lda
6
6
  def initialize(input_data)
7
7
  super()
8
8
 
9
- docs = if input_data.is_a?(String) && File.exists?(input_data)
9
+ docs = if input_data.is_a?(String) && File.exist?(input_data)
10
10
  # yaml file containing an array of strings representing each document
11
11
  YAML.load_file(input_data)
12
12
  elsif input_data.is_a?(Array)
@@ -16,7 +16,7 @@ module Lda
16
16
  # a single string representing one document
17
17
  [input_data]
18
18
  else
19
- raise "Unknown input type: please pass in a valid filename or an array of strings."
19
+ raise 'Unknown input type: please pass in a valid filename or an array of strings.'
20
20
  end
21
21
 
22
22
  docs.each do |doc|
@@ -1,4 +1,3 @@
1
- # coding: utf-8
2
1
  require 'yaml'
3
2
 
4
3
  module Lda
@@ -8,9 +7,9 @@ module Lda
8
7
  def initialize(corpus)
9
8
  @corpus = corpus
10
9
 
11
- @words = Array.new
12
- @counts = Array.new
13
- @tokens = Array.new
10
+ @words = []
11
+ @counts = []
12
+ @tokens = []
14
13
  @length = 0
15
14
  @total = 0
16
15
  end
@@ -23,7 +22,7 @@ module Lda
23
22
  @length = @words.size
24
23
  end
25
24
 
26
- def has_text?
25
+ def text?
27
26
  false
28
27
  end
29
28
 
@@ -32,7 +31,8 @@ module Lda
32
31
  end
33
32
 
34
33
  def tokenize(text)
35
- clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
34
+ # remove everything but letters and ' and leave only single spaces
35
+ clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase
36
36
  @tokens = handle(clean_text.split(' '))
37
37
  nil
38
38
  end
@@ -11,14 +11,15 @@ module Lda
11
11
  build_from_tokens
12
12
  end
13
13
 
14
- def has_text?
14
+ def text?
15
15
  true
16
16
  end
17
17
 
18
18
  def self.build_from_file(corpus, filename)
19
- @filename = filename.dup.freeze
20
- text = File.open(@filename, 'r') { |f| f.read }
21
- self.new(corpus, text)
19
+ text = File.read(filename)
20
+ document = new(corpus, text)
21
+ document.instance_variable_set(:@filename, filename.dup.freeze)
22
+ document
22
23
  end
23
24
 
24
25
  protected
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ module RustBuildPolicy
5
+ ENV_KEY = "LDA_RUBY_RUST_BUILD"
6
+ AUTO = "auto"
7
+ ALWAYS = "always"
8
+ NEVER = "never"
9
+ VALID_VALUES = [AUTO, ALWAYS, NEVER].freeze
10
+
11
+ module_function
12
+
13
+ def resolve(raw_value = ENV[ENV_KEY])
14
+ value = raw_value.to_s.strip.downcase
15
+ return AUTO if value.empty?
16
+ return value if VALID_VALUES.include?(value)
17
+
18
+ AUTO
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Lda
4
+ VERSION = "0.5.0"
5
+ end
data/lib/lda-ruby.rb CHANGED
@@ -1,29 +1,125 @@
1
- $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
2
-
3
- require 'lda-ruby/lda'
4
- require 'lda-ruby/document/document'
5
- require 'lda-ruby/document/data_document'
6
- require 'lda-ruby/document/text_document'
7
- require 'lda-ruby/corpus/corpus'
8
- require 'lda-ruby/corpus/data_corpus'
9
- require 'lda-ruby/corpus/text_corpus'
10
- require 'lda-ruby/corpus/directory_corpus'
11
- require 'lda-ruby/vocabulary'
1
+ # frozen_string_literal: true
2
+
3
+ require "lda-ruby/version"
4
+ require "rbconfig"
5
+
6
+ rust_extension_loaded = false
7
+ rust_dlext = RbConfig::CONFIG.fetch("DLEXT")
8
+
9
+ [
10
+ "lda_ruby_rust",
11
+ "../ext/lda-ruby-rust/target/release/lda_ruby_rust",
12
+ "../ext/lda-ruby-rust/target/release/lda_ruby_rust.#{rust_dlext}",
13
+ "../ext/lda-ruby-rust/target/debug/lda_ruby_rust",
14
+ "../ext/lda-ruby-rust/target/debug/lda_ruby_rust.#{rust_dlext}"
15
+ ].each do |rust_extension_candidate|
16
+ begin
17
+ if rust_extension_candidate.start_with?("../")
18
+ require_relative rust_extension_candidate
19
+ else
20
+ require rust_extension_candidate
21
+ end
22
+
23
+ rust_extension_loaded = true
24
+ break
25
+ rescue LoadError
26
+ next
27
+ end
28
+ end
29
+
30
+ native_extension_loaded = false
31
+
32
+ begin
33
+ require "lda-ruby/lda"
34
+ native_extension_loaded = true
35
+ rescue LoadError
36
+ begin
37
+ require_relative "../ext/lda-ruby/lda"
38
+ native_extension_loaded = true
39
+ rescue LoadError
40
+ native_extension_loaded = false
41
+ end
42
+ end
43
+
44
+ LDA_RUBY_NATIVE_EXTENSION_LOADED = native_extension_loaded unless defined?(LDA_RUBY_NATIVE_EXTENSION_LOADED)
45
+ LDA_RUBY_RUST_EXTENSION_LOADED = rust_extension_loaded unless defined?(LDA_RUBY_RUST_EXTENSION_LOADED)
46
+
47
+ require "lda-ruby/document/document"
48
+ require "lda-ruby/document/data_document"
49
+ require "lda-ruby/document/text_document"
50
+ require "lda-ruby/corpus/corpus"
51
+ require "lda-ruby/corpus/data_corpus"
52
+ require "lda-ruby/corpus/text_corpus"
53
+ require "lda-ruby/corpus/directory_corpus"
54
+ require "lda-ruby/vocabulary"
55
+ require "lda-ruby/backends"
12
56
 
13
57
  module Lda
58
+ RUST_EXTENSION_LOADED = LDA_RUBY_RUST_EXTENSION_LOADED unless const_defined?(:RUST_EXTENSION_LOADED)
59
+ NATIVE_EXTENSION_LOADED = LDA_RUBY_NATIVE_EXTENSION_LOADED unless const_defined?(:NATIVE_EXTENSION_LOADED)
60
+
14
61
  class Lda
15
- attr_reader :vocab, :corpus
62
+ NATIVE_ALIAS_MAP = {
63
+ fast_load_corpus_from_file: :__native_fast_load_corpus_from_file,
64
+ "corpus=": :__native_set_corpus,
65
+ em: :__native_em,
66
+ load_settings: :__native_load_settings,
67
+ set_config: :__native_set_config,
68
+ max_iter: :__native_max_iter,
69
+ "max_iter=": :__native_set_max_iter,
70
+ convergence: :__native_convergence,
71
+ "convergence=": :__native_set_convergence,
72
+ em_max_iter: :__native_em_max_iter,
73
+ "em_max_iter=": :__native_set_em_max_iter,
74
+ em_convergence: :__native_em_convergence,
75
+ "em_convergence=": :__native_set_em_convergence,
76
+ init_alpha: :__native_init_alpha,
77
+ "init_alpha=": :__native_set_init_alpha,
78
+ est_alpha: :__native_est_alpha,
79
+ "est_alpha=": :__native_set_est_alpha,
80
+ num_topics: :__native_num_topics,
81
+ "num_topics=": :__native_set_num_topics,
82
+ verbose: :__native_verbose,
83
+ "verbose=": :__native_set_verbose,
84
+ beta: :__native_beta,
85
+ gamma: :__native_gamma,
86
+ compute_phi: :__native_compute_phi,
87
+ model: :__native_model
88
+ }.freeze
89
+
90
+ NATIVE_ALIAS_MAP.each do |native_name, alias_name|
91
+ next unless method_defined?(native_name)
92
+
93
+ alias_method alias_name, native_name
94
+ private alias_name
95
+ end
96
+
97
+ attr_reader :vocab, :corpus, :backend
98
+
99
+ def initialize(corpus, backend: nil, random_seed: nil)
100
+ @backend = Backends.build(host: self, requested: backend, random_seed: random_seed)
16
101
 
17
- def initialize(corpus)
18
102
  load_default_settings
19
103
 
20
104
  @vocab = nil
21
105
  self.corpus = corpus
22
- @vocab = corpus.vocabulary.to_a if corpus.vocabulary
106
+ @vocab = corpus.vocabulary.to_a if corpus.respond_to?(:vocabulary) && corpus.vocabulary
23
107
 
24
108
  @phi = nil
25
109
  end
26
110
 
111
+ def backend_name
112
+ @backend.name
113
+ end
114
+
115
+ def native_backend?
116
+ backend_name == "native"
117
+ end
118
+
119
+ def rust_backend?
120
+ backend_name == "rust"
121
+ end
122
+
27
123
  def load_default_settings
28
124
  self.max_iter = 20
29
125
  self.convergence = 1e-6
@@ -36,25 +132,138 @@ module Lda
36
132
  [20, 1e-6, 100, 1e-4, 20, 0.3, 1]
37
133
  end
38
134
 
39
- def load_corpus(filename)
40
- @corpus = Corpus.new
41
- @corpus.load_from_file(filename)
135
+ def set_config(init_alpha, num_topics, max_iter, convergence, em_max_iter, em_convergence = self.em_convergence, est_alpha = self.est_alpha)
136
+ @backend.set_config(
137
+ Float(init_alpha),
138
+ Integer(num_topics),
139
+ Integer(max_iter),
140
+ Float(convergence),
141
+ Integer(em_max_iter),
142
+ Float(em_convergence),
143
+ Integer(est_alpha)
144
+ )
145
+ end
146
+
147
+ def max_iter
148
+ @backend.max_iter
149
+ end
150
+
151
+ def max_iter=(value)
152
+ @backend.max_iter = Integer(value)
153
+ end
154
+
155
+ def convergence
156
+ @backend.convergence
157
+ end
158
+
159
+ def convergence=(value)
160
+ @backend.convergence = Float(value)
161
+ end
162
+
163
+ def em_max_iter
164
+ @backend.em_max_iter
165
+ end
166
+
167
+ def em_max_iter=(value)
168
+ @backend.em_max_iter = Integer(value)
169
+ end
170
+
171
+ def em_convergence
172
+ @backend.em_convergence
173
+ end
174
+
175
+ def em_convergence=(value)
176
+ @backend.em_convergence = Float(value)
177
+ end
178
+
179
+ def num_topics
180
+ @backend.num_topics
181
+ end
182
+
183
+ def num_topics=(value)
184
+ @backend.num_topics = Integer(value)
185
+ end
42
186
 
187
+ def init_alpha
188
+ @backend.init_alpha
189
+ end
190
+
191
+ def init_alpha=(value)
192
+ @backend.init_alpha = Float(value)
193
+ end
194
+
195
+ def est_alpha
196
+ @backend.est_alpha
197
+ end
198
+
199
+ def est_alpha=(value)
200
+ @backend.est_alpha = Integer(value)
201
+ end
202
+
203
+ def verbose
204
+ @backend.verbose
205
+ end
206
+
207
+ def verbose=(value)
208
+ @backend.verbose = !!value
209
+ end
210
+
211
+ def corpus=(corpus)
212
+ @corpus = corpus
213
+ @backend.corpus = corpus
43
214
  true
44
215
  end
45
216
 
217
+ def load_corpus(filename)
218
+ fast_load_corpus_from_file(filename)
219
+ end
220
+
221
+ def fast_load_corpus_from_file(filename)
222
+ loaded = @backend.fast_load_corpus_from_file(filename)
223
+
224
+ if @backend.corpus
225
+ @corpus = @backend.corpus
226
+ @vocab = @corpus.vocabulary.to_a if @corpus.respond_to?(:vocabulary) && @corpus.vocabulary
227
+ elsif @corpus.nil?
228
+ @corpus = DataCorpus.new(filename)
229
+ end
230
+
231
+ !!loaded
232
+ end
233
+
234
+ def load_settings(settings_file)
235
+ @backend.load_settings(settings_file)
236
+ end
237
+
46
238
  def load_vocabulary(vocab)
47
239
  if vocab.is_a?(Array)
48
- @vocab = Marshal::load(Marshal::dump(vocab)) # deep clone array
240
+ @vocab = Marshal.load(Marshal.dump(vocab)) # deep clone array
49
241
  elsif vocab.is_a?(Vocabulary)
50
242
  @vocab = vocab.to_a
51
243
  else
52
- @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) }
244
+ @vocab = File.read(vocab).split(/\s+/)
53
245
  end
54
246
 
55
247
  true
56
248
  end
57
249
 
250
+ def em(start = "random")
251
+ @phi = nil
252
+ @backend.em(start.to_s)
253
+ end
254
+
255
+ def beta
256
+ @backend.beta
257
+ end
258
+
259
+ def gamma
260
+ @backend.gamma
261
+ end
262
+
263
+ def model
264
+ @backend.model
265
+ end
266
+
58
267
  #
59
268
  # Visualization method for printing out the top +words_per_topic+ words
60
269
  # for each topic.
@@ -62,14 +271,18 @@ module Lda
62
271
  # See also +top_words+.
63
272
  #
64
273
  def print_topics(words_per_topic = 10)
65
- raise 'No vocabulary loaded.' unless @vocab
274
+ raise "No vocabulary loaded." unless @vocab
66
275
 
67
- self.beta.each_with_index do |topic, topic_num|
68
- # Sort the topic array and return the sorted indices of the best scores
69
- indices = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
276
+ beta.each_with_index do |topic, topic_num|
277
+ indices = topic
278
+ .each_with_index
279
+ .sort_by { |score, _index| score }
280
+ .reverse
281
+ .first(words_per_topic)
282
+ .map { |_score, index| index }
70
283
 
71
284
  puts "Topic #{topic_num}"
72
- puts "\t#{indices.map {|i| @vocab[i]}.join("\n\t")}"
285
+ puts "\t#{indices.map { |i| @vocab[i] }.join("\n\t")}"
73
286
  puts ""
74
287
  end
75
288
 
@@ -87,21 +300,24 @@ module Lda
87
300
  # See also +print_topics+.
88
301
  #
89
302
  def top_word_indices(words_per_topic = 10)
90
- raise 'No vocabulary loaded.' unless @vocab
303
+ raise "No vocabulary loaded." unless @vocab
91
304
 
92
- # find the highest scoring words per topic
93
- topics = Hash.new
94
- indices = (0...@vocab.size).to_a
305
+ topics = {}
95
306
 
96
- self.beta.each_with_index do |topic, topic_num|
97
- topics[topic_num] = (topic.zip((0...@vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
307
+ beta.each_with_index do |topic, topic_num|
308
+ topics[topic_num] = topic
309
+ .each_with_index
310
+ .sort_by { |score, _index| score }
311
+ .reverse
312
+ .first(words_per_topic)
313
+ .map { |_score, index| index }
98
314
  end
99
315
 
100
316
  topics
101
317
  end
102
318
 
103
319
  def top_words(words_per_topic = 10)
104
- output = Hash.new
320
+ output = {}
105
321
 
106
322
  topics = top_word_indices(words_per_topic)
107
323
  topics.each_pair do |topic_num, words|
@@ -118,49 +334,78 @@ module Lda
118
334
  # after the first call, so if it needs to be recomputed, set the +recompute+
119
335
  # value to true.
120
336
  #
121
- def phi(recompute=false)
122
- if @phi.nil? || recompute
123
- @phi = self.compute_phi
124
- end
337
+ def phi(recompute = false)
338
+ @phi = compute_phi if @phi.nil? || recompute
125
339
 
126
340
  @phi
127
341
  end
128
342
 
343
+ def compute_phi
344
+ @backend.compute_phi
345
+ end
346
+
129
347
  #
130
348
  # Compute the average log probability for each topic for each document in the corpus.
131
349
  # This method returns a matrix: num_docs x num_topics with the average log probability
132
350
  # for the topic in the document.
133
351
  #
134
352
  def compute_topic_document_probability
135
- outp = Array.new
353
+ phi_matrix = phi
354
+ document_counts = @corpus.documents.map(&:counts)
355
+
356
+ backend_output = @backend.topic_document_probability(phi_matrix, document_counts)
357
+ if valid_topic_document_probability_output?(backend_output, document_counts.size, num_topics)
358
+ return backend_output
359
+ end
360
+
361
+ outp = []
136
362
 
137
363
  @corpus.documents.each_with_index do |doc, idx|
138
- tops = [0.0] * self.num_topics
139
- ttl = doc.counts.inject(0.0) {|sum, i| sum + i}
140
- self.phi[idx].each_with_index do |word_dist, word_idx|
364
+ tops = [0.0] * num_topics
365
+ ttl = doc.counts.inject(0.0) { |sum, i| sum + i }
366
+
367
+ phi_matrix[idx].each_with_index do |word_dist, word_idx|
141
368
  word_dist.each_with_index do |top_prob, top_idx|
142
- tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx]
369
+ tops[top_idx] += Math.log([top_prob, 1e-300].max) * doc.counts[word_idx]
143
370
  end
144
371
  end
145
- tops = tops.map {|i| i / ttl}
372
+
373
+ tops = tops.map { |i| i / ttl }
146
374
  outp << tops
147
375
  end
148
376
 
149
377
  outp
150
378
  end
151
379
 
380
+ def valid_topic_document_probability_output?(output, expected_docs, expected_topics)
381
+ return false unless output.is_a?(Array)
382
+ return false unless output.size == expected_docs
383
+
384
+ output.each do |row|
385
+ return false unless row.is_a?(Array)
386
+ return false unless row.size == expected_topics
387
+ row.each do |value|
388
+ return false unless value.is_a?(Numeric)
389
+ return false unless value.finite?
390
+ end
391
+ end
392
+
393
+ true
394
+ end
395
+
152
396
  #
153
397
  # String representation displaying current settings.
154
398
  #
155
399
  def to_s
156
400
  outp = ["LDA Settings:"]
157
- outp << " Initial alpha: %0.6f" % self.init_alpha
158
- outp << " # of topics: %d" % self.num_topics
159
- outp << " Max iterations: %d" % self.max_iter
160
- outp << " Convergence: %0.6f" % self.convergence
161
- outp << "EM max iterations: %d" % self.em_max_iter
162
- outp << " EM convergence: %0.6f" % self.em_convergence
163
- outp << " Estimate alpha: %d" % self.est_alpha
401
+ outp << format(" Initial alpha: %0.6f", init_alpha)
402
+ outp << format(" # of topics: %d", num_topics)
403
+ outp << format(" Max iterations: %d", max_iter)
404
+ outp << format(" Convergence: %0.6f", convergence)
405
+ outp << format("EM max iterations: %d", em_max_iter)
406
+ outp << format(" EM convergence: %0.6f", em_convergence)
407
+ outp << format(" Estimate alpha: %d", est_alpha)
408
+ outp << format(" Backend: %s", backend_name)
164
409
 
165
410
  outp.join("\n")
166
411
  end