RubyGems - lightgbm - Versions diffs - 0.1.1 → 0.1.2 - Mend

lightgbm 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 49e0eef0a10a444e0cc24c8188268d349037fb12054b3f3f73ab14ed54fae3d7
-  data.tar.gz: 9ee78189ec31bfb3dc9cea6fe5836f97a010097cc821819d05236a899d0654af
+  metadata.gz: 723130d41ea9196bcbd7bcffeb865c40c65985f26eca018d49bd176d33c43142
+  data.tar.gz: d92b41899ff72da2ef4e5782bf4d2840caee1554107d9fd5d02bd6728829585a
 SHA512:
-  metadata.gz: d95050754e85ee004df08c4761f31f1bfc97e3efbcd3ea0ae2251f5a84eeff2978e16118411ddeced74a9c7d3fd731176488cbbac4ed2bdd840e55e4dd6172db
-  data.tar.gz: 52dcca52827fffca3d638c814eec359c4f2d397b025cc9ee99323bebbbf5436d3a7fcf569390f0c71eb9623c68a711b0e32ba38a53364873fe9d0de99b2f3f66
+  metadata.gz: 6960dbf1e2a884705e8a2752952392483c0ab1e74e970382b45df747ccbedbc0d978d3910e2b935190c98a9c45315841b53e27128aa5944e3a7834808e05582a
+  data.tar.gz: c6e793933dc794fa62099580ad35f29d7e5e3ae24a07df4335dca3d68571bc1d5b360a7cf0859d75d245a77d470a0dafba0cb5d86edda1974f3bc532b0f5c11a

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,9 @@
+## 0.1.2
+- Added `cv` method
+- Added early stopping
+- Fixed multiclass classification
 ## 0.1.1
 - Added training API

data/README.md CHANGED Viewed

@@ -16,6 +16,8 @@ Add this line to your application’s Gemfile:
 gem 'lightgbm'
 ```
+## Getting Started
 Train a model
 ```ruby
@@ -30,13 +32,13 @@ Predict
 booster.predict(x_test)
 ```
-Save the model
+Save the model to a file
 ```ruby
 booster.save_model("model.txt")
 ```
-Load a model from a file
+Load the model from a file
 ```ruby
 booster = LightGBM::Booster.new(model_file: "model.txt")
@@ -48,30 +50,32 @@ Get feature importance
 booster.feature_importance
 ```
-## Reference
-### Booster
+## Early Stopping
 ```ruby
-booster = LightGBM::Booster.new(model_str: "tree...")
-booster.to_json
-booster.model_to_string
-booster.current_iteration
+LightGBM.train(params, train_set, valid_sets: [train_set, test_set], early_stopping_rounds: 5)
 ```
-### Dataset
+## CV
 ```ruby
-dataset = LightGBM::Dataset.new(data, label: label, weight: weight, params: params)
-dataset.num_data
-dataset.num_feature
-# note: only works with unquoted CSVs
-dataset = LightGBM::Dataset.new("data.csv", params: {headers: true, label: "name:label"})
-dataset.save_binary("train.bin")
-dataset.dump_text("train.txt")
+LightGBM.cv(params, train_set, nfold: 5, verbose_eval: true)
 ```
+## Reference
+This library follows the [Data Structure and Training APIs](https://lightgbm.readthedocs.io/en/latest/Python-API.html) for the Python library. A few differences are:
+- The default verbosity is `-1`
+- With the `cv` method, `stratified` is set to `false`
+Some methods and options are also missing at the moment. PRs welcome!
+## Helpful Resources
+- [Parameters](https://lightgbm.readthedocs.io/en/latest/Parameters.html)
+- [Parameter Tuning](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)
 ## Credits
 Thanks to the [xgboost](https://github.com/PairOnAir/xgboost-ruby) gem for serving as an initial reference, and Selva Prabhakaran for the [test datasets](https://github.com/selva86/datasets).

data/lib/lightgbm.rb CHANGED Viewed

@@ -11,14 +11,174 @@ require "lightgbm/version"
 module LightGBM
   class Error < StandardError; end
-  def self.train(params, train_set, num_boost_round: 100, valid_sets: [], valid_names: [])
-    booster = Booster.new(params: params, train_set: train_set)
-    valid_sets.zip(valid_names) do |data, name|
-      booster.add_valid(data, name)
+  class << self
+    def train(params, train_set,num_boost_round: 100, valid_sets: [], valid_names: [], early_stopping_rounds: nil, verbose_eval: true)
+      booster = Booster.new(params: params, train_set: train_set)
+      valid_contain_train = false
+      valid_sets.zip(valid_names).each_with_index do |(data, name), i|
+        if data == train_set
+          booster.train_data_name = name || "training"
+          valid_contain_train = true
+        else
+          booster.add_valid(data, name || "valid_#{i}")
+        end
+      end
+      booster.best_iteration = 0
+      if early_stopping_rounds
+        best_score = []
+        best_iter = []
+        best_message = []
+        puts "Training until validation scores don't improve for #{early_stopping_rounds.to_i} rounds." if verbose_eval
+      end
+      num_boost_round.times do |iteration|
+        booster.update
+        if valid_sets.any?
+          # print results
+          messages = []
+          if valid_contain_train
+            # not sure why reversed in output
+            booster.eval_train.reverse.each do |res|
+              messages << "%s's %s: %g" % [res[0], res[1], res[2]]
+            end
+          end
+          eval_valid = booster.eval_valid
+          # not sure why reversed in output
+          eval_valid.reverse.each do |res|
+            messages << "%s's %s: %g" % [res[0], res[1], res[2]]
+          end
+          message = "[#{iteration + 1}]\t#{messages.join("\t")}"
+          puts message if verbose_eval
+          if early_stopping_rounds
+            stop_early = false
+            eval_valid.each_with_index do |(_, _, score, higher_better), i|
+              op = higher_better ? :> : :<
+              if best_score[i].nil? || score.send(op, best_score[i])
+                best_score[i] = score
+                best_iter[i] = iteration
+                best_message[i] = message
+              elsif iteration - best_iter[i] >= early_stopping_rounds
+                booster.best_iteration = best_iter[i] + 1
+                puts "Early stopping, best iteration is:\n#{best_message[i]}" if verbose_eval
+                stop_early = true
+                break
+              end
+            end
+            break if stop_early
+            if iteration == num_boost_round - 1
+              booster.best_iteration = best_iter[0] + 1
+              puts "Did not meet early stopping. Best iteration is: #{best_message[0]}" if verbose_eval
+            end
+          end
+        end
+      end
+      booster
+    end
+    def cv(params, train_set, num_boost_round: 100, nfold: 5, seed: 0, shuffle: true, early_stopping_rounds: nil, verbose_eval: nil, show_stdv: true)
+      rand_idx = (0...train_set.num_data).to_a
+      rand_idx.shuffle!(random: Random.new(seed)) if shuffle
+      kstep = rand_idx.size / nfold
+      test_id = rand_idx.each_slice(kstep).to_a[0...nfold]
+      train_id = []
+      nfold.times do |i|
+        idx = test_id.dup
+        idx.delete_at(i)
+        train_id << idx.flatten
+      end
+      boosters = []
+      folds = train_id.zip(test_id)
+      folds.each do |(train_idx, test_idx)|
+        fold_train_set = train_set.subset(train_idx)
+        fold_valid_set = train_set.subset(test_idx)
+        booster = Booster.new(params: params, train_set: fold_train_set)
+        booster.add_valid(fold_valid_set, "valid")
+        boosters << booster
+      end
+      eval_hist = {}
+      if early_stopping_rounds
+        best_score = {}
+        best_iter = {}
+      end
+      num_boost_round.times do |iteration|
+        boosters.each(&:update)
+        scores = {}
+        boosters.map(&:eval_valid).map(&:reverse).flatten(1).each do |r|
+          (scores[r[1]] ||= []) << r[2]
+        end
+        message_parts = ["[#{iteration + 1}]"]
+        means = {}
+        scores.each do |eval_name, vals|
+          mean = mean(vals)
+          stdev = stdev(vals)
+          (eval_hist["#{eval_name}-mean"] ||= []) << mean
+          (eval_hist["#{eval_name}-stdv"] ||= []) << stdev
+          means[eval_name] = mean
+          if show_stdv
+            message_parts << "cv_agg's %s: %g + %g" % [eval_name, mean, stdev]
+          else
+            message_parts << "cv_agg's %s: %g" % [eval_name, mean]
+          end
+        end
+        puts message_parts.join("\t") if verbose_eval
+        if early_stopping_rounds
+          stop_early = false
+          means.each do |k, score|
+            if best_score[k].nil? || score < best_score[k]
+              best_score[k] = score
+              best_iter[k] = iteration
+            elsif iteration - best_iter[k] >= early_stopping_rounds
+              stop_early = true
+              break
+            end
+          end
+          break if stop_early
+        end
+      end
+      eval_hist
     end
-    num_boost_round.times do
-      booster.update
+    private
+    def mean(arr)
+      arr.sum / arr.size.to_f
+    end
+    # population standard deviation: divide by arr.size, not arr.size - 1
+    def stdev(arr)
+      m = mean(arr)
+      sum = 0
+      arr.each do |v|
+        sum += (v - m) ** 2
+      end
+      Math.sqrt(sum / arr.size)
     end
-    booster
   end
 end

data/lib/lightgbm/booster.rb CHANGED Viewed

@@ -1,57 +1,65 @@
 module LightGBM
   class Booster
+    attr_accessor :best_iteration, :train_data_name
     def initialize(params: nil, train_set: nil, model_file: nil, model_str: nil)
       @handle = ::FFI::MemoryPointer.new(:pointer)
       if model_str
-        out_num_iterations = ::FFI::MemoryPointer.new(:int)
-        check_result FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, @handle)
+        model_from_string(model_str)
       elsif model_file
         out_num_iterations = ::FFI::MemoryPointer.new(:int)
         check_result FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, @handle)
       else
+        params ||= {}
+        set_verbosity(params)
         check_result FFI.LGBM_BoosterCreate(train_set.handle_pointer, params_str(params), @handle)
       end
       # causes "Stack consistency error"
       # ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
+      self.best_iteration = -1
+      # TODO get names when loaded from file
+      @name_valid_sets = []
     end
     def self.finalize(pointer)
       -> { FFI.LGBM_BoosterFree(pointer) }
     end
-    # TODO handle name
     def add_valid(data, name)
       check_result FFI.LGBM_BoosterAddValidData(handle_pointer, data.handle_pointer)
+      @name_valid_sets << name
       self # consistent with Python API
     end
-    def predict(input)
-      raise TypeError unless input.is_a?(Array)
-      singular = input.first.is_a?(Array)
-      input = [input] unless singular
-      data = ::FFI::MemoryPointer.new(:float, input.count * input.first.count)
-      data.put_array_of_float(0, input.flatten)
+    def current_iteration
+      out = ::FFI::MemoryPointer.new(:int)
+      check_result FFI::LGBM_BoosterGetCurrentIteration(handle_pointer, out)
+      out.read_int
+    end
+    def dump_model(num_iteration: nil, start_iteration: 0)
+      num_iteration ||= best_iteration
+      buffer_len = 1 << 20
       out_len = ::FFI::MemoryPointer.new(:int64)
-      out_result = ::FFI::MemoryPointer.new(:double, input.count)
-      parameter = ""
-      check_result FFI.LGBM_BoosterPredictForMat(handle_pointer, data, 0, input.count, input.first.count, 1, 0, 0, parameter, out_len, out_result)
-      out = out_result.read_array_of_double(out_len.read_int64)
-      singular ? out : out.first
+      out_str = ::FFI::MemoryPointer.new(:string, buffer_len)
+      check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, buffer_len, out_len, out_str)
+      actual_len = out_len.read_int64
+      if actual_len > buffer_len
+        out_str = ::FFI::MemoryPointer.new(:string, actual_len)
+        check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, actual_len, out_len, out_str)
+      end
+      out_str.read_string
     end
+    alias_method :to_json, :dump_model
-    def save_model(filename)
-      check_result FFI.LGBM_BoosterSaveModel(handle_pointer, 0, 0, filename)
-      self # consistent with Python API
+    def eval_valid
+      @name_valid_sets.each_with_index.map { |n, i| inner_eval(n, i + 1) }.flatten(1)
     end
-    def update
-      finished = ::FFI::MemoryPointer.new(:int)
-      check_result FFI.LGBM_BoosterUpdateOneIter(handle_pointer, finished)
-      finished.read_int == 1
+    def eval_train
+      inner_eval(train_data_name, 0)
     end
     def feature_importance(iteration: nil, importance_type: "split")
@@ -66,27 +74,16 @@ module LightGBM
           -1
         end
-      num_features = self.num_features
-      out_result = ::FFI::MemoryPointer.new(:double, num_features)
+      num_feature = self.num_feature
+      out_result = ::FFI::MemoryPointer.new(:double, num_feature)
       check_result FFI.LGBM_BoosterFeatureImportance(handle_pointer, iteration, importance_type, out_result)
-      out_result.read_array_of_double(num_features)
-    end
-    def num_features
-      out = ::FFI::MemoryPointer.new(:int)
-      check_result FFI.LGBM_BoosterGetNumFeature(handle_pointer, out)
-      out.read_int
-    end
-    def current_iteration
-      out = ::FFI::MemoryPointer.new(:int)
-      check_result FFI::LGBM_BoosterGetCurrentIteration(handle_pointer, out)
-      out.read_int
+      out_result.read_array_of_double(num_feature)
     end
-    # TODO fix
-    def best_iteration
-      -1
+    def model_from_string(model_str)
+      out_num_iterations = ::FFI::MemoryPointer.new(:int)
+      check_result FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, @handle)
+      self
     end
     def model_to_string(num_iteration: nil, start_iteration: 0)
@@ -103,18 +100,57 @@ module LightGBM
       out_str.read_string
     end
-    def to_json(num_iteration: nil, start_iteration: 0)
+    def num_feature
+      out = ::FFI::MemoryPointer.new(:int)
+      check_result FFI.LGBM_BoosterGetNumFeature(handle_pointer, out)
+      out.read_int
+    end
+    alias_method :num_features, :num_feature # legacy typo
+    def num_model_per_iteration
+      out = ::FFI::MemoryPointer.new(:int)
+      check_result FFI::LGBM_BoosterNumModelPerIteration(handle_pointer, out)
+      out.read_int
+    end
+    def num_trees
+      out = ::FFI::MemoryPointer.new(:int)
+      check_result FFI::LGBM_BoosterNumberOfTotalModel(handle_pointer, out)
+      out.read_int
+    end
+    # TODO support different prediction types
+    def predict(input, num_iteration: nil, **params)
+      raise TypeError unless input.is_a?(Array)
+      singular = !input.first.is_a?(Array)
+      input = [input] if singular
       num_iteration ||= best_iteration
-      buffer_len = 1 << 20
+      num_class ||= num_class()
+      data = ::FFI::MemoryPointer.new(:float, input.count * input.first.count)
+      data.put_array_of_float(0, input.flatten)
       out_len = ::FFI::MemoryPointer.new(:int64)
-      out_str = ::FFI::MemoryPointer.new(:string, buffer_len)
-      check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, buffer_len, out_len, out_str)
-      actual_len = out_len.read_int64
-      if actual_len > buffer_len
-        out_str = ::FFI::MemoryPointer.new(:string, actual_len)
-        check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, actual_len, out_len, out_str)
-      end
-      out_str.read_string
+      out_result = ::FFI::MemoryPointer.new(:double, num_class * input.count)
+      check_result FFI.LGBM_BoosterPredictForMat(handle_pointer, data, 0, input.count, input.first.count, 1, 0, num_iteration, params_str(params), out_len, out_result)
+      out = out_result.read_array_of_double(out_len.read_int64)
+      out = out.each_slice(num_class).to_a if num_class > 1
+      singular ? out.first : out
+    end
+    def save_model(filename, num_iteration: nil, start_iteration: 0)
+      num_iteration ||= best_iteration
+      check_result FFI.LGBM_BoosterSaveModel(handle_pointer, start_iteration, num_iteration, filename)
+      self # consistent with Python API
+    end
+    def update
+      finished = ::FFI::MemoryPointer.new(:int)
+      check_result FFI.LGBM_BoosterUpdateOneIter(handle_pointer, finished)
+      finished.read_int == 1
     end
     private
@@ -123,6 +159,42 @@ module LightGBM
       @handle.read_pointer
     end
+    def eval_counts
+      out = ::FFI::MemoryPointer.new(:int)
+      check_result FFI::LGBM_BoosterGetEvalCounts(handle_pointer, out)
+      out.read_int
+    end
+    def eval_names
+      eval_counts ||= eval_counts()
+      out_len = ::FFI::MemoryPointer.new(:int)
+      out_strs = ::FFI::MemoryPointer.new(:pointer, eval_counts)
+      str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:string, 255) }
+      out_strs.put_array_of_pointer(0, str_ptrs)
+      check_result FFI.LGBM_BoosterGetEvalNames(handle_pointer, out_len, out_strs)
+      str_ptrs.map(&:read_string)
+    end
+    def inner_eval(name, i)
+      eval_names ||= eval_names()
+      out_len = ::FFI::MemoryPointer.new(:int)
+      out_results = ::FFI::MemoryPointer.new(:double, eval_names.count)
+      check_result FFI.LGBM_BoosterGetEval(handle_pointer, i, out_len, out_results)
+      vals = out_results.read_array_of_double(out_len.read_int)
+      eval_names.zip(vals).map do |eval_name, val|
+        higher_better = ["auc", "ndcg@", "map@"].any? { |v| eval_name.start_with?(v) }
+        [name, eval_name, val, higher_better]
+      end
+    end
+    def num_class
+      out = ::FFI::MemoryPointer.new(:int)
+      check_result FFI::LGBM_BoosterGetNumClasses(handle_pointer, out)
+      out.read_int
+    end
     include Utils
   end
 end

data/lib/lightgbm/dataset.rb CHANGED Viewed

@@ -2,16 +2,27 @@ module LightGBM
   class Dataset
     attr_reader :data, :params
-    def initialize(data, label: nil, weight: nil, params: nil)
+    def initialize(data, label: nil, weight: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto")
       @data = data
+      # TODO stringify params
+      params ||= {}
+      params["categorical_feature"] ||= categorical_feature.join(",") if categorical_feature != "auto"
+      set_verbosity(params)
       @handle = ::FFI::MemoryPointer.new(:pointer)
+      parameters = params_str(params)
+      reference = reference.handle_pointer if reference
       if data.is_a?(String)
-        check_result FFI.LGBM_DatasetCreateFromFile(data, params_str(params), nil, @handle)
+        check_result FFI.LGBM_DatasetCreateFromFile(data, parameters, reference, @handle)
+      elsif used_indices
+        used_row_indices = ::FFI::MemoryPointer.new(:int32, used_indices.count)
+        used_row_indices.put_array_of_int32(0, used_indices)
+        check_result FFI.LGBM_DatasetGetSubset(reference, used_row_indices, used_indices.count, parameters, @handle)
       else
         c_data = ::FFI::MemoryPointer.new(:float, data.count * data.first.count)
         c_data.put_array_of_float(0, data.flatten)
-        check_result FFI.LGBM_DatasetCreateFromMat(c_data, 0, data.count, data.first.count, 1, params_str(params), nil, @handle)
+        check_result FFI.LGBM_DatasetCreateFromMat(c_data, 0, data.count, data.first.count, 1, parameters, reference, @handle)
       end
       # causes "Stack consistency error"
       # ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
@@ -48,6 +59,16 @@ module LightGBM
       check_result FFI.LGBM_DatasetDumpText(handle_pointer, filename)
     end
+    def subset(used_indices, params: nil)
+      # categorical_feature passed via params
+      params ||= self.params
+      Dataset.new(nil,
+        params: params,
+        reference: self,
+        used_indices: used_indices
+      )
+    end
     def self.finalize(pointer)
       -> { FFI.LGBM_DatasetFree(pointer) }
     end

data/lib/lightgbm/ffi.rb CHANGED Viewed

@@ -12,6 +12,7 @@ module LightGBM
     # dataset
     attach_function :LGBM_DatasetCreateFromFile, %i[string string pointer pointer], :int
     attach_function :LGBM_DatasetCreateFromMat, %i[pointer int int32 int32 int string pointer pointer], :int
+    attach_function :LGBM_DatasetGetSubset, %i[pointer pointer int32 string pointer], :int
     attach_function :LGBM_DatasetFree, %i[pointer], :int
     attach_function :LGBM_DatasetSaveBinary, %i[pointer string], :int
     attach_function :LGBM_DatasetDumpText, %i[pointer string], :int
@@ -26,9 +27,15 @@ module LightGBM
     attach_function :LGBM_BoosterLoadModelFromString, %i[string pointer pointer], :int
     attach_function :LGBM_BoosterFree, %i[pointer], :int
     attach_function :LGBM_BoosterAddValidData, %i[pointer pointer], :int
+    attach_function :LGBM_BoosterGetNumClasses, %i[pointer pointer], :int
     attach_function :LGBM_BoosterUpdateOneIter, %i[pointer pointer], :int
     attach_function :LGBM_BoosterGetCurrentIteration, %i[pointer pointer], :int
+    attach_function :LGBM_BoosterNumModelPerIteration, %i[pointer pointer], :int
+    attach_function :LGBM_BoosterNumberOfTotalModel, %i[pointer pointer], :int
+    attach_function :LGBM_BoosterGetEvalCounts, %i[pointer pointer], :int
+    attach_function :LGBM_BoosterGetEvalNames, %i[pointer pointer pointer], :int
     attach_function :LGBM_BoosterGetNumFeature, %i[pointer pointer], :int
+    attach_function :LGBM_BoosterGetEval, %i[pointer int pointer pointer], :int
     attach_function :LGBM_BoosterPredictForMat, %i[pointer pointer int int32 int32 int int int string pointer pointer], :int
     attach_function :LGBM_BoosterSaveModel, %i[pointer int int string], :int
     attach_function :LGBM_BoosterSaveModelToString, %i[pointer int int int64 pointer pointer], :int

data/lib/lightgbm/utils.rb CHANGED Viewed

@@ -8,12 +8,20 @@ module LightGBM
     # remove spaces in keys and values to prevent injection
     def params_str(params)
-      (params || {}).map { |k, v| [check_param(k.to_s), check_param(v.to_s)].join("=") }.join(" ")
+      params.map { |k, v| [check_param(k.to_s), check_param(Array(v).join(",").to_s)].join("=") }.join(" ")
     end
     def check_param(v)
       raise ArgumentError, "Invalid parameter" if /[[:space:]]/.match(v)
       v
     end
+    # change default verbosity
+    def set_verbosity(params)
+      params_keys = params.keys.map(&:to_s)
+      unless params_keys.include?("verbosity")
+        params["verbosity"] = -1
+      end
+    end
   end
 end

data/lib/lightgbm/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module LightGBM
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: lightgbm
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-08-14 00:00:00.000000000 Z
+date: 2019-08-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ffi