xgb 0.1.1 → 0.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1bb50395d579da91b18754bc75e780cbb2e98fd7a48a17c34514230d1c4828d1
-  data.tar.gz: 3d2f9c5a72c63c2622a973805c9f2caa9bd4de7b5c67f8c4b5445fd9a71993c3
+  metadata.gz: 148980f8a4991f1f98cd1740188e763a3bd96c98bc69b13a9de9aa00132a12f1
+  data.tar.gz: 31d90a3a064d032a7d1f371c6928f11b103bfc6f9c92dd1697cc538ef33f15fa
 SHA512:
-  metadata.gz: f141b3ea0b6ceb8549198fd6ad8a07f6947201409478fc4829fe625da376e40d8028427a5aa34191b565aa275d27bb03e2082bb8fc489f6da6a2a09b3bbf2c2f
-  data.tar.gz: c393f4fdbe240ffc14b64f22f17d149ed393070fb0752f9ec49dd94bcfa88f446ea21bc5bf9a96bef7759c5c47033dd0480a4001d477b8c487cf5dcf8be19b81
+  metadata.gz: 28fb08d373af3a3b198822ddea958ca0ee433145dbca091583b25945f27a048a3ec4a4ec43d88dc3c1dde67de7f72b7fb84e7668cfbd173f01d56992017ba4e2
+  data.tar.gz: 5bfb07db7c6b65d0a08010ab59328af75a97e74e025fcd4eb15ca88b05afec3303beb1569c99d59932a5426a64addce1c7fda90562342efcf52fc2c72d9b362a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+## 0.1.2
+
+- Friendlier message when XGBoost not found
+- Free memory when objects are destroyed
+- Added `Ranker`
+- Added early stopping to Scikit-Learn API
+
 ## 0.1.1
 
 - Added Scikit-Learn API
data/README.md CHANGED
@@ -18,7 +18,7 @@ gem 'xgb'
 
 ## Getting Started
 
-This library follows the [Core Data Structure, Learning and Scikit-Learn APIs](https://xgboost.readthedocs.io/en/latest/python/python_api.html) of the Python library. Some methods and options are missing at the moment. PRs welcome!
+This library follows the [Python API](https://xgboost.readthedocs.io/en/latest/python/python_api.html). Some methods and options are missing at the moment. PRs welcome!
 
 ## Learning API
 
@@ -33,7 +33,8 @@ booster = Xgb.train(params, dtrain)
 Predict
 
 ```ruby
-booster.predict(x_test)
+dtest = Xgb::DMatrix.new(x_test)
+booster.predict(dtest)
 ```
 
 Save the model to a file
@@ -110,6 +111,12 @@ Get the importance of features
 model.feature_importances
 ```
 
+Early stopping
+
+```ruby
+model.fit(x, y, eval_set: [[x_test, y_test]], early_stopping_rounds: 5)
+```
+
 ## Data
 
 Data can be an array of arrays
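The same early stopping hook exists on the Learning API via `Xgb.train` (see `data/lib/xgb.rb` below). A minimal sketch, assuming `params`, `dtrain`, and `dtest` are built as in the README examples above:

```ruby
# evals takes [DMatrix, name] pairs; training halts once the eval metric
# stops improving for early_stopping_rounds consecutive rounds.
booster = Xgb.train(params, dtrain,
  evals: [[dtrain, "train"], [dtest, "eval"]],
  early_stopping_rounds: 5
)
```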
data/lib/xgb.rb CHANGED
@@ -5,16 +5,25 @@ require "ffi"
 require "xgb/utils"
 require "xgb/booster"
 require "xgb/dmatrix"
-require "xgb/ffi"
 require "xgb/version"
 
 # scikit-learn API
+require "xgb/model"
 require "xgb/classifier"
+require "xgb/ranker"
 require "xgb/regressor"
 
 module Xgb
   class Error < StandardError; end
 
+  class << self
+    attr_accessor :ffi_lib
+  end
+  self.ffi_lib = ["xgboost"]
+
+  # friendlier error message
+  autoload :FFI, "xgb/ffi"
+
   class << self
     def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
       booster = Booster.new(params: params)
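Together, the new `ffi_lib` accessor and the `autoload` let callers override the shared-library path before anything touches the FFI layer, since `xgb/ffi` is only loaded when `Xgb::FFI` is first referenced. A sketch with a hypothetical path:

```ruby
require "xgb"
# Hypothetical location of a custom libxgboost build; set this before
# creating any Booster or DMatrix, i.e. before the Xgb::FFI autoload fires.
Xgb.ffi_lib = ["/opt/xgboost/lib/libxgboost.so"]
```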
data/lib/xgb/booster.rb CHANGED
@@ -5,6 +5,8 @@ module Xgb
     def initialize(params: nil, model_file: nil)
       @handle = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterCreate(nil, 0, @handle)
+      ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
+
       if model_file
         check_result FFI.XGBoosterLoadModel(handle_pointer, model_file)
       end
@@ -13,6 +15,11 @@ module Xgb
       set_param(params)
     end
 
+    def self.finalize(pointer)
+      # must use proc instead of stabby lambda
+      proc { FFI.XGBoosterFree(pointer) }
+    end
+
     def update(dtrain, iteration)
       check_result FFI.XGBoosterUpdateOneIter(handle_pointer, iteration, dtrain.handle_pointer)
     end
@@ -43,10 +50,10 @@ module Xgb
 
     def predict(data, ntree_limit: nil)
       ntree_limit ||= 0
-      out_len = ::FFI::MemoryPointer.new(:ulong)
+      out_len = ::FFI::MemoryPointer.new(:uint64)
       out_result = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterPredict(handle_pointer, data.handle_pointer, 0, ntree_limit, out_len, out_result)
-      out = out_result.read_pointer.read_array_of_float(out_len.read_ulong)
+      out = out_result.read_pointer.read_array_of_float(out_len.read_uint64)
       num_class = out.size / data.num_row
       out = out.each_slice(num_class).to_a if num_class > 1
       out
@@ -58,10 +65,10 @@ module Xgb
 
     # returns an array of strings
    def dump(fmap: "", with_stats: false, dump_format: "text")
-      out_len = ::FFI::MemoryPointer.new(:ulong)
+      out_len = ::FFI::MemoryPointer.new(:uint64)
       out_result = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterDumpModelEx(handle_pointer, fmap, with_stats ? 1 : 0, dump_format, out_len, out_result)
-      out_result.read_pointer.get_array_of_string(0, out_len.read_ulong)
+      out_result.read_pointer.get_array_of_string(0, out_len.read_uint64)
     end
 
     def dump_model(fout, fmap: "", with_stats: false, dump_format: "text")
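The finalizer comment is terse; the reason behind it is that Ruby invokes a finalizer with the object's id as an argument. A proc silently ignores the extra argument, while a zero-parameter stabby lambda raises `ArgumentError` on the arity mismatch. A standalone sketch of the pattern (not gem code):

```ruby
# The class-level factory keeps self out of the closure; a finalizer that
# captured self would keep the object reachable and never run.
class NativeHandle
  def self.finalize(pointer)
    # proc tolerates the object id Ruby passes in; ->() { ... } would not
    proc { puts "freeing #{pointer}" } # stand-in for the FFI free call
  end

  def initialize(pointer)
    ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
  end
end
```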
data/lib/xgb/classifier.rb CHANGED
@@ -1,16 +1,10 @@
 module Xgb
-  class Classifier
-    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "binary:logistic", importance_type: "gain")
-      @params = {
-        max_depth: max_depth,
-        objective: objective,
-        learning_rate: learning_rate
-      }
-      @n_estimators = n_estimators
-      @importance_type = importance_type
+  class Classifier < Model
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "binary:logistic", importance_type: "gain", **options)
+      super
     end
 
-    def fit(x, y)
+    def fit(x, y, eval_set: nil, early_stopping_rounds: nil, verbose: true)
       n_classes = y.uniq.size
 
       params = @params.dup
@@ -20,18 +14,24 @@ module Xgb
       end
 
       dtrain = DMatrix.new(x, label: y)
-      @booster = Xgb.train(params, dtrain, num_boost_round: @n_estimators)
+      evals = Array(eval_set).map.with_index { |v, i| [DMatrix.new(v[0], label: v[1]), "validation_#{i}"] }
+
+      @booster = Xgb.train(params, dtrain,
+        num_boost_round: @n_estimators,
+        early_stopping_rounds: early_stopping_rounds,
+        verbose_eval: verbose,
+        evals: evals
+      )
       nil
     end
 
     def predict(data)
-      dmat = DMatrix.new(data)
-      y_pred = @booster.predict(dmat)
+      y_pred = super(data)
 
       if y_pred.first.is_a?(Array)
         # multiple classes
         y_pred.map do |v|
-          v.map.with_index.max_by { |v2, i| v2 }.last
+          v.map.with_index.max_by { |v2, _| v2 }.last
         end
       else
         y_pred.map { |v| v > 0.5 ? 1 : 0 }
@@ -49,20 +49,5 @@ module Xgb
         y_pred.map { |v| [1 - v, v] }
       end
     end
-
-    def save_model(fname)
-      @booster.save_model(fname)
-    end
-
-    def load_model(fname)
-      @booster = Booster.new(params: @params, model_file: fname)
-    end
-
-    def feature_importances
-      score = @booster.score(importance_type: @importance_type)
-      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
-      total = scores.sum.to_f
-      scores.map { |s| s / total }
-    end
   end
 end
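A usage sketch of the slimmed-down `Classifier`, assuming `x_train`/`y_train`/`x_val`/`y_val` arrays:

```ruby
model = Xgb::Classifier.new(n_estimators: 100)
# eval_set pairs become named evals ("validation_0", ...) internally
model.fit(x_train, y_train, eval_set: [[x_val, y_val]], early_stopping_rounds: 5)
model.predict(x_val)       # class labels
model.predict_proba(x_val) # for binary, [1 - p, p] pairs per row
```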
data/lib/xgb/dmatrix.rb CHANGED
@@ -27,12 +27,19 @@ module Xgb
         c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
         c_data.put_array_of_float(0, flat_data)
         check_result FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, @handle)
+
+        ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
       end
 
       set_float_info("label", label) if label
       set_float_info("weight", weight) if weight
     end
 
+    def self.finalize(pointer)
+      # must use proc instead of stabby lambda
+      proc { FFI.XGDMatrixFree(pointer) }
+    end
+
     def label
       float_info("label")
     end
@@ -41,16 +48,22 @@ module Xgb
       float_info("weight")
     end
 
+    def group=(group)
+      c_data = ::FFI::MemoryPointer.new(:int, group.size)
+      c_data.put_array_of_int(0, group)
+      check_result FFI.XGDMatrixSetGroup(handle_pointer, c_data, group.size)
+    end
+
     def num_row
-      out = ::FFI::MemoryPointer.new(:ulong)
+      out = ::FFI::MemoryPointer.new(:uint64)
       check_result FFI.XGDMatrixNumRow(handle_pointer, out)
-      out.read_ulong
+      out.read_uint64
     end
 
     def num_col
-      out = ::FFI::MemoryPointer.new(:ulong)
+      out = ::FFI::MemoryPointer.new(:uint64)
       check_result FFI.XGDMatrixNumCol(handle_pointer, out)
-      out.read_ulong
+      out.read_uint64
     end
 
     def slice(rindex)
@@ -76,15 +89,7 @@ module Xgb
     private
 
     def set_float_info(field, data)
-      data =
-        if matrix?(data)
-          data.to_a[0]
-        elsif daru_vector?(data) || narray?(data)
-          data.to_a
-        else
-          data
-        end
-
+      data = data.to_a unless data.is_a?(Array)
       c_data = ::FFI::MemoryPointer.new(:float, data.size)
       c_data.put_array_of_float(0, data)
       check_result FFI.XGDMatrixSetFloatInfo(handle_pointer, field.to_s, c_data, data.size)
@@ -106,10 +111,6 @@ module Xgb
       defined?(Daru::DataFrame) && data.is_a?(Daru::DataFrame)
     end
 
-    def daru_vector?(data)
-      defined?(Daru::Vector) && data.is_a?(Daru::Vector)
-    end
-
     def narray?(data)
       defined?(Numo::NArray) && data.is_a?(Numo::NArray)
     end
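The new `group=` setter feeds XGBoost's ranking objectives: the array lists how many consecutive rows of the matrix belong to each query, and the sizes must sum to `num_row`. A sketch with assumed data:

```ruby
# Six rows split across three queries of 2, 3, and 1 rows respectively.
dtrain = Xgb::DMatrix.new(x, label: y)
dtrain.group = [2, 3, 1]
```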
data/lib/xgb/ffi.rb CHANGED
@@ -1,7 +1,13 @@
 module Xgb
   module FFI
     extend ::FFI::Library
-    ffi_lib ["xgboost"]
+
+    begin
+      ffi_lib Xgb.ffi_lib
+    rescue LoadError => e
+      raise e if ENV["XGB_DEBUG"]
+      raise LoadError, "Could not find XGBoost"
+    end
 
     # https://github.com/dmlc/xgboost/blob/master/include/xgboost/c_api.h
     # keep same order
@@ -10,18 +16,21 @@ module Xgb
    attach_function :XGBGetLastError, %i[], :string
 
    # dmatrix
-    attach_function :XGDMatrixCreateFromMat, %i[pointer ulong ulong float pointer], :int
+    attach_function :XGDMatrixCreateFromMat, %i[pointer uint64 uint64 float pointer], :int
+    attach_function :XGDMatrixSetGroup, %i[pointer pointer uint64], :int
    attach_function :XGDMatrixNumRow, %i[pointer pointer], :int
    attach_function :XGDMatrixNumCol, %i[pointer pointer], :int
-    attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer ulong pointer], :int
+    attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer uint64 pointer], :int
+    attach_function :XGDMatrixFree, %i[pointer], :int
    attach_function :XGDMatrixSaveBinary, %i[pointer string int], :int
-    attach_function :XGDMatrixSetFloatInfo, %i[pointer string pointer ulong], :int
+    attach_function :XGDMatrixSetFloatInfo, %i[pointer string pointer uint64], :int
    attach_function :XGDMatrixGetFloatInfo, %i[pointer string pointer pointer], :int
 
    # booster
    attach_function :XGBoosterCreate, %i[pointer int pointer], :int
    attach_function :XGBoosterUpdateOneIter, %i[pointer int pointer], :int
-    attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer ulong pointer], :int
+    attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer uint64 pointer], :int
+    attach_function :XGBoosterFree, %i[pointer], :int
    attach_function :XGBoosterSetParam, %i[pointer string string], :int
    attach_function :XGBoosterPredict, %i[pointer pointer int int pointer pointer], :int
    attach_function :XGBoosterLoadModel, %i[pointer string], :int
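The `begin`/`rescue` swaps the raw dlopen failure for a friendlier message, while the `XGB_DEBUG` escape hatch restores the original error. A sketch:

```ruby
# With XGB_DEBUG set, the original LoadError from the ffi gem is re-raised
# instead of the generic "Could not find XGBoost" message.
ENV["XGB_DEBUG"] = "1"
require "xgb"
Xgb::FFI # referencing the constant triggers the autoload, which calls ffi_lib
```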
data/lib/xgb/model.rb ADDED
@@ -0,0 +1,35 @@
+module Xgb
+  class Model
+    attr_reader :booster
+
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: nil, importance_type: "gain", **options)
+      @params = {
+        max_depth: max_depth,
+        objective: objective,
+        learning_rate: learning_rate
+      }.merge(options)
+      @n_estimators = n_estimators
+      @importance_type = importance_type
+    end
+
+    def predict(data)
+      dmat = DMatrix.new(data)
+      @booster.predict(dmat)
+    end
+
+    def save_model(fname)
+      @booster.save_model(fname)
+    end
+
+    def load_model(fname)
+      @booster = Booster.new(params: @params, model_file: fname)
+    end
+
+    def feature_importances
+      score = @booster.score(importance_type: @importance_type)
+      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
+      total = scores.sum.to_f
+      scores.map { |s| s / total }
+    end
+  end
+end
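Note the normalization in `feature_importances`: raw gain scores are divided by their total, so the returned array sums to 1, mirroring scikit-learn's `feature_importances_`. Raw scores of `[4.0, 1.0]`, for instance, come back as `[0.8, 0.2]`.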
data/lib/xgb/ranker.rb ADDED
@@ -0,0 +1,14 @@
+module Xgb
+  class Ranker < Model
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "rank:pairwise", importance_type: "gain", **options)
+      super
+    end
+
+    def fit(x, y, group)
+      dtrain = DMatrix.new(x, label: y)
+      dtrain.group = group
+      @booster = Xgb.train(@params, dtrain, num_boost_round: @n_estimators)
+      nil
+    end
+  end
+end
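A usage sketch for the new `Ranker`, with assumed feature rows `x`, relevance labels `y`, and query groups:

```ruby
model = Xgb::Ranker.new(n_estimators: 50)
# third positional argument: query group sizes (two queries, 3 and 2 rows)
model.fit(x, y, [3, 2])
model.predict(x) # relevance scores, via the inherited Model#predict
```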
data/lib/xgb/regressor.rb CHANGED
@@ -1,39 +1,20 @@
 module Xgb
-  class Regressor
-    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "reg:squarederror", importance_type: "gain")
-      @params = {
-        max_depth: max_depth,
-        objective: objective,
-        learning_rate: learning_rate
-      }
-      @n_estimators = n_estimators
-      @importance_type = importance_type
+  class Regressor < Model
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "reg:squarederror", importance_type: "gain", **options)
+      super
     end
 
-    def fit(x, y)
+    def fit(x, y, eval_set: nil, early_stopping_rounds: nil, verbose: true)
       dtrain = DMatrix.new(x, label: y)
-      @booster = Xgb.train(@params, dtrain, num_boost_round: @n_estimators)
-      nil
-    end
+      evals = Array(eval_set).map.with_index { |v, i| [DMatrix.new(v[0], label: v[1]), "validation_#{i}"] }
 
-    def predict(data)
-      dmat = DMatrix.new(data)
-      @booster.predict(dmat)
-    end
-
-    def save_model(fname)
-      @booster.save_model(fname)
-    end
-
-    def load_model(fname)
-      @booster = Booster.new(params: @params, model_file: fname)
-    end
-
-    def feature_importances
-      score = @booster.score(importance_type: @importance_type)
-      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
-      total = scores.sum.to_f
-      scores.map { |s| s / total }
+      @booster = Xgb.train(@params, dtrain,
+        num_boost_round: @n_estimators,
+        early_stopping_rounds: early_stopping_rounds,
+        verbose_eval: verbose,
+        evals: evals
+      )
+      nil
     end
   end
 end
data/lib/xgb/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Xgb
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: xgb
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-08-16 00:00:00.000000000 Z
+date: 2019-08-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ffi
@@ -107,6 +107,8 @@ files:
 - lib/xgb/classifier.rb
 - lib/xgb/dmatrix.rb
 - lib/xgb/ffi.rb
+- lib/xgb/model.rb
+- lib/xgb/ranker.rb
 - lib/xgb/regressor.rb
 - lib/xgb/utils.rb
 - lib/xgb/version.rb
@@ -129,7 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.0.4
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: XGBoost - the high performance machine learning library - for Ruby