xgb 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1bb50395d579da91b18754bc75e780cbb2e98fd7a48a17c34514230d1c4828d1
4
- data.tar.gz: 3d2f9c5a72c63c2622a973805c9f2caa9bd4de7b5c67f8c4b5445fd9a71993c3
3
+ metadata.gz: 148980f8a4991f1f98cd1740188e763a3bd96c98bc69b13a9de9aa00132a12f1
4
+ data.tar.gz: 31d90a3a064d032a7d1f371c6928f11b103bfc6f9c92dd1697cc538ef33f15fa
5
5
  SHA512:
6
- metadata.gz: f141b3ea0b6ceb8549198fd6ad8a07f6947201409478fc4829fe625da376e40d8028427a5aa34191b565aa275d27bb03e2082bb8fc489f6da6a2a09b3bbf2c2f
7
- data.tar.gz: c393f4fdbe240ffc14b64f22f17d149ed393070fb0752f9ec49dd94bcfa88f446ea21bc5bf9a96bef7759c5c47033dd0480a4001d477b8c487cf5dcf8be19b81
6
+ metadata.gz: 28fb08d373af3a3b198822ddea958ca0ee433145dbca091583b25945f27a048a3ec4a4ec43d88dc3c1dde67de7f72b7fb84e7668cfbd173f01d56992017ba4e2
7
+ data.tar.gz: 5bfb07db7c6b65d0a08010ab59328af75a97e74e025fcd4eb15ca88b05afec3303beb1569c99d59932a5426a64addce1c7fda90562342efcf52fc2c72d9b362a
@@ -1,3 +1,10 @@
1
+ ## 0.1.2
2
+
3
+ - Friendlier message when XGBoost not found
4
+ - Free memory when objects are destroyed
5
+ - Added `Ranker`
6
+ - Added early stopping to Scikit-Learn API
7
+
1
8
  ## 0.1.1
2
9
 
3
10
  - Added Scikit-Learn API
data/README.md CHANGED
@@ -18,7 +18,7 @@ gem 'xgb'
18
18
 
19
19
  ## Getting Started
20
20
 
21
- This library follows the [Core Data Structure, Learning and Scikit-Learn APIs](https://xgboost.readthedocs.io/en/latest/python/python_api.html) of the Python library. Some methods and options are missing at the moment. PRs welcome!
21
+ This library follows the [Python API](https://xgboost.readthedocs.io/en/latest/python/python_api.html). Some methods and options are missing at the moment. PRs welcome!
22
22
 
23
23
  ## Learning API
24
24
 
@@ -33,7 +33,8 @@ booster = Xgb.train(params, dtrain)
33
33
  Predict
34
34
 
35
35
  ```ruby
36
- booster.predict(x_test)
36
+ dtest = Xgb::DMatrix.new(x_test)
37
+ booster.predict(dtest)
37
38
  ```
38
39
 
39
40
  Save the model to a file
@@ -110,6 +111,12 @@ Get the importance of features
110
111
  model.feature_importances
111
112
  ```
112
113
 
114
+ Early stopping
115
+
116
+ ```ruby
117
+ model.fit(x, y, eval_set: [[x_test, y_test]], early_stopping_rounds: 5)
118
+ ```
119
+
113
120
  ## Data
114
121
 
115
122
  Data can be an array of arrays
data/lib/xgb.rb CHANGED
@@ -5,16 +5,25 @@ require "ffi"
5
5
  require "xgb/utils"
6
6
  require "xgb/booster"
7
7
  require "xgb/dmatrix"
8
- require "xgb/ffi"
9
8
  require "xgb/version"
10
9
 
11
10
  # scikit-learn API
11
+ require "xgb/model"
12
12
  require "xgb/classifier"
13
+ require "xgb/ranker"
13
14
  require "xgb/regressor"
14
15
 
15
16
  module Xgb
16
17
  class Error < StandardError; end
17
18
 
19
+ class << self
20
+ attr_accessor :ffi_lib
21
+ end
22
+ self.ffi_lib = ["xgboost"]
23
+
24
+ # friendlier error message
25
+ autoload :FFI,"xgb/ffi"
26
+
18
27
  class << self
19
28
  def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
20
29
  booster = Booster.new(params: params)
@@ -5,6 +5,8 @@ module Xgb
5
5
  def initialize(params: nil, model_file: nil)
6
6
  @handle = ::FFI::MemoryPointer.new(:pointer)
7
7
  check_result FFI.XGBoosterCreate(nil, 0, @handle)
8
+ ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
9
+
8
10
  if model_file
9
11
  check_result FFI.XGBoosterLoadModel(handle_pointer, model_file)
10
12
  end
@@ -13,6 +15,11 @@ module Xgb
13
15
  set_param(params)
14
16
  end
15
17
 
18
+ def self.finalize(pointer)
19
+ # must use proc instead of stabby lambda
20
+ proc { FFI.XGBoosterFree(pointer) }
21
+ end
22
+
16
23
  def update(dtrain, iteration)
17
24
  check_result FFI.XGBoosterUpdateOneIter(handle_pointer, iteration, dtrain.handle_pointer)
18
25
  end
@@ -43,10 +50,10 @@ module Xgb
43
50
 
44
51
  def predict(data, ntree_limit: nil)
45
52
  ntree_limit ||= 0
46
- out_len = ::FFI::MemoryPointer.new(:ulong)
53
+ out_len = ::FFI::MemoryPointer.new(:uint64)
47
54
  out_result = ::FFI::MemoryPointer.new(:pointer)
48
55
  check_result FFI.XGBoosterPredict(handle_pointer, data.handle_pointer, 0, ntree_limit, out_len, out_result)
49
- out = out_result.read_pointer.read_array_of_float(out_len.read_ulong)
56
+ out = out_result.read_pointer.read_array_of_float(out_len.read_uint64)
50
57
  num_class = out.size / data.num_row
51
58
  out = out.each_slice(num_class).to_a if num_class > 1
52
59
  out
@@ -58,10 +65,10 @@ module Xgb
58
65
 
59
66
  # returns an array of strings
60
67
  def dump(fmap: "", with_stats: false, dump_format: "text")
61
- out_len = ::FFI::MemoryPointer.new(:ulong)
68
+ out_len = ::FFI::MemoryPointer.new(:uint64)
62
69
  out_result = ::FFI::MemoryPointer.new(:pointer)
63
70
  check_result FFI.XGBoosterDumpModelEx(handle_pointer, fmap, with_stats ? 1 : 0, dump_format, out_len, out_result)
64
- out_result.read_pointer.get_array_of_string(0, out_len.read_ulong)
71
+ out_result.read_pointer.get_array_of_string(0, out_len.read_uint64)
65
72
  end
66
73
 
67
74
  def dump_model(fout, fmap: "", with_stats: false, dump_format: "text")
@@ -1,16 +1,10 @@
1
1
  module Xgb
2
- class Classifier
3
- def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "binary:logistic", importance_type: "gain")
4
- @params = {
5
- max_depth: max_depth,
6
- objective: objective,
7
- learning_rate: learning_rate
8
- }
9
- @n_estimators = n_estimators
10
- @importance_type = importance_type
2
+ class Classifier < Model
3
+ def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "binary:logistic", importance_type: "gain", **options)
4
+ super
11
5
  end
12
6
 
13
- def fit(x, y)
7
+ def fit(x, y, eval_set: nil, early_stopping_rounds: nil, verbose: true)
14
8
  n_classes = y.uniq.size
15
9
 
16
10
  params = @params.dup
@@ -20,18 +14,24 @@ module Xgb
20
14
  end
21
15
 
22
16
  dtrain = DMatrix.new(x, label: y)
23
- @booster = Xgb.train(params, dtrain, num_boost_round: @n_estimators)
17
+ evals = Array(eval_set).map.with_index { |v, i| [DMatrix.new(v[0], label: v[1]), "validation_#{i}"] }
18
+
19
+ @booster = Xgb.train(params, dtrain,
20
+ num_boost_round: @n_estimators,
21
+ early_stopping_rounds: early_stopping_rounds,
22
+ verbose_eval: verbose,
23
+ evals: evals
24
+ )
24
25
  nil
25
26
  end
26
27
 
27
28
  def predict(data)
28
- dmat = DMatrix.new(data)
29
- y_pred = @booster.predict(dmat)
29
+ y_pred = super(data)
30
30
 
31
31
  if y_pred.first.is_a?(Array)
32
32
  # multiple classes
33
33
  y_pred.map do |v|
34
- v.map.with_index.max_by { |v2, i| v2 }.last
34
+ v.map.with_index.max_by { |v2, _| v2 }.last
35
35
  end
36
36
  else
37
37
  y_pred.map { |v| v > 0.5 ? 1 : 0 }
@@ -49,20 +49,5 @@ module Xgb
49
49
  y_pred.map { |v| [1 - v, v] }
50
50
  end
51
51
  end
52
-
53
- def save_model(fname)
54
- @booster.save_model(fname)
55
- end
56
-
57
- def load_model(fname)
58
- @booster = Booster.new(params: @params, model_file: fname)
59
- end
60
-
61
- def feature_importances
62
- score = @booster.score(importance_type: @importance_type)
63
- scores = @booster.feature_names.map { |k| score[k] || 0.0 }
64
- total = scores.sum.to_f
65
- scores.map { |s| s / total }
66
- end
67
52
  end
68
53
  end
@@ -27,12 +27,19 @@ module Xgb
27
27
  c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
28
28
  c_data.put_array_of_float(0, flat_data)
29
29
  check_result FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, @handle)
30
+
31
+ ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
30
32
  end
31
33
 
32
34
  set_float_info("label", label) if label
33
35
  set_float_info("weight", weight) if weight
34
36
  end
35
37
 
38
+ def self.finalize(pointer)
39
+ # must use proc instead of stabby lambda
40
+ proc { FFI.XGDMatrixFree(pointer) }
41
+ end
42
+
36
43
  def label
37
44
  float_info("label")
38
45
  end
@@ -41,16 +48,22 @@ module Xgb
41
48
  float_info("weight")
42
49
  end
43
50
 
51
+ def group=(group)
52
+ c_data = ::FFI::MemoryPointer.new(:int, group.size)
53
+ c_data.put_array_of_int(0, group)
54
+ check_result FFI.XGDMatrixSetGroup(handle_pointer, c_data, group.size)
55
+ end
56
+
44
57
  def num_row
45
- out = ::FFI::MemoryPointer.new(:ulong)
58
+ out = ::FFI::MemoryPointer.new(:uint64)
46
59
  check_result FFI.XGDMatrixNumRow(handle_pointer, out)
47
- out.read_ulong
60
+ out.read_uint64
48
61
  end
49
62
 
50
63
  def num_col
51
- out = ::FFI::MemoryPointer.new(:ulong)
64
+ out = ::FFI::MemoryPointer.new(:uint64)
52
65
  check_result FFI.XGDMatrixNumCol(handle_pointer, out)
53
- out.read_ulong
66
+ out.read_uint64
54
67
  end
55
68
 
56
69
  def slice(rindex)
@@ -76,15 +89,7 @@ module Xgb
76
89
  private
77
90
 
78
91
  def set_float_info(field, data)
79
- data =
80
- if matrix?(data)
81
- data.to_a[0]
82
- elsif daru_vector?(data) || narray?(data)
83
- data.to_a
84
- else
85
- data
86
- end
87
-
92
+ data = data.to_a unless data.is_a?(Array)
88
93
  c_data = ::FFI::MemoryPointer.new(:float, data.size)
89
94
  c_data.put_array_of_float(0, data)
90
95
  check_result FFI.XGDMatrixSetFloatInfo(handle_pointer, field.to_s, c_data, data.size)
@@ -106,10 +111,6 @@ module Xgb
106
111
  defined?(Daru::DataFrame) && data.is_a?(Daru::DataFrame)
107
112
  end
108
113
 
109
- def daru_vector?(data)
110
- defined?(Daru::Vector) && data.is_a?(Daru::Vector)
111
- end
112
-
113
114
  def narray?(data)
114
115
  defined?(Numo::NArray) && data.is_a?(Numo::NArray)
115
116
  end
@@ -1,7 +1,13 @@
1
1
  module Xgb
2
2
  module FFI
3
3
  extend ::FFI::Library
4
- ffi_lib ["xgboost"]
4
+
5
+ begin
6
+ ffi_lib Xgb.ffi_lib
7
+ rescue LoadError => e
8
+ raise e if ENV["XGB_DEBUG"]
9
+ raise LoadError, "Could not find XGBoost"
10
+ end
5
11
 
6
12
  # https://github.com/dmlc/xgboost/blob/master/include/xgboost/c_api.h
7
13
  # keep same order
@@ -10,18 +16,21 @@ module Xgb
10
16
  attach_function :XGBGetLastError, %i[], :string
11
17
 
12
18
  # dmatrix
13
- attach_function :XGDMatrixCreateFromMat, %i[pointer ulong ulong float pointer], :int
19
+ attach_function :XGDMatrixCreateFromMat, %i[pointer uint64 uint64 float pointer], :int
20
+ attach_function :XGDMatrixSetGroup, %i[pointer pointer uint64], :int
14
21
  attach_function :XGDMatrixNumRow, %i[pointer pointer], :int
15
22
  attach_function :XGDMatrixNumCol, %i[pointer pointer], :int
16
- attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer ulong pointer], :int
23
+ attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer uint64 pointer], :int
24
+ attach_function :XGDMatrixFree, %i[pointer], :int
17
25
  attach_function :XGDMatrixSaveBinary, %i[pointer string int], :int
18
- attach_function :XGDMatrixSetFloatInfo, %i[pointer string pointer ulong], :int
26
+ attach_function :XGDMatrixSetFloatInfo, %i[pointer string pointer uint64], :int
19
27
  attach_function :XGDMatrixGetFloatInfo, %i[pointer string pointer pointer], :int
20
28
 
21
29
  # booster
22
30
  attach_function :XGBoosterCreate, %i[pointer int pointer], :int
23
31
  attach_function :XGBoosterUpdateOneIter, %i[pointer int pointer], :int
24
- attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer ulong pointer], :int
32
+ attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer uint64 pointer], :int
33
+ attach_function :XGBoosterFree, %i[pointer], :int
25
34
  attach_function :XGBoosterSetParam, %i[pointer string string], :int
26
35
  attach_function :XGBoosterPredict, %i[pointer pointer int int pointer pointer], :int
27
36
  attach_function :XGBoosterLoadModel, %i[pointer string], :int
@@ -0,0 +1,35 @@
1
+ module Xgb
2
+ class Model
3
+ attr_reader :booster
4
+
5
+ def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: nil, importance_type: "gain", **options)
6
+ @params = {
7
+ max_depth: max_depth,
8
+ objective: objective,
9
+ learning_rate: learning_rate
10
+ }.merge(options)
11
+ @n_estimators = n_estimators
12
+ @importance_type = importance_type
13
+ end
14
+
15
+ def predict(data)
16
+ dmat = DMatrix.new(data)
17
+ @booster.predict(dmat)
18
+ end
19
+
20
+ def save_model(fname)
21
+ @booster.save_model(fname)
22
+ end
23
+
24
+ def load_model(fname)
25
+ @booster = Booster.new(params: @params, model_file: fname)
26
+ end
27
+
28
+ def feature_importances
29
+ score = @booster.score(importance_type: @importance_type)
30
+ scores = @booster.feature_names.map { |k| score[k] || 0.0 }
31
+ total = scores.sum.to_f
32
+ scores.map { |s| s / total }
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,14 @@
1
+ module Xgb
2
+ class Ranker < Model
3
+ def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "rank:pairwise", importance_type: "gain", **options)
4
+ super
5
+ end
6
+
7
+ def fit(x, y, group)
8
+ dtrain = DMatrix.new(x, label: y)
9
+ dtrain.group = group
10
+ @booster = Xgb.train(@params, dtrain, num_boost_round: @n_estimators)
11
+ nil
12
+ end
13
+ end
14
+ end
@@ -1,39 +1,20 @@
1
1
  module Xgb
2
- class Regressor
3
- def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "reg:squarederror", importance_type: "gain")
4
- @params = {
5
- max_depth: max_depth,
6
- objective: objective,
7
- learning_rate: learning_rate
8
- }
9
- @n_estimators = n_estimators
10
- @importance_type = importance_type
2
+ class Regressor < Model
3
+ def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "reg:squarederror", importance_type: "gain", **options)
4
+ super
11
5
  end
12
6
 
13
- def fit(x, y)
7
+ def fit(x, y, eval_set: nil, early_stopping_rounds: nil, verbose: true)
14
8
  dtrain = DMatrix.new(x, label: y)
15
- @booster = Xgb.train(@params, dtrain, num_boost_round: @n_estimators)
16
- nil
17
- end
9
+ evals = Array(eval_set).map.with_index { |v, i| [DMatrix.new(v[0], label: v[1]), "validation_#{i}"] }
18
10
 
19
- def predict(data)
20
- dmat = DMatrix.new(data)
21
- @booster.predict(dmat)
22
- end
23
-
24
- def save_model(fname)
25
- @booster.save_model(fname)
26
- end
27
-
28
- def load_model(fname)
29
- @booster = Booster.new(params: @params, model_file: fname)
30
- end
31
-
32
- def feature_importances
33
- score = @booster.score(importance_type: @importance_type)
34
- scores = @booster.feature_names.map { |k| score[k] || 0.0 }
35
- total = scores.sum.to_f
36
- scores.map { |s| s / total }
11
+ @booster = Xgb.train(@params, dtrain,
12
+ num_boost_round: @n_estimators,
13
+ early_stopping_rounds: early_stopping_rounds,
14
+ verbose_eval: verbose,
15
+ evals: evals
16
+ )
17
+ nil
37
18
  end
38
19
  end
39
20
  end
@@ -1,3 +1,3 @@
1
1
  module Xgb
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: xgb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-08-16 00:00:00.000000000 Z
11
+ date: 2019-08-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ffi
@@ -107,6 +107,8 @@ files:
107
107
  - lib/xgb/classifier.rb
108
108
  - lib/xgb/dmatrix.rb
109
109
  - lib/xgb/ffi.rb
110
+ - lib/xgb/model.rb
111
+ - lib/xgb/ranker.rb
110
112
  - lib/xgb/regressor.rb
111
113
  - lib/xgb/utils.rb
112
114
  - lib/xgb/version.rb
@@ -129,7 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
129
131
  - !ruby/object:Gem::Version
130
132
  version: '0'
131
133
  requirements: []
132
- rubygems_version: 3.0.4
134
+ rubygems_version: 3.0.3
133
135
  signing_key:
134
136
  specification_version: 4
135
137
  summary: XGBoost - the high performance machine learning library - for Ruby