xgb 0.1.1 → 0.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1bb50395d579da91b18754bc75e780cbb2e98fd7a48a17c34514230d1c4828d1
-  data.tar.gz: 3d2f9c5a72c63c2622a973805c9f2caa9bd4de7b5c67f8c4b5445fd9a71993c3
+  metadata.gz: 148980f8a4991f1f98cd1740188e763a3bd96c98bc69b13a9de9aa00132a12f1
+  data.tar.gz: 31d90a3a064d032a7d1f371c6928f11b103bfc6f9c92dd1697cc538ef33f15fa
 SHA512:
-  metadata.gz: f141b3ea0b6ceb8549198fd6ad8a07f6947201409478fc4829fe625da376e40d8028427a5aa34191b565aa275d27bb03e2082bb8fc489f6da6a2a09b3bbf2c2f
-  data.tar.gz: c393f4fdbe240ffc14b64f22f17d149ed393070fb0752f9ec49dd94bcfa88f446ea21bc5bf9a96bef7759c5c47033dd0480a4001d477b8c487cf5dcf8be19b81
+  metadata.gz: 28fb08d373af3a3b198822ddea958ca0ee433145dbca091583b25945f27a048a3ec4a4ec43d88dc3c1dde67de7f72b7fb84e7668cfbd173f01d56992017ba4e2
+  data.tar.gz: 5bfb07db7c6b65d0a08010ab59328af75a97e74e025fcd4eb15ca88b05afec3303beb1569c99d59932a5426a64addce1c7fda90562342efcf52fc2c72d9b362a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+## 0.1.2
+
+- Friendlier message when XGBoost not found
+- Free memory when objects are destroyed
+- Added `Ranker`
+- Added early stopping to Scikit-Learn API
+
 ## 0.1.1
 
 - Added Scikit-Learn API
data/README.md CHANGED
@@ -18,7 +18,7 @@ gem 'xgb'
 
 ## Getting Started
 
-This library follows the [Core Data Structure, Learning and Scikit-Learn APIs](https://xgboost.readthedocs.io/en/latest/python/python_api.html) of the Python library. Some methods and options are missing at the moment. PRs welcome!
+This library follows the [Python API](https://xgboost.readthedocs.io/en/latest/python/python_api.html). Some methods and options are missing at the moment. PRs welcome!
 
 ## Learning API
 
@@ -33,7 +33,8 @@ booster = Xgb.train(params, dtrain)
 Predict
 
 ```ruby
-booster.predict(x_test)
+dtest = Xgb::DMatrix.new(x_test)
+booster.predict(dtest)
 ```
 
 Save the model to a file
@@ -110,6 +111,12 @@ Get the importance of features
 model.feature_importances
 ```
 
+Early stopping
+
+```ruby
+model.fit(x, y, eval_set: [[x_test, y_test]], early_stopping_rounds: 5)
+```
+
 ## Data
 
 Data can be an array of arrays
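The same early stopping hook exists on the Learning API via `Xgb.train` (see `data/lib/xgb.rb` below). A minimal sketch, assuming `params`, `dtrain`, and `dtest` are built as in the README examples above:

```ruby
# evals takes [DMatrix, name] pairs; training halts once the eval metric
# stops improving for early_stopping_rounds consecutive rounds.
booster = Xgb.train(params, dtrain,
  evals: [[dtrain, "train"], [dtest, "eval"]],
  early_stopping_rounds: 5
)
```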
data/lib/xgb.rb CHANGED
@@ -5,16 +5,25 @@ require "ffi"
 require "xgb/utils"
 require "xgb/booster"
 require "xgb/dmatrix"
-require "xgb/ffi"
 require "xgb/version"
 
 # scikit-learn API
+require "xgb/model"
 require "xgb/classifier"
+require "xgb/ranker"
 require "xgb/regressor"
 
 module Xgb
   class Error < StandardError; end
 
+  class << self
+    attr_accessor :ffi_lib
+  end
+  self.ffi_lib = ["xgboost"]
+
+  # friendlier error message
+  autoload :FFI, "xgb/ffi"
+
   class << self
     def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
       booster = Booster.new(params: params)
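Together, the new `ffi_lib` accessor and the `autoload` let callers override the shared-library path before anything touches the FFI layer, since `xgb/ffi` is only loaded when `Xgb::FFI` is first referenced. A sketch with a hypothetical path:

```ruby
require "xgb"
# Hypothetical location of a custom libxgboost build; set this before
# creating any Booster or DMatrix, i.e. before the Xgb::FFI autoload fires.
Xgb.ffi_lib = ["/opt/xgboost/lib/libxgboost.so"]
```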
data/lib/xgb/booster.rb CHANGED
@@ -5,6 +5,8 @@ module Xgb
     def initialize(params: nil, model_file: nil)
       @handle = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterCreate(nil, 0, @handle)
+      ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
+
       if model_file
         check_result FFI.XGBoosterLoadModel(handle_pointer, model_file)
       end
@@ -13,6 +15,11 @@ module Xgb
       set_param(params)
     end
 
+    def self.finalize(pointer)
+      # must use proc instead of stabby lambda
+      proc { FFI.XGBoosterFree(pointer) }
+    end
+
     def update(dtrain, iteration)
       check_result FFI.XGBoosterUpdateOneIter(handle_pointer, iteration, dtrain.handle_pointer)
     end
@@ -43,10 +50,10 @@ module Xgb
 
     def predict(data, ntree_limit: nil)
       ntree_limit ||= 0
-      out_len = ::FFI::MemoryPointer.new(:ulong)
+      out_len = ::FFI::MemoryPointer.new(:uint64)
       out_result = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterPredict(handle_pointer, data.handle_pointer, 0, ntree_limit, out_len, out_result)
-      out = out_result.read_pointer.read_array_of_float(out_len.read_ulong)
+      out = out_result.read_pointer.read_array_of_float(out_len.read_uint64)
       num_class = out.size / data.num_row
       out = out.each_slice(num_class).to_a if num_class > 1
       out
@@ -58,10 +65,10 @@ module Xgb
 
     # returns an array of strings
    def dump(fmap: "", with_stats: false, dump_format: "text")
-      out_len = ::FFI::MemoryPointer.new(:ulong)
+      out_len = ::FFI::MemoryPointer.new(:uint64)
       out_result = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterDumpModelEx(handle_pointer, fmap, with_stats ? 1 : 0, dump_format, out_len, out_result)
-      out_result.read_pointer.get_array_of_string(0, out_len.read_ulong)
+      out_result.read_pointer.get_array_of_string(0, out_len.read_uint64)
     end
 
     def dump_model(fout, fmap: "", with_stats: false, dump_format: "text")
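The finalizer comment is terse; the reason behind it is that Ruby invokes a finalizer with the object's id as an argument. A proc silently ignores the extra argument, while a zero-parameter stabby lambda raises `ArgumentError` on the arity mismatch. A standalone sketch of the pattern (not gem code):

```ruby
# The class-level factory keeps self out of the closure; a finalizer that
# captured self would keep the object reachable and never run.
class NativeHandle
  def self.finalize(pointer)
    # proc tolerates the object id Ruby passes in; ->() { ... } would not
    proc { puts "freeing #{pointer}" } # stand-in for the FFI free call
  end

  def initialize(pointer)
    ObjectSpace.define_finalizer(self, self.class.finalize(pointer))
  end
end
```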
data/lib/xgb/classifier.rb CHANGED
@@ -1,16 +1,10 @@
 module Xgb
-  class Classifier
-    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "binary:logistic", importance_type: "gain")
-      @params = {
-        max_depth: max_depth,
-        objective: objective,
-        learning_rate: learning_rate
-      }
-      @n_estimators = n_estimators
-      @importance_type = importance_type
+  class Classifier < Model
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "binary:logistic", importance_type: "gain", **options)
+      super
     end
 
-    def fit(x, y)
+    def fit(x, y, eval_set: nil, early_stopping_rounds: nil, verbose: true)
       n_classes = y.uniq.size
 
       params = @params.dup
@@ -20,18 +14,24 @@ module Xgb
       end
 
       dtrain = DMatrix.new(x, label: y)
-      @booster = Xgb.train(params, dtrain, num_boost_round: @n_estimators)
+      evals = Array(eval_set).map.with_index { |v, i| [DMatrix.new(v[0], label: v[1]), "validation_#{i}"] }
+
+      @booster = Xgb.train(params, dtrain,
+        num_boost_round: @n_estimators,
+        early_stopping_rounds: early_stopping_rounds,
+        verbose_eval: verbose,
+        evals: evals
+      )
       nil
     end
 
     def predict(data)
-      dmat = DMatrix.new(data)
-      y_pred = @booster.predict(dmat)
+      y_pred = super(data)
 
       if y_pred.first.is_a?(Array)
         # multiple classes
         y_pred.map do |v|
-          v.map.with_index.max_by { |v2, i| v2 }.last
+          v.map.with_index.max_by { |v2, _| v2 }.last
         end
       else
         y_pred.map { |v| v > 0.5 ? 1 : 0 }
@@ -49,20 +49,5 @@ module Xgb
         y_pred.map { |v| [1 - v, v] }
       end
     end
-
-    def save_model(fname)
-      @booster.save_model(fname)
-    end
-
-    def load_model(fname)
-      @booster = Booster.new(params: @params, model_file: fname)
-    end
-
-    def feature_importances
-      score = @booster.score(importance_type: @importance_type)
-      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
-      total = scores.sum.to_f
-      scores.map { |s| s / total }
-    end
   end
 end
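A usage sketch of the slimmed-down `Classifier`, assuming `x_train`/`y_train`/`x_val`/`y_val` arrays:

```ruby
model = Xgb::Classifier.new(n_estimators: 100)
# eval_set pairs become named evals ("validation_0", ...) internally
model.fit(x_train, y_train, eval_set: [[x_val, y_val]], early_stopping_rounds: 5)
model.predict(x_val)       # class labels
model.predict_proba(x_val) # for binary, [1 - p, p] pairs per row
```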
data/lib/xgb/dmatrix.rb CHANGED
@@ -27,12 +27,19 @@ module Xgb
         c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
         c_data.put_array_of_float(0, flat_data)
         check_result FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, @handle)
+
+        ObjectSpace.define_finalizer(self, self.class.finalize(handle_pointer))
       end
 
       set_float_info("label", label) if label
       set_float_info("weight", weight) if weight
     end
 
+    def self.finalize(pointer)
+      # must use proc instead of stabby lambda
+      proc { FFI.XGDMatrixFree(pointer) }
+    end
+
     def label
       float_info("label")
     end
@@ -41,16 +48,22 @@ module Xgb
       float_info("weight")
     end
 
+    def group=(group)
+      c_data = ::FFI::MemoryPointer.new(:int, group.size)
+      c_data.put_array_of_int(0, group)
+      check_result FFI.XGDMatrixSetGroup(handle_pointer, c_data, group.size)
+    end
+
     def num_row
-      out = ::FFI::MemoryPointer.new(:ulong)
+      out = ::FFI::MemoryPointer.new(:uint64)
       check_result FFI.XGDMatrixNumRow(handle_pointer, out)
-      out.read_ulong
+      out.read_uint64
     end
 
     def num_col
-      out = ::FFI::MemoryPointer.new(:ulong)
+      out = ::FFI::MemoryPointer.new(:uint64)
       check_result FFI.XGDMatrixNumCol(handle_pointer, out)
-      out.read_ulong
+      out.read_uint64
     end
 
     def slice(rindex)
@@ -76,15 +89,7 @@ module Xgb
     private
 
     def set_float_info(field, data)
-      data =
-        if matrix?(data)
-          data.to_a[0]
-        elsif daru_vector?(data) || narray?(data)
-          data.to_a
-        else
-          data
-        end
-
+      data = data.to_a unless data.is_a?(Array)
       c_data = ::FFI::MemoryPointer.new(:float, data.size)
       c_data.put_array_of_float(0, data)
       check_result FFI.XGDMatrixSetFloatInfo(handle_pointer, field.to_s, c_data, data.size)
@@ -106,10 +111,6 @@ module Xgb
       defined?(Daru::DataFrame) && data.is_a?(Daru::DataFrame)
     end
 
-    def daru_vector?(data)
-      defined?(Daru::Vector) && data.is_a?(Daru::Vector)
-    end
-
     def narray?(data)
       defined?(Numo::NArray) && data.is_a?(Numo::NArray)
     end
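The new `group=` setter feeds XGBoost's ranking objectives: the array lists how many consecutive rows of the matrix belong to each query, and the sizes must sum to `num_row`. A sketch with assumed data:

```ruby
# Six rows split across three queries of 2, 3, and 1 rows respectively.
dtrain = Xgb::DMatrix.new(x, label: y)
dtrain.group = [2, 3, 1]
```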
data/lib/xgb/ffi.rb CHANGED
@@ -1,7 +1,13 @@
 module Xgb
   module FFI
     extend ::FFI::Library
-    ffi_lib ["xgboost"]
+
+    begin
+      ffi_lib Xgb.ffi_lib
+    rescue LoadError => e
+      raise e if ENV["XGB_DEBUG"]
+      raise LoadError, "Could not find XGBoost"
+    end
 
     # https://github.com/dmlc/xgboost/blob/master/include/xgboost/c_api.h
     # keep same order
@@ -10,18 +16,21 @@ module Xgb
    attach_function :XGBGetLastError, %i[], :string
 
    # dmatrix
-    attach_function :XGDMatrixCreateFromMat, %i[pointer ulong ulong float pointer], :int
+    attach_function :XGDMatrixCreateFromMat, %i[pointer uint64 uint64 float pointer], :int
+    attach_function :XGDMatrixSetGroup, %i[pointer pointer uint64], :int
    attach_function :XGDMatrixNumRow, %i[pointer pointer], :int
    attach_function :XGDMatrixNumCol, %i[pointer pointer], :int
-    attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer ulong pointer], :int
+    attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer uint64 pointer], :int
+    attach_function :XGDMatrixFree, %i[pointer], :int
    attach_function :XGDMatrixSaveBinary, %i[pointer string int], :int
-    attach_function :XGDMatrixSetFloatInfo, %i[pointer string pointer ulong], :int
+    attach_function :XGDMatrixSetFloatInfo, %i[pointer string pointer uint64], :int
    attach_function :XGDMatrixGetFloatInfo, %i[pointer string pointer pointer], :int
 
    # booster
    attach_function :XGBoosterCreate, %i[pointer int pointer], :int
    attach_function :XGBoosterUpdateOneIter, %i[pointer int pointer], :int
-    attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer ulong pointer], :int
+    attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer uint64 pointer], :int
+    attach_function :XGBoosterFree, %i[pointer], :int
    attach_function :XGBoosterSetParam, %i[pointer string string], :int
    attach_function :XGBoosterPredict, %i[pointer pointer int int pointer pointer], :int
    attach_function :XGBoosterLoadModel, %i[pointer string], :int
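The `begin`/`rescue` swaps the raw dlopen failure for a friendlier message, while the `XGB_DEBUG` escape hatch restores the original error. A sketch:

```ruby
# With XGB_DEBUG set, the original LoadError from the ffi gem is re-raised
# instead of the generic "Could not find XGBoost" message.
ENV["XGB_DEBUG"] = "1"
require "xgb"
Xgb::FFI # referencing the constant triggers the autoload, which calls ffi_lib
```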
data/lib/xgb/model.rb ADDED
@@ -0,0 +1,35 @@
+module Xgb
+  class Model
+    attr_reader :booster
+
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: nil, importance_type: "gain", **options)
+      @params = {
+        max_depth: max_depth,
+        objective: objective,
+        learning_rate: learning_rate
+      }.merge(options)
+      @n_estimators = n_estimators
+      @importance_type = importance_type
+    end
+
+    def predict(data)
+      dmat = DMatrix.new(data)
+      @booster.predict(dmat)
+    end
+
+    def save_model(fname)
+      @booster.save_model(fname)
+    end
+
+    def load_model(fname)
+      @booster = Booster.new(params: @params, model_file: fname)
+    end
+
+    def feature_importances
+      score = @booster.score(importance_type: @importance_type)
+      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
+      total = scores.sum.to_f
+      scores.map { |s| s / total }
+    end
+  end
+end
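Note the normalization in `feature_importances`: raw gain scores are divided by their total, so the returned array sums to 1, mirroring scikit-learn's `feature_importances_`. Raw scores of `[4.0, 1.0]`, for instance, come back as `[0.8, 0.2]`.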
data/lib/xgb/ranker.rb ADDED
@@ -0,0 +1,14 @@
+module Xgb
+  class Ranker < Model
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "rank:pairwise", importance_type: "gain", **options)
+      super
+    end
+
+    def fit(x, y, group)
+      dtrain = DMatrix.new(x, label: y)
+      dtrain.group = group
+      @booster = Xgb.train(@params, dtrain, num_boost_round: @n_estimators)
+      nil
+    end
+  end
+end
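A usage sketch for the new `Ranker`, with assumed feature rows `x`, relevance labels `y`, and query groups:

```ruby
model = Xgb::Ranker.new(n_estimators: 50)
# third positional argument: query group sizes (two queries, 3 and 2 rows)
model.fit(x, y, [3, 2])
model.predict(x) # relevance scores, via the inherited Model#predict
```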
data/lib/xgb/regressor.rb CHANGED
@@ -1,39 +1,20 @@
 module Xgb
-  class Regressor
-    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "reg:squarederror", importance_type: "gain")
-      @params = {
-        max_depth: max_depth,
-        objective: objective,
-        learning_rate: learning_rate
-      }
-      @n_estimators = n_estimators
-      @importance_type = importance_type
+  class Regressor < Model
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "reg:squarederror", importance_type: "gain", **options)
+      super
     end
 
-    def fit(x, y)
+    def fit(x, y, eval_set: nil, early_stopping_rounds: nil, verbose: true)
       dtrain = DMatrix.new(x, label: y)
-      @booster = Xgb.train(@params, dtrain, num_boost_round: @n_estimators)
-      nil
-    end
+      evals = Array(eval_set).map.with_index { |v, i| [DMatrix.new(v[0], label: v[1]), "validation_#{i}"] }
 
-    def predict(data)
-      dmat = DMatrix.new(data)
-      @booster.predict(dmat)
-    end
-
-    def save_model(fname)
-      @booster.save_model(fname)
-    end
-
-    def load_model(fname)
-      @booster = Booster.new(params: @params, model_file: fname)
-    end
-
-    def feature_importances
-      score = @booster.score(importance_type: @importance_type)
-      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
-      total = scores.sum.to_f
-      scores.map { |s| s / total }
+      @booster = Xgb.train(@params, dtrain,
+        num_boost_round: @n_estimators,
+        early_stopping_rounds: early_stopping_rounds,
+        verbose_eval: verbose,
+        evals: evals
+      )
+      nil
     end
   end
 end
data/lib/xgb/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Xgb
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: xgb
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-08-16 00:00:00.000000000 Z
+date: 2019-08-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ffi
@@ -107,6 +107,8 @@ files:
 - lib/xgb/classifier.rb
 - lib/xgb/dmatrix.rb
 - lib/xgb/ffi.rb
+- lib/xgb/model.rb
+- lib/xgb/ranker.rb
 - lib/xgb/regressor.rb
 - lib/xgb/utils.rb
 - lib/xgb/version.rb
@@ -129,7 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.0.4
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: XGBoost - the high performance machine learning library - for Ruby