xgb 0.7.3 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f12fd8cc77963f39c4e2a5580491100b8af1a43b9f89ac82a63671ae59e6a651
4
- data.tar.gz: ef084b0b7685198809718cfb9c6f417b3d9f3c2bdfc9a4032051e9e83c8b0a29
3
+ metadata.gz: ba232bc8a9c27bc9cfb2b7f87afc581acf9a5464df9fccf1b206d912704771fe
4
+ data.tar.gz: 608ca80f01e81d32a31b796364de14096c89556543fa2892efd86bbf86fac96a
5
5
  SHA512:
6
- metadata.gz: a5b3788792eae0002e82be795628c326fbe30e2c7aca1ca94c26577a1af30f298df27ae15e2112d7d088a8aa168c6ac353212efb5dcb1fdc63e50257260d23a0
7
- data.tar.gz: cbc4a96d4d5898d5a0d023a130fae13216fac42691e22825612054700c629a9474c1bb87c6ccdda5eebc9039e9bb9f7bea43b3b178dd8e6e097f321051da2442
6
+ metadata.gz: af38ca270cf3d12a8a7757ff5f32a41f5d68e43f9ad08c585df8efae835d014ddcac51201019aa339143cc081c5c69ef5914d8de06cb4326ccb2864b5b04eba2
7
+ data.tar.gz: 787bddc1c867d648ffe13c783aede98c62917feca536d1f8fe27e7992e5f2945126ad15b1a4e3ff48cc041723d299cf3cbacdd5557083983575eae22a9583cd8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
1
+ ## 0.9.0 (2024-10-17)
2
+
3
+ - Updated XGBoost to 2.1.1
4
+ - Added support for callbacks
5
+ - Added `num_features` and `save_config` methods to `Booster`
6
+ - Added `num_nonmissing` and `data_split_mode` methods to `DMatrix`
7
+ - Dropped support for Ruby < 3.1
8
+
9
+ ## 0.8.0 (2023-09-13)
10
+
11
+ - Updated XGBoost to 2.0.0
12
+ - Dropped support for Ruby < 3
13
+
1
14
  ## 0.7.3 (2023-07-24)
2
15
 
3
16
  - Fixed error with `dup` and `clone`
data/NOTICE.txt CHANGED
@@ -1,5 +1,5 @@
1
1
  Copyright XGBoost contributors
2
- Copyright 2019-2023 Andrew Kane
2
+ Copyright 2019-2024 Andrew Kane
3
3
 
4
4
  Licensed under the Apache License, Version 2.0 (the "License");
5
5
  you may not use this file except in compliance with the License.
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  [XGBoost](https://github.com/dmlc/xgboost) - high performance gradient boosting - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ruby-ml/xgboost-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ruby-ml/xgboost-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/xgboost-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/xgboost-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -173,21 +173,21 @@ Thanks to the [xgboost](https://github.com/PairOnAir/xgboost-ruby) gem for showi
173
173
 
174
174
  ## History
175
175
 
176
- View the [changelog](https://github.com/ruby-ml/xgboost-ruby/blob/master/CHANGELOG.md)
176
+ View the [changelog](https://github.com/ankane/xgboost-ruby/blob/master/CHANGELOG.md)
177
177
 
178
178
  ## Contributing
179
179
 
180
180
  Everyone is encouraged to help improve this project. Here are a few ways you can help:
181
181
 
182
- - [Report bugs](https://github.com/ruby-ml/xgboost-ruby/issues)
183
- - Fix bugs and [submit pull requests](https://github.com/ruby-ml/xgboost-ruby/pulls)
182
+ - [Report bugs](https://github.com/ankane/xgboost-ruby/issues)
183
+ - Fix bugs and [submit pull requests](https://github.com/ankane/xgboost-ruby/pulls)
184
184
  - Write, clarify, or fix documentation
185
185
  - Suggest or add new features
186
186
 
187
187
  To get started with development:
188
188
 
189
189
  ```sh
190
- git clone https://github.com/ruby-ml/xgboost-ruby.git
190
+ git clone https://github.com/ankane/xgboost-ruby.git
191
191
  cd xgboost-ruby
192
192
  bundle install
193
193
  bundle exec rake vendor:all
@@ -1,77 +1,155 @@
1
1
  module XGBoost
2
2
  class Booster
3
- attr_accessor :best_iteration, :feature_names, :feature_types
3
+ include Utils
4
+
5
+ def initialize(params: nil, cache: nil, model_file: nil)
6
+ cache ||= []
7
+ cache.each do |d|
8
+ if !d.is_a?(DMatrix)
9
+ raise TypeError, "invalid cache item: #{d.class.name}"
10
+ end
11
+ end
12
+
13
+ dmats = array_of_pointers(cache.map { |d| d.handle })
14
+ out = ::FFI::MemoryPointer.new(:pointer)
15
+ check_call FFI.XGBoosterCreate(dmats, cache.length, out)
16
+ @handle = ::FFI::AutoPointer.new(out.read_pointer, FFI.method(:XGBoosterFree))
4
17
 
5
- def initialize(params: nil, model_file: nil)
6
- @handle = ::FFI::MemoryPointer.new(:pointer)
7
- check_result FFI.XGBoosterCreate(nil, 0, @handle)
8
- ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))
18
+ cache.each do |d|
19
+ assign_dmatrix_features(d)
20
+ end
9
21
 
10
22
  if model_file
11
- check_result FFI.XGBoosterLoadModel(handle_pointer, model_file)
23
+ check_call FFI.XGBoosterLoadModel(handle, model_file)
12
24
  end
13
25
 
14
- self.best_iteration = 0
15
26
  set_param(params)
16
27
  end
17
28
 
18
- def self.finalize(addr)
19
- # must use proc instead of stabby lambda
20
- proc { FFI.XGBoosterFree(::FFI::Pointer.new(:pointer, addr)) }
29
+ def [](key_name)
30
+ if key_name.is_a?(String)
31
+ return attr(key_name)
32
+ end
33
+
34
+ # TODO slice
35
+
36
+ raise TypeError, "expected string"
21
37
  end
22
38
 
23
- def update(dtrain, iteration)
24
- check_result FFI.XGBoosterUpdateOneIter(handle_pointer, iteration, dtrain.handle_pointer)
39
+ def []=(key_name, raw_value)
40
+ set_attr(**{key_name => raw_value})
25
41
  end
26
42
 
27
- def eval_set(evals, iteration)
28
- dmats = array_of_pointers(evals.map { |v| v[0].handle_pointer })
29
- evnames = array_of_pointers(evals.map { |v| string_pointer(v[1]) })
43
+ def save_config
44
+ length = ::FFI::MemoryPointer.new(:uint64)
45
+ json_string = ::FFI::MemoryPointer.new(:pointer)
46
+ check_call FFI.XGBoosterSaveJsonConfig(handle, length, json_string)
47
+ json_string.read_pointer.read_string(length.read_uint64).force_encoding(Encoding::UTF_8)
48
+ end
30
49
 
31
- out_result = ::FFI::MemoryPointer.new(:pointer)
50
+ def attr(key)
51
+ ret = ::FFI::MemoryPointer.new(:pointer)
52
+ success = ::FFI::MemoryPointer.new(:int)
53
+ check_call FFI.XGBoosterGetAttr(handle, key.to_s, ret, success)
54
+ success.read_int != 0 ? ret.read_pointer.read_string : nil
55
+ end
32
56
 
33
- check_result FFI.XGBoosterEvalOneIter(handle_pointer, iteration, dmats, evnames, evals.size, out_result)
57
+ def attributes
58
+ length = ::FFI::MemoryPointer.new(:uint64)
59
+ sarr = ::FFI::MemoryPointer.new(:pointer)
60
+ check_call FFI.XGBoosterGetAttrNames(handle, length, sarr)
61
+ attr_names = from_cstr_to_rbstr(sarr, length)
62
+ attr_names.to_h { |n| [n, attr(n)] }
63
+ end
34
64
 
35
- out_result.read_pointer.read_string
65
+ def set_attr(**kwargs)
66
+ kwargs.each do |key, value|
67
+ check_call FFI.XGBoosterSetAttr(handle, key.to_s, value&.to_s)
68
+ end
69
+ end
70
+
71
+ def feature_types
72
+ get_feature_info("feature_type")
73
+ end
74
+
75
+ def feature_types=(features)
76
+ set_feature_info(features, "feature_type")
77
+ end
78
+
79
+ def feature_names
80
+ get_feature_info("feature_name")
81
+ end
82
+
83
+ def feature_names=(features)
84
+ set_feature_info(features, "feature_name")
36
85
  end
37
86
 
38
87
  def set_param(params, value = nil)
39
88
  if params.is_a?(Enumerable)
40
89
  params.each do |k, v|
41
- check_result FFI.XGBoosterSetParam(handle_pointer, k.to_s, v.to_s)
90
+ check_call FFI.XGBoosterSetParam(handle, k.to_s, v.to_s)
42
91
  end
43
92
  else
44
- check_result FFI.XGBoosterSetParam(handle_pointer, params.to_s, value.to_s)
93
+ check_call FFI.XGBoosterSetParam(handle, params.to_s, value.to_s)
45
94
  end
46
95
  end
47
96
 
97
+ def update(dtrain, iteration)
98
+ check_call FFI.XGBoosterUpdateOneIter(handle, iteration, dtrain.handle)
99
+ end
100
+
101
+ def eval_set(evals, iteration)
102
+ dmats = array_of_pointers(evals.map { |v| v[0].handle })
103
+ evnames = array_of_pointers(evals.map { |v| string_pointer(v[1]) })
104
+
105
+ out_result = ::FFI::MemoryPointer.new(:pointer)
106
+
107
+ check_call FFI.XGBoosterEvalOneIter(handle, iteration, dmats, evnames, evals.size, out_result)
108
+
109
+ out_result.read_pointer.read_string
110
+ end
111
+
48
112
  def predict(data, ntree_limit: nil)
49
113
  ntree_limit ||= 0
50
114
  out_len = ::FFI::MemoryPointer.new(:uint64)
51
115
  out_result = ::FFI::MemoryPointer.new(:pointer)
52
- check_result FFI.XGBoosterPredict(handle_pointer, data.handle_pointer, 0, ntree_limit, 0, out_len, out_result)
53
- out = out_result.read_pointer.read_array_of_float(read_uint64(out_len))
116
+ check_call FFI.XGBoosterPredict(handle, data.handle, 0, ntree_limit, 0, out_len, out_result)
117
+ out = out_result.read_pointer.read_array_of_float(out_len.read_uint64)
54
118
  num_class = out.size / data.num_row
55
119
  out = out.each_slice(num_class).to_a if num_class > 1
56
120
  out
57
121
  end
58
122
 
59
123
  def save_model(fname)
60
- check_result FFI.XGBoosterSaveModel(handle_pointer, fname)
124
+ check_call FFI.XGBoosterSaveModel(handle, fname)
61
125
  end
62
126
 
63
- # returns an array of strings
64
- def dump(fmap: "", with_stats: false, dump_format: "text")
65
- out_len = ::FFI::MemoryPointer.new(:uint64)
66
- out_result = ::FFI::MemoryPointer.new(:pointer)
127
+ def best_iteration
128
+ attr(:best_iteration)&.to_i
129
+ end
67
130
 
68
- names = feature_names || []
69
- fnames = array_of_pointers(names.map { |fname| string_pointer(fname) })
70
- ftypes = array_of_pointers(feature_types || Array.new(names.size, string_pointer("float")))
131
+ def best_iteration=(iteration)
132
+ set_attr(best_iteration: iteration)
133
+ end
134
+
135
+ def best_score
136
+ attr(:best_score)&.to_f
137
+ end
71
138
 
72
- check_result FFI.XGBoosterDumpModelExWithFeatures(handle_pointer, names.size, fnames, ftypes, with_stats ? 1 : 0, dump_format, out_len, out_result)
139
+ def best_score=(score)
140
+ set_attr(best_score: score)
141
+ end
142
+
143
+ def num_boosted_rounds
144
+ rounds = ::FFI::MemoryPointer.new(:int)
145
+ check_call FFI.XGBoosterBoostedRounds(handle, rounds)
146
+ rounds.read_int
147
+ end
73
148
 
74
- out_result.read_pointer.get_array_of_string(0, read_uint64(out_len))
149
+ def num_features
150
+ features = ::FFI::MemoryPointer.new(:uint64)
151
+ check_call FFI.XGBoosterGetNumFeature(handle, features)
152
+ features.read_uint64
75
153
  end
76
154
 
77
155
  def dump_model(fout, fmap: "", with_stats: false, dump_format: "text")
@@ -93,6 +171,20 @@ module XGBoost
93
171
  end
94
172
  end
95
173
 
174
+ # returns an array of strings
175
+ def dump(fmap: "", with_stats: false, dump_format: "text")
176
+ out_len = ::FFI::MemoryPointer.new(:uint64)
177
+ out_result = ::FFI::MemoryPointer.new(:pointer)
178
+
179
+ names = feature_names || []
180
+ fnames = array_of_pointers(names.map { |fname| string_pointer(fname) })
181
+ ftypes = array_of_pointers(feature_types || Array.new(names.size, string_pointer("float")))
182
+
183
+ check_call FFI.XGBoosterDumpModelExWithFeatures(handle, names.size, fnames, ftypes, with_stats ? 1 : 0, dump_format, out_len, out_result)
184
+
185
+ out_result.read_pointer.get_array_of_string(0, out_len.read_uint64)
186
+ end
187
+
96
188
  def fscore(fmap: "")
97
189
  # always weight
98
190
  score(fmap: fmap, importance_type: "weight")
@@ -157,48 +249,67 @@ module XGBoost
157
249
  end
158
250
  end
159
251
 
160
- def [](key_name)
161
- key = string_pointer(key_name)
162
- success = ::FFI::MemoryPointer.new(:int)
163
- out_result = ::FFI::MemoryPointer.new(:pointer)
164
-
165
- check_result FFI.XGBoosterGetAttr(handle_pointer, key, out_result, success)
166
-
167
- success.read_int == 1 ? out_result.read_pointer.read_string : nil
168
- end
169
-
170
- def []=(key_name, raw_value)
171
- key = string_pointer(key_name)
172
- value = raw_value.nil? ? nil : string_pointer(raw_value)
252
+ private
173
253
 
174
- check_result FFI.XGBoosterSetAttr(handle_pointer, key, value)
254
+ def handle
255
+ @handle
175
256
  end
176
257
 
177
- def attributes
178
- out_len = ::FFI::MemoryPointer.new(:uint64)
179
- out_result = ::FFI::MemoryPointer.new(:pointer)
180
- check_result FFI.XGBoosterGetAttrNames(handle_pointer, out_len, out_result)
181
-
182
- len = read_uint64(out_len)
183
- key_names = len.zero? ? [] : out_result.read_pointer.get_array_of_string(0, len)
184
-
185
- key_names.map { |key_name| [key_name, self[key_name]] }.to_h
186
- end
258
+ def assign_dmatrix_features(data)
259
+ if data.num_row == 0
260
+ return
261
+ end
187
262
 
188
- private
263
+ fn = data.feature_names
264
+ ft = data.feature_types
189
265
 
190
- def handle_pointer
191
- @handle.read_pointer
266
+ if feature_names.nil?
267
+ self.feature_names = fn
268
+ end
269
+ if feature_types.nil?
270
+ self.feature_types = ft
271
+ end
192
272
  end
193
273
 
194
- def array_of_pointers(values)
195
- ::FFI::MemoryPointer.new(:pointer, values.size).write_array_of_pointer(values)
274
+ def get_feature_info(field)
275
+ length = ::FFI::MemoryPointer.new(:uint64)
276
+ sarr = ::FFI::MemoryPointer.new(:pointer)
277
+ if @handle.nil?
278
+ return nil
279
+ end
280
+ check_call(
281
+ FFI.XGBoosterGetStrFeatureInfo(
282
+ handle,
283
+ field,
284
+ length,
285
+ sarr
286
+ )
287
+ )
288
+ feature_info = from_cstr_to_rbstr(sarr, length)
289
+ !feature_info.empty? ? feature_info : nil
196
290
  end
197
291
 
198
- def string_pointer(value)
199
- ::FFI::MemoryPointer.from_string(value.to_s)
292
+ def set_feature_info(features, field)
293
+ if !features.nil?
294
+ if !features.is_a?(Array)
295
+ raise TypeError, "features must be an array"
296
+ end
297
+ c_feature_info = array_of_pointers(features.map { |f| string_pointer(f) })
298
+ check_call(
299
+ FFI.XGBoosterSetStrFeatureInfo(
300
+ handle,
301
+ field,
302
+ c_feature_info,
303
+ features.length
304
+ )
305
+ )
306
+ else
307
+ check_call(
308
+ FFI.XGBoosterSetStrFeatureInfo(
309
+ handle, field, nil, 0
310
+ )
311
+ )
312
+ end
200
313
  end
201
-
202
- include Utils
203
314
  end
204
315
  end
@@ -0,0 +1,145 @@
module XGBoost
  # Runs a list of training callbacks and keeps the evaluation history that is
  # passed to them on every iteration.
  #
  # The history is a nested hash: data name => metric name => array of values.
  # In cross-validation mode each value is a `[mean, std]` pair; otherwise it
  # is the parsed metric value itself.
  class CallbackContainer
    attr_reader :aggregated_cv, :history

    # @param callbacks [Array<TrainingCallback>] callbacks to invoke, in order
    # @param is_cv [Boolean] whether the model being trained is a PackedBooster
    # @raise [TypeError] if any element is not a TrainingCallback
    def initialize(callbacks, is_cv: false)
      @callbacks = callbacks
      callbacks.each do |callback|
        unless callback.is_a?(TrainingCallback)
          raise TypeError, "callback must be an instance of XGBoost::TrainingCallback"
        end
      end

      @history = {}
      @is_cv = is_cv
    end

    # Passes the model through every callback's `before_training`.
    #
    # @return the model returned by the last callback (or the input model when
    #   there are no callbacks)
    # @raise [TypeError] if a callback does not return a model of the expected class
    def before_training(model)
      run_training_hook(:before_training, model)
    end

    # Passes the model through every callback's `after_training`.
    #
    # @return the model returned by the last callback (or the input model when
    #   there are no callbacks)
    # @raise [TypeError] if a callback does not return a model of the expected class
    def after_training(model)
      run_training_hook(:after_training, model)
    end

    # Calls `before_iteration` on the callbacks until one returns a truthy value.
    #
    # @return [Boolean] true if any callback returned a truthy value
    def before_iteration(model, epoch, dtrain, evals)
      @callbacks.any? { |callback| callback.before_iteration(model, epoch, @history) }
    end

    # Evaluates the model, appends the scores to the history, then calls
    # `after_iteration` on the callbacks until one returns a truthy value.
    #
    # @return [Boolean] true if any callback returned a truthy value
    # @raise [ArgumentError] if a dataset name in `evals` contains `-`
    def after_iteration(model, epoch, dtrain, evals)
      if @is_cv
        @aggregated_cv = aggcv(model.eval_set(epoch))
        update_history(@aggregated_cv, epoch)
      else
        evals ||= []
        evals.each do |_, name|
          # `-` separates the data name from the metric name in the eval string
          raise ArgumentError, "Dataset name should not contain `-`" if name.include?("-")
        end
        update_history(parse_eval_str(model.eval_set(evals, epoch)), epoch)
      end

      @callbacks.any? { |callback| callback.after_iteration(model, epoch, @history) }
    end

    private

    # Shared implementation of before_training / after_training: threads the
    # model through `hook` on each callback and type-checks every return value.
    def run_training_hook(hook, model)
      @callbacks.each do |callback|
        model = callback.public_send(hook, model)
        expected = @is_cv ? PackedBooster : Booster
        raise TypeError, "#{hook} should return the model" unless model.is_a?(expected)
      end
      model
    end

    # Appends each score to @history under its data name and metric name.
    # `score` entries are `[name, value]`, or `[name, mean, std]` in CV mode,
    # where `name` looks like `data-metric`.
    def update_history(score, epoch)
      score.each do |name, value, std|
        entry = @is_cv ? [value, std] : value
        # only the first `-` separates data name from metric name
        data_name, *metric_parts = name.split("-")
        data_history = (@history[data_name] ||= {})
        (data_history[metric_parts.join("-")] ||= []) << entry
      end
    end

    # TODO move
    # Parses an eval string such as "[0]\ttest-error:0.1234" into
    # `[["test-error", 0.1234]]`. The first whitespace-separated token is
    # skipped.
    def parse_eval_str(result)
      result.split.drop(1).map do |pair|
        name, value = pair.split(":")
        [name, value.to_f]
      end
    end

    # Aggregates per-fold eval strings into `[name, mean, std]` triples,
    # ordered by the position of the metric within the eval string.
    def aggcv(rlist)
      cvmap = {}
      rlist.each do |line|
        line.split.drop(1).each_with_index do |item, metric_idx|
          name, value = item.split(":")
          (cvmap[[metric_idx, name]] ||= []) << value.to_f
        end
      end

      # sort_by is not stable, so the insertion position is used as a tie-breaker
      ordered = cvmap.each_with_index.sort_by do |((metric_idx, _name), _values), position|
        [metric_idx, position]
      end
      ordered.map { |((_idx, name), values), _position| [name, mean(values), stdev(values)] }
    end

    def mean(arr)
      arr.sum / arr.size.to_f
    end

    # Population standard deviation: don't subtract one from arr.size
    def stdev(arr)
      m = mean(arr)
      Math.sqrt(arr.sum { |v| (v - m)**2 } / arr.size)
    end
  end
end
@@ -0,0 +1,26 @@
1
+ require "forwardable"
2
+
module XGBoost
  # Bundles one cross-validation fold: a booster together with the train and
  # test matrices it is updated on and evaluated against.
  class CVPack
    extend Forwardable

    # Forward the round count and best-iteration/score writers to the booster.
    %i[num_boosted_rounds best_iteration= best_score=].each do |delegated|
      def_delegator :@bst, delegated
    end

    # @param dtrain training matrix for this fold
    # @param dtest held-out matrix for this fold
    # @param param parameters passed to the booster as `params:`
    def initialize(dtrain, dtest, param)
      @dtrain = dtrain
      @dtest = dtest
      @watchlist = [
        [dtrain, "train"],
        [dtest, "test"]
      ]
      @bst = Booster.new(params: param, cache: [dtrain, dtest])
    end

    # The booster owned by this fold.
    def bst
      @bst
    end

    # Runs one boosting update on the fold's training matrix.
    def update(iteration)
      @bst.update(@dtrain, iteration)
    end

    # Evaluates the booster on the fold's train and test matrices.
    def eval_set(iteration)
      @bst.eval_set(@watchlist, iteration)
    end
  end
end
+ end