xgb 0.8.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE.txt +1 -1
- data/README.md +1 -1
- data/lib/xgboost/booster.rb +176 -65
- data/lib/xgboost/callback_container.rb +145 -0
- data/lib/xgboost/cv_pack.rb +26 -0
- data/lib/xgboost/dmatrix.rb +190 -78
- data/lib/xgboost/early_stopping.rb +132 -0
- data/lib/xgboost/evaluation_monitor.rb +44 -0
- data/lib/xgboost/ffi.rb +11 -2
- data/lib/xgboost/packed_booster.rb +51 -0
- data/lib/xgboost/training_callback.rb +23 -0
- data/lib/xgboost/utils.rb +19 -4
- data/lib/xgboost/version.rb +1 -1
- data/lib/xgboost.rb +107 -112
- data/vendor/aarch64-linux/libxgboost.so +0 -0
- data/vendor/arm64-darwin/libxgboost.dylib +0 -0
- data/vendor/x64-mingw/xgboost.dll +0 -0
- data/vendor/x86_64-darwin/libxgboost.dylib +0 -0
- data/vendor/x86_64-linux/libxgboost.so +0 -0
- data/vendor/x86_64-linux-musl/libxgboost.so +0 -0
- metadata +10 -10
- data/vendor/aarch64-linux/LICENSE-rabit.txt +0 -28
- data/vendor/arm64-darwin/LICENSE-rabit.txt +0 -28
- data/vendor/x64-mingw/LICENSE-rabit.txt +0 -28
- data/vendor/x86_64-darwin/LICENSE-rabit.txt +0 -28
- data/vendor/x86_64-linux/LICENSE-rabit.txt +0 -28
- data/vendor/x86_64-linux-musl/LICENSE-rabit.txt +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ba232bc8a9c27bc9cfb2b7f87afc581acf9a5464df9fccf1b206d912704771fe
|
4
|
+
data.tar.gz: 608ca80f01e81d32a31b796364de14096c89556543fa2892efd86bbf86fac96a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af38ca270cf3d12a8a7757ff5f32a41f5d68e43f9ad08c585df8efae835d014ddcac51201019aa339143cc081c5c69ef5914d8de06cb4326ccb2864b5b04eba2
|
7
|
+
data.tar.gz: 787bddc1c867d648ffe13c783aede98c62917feca536d1f8fe27e7992e5f2945126ad15b1a4e3ff48cc041723d299cf3cbacdd5557083983575eae22a9583cd8
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
## 0.9.0 (2024-10-17)
|
2
|
+
|
3
|
+
- Updated XGBoost to 2.1.1
|
4
|
+
- Added support for callbacks
|
5
|
+
- Added `num_features` and `save_config` methods to `Booster`
|
6
|
+
- Added `num_nonmissing` and `data_split_mode` methods to `DMatrix`
|
7
|
+
- Dropped support for Ruby < 3.1
|
8
|
+
|
1
9
|
## 0.8.0 (2023-09-13)
|
2
10
|
|
3
11
|
- Updated XGBoost to 2.0.0
|
data/NOTICE.txt
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[XGBoost](https://github.com/dmlc/xgboost) - high performance gradient boosting - for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://github.com/ankane/xgboost-ruby/workflows/build/badge.svg
|
5
|
+
[![Build Status](https://github.com/ankane/xgboost-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/xgboost-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/lib/xgboost/booster.rb
CHANGED
@@ -1,77 +1,155 @@
|
|
1
1
|
module XGBoost
|
2
2
|
class Booster
|
3
|
-
|
3
|
+
include Utils
|
4
|
+
|
5
|
+
# Creates the native booster handle, then optionally loads a model file and
# applies parameters.
#
# @param params [Object, nil] forwarded unchanged to set_param
# @param cache [Array<DMatrix>, nil] matrices passed to XGBoosterCreate; nil is treated as an empty list
# @param model_file [String, nil] when truthy, passed to XGBoosterLoadModel
# @raise [TypeError] if any cache element is not a DMatrix
def initialize(params: nil, cache: nil, model_file: nil)
  cache ||= []
  cache.each do |item|
    raise TypeError, "invalid cache item: #{item.class.name}" unless item.is_a?(DMatrix)
  end

  matrix_handles = array_of_pointers(cache.map { |item| item.handle })
  handle_out = ::FFI::MemoryPointer.new(:pointer)
  check_call FFI.XGBoosterCreate(matrix_handles, cache.length, handle_out)
  # AutoPointer is given XGBoosterFree as its release callback.
  @handle = ::FFI::AutoPointer.new(handle_out.read_pointer, FFI.method(:XGBoosterFree))

  cache.each { |item| assign_dmatrix_features(item) }

  check_call FFI.XGBoosterLoadModel(handle, model_file) if model_file

  set_param(params)
end
|
17
28
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
29
|
+
# Reads a booster attribute by name.
#
# @param key_name [String] attribute name, handed to attr
# @return [String, nil] the result of attr(key_name)
# @raise [TypeError] when key_name is not a String
def [](key_name)
  # TODO slice
  raise TypeError, "expected string" unless key_name.is_a?(String)

  attr(key_name)
end
|
22
38
|
|
23
|
-
def
|
24
|
-
|
39
|
+
# Sets one booster attribute by forwarding the key/value pair to set_attr.
#
# @param key_name [String, Symbol] attribute name
# @param raw_value [Object, nil] attribute value (set_attr decides how it is encoded)
def []=(key_name, raw_value)
  pair = {key_name => raw_value}
  set_attr(**pair)
end
|
26
42
|
|
27
|
-
def
|
28
|
-
|
29
|
-
|
43
|
+
# Fetches the booster configuration via XGBoosterSaveJsonConfig.
#
# @return [String] the returned bytes, tagged as UTF-8
def save_config
  len_ptr = ::FFI::MemoryPointer.new(:uint64)
  str_ptr = ::FFI::MemoryPointer.new(:pointer)
  check_call FFI.XGBoosterSaveJsonConfig(handle, len_ptr, str_ptr)

  # Read exactly the reported number of bytes from the returned buffer.
  config = str_ptr.read_pointer.read_string(len_ptr.read_uint64)
  config.force_encoding(Encoding::UTF_8)
end
|
30
49
|
|
31
|
-
|
50
|
+
# Looks up a single booster attribute with XGBoosterGetAttr.
#
# @param key [#to_s] attribute name
# @return [String, nil] the attribute string, or nil when the success flag is 0
def attr(key)
  value_ptr = ::FFI::MemoryPointer.new(:pointer)
  found_flag = ::FFI::MemoryPointer.new(:int)
  check_call FFI.XGBoosterGetAttr(handle, key.to_s, value_ptr, found_flag)
  return nil if found_flag.read_int == 0

  value_ptr.read_pointer.read_string
end
|
32
56
|
|
33
|
-
|
57
|
+
# Collects every booster attribute into a hash.
#
# Names come from XGBoosterGetAttrNames; each value is read with attr.
#
# @return [Hash] attribute name => attribute value
def attributes
  count_ptr = ::FFI::MemoryPointer.new(:uint64)
  names_ptr = ::FFI::MemoryPointer.new(:pointer)
  check_call FFI.XGBoosterGetAttrNames(handle, count_ptr, names_ptr)

  result = {}
  from_cstr_to_rbstr(names_ptr, count_ptr).each do |name|
    result[name] = attr(name)
  end
  result
end
|
34
64
|
|
35
|
-
|
65
|
+
# Sets booster attributes, one XGBoosterSetAttr call per keyword.
#
# Keys are stringified; values are stringified unless they are nil, in which
# case nil is passed through to the native call.
#
# @param kwargs [Hash] attribute name => value
def set_attr(**kwargs)
  kwargs.each_pair do |name, val|
    encoded = val.nil? ? nil : val.to_s
    check_call FFI.XGBoosterSetAttr(handle, name.to_s, encoded)
  end
end
|
70
|
+
|
71
|
+
# Feature types stored on the booster.
#
# @return [Object, nil] whatever get_feature_info returns for the "feature_type" field
def feature_types
  field = "feature_type"
  get_feature_info(field)
end
|
74
|
+
|
75
|
+
# Stores feature types on the booster under the "feature_type" field.
#
# @param features [Array, nil] passed unchanged to set_feature_info
def feature_types=(features)
  field = "feature_type"
  set_feature_info(features, field)
end
|
78
|
+
|
79
|
+
# Feature names stored on the booster.
#
# @return [Object, nil] whatever get_feature_info returns for the "feature_name" field
def feature_names
  field = "feature_name"
  get_feature_info(field)
end
|
82
|
+
|
83
|
+
# Stores feature names on the booster under the "feature_name" field.
#
# @param features [Array, nil] passed unchanged to set_feature_info
def feature_names=(features)
  field = "feature_name"
  set_feature_info(features, field)
end
|
37
86
|
|
38
87
|
# Sets one or many booster parameters through XGBoosterSetParam.
#
# Accepts either an Enumerable of key/value pairs (e.g. a Hash) or a single
# name plus value. Keys and values are converted with to_s.
#
# A nil `params` is a no-op: the constructor forwards its `params:` keyword
# (default nil) here, and without this guard the single-name branch would
# register a parameter whose name and value are both the empty string.
#
# @param params [Enumerable, #to_s, nil] pairs to set, or a single parameter name
# @param value [Object, nil] value used only when `params` is a single name
def set_param(params, value = nil)
  return if params.nil?

  if params.is_a?(Enumerable)
    params.each do |k, v|
      check_call FFI.XGBoosterSetParam(handle, k.to_s, v.to_s)
    end
  else
    check_call FFI.XGBoosterSetParam(handle, params.to_s, value.to_s)
  end
end
|
47
96
|
|
97
|
+
# Runs XGBoosterUpdateOneIter for the given iteration number.
#
# @param dtrain [DMatrix] training matrix; its handle is passed to the native call
# @param iteration [Integer] iteration number passed to the native call
def update(dtrain, iteration)
  train_handle = dtrain.handle
  check_call FFI.XGBoosterUpdateOneIter(handle, iteration, train_handle)
end
|
100
|
+
|
101
|
+
# Evaluates the booster on several matrices with XGBoosterEvalOneIter.
#
# @param evals [Array] entries where index 0 is a matrix (responds to handle)
#   and index 1 is the name reported for it
# @param iteration [Integer] iteration number passed to the native call
# @return [String] the result string written by the native call
def eval_set(evals, iteration)
  matrix_ptrs = array_of_pointers(evals.map { |entry| entry[0].handle })
  name_ptrs = array_of_pointers(evals.map { |entry| string_pointer(entry[1]) })
  result_ptr = ::FFI::MemoryPointer.new(:pointer)

  check_call FFI.XGBoosterEvalOneIter(handle, iteration, matrix_ptrs, name_ptrs, evals.size, result_ptr)

  result_ptr.read_pointer.read_string
end
|
111
|
+
|
48
112
|
# Predicts with XGBoosterPredict and shapes the flat float result.
#
# The number of values per row is derived as `out.size / data.num_row`; when
# that is greater than 1 the flat array is sliced into one sub-array per row.
#
# @param data [DMatrix] matrix to predict on (must respond to handle and num_row)
# @param ntree_limit [Integer, nil] passed to the native call; nil becomes 0
# @return [Array<Float>, Array<Array<Float>>] flat or per-row predictions
def predict(data, ntree_limit: nil)
  ntree_limit ||= 0
  out_len = ::FFI::MemoryPointer.new(:uint64)
  out_result = ::FFI::MemoryPointer.new(:pointer)
  check_call FFI.XGBoosterPredict(handle, data.handle, 0, ntree_limit, 0, out_len, out_result)
  out = out_result.read_pointer.read_array_of_float(out_len.read_uint64)

  # Guard the division below: a matrix with zero rows would otherwise raise
  # ZeroDivisionError instead of returning the (flat) result as-is.
  rows = data.num_row
  return out if rows == 0

  num_class = out.size / rows
  out = out.each_slice(num_class).to_a if num_class > 1
  out
end
|
58
122
|
|
59
123
|
# Saves the model by passing the path to XGBoosterSaveModel.
#
# @param fname [String] destination path handed to the native call
def save_model(fname)
  status = FFI.XGBoosterSaveModel(handle, fname)
  check_call status
end
|
62
126
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
out_result = ::FFI::MemoryPointer.new(:pointer)
|
127
|
+
# Reads the :best_iteration attribute.
#
# @return [Integer, nil] the attribute converted with to_i, or nil when attr returns nil
def best_iteration
  raw = attr(:best_iteration)
  raw&.to_i
end
|
67
130
|
|
68
|
-
|
69
|
-
|
70
|
-
|
131
|
+
# Stores the given value as the best_iteration attribute via set_attr.
#
# @param iteration [Object, nil] value to store
def best_iteration=(iteration)
  set_attr(**{best_iteration: iteration})
end
|
134
|
+
|
135
|
+
# Reads the :best_score attribute.
#
# @return [Float, nil] the attribute converted with to_f, or nil when attr returns nil
def best_score
  raw = attr(:best_score)
  raw&.to_f
end
|
71
138
|
|
72
|
-
|
139
|
+
# Stores the given value as the best_score attribute via set_attr.
#
# @param score [Object, nil] value to store
def best_score=(score)
  set_attr(**{best_score: score})
end
|
142
|
+
|
143
|
+
# Queries XGBoosterBoostedRounds.
#
# @return [Integer] the int written by the native call
def num_boosted_rounds
  count_ptr = ::FFI::MemoryPointer.new(:int)
  check_call FFI.XGBoosterBoostedRounds(handle, count_ptr)
  count_ptr.read_int
end
|
73
148
|
|
74
|
-
|
149
|
+
# Queries XGBoosterGetNumFeature.
#
# @return [Integer] the uint64 written by the native call
def num_features
  count_ptr = ::FFI::MemoryPointer.new(:uint64)
  check_call FFI.XGBoosterGetNumFeature(handle, count_ptr)
  count_ptr.read_uint64
end
|
76
154
|
|
77
155
|
def dump_model(fout, fmap: "", with_stats: false, dump_format: "text")
|
@@ -93,6 +171,20 @@ module XGBoost
|
|
93
171
|
end
|
94
172
|
end
|
95
173
|
|
174
|
+
# Dumps the model with XGBoosterDumpModelExWithFeatures.
#
# Feature names come from feature_names (empty list when nil). Feature types
# come from feature_types, falling back to "float" for every named feature.
# Both lists are converted element by element with string_pointer before being
# packed, so stored feature types are handled the same way as the fallback.
#
# @param fmap [String] accepted but not used by this method
# @param with_stats [Boolean] passed to the native call as 1 or 0
# @param dump_format [String] passed to the native call unchanged
# @return [Array<String>] returns an array of strings
def dump(fmap: "", with_stats: false, dump_format: "text")
  out_len = ::FFI::MemoryPointer.new(:uint64)
  out_result = ::FFI::MemoryPointer.new(:pointer)

  names = feature_names || []
  types = feature_types || Array.new(names.size, "float")
  fnames = array_of_pointers(names.map { |fname| string_pointer(fname) })
  ftypes = array_of_pointers(types.map { |ftype| string_pointer(ftype) })

  check_call FFI.XGBoosterDumpModelExWithFeatures(handle, names.size, fnames, ftypes, with_stats ? 1 : 0, dump_format, out_len, out_result)

  out_result.read_pointer.get_array_of_string(0, out_len.read_uint64)
end
|
187
|
+
|
96
188
|
def fscore(fmap: "")
|
97
189
|
# always weight
|
98
190
|
score(fmap: fmap, importance_type: "weight")
|
@@ -157,48 +249,67 @@ module XGBoost
|
|
157
249
|
end
|
158
250
|
end
|
159
251
|
|
160
|
-
|
161
|
-
key = string_pointer(key_name)
|
162
|
-
success = ::FFI::MemoryPointer.new(:int)
|
163
|
-
out_result = ::FFI::MemoryPointer.new(:pointer)
|
164
|
-
|
165
|
-
check_result FFI.XGBoosterGetAttr(handle_pointer, key, out_result, success)
|
166
|
-
|
167
|
-
success.read_int == 1 ? out_result.read_pointer.read_string : nil
|
168
|
-
end
|
169
|
-
|
170
|
-
def []=(key_name, raw_value)
|
171
|
-
key = string_pointer(key_name)
|
172
|
-
value = raw_value.nil? ? nil : string_pointer(raw_value)
|
252
|
+
private
|
173
253
|
|
174
|
-
|
254
|
+
# Native booster pointer stored in @handle (assigned in the constructor).
def handle
  @handle
end
|
176
257
|
|
177
|
-
def
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
len = read_uint64(out_len)
|
183
|
-
key_names = len.zero? ? [] : out_result.read_pointer.get_array_of_string(0, len)
|
184
|
-
|
185
|
-
key_names.map { |key_name| [key_name, self[key_name]] }.to_h
|
186
|
-
end
|
258
|
+
# Copies feature names and types from a matrix onto the booster, but only for
# whichever of the two the booster does not already have.
#
# Does nothing when the matrix reports zero rows.
#
# @param data [DMatrix] source of feature_names / feature_types
def assign_dmatrix_features(data)
  return if data.num_row == 0

  matrix_names = data.feature_names
  matrix_types = data.feature_types

  self.feature_names = matrix_names if feature_names.nil?
  self.feature_types = matrix_types if feature_types.nil?
end
|
193
273
|
|
194
|
-
def
|
195
|
-
::FFI::MemoryPointer.new(:
|
274
|
+
# Reads a string feature-info field with XGBoosterGetStrFeatureInfo.
#
# @param field [String] field name passed to the native call
# @return [Object, nil] the decoded list from from_cstr_to_rbstr, or nil when
#   there is no handle or the list is empty
def get_feature_info(field)
  count_ptr = ::FFI::MemoryPointer.new(:uint64)
  strings_ptr = ::FFI::MemoryPointer.new(:pointer)
  return nil if @handle.nil?

  check_call(FFI.XGBoosterGetStrFeatureInfo(handle, field, count_ptr, strings_ptr))

  info = from_cstr_to_rbstr(strings_ptr, count_ptr)
  info.empty? ? nil : info
end
|
197
291
|
|
198
|
-
def
|
199
|
-
|
292
|
+
# Writes a string feature-info field with XGBoosterSetStrFeatureInfo.
#
# A nil `features` sends a nil pointer with length 0; otherwise every element
# is converted with string_pointer and sent with the array length.
#
# @param features [Array, nil] values to store
# @param field [String] field name passed to the native call
# @raise [TypeError] when features is neither nil nor an Array
def set_feature_info(features, field)
  if features.nil?
    return check_call(FFI.XGBoosterSetStrFeatureInfo(handle, field, nil, 0))
  end

  raise TypeError, "features must be an array" unless features.is_a?(Array)

  packed = array_of_pointers(features.map { |feature| string_pointer(feature) })
  check_call(FFI.XGBoosterSetStrFeatureInfo(handle, field, packed, features.length))
end
|
201
|
-
|
202
|
-
include Utils
|
203
314
|
end
|
204
315
|
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
module XGBoost
  # Drives a list of training callbacks and keeps the evaluation history that
  # is handed to them.
  #
  # In cross-validation mode (`is_cv: true`) the model is expected to be a
  # PackedBooster and per-fold evaluation lines are aggregated into
  # [name, mean, std] triples; otherwise the model is expected to be a Booster
  # and a single evaluation string is parsed.
  class CallbackContainer
    attr_reader :aggregated_cv, :history

    # @param callbacks [Array<TrainingCallback>] callbacks to run, in order
    # @param is_cv [Boolean] whether the container is used for cross-validation
    # @raise [TypeError] if any element is not a TrainingCallback
    def initialize(callbacks, is_cv: false)
      @callbacks = callbacks
      callbacks.each do |callback|
        unless callback.is_a?(TrainingCallback)
          raise TypeError, "callback must be an instance of XGBoost::TrainingCallback"
        end
      end

      @history = {}
      @is_cv = is_cv
    end

    # Passes the model through every callback's before_training hook.
    #
    # @return the model returned by the last callback
    # @raise [TypeError] if a hook returns something other than the expected model class
    def before_training(model)
      @callbacks.each do |callback|
        model = callback.before_training(model)
        validate_model(model, "before_training")
      end
      model
    end

    # Passes the model through every callback's after_training hook.
    #
    # @return the model returned by the last callback
    # @raise [TypeError] if a hook returns something other than the expected model class
    def after_training(model)
      @callbacks.each do |callback|
        model = callback.after_training(model)
        validate_model(model, "after_training")
      end
      model
    end

    # Calls before_iteration on callbacks until one returns a truthy value.
    # `dtrain` and `evals` are accepted but not used here.
    #
    # @return [Boolean] true as soon as any callback returns truthy
    def before_iteration(model, epoch, dtrain, evals)
      @callbacks.any? do |callback|
        callback.before_iteration(model, epoch, @history)
      end
    end

    # Evaluates the model, records the scores in the history, then calls
    # after_iteration on callbacks until one returns a truthy value.
    #
    # @raise [ArgumentError] in non-CV mode when a dataset name contains "-"
    # @return [Boolean] true as soon as any callback returns truthy
    def after_iteration(model, epoch, dtrain, evals)
      if @is_cv
        scores = model.eval_set(epoch)
        scores = aggcv(scores)
        @aggregated_cv = scores
        update_history(scores, epoch)
      else
        evals ||= []
        evals.each do |_, name|
          # "-" separates the dataset name from the metric name in update_history
          if name.include?("-")
            raise ArgumentError, "Dataset name should not contain `-`"
          end
        end
        score = model.eval_set(evals, epoch)
        metric_score = parse_eval_str(score)
        update_history(metric_score, epoch)
      end

      @callbacks.any? do |callback|
        callback.after_iteration(model, epoch, @history)
      end
    end

    private

    # Raises unless the model has the class expected for the current mode.
    def validate_model(model, hook_name)
      expected = @is_cv ? PackedBooster : Booster
      raise TypeError, "#{hook_name} should return the model" unless model.is_a?(expected)
    end

    # Appends each score to history[data_name][metric_name]; the entry name is
    # split on the first "-". In CV mode the stored value is [score, std].
    def update_history(score, epoch)
      score.each do |d|
        name = d[0]
        value = @is_cv ? [d[1], d[2]] : d[1]
        parts = name.split("-")
        data_name = parts[0]
        metric_name = parts[1..].join("-")
        data_history = (@history[data_name] ||= {})
        (data_history[metric_name] ||= []) << value
      end
    end

    # TODO move
    # Parses "[0] test-error:0.1234 ..." into [["test-error", 0.1234], ...],
    # dropping the leading iteration token.
    def parse_eval_str(result)
      result.split[1..].map do |item|
        name, value = item.split(":")
        [name, value.to_f]
      end
    end

    # Aggregates per-fold evaluation lines into [name, mean, std] triples,
    # ordered by the metric's position within a line.
    def aggcv(rlist)
      cvmap = {}
      rlist.each do |line|
        line.split[1..].each_with_index do |item, metric_idx|
          name, value = item.split(":")
          (cvmap[[metric_idx, name]] ||= []) << value.to_f
        end
      end

      # sort_by with the insertion position as tie-breaker gives a stable sort
      # keyed on metric index (a one-argument block is a key, not a comparator).
      ordered = cvmap.sort_by.with_index do |((metric_idx, _name), _values), position|
        [metric_idx, position]
      end
      ordered.map { |(_, name), values| [name, mean(values), stdev(values)] }
    end

    def mean(arr)
      arr.sum / arr.size.to_f
    end

    # don't subtract one from arr.size
    def stdev(arr)
      m = mean(arr)
      squared = arr.inject(0) { |acc, v| acc + (v - m) ** 2 }
      Math.sqrt(squared / arr.size)
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "forwardable"
|
2
|
+
|
3
|
+
module XGBoost
  # Bundles one train/test matrix pair with a Booster built from them.
  #
  # num_boosted_rounds, best_iteration= and best_score= are delegated to the
  # wrapped booster.
  class CVPack
    extend Forwardable

    attr_reader :bst

    def_delegator :@bst, :num_boosted_rounds
    def_delegator :@bst, :best_iteration=
    def_delegator :@bst, :best_score=

    # @param dtrain [DMatrix] training matrix
    # @param dtest [DMatrix] test matrix
    # @param param [Object] passed to Booster.new as `params:`
    def initialize(dtrain, dtest, param)
      @dtrain = dtrain
      @dtest = dtest
      # Pairs evaluated by eval_set, labelled "train" and "test".
      @watchlist = [[dtrain, "train"], [dtest, "test"]]
      @bst = Booster.new(params: param, cache: [dtrain, dtest])
    end

    # Runs one booster update on the training matrix.
    def update(iteration)
      bst.update(@dtrain, iteration)
    end

    # Evaluates the booster on the train/test watchlist.
    def eval_set(iteration)
      bst.eval_set(@watchlist, iteration)
    end
  end
end
|