xgb 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,77 +1,70 @@
1
1
  module XGBoost
2
2
  class DMatrix
3
- attr_reader :data, :feature_names, :feature_types
3
+ include Utils
4
+
5
+ attr_reader :handle
4
6
 
5
7
  def initialize(data, label: nil, weight: nil, missing: Float::NAN)
6
- @data = data
7
-
8
- @handle = ::FFI::MemoryPointer.new(:pointer)
9
-
10
- if data
11
- if matrix?(data)
12
- nrow = data.row_count
13
- ncol = data.column_count
14
- flat_data = data.to_a.flatten
15
- elsif daru?(data)
16
- nrow, ncol = data.shape
17
- flat_data = data.map_rows(&:to_a).flatten
18
- @feature_names = data.each_vector.map(&:name)
19
- @feature_types =
20
- data.each_vector.map(&:db_type).map do |v|
21
- case v
22
- when "INTEGER"
23
- "int"
24
- when "DOUBLE"
25
- "float"
26
- else
27
- raise Error, "Unknown feature type: #{v}"
28
- end
8
+ if data.is_a?(::FFI::AutoPointer)
9
+ @handle = data
10
+ return
11
+ end
12
+
13
+ if matrix?(data)
14
+ nrow = data.row_count
15
+ ncol = data.column_count
16
+ flat_data = data.to_a.flatten
17
+ elsif daru?(data)
18
+ nrow, ncol = data.shape
19
+ flat_data = data.map_rows(&:to_a).flatten
20
+ feature_names = data.each_vector.map(&:name)
21
+ feature_types =
22
+ data.each_vector.map(&:db_type).map do |v|
23
+ case v
24
+ when "INTEGER"
25
+ "int"
26
+ when "DOUBLE"
27
+ "float"
28
+ else
29
+ raise Error, "Unknown feature type: #{v}"
29
30
  end
30
- elsif numo?(data)
31
- nrow, ncol = data.shape
32
- elsif rover?(data)
33
- nrow, ncol = data.shape
34
- @feature_names = data.keys
35
- data = data.to_numo
36
- else
37
- nrow = data.count
38
- ncol = data.first.count
39
- if !data.all? { |r| r.size == ncol }
40
- # TODO raise ArgumentError in 0.8.0
41
- raise IndexError, "Rows have different sizes"
42
31
  end
43
- flat_data = data.flatten
32
+ elsif numo?(data)
33
+ nrow, ncol = data.shape
34
+ elsif rover?(data)
35
+ nrow, ncol = data.shape
36
+ feature_names = data.keys
37
+ data = data.to_numo
38
+ else
39
+ nrow = data.count
40
+ ncol = data.first.count
41
+ if !data.all? { |r| r.size == ncol }
42
+ raise ArgumentError, "Rows have different sizes"
44
43
  end
44
+ flat_data = data.flatten
45
+ end
45
46
 
46
- c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
47
- if numo?(data)
48
- c_data.write_bytes(data.cast_to(Numo::SFloat).to_string)
49
- else
50
- handle_missing(flat_data, missing)
51
- c_data.write_array_of_float(flat_data)
52
- end
53
- check_result FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, @handle)
47
+ c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
48
+ if numo?(data)
49
+ c_data.write_bytes(data.cast_to(Numo::SFloat).to_string)
50
+ else
51
+ handle_missing(flat_data, missing)
52
+ c_data.write_array_of_float(flat_data)
53
+ end
54
54
 
55
- ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))
55
+ out = ::FFI::MemoryPointer.new(:pointer)
56
+ check_call FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, out)
57
+ @handle = ::FFI::AutoPointer.new(out.read_pointer, FFI.method(:XGDMatrixFree))
56
58
 
57
- @feature_names ||= ncol.times.map { |i| "f#{i}" }
58
- end
59
+ self.feature_names = feature_names || ncol.times.map { |i| "f#{i}" }
60
+ self.feature_types = feature_types if feature_types
59
61
 
60
62
  self.label = label if label
61
63
  self.weight = weight if weight
62
64
  end
63
65
 
64
- def self.finalize(addr)
65
- # must use proc instead of stabby lambda
66
- proc { FFI.XGDMatrixFree(::FFI::Pointer.new(:pointer, addr)) }
67
- end
68
-
69
- def label
70
- float_info("label")
71
- end
72
-
73
- def weight
74
- float_info("weight")
66
+ def save_binary(fname, silent: true)
67
+ check_call FFI.XGDMatrixSaveBinary(handle, fname, silent ? 1 : 0)
75
68
  end
76
69
 
77
70
  def label=(label)
@@ -85,39 +78,146 @@ module XGBoost
85
78
  def group=(group)
86
79
  c_data = ::FFI::MemoryPointer.new(:int, group.size)
87
80
  c_data.write_array_of_int(group)
88
- check_result FFI.XGDMatrixSetUIntInfo(handle_pointer, "group", c_data, group.size)
81
+ check_call FFI.XGDMatrixSetUIntInfo(handle, "group", c_data, group.size)
82
+ end
83
+
84
+ def label
85
+ float_info("label")
86
+ end
87
+
88
+ def weight
89
+ float_info("weight")
89
90
  end
90
91
 
91
92
  def num_row
92
93
  out = ::FFI::MemoryPointer.new(:uint64)
93
- check_result FFI.XGDMatrixNumRow(handle_pointer, out)
94
- read_uint64(out)
94
+ check_call FFI.XGDMatrixNumRow(handle, out)
95
+ out.read_uint64
95
96
  end
96
97
 
97
98
  def num_col
98
99
  out = ::FFI::MemoryPointer.new(:uint64)
99
- check_result FFI.XGDMatrixNumCol(handle_pointer, out)
100
- read_uint64(out)
100
+ check_call FFI.XGDMatrixNumCol(handle, out)
101
+ out.read_uint64
102
+ end
103
+
104
+ def num_nonmissing
105
+ out = ::FFI::MemoryPointer.new(:uint64)
106
+ check_call FFI.XGDMatrixNumNonMissing(handle, out)
107
+ out.read_uint64
108
+ end
109
+
110
+ def data_split_mode
111
+ out = ::FFI::MemoryPointer.new(:uint64)
112
+ check_call FFI.XGDMatrixDataSplitMode(handle, out)
113
+ out.read_uint64 == 0 ? :row : :col
101
114
  end
102
115
 
103
116
  def slice(rindex)
104
- res = DMatrix.new(nil)
105
117
  idxset = ::FFI::MemoryPointer.new(:int, rindex.count)
106
118
  idxset.write_array_of_int(rindex)
107
- check_result FFI.XGDMatrixSliceDMatrix(handle_pointer, idxset, rindex.size, res.handle)
108
- res
119
+ out = ::FFI::MemoryPointer.new(:pointer)
120
+ check_call FFI.XGDMatrixSliceDMatrix(handle, idxset, rindex.size, out)
121
+
122
+ handle = ::FFI::AutoPointer.new(out.read_pointer, FFI.method(:XGDMatrixFree))
123
+ DMatrix.new(handle)
109
124
  end
110
125
 
111
- def save_binary(fname, silent: true)
112
- check_result FFI.XGDMatrixSaveBinary(handle_pointer, fname, silent ? 1 : 0)
126
+ def feature_names
127
+ length = ::FFI::MemoryPointer.new(:uint64)
128
+ sarr = ::FFI::MemoryPointer.new(:pointer)
129
+ check_call(
130
+ FFI.XGDMatrixGetStrFeatureInfo(
131
+ handle,
132
+ "feature_name",
133
+ length,
134
+ sarr
135
+ )
136
+ )
137
+ feature_names = from_cstr_to_rbstr(sarr, length)
138
+ feature_names.empty? ? nil : feature_names
113
139
  end
114
140
 
115
- def handle
116
- @handle
141
+ def feature_names=(feature_names)
142
+ if feature_names.nil?
143
+ check_call(
144
+ FFI.XGDMatrixSetStrFeatureInfo(
145
+ handle, "feature_name", nil, 0
146
+ )
147
+ )
148
+ return
149
+ end
150
+
151
+ # validate feature name
152
+ feature_names =
153
+ validate_feature_info(
154
+ feature_names,
155
+ num_col,
156
+ data_split_mode == :col,
157
+ "feature names"
158
+ )
159
+ if feature_names.length != feature_names.uniq.length
160
+ raise ArgumentError, "feature_names must be unique"
161
+ end
162
+
163
+ # prohibit the use symbols that may affect parsing. e.g. []<
164
+ if !feature_names.all? { |f| f.is_a?(String) && !["[", "]", "<"].any? { |x| f.include?(x) } }
165
+ raise ArgumentError, "feature_names must be string, and may not contain [, ] or <"
166
+ end
167
+
168
+ c_feature_names = array_of_pointers(feature_names.map { |f| string_pointer(f) })
169
+ check_call(
170
+ FFI.XGDMatrixSetStrFeatureInfo(
171
+ handle,
172
+ "feature_name",
173
+ c_feature_names,
174
+ feature_names.length
175
+ )
176
+ )
177
+ end
178
+
179
+ def feature_types
180
+ length = ::FFI::MemoryPointer.new(:uint64)
181
+ sarr = ::FFI::MemoryPointer.new(:pointer)
182
+ check_call(
183
+ FFI.XGDMatrixGetStrFeatureInfo(
184
+ handle,
185
+ "feature_type",
186
+ length,
187
+ sarr
188
+ )
189
+ )
190
+ res = from_cstr_to_rbstr(sarr, length)
191
+ res.empty? ? nil : res
117
192
  end
118
193
 
119
- def handle_pointer
120
- @handle.read_pointer
194
+ def feature_types=(feature_types)
195
+ if feature_types.nil?
196
+ check_call(
197
+ FFI.XGDMatrixSetStrFeatureInfo(
198
+ handle, "feature_type", nil, 0
199
+ )
200
+ )
201
+ return
202
+ end
203
+
204
+ feature_types =
205
+ validate_feature_info(
206
+ feature_types,
207
+ num_col,
208
+ data_split_mode == :col,
209
+ "feature types"
210
+ )
211
+
212
+ c_feature_types = array_of_pointers(feature_types.map { |f| string_pointer(f) })
213
+ check_call(
214
+ FFI.XGDMatrixSetStrFeatureInfo(
215
+ handle,
216
+ "feature_type",
217
+ c_feature_types,
218
+ feature_types.length
219
+ )
220
+ )
121
221
  end
122
222
 
123
223
  private
@@ -126,17 +226,31 @@ module XGBoost
126
226
  data = data.to_a unless data.is_a?(Array)
127
227
  c_data = ::FFI::MemoryPointer.new(:float, data.size)
128
228
  c_data.write_array_of_float(data)
129
- check_result FFI.XGDMatrixSetFloatInfo(handle_pointer, field.to_s, c_data, data.size)
229
+ check_call FFI.XGDMatrixSetFloatInfo(handle, field.to_s, c_data, data.size)
130
230
  end
131
231
 
132
232
  def float_info(field)
133
233
  num_row ||= num_row()
134
- out_len = ::FFI::MemoryPointer.new(:int)
234
+ out_len = ::FFI::MemoryPointer.new(:uint64)
135
235
  out_dptr = ::FFI::MemoryPointer.new(:float, num_row)
136
- check_result FFI.XGDMatrixGetFloatInfo(handle_pointer, field, out_len, out_dptr)
236
+ check_call FFI.XGDMatrixGetFloatInfo(handle, field, out_len, out_dptr)
137
237
  out_dptr.read_pointer.read_array_of_float(num_row)
138
238
  end
139
239
 
240
+ def validate_feature_info(feature_info, n_features, is_column_split, name)
241
+ if !feature_info.is_a?(Array)
242
+ raise TypeError, "Expecting an array of strings for #{name}, got: #{feature_info.class.name}"
243
+ end
244
+ if feature_info.length != n_features && n_features != 0 && !is_column_split
245
+ msg = (
246
+ "#{name} must have the same length as the number of data columns, " +
247
+ "expected #{n_features}, got #{feature_info.length}"
248
+ )
249
+ raise ArgumentError, msg
250
+ end
251
+ feature_info
252
+ end
253
+
140
254
  def matrix?(data)
141
255
  defined?(Matrix) && data.is_a?(Matrix)
142
256
  end
@@ -156,7 +270,5 @@ module XGBoost
156
270
  def handle_missing(data, missing)
157
271
  data.map! { |v| v.nil? ? missing : v }
158
272
  end
159
-
160
- include Utils
161
273
  end
162
274
  end
@@ -0,0 +1,132 @@
1
+ module XGBoost
2
+ class EarlyStopping < TrainingCallback
3
+ def initialize(
4
+ rounds:,
5
+ metric_name: nil,
6
+ data_name: nil,
7
+ maximize: nil,
8
+ save_best: false,
9
+ min_delta: 0.0
10
+ )
11
+ @data = data_name
12
+ @metric_name = metric_name
13
+ @rounds = rounds
14
+ @save_best = save_best
15
+ @maximize = maximize
16
+ @stopping_history = {}
17
+ @min_delta = min_delta
18
+ if @min_delta < 0
19
+ raise ArgumentError, "min_delta must be greater or equal to 0."
20
+ end
21
+
22
+ @current_rounds = 0
23
+ @best_scores = {}
24
+ @starting_round = 0
25
+ super()
26
+ end
27
+
28
+ def before_training(model)
29
+ @starting_round = model.num_boosted_rounds
30
+ model
31
+ end
32
+
33
+ def after_iteration(model, epoch, evals_log)
34
+ epoch += @starting_round
35
+ msg = "Must have at least 1 validation dataset for early stopping."
36
+ if evals_log.keys.length < 1
37
+ raise ArgumentError, msg
38
+ end
39
+
40
+ # Get data name
41
+ if @data
42
+ data_name = @data
43
+ else
44
+ # Use the last one as default.
45
+ data_name = evals_log.keys[-1]
46
+ end
47
+ if !evals_log.include?(data_name)
48
+ raise ArgumentError, "No dataset named: #{data_name}"
49
+ end
50
+
51
+ if !data_name.is_a?(String)
52
+ raise TypeError, "The name of the dataset should be a string. Got: #{data_name.class.name}"
53
+ end
54
+ data_log = evals_log[data_name]
55
+
56
+ # Get metric name
57
+ if @metric_name
58
+ metric_name = @metric_name
59
+ else
60
+ # Use last metric by default.
61
+ metric_name = data_log.keys[-1]
62
+ end
63
+ if !data_log.include?(metric_name)
64
+ raise ArgumentError, "No metric named: #{metric_name}"
65
+ end
66
+
67
+ # The latest score
68
+ score = data_log[metric_name][-1]
69
+ update_rounds(
70
+ score, data_name, metric_name, model, epoch
71
+ )
72
+ end
73
+
74
+ def after_training(model)
75
+ if !@save_best
76
+ return model
77
+ end
78
+
79
+ best_iteration = model.best_iteration
80
+ best_score = model.best_score
81
+ # model = model[..(best_iteration + 1)]
82
+ model.best_iteration = best_iteration
83
+ model.best_score = best_score
84
+ model
85
+ end
86
+
87
+ private
88
+
89
+ def update_rounds(score, name, metric, model, epoch)
90
+ get_s = lambda do |value|
91
+ value.is_a?(Array) ? value[0] : value
92
+ end
93
+
94
+ maximize = lambda do |new_, best|
95
+ get_s.(new_) - @min_delta > get_s.(best)
96
+ end
97
+
98
+ minimize = lambda do |new_, best|
99
+ get_s.(best) - @min_delta > get_s.(new_)
100
+ end
101
+
102
+ improve_op = @maximize ? maximize : minimize
103
+
104
+ if @stopping_history.empty?
105
+ # First round
106
+ @current_rounds = 0
107
+ @stopping_history[name] = {}
108
+ @stopping_history[name][metric] = [score]
109
+ @best_scores[name] = {}
110
+ @best_scores[name][metric] = [score]
111
+ model.set_attr(best_score: get_s.(score), best_iteration: epoch)
112
+ elsif !improve_op.(score, @best_scores[name][metric][-1])
113
+ # Not improved
114
+ @stopping_history[name][metric] << score
115
+ @current_rounds += 1
116
+ else
117
+ # Improved
118
+ @stopping_history[name][metric] << score
119
+ @best_scores[name][metric] << score
120
+ record = @stopping_history[name][metric][-1]
121
+ model.set_attr(best_score: get_s.(record), best_iteration: epoch)
122
+ @current_rounds = 0
123
+ end
124
+
125
+ if @current_rounds >= @rounds
126
+ # Should stop
127
+ return true
128
+ end
129
+ false
130
+ end
131
+ end
132
+ end
@@ -0,0 +1,44 @@
1
+ module XGBoost
2
+ class EvaluationMonitor < TrainingCallback
3
+ def initialize(period:, show_stdv: false)
4
+ @show_stdv = show_stdv
5
+ @period = period
6
+ end
7
+
8
+ def after_iteration(model, epoch, evals_log)
9
+ if evals_log.empty?
10
+ return false
11
+ end
12
+
13
+ msg = "[#{epoch}]"
14
+ evals_log.each do |data, metric|
15
+ metric.each do |metric_name, log|
16
+ stdv = nil
17
+ if log[-1].is_a?(Array)
18
+ score = log[-1][0]
19
+ stdv = log[-1][1]
20
+ else
21
+ score = log[-1]
22
+ end
23
+ msg += fmt_metric(data, metric_name, score, stdv)
24
+ end
25
+ end
26
+ msg += "\n"
27
+
28
+ if epoch % @period == 0
29
+ puts msg
30
+ end
31
+ false
32
+ end
33
+
34
+ private
35
+
36
+ def fmt_metric(data, metric, score, std)
37
+ if !std.nil? && @show_stdv
38
+ "\t%s:%.5f+%.5f" % [data + "-" + metric, score, std]
39
+ else
40
+ "\t%s:%.5f" % [data + "-" + metric, score]
41
+ end
42
+ end
43
+ end
44
+ end
data/lib/xgboost/ffi.rb CHANGED
@@ -22,8 +22,12 @@ module XGBoost
22
22
  # dmatrix
23
23
  attach_function :XGDMatrixCreateFromMat, %i[pointer uint64 uint64 float pointer], :int
24
24
  attach_function :XGDMatrixSetUIntInfo, %i[pointer string pointer uint64], :int
25
+ attach_function :XGDMatrixSetStrFeatureInfo, %i[pointer string pointer uint64], :int
26
+ attach_function :XGDMatrixGetStrFeatureInfo, %i[pointer string pointer pointer], :int
25
27
  attach_function :XGDMatrixNumRow, %i[pointer pointer], :int
26
28
  attach_function :XGDMatrixNumCol, %i[pointer pointer], :int
29
+ attach_function :XGDMatrixNumNonMissing, %i[pointer pointer], :int
30
+ attach_function :XGDMatrixDataSplitMode, %i[pointer pointer], :int
27
31
  attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer uint64 pointer], :int
28
32
  attach_function :XGDMatrixFree, %i[pointer], :int
29
33
  attach_function :XGDMatrixSaveBinary, %i[pointer string int], :int
@@ -35,13 +39,18 @@ module XGBoost
35
39
  attach_function :XGBoosterUpdateOneIter, %i[pointer int pointer], :int
36
40
  attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer uint64 pointer], :int
37
41
  attach_function :XGBoosterFree, %i[pointer], :int
42
+ attach_function :XGBoosterBoostedRounds, %i[pointer pointer], :int
38
43
  attach_function :XGBoosterSetParam, %i[pointer string string], :int
44
+ attach_function :XGBoosterGetNumFeature, %i[pointer pointer], :int
39
45
  attach_function :XGBoosterPredict, %i[pointer pointer int int int pointer pointer], :int
40
46
  attach_function :XGBoosterLoadModel, %i[pointer string], :int
41
47
  attach_function :XGBoosterSaveModel, %i[pointer string], :int
48
+ attach_function :XGBoosterSaveJsonConfig, %i[pointer pointer pointer], :int
42
49
  attach_function :XGBoosterDumpModelExWithFeatures, %i[pointer int pointer pointer int string pointer pointer], :int
43
- attach_function :XGBoosterGetAttr, %i[pointer pointer pointer pointer], :int
44
- attach_function :XGBoosterSetAttr, %i[pointer pointer pointer], :int
50
+ attach_function :XGBoosterGetAttr, %i[pointer string pointer pointer], :int
51
+ attach_function :XGBoosterSetAttr, %i[pointer string string], :int
45
52
  attach_function :XGBoosterGetAttrNames, %i[pointer pointer pointer], :int
53
+ attach_function :XGBoosterSetStrFeatureInfo, %i[pointer string pointer uint64], :int
54
+ attach_function :XGBoosterGetStrFeatureInfo, %i[pointer string pointer pointer], :int
46
55
  end
47
56
  end
@@ -0,0 +1,51 @@
1
+ module XGBoost
2
+ class PackedBooster
3
+ def initialize(cvfolds)
4
+ @cvfolds = cvfolds
5
+ end
6
+
7
+ def update(iteration)
8
+ @cvfolds.each do |fold|
9
+ fold.update(iteration)
10
+ end
11
+ end
12
+
13
+ def set_attr(**kwargs)
14
+ @cvfolds.each do |f|
15
+ f.bst.set_attr(**kwargs)
16
+ end
17
+ end
18
+
19
+ def attr(key)
20
+ @cvfolds[0].bst.attr(key)
21
+ end
22
+
23
+ def eval_set(iteration)
24
+ @cvfolds.map { |f| f.eval_set(iteration) }
25
+ end
26
+
27
+ def best_iteration
28
+ @cvfolds[0].bst.best_iteration
29
+ end
30
+
31
+ def best_iteration=(iteration)
32
+ @cvfolds.each do |fold|
33
+ fold.best_iteration = iteration
34
+ end
35
+ end
36
+
37
+ def best_score
38
+ @cvfolds[0].bst.best_score
39
+ end
40
+
41
+ def best_score=(score)
42
+ @cvfolds.each do |fold|
43
+ fold.best_score = score
44
+ end
45
+ end
46
+
47
+ def num_boosted_rounds
48
+ @cvfolds[0].num_boosted_rounds
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,23 @@
1
+ module XGBoost
2
+ class TrainingCallback
3
+ def before_training(model)
4
+ # Run before training starts
5
+ model
6
+ end
7
+
8
+ def after_training(model)
9
+ # Run after training is finished
10
+ model
11
+ end
12
+
13
+ def before_iteration(model, epoch, evals_log)
14
+ # Run before each iteration. Returns true when training should stop.
15
+ false
16
+ end
17
+
18
+ def after_iteration(model, epoch, evals_log)
19
+ # Run after each iteration. Returns true when training should stop.
20
+ false
21
+ end
22
+ end
23
+ end
data/lib/xgboost/utils.rb CHANGED
@@ -2,7 +2,7 @@ module XGBoost
2
2
  module Utils
3
3
  private
4
4
 
5
- def check_result(err)
5
+ def check_call(err)
6
6
  if err != 0
7
7
  # make friendly
8
8
  message = FFI.XGBGetLastError.split("\n").first.split(/:\d+: /, 2).last
@@ -10,9 +10,24 @@ module XGBoost
10
10
  end
11
11
  end
12
12
 
13
- # read_uint64 not available on JRuby
14
- def read_uint64(ptr)
15
- ptr.read_array_of_uint64(1).first
13
+ def array_of_pointers(values)
14
+ arr = ::FFI::MemoryPointer.new(:pointer, values.size)
15
+ arr.write_array_of_pointer(values)
16
+ # keep reference for string pointers
17
+ arr.instance_variable_set(:@xgboost_ref, values)
18
+ arr
19
+ end
20
+
21
+ def string_pointer(value)
22
+ ::FFI::MemoryPointer.from_string(value.to_s)
23
+ end
24
+
25
+ def from_cstr_to_rbstr(data, length)
26
+ res = []
27
+ length.read_uint64.times do |i|
28
+ res << data.read_pointer[i * ::FFI::Pointer.size].read_pointer.read_string.force_encoding(Encoding::UTF_8)
29
+ end
30
+ res
16
31
  end
17
32
  end
18
33
  end
@@ -1,3 +1,3 @@
1
1
  module XGBoost
2
- VERSION = "0.8.0"
2
+ VERSION = "0.9.0"
3
3
  end