xgb 0.1.0 → 0.1.1
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +90 -6
- data/lib/xgb.rb +140 -2
- data/lib/xgb/booster.rb +112 -4
- data/lib/xgb/classifier.rb +68 -0
- data/lib/xgb/dmatrix.rb +91 -10
- data/lib/xgb/ffi.rb +8 -2
- data/lib/xgb/regressor.rb +39 -0
- data/lib/xgb/utils.rb +5 -1
- data/lib/xgb/version.rb +1 -1
- metadata +32 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1bb50395d579da91b18754bc75e780cbb2e98fd7a48a17c34514230d1c4828d1
+  data.tar.gz: 3d2f9c5a72c63c2622a973805c9f2caa9bd4de7b5c67f8c4b5445fd9a71993c3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f141b3ea0b6ceb8549198fd6ad8a07f6947201409478fc4829fe625da376e40d8028427a5aa34191b565aa275d27bb03e2082bb8fc489f6da6a2a09b3bbf2c2f
+  data.tar.gz: c393f4fdbe240ffc14b64f22f17d149ed393070fb0752f9ec49dd94bcfa88f446ea21bc5bf9a96bef7759c5c47033dd0480a4001d477b8c487cf5dcf8be19b81

data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+## 0.1.1
+
+- Added Scikit-Learn API
+- Added early stopping
+- Added `cv` method
+- Added support for Daru and Numo::NArray
+- Added many other methods
+- Fixed shape of multiclass predictions when loaded from file
+
 ## 0.1.0

 - First release

data/README.md
CHANGED
@@ -4,6 +4,8 @@

 :fire: Uses the C API for blazing performance

+[](https://travis-ci.org/ankane/xgb)
+
 ## Installation

 First, [install XGBoost](https://xgboost.readthedocs.io/en/latest/build.html). On Mac, copy `lib/libxgboost.dylib` to `/usr/local/lib`.
@@ -16,12 +18,16 @@ gem 'xgb'

 ## Getting Started

+This library follows the [Core Data Structure, Learning and Scikit-Learn APIs](https://xgboost.readthedocs.io/en/latest/python/python_api.html) of the Python library. Some methods and options are missing at the moment. PRs welcome!
+
+## Learning API
+
 Train a model

 ```ruby
 params = {objective: "reg:squarederror"}
-
-booster = Xgb.train(params,
+dtrain = Xgb::DMatrix.new(x_train, label: y_train)
+booster = Xgb.train(params, dtrain)
 ```

 Predict
@@ -33,18 +39,96 @@ booster.predict(x_test)
 Save the model to a file

 ```ruby
-booster.save_model("model
+booster.save_model("my.model")
 ```

 Load the model from a file

 ```ruby
-booster = Xgb::Booster.new(model_file: "model
+booster = Xgb::Booster.new(model_file: "my.model")
+```
+
+Get the importance of features
+
+```ruby
+booster.score
+```
+
+Early stopping
+
+```ruby
+Xgb.train(params, dtrain, evals: [[dtrain, "train"], [dtest, "eval"]], early_stopping_rounds: 5)
+```
+
+CV
+
+```ruby
+Xgb.cv(params, dtrain, nfold: 3, verbose_eval: true)
 ```

-##
+## Scikit-Learn API

-
+Prep your data
+
+```ruby
+x = [[1, 2], [3, 4], [5, 6], [7, 8]]
+y = [1, 2, 3, 4]
+```
+
+Train a model
+
+```ruby
+model = Xgb::Regressor.new
+model.fit(x, y)
+```
+
+> For classification, use `Xgb::Classifier`
+
+Predict
+
+```ruby
+model.predict(x)
+```
+
+> For classification, use `predict_proba` for probabilities
+
+Save the model to a file
+
+```ruby
+model.save_model("my.model")
+```
+
+Load the model from a file
+
+```ruby
+model.load_model("my.model")
+```
+
+Get the importance of features
+
+```ruby
+model.feature_importances
+```
+
+## Data
+
+Data can be an array of arrays
+
+```ruby
+[[1, 2, 3], [4, 5, 6]]
+```
+
+Or a Daru data frame
+
+```ruby
+Daru::DataFrame.from_csv("houses.csv")
+```
+
+Or a Numo NArray
+
+```ruby
+Numo::DFloat.new(3, 2).seq
+```

 ## Helpful Resources

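The Learning API snippets above are shown piecemeal; as a minimal end-to-end sketch (the toy `x_train`/`y_train` arrays here are illustrative, not from the README, and `predict` takes a `DMatrix`, per the Booster code below):

```ruby
require "xgb"

# illustrative toy data (assumption, not from the README)
x_train = [[1, 2], [3, 4], [5, 6], [7, 8]]
y_train = [1, 2, 3, 4]

params = {objective: "reg:squarederror"}
dtrain = Xgb::DMatrix.new(x_train, label: y_train)
booster = Xgb.train(params, dtrain)

# predict expects a DMatrix, so wrap test data the same way
booster.predict(Xgb::DMatrix.new([[9, 10]]))

booster.save_model("my.model")
booster = Xgb::Booster.new(model_file: "my.model")
```
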
data/lib/xgb.rb
CHANGED
@@ -8,19 +8,157 @@ require "xgb/dmatrix"
 require "xgb/ffi"
 require "xgb/version"

+# scikit-learn API
+require "xgb/classifier"
+require "xgb/regressor"
+
 module Xgb
   class Error < StandardError; end

   class << self
-    def train(params, dtrain, num_boost_round: 10)
+    def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
       booster = Booster.new(params: params)
-
+      num_feature = dtrain.num_col
+      booster.set_param("num_feature", num_feature)
+      booster.feature_names = num_feature.times.map { |i| "f#{i}" }
+      evals ||= []
+
+      if early_stopping_rounds
+        best_score = nil
+        best_iter = nil
+        best_message = nil
+      end

       num_boost_round.times do |iteration|
         booster.update(dtrain, iteration)
+
+        if evals.any?
+          message = booster.eval_set(evals, iteration)
+          res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
+
+          if early_stopping_rounds && iteration == 0
+            metric = res[-1][0]
+            puts "Will train until #{metric} hasn't improved in #{early_stopping_rounds.to_i} rounds." if verbose_eval
+          end
+
+          puts message if verbose_eval
+          score = res[-1][1]
+
+          # TODO handle larger better
+          if best_score.nil? || score < best_score
+            best_score = score
+            best_iter = iteration
+            best_message = message
+          elsif iteration - best_iter >= early_stopping_rounds
+            booster.best_iteration = best_iter
+            puts "Stopping. Best iteration:\n#{best_message}" if verbose_eval
+            break
+          end
+        end
       end

       booster
     end
+
+    def cv(params, dtrain, num_boost_round: 10, nfold: 3, seed: 0, shuffle: true, verbose_eval: nil, show_stdv: true, early_stopping_rounds: nil)
+      rand_idx = (0...dtrain.num_row).to_a
+      rand_idx.shuffle!(random: Random.new(seed)) if shuffle
+
+      kstep = (rand_idx.size / nfold.to_f).ceil
+      test_id = rand_idx.each_slice(kstep).to_a[0...nfold]
+      train_id = []
+      nfold.times do |i|
+        idx = test_id.dup
+        idx.delete_at(i)
+        train_id << idx.flatten
+      end
+
+      folds = train_id.zip(test_id)
+      cvfolds = []
+      folds.each do |(train_idx, test_idx)|
+        fold_dtrain = dtrain.slice(train_idx)
+        fold_dvalid = dtrain.slice(test_idx)
+        booster = Booster.new(params: params)
+        booster.set_param("num_feature", dtrain.num_col)
+        cvfolds << [booster, fold_dtrain, fold_dvalid]
+      end
+
+      eval_hist = {}
+
+      if early_stopping_rounds
+        best_score = nil
+        best_iter = nil
+      end
+
+      num_boost_round.times do |iteration|
+        scores = {}
+
+        cvfolds.each do |(booster, fold_dtrain, fold_dvalid)|
+          booster.update(fold_dtrain, iteration)
+          message = booster.eval_set([[fold_dtrain, "train"], [fold_dvalid, "test"]], iteration)
+
+          res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
+          res.each do |k, v|
+            (scores[k] ||= []) << v
+          end
+        end
+
+        message_parts = ["[#{iteration}]"]
+
+        last_mean = nil
+        means = {}
+        scores.each do |eval_name, vals|
+          mean = mean(vals)
+          stdev = stdev(vals)
+
+          (eval_hist["#{eval_name}-mean"] ||= []) << mean
+          (eval_hist["#{eval_name}-std"] ||= []) << stdev
+
+          means[eval_name] = mean
+          last_mean = mean
+
+          if show_stdv
+            message_parts << "%s:%g+%g" % [eval_name, mean, stdev]
+          else
+            message_parts << "%s:%g" % [eval_name, mean]
+          end
+        end
+
+        if early_stopping_rounds
+          score = last_mean
+          # TODO handle larger better
+          if best_score.nil? || score < best_score
+            best_score = score
+            best_iter = iteration
+          elsif iteration - best_iter >= early_stopping_rounds
+            eval_hist.each_key do |k|
+              eval_hist[k] = eval_hist[k][0..best_iter]
+            end
+            break
+          end
+        end
+
+        # put at end to keep output consistent with Python
+        puts message_parts.join("\t") if verbose_eval
+      end
+
+      eval_hist
+    end
+
+    private
+
+    def mean(arr)
+      arr.sum / arr.size.to_f
+    end
+
+    # don't subtract one from arr.size
+    def stdev(arr)
+      m = mean(arr)
+      sum = 0
+      arr.each do |v|
+        sum += (v - m) ** 2
+      end
+      Math.sqrt(sum / arr.size)
+    end
   end
 end

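Given the `cv` implementation above, the returned `eval_hist` is a hash of per-iteration fold means and standard deviations, keyed by `"#{eval_name}-mean"` and `"#{eval_name}-std"`. A sketch of calling it (the key names follow the code above; the values shown are illustrative):

```ruby
eval_hist = Xgb.cv({objective: "reg:squarederror"}, dtrain, nfold: 3, num_boost_round: 2)
# => {
#   "train-rmse-mean" => [0.48, 0.35],  # illustrative values
#   "train-rmse-std"  => [0.02, 0.01],
#   "test-rmse-mean"  => [0.52, 0.41],
#   "test-rmse-std"   => [0.05, 0.04]
# }
```
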
data/lib/xgb/booster.rb
CHANGED
@@ -1,5 +1,7 @@
 module Xgb
   class Booster
+    attr_accessor :best_iteration, :feature_names
+
     def initialize(params: nil, model_file: nil)
       @handle = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterCreate(nil, 0, @handle)
@@ -7,14 +9,28 @@ module Xgb
         check_result FFI.XGBoosterLoadModel(handle_pointer, model_file)
       end

+      self.best_iteration = 0
       set_param(params)
-      @num_class = (params && params[:num_class]) || 1
     end

     def update(dtrain, iteration)
       check_result FFI.XGBoosterUpdateOneIter(handle_pointer, iteration, dtrain.handle_pointer)
     end

+    def eval_set(evals, iteration)
+      dmats = ::FFI::MemoryPointer.new(:pointer, evals.size)
+      dmats.write_array_of_pointer(evals.map { |v| v[0].handle_pointer })
+
+      evnames = ::FFI::MemoryPointer.new(:pointer, evals.size)
+      evnames.write_array_of_pointer(evals.map { |v| ::FFI::MemoryPointer.from_string(v[1]) })
+
+      out_result = ::FFI::MemoryPointer.new(:pointer)
+
+      check_result FFI.XGBoosterEvalOneIter(handle_pointer, iteration, dmats, evnames, evals.size, out_result)
+
+      out_result.read_pointer.read_string
+    end
+
     def set_param(params, value = nil)
       if params.is_a?(Enumerable)
         params.each do |k, v|
@@ -27,11 +43,12 @@ module Xgb

     def predict(data, ntree_limit: nil)
       ntree_limit ||= 0
-      out_len = ::FFI::MemoryPointer.new(:
+      out_len = ::FFI::MemoryPointer.new(:ulong)
       out_result = ::FFI::MemoryPointer.new(:pointer)
       check_result FFI.XGBoosterPredict(handle_pointer, data.handle_pointer, 0, ntree_limit, out_len, out_result)
-      out = out_result.read_pointer.read_array_of_float(out_len.
-
+      out = out_result.read_pointer.read_array_of_float(out_len.read_ulong)
+      num_class = out.size / data.num_row
+      out = out.each_slice(num_class).to_a if num_class > 1
       out
     end

@@ -39,6 +56,97 @@ module Xgb
       check_result FFI.XGBoosterSaveModel(handle_pointer, fname)
     end

+    # returns an array of strings
+    def dump(fmap: "", with_stats: false, dump_format: "text")
+      out_len = ::FFI::MemoryPointer.new(:ulong)
+      out_result = ::FFI::MemoryPointer.new(:pointer)
+      check_result FFI.XGBoosterDumpModelEx(handle_pointer, fmap, with_stats ? 1 : 0, dump_format, out_len, out_result)
+      out_result.read_pointer.get_array_of_string(0, out_len.read_ulong)
+    end
+
+    def dump_model(fout, fmap: "", with_stats: false, dump_format: "text")
+      ret = dump(fmap: fmap, with_stats: with_stats, dump_format: dump_format)
+      File.open(fout, "wb") do |f|
+        if dump_format == "json"
+          f.print("[\n")
+          ret.each_with_index do |r, i|
+            f.print(r)
+            f.print(",\n") if i < ret.size - 1
+          end
+          f.print("\n]")
+        else
+          ret.each_with_index do |r, i|
+            f.print("booster[#{i}]:\n")
+            f.print(r)
+          end
+        end
+      end
+    end
+
+    def fscore(fmap: "")
+      # always weight
+      score(fmap: fmap, importance_type: "weight")
+    end
+
+    def score(fmap: "", importance_type: "weight")
+      if importance_type == "weight"
+        trees = dump(fmap: fmap, with_stats: false)
+        fmap = {}
+        trees.each do |tree|
+          tree.split("\n").each do |line|
+            arr = line.split("[")
+            next if arr.size == 1
+
+            fid = arr[1].split("]")[0].split("<")[0]
+            fmap[fid] ||= 0
+            fmap[fid] += 1
+          end
+        end
+        fmap
+      else
+        average_over_splits = true
+        if importance_type == "total_gain"
+          importance_type = "gain"
+          average_over_splits = false
+        elsif importance_type == "total_cover"
+          importance_type = "cover"
+          average_over_splits = false
+        end
+
+        trees = dump(fmap: fmap, with_stats: true)
+
+        importance_type += "="
+        fmap = {}
+        gmap = {}
+        trees.each do |tree|
+          tree.split("\n").each do |line|
+            arr = line.split("[")
+            next if arr.size == 1
+
+            fid = arr[1].split("]")
+
+            g = fid[1].split(importance_type)[1].split(",")[0].to_f
+
+            fid = fid[0].split("<")[0]
+
+            fmap[fid] ||= 0
+            gmap[fid] ||= 0
+
+            fmap[fid] += 1
+            gmap[fid] += g
+          end
+        end
+
+        if average_over_splits
+          gmap.each_key do |fid|
+            gmap[fid] = gmap[fid] / fmap[fid]
+          end
+        end
+
+        gmap
+      end
+    end
+
     private

     def handle_pointer

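A short sketch of the new importance methods, based on the `score`/`fscore` code above (the hash values are illustrative):

```ruby
booster.score                                  # split counts, e.g. {"f0" => 4, "f2" => 1}
booster.fscore                                 # same as score; always "weight"
booster.score(importance_type: "gain")         # average gain per split
booster.score(importance_type: "total_gain")   # total gain, not averaged
```
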
data/lib/xgb/classifier.rb
ADDED
@@ -0,0 +1,68 @@
+module Xgb
+  class Classifier
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "binary:logistic", importance_type: "gain")
+      @params = {
+        max_depth: max_depth,
+        objective: objective,
+        learning_rate: learning_rate
+      }
+      @n_estimators = n_estimators
+      @importance_type = importance_type
+    end
+
+    def fit(x, y)
+      n_classes = y.uniq.size
+
+      params = @params.dup
+      if n_classes > 2
+        params[:objective] = "multi:softprob"
+        params[:num_class] = n_classes
+      end
+
+      dtrain = DMatrix.new(x, label: y)
+      @booster = Xgb.train(params, dtrain, num_boost_round: @n_estimators)
+      nil
+    end
+
+    def predict(data)
+      dmat = DMatrix.new(data)
+      y_pred = @booster.predict(dmat)
+
+      if y_pred.first.is_a?(Array)
+        # multiple classes
+        y_pred.map do |v|
+          v.map.with_index.max_by { |v2, i| v2 }.last
+        end
+      else
+        y_pred.map { |v| v > 0.5 ? 1 : 0 }
+      end
+    end
+
+    def predict_proba(data)
+      dmat = DMatrix.new(data)
+      y_pred = @booster.predict(dmat)
+
+      if y_pred.first.is_a?(Array)
+        # multiple classes
+        y_pred
+      else
+        y_pred.map { |v| [1 - v, v] }
+      end
+    end
+
+    def save_model(fname)
+      @booster.save_model(fname)
+    end
+
+    def load_model(fname)
+      @booster = Booster.new(params: @params, model_file: fname)
+    end
+
+    def feature_importances
+      score = @booster.score(importance_type: @importance_type)
+      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
+      total = scores.sum.to_f
+      scores.map { |s| s / total }
+    end
+  end
+end

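A usage sketch for the new `Classifier` (toy data assumed; per `fit` above, more than two classes automatically switches the objective to `multi:softprob`):

```ruby
model = Xgb::Classifier.new
model.fit([[1, 2], [3, 4], [5, 6], [7, 8]], [0, 1, 0, 1])

model.predict([[2, 3]])        # class labels, thresholded at 0.5 for binary
model.predict_proba([[2, 3]])  # binary case returns [[1 - p, p]] per row
```
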
data/lib/xgb/dmatrix.rb
CHANGED
@@ -1,24 +1,72 @@
 module Xgb
   class DMatrix
-    attr_reader :data
+    attr_reader :data

     def initialize(data, label: nil, weight: nil, missing: Float::NAN)
       @data = data
-      @label = label
-      @weight = weight

-      c_data = ::FFI::MemoryPointer.new(:float, data.count * data.first.count)
-      c_data.put_array_of_float(0, data.flatten)
       @handle = ::FFI::MemoryPointer.new(:pointer)
-
+
+      if data
+        if matrix?(data)
+          nrow = data.row_count
+          ncol = data.column_count
+          flat_data = data.to_a.flatten
+        elsif daru?(data)
+          nrow, ncol = data.shape
+          flat_data = data.each_vector.map(&:to_a).flatten
+        elsif narray?(data)
+          nrow, ncol = data.shape
+          flat_data = data.flatten.to_a
+        else
+          nrow = data.count
+          ncol = data.first.count
+          flat_data = data.flatten
+        end
+
+        c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
+        c_data.put_array_of_float(0, flat_data)
+        check_result FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, @handle)
+      end

     set_float_info("label", label) if label
+      set_float_info("weight", weight) if weight
+    end
+
+    def label
+      float_info("label")
+    end
+
+    def weight
+      float_info("weight")
+    end
+
+    def num_row
+      out = ::FFI::MemoryPointer.new(:ulong)
+      check_result FFI.XGDMatrixNumRow(handle_pointer, out)
+      out.read_ulong
     end

     def num_col
-      out = ::FFI::MemoryPointer.new(:
-      FFI.XGDMatrixNumCol(handle_pointer, out)
-      out.
+      out = ::FFI::MemoryPointer.new(:ulong)
+      check_result FFI.XGDMatrixNumCol(handle_pointer, out)
+      out.read_ulong
+    end
+
+    def slice(rindex)
+      res = DMatrix.new(nil)
+      idxset = ::FFI::MemoryPointer.new(:int, rindex.count)
+      idxset.put_array_of_int(0, rindex)
+      check_result FFI.XGDMatrixSliceDMatrix(handle_pointer, idxset, rindex.size, res.handle)
+      res
+    end
+
+    def save_binary(fname, silent: true)
+      check_result FFI.XGDMatrixSaveBinary(handle_pointer, fname, silent ? 1 : 0)
+    end
+
+    def handle
+      @handle
     end

     def handle_pointer
@@ -28,11 +76,44 @@ module Xgb
     private

     def set_float_info(field, data)
-
+      data =
+        if matrix?(data)
+          data.to_a[0]
+        elsif daru_vector?(data) || narray?(data)
+          data.to_a
+        else
+          data
+        end
+
+      c_data = ::FFI::MemoryPointer.new(:float, data.size)
       c_data.put_array_of_float(0, data)
       check_result FFI.XGDMatrixSetFloatInfo(handle_pointer, field.to_s, c_data, data.size)
     end

+    def float_info(field)
+      num_row ||= num_row()
+      out_len = ::FFI::MemoryPointer.new(:int)
+      out_dptr = ::FFI::MemoryPointer.new(:float, num_row)
+      check_result FFI.XGDMatrixGetFloatInfo(handle_pointer, field, out_len, out_dptr)
+      out_dptr.read_pointer.read_array_of_float(num_row)
+    end
+
+    def matrix?(data)
+      defined?(Matrix) && data.is_a?(Matrix)
+    end
+
+    def daru?(data)
+      defined?(Daru::DataFrame) && data.is_a?(Daru::DataFrame)
+    end
+
+    def daru_vector?(data)
+      defined?(Daru::Vector) && data.is_a?(Daru::Vector)
+    end
+
+    def narray?(data)
+      defined?(Numo::NArray) && data.is_a?(Numo::NArray)
+    end
+
     include Utils
   end
 end

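A sketch of the new `slice` and label accessors (this is how `Xgb.cv` builds its folds; values illustrative):

```ruby
dmat = Xgb::DMatrix.new([[1, 2], [3, 4], [5, 6]], label: [1, 2, 3])
dmat.num_row               # => 3

fold = dmat.slice([0, 2])  # new DMatrix with rows 0 and 2
fold.num_row               # => 2
fold.label                 # labels read back as floats via XGDMatrixGetFloatInfo
```
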
data/lib/xgb/ffi.rb
CHANGED
@@ -10,16 +10,22 @@ module Xgb
     attach_function :XGBGetLastError, %i[], :string

     # dmatrix
-    attach_function :XGDMatrixCreateFromMat, %i[pointer
+    attach_function :XGDMatrixCreateFromMat, %i[pointer ulong ulong float pointer], :int
+    attach_function :XGDMatrixNumRow, %i[pointer pointer], :int
     attach_function :XGDMatrixNumCol, %i[pointer pointer], :int
-    attach_function :
+    attach_function :XGDMatrixSliceDMatrix, %i[pointer pointer ulong pointer], :int
+    attach_function :XGDMatrixSaveBinary, %i[pointer string int], :int
+    attach_function :XGDMatrixSetFloatInfo, %i[pointer string pointer ulong], :int
+    attach_function :XGDMatrixGetFloatInfo, %i[pointer string pointer pointer], :int

     # booster
     attach_function :XGBoosterCreate, %i[pointer int pointer], :int
     attach_function :XGBoosterUpdateOneIter, %i[pointer int pointer], :int
+    attach_function :XGBoosterEvalOneIter, %i[pointer int pointer pointer ulong pointer], :int
     attach_function :XGBoosterSetParam, %i[pointer string string], :int
     attach_function :XGBoosterPredict, %i[pointer pointer int int pointer pointer], :int
     attach_function :XGBoosterLoadModel, %i[pointer string], :int
     attach_function :XGBoosterSaveModel, %i[pointer string], :int
+    attach_function :XGBoosterDumpModelEx, %i[pointer string int string pointer pointer], :int
   end
 end

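Each binding returns an int status code that the rest of the gem wraps with `check_result`; a sketch of calling one directly, mirroring `DMatrix#num_row` above (`dmat` is assumed to be an `Xgb::DMatrix`):

```ruby
out = ::FFI::MemoryPointer.new(:ulong)
ret = Xgb::FFI.XGDMatrixNumRow(dmat.handle_pointer, out)
raise Xgb::Error, Xgb::FFI.XGBGetLastError if ret != 0
out.read_ulong
```
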
data/lib/xgb/regressor.rb
ADDED
@@ -0,0 +1,39 @@
+module Xgb
+  class Regressor
+    def initialize(max_depth: 3, learning_rate: 0.1, n_estimators: 100, objective: "reg:squarederror", importance_type: "gain")
+      @params = {
+        max_depth: max_depth,
+        objective: objective,
+        learning_rate: learning_rate
+      }
+      @n_estimators = n_estimators
+      @importance_type = importance_type
+    end
+
+    def fit(x, y)
+      dtrain = DMatrix.new(x, label: y)
+      @booster = Xgb.train(@params, dtrain, num_boost_round: @n_estimators)
+      nil
+    end
+
+    def predict(data)
+      dmat = DMatrix.new(data)
+      @booster.predict(dmat)
+    end
+
+    def save_model(fname)
+      @booster.save_model(fname)
+    end
+
+    def load_model(fname)
+      @booster = Booster.new(params: @params, model_file: fname)
+    end
+
+    def feature_importances
+      score = @booster.score(importance_type: @importance_type)
+      scores = @booster.feature_names.map { |k| score[k] || 0.0 }
+      total = scores.sum.to_f
+      scores.map { |s| s / total }
+    end
+  end
+end

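Per `feature_importances` above, scores are looked up by the booster's generated feature names (`f0`, `f1`, ...) and divided by their total, so the returned floats sum to 1. A sketch (toy data assumed):

```ruby
model = Xgb::Regressor.new(n_estimators: 10)
model.fit([[1, 2], [3, 4], [5, 6], [7, 8]], [1, 2, 3, 4])
model.feature_importances  # e.g. [0.7, 0.3] (one entry per feature, summing to 1)
```
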
data/lib/xgb/utils.rb
CHANGED
@@ -3,7 +3,11 @@ module Xgb
     private

     def check_result(err)
-
+      if err != 0
+        # make friendly
+        message = FFI.XGBGetLastError.split("\n").first.split(/:\d+: /, 2).last
+        raise Xgb::Error, message
+      end
     end
   end
 end

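The new `check_result` trims the `file:line:` prefix that XGBoost puts on its error strings before raising. A sketch with a hypothetical raw message (the exact raw format is an assumption):

```ruby
raw = "[10:15:30] src/learner.cc:123: Unknown objective: reg:wrong"
raw.split("\n").first.split(/:\d+: /, 2).last
# => "Unknown objective: reg:wrong"
```
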
data/lib/xgb/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: xgb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-08-
+date: 2019-08-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ffi
@@ -66,6 +66,34 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '5'
+- !ruby/object:Gem::Dependency
+  name: daru
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: numo-narray
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description:
 email: andrew@chartkick.com
 executables: []
@@ -76,8 +104,10 @@ files:
 - README.md
 - lib/xgb.rb
 - lib/xgb/booster.rb
+- lib/xgb/classifier.rb
 - lib/xgb/dmatrix.rb
 - lib/xgb/ffi.rb
+- lib/xgb/regressor.rb
 - lib/xgb/utils.rb
 - lib/xgb/version.rb
 homepage: https://github.com/ankane/xgb
|