lightgbm 0.3.4 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/LICENSE.txt +1 -1
- data/lib/lightgbm/booster.rb +125 -82
- data/lib/lightgbm/dataset.rb +55 -33
- data/lib/lightgbm/ffi.rb +15 -0
- data/lib/lightgbm/inner_predictor.rb +159 -0
- data/lib/lightgbm/model.rb +1 -1
- data/lib/lightgbm/utils.rb +9 -2
- data/lib/lightgbm/version.rb +1 -1
- data/lib/lightgbm.rb +4 -0
- metadata +5 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4fadfa7ea250cf7c48f076effb5ce8f5db3cf0c8ab87bb04f2033457a502721a
|
4
|
+
data.tar.gz: 3af4cac369a3c684bdb387036845eca04747c14c8e12c9b38625e5c38130de74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21fb7ae25e1f085cd3642bb02ce32f8378f5d6013f6e8504b86586c86dfaf5c29b12d83b10e3bfd747a3dbfc996eb8473c12313ca5e2f4302554b0a6c40261e3
|
7
|
+
data.tar.gz: 75d7b3cea373adedbe8a6cc7c9f0b47f473ac0e77126779efd2f5ab1899596f4a8a926e1d97b033e5cb6e43b0c0f3706e8e7eb9f280d29354ba18792e2f4078b
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.4.0 (2025-01-05)
|
2
|
+
|
3
|
+
- Added support for different prediction types
|
4
|
+
- Added support for `pandas_categorical` to `predict` method
|
5
|
+
- Added support for hashes and Rover data frames to `predict` method
|
6
|
+
- Added support for hashes to `Dataset`
|
7
|
+
- Added `importance_type` option to `dump_model`, `model_to_string`, and `save_model` methods
|
8
|
+
- Changed `Dataset` to use column names for feature names with Rover and Daru
|
9
|
+
- Changed `predict` method to match feature names with Daru
|
10
|
+
- Dropped support for Ruby < 3.1
|
11
|
+
|
1
12
|
## 0.3.4 (2024-07-28)
|
2
13
|
|
3
14
|
- Updated LightGBM to 4.5.0
|
data/LICENSE.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
The MIT License (MIT)
|
2
2
|
|
3
3
|
Copyright (c) Microsoft Corporation
|
4
|
-
Copyright (c) 2019-
|
4
|
+
Copyright (c) 2019-2025 Andrew Kane
|
5
5
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/lib/lightgbm/booster.rb
CHANGED
@@ -1,20 +1,29 @@
|
|
1
1
|
module LightGBM
|
2
2
|
class Booster
|
3
|
-
|
3
|
+
include Utils
|
4
|
+
|
5
|
+
attr_accessor :best_iteration, :train_data_name, :params
|
4
6
|
|
5
7
|
def initialize(params: nil, train_set: nil, model_file: nil, model_str: nil)
|
6
|
-
@handle = ::FFI::MemoryPointer.new(:pointer)
|
7
8
|
if model_str
|
8
9
|
model_from_string(model_str)
|
9
10
|
elsif model_file
|
10
11
|
out_num_iterations = ::FFI::MemoryPointer.new(:int)
|
11
|
-
|
12
|
+
create_handle do |handle|
|
13
|
+
safe_call FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, handle)
|
14
|
+
end
|
15
|
+
@pandas_categorical = load_pandas_categorical(file_name: model_file)
|
16
|
+
if params
|
17
|
+
warn "[lightgbm] Ignoring params argument, using parameters from model file."
|
18
|
+
end
|
19
|
+
@params = loaded_param
|
12
20
|
else
|
13
21
|
params ||= {}
|
14
22
|
set_verbosity(params)
|
15
|
-
|
23
|
+
create_handle do |handle|
|
24
|
+
safe_call FFI.LGBM_BoosterCreate(train_set.handle, params_str(params), handle)
|
25
|
+
end
|
16
26
|
end
|
17
|
-
ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))
|
18
27
|
|
19
28
|
self.best_iteration = -1
|
20
29
|
|
@@ -23,28 +32,28 @@ module LightGBM
|
|
23
32
|
end
|
24
33
|
|
25
34
|
def add_valid(data, name)
|
26
|
-
|
35
|
+
safe_call FFI.LGBM_BoosterAddValidData(@handle, data.handle)
|
27
36
|
@name_valid_sets << name
|
28
37
|
self # consistent with Python API
|
29
38
|
end
|
30
39
|
|
31
40
|
def current_iteration
|
32
41
|
out = ::FFI::MemoryPointer.new(:int)
|
33
|
-
|
42
|
+
safe_call FFI.LGBM_BoosterGetCurrentIteration(@handle, out)
|
34
43
|
out.read_int
|
35
44
|
end
|
36
45
|
|
37
|
-
def dump_model(num_iteration: nil, start_iteration: 0)
|
46
|
+
def dump_model(num_iteration: nil, start_iteration: 0, importance_type: "split")
|
38
47
|
num_iteration ||= best_iteration
|
48
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
39
49
|
buffer_len = 1 << 20
|
40
50
|
out_len = ::FFI::MemoryPointer.new(:int64)
|
41
51
|
out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
|
42
|
-
|
43
|
-
|
44
|
-
actual_len = read_int64(out_len)
|
52
|
+
safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
|
53
|
+
actual_len = out_len.read_int64
|
45
54
|
if actual_len > buffer_len
|
46
55
|
out_str = ::FFI::MemoryPointer.new(:char, actual_len)
|
47
|
-
|
56
|
+
safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
|
48
57
|
end
|
49
58
|
out_str.read_string
|
50
59
|
end
|
@@ -60,19 +69,10 @@ module LightGBM
|
|
60
69
|
|
61
70
|
def feature_importance(iteration: nil, importance_type: "split")
|
62
71
|
iteration ||= best_iteration
|
63
|
-
|
64
|
-
case importance_type
|
65
|
-
when "split"
|
66
|
-
0
|
67
|
-
when "gain"
|
68
|
-
1
|
69
|
-
else
|
70
|
-
-1
|
71
|
-
end
|
72
|
-
|
72
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
73
73
|
num_feature = self.num_feature
|
74
74
|
out_result = ::FFI::MemoryPointer.new(:double, num_feature)
|
75
|
-
|
75
|
+
safe_call FFI.LGBM_BoosterFeatureImportance(@handle, iteration, importance_type_int, out_result)
|
76
76
|
out_result.read_array_of_double(num_feature).map(&:to_i)
|
77
77
|
end
|
78
78
|
|
@@ -84,13 +84,13 @@ module LightGBM
|
|
84
84
|
out_strs = ::FFI::MemoryPointer.new(:pointer, num_feature)
|
85
85
|
str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
|
86
86
|
out_strs.write_array_of_pointer(str_ptrs)
|
87
|
-
|
87
|
+
safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, buffer_len, out_buffer_len, out_strs)
|
88
88
|
|
89
89
|
actual_len = out_buffer_len.read(:size_t)
|
90
90
|
if actual_len > buffer_len
|
91
91
|
str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
|
92
92
|
out_strs.write_array_of_pointer(str_ptrs)
|
93
|
-
|
93
|
+
safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, actual_len, out_buffer_len, out_strs)
|
94
94
|
end
|
95
95
|
|
96
96
|
str_ptrs[0, out_len.read(:size_t)].map(&:read_string)
|
@@ -98,130 +98,122 @@ module LightGBM
|
|
98
98
|
|
99
99
|
def model_from_string(model_str)
|
100
100
|
out_num_iterations = ::FFI::MemoryPointer.new(:int)
|
101
|
-
|
101
|
+
create_handle do |handle|
|
102
|
+
safe_call FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, handle)
|
103
|
+
end
|
104
|
+
@pandas_categorical = load_pandas_categorical(model_str: model_str)
|
105
|
+
@params = loaded_param
|
106
|
+
@cached_feature_name = nil
|
102
107
|
self
|
103
108
|
end
|
104
109
|
|
105
|
-
def model_to_string(num_iteration: nil, start_iteration: 0)
|
110
|
+
def model_to_string(num_iteration: nil, start_iteration: 0, importance_type: "split")
|
106
111
|
num_iteration ||= best_iteration
|
112
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
107
113
|
buffer_len = 1 << 20
|
108
114
|
out_len = ::FFI::MemoryPointer.new(:int64)
|
109
115
|
out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
|
110
|
-
|
111
|
-
|
112
|
-
actual_len = read_int64(out_len)
|
116
|
+
safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
|
117
|
+
actual_len = out_len.read_int64
|
113
118
|
if actual_len > buffer_len
|
114
119
|
out_str = ::FFI::MemoryPointer.new(:char, actual_len)
|
115
|
-
|
120
|
+
safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
|
116
121
|
end
|
117
122
|
out_str.read_string
|
118
123
|
end
|
119
124
|
|
120
125
|
def num_feature
|
121
126
|
out = ::FFI::MemoryPointer.new(:int)
|
122
|
-
|
127
|
+
safe_call FFI.LGBM_BoosterGetNumFeature(@handle, out)
|
123
128
|
out.read_int
|
124
129
|
end
|
125
130
|
alias_method :num_features, :num_feature # legacy typo
|
126
131
|
|
127
132
|
def num_model_per_iteration
|
128
133
|
out = ::FFI::MemoryPointer.new(:int)
|
129
|
-
|
134
|
+
safe_call FFI.LGBM_BoosterNumModelPerIteration(@handle, out)
|
130
135
|
out.read_int
|
131
136
|
end
|
132
137
|
|
133
138
|
def num_trees
|
134
139
|
out = ::FFI::MemoryPointer.new(:int)
|
135
|
-
|
140
|
+
safe_call FFI.LGBM_BoosterNumberOfTotalModel(@handle, out)
|
136
141
|
out.read_int
|
137
142
|
end
|
138
143
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
if
|
143
|
-
|
144
|
+
def predict(data, start_iteration: 0, num_iteration: nil, raw_score: false, pred_leaf: false, pred_contrib: false, **kwargs)
|
145
|
+
predictor = InnerPredictor.from_booster(self, kwargs.transform_values(&:dup))
|
146
|
+
if num_iteration.nil?
|
147
|
+
if start_iteration <= 0
|
148
|
+
num_iteration = best_iteration
|
144
149
|
else
|
145
|
-
|
150
|
+
num_iteration = -1
|
146
151
|
end
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
handle_missing(flat_input)
|
157
|
-
data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
|
158
|
-
data.write_array_of_double(flat_input)
|
159
|
-
|
160
|
-
out_len = ::FFI::MemoryPointer.new(:int64)
|
161
|
-
out_result = ::FFI::MemoryPointer.new(:double, num_class * input.count)
|
162
|
-
check_result FFI.LGBM_BoosterPredictForMat(handle_pointer, data, 1, input.count, input.first.count, 1, 0, start_iteration, num_iteration, params_str(params), out_len, out_result)
|
163
|
-
out = out_result.read_array_of_double(read_int64(out_len))
|
164
|
-
out = out.each_slice(num_class).to_a if num_class > 1
|
165
|
-
|
166
|
-
singular ? out.first : out
|
152
|
+
end
|
153
|
+
predictor.predict(
|
154
|
+
data,
|
155
|
+
start_iteration: start_iteration,
|
156
|
+
num_iteration: num_iteration,
|
157
|
+
raw_score: raw_score,
|
158
|
+
pred_leaf: pred_leaf,
|
159
|
+
pred_contrib: pred_contrib
|
160
|
+
)
|
167
161
|
end
|
168
162
|
|
169
|
-
def save_model(filename, num_iteration: nil, start_iteration: 0)
|
163
|
+
def save_model(filename, num_iteration: nil, start_iteration: 0, importance_type: "split")
|
170
164
|
num_iteration ||= best_iteration
|
171
|
-
|
172
|
-
|
165
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
166
|
+
safe_call FFI.LGBM_BoosterSaveModel(@handle, start_iteration, num_iteration, importance_type_int, filename)
|
173
167
|
self # consistent with Python API
|
174
168
|
end
|
175
169
|
|
176
170
|
def update
|
177
171
|
finished = ::FFI::MemoryPointer.new(:int)
|
178
|
-
|
172
|
+
safe_call FFI.LGBM_BoosterUpdateOneIter(@handle, finished)
|
179
173
|
finished.read_int == 1
|
180
174
|
end
|
181
175
|
|
182
|
-
def self.finalize(addr)
|
183
|
-
# must use proc instead of stabby lambda
|
184
|
-
proc { FFI.LGBM_BoosterFree(::FFI::Pointer.new(:pointer, addr)) }
|
185
|
-
end
|
186
|
-
|
187
176
|
private
|
188
177
|
|
189
|
-
def
|
190
|
-
|
178
|
+
def create_handle
|
179
|
+
::FFI::MemoryPointer.new(:pointer) do |handle|
|
180
|
+
yield handle
|
181
|
+
@handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_BoosterFree))
|
182
|
+
end
|
191
183
|
end
|
192
184
|
|
193
185
|
def eval_counts
|
194
186
|
out = ::FFI::MemoryPointer.new(:int)
|
195
|
-
|
187
|
+
safe_call FFI.LGBM_BoosterGetEvalCounts(@handle, out)
|
196
188
|
out.read_int
|
197
189
|
end
|
198
190
|
|
199
191
|
def eval_names
|
200
|
-
eval_counts
|
192
|
+
eval_counts = self.eval_counts
|
201
193
|
out_len = ::FFI::MemoryPointer.new(:int)
|
202
194
|
out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
|
203
195
|
out_strs = ::FFI::MemoryPointer.new(:pointer, eval_counts)
|
204
196
|
buffer_len = 255
|
205
197
|
str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
|
206
198
|
out_strs.write_array_of_pointer(str_ptrs)
|
207
|
-
|
199
|
+
safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, buffer_len, out_buffer_len, out_strs)
|
208
200
|
|
209
201
|
actual_len = out_buffer_len.read(:size_t)
|
210
202
|
if actual_len > buffer_len
|
211
203
|
str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
|
212
204
|
out_strs.write_array_of_pointer(str_ptrs)
|
213
|
-
|
205
|
+
safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, actual_len, out_buffer_len, out_strs)
|
214
206
|
end
|
215
207
|
|
216
208
|
str_ptrs.map(&:read_string)
|
217
209
|
end
|
218
210
|
|
219
211
|
def inner_eval(name, i)
|
220
|
-
eval_names
|
212
|
+
eval_names = self.eval_names
|
221
213
|
|
222
214
|
out_len = ::FFI::MemoryPointer.new(:int)
|
223
215
|
out_results = ::FFI::MemoryPointer.new(:double, eval_names.count)
|
224
|
-
|
216
|
+
safe_call FFI.LGBM_BoosterGetEval(@handle, i, out_len, out_results)
|
225
217
|
vals = out_results.read_array_of_double(out_len.read_int)
|
226
218
|
|
227
219
|
eval_names.zip(vals).map do |eval_name, val|
|
@@ -232,15 +224,66 @@ module LightGBM
|
|
232
224
|
|
233
225
|
def num_class
|
234
226
|
out = ::FFI::MemoryPointer.new(:int)
|
235
|
-
|
227
|
+
safe_call FFI.LGBM_BoosterGetNumClasses(@handle, out)
|
236
228
|
out.read_int
|
237
229
|
end
|
238
230
|
|
239
|
-
|
240
|
-
|
241
|
-
ptr.read_array_of_int64(1).first
|
231
|
+
def cached_feature_name
|
232
|
+
@cached_feature_name ||= feature_name
|
242
233
|
end
|
243
234
|
|
244
|
-
|
235
|
+
def feature_importance_type_mapper(importance_type)
|
236
|
+
case importance_type
|
237
|
+
when "split"
|
238
|
+
FFI::C_API_FEATURE_IMPORTANCE_SPLIT
|
239
|
+
when "gain"
|
240
|
+
FFI::C_API_FEATURE_IMPORTANCE_GAIN
|
241
|
+
else
|
242
|
+
-1
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
246
|
+
def load_pandas_categorical(file_name: nil, model_str: nil)
|
247
|
+
pandas_key = "pandas_categorical:"
|
248
|
+
offset = -pandas_key.length
|
249
|
+
if !file_name.nil?
|
250
|
+
max_offset = -File.size(file_name)
|
251
|
+
lines = []
|
252
|
+
File.open(file_name, "rb") do |f|
|
253
|
+
loop do
|
254
|
+
offset = [offset, max_offset].max
|
255
|
+
f.seek(offset, IO::SEEK_END)
|
256
|
+
lines = f.readlines
|
257
|
+
if lines.length >= 2 || offset == max_offset
|
258
|
+
break
|
259
|
+
end
|
260
|
+
offset *= 2
|
261
|
+
end
|
262
|
+
end
|
263
|
+
last_line = lines[-1].strip
|
264
|
+
if !last_line.start_with?(pandas_key)
|
265
|
+
last_line = lines[-2].strip
|
266
|
+
end
|
267
|
+
elsif !model_str.nil?
|
268
|
+
idx = model_str[..offset].rindex("\n")
|
269
|
+
last_line = model_str[idx..].strip
|
270
|
+
end
|
271
|
+
if last_line.start_with?(pandas_key)
|
272
|
+
JSON.parse(last_line[pandas_key.length..])
|
273
|
+
end
|
274
|
+
end
|
275
|
+
|
276
|
+
def loaded_param
|
277
|
+
buffer_len = 1 << 20
|
278
|
+
out_len = ::FFI::MemoryPointer.new(:int64)
|
279
|
+
out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
|
280
|
+
safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, buffer_len, out_len, out_str)
|
281
|
+
actual_len = out_len.read_int64
|
282
|
+
if actual_len > buffer_len
|
283
|
+
out_str = ::FFI::MemoryPointer.new(:char, actual_len)
|
284
|
+
safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, actual_len, out_len, out_str)
|
285
|
+
end
|
286
|
+
JSON.parse(out_str.read_string)
|
287
|
+
end
|
245
288
|
end
|
246
289
|
end
|
data/lib/lightgbm/dataset.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module LightGBM
|
2
2
|
class Dataset
|
3
|
+
include Utils
|
4
|
+
|
3
5
|
attr_reader :data, :params
|
4
6
|
|
5
|
-
def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_names: nil)
|
7
|
+
def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_name: nil, feature_names: nil)
|
6
8
|
@data = data
|
7
9
|
@label = label
|
8
10
|
@weight = weight
|
@@ -11,7 +13,7 @@ module LightGBM
|
|
11
13
|
@reference = reference
|
12
14
|
@used_indices = used_indices
|
13
15
|
@categorical_feature = categorical_feature
|
14
|
-
@
|
16
|
+
@feature_name = feature_name || feature_names || "auto"
|
15
17
|
|
16
18
|
construct
|
17
19
|
end
|
@@ -24,7 +26,7 @@ module LightGBM
|
|
24
26
|
field("weight")
|
25
27
|
end
|
26
28
|
|
27
|
-
def
|
29
|
+
def feature_name
|
28
30
|
# must preallocate space
|
29
31
|
num_feature_names = ::FFI::MemoryPointer.new(:int)
|
30
32
|
out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
|
@@ -33,7 +35,7 @@ module LightGBM
|
|
33
35
|
buffer_len = 255
|
34
36
|
str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
|
35
37
|
out_strs.write_array_of_pointer(str_ptrs)
|
36
|
-
|
38
|
+
safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, len, num_feature_names, buffer_len, out_buffer_len, out_strs)
|
37
39
|
|
38
40
|
num_features = num_feature_names.read_int
|
39
41
|
actual_len = out_buffer_len.read(:size_t)
|
@@ -41,13 +43,14 @@ module LightGBM
|
|
41
43
|
out_strs = ::FFI::MemoryPointer.new(:pointer, num_features) if num_features > len
|
42
44
|
str_ptrs = num_features.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
|
43
45
|
out_strs.write_array_of_pointer(str_ptrs)
|
44
|
-
|
46
|
+
safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, num_features, num_feature_names, actual_len, out_buffer_len, out_strs)
|
45
47
|
end
|
46
48
|
|
47
49
|
# should be the same, but get number of features
|
48
50
|
# from most recent call (instead of num_features)
|
49
51
|
str_ptrs[0, num_feature_names.read_int].map(&:read_string)
|
50
52
|
end
|
53
|
+
alias_method :feature_names, :feature_name
|
51
54
|
|
52
55
|
def label=(label)
|
53
56
|
@label = label
|
@@ -64,12 +67,16 @@ module LightGBM
|
|
64
67
|
set_field("group", group, type: :int32)
|
65
68
|
end
|
66
69
|
|
67
|
-
def
|
70
|
+
def feature_name=(feature_names)
|
71
|
+
feature_names = feature_names.map(&:to_s)
|
68
72
|
@feature_names = feature_names
|
69
73
|
c_feature_names = ::FFI::MemoryPointer.new(:pointer, feature_names.size)
|
70
|
-
|
71
|
-
|
74
|
+
# keep reference to string pointers
|
75
|
+
str_ptrs = feature_names.map { |v| ::FFI::MemoryPointer.from_string(v) }
|
76
|
+
c_feature_names.write_array_of_pointer(str_ptrs)
|
77
|
+
safe_call FFI.LGBM_DatasetSetFeatureNames(@handle, c_feature_names, feature_names.size)
|
72
78
|
end
|
79
|
+
alias_method :feature_names=, :feature_name=
|
73
80
|
|
74
81
|
# TODO only update reference if not in chain
|
75
82
|
def reference=(reference)
|
@@ -81,18 +88,18 @@ module LightGBM
|
|
81
88
|
|
82
89
|
def num_data
|
83
90
|
out = ::FFI::MemoryPointer.new(:int)
|
84
|
-
|
91
|
+
safe_call FFI.LGBM_DatasetGetNumData(@handle, out)
|
85
92
|
out.read_int
|
86
93
|
end
|
87
94
|
|
88
95
|
def num_feature
|
89
96
|
out = ::FFI::MemoryPointer.new(:int)
|
90
|
-
|
97
|
+
safe_call FFI.LGBM_DatasetGetNumFeature(@handle, out)
|
91
98
|
out.read_int
|
92
99
|
end
|
93
100
|
|
94
101
|
def save_binary(filename)
|
95
|
-
|
102
|
+
safe_call FFI.LGBM_DatasetSaveBinary(@handle, filename)
|
96
103
|
end
|
97
104
|
|
98
105
|
def subset(used_indices, params: nil)
|
@@ -105,13 +112,8 @@ module LightGBM
|
|
105
112
|
)
|
106
113
|
end
|
107
114
|
|
108
|
-
def
|
109
|
-
@handle
|
110
|
-
end
|
111
|
-
|
112
|
-
def self.finalize(addr)
|
113
|
-
# must use proc instead of stabby lambda
|
114
|
-
proc { FFI.LGBM_DatasetFree(::FFI::Pointer.new(:pointer, addr)) }
|
115
|
+
def handle
|
116
|
+
@handle
|
115
117
|
end
|
116
118
|
|
117
119
|
private
|
@@ -127,27 +129,45 @@ module LightGBM
|
|
127
129
|
end
|
128
130
|
set_verbosity(params)
|
129
131
|
|
130
|
-
|
132
|
+
handle = ::FFI::MemoryPointer.new(:pointer)
|
131
133
|
parameters = params_str(params)
|
132
|
-
reference = @reference.
|
134
|
+
reference = @reference.handle if @reference
|
133
135
|
if used_indices
|
134
136
|
used_row_indices = ::FFI::MemoryPointer.new(:int32, used_indices.count)
|
135
137
|
used_row_indices.write_array_of_int32(used_indices)
|
136
|
-
|
138
|
+
safe_call FFI.LGBM_DatasetGetSubset(reference, used_row_indices, used_indices.count, parameters, handle)
|
137
139
|
elsif data.is_a?(String)
|
138
|
-
|
140
|
+
safe_call FFI.LGBM_DatasetCreateFromFile(data, parameters, reference, handle)
|
139
141
|
else
|
140
142
|
if matrix?(data)
|
141
143
|
nrow = data.row_count
|
142
144
|
ncol = data.column_count
|
143
145
|
flat_data = data.to_a.flatten
|
144
146
|
elsif daru?(data)
|
147
|
+
if @feature_name == "auto"
|
148
|
+
@feature_name = data.vectors.to_a
|
149
|
+
end
|
145
150
|
nrow, ncol = data.shape
|
146
151
|
flat_data = data.map_rows(&:to_a).flatten
|
147
|
-
elsif numo?(data)
|
148
|
-
|
152
|
+
elsif numo?(data)
|
153
|
+
nrow, ncol = data.shape
|
154
|
+
elsif rover?(data)
|
155
|
+
if @feature_name == "auto"
|
156
|
+
@feature_name = data.keys
|
157
|
+
end
|
158
|
+
data = data.to_numo
|
149
159
|
nrow, ncol = data.shape
|
160
|
+
elsif data.is_a?(Array) && data.first.is_a?(Hash)
|
161
|
+
keys = data.first.keys
|
162
|
+
if @feature_name == "auto"
|
163
|
+
@feature_name = keys
|
164
|
+
end
|
165
|
+
nrow = data.count
|
166
|
+
ncol = data.first.count
|
167
|
+
flat_data = data.flat_map { |v| v.fetch_values(*keys) }
|
150
168
|
else
|
169
|
+
data = data.to_a
|
170
|
+
check_2d_array(data)
|
151
171
|
nrow = data.count
|
152
172
|
ncol = data.first.count
|
153
173
|
flat_data = data.flatten
|
@@ -161,18 +181,22 @@ module LightGBM
|
|
161
181
|
c_data.write_array_of_double(flat_data)
|
162
182
|
end
|
163
183
|
|
164
|
-
|
184
|
+
safe_call FFI.LGBM_DatasetCreateFromMat(c_data, FFI::C_API_DTYPE_FLOAT64, nrow, ncol, 1, parameters, reference, handle)
|
185
|
+
end
|
186
|
+
if used_indices
|
187
|
+
@handle = handle.read_pointer
|
188
|
+
else
|
189
|
+
@handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_DatasetFree))
|
165
190
|
end
|
166
|
-
ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i)) unless used_indices
|
167
191
|
|
168
192
|
self.label = @label if @label
|
169
193
|
self.weight = @weight if @weight
|
170
194
|
self.group = @group if @group
|
171
|
-
self.
|
195
|
+
self.feature_name = @feature_name if @feature_name && @feature_name != "auto"
|
172
196
|
end
|
173
197
|
|
174
198
|
def dump_text(filename)
|
175
|
-
|
199
|
+
safe_call FFI.LGBM_DatasetDumpText(@handle, filename)
|
176
200
|
end
|
177
201
|
|
178
202
|
def field(field_name)
|
@@ -180,7 +204,7 @@ module LightGBM
|
|
180
204
|
out_len = ::FFI::MemoryPointer.new(:int)
|
181
205
|
out_ptr = ::FFI::MemoryPointer.new(:float, num_data)
|
182
206
|
out_type = ::FFI::MemoryPointer.new(:int)
|
183
|
-
|
207
|
+
safe_call FFI.LGBM_DatasetGetField(@handle, field_name, out_len, out_ptr, out_type)
|
184
208
|
out_ptr.read_pointer.read_array_of_float(num_data)
|
185
209
|
end
|
186
210
|
|
@@ -189,14 +213,12 @@ module LightGBM
|
|
189
213
|
if type == :int32
|
190
214
|
c_data = ::FFI::MemoryPointer.new(:int32, data.count)
|
191
215
|
c_data.write_array_of_int32(data)
|
192
|
-
|
216
|
+
safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 2)
|
193
217
|
else
|
194
218
|
c_data = ::FFI::MemoryPointer.new(:float, data.count)
|
195
219
|
c_data.write_array_of_float(data)
|
196
|
-
|
220
|
+
safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 0)
|
197
221
|
end
|
198
222
|
end
|
199
|
-
|
200
|
-
include Utils
|
201
223
|
end
|
202
224
|
end
|
data/lib/lightgbm/ffi.rb
CHANGED
@@ -15,6 +15,19 @@ module LightGBM
|
|
15
15
|
# https://github.com/microsoft/LightGBM/blob/master/include/LightGBM/c_api.h
|
16
16
|
# keep same order
|
17
17
|
|
18
|
+
C_API_DTYPE_FLOAT32 = 0
|
19
|
+
C_API_DTYPE_FLOAT64 = 1
|
20
|
+
C_API_DTYPE_INT32 = 2
|
21
|
+
C_API_DTYPE_INT64 = 3
|
22
|
+
|
23
|
+
C_API_PREDICT_NORMAL = 0
|
24
|
+
C_API_PREDICT_RAW_SCORE = 1
|
25
|
+
C_API_PREDICT_LEAF_INDEX = 2
|
26
|
+
C_API_PREDICT_CONTRIB = 3
|
27
|
+
|
28
|
+
C_API_FEATURE_IMPORTANCE_SPLIT = 0
|
29
|
+
C_API_FEATURE_IMPORTANCE_GAIN = 1
|
30
|
+
|
18
31
|
# error
|
19
32
|
attach_function :LGBM_GetLastError, %i[], :string
|
20
33
|
|
@@ -36,6 +49,7 @@ module LightGBM
|
|
36
49
|
attach_function :LGBM_BoosterCreate, %i[pointer string pointer], :int
|
37
50
|
attach_function :LGBM_BoosterCreateFromModelfile, %i[string pointer pointer], :int
|
38
51
|
attach_function :LGBM_BoosterLoadModelFromString, %i[string pointer pointer], :int
|
52
|
+
attach_function :LGBM_BoosterGetLoadedParam, %i[pointer int64 pointer pointer], :int
|
39
53
|
attach_function :LGBM_BoosterFree, %i[pointer], :int
|
40
54
|
attach_function :LGBM_BoosterAddValidData, %i[pointer pointer], :int
|
41
55
|
attach_function :LGBM_BoosterGetNumClasses, %i[pointer pointer], :int
|
@@ -48,6 +62,7 @@ module LightGBM
|
|
48
62
|
attach_function :LGBM_BoosterGetFeatureNames, %i[pointer int pointer size_t pointer pointer], :int
|
49
63
|
attach_function :LGBM_BoosterGetNumFeature, %i[pointer pointer], :int
|
50
64
|
attach_function :LGBM_BoosterGetEval, %i[pointer int pointer pointer], :int
|
65
|
+
attach_function :LGBM_BoosterCalcNumPredict, %i[pointer int int int int pointer], :int
|
51
66
|
attach_function :LGBM_BoosterPredictForMat, %i[pointer pointer int int32 int32 int int int int string pointer pointer], :int
|
52
67
|
attach_function :LGBM_BoosterSaveModel, %i[pointer int int int string], :int
|
53
68
|
attach_function :LGBM_BoosterSaveModelToString, %i[pointer int int int int64 pointer pointer], :int
|
@@ -0,0 +1,159 @@
|
|
1
|
+
module LightGBM
|
2
|
+
class InnerPredictor
|
3
|
+
include Utils
|
4
|
+
|
5
|
+
MAX_INT32 = (1 << 31) - 1
|
6
|
+
|
7
|
+
def initialize(booster, pred_parameter)
|
8
|
+
@handle = booster.instance_variable_get(:@handle)
|
9
|
+
@pandas_categorical = booster.instance_variable_get(:@pandas_categorical)
|
10
|
+
@pred_parameter = params_str(pred_parameter)
|
11
|
+
|
12
|
+
# keep booster for cached_feature_name
|
13
|
+
@booster = booster
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.from_booster(booster, pred_parameter)
|
17
|
+
new(booster, pred_parameter)
|
18
|
+
end
|
19
|
+
|
20
|
+
def predict(data, start_iteration: 0, num_iteration: -1, raw_score: false, pred_leaf: false, pred_contrib: false)
|
21
|
+
if data.is_a?(Dataset)
|
22
|
+
raise TypeError, "Cannot use Dataset instance for prediction, please use raw data instead"
|
23
|
+
end
|
24
|
+
|
25
|
+
predict_type = FFI::C_API_PREDICT_NORMAL
|
26
|
+
if raw_score
|
27
|
+
predict_type = FFI::C_API_PREDICT_RAW_SCORE
|
28
|
+
end
|
29
|
+
if pred_leaf
|
30
|
+
predict_type = FFI::C_API_PREDICT_LEAF_INDEX
|
31
|
+
end
|
32
|
+
if pred_contrib
|
33
|
+
predict_type = FFI::C_API_PREDICT_CONTRIB
|
34
|
+
end
|
35
|
+
|
36
|
+
if daru?(data)
|
37
|
+
data = data[*cached_feature_name].map_rows(&:to_a)
|
38
|
+
singular = false
|
39
|
+
elsif data.is_a?(Hash) # sort feature.values to match the order of model.feature_name
|
40
|
+
data = [sorted_feature_values(data)]
|
41
|
+
singular = true
|
42
|
+
elsif data.is_a?(Array) && data.first.is_a?(Hash) # on multiple elems, if 1st is hash, assume they all are
|
43
|
+
data = data.map(&method(:sorted_feature_values))
|
44
|
+
singular = false
|
45
|
+
elsif rover?(data)
|
46
|
+
# TODO improve performance
|
47
|
+
data = data[cached_feature_name].to_numo.to_a
|
48
|
+
singular = false
|
49
|
+
else
|
50
|
+
data = data.to_a
|
51
|
+
singular = !data.first.is_a?(Array)
|
52
|
+
data = [data] if singular
|
53
|
+
check_2d_array(data)
|
54
|
+
data = data.map(&:dup) if @pandas_categorical&.any?
|
55
|
+
end
|
56
|
+
|
57
|
+
if @pandas_categorical&.any?
|
58
|
+
apply_pandas_categorical(
|
59
|
+
data,
|
60
|
+
@booster.params["categorical_feature"],
|
61
|
+
@pandas_categorical
|
62
|
+
)
|
63
|
+
end
|
64
|
+
|
65
|
+
preds, nrow =
|
66
|
+
pred_for_array(
|
67
|
+
data,
|
68
|
+
start_iteration,
|
69
|
+
num_iteration,
|
70
|
+
predict_type
|
71
|
+
)
|
72
|
+
|
73
|
+
if pred_leaf
|
74
|
+
preds = preds.map(&:to_i)
|
75
|
+
end
|
76
|
+
|
77
|
+
if preds.size != nrow
|
78
|
+
if preds.size % nrow == 0
|
79
|
+
preds = preds.each_slice(preds.size / nrow).to_a
|
80
|
+
else
|
81
|
+
raise Error, "Length of predict result (#{preds.size}) cannot be divide nrow (#{nrow})"
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
singular ? preds.first : preds
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def pred_for_array(input, start_iteration, num_iteration, predict_type)
|
91
|
+
nrow = input.count
|
92
|
+
if nrow > MAX_INT32
|
93
|
+
raise Error, "Not supported"
|
94
|
+
end
|
95
|
+
inner_predict_array(
|
96
|
+
input,
|
97
|
+
start_iteration,
|
98
|
+
num_iteration,
|
99
|
+
predict_type
|
100
|
+
)
|
101
|
+
end
|
102
|
+
|
103
|
+
def inner_predict_array(input, start_iteration, num_iteration, predict_type)
|
104
|
+
n_preds =
|
105
|
+
num_preds(
|
106
|
+
start_iteration,
|
107
|
+
num_iteration,
|
108
|
+
input.count,
|
109
|
+
predict_type
|
110
|
+
)
|
111
|
+
|
112
|
+
flat_input = input.flatten
|
113
|
+
handle_missing(flat_input)
|
114
|
+
data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
|
115
|
+
data.write_array_of_double(flat_input)
|
116
|
+
|
117
|
+
out_num_preds = ::FFI::MemoryPointer.new(:int64)
|
118
|
+
out_result = ::FFI::MemoryPointer.new(:double, n_preds)
|
119
|
+
safe_call FFI.LGBM_BoosterPredictForMat(@handle, data, FFI::C_API_DTYPE_FLOAT64, input.count, input.first.count, 1, predict_type, start_iteration, num_iteration, @pred_parameter, out_num_preds, out_result)
|
120
|
+
if n_preds != out_num_preds.read_int64
|
121
|
+
raise Error, "Wrong length for predict results"
|
122
|
+
end
|
123
|
+
preds = out_result.read_array_of_double(out_num_preds.read_int64)
|
124
|
+
[preds, input.count]
|
125
|
+
end
|
126
|
+
|
127
|
+
def num_preds(start_iteration, num_iteration, nrow, predict_type)
|
128
|
+
out = ::FFI::MemoryPointer.new(:int64)
|
129
|
+
safe_call FFI.LGBM_BoosterCalcNumPredict(@handle, nrow, predict_type, start_iteration, num_iteration, out)
|
130
|
+
out.read_int64
|
131
|
+
end
|
132
|
+
|
133
|
+
def sorted_feature_values(input_hash)
|
134
|
+
input_hash.transform_keys(&:to_s).fetch_values(*cached_feature_name)
|
135
|
+
end
|
136
|
+
|
137
|
+
def cached_feature_name
|
138
|
+
@booster.send(:cached_feature_name)
|
139
|
+
end
|
140
|
+
|
141
|
+
def apply_pandas_categorical(data, categorical_feature, pandas_categorical)
|
142
|
+
(categorical_feature || []).each_with_index do |cf, i|
|
143
|
+
cat_codes = pandas_categorical[i].map.with_index.to_h
|
144
|
+
data.each do |r|
|
145
|
+
cat = r[cf]
|
146
|
+
unless cat.nil?
|
147
|
+
r[cf] =
|
148
|
+
cat_codes.fetch(cat) do
|
149
|
+
unless cat.is_a?(String)
|
150
|
+
raise ArgumentError, "expected categorical value"
|
151
|
+
end
|
152
|
+
nil
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
data/lib/lightgbm/model.rb
CHANGED
data/lib/lightgbm/utils.rb
CHANGED
@@ -2,8 +2,8 @@ module LightGBM
|
|
2
2
|
module Utils
|
3
3
|
private
|
4
4
|
|
5
|
-
def
|
6
|
-
raise
|
5
|
+
def safe_call(err)
|
6
|
+
raise Error, FFI.LGBM_GetLastError if err != 0
|
7
7
|
end
|
8
8
|
|
9
9
|
# remove spaces in keys and values to prevent injection
|
@@ -24,6 +24,13 @@ module LightGBM
|
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
+
def check_2d_array(data)
|
28
|
+
ncol = data.first&.size || 0
|
29
|
+
if !data.all? { |r| r.size == ncol }
|
30
|
+
raise ArgumentError, "Rows have different sizes"
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
27
34
|
# for categorical, NaN and negative value are the same
|
28
35
|
def handle_missing(data)
|
29
36
|
data.map! { |v| v.nil? ? Float::NAN : v }
|
data/lib/lightgbm/version.rb
CHANGED
data/lib/lightgbm.rb
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
# dependencies
|
2
2
|
require "ffi"
|
3
3
|
|
4
|
+
# stdlib
|
5
|
+
require "json"
|
6
|
+
|
4
7
|
# modules
|
5
8
|
require_relative "lightgbm/utils"
|
6
9
|
require_relative "lightgbm/booster"
|
7
10
|
require_relative "lightgbm/dataset"
|
11
|
+
require_relative "lightgbm/inner_predictor"
|
8
12
|
require_relative "lightgbm/version"
|
9
13
|
|
10
14
|
# scikit-learn API
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lightgbm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-01-05 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: ffi
|
@@ -24,7 +23,6 @@ dependencies:
|
|
24
23
|
- - ">="
|
25
24
|
- !ruby/object:Gem::Version
|
26
25
|
version: '0'
|
27
|
-
description:
|
28
26
|
email: andrew@ankane.org
|
29
27
|
executables: []
|
30
28
|
extensions: []
|
@@ -38,6 +36,7 @@ files:
|
|
38
36
|
- lib/lightgbm/classifier.rb
|
39
37
|
- lib/lightgbm/dataset.rb
|
40
38
|
- lib/lightgbm/ffi.rb
|
39
|
+
- lib/lightgbm/inner_predictor.rb
|
41
40
|
- lib/lightgbm/model.rb
|
42
41
|
- lib/lightgbm/ranker.rb
|
43
42
|
- lib/lightgbm/regressor.rb
|
@@ -53,7 +52,6 @@ homepage: https://github.com/ankane/lightgbm-ruby
|
|
53
52
|
licenses:
|
54
53
|
- MIT
|
55
54
|
metadata: {}
|
56
|
-
post_install_message:
|
57
55
|
rdoc_options: []
|
58
56
|
require_paths:
|
59
57
|
- lib
|
@@ -61,15 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
61
59
|
requirements:
|
62
60
|
- - ">="
|
63
61
|
- !ruby/object:Gem::Version
|
64
|
-
version: '3'
|
62
|
+
version: '3.1'
|
65
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
64
|
requirements:
|
67
65
|
- - ">="
|
68
66
|
- !ruby/object:Gem::Version
|
69
67
|
version: '0'
|
70
68
|
requirements: []
|
71
|
-
rubygems_version: 3.
|
72
|
-
signing_key:
|
69
|
+
rubygems_version: 3.6.2
|
73
70
|
specification_version: 4
|
74
71
|
summary: High performance gradient boosting for Ruby
|
75
72
|
test_files: []
|