lightgbm 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/LICENSE.txt +1 -1
- data/lib/lightgbm/booster.rb +125 -82
- data/lib/lightgbm/dataset.rb +55 -33
- data/lib/lightgbm/ffi.rb +15 -0
- data/lib/lightgbm/inner_predictor.rb +159 -0
- data/lib/lightgbm/model.rb +1 -1
- data/lib/lightgbm/utils.rb +9 -2
- data/lib/lightgbm/version.rb +1 -1
- data/lib/lightgbm.rb +4 -0
- metadata +5 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4fadfa7ea250cf7c48f076effb5ce8f5db3cf0c8ab87bb04f2033457a502721a
|
4
|
+
data.tar.gz: 3af4cac369a3c684bdb387036845eca04747c14c8e12c9b38625e5c38130de74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21fb7ae25e1f085cd3642bb02ce32f8378f5d6013f6e8504b86586c86dfaf5c29b12d83b10e3bfd747a3dbfc996eb8473c12313ca5e2f4302554b0a6c40261e3
|
7
|
+
data.tar.gz: 75d7b3cea373adedbe8a6cc7c9f0b47f473ac0e77126779efd2f5ab1899596f4a8a926e1d97b033e5cb6e43b0c0f3706e8e7eb9f280d29354ba18792e2f4078b
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.4.0 (2025-01-05)
|
2
|
+
|
3
|
+
- Added support for different prediction types
|
4
|
+
- Added support for `pandas_categorical` to `predict` method
|
5
|
+
- Added support for hashes and Rover data frames to `predict` method
|
6
|
+
- Added support for hashes to `Dataset`
|
7
|
+
- Added `importance_type` option to `dump_model`, `model_to_string`, and `save_model` methods
|
8
|
+
- Changed `Dataset` to use column names for feature names with Rover and Daru
|
9
|
+
- Changed `predict` method to match feature names with Daru
|
10
|
+
- Dropped support for Ruby < 3.1
|
11
|
+
|
1
12
|
## 0.3.4 (2024-07-28)
|
2
13
|
|
3
14
|
- Updated LightGBM to 4.5.0
|
data/LICENSE.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
The MIT License (MIT)
|
2
2
|
|
3
3
|
Copyright (c) Microsoft Corporation
|
4
|
-
Copyright (c) 2019-
|
4
|
+
Copyright (c) 2019-2025 Andrew Kane
|
5
5
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/lib/lightgbm/booster.rb
CHANGED
@@ -1,20 +1,29 @@
|
|
1
1
|
module LightGBM
|
2
2
|
class Booster
|
3
|
-
|
3
|
+
include Utils
|
4
|
+
|
5
|
+
attr_accessor :best_iteration, :train_data_name, :params
|
4
6
|
|
5
7
|
def initialize(params: nil, train_set: nil, model_file: nil, model_str: nil)
|
6
|
-
@handle = ::FFI::MemoryPointer.new(:pointer)
|
7
8
|
if model_str
|
8
9
|
model_from_string(model_str)
|
9
10
|
elsif model_file
|
10
11
|
out_num_iterations = ::FFI::MemoryPointer.new(:int)
|
11
|
-
|
12
|
+
create_handle do |handle|
|
13
|
+
safe_call FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, handle)
|
14
|
+
end
|
15
|
+
@pandas_categorical = load_pandas_categorical(file_name: model_file)
|
16
|
+
if params
|
17
|
+
warn "[lightgbm] Ignoring params argument, using parameters from model file."
|
18
|
+
end
|
19
|
+
@params = loaded_param
|
12
20
|
else
|
13
21
|
params ||= {}
|
14
22
|
set_verbosity(params)
|
15
|
-
|
23
|
+
create_handle do |handle|
|
24
|
+
safe_call FFI.LGBM_BoosterCreate(train_set.handle, params_str(params), handle)
|
25
|
+
end
|
16
26
|
end
|
17
|
-
ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))
|
18
27
|
|
19
28
|
self.best_iteration = -1
|
20
29
|
|
@@ -23,28 +32,28 @@ module LightGBM
|
|
23
32
|
end
|
24
33
|
|
25
34
|
def add_valid(data, name)
|
26
|
-
|
35
|
+
safe_call FFI.LGBM_BoosterAddValidData(@handle, data.handle)
|
27
36
|
@name_valid_sets << name
|
28
37
|
self # consistent with Python API
|
29
38
|
end
|
30
39
|
|
31
40
|
def current_iteration
|
32
41
|
out = ::FFI::MemoryPointer.new(:int)
|
33
|
-
|
42
|
+
safe_call FFI.LGBM_BoosterGetCurrentIteration(@handle, out)
|
34
43
|
out.read_int
|
35
44
|
end
|
36
45
|
|
37
|
-
def dump_model(num_iteration: nil, start_iteration: 0)
|
46
|
+
def dump_model(num_iteration: nil, start_iteration: 0, importance_type: "split")
|
38
47
|
num_iteration ||= best_iteration
|
48
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
39
49
|
buffer_len = 1 << 20
|
40
50
|
out_len = ::FFI::MemoryPointer.new(:int64)
|
41
51
|
out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
|
42
|
-
|
43
|
-
|
44
|
-
actual_len = read_int64(out_len)
|
52
|
+
safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
|
53
|
+
actual_len = out_len.read_int64
|
45
54
|
if actual_len > buffer_len
|
46
55
|
out_str = ::FFI::MemoryPointer.new(:char, actual_len)
|
47
|
-
|
56
|
+
safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
|
48
57
|
end
|
49
58
|
out_str.read_string
|
50
59
|
end
|
@@ -60,19 +69,10 @@ module LightGBM
|
|
60
69
|
|
61
70
|
def feature_importance(iteration: nil, importance_type: "split")
|
62
71
|
iteration ||= best_iteration
|
63
|
-
|
64
|
-
case importance_type
|
65
|
-
when "split"
|
66
|
-
0
|
67
|
-
when "gain"
|
68
|
-
1
|
69
|
-
else
|
70
|
-
-1
|
71
|
-
end
|
72
|
-
|
72
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
73
73
|
num_feature = self.num_feature
|
74
74
|
out_result = ::FFI::MemoryPointer.new(:double, num_feature)
|
75
|
-
|
75
|
+
safe_call FFI.LGBM_BoosterFeatureImportance(@handle, iteration, importance_type_int, out_result)
|
76
76
|
out_result.read_array_of_double(num_feature).map(&:to_i)
|
77
77
|
end
|
78
78
|
|
@@ -84,13 +84,13 @@ module LightGBM
|
|
84
84
|
out_strs = ::FFI::MemoryPointer.new(:pointer, num_feature)
|
85
85
|
str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
|
86
86
|
out_strs.write_array_of_pointer(str_ptrs)
|
87
|
-
|
87
|
+
safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, buffer_len, out_buffer_len, out_strs)
|
88
88
|
|
89
89
|
actual_len = out_buffer_len.read(:size_t)
|
90
90
|
if actual_len > buffer_len
|
91
91
|
str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
|
92
92
|
out_strs.write_array_of_pointer(str_ptrs)
|
93
|
-
|
93
|
+
safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, actual_len, out_buffer_len, out_strs)
|
94
94
|
end
|
95
95
|
|
96
96
|
str_ptrs[0, out_len.read(:size_t)].map(&:read_string)
|
@@ -98,130 +98,122 @@ module LightGBM
|
|
98
98
|
|
99
99
|
def model_from_string(model_str)
|
100
100
|
out_num_iterations = ::FFI::MemoryPointer.new(:int)
|
101
|
-
|
101
|
+
create_handle do |handle|
|
102
|
+
safe_call FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, handle)
|
103
|
+
end
|
104
|
+
@pandas_categorical = load_pandas_categorical(model_str: model_str)
|
105
|
+
@params = loaded_param
|
106
|
+
@cached_feature_name = nil
|
102
107
|
self
|
103
108
|
end
|
104
109
|
|
105
|
-
def model_to_string(num_iteration: nil, start_iteration: 0)
|
110
|
+
def model_to_string(num_iteration: nil, start_iteration: 0, importance_type: "split")
|
106
111
|
num_iteration ||= best_iteration
|
112
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
107
113
|
buffer_len = 1 << 20
|
108
114
|
out_len = ::FFI::MemoryPointer.new(:int64)
|
109
115
|
out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
|
110
|
-
|
111
|
-
|
112
|
-
actual_len = read_int64(out_len)
|
116
|
+
safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
|
117
|
+
actual_len = out_len.read_int64
|
113
118
|
if actual_len > buffer_len
|
114
119
|
out_str = ::FFI::MemoryPointer.new(:char, actual_len)
|
115
|
-
|
120
|
+
safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
|
116
121
|
end
|
117
122
|
out_str.read_string
|
118
123
|
end
|
119
124
|
|
120
125
|
def num_feature
|
121
126
|
out = ::FFI::MemoryPointer.new(:int)
|
122
|
-
|
127
|
+
safe_call FFI.LGBM_BoosterGetNumFeature(@handle, out)
|
123
128
|
out.read_int
|
124
129
|
end
|
125
130
|
alias_method :num_features, :num_feature # legacy typo
|
126
131
|
|
127
132
|
def num_model_per_iteration
|
128
133
|
out = ::FFI::MemoryPointer.new(:int)
|
129
|
-
|
134
|
+
safe_call FFI.LGBM_BoosterNumModelPerIteration(@handle, out)
|
130
135
|
out.read_int
|
131
136
|
end
|
132
137
|
|
133
138
|
def num_trees
|
134
139
|
out = ::FFI::MemoryPointer.new(:int)
|
135
|
-
|
140
|
+
safe_call FFI.LGBM_BoosterNumberOfTotalModel(@handle, out)
|
136
141
|
out.read_int
|
137
142
|
end
|
138
143
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
if
|
143
|
-
|
144
|
+
def predict(data, start_iteration: 0, num_iteration: nil, raw_score: false, pred_leaf: false, pred_contrib: false, **kwargs)
|
145
|
+
predictor = InnerPredictor.from_booster(self, kwargs.transform_values(&:dup))
|
146
|
+
if num_iteration.nil?
|
147
|
+
if start_iteration <= 0
|
148
|
+
num_iteration = best_iteration
|
144
149
|
else
|
145
|
-
|
150
|
+
num_iteration = -1
|
146
151
|
end
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
handle_missing(flat_input)
|
157
|
-
data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
|
158
|
-
data.write_array_of_double(flat_input)
|
159
|
-
|
160
|
-
out_len = ::FFI::MemoryPointer.new(:int64)
|
161
|
-
out_result = ::FFI::MemoryPointer.new(:double, num_class * input.count)
|
162
|
-
check_result FFI.LGBM_BoosterPredictForMat(handle_pointer, data, 1, input.count, input.first.count, 1, 0, start_iteration, num_iteration, params_str(params), out_len, out_result)
|
163
|
-
out = out_result.read_array_of_double(read_int64(out_len))
|
164
|
-
out = out.each_slice(num_class).to_a if num_class > 1
|
165
|
-
|
166
|
-
singular ? out.first : out
|
152
|
+
end
|
153
|
+
predictor.predict(
|
154
|
+
data,
|
155
|
+
start_iteration: start_iteration,
|
156
|
+
num_iteration: num_iteration,
|
157
|
+
raw_score: raw_score,
|
158
|
+
pred_leaf: pred_leaf,
|
159
|
+
pred_contrib: pred_contrib
|
160
|
+
)
|
167
161
|
end
|
168
162
|
|
169
|
-
def save_model(filename, num_iteration: nil, start_iteration: 0)
|
163
|
+
def save_model(filename, num_iteration: nil, start_iteration: 0, importance_type: "split")
|
170
164
|
num_iteration ||= best_iteration
|
171
|
-
|
172
|
-
|
165
|
+
importance_type_int = feature_importance_type_mapper(importance_type)
|
166
|
+
safe_call FFI.LGBM_BoosterSaveModel(@handle, start_iteration, num_iteration, importance_type_int, filename)
|
173
167
|
self # consistent with Python API
|
174
168
|
end
|
175
169
|
|
176
170
|
def update
|
177
171
|
finished = ::FFI::MemoryPointer.new(:int)
|
178
|
-
|
172
|
+
safe_call FFI.LGBM_BoosterUpdateOneIter(@handle, finished)
|
179
173
|
finished.read_int == 1
|
180
174
|
end
|
181
175
|
|
182
|
-
def self.finalize(addr)
|
183
|
-
# must use proc instead of stabby lambda
|
184
|
-
proc { FFI.LGBM_BoosterFree(::FFI::Pointer.new(:pointer, addr)) }
|
185
|
-
end
|
186
|
-
|
187
176
|
private
|
188
177
|
|
189
|
-
def
|
190
|
-
|
178
|
+
def create_handle
|
179
|
+
::FFI::MemoryPointer.new(:pointer) do |handle|
|
180
|
+
yield handle
|
181
|
+
@handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_BoosterFree))
|
182
|
+
end
|
191
183
|
end
|
192
184
|
|
193
185
|
def eval_counts
|
194
186
|
out = ::FFI::MemoryPointer.new(:int)
|
195
|
-
|
187
|
+
safe_call FFI.LGBM_BoosterGetEvalCounts(@handle, out)
|
196
188
|
out.read_int
|
197
189
|
end
|
198
190
|
|
199
191
|
def eval_names
|
200
|
-
eval_counts
|
192
|
+
eval_counts = self.eval_counts
|
201
193
|
out_len = ::FFI::MemoryPointer.new(:int)
|
202
194
|
out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
|
203
195
|
out_strs = ::FFI::MemoryPointer.new(:pointer, eval_counts)
|
204
196
|
buffer_len = 255
|
205
197
|
str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
|
206
198
|
out_strs.write_array_of_pointer(str_ptrs)
|
207
|
-
|
199
|
+
safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, buffer_len, out_buffer_len, out_strs)
|
208
200
|
|
209
201
|
actual_len = out_buffer_len.read(:size_t)
|
210
202
|
if actual_len > buffer_len
|
211
203
|
str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
|
212
204
|
out_strs.write_array_of_pointer(str_ptrs)
|
213
|
-
|
205
|
+
safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, actual_len, out_buffer_len, out_strs)
|
214
206
|
end
|
215
207
|
|
216
208
|
str_ptrs.map(&:read_string)
|
217
209
|
end
|
218
210
|
|
219
211
|
def inner_eval(name, i)
|
220
|
-
eval_names
|
212
|
+
eval_names = self.eval_names
|
221
213
|
|
222
214
|
out_len = ::FFI::MemoryPointer.new(:int)
|
223
215
|
out_results = ::FFI::MemoryPointer.new(:double, eval_names.count)
|
224
|
-
|
216
|
+
safe_call FFI.LGBM_BoosterGetEval(@handle, i, out_len, out_results)
|
225
217
|
vals = out_results.read_array_of_double(out_len.read_int)
|
226
218
|
|
227
219
|
eval_names.zip(vals).map do |eval_name, val|
|
@@ -232,15 +224,66 @@ module LightGBM
|
|
232
224
|
|
233
225
|
def num_class
|
234
226
|
out = ::FFI::MemoryPointer.new(:int)
|
235
|
-
|
227
|
+
safe_call FFI.LGBM_BoosterGetNumClasses(@handle, out)
|
236
228
|
out.read_int
|
237
229
|
end
|
238
230
|
|
239
|
-
|
240
|
-
|
241
|
-
ptr.read_array_of_int64(1).first
|
231
|
+
def cached_feature_name
|
232
|
+
@cached_feature_name ||= feature_name
|
242
233
|
end
|
243
234
|
|
244
|
-
|
235
|
+
def feature_importance_type_mapper(importance_type)
|
236
|
+
case importance_type
|
237
|
+
when "split"
|
238
|
+
FFI::C_API_FEATURE_IMPORTANCE_SPLIT
|
239
|
+
when "gain"
|
240
|
+
FFI::C_API_FEATURE_IMPORTANCE_GAIN
|
241
|
+
else
|
242
|
+
-1
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
246
|
+
def load_pandas_categorical(file_name: nil, model_str: nil)
|
247
|
+
pandas_key = "pandas_categorical:"
|
248
|
+
offset = -pandas_key.length
|
249
|
+
if !file_name.nil?
|
250
|
+
max_offset = -File.size(file_name)
|
251
|
+
lines = []
|
252
|
+
File.open(file_name, "rb") do |f|
|
253
|
+
loop do
|
254
|
+
offset = [offset, max_offset].max
|
255
|
+
f.seek(offset, IO::SEEK_END)
|
256
|
+
lines = f.readlines
|
257
|
+
if lines.length >= 2 || offset == max_offset
|
258
|
+
break
|
259
|
+
end
|
260
|
+
offset *= 2
|
261
|
+
end
|
262
|
+
end
|
263
|
+
last_line = lines[-1].strip
|
264
|
+
if !last_line.start_with?(pandas_key)
|
265
|
+
last_line = lines[-2].strip
|
266
|
+
end
|
267
|
+
elsif !model_str.nil?
|
268
|
+
idx = model_str[..offset].rindex("\n")
|
269
|
+
last_line = model_str[idx..].strip
|
270
|
+
end
|
271
|
+
if last_line.start_with?(pandas_key)
|
272
|
+
JSON.parse(last_line[pandas_key.length..])
|
273
|
+
end
|
274
|
+
end
|
275
|
+
|
276
|
+
def loaded_param
|
277
|
+
buffer_len = 1 << 20
|
278
|
+
out_len = ::FFI::MemoryPointer.new(:int64)
|
279
|
+
out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
|
280
|
+
safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, buffer_len, out_len, out_str)
|
281
|
+
actual_len = out_len.read_int64
|
282
|
+
if actual_len > buffer_len
|
283
|
+
out_str = ::FFI::MemoryPointer.new(:char, actual_len)
|
284
|
+
safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, actual_len, out_len, out_str)
|
285
|
+
end
|
286
|
+
JSON.parse(out_str.read_string)
|
287
|
+
end
|
245
288
|
end
|
246
289
|
end
|
data/lib/lightgbm/dataset.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module LightGBM
|
2
2
|
class Dataset
|
3
|
+
include Utils
|
4
|
+
|
3
5
|
attr_reader :data, :params
|
4
6
|
|
5
|
-
def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_names: nil)
|
7
|
+
def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_name: nil, feature_names: nil)
|
6
8
|
@data = data
|
7
9
|
@label = label
|
8
10
|
@weight = weight
|
@@ -11,7 +13,7 @@ module LightGBM
|
|
11
13
|
@reference = reference
|
12
14
|
@used_indices = used_indices
|
13
15
|
@categorical_feature = categorical_feature
|
14
|
-
@
|
16
|
+
@feature_name = feature_name || feature_names || "auto"
|
15
17
|
|
16
18
|
construct
|
17
19
|
end
|
@@ -24,7 +26,7 @@ module LightGBM
|
|
24
26
|
field("weight")
|
25
27
|
end
|
26
28
|
|
27
|
-
def
|
29
|
+
def feature_name
|
28
30
|
# must preallocate space
|
29
31
|
num_feature_names = ::FFI::MemoryPointer.new(:int)
|
30
32
|
out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
|
@@ -33,7 +35,7 @@ module LightGBM
|
|
33
35
|
buffer_len = 255
|
34
36
|
str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
|
35
37
|
out_strs.write_array_of_pointer(str_ptrs)
|
36
|
-
|
38
|
+
safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, len, num_feature_names, buffer_len, out_buffer_len, out_strs)
|
37
39
|
|
38
40
|
num_features = num_feature_names.read_int
|
39
41
|
actual_len = out_buffer_len.read(:size_t)
|
@@ -41,13 +43,14 @@ module LightGBM
|
|
41
43
|
out_strs = ::FFI::MemoryPointer.new(:pointer, num_features) if num_features > len
|
42
44
|
str_ptrs = num_features.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
|
43
45
|
out_strs.write_array_of_pointer(str_ptrs)
|
44
|
-
|
46
|
+
safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, num_features, num_feature_names, actual_len, out_buffer_len, out_strs)
|
45
47
|
end
|
46
48
|
|
47
49
|
# should be the same, but get number of features
|
48
50
|
# from most recent call (instead of num_features)
|
49
51
|
str_ptrs[0, num_feature_names.read_int].map(&:read_string)
|
50
52
|
end
|
53
|
+
alias_method :feature_names, :feature_name
|
51
54
|
|
52
55
|
def label=(label)
|
53
56
|
@label = label
|
@@ -64,12 +67,16 @@ module LightGBM
|
|
64
67
|
set_field("group", group, type: :int32)
|
65
68
|
end
|
66
69
|
|
67
|
-
def
|
70
|
+
def feature_name=(feature_names)
|
71
|
+
feature_names = feature_names.map(&:to_s)
|
68
72
|
@feature_names = feature_names
|
69
73
|
c_feature_names = ::FFI::MemoryPointer.new(:pointer, feature_names.size)
|
70
|
-
|
71
|
-
|
74
|
+
# keep reference to string pointers
|
75
|
+
str_ptrs = feature_names.map { |v| ::FFI::MemoryPointer.from_string(v) }
|
76
|
+
c_feature_names.write_array_of_pointer(str_ptrs)
|
77
|
+
safe_call FFI.LGBM_DatasetSetFeatureNames(@handle, c_feature_names, feature_names.size)
|
72
78
|
end
|
79
|
+
alias_method :feature_names=, :feature_name=
|
73
80
|
|
74
81
|
# TODO only update reference if not in chain
|
75
82
|
def reference=(reference)
|
@@ -81,18 +88,18 @@ module LightGBM
|
|
81
88
|
|
82
89
|
def num_data
|
83
90
|
out = ::FFI::MemoryPointer.new(:int)
|
84
|
-
|
91
|
+
safe_call FFI.LGBM_DatasetGetNumData(@handle, out)
|
85
92
|
out.read_int
|
86
93
|
end
|
87
94
|
|
88
95
|
def num_feature
|
89
96
|
out = ::FFI::MemoryPointer.new(:int)
|
90
|
-
|
97
|
+
safe_call FFI.LGBM_DatasetGetNumFeature(@handle, out)
|
91
98
|
out.read_int
|
92
99
|
end
|
93
100
|
|
94
101
|
def save_binary(filename)
|
95
|
-
|
102
|
+
safe_call FFI.LGBM_DatasetSaveBinary(@handle, filename)
|
96
103
|
end
|
97
104
|
|
98
105
|
def subset(used_indices, params: nil)
|
@@ -105,13 +112,8 @@ module LightGBM
|
|
105
112
|
)
|
106
113
|
end
|
107
114
|
|
108
|
-
def
|
109
|
-
@handle
|
110
|
-
end
|
111
|
-
|
112
|
-
def self.finalize(addr)
|
113
|
-
# must use proc instead of stabby lambda
|
114
|
-
proc { FFI.LGBM_DatasetFree(::FFI::Pointer.new(:pointer, addr)) }
|
115
|
+
def handle
|
116
|
+
@handle
|
115
117
|
end
|
116
118
|
|
117
119
|
private
|
@@ -127,27 +129,45 @@ module LightGBM
|
|
127
129
|
end
|
128
130
|
set_verbosity(params)
|
129
131
|
|
130
|
-
|
132
|
+
handle = ::FFI::MemoryPointer.new(:pointer)
|
131
133
|
parameters = params_str(params)
|
132
|
-
reference = @reference.
|
134
|
+
reference = @reference.handle if @reference
|
133
135
|
if used_indices
|
134
136
|
used_row_indices = ::FFI::MemoryPointer.new(:int32, used_indices.count)
|
135
137
|
used_row_indices.write_array_of_int32(used_indices)
|
136
|
-
|
138
|
+
safe_call FFI.LGBM_DatasetGetSubset(reference, used_row_indices, used_indices.count, parameters, handle)
|
137
139
|
elsif data.is_a?(String)
|
138
|
-
|
140
|
+
safe_call FFI.LGBM_DatasetCreateFromFile(data, parameters, reference, handle)
|
139
141
|
else
|
140
142
|
if matrix?(data)
|
141
143
|
nrow = data.row_count
|
142
144
|
ncol = data.column_count
|
143
145
|
flat_data = data.to_a.flatten
|
144
146
|
elsif daru?(data)
|
147
|
+
if @feature_name == "auto"
|
148
|
+
@feature_name = data.vectors.to_a
|
149
|
+
end
|
145
150
|
nrow, ncol = data.shape
|
146
151
|
flat_data = data.map_rows(&:to_a).flatten
|
147
|
-
elsif numo?(data)
|
148
|
-
|
152
|
+
elsif numo?(data)
|
153
|
+
nrow, ncol = data.shape
|
154
|
+
elsif rover?(data)
|
155
|
+
if @feature_name == "auto"
|
156
|
+
@feature_name = data.keys
|
157
|
+
end
|
158
|
+
data = data.to_numo
|
149
159
|
nrow, ncol = data.shape
|
160
|
+
elsif data.is_a?(Array) && data.first.is_a?(Hash)
|
161
|
+
keys = data.first.keys
|
162
|
+
if @feature_name == "auto"
|
163
|
+
@feature_name = keys
|
164
|
+
end
|
165
|
+
nrow = data.count
|
166
|
+
ncol = data.first.count
|
167
|
+
flat_data = data.flat_map { |v| v.fetch_values(*keys) }
|
150
168
|
else
|
169
|
+
data = data.to_a
|
170
|
+
check_2d_array(data)
|
151
171
|
nrow = data.count
|
152
172
|
ncol = data.first.count
|
153
173
|
flat_data = data.flatten
|
@@ -161,18 +181,22 @@ module LightGBM
|
|
161
181
|
c_data.write_array_of_double(flat_data)
|
162
182
|
end
|
163
183
|
|
164
|
-
|
184
|
+
safe_call FFI.LGBM_DatasetCreateFromMat(c_data, FFI::C_API_DTYPE_FLOAT64, nrow, ncol, 1, parameters, reference, handle)
|
185
|
+
end
|
186
|
+
if used_indices
|
187
|
+
@handle = handle.read_pointer
|
188
|
+
else
|
189
|
+
@handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_DatasetFree))
|
165
190
|
end
|
166
|
-
ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i)) unless used_indices
|
167
191
|
|
168
192
|
self.label = @label if @label
|
169
193
|
self.weight = @weight if @weight
|
170
194
|
self.group = @group if @group
|
171
|
-
self.
|
195
|
+
self.feature_name = @feature_name if @feature_name && @feature_name != "auto"
|
172
196
|
end
|
173
197
|
|
174
198
|
def dump_text(filename)
|
175
|
-
|
199
|
+
safe_call FFI.LGBM_DatasetDumpText(@handle, filename)
|
176
200
|
end
|
177
201
|
|
178
202
|
def field(field_name)
|
@@ -180,7 +204,7 @@ module LightGBM
|
|
180
204
|
out_len = ::FFI::MemoryPointer.new(:int)
|
181
205
|
out_ptr = ::FFI::MemoryPointer.new(:float, num_data)
|
182
206
|
out_type = ::FFI::MemoryPointer.new(:int)
|
183
|
-
|
207
|
+
safe_call FFI.LGBM_DatasetGetField(@handle, field_name, out_len, out_ptr, out_type)
|
184
208
|
out_ptr.read_pointer.read_array_of_float(num_data)
|
185
209
|
end
|
186
210
|
|
@@ -189,14 +213,12 @@ module LightGBM
|
|
189
213
|
if type == :int32
|
190
214
|
c_data = ::FFI::MemoryPointer.new(:int32, data.count)
|
191
215
|
c_data.write_array_of_int32(data)
|
192
|
-
|
216
|
+
safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 2)
|
193
217
|
else
|
194
218
|
c_data = ::FFI::MemoryPointer.new(:float, data.count)
|
195
219
|
c_data.write_array_of_float(data)
|
196
|
-
|
220
|
+
safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 0)
|
197
221
|
end
|
198
222
|
end
|
199
|
-
|
200
|
-
include Utils
|
201
223
|
end
|
202
224
|
end
|
data/lib/lightgbm/ffi.rb
CHANGED
@@ -15,6 +15,19 @@ module LightGBM
|
|
15
15
|
# https://github.com/microsoft/LightGBM/blob/master/include/LightGBM/c_api.h
|
16
16
|
# keep same order
|
17
17
|
|
18
|
+
C_API_DTYPE_FLOAT32 = 0
|
19
|
+
C_API_DTYPE_FLOAT64 = 1
|
20
|
+
C_API_DTYPE_INT32 = 2
|
21
|
+
C_API_DTYPE_INT64 = 3
|
22
|
+
|
23
|
+
C_API_PREDICT_NORMAL = 0
|
24
|
+
C_API_PREDICT_RAW_SCORE = 1
|
25
|
+
C_API_PREDICT_LEAF_INDEX = 2
|
26
|
+
C_API_PREDICT_CONTRIB = 3
|
27
|
+
|
28
|
+
C_API_FEATURE_IMPORTANCE_SPLIT = 0
|
29
|
+
C_API_FEATURE_IMPORTANCE_GAIN = 1
|
30
|
+
|
18
31
|
# error
|
19
32
|
attach_function :LGBM_GetLastError, %i[], :string
|
20
33
|
|
@@ -36,6 +49,7 @@ module LightGBM
|
|
36
49
|
attach_function :LGBM_BoosterCreate, %i[pointer string pointer], :int
|
37
50
|
attach_function :LGBM_BoosterCreateFromModelfile, %i[string pointer pointer], :int
|
38
51
|
attach_function :LGBM_BoosterLoadModelFromString, %i[string pointer pointer], :int
|
52
|
+
attach_function :LGBM_BoosterGetLoadedParam, %i[pointer int64 pointer pointer], :int
|
39
53
|
attach_function :LGBM_BoosterFree, %i[pointer], :int
|
40
54
|
attach_function :LGBM_BoosterAddValidData, %i[pointer pointer], :int
|
41
55
|
attach_function :LGBM_BoosterGetNumClasses, %i[pointer pointer], :int
|
@@ -48,6 +62,7 @@ module LightGBM
|
|
48
62
|
attach_function :LGBM_BoosterGetFeatureNames, %i[pointer int pointer size_t pointer pointer], :int
|
49
63
|
attach_function :LGBM_BoosterGetNumFeature, %i[pointer pointer], :int
|
50
64
|
attach_function :LGBM_BoosterGetEval, %i[pointer int pointer pointer], :int
|
65
|
+
attach_function :LGBM_BoosterCalcNumPredict, %i[pointer int int int int pointer], :int
|
51
66
|
attach_function :LGBM_BoosterPredictForMat, %i[pointer pointer int int32 int32 int int int int string pointer pointer], :int
|
52
67
|
attach_function :LGBM_BoosterSaveModel, %i[pointer int int int string], :int
|
53
68
|
attach_function :LGBM_BoosterSaveModelToString, %i[pointer int int int int64 pointer pointer], :int
|
@@ -0,0 +1,159 @@
|
|
1
|
+
module LightGBM
|
2
|
+
class InnerPredictor
|
3
|
+
include Utils
|
4
|
+
|
5
|
+
MAX_INT32 = (1 << 31) - 1
|
6
|
+
|
7
|
+
def initialize(booster, pred_parameter)
|
8
|
+
@handle = booster.instance_variable_get(:@handle)
|
9
|
+
@pandas_categorical = booster.instance_variable_get(:@pandas_categorical)
|
10
|
+
@pred_parameter = params_str(pred_parameter)
|
11
|
+
|
12
|
+
# keep booster for cached_feature_name
|
13
|
+
@booster = booster
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.from_booster(booster, pred_parameter)
|
17
|
+
new(booster, pred_parameter)
|
18
|
+
end
|
19
|
+
|
20
|
+
def predict(data, start_iteration: 0, num_iteration: -1, raw_score: false, pred_leaf: false, pred_contrib: false)
|
21
|
+
if data.is_a?(Dataset)
|
22
|
+
raise TypeError, "Cannot use Dataset instance for prediction, please use raw data instead"
|
23
|
+
end
|
24
|
+
|
25
|
+
predict_type = FFI::C_API_PREDICT_NORMAL
|
26
|
+
if raw_score
|
27
|
+
predict_type = FFI::C_API_PREDICT_RAW_SCORE
|
28
|
+
end
|
29
|
+
if pred_leaf
|
30
|
+
predict_type = FFI::C_API_PREDICT_LEAF_INDEX
|
31
|
+
end
|
32
|
+
if pred_contrib
|
33
|
+
predict_type = FFI::C_API_PREDICT_CONTRIB
|
34
|
+
end
|
35
|
+
|
36
|
+
if daru?(data)
|
37
|
+
data = data[*cached_feature_name].map_rows(&:to_a)
|
38
|
+
singular = false
|
39
|
+
elsif data.is_a?(Hash) # sort feature.values to match the order of model.feature_name
|
40
|
+
data = [sorted_feature_values(data)]
|
41
|
+
singular = true
|
42
|
+
elsif data.is_a?(Array) && data.first.is_a?(Hash) # on multiple elems, if 1st is hash, assume they all are
|
43
|
+
data = data.map(&method(:sorted_feature_values))
|
44
|
+
singular = false
|
45
|
+
elsif rover?(data)
|
46
|
+
# TODO improve performance
|
47
|
+
data = data[cached_feature_name].to_numo.to_a
|
48
|
+
singular = false
|
49
|
+
else
|
50
|
+
data = data.to_a
|
51
|
+
singular = !data.first.is_a?(Array)
|
52
|
+
data = [data] if singular
|
53
|
+
check_2d_array(data)
|
54
|
+
data = data.map(&:dup) if @pandas_categorical&.any?
|
55
|
+
end
|
56
|
+
|
57
|
+
if @pandas_categorical&.any?
|
58
|
+
apply_pandas_categorical(
|
59
|
+
data,
|
60
|
+
@booster.params["categorical_feature"],
|
61
|
+
@pandas_categorical
|
62
|
+
)
|
63
|
+
end
|
64
|
+
|
65
|
+
preds, nrow =
|
66
|
+
pred_for_array(
|
67
|
+
data,
|
68
|
+
start_iteration,
|
69
|
+
num_iteration,
|
70
|
+
predict_type
|
71
|
+
)
|
72
|
+
|
73
|
+
if pred_leaf
|
74
|
+
preds = preds.map(&:to_i)
|
75
|
+
end
|
76
|
+
|
77
|
+
if preds.size != nrow
|
78
|
+
if preds.size % nrow == 0
|
79
|
+
preds = preds.each_slice(preds.size / nrow).to_a
|
80
|
+
else
|
81
|
+
raise Error, "Length of predict result (#{preds.size}) cannot be divide nrow (#{nrow})"
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
singular ? preds.first : preds
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def pred_for_array(input, start_iteration, num_iteration, predict_type)
|
91
|
+
nrow = input.count
|
92
|
+
if nrow > MAX_INT32
|
93
|
+
raise Error, "Not supported"
|
94
|
+
end
|
95
|
+
inner_predict_array(
|
96
|
+
input,
|
97
|
+
start_iteration,
|
98
|
+
num_iteration,
|
99
|
+
predict_type
|
100
|
+
)
|
101
|
+
end
|
102
|
+
|
103
|
+
def inner_predict_array(input, start_iteration, num_iteration, predict_type)
|
104
|
+
n_preds =
|
105
|
+
num_preds(
|
106
|
+
start_iteration,
|
107
|
+
num_iteration,
|
108
|
+
input.count,
|
109
|
+
predict_type
|
110
|
+
)
|
111
|
+
|
112
|
+
flat_input = input.flatten
|
113
|
+
handle_missing(flat_input)
|
114
|
+
data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
|
115
|
+
data.write_array_of_double(flat_input)
|
116
|
+
|
117
|
+
out_num_preds = ::FFI::MemoryPointer.new(:int64)
|
118
|
+
out_result = ::FFI::MemoryPointer.new(:double, n_preds)
|
119
|
+
safe_call FFI.LGBM_BoosterPredictForMat(@handle, data, FFI::C_API_DTYPE_FLOAT64, input.count, input.first.count, 1, predict_type, start_iteration, num_iteration, @pred_parameter, out_num_preds, out_result)
|
120
|
+
if n_preds != out_num_preds.read_int64
|
121
|
+
raise Error, "Wrong length for predict results"
|
122
|
+
end
|
123
|
+
preds = out_result.read_array_of_double(out_num_preds.read_int64)
|
124
|
+
[preds, input.count]
|
125
|
+
end
|
126
|
+
|
127
|
+
def num_preds(start_iteration, num_iteration, nrow, predict_type)
|
128
|
+
out = ::FFI::MemoryPointer.new(:int64)
|
129
|
+
safe_call FFI.LGBM_BoosterCalcNumPredict(@handle, nrow, predict_type, start_iteration, num_iteration, out)
|
130
|
+
out.read_int64
|
131
|
+
end
|
132
|
+
|
133
|
+
def sorted_feature_values(input_hash)
|
134
|
+
input_hash.transform_keys(&:to_s).fetch_values(*cached_feature_name)
|
135
|
+
end
|
136
|
+
|
137
|
+
def cached_feature_name
|
138
|
+
@booster.send(:cached_feature_name)
|
139
|
+
end
|
140
|
+
|
141
|
+
def apply_pandas_categorical(data, categorical_feature, pandas_categorical)
|
142
|
+
(categorical_feature || []).each_with_index do |cf, i|
|
143
|
+
cat_codes = pandas_categorical[i].map.with_index.to_h
|
144
|
+
data.each do |r|
|
145
|
+
cat = r[cf]
|
146
|
+
unless cat.nil?
|
147
|
+
r[cf] =
|
148
|
+
cat_codes.fetch(cat) do
|
149
|
+
unless cat.is_a?(String)
|
150
|
+
raise ArgumentError, "expected categorical value"
|
151
|
+
end
|
152
|
+
nil
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
data/lib/lightgbm/model.rb
CHANGED
data/lib/lightgbm/utils.rb
CHANGED
@@ -2,8 +2,8 @@ module LightGBM
|
|
2
2
|
module Utils
|
3
3
|
private
|
4
4
|
|
5
|
-
def
|
6
|
-
raise
|
5
|
+
def safe_call(err)
|
6
|
+
raise Error, FFI.LGBM_GetLastError if err != 0
|
7
7
|
end
|
8
8
|
|
9
9
|
# remove spaces in keys and values to prevent injection
|
@@ -24,6 +24,13 @@ module LightGBM
|
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
27
|
+
def check_2d_array(data)
|
28
|
+
ncol = data.first&.size || 0
|
29
|
+
if !data.all? { |r| r.size == ncol }
|
30
|
+
raise ArgumentError, "Rows have different sizes"
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
27
34
|
# for categorical, NaN and negative value are the same
|
28
35
|
def handle_missing(data)
|
29
36
|
data.map! { |v| v.nil? ? Float::NAN : v }
|
data/lib/lightgbm/version.rb
CHANGED
data/lib/lightgbm.rb
CHANGED
@@ -1,10 +1,14 @@
|
|
1
1
|
# dependencies
|
2
2
|
require "ffi"
|
3
3
|
|
4
|
+
# stdlib
|
5
|
+
require "json"
|
6
|
+
|
4
7
|
# modules
|
5
8
|
require_relative "lightgbm/utils"
|
6
9
|
require_relative "lightgbm/booster"
|
7
10
|
require_relative "lightgbm/dataset"
|
11
|
+
require_relative "lightgbm/inner_predictor"
|
8
12
|
require_relative "lightgbm/version"
|
9
13
|
|
10
14
|
# scikit-learn API
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lightgbm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-01-05 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: ffi
|
@@ -24,7 +23,6 @@ dependencies:
|
|
24
23
|
- - ">="
|
25
24
|
- !ruby/object:Gem::Version
|
26
25
|
version: '0'
|
27
|
-
description:
|
28
26
|
email: andrew@ankane.org
|
29
27
|
executables: []
|
30
28
|
extensions: []
|
@@ -38,6 +36,7 @@ files:
|
|
38
36
|
- lib/lightgbm/classifier.rb
|
39
37
|
- lib/lightgbm/dataset.rb
|
40
38
|
- lib/lightgbm/ffi.rb
|
39
|
+
- lib/lightgbm/inner_predictor.rb
|
41
40
|
- lib/lightgbm/model.rb
|
42
41
|
- lib/lightgbm/ranker.rb
|
43
42
|
- lib/lightgbm/regressor.rb
|
@@ -53,7 +52,6 @@ homepage: https://github.com/ankane/lightgbm-ruby
|
|
53
52
|
licenses:
|
54
53
|
- MIT
|
55
54
|
metadata: {}
|
56
|
-
post_install_message:
|
57
55
|
rdoc_options: []
|
58
56
|
require_paths:
|
59
57
|
- lib
|
@@ -61,15 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
61
59
|
requirements:
|
62
60
|
- - ">="
|
63
61
|
- !ruby/object:Gem::Version
|
64
|
-
version: '3'
|
62
|
+
version: '3.1'
|
65
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
64
|
requirements:
|
67
65
|
- - ">="
|
68
66
|
- !ruby/object:Gem::Version
|
69
67
|
version: '0'
|
70
68
|
requirements: []
|
71
|
-
rubygems_version: 3.
|
72
|
-
signing_key:
|
69
|
+
rubygems_version: 3.6.2
|
73
70
|
specification_version: 4
|
74
71
|
summary: High performance gradient boosting for Ruby
|
75
72
|
test_files: []
|