lightgbm 0.3.4 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 260da517ea054bb9de47d8e1f1ee6ece133bfae733da24ab943e9387a2a99f10
4
- data.tar.gz: a9a9a94153f21a6d085ca5ce10a973b71dc81f1710d4b2afbab5c4caf3302d65
3
+ metadata.gz: 4fadfa7ea250cf7c48f076effb5ce8f5db3cf0c8ab87bb04f2033457a502721a
4
+ data.tar.gz: 3af4cac369a3c684bdb387036845eca04747c14c8e12c9b38625e5c38130de74
5
5
  SHA512:
6
- metadata.gz: a95afda36a019f80afdaeb3a55c2bfa4e8a93988f155e546f0f2b98be33ba58a295fb31cc1643c9cd95576d5fabc37a5ed28fb6a0d1b13cf3be56dab4d9d0ba0
7
- data.tar.gz: 77021a65a856a645c2715460a0285937483b46a889d18c07ce8cf097a8c9176595767c3b32d33a03e8ee6f5c785f20e8817e5ddb3e98629297e5973a8673c0a7
6
+ metadata.gz: 21fb7ae25e1f085cd3642bb02ce32f8378f5d6013f6e8504b86586c86dfaf5c29b12d83b10e3bfd747a3dbfc996eb8473c12313ca5e2f4302554b0a6c40261e3
7
+ data.tar.gz: 75d7b3cea373adedbe8a6cc7c9f0b47f473ac0e77126779efd2f5ab1899596f4a8a926e1d97b033e5cb6e43b0c0f3706e8e7eb9f280d29354ba18792e2f4078b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.4.0 (2025-01-05)
2
+
3
+ - Added support for different prediction types
4
+ - Added support for `pandas_categorical` to `predict` method
5
+ - Added support for hashes and Rover data frames to `predict` method
6
+ - Added support for hashes to `Dataset`
7
+ - Added `importance_type` option to `dump_model`, `model_to_string`, and `save_model` methods
8
+ - Changed `Dataset` to use column names for feature names with Rover and Daru
9
+ - Changed `predict` method to match feature names with Daru
10
+ - Dropped support for Ruby < 3.1
11
+
1
12
  ## 0.3.4 (2024-07-28)
2
13
 
3
14
  - Updated LightGBM to 4.5.0
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  The MIT License (MIT)
2
2
 
3
3
  Copyright (c) Microsoft Corporation
4
- Copyright (c) 2019-2023 Andrew Kane
4
+ Copyright (c) 2019-2025 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
data/lib/lightgbm/booster.rb CHANGED
@@ -1,20 +1,29 @@
1
1
  module LightGBM
2
2
  class Booster
3
- attr_accessor :best_iteration, :train_data_name
3
+ include Utils
4
+
5
+ attr_accessor :best_iteration, :train_data_name, :params
4
6
 
5
7
  def initialize(params: nil, train_set: nil, model_file: nil, model_str: nil)
6
- @handle = ::FFI::MemoryPointer.new(:pointer)
7
8
  if model_str
8
9
  model_from_string(model_str)
9
10
  elsif model_file
10
11
  out_num_iterations = ::FFI::MemoryPointer.new(:int)
11
- check_result FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, @handle)
12
+ create_handle do |handle|
13
+ safe_call FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, handle)
14
+ end
15
+ @pandas_categorical = load_pandas_categorical(file_name: model_file)
16
+ if params
17
+ warn "[lightgbm] Ignoring params argument, using parameters from model file."
18
+ end
19
+ @params = loaded_param
12
20
  else
13
21
  params ||= {}
14
22
  set_verbosity(params)
15
- check_result FFI.LGBM_BoosterCreate(train_set.handle_pointer, params_str(params), @handle)
23
+ create_handle do |handle|
24
+ safe_call FFI.LGBM_BoosterCreate(train_set.handle, params_str(params), handle)
25
+ end
16
26
  end
17
- ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))
18
27
 
19
28
  self.best_iteration = -1
20
29
 
@@ -23,28 +32,28 @@ module LightGBM
23
32
  end
24
33
 
25
34
  def add_valid(data, name)
26
- check_result FFI.LGBM_BoosterAddValidData(handle_pointer, data.handle_pointer)
35
+ safe_call FFI.LGBM_BoosterAddValidData(@handle, data.handle)
27
36
  @name_valid_sets << name
28
37
  self # consistent with Python API
29
38
  end
30
39
 
31
40
  def current_iteration
32
41
  out = ::FFI::MemoryPointer.new(:int)
33
- check_result FFI.LGBM_BoosterGetCurrentIteration(handle_pointer, out)
42
+ safe_call FFI.LGBM_BoosterGetCurrentIteration(@handle, out)
34
43
  out.read_int
35
44
  end
36
45
 
37
- def dump_model(num_iteration: nil, start_iteration: 0)
46
+ def dump_model(num_iteration: nil, start_iteration: 0, importance_type: "split")
38
47
  num_iteration ||= best_iteration
48
+ importance_type_int = feature_importance_type_mapper(importance_type)
39
49
  buffer_len = 1 << 20
40
50
  out_len = ::FFI::MemoryPointer.new(:int64)
41
51
  out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
42
- feature_importance_type = 0 # TODO add option
43
- check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, feature_importance_type, buffer_len, out_len, out_str)
44
- actual_len = read_int64(out_len)
52
+ safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
53
+ actual_len = out_len.read_int64
45
54
  if actual_len > buffer_len
46
55
  out_str = ::FFI::MemoryPointer.new(:char, actual_len)
47
- check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, feature_importance_type, actual_len, out_len, out_str)
56
+ safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
48
57
  end
49
58
  out_str.read_string
50
59
  end
@@ -60,19 +69,10 @@ module LightGBM
60
69
 
61
70
  def feature_importance(iteration: nil, importance_type: "split")
62
71
  iteration ||= best_iteration
63
- importance_type =
64
- case importance_type
65
- when "split"
66
- 0
67
- when "gain"
68
- 1
69
- else
70
- -1
71
- end
72
-
72
+ importance_type_int = feature_importance_type_mapper(importance_type)
73
73
  num_feature = self.num_feature
74
74
  out_result = ::FFI::MemoryPointer.new(:double, num_feature)
75
- check_result FFI.LGBM_BoosterFeatureImportance(handle_pointer, iteration, importance_type, out_result)
75
+ safe_call FFI.LGBM_BoosterFeatureImportance(@handle, iteration, importance_type_int, out_result)
76
76
  out_result.read_array_of_double(num_feature).map(&:to_i)
77
77
  end
78
78
 
@@ -84,13 +84,13 @@ module LightGBM
84
84
  out_strs = ::FFI::MemoryPointer.new(:pointer, num_feature)
85
85
  str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
86
86
  out_strs.write_array_of_pointer(str_ptrs)
87
- check_result FFI.LGBM_BoosterGetFeatureNames(handle_pointer, len, out_len, buffer_len, out_buffer_len, out_strs)
87
+ safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, buffer_len, out_buffer_len, out_strs)
88
88
 
89
89
  actual_len = out_buffer_len.read(:size_t)
90
90
  if actual_len > buffer_len
91
91
  str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
92
92
  out_strs.write_array_of_pointer(str_ptrs)
93
- check_result FFI.LGBM_BoosterGetFeatureNames(handle_pointer, len, out_len, actual_len, out_buffer_len, out_strs)
93
+ safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, actual_len, out_buffer_len, out_strs)
94
94
  end
95
95
 
96
96
  str_ptrs[0, out_len.read(:size_t)].map(&:read_string)
@@ -98,130 +98,122 @@ module LightGBM
98
98
 
99
99
  def model_from_string(model_str)
100
100
  out_num_iterations = ::FFI::MemoryPointer.new(:int)
101
- check_result FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, @handle)
101
+ create_handle do |handle|
102
+ safe_call FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, handle)
103
+ end
104
+ @pandas_categorical = load_pandas_categorical(model_str: model_str)
105
+ @params = loaded_param
106
+ @cached_feature_name = nil
102
107
  self
103
108
  end
104
109
 
105
- def model_to_string(num_iteration: nil, start_iteration: 0)
110
+ def model_to_string(num_iteration: nil, start_iteration: 0, importance_type: "split")
106
111
  num_iteration ||= best_iteration
112
+ importance_type_int = feature_importance_type_mapper(importance_type)
107
113
  buffer_len = 1 << 20
108
114
  out_len = ::FFI::MemoryPointer.new(:int64)
109
115
  out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
110
- feature_importance_type = 0 # TODO add option
111
- check_result FFI.LGBM_BoosterSaveModelToString(handle_pointer, start_iteration, num_iteration, feature_importance_type, buffer_len, out_len, out_str)
112
- actual_len = read_int64(out_len)
116
+ safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
117
+ actual_len = out_len.read_int64
113
118
  if actual_len > buffer_len
114
119
  out_str = ::FFI::MemoryPointer.new(:char, actual_len)
115
- check_result FFI.LGBM_BoosterSaveModelToString(handle_pointer, start_iteration, num_iteration, feature_importance_type, actual_len, out_len, out_str)
120
+ safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
116
121
  end
117
122
  out_str.read_string
118
123
  end
119
124
 
120
125
  def num_feature
121
126
  out = ::FFI::MemoryPointer.new(:int)
122
- check_result FFI.LGBM_BoosterGetNumFeature(handle_pointer, out)
127
+ safe_call FFI.LGBM_BoosterGetNumFeature(@handle, out)
123
128
  out.read_int
124
129
  end
125
130
  alias_method :num_features, :num_feature # legacy typo
126
131
 
127
132
  def num_model_per_iteration
128
133
  out = ::FFI::MemoryPointer.new(:int)
129
- check_result FFI.LGBM_BoosterNumModelPerIteration(handle_pointer, out)
134
+ safe_call FFI.LGBM_BoosterNumModelPerIteration(@handle, out)
130
135
  out.read_int
131
136
  end
132
137
 
133
138
  def num_trees
134
139
  out = ::FFI::MemoryPointer.new(:int)
135
- check_result FFI.LGBM_BoosterNumberOfTotalModel(handle_pointer, out)
140
+ safe_call FFI.LGBM_BoosterNumberOfTotalModel(@handle, out)
136
141
  out.read_int
137
142
  end
138
143
 
139
- # TODO support different prediction types
140
- def predict(input, start_iteration: nil, num_iteration: nil, **params)
141
- input =
142
- if daru?(input)
143
- input.map_rows(&:to_a)
144
+ def predict(data, start_iteration: 0, num_iteration: nil, raw_score: false, pred_leaf: false, pred_contrib: false, **kwargs)
145
+ predictor = InnerPredictor.from_booster(self, kwargs.transform_values(&:dup))
146
+ if num_iteration.nil?
147
+ if start_iteration <= 0
148
+ num_iteration = best_iteration
144
149
  else
145
- input.to_a
150
+ num_iteration = -1
146
151
  end
147
-
148
- singular = !input.first.is_a?(Array)
149
- input = [input] if singular
150
-
151
- start_iteration ||= 0
152
- num_iteration ||= best_iteration
153
- num_class ||= num_class()
154
-
155
- flat_input = input.flatten
156
- handle_missing(flat_input)
157
- data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
158
- data.write_array_of_double(flat_input)
159
-
160
- out_len = ::FFI::MemoryPointer.new(:int64)
161
- out_result = ::FFI::MemoryPointer.new(:double, num_class * input.count)
162
- check_result FFI.LGBM_BoosterPredictForMat(handle_pointer, data, 1, input.count, input.first.count, 1, 0, start_iteration, num_iteration, params_str(params), out_len, out_result)
163
- out = out_result.read_array_of_double(read_int64(out_len))
164
- out = out.each_slice(num_class).to_a if num_class > 1
165
-
166
- singular ? out.first : out
152
+ end
153
+ predictor.predict(
154
+ data,
155
+ start_iteration: start_iteration,
156
+ num_iteration: num_iteration,
157
+ raw_score: raw_score,
158
+ pred_leaf: pred_leaf,
159
+ pred_contrib: pred_contrib
160
+ )
167
161
  end
168
162
 
169
- def save_model(filename, num_iteration: nil, start_iteration: 0)
163
+ def save_model(filename, num_iteration: nil, start_iteration: 0, importance_type: "split")
170
164
  num_iteration ||= best_iteration
171
- feature_importance_type = 0 # TODO add
172
- check_result FFI.LGBM_BoosterSaveModel(handle_pointer, start_iteration, num_iteration, feature_importance_type, filename)
165
+ importance_type_int = feature_importance_type_mapper(importance_type)
166
+ safe_call FFI.LGBM_BoosterSaveModel(@handle, start_iteration, num_iteration, importance_type_int, filename)
173
167
  self # consistent with Python API
174
168
  end
175
169
 
176
170
  def update
177
171
  finished = ::FFI::MemoryPointer.new(:int)
178
- check_result FFI.LGBM_BoosterUpdateOneIter(handle_pointer, finished)
172
+ safe_call FFI.LGBM_BoosterUpdateOneIter(@handle, finished)
179
173
  finished.read_int == 1
180
174
  end
181
175
 
182
- def self.finalize(addr)
183
- # must use proc instead of stabby lambda
184
- proc { FFI.LGBM_BoosterFree(::FFI::Pointer.new(:pointer, addr)) }
185
- end
186
-
187
176
  private
188
177
 
189
- def handle_pointer
190
- @handle.read_pointer
178
+ def create_handle
179
+ ::FFI::MemoryPointer.new(:pointer) do |handle|
180
+ yield handle
181
+ @handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_BoosterFree))
182
+ end
191
183
  end
192
184
 
193
185
  def eval_counts
194
186
  out = ::FFI::MemoryPointer.new(:int)
195
- check_result FFI.LGBM_BoosterGetEvalCounts(handle_pointer, out)
187
+ safe_call FFI.LGBM_BoosterGetEvalCounts(@handle, out)
196
188
  out.read_int
197
189
  end
198
190
 
199
191
  def eval_names
200
- eval_counts ||= eval_counts()
192
+ eval_counts = self.eval_counts
201
193
  out_len = ::FFI::MemoryPointer.new(:int)
202
194
  out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
203
195
  out_strs = ::FFI::MemoryPointer.new(:pointer, eval_counts)
204
196
  buffer_len = 255
205
197
  str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
206
198
  out_strs.write_array_of_pointer(str_ptrs)
207
- check_result FFI.LGBM_BoosterGetEvalNames(handle_pointer, eval_counts, out_len, buffer_len, out_buffer_len, out_strs)
199
+ safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, buffer_len, out_buffer_len, out_strs)
208
200
 
209
201
  actual_len = out_buffer_len.read(:size_t)
210
202
  if actual_len > buffer_len
211
203
  str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
212
204
  out_strs.write_array_of_pointer(str_ptrs)
213
- check_result FFI.LGBM_BoosterGetEvalNames(handle_pointer, eval_counts, out_len, actual_len, out_buffer_len, out_strs)
205
+ safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, actual_len, out_buffer_len, out_strs)
214
206
  end
215
207
 
216
208
  str_ptrs.map(&:read_string)
217
209
  end
218
210
 
219
211
  def inner_eval(name, i)
220
- eval_names ||= eval_names()
212
+ eval_names = self.eval_names
221
213
 
222
214
  out_len = ::FFI::MemoryPointer.new(:int)
223
215
  out_results = ::FFI::MemoryPointer.new(:double, eval_names.count)
224
- check_result FFI.LGBM_BoosterGetEval(handle_pointer, i, out_len, out_results)
216
+ safe_call FFI.LGBM_BoosterGetEval(@handle, i, out_len, out_results)
225
217
  vals = out_results.read_array_of_double(out_len.read_int)
226
218
 
227
219
  eval_names.zip(vals).map do |eval_name, val|
@@ -232,15 +224,66 @@ module LightGBM
232
224
 
233
225
  def num_class
234
226
  out = ::FFI::MemoryPointer.new(:int)
235
- check_result FFI.LGBM_BoosterGetNumClasses(handle_pointer, out)
227
+ safe_call FFI.LGBM_BoosterGetNumClasses(@handle, out)
236
228
  out.read_int
237
229
  end
238
230
 
239
- # read_int64 not available on JRuby
240
- def read_int64(ptr)
241
- ptr.read_array_of_int64(1).first
231
+ def cached_feature_name
232
+ @cached_feature_name ||= feature_name
242
233
  end
243
234
 
244
- include Utils
235
+ def feature_importance_type_mapper(importance_type)
236
+ case importance_type
237
+ when "split"
238
+ FFI::C_API_FEATURE_IMPORTANCE_SPLIT
239
+ when "gain"
240
+ FFI::C_API_FEATURE_IMPORTANCE_GAIN
241
+ else
242
+ -1
243
+ end
244
+ end
245
+
246
+ def load_pandas_categorical(file_name: nil, model_str: nil)
247
+ pandas_key = "pandas_categorical:"
248
+ offset = -pandas_key.length
249
+ if !file_name.nil?
250
+ max_offset = -File.size(file_name)
251
+ lines = []
252
+ File.open(file_name, "rb") do |f|
253
+ loop do
254
+ offset = [offset, max_offset].max
255
+ f.seek(offset, IO::SEEK_END)
256
+ lines = f.readlines
257
+ if lines.length >= 2 || offset == max_offset
258
+ break
259
+ end
260
+ offset *= 2
261
+ end
262
+ end
263
+ last_line = lines[-1].strip
264
+ if !last_line.start_with?(pandas_key)
265
+ last_line = lines[-2].strip
266
+ end
267
+ elsif !model_str.nil?
268
+ idx = model_str[..offset].rindex("\n")
269
+ last_line = model_str[idx..].strip
270
+ end
271
+ if last_line.start_with?(pandas_key)
272
+ JSON.parse(last_line[pandas_key.length..])
273
+ end
274
+ end
275
+
276
+ def loaded_param
277
+ buffer_len = 1 << 20
278
+ out_len = ::FFI::MemoryPointer.new(:int64)
279
+ out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
280
+ safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, buffer_len, out_len, out_str)
281
+ actual_len = out_len.read_int64
282
+ if actual_len > buffer_len
283
+ out_str = ::FFI::MemoryPointer.new(:char, actual_len)
284
+ safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, actual_len, out_len, out_str)
285
+ end
286
+ JSON.parse(out_str.read_string)
287
+ end
245
288
  end
246
289
  end
data/lib/lightgbm/dataset.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  module LightGBM
2
2
  class Dataset
3
+ include Utils
4
+
3
5
  attr_reader :data, :params
4
6
 
5
- def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_names: nil)
7
+ def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_name: nil, feature_names: nil)
6
8
  @data = data
7
9
  @label = label
8
10
  @weight = weight
@@ -11,7 +13,7 @@ module LightGBM
11
13
  @reference = reference
12
14
  @used_indices = used_indices
13
15
  @categorical_feature = categorical_feature
14
- @feature_names = feature_names
16
+ @feature_name = feature_name || feature_names || "auto"
15
17
 
16
18
  construct
17
19
  end
@@ -24,7 +26,7 @@ module LightGBM
24
26
  field("weight")
25
27
  end
26
28
 
27
- def feature_names
29
+ def feature_name
28
30
  # must preallocate space
29
31
  num_feature_names = ::FFI::MemoryPointer.new(:int)
30
32
  out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
@@ -33,7 +35,7 @@ module LightGBM
33
35
  buffer_len = 255
34
36
  str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
35
37
  out_strs.write_array_of_pointer(str_ptrs)
36
- check_result FFI.LGBM_DatasetGetFeatureNames(handle_pointer, len, num_feature_names, buffer_len, out_buffer_len, out_strs)
38
+ safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, len, num_feature_names, buffer_len, out_buffer_len, out_strs)
37
39
 
38
40
  num_features = num_feature_names.read_int
39
41
  actual_len = out_buffer_len.read(:size_t)
@@ -41,13 +43,14 @@ module LightGBM
41
43
  out_strs = ::FFI::MemoryPointer.new(:pointer, num_features) if num_features > len
42
44
  str_ptrs = num_features.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
43
45
  out_strs.write_array_of_pointer(str_ptrs)
44
- check_result FFI.LGBM_DatasetGetFeatureNames(handle_pointer, num_features, num_feature_names, actual_len, out_buffer_len, out_strs)
46
+ safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, num_features, num_feature_names, actual_len, out_buffer_len, out_strs)
45
47
  end
46
48
 
47
49
  # should be the same, but get number of features
48
50
  # from most recent call (instead of num_features)
49
51
  str_ptrs[0, num_feature_names.read_int].map(&:read_string)
50
52
  end
53
+ alias_method :feature_names, :feature_name
51
54
 
52
55
  def label=(label)
53
56
  @label = label
@@ -64,12 +67,16 @@ module LightGBM
64
67
  set_field("group", group, type: :int32)
65
68
  end
66
69
 
67
- def feature_names=(feature_names)
70
+ def feature_name=(feature_names)
71
+ feature_names = feature_names.map(&:to_s)
68
72
  @feature_names = feature_names
69
73
  c_feature_names = ::FFI::MemoryPointer.new(:pointer, feature_names.size)
70
- c_feature_names.write_array_of_pointer(feature_names.map { |v| ::FFI::MemoryPointer.from_string(v) })
71
- check_result FFI.LGBM_DatasetSetFeatureNames(handle_pointer, c_feature_names, feature_names.size)
74
+ # keep reference to string pointers
75
+ str_ptrs = feature_names.map { |v| ::FFI::MemoryPointer.from_string(v) }
76
+ c_feature_names.write_array_of_pointer(str_ptrs)
77
+ safe_call FFI.LGBM_DatasetSetFeatureNames(@handle, c_feature_names, feature_names.size)
72
78
  end
79
+ alias_method :feature_names=, :feature_name=
73
80
 
74
81
  # TODO only update reference if not in chain
75
82
  def reference=(reference)
@@ -81,18 +88,18 @@ module LightGBM
81
88
 
82
89
  def num_data
83
90
  out = ::FFI::MemoryPointer.new(:int)
84
- check_result FFI.LGBM_DatasetGetNumData(handle_pointer, out)
91
+ safe_call FFI.LGBM_DatasetGetNumData(@handle, out)
85
92
  out.read_int
86
93
  end
87
94
 
88
95
  def num_feature
89
96
  out = ::FFI::MemoryPointer.new(:int)
90
- check_result FFI.LGBM_DatasetGetNumFeature(handle_pointer, out)
97
+ safe_call FFI.LGBM_DatasetGetNumFeature(@handle, out)
91
98
  out.read_int
92
99
  end
93
100
 
94
101
  def save_binary(filename)
95
- check_result FFI.LGBM_DatasetSaveBinary(handle_pointer, filename)
102
+ safe_call FFI.LGBM_DatasetSaveBinary(@handle, filename)
96
103
  end
97
104
 
98
105
  def subset(used_indices, params: nil)
@@ -105,13 +112,8 @@ module LightGBM
105
112
  )
106
113
  end
107
114
 
108
- def handle_pointer
109
- @handle.read_pointer
110
- end
111
-
112
- def self.finalize(addr)
113
- # must use proc instead of stabby lambda
114
- proc { FFI.LGBM_DatasetFree(::FFI::Pointer.new(:pointer, addr)) }
115
+ def handle
116
+ @handle
115
117
  end
116
118
 
117
119
  private
@@ -127,27 +129,45 @@ module LightGBM
127
129
  end
128
130
  set_verbosity(params)
129
131
 
130
- @handle = ::FFI::MemoryPointer.new(:pointer)
132
+ handle = ::FFI::MemoryPointer.new(:pointer)
131
133
  parameters = params_str(params)
132
- reference = @reference.handle_pointer if @reference
134
+ reference = @reference.handle if @reference
133
135
  if used_indices
134
136
  used_row_indices = ::FFI::MemoryPointer.new(:int32, used_indices.count)
135
137
  used_row_indices.write_array_of_int32(used_indices)
136
- check_result FFI.LGBM_DatasetGetSubset(reference, used_row_indices, used_indices.count, parameters, @handle)
138
+ safe_call FFI.LGBM_DatasetGetSubset(reference, used_row_indices, used_indices.count, parameters, handle)
137
139
  elsif data.is_a?(String)
138
- check_result FFI.LGBM_DatasetCreateFromFile(data, parameters, reference, @handle)
140
+ safe_call FFI.LGBM_DatasetCreateFromFile(data, parameters, reference, handle)
139
141
  else
140
142
  if matrix?(data)
141
143
  nrow = data.row_count
142
144
  ncol = data.column_count
143
145
  flat_data = data.to_a.flatten
144
146
  elsif daru?(data)
147
+ if @feature_name == "auto"
148
+ @feature_name = data.vectors.to_a
149
+ end
145
150
  nrow, ncol = data.shape
146
151
  flat_data = data.map_rows(&:to_a).flatten
147
- elsif numo?(data) || rover?(data)
148
- data = data.to_numo if rover?(data)
152
+ elsif numo?(data)
153
+ nrow, ncol = data.shape
154
+ elsif rover?(data)
155
+ if @feature_name == "auto"
156
+ @feature_name = data.keys
157
+ end
158
+ data = data.to_numo
149
159
  nrow, ncol = data.shape
160
+ elsif data.is_a?(Array) && data.first.is_a?(Hash)
161
+ keys = data.first.keys
162
+ if @feature_name == "auto"
163
+ @feature_name = keys
164
+ end
165
+ nrow = data.count
166
+ ncol = data.first.count
167
+ flat_data = data.flat_map { |v| v.fetch_values(*keys) }
150
168
  else
169
+ data = data.to_a
170
+ check_2d_array(data)
151
171
  nrow = data.count
152
172
  ncol = data.first.count
153
173
  flat_data = data.flatten
@@ -161,18 +181,22 @@ module LightGBM
161
181
  c_data.write_array_of_double(flat_data)
162
182
  end
163
183
 
164
- check_result FFI.LGBM_DatasetCreateFromMat(c_data, 1, nrow, ncol, 1, parameters, reference, @handle)
184
+ safe_call FFI.LGBM_DatasetCreateFromMat(c_data, FFI::C_API_DTYPE_FLOAT64, nrow, ncol, 1, parameters, reference, handle)
185
+ end
186
+ if used_indices
187
+ @handle = handle.read_pointer
188
+ else
189
+ @handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_DatasetFree))
165
190
  end
166
- ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i)) unless used_indices
167
191
 
168
192
  self.label = @label if @label
169
193
  self.weight = @weight if @weight
170
194
  self.group = @group if @group
171
- self.feature_names = @feature_names if @feature_names
195
+ self.feature_name = @feature_name if @feature_name && @feature_name != "auto"
172
196
  end
173
197
 
174
198
  def dump_text(filename)
175
- check_result FFI.LGBM_DatasetDumpText(handle_pointer, filename)
199
+ safe_call FFI.LGBM_DatasetDumpText(@handle, filename)
176
200
  end
177
201
 
178
202
  def field(field_name)
@@ -180,7 +204,7 @@ module LightGBM
180
204
  out_len = ::FFI::MemoryPointer.new(:int)
181
205
  out_ptr = ::FFI::MemoryPointer.new(:float, num_data)
182
206
  out_type = ::FFI::MemoryPointer.new(:int)
183
- check_result FFI.LGBM_DatasetGetField(handle_pointer, field_name, out_len, out_ptr, out_type)
207
+ safe_call FFI.LGBM_DatasetGetField(@handle, field_name, out_len, out_ptr, out_type)
184
208
  out_ptr.read_pointer.read_array_of_float(num_data)
185
209
  end
186
210
 
@@ -189,14 +213,12 @@ module LightGBM
189
213
  if type == :int32
190
214
  c_data = ::FFI::MemoryPointer.new(:int32, data.count)
191
215
  c_data.write_array_of_int32(data)
192
- check_result FFI.LGBM_DatasetSetField(handle_pointer, field_name, c_data, data.count, 2)
216
+ safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 2)
193
217
  else
194
218
  c_data = ::FFI::MemoryPointer.new(:float, data.count)
195
219
  c_data.write_array_of_float(data)
196
- check_result FFI.LGBM_DatasetSetField(handle_pointer, field_name, c_data, data.count, 0)
220
+ safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 0)
197
221
  end
198
222
  end
199
-
200
- include Utils
201
223
  end
202
224
  end
data/lib/lightgbm/ffi.rb CHANGED
@@ -15,6 +15,19 @@ module LightGBM
15
15
  # https://github.com/microsoft/LightGBM/blob/master/include/LightGBM/c_api.h
16
16
  # keep same order
17
17
 
18
+ C_API_DTYPE_FLOAT32 = 0
19
+ C_API_DTYPE_FLOAT64 = 1
20
+ C_API_DTYPE_INT32 = 2
21
+ C_API_DTYPE_INT64 = 3
22
+
23
+ C_API_PREDICT_NORMAL = 0
24
+ C_API_PREDICT_RAW_SCORE = 1
25
+ C_API_PREDICT_LEAF_INDEX = 2
26
+ C_API_PREDICT_CONTRIB = 3
27
+
28
+ C_API_FEATURE_IMPORTANCE_SPLIT = 0
29
+ C_API_FEATURE_IMPORTANCE_GAIN = 1
30
+
18
31
  # error
19
32
  attach_function :LGBM_GetLastError, %i[], :string
20
33
 
@@ -36,6 +49,7 @@ module LightGBM
36
49
  attach_function :LGBM_BoosterCreate, %i[pointer string pointer], :int
37
50
  attach_function :LGBM_BoosterCreateFromModelfile, %i[string pointer pointer], :int
38
51
  attach_function :LGBM_BoosterLoadModelFromString, %i[string pointer pointer], :int
52
+ attach_function :LGBM_BoosterGetLoadedParam, %i[pointer int64 pointer pointer], :int
39
53
  attach_function :LGBM_BoosterFree, %i[pointer], :int
40
54
  attach_function :LGBM_BoosterAddValidData, %i[pointer pointer], :int
41
55
  attach_function :LGBM_BoosterGetNumClasses, %i[pointer pointer], :int
@@ -48,6 +62,7 @@ module LightGBM
48
62
  attach_function :LGBM_BoosterGetFeatureNames, %i[pointer int pointer size_t pointer pointer], :int
49
63
  attach_function :LGBM_BoosterGetNumFeature, %i[pointer pointer], :int
50
64
  attach_function :LGBM_BoosterGetEval, %i[pointer int pointer pointer], :int
65
+ attach_function :LGBM_BoosterCalcNumPredict, %i[pointer int int int int pointer], :int
51
66
  attach_function :LGBM_BoosterPredictForMat, %i[pointer pointer int int32 int32 int int int int string pointer pointer], :int
52
67
  attach_function :LGBM_BoosterSaveModel, %i[pointer int int int string], :int
53
68
  attach_function :LGBM_BoosterSaveModelToString, %i[pointer int int int int64 pointer pointer], :int
data/lib/lightgbm/inner_predictor.rb ADDED
@@ -0,0 +1,159 @@
1
+ module LightGBM
2
+ class InnerPredictor
3
+ include Utils
4
+
5
+ MAX_INT32 = (1 << 31) - 1
6
+
7
+ def initialize(booster, pred_parameter)
8
+ @handle = booster.instance_variable_get(:@handle)
9
+ @pandas_categorical = booster.instance_variable_get(:@pandas_categorical)
10
+ @pred_parameter = params_str(pred_parameter)
11
+
12
+ # keep booster for cached_feature_name
13
+ @booster = booster
14
+ end
15
+
16
+ def self.from_booster(booster, pred_parameter)
17
+ new(booster, pred_parameter)
18
+ end
19
+
20
+ def predict(data, start_iteration: 0, num_iteration: -1, raw_score: false, pred_leaf: false, pred_contrib: false)
21
+ if data.is_a?(Dataset)
22
+ raise TypeError, "Cannot use Dataset instance for prediction, please use raw data instead"
23
+ end
24
+
25
+ predict_type = FFI::C_API_PREDICT_NORMAL
26
+ if raw_score
27
+ predict_type = FFI::C_API_PREDICT_RAW_SCORE
28
+ end
29
+ if pred_leaf
30
+ predict_type = FFI::C_API_PREDICT_LEAF_INDEX
31
+ end
32
+ if pred_contrib
33
+ predict_type = FFI::C_API_PREDICT_CONTRIB
34
+ end
35
+
36
+ if daru?(data)
37
+ data = data[*cached_feature_name].map_rows(&:to_a)
38
+ singular = false
39
+ elsif data.is_a?(Hash) # sort feature.values to match the order of model.feature_name
40
+ data = [sorted_feature_values(data)]
41
+ singular = true
42
+ elsif data.is_a?(Array) && data.first.is_a?(Hash) # on multiple elems, if 1st is hash, assume they all are
43
+ data = data.map(&method(:sorted_feature_values))
44
+ singular = false
45
+ elsif rover?(data)
46
+ # TODO improve performance
47
+ data = data[cached_feature_name].to_numo.to_a
48
+ singular = false
49
+ else
50
+ data = data.to_a
51
+ singular = !data.first.is_a?(Array)
52
+ data = [data] if singular
53
+ check_2d_array(data)
54
+ data = data.map(&:dup) if @pandas_categorical&.any?
55
+ end
56
+
57
+ if @pandas_categorical&.any?
58
+ apply_pandas_categorical(
59
+ data,
60
+ @booster.params["categorical_feature"],
61
+ @pandas_categorical
62
+ )
63
+ end
64
+
65
+ preds, nrow =
66
+ pred_for_array(
67
+ data,
68
+ start_iteration,
69
+ num_iteration,
70
+ predict_type
71
+ )
72
+
73
+ if pred_leaf
74
+ preds = preds.map(&:to_i)
75
+ end
76
+
77
+ if preds.size != nrow
78
+ if preds.size % nrow == 0
79
+ preds = preds.each_slice(preds.size / nrow).to_a
80
+ else
81
+ raise Error, "Length of predict result (#{preds.size}) cannot be divide nrow (#{nrow})"
82
+ end
83
+ end
84
+
85
+ singular ? preds.first : preds
86
+ end
87
+
88
+ private
89
+
90
+ def pred_for_array(input, start_iteration, num_iteration, predict_type)
91
+ nrow = input.count
92
+ if nrow > MAX_INT32
93
+ raise Error, "Not supported"
94
+ end
95
+ inner_predict_array(
96
+ input,
97
+ start_iteration,
98
+ num_iteration,
99
+ predict_type
100
+ )
101
+ end
102
+
103
+ def inner_predict_array(input, start_iteration, num_iteration, predict_type)
104
+ n_preds =
105
+ num_preds(
106
+ start_iteration,
107
+ num_iteration,
108
+ input.count,
109
+ predict_type
110
+ )
111
+
112
+ flat_input = input.flatten
113
+ handle_missing(flat_input)
114
+ data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
115
+ data.write_array_of_double(flat_input)
116
+
117
+ out_num_preds = ::FFI::MemoryPointer.new(:int64)
118
+ out_result = ::FFI::MemoryPointer.new(:double, n_preds)
119
+ safe_call FFI.LGBM_BoosterPredictForMat(@handle, data, FFI::C_API_DTYPE_FLOAT64, input.count, input.first.count, 1, predict_type, start_iteration, num_iteration, @pred_parameter, out_num_preds, out_result)
120
+ if n_preds != out_num_preds.read_int64
121
+ raise Error, "Wrong length for predict results"
122
+ end
123
+ preds = out_result.read_array_of_double(out_num_preds.read_int64)
124
+ [preds, input.count]
125
+ end
126
+
127
+ def num_preds(start_iteration, num_iteration, nrow, predict_type)
128
+ out = ::FFI::MemoryPointer.new(:int64)
129
+ safe_call FFI.LGBM_BoosterCalcNumPredict(@handle, nrow, predict_type, start_iteration, num_iteration, out)
130
+ out.read_int64
131
+ end
132
+
133
+ def sorted_feature_values(input_hash)
134
+ input_hash.transform_keys(&:to_s).fetch_values(*cached_feature_name)
135
+ end
136
+
137
+ def cached_feature_name
138
+ @booster.send(:cached_feature_name)
139
+ end
140
+
141
+ def apply_pandas_categorical(data, categorical_feature, pandas_categorical)
142
+ (categorical_feature || []).each_with_index do |cf, i|
143
+ cat_codes = pandas_categorical[i].map.with_index.to_h
144
+ data.each do |r|
145
+ cat = r[cf]
146
+ unless cat.nil?
147
+ r[cf] =
148
+ cat_codes.fetch(cat) do
149
+ unless cat.is_a?(String)
150
+ raise ArgumentError, "expected categorical value"
151
+ end
152
+ nil
153
+ end
154
+ end
155
+ end
156
+ end
157
+ end
158
+ end
159
+ end
data/lib/lightgbm/model.rb CHANGED
@@ -16,7 +16,7 @@ module LightGBM
16
16
  end
17
17
 
18
18
  def load_model(fname)
19
- @booster = Booster.new(params: @params, model_file: fname)
19
+ @booster = Booster.new(model_file: fname)
20
20
  end
21
21
 
22
22
  def best_iteration
data/lib/lightgbm/utils.rb CHANGED
@@ -2,8 +2,8 @@ module LightGBM
2
2
  module Utils
3
3
  private
4
4
 
5
- def check_result(err)
6
- raise LightGBM::Error, FFI.LGBM_GetLastError if err != 0
5
+ def safe_call(err)
6
+ raise Error, FFI.LGBM_GetLastError if err != 0
7
7
  end
8
8
 
9
9
  # remove spaces in keys and values to prevent injection
@@ -24,6 +24,13 @@ module LightGBM
24
24
  end
25
25
  end
26
26
 
27
+ def check_2d_array(data)
28
+ ncol = data.first&.size || 0
29
+ if !data.all? { |r| r.size == ncol }
30
+ raise ArgumentError, "Rows have different sizes"
31
+ end
32
+ end
33
+
27
34
  # for categorical, NaN and negative value are the same
28
35
  def handle_missing(data)
29
36
  data.map! { |v| v.nil? ? Float::NAN : v }
data/lib/lightgbm/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module LightGBM
2
- VERSION = "0.3.4"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/lightgbm.rb CHANGED
@@ -1,10 +1,14 @@
1
1
  # dependencies
2
2
  require "ffi"
3
3
 
4
+ # stdlib
5
+ require "json"
6
+
4
7
  # modules
5
8
  require_relative "lightgbm/utils"
6
9
  require_relative "lightgbm/booster"
7
10
  require_relative "lightgbm/dataset"
11
+ require_relative "lightgbm/inner_predictor"
8
12
  require_relative "lightgbm/version"
9
13
 
10
14
  # scikit-learn API
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lightgbm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-07-28 00:00:00.000000000 Z
10
+ date: 2025-01-05 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: ffi
@@ -24,7 +23,6 @@ dependencies:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
25
  version: '0'
27
- description:
28
26
  email: andrew@ankane.org
29
27
  executables: []
30
28
  extensions: []
@@ -38,6 +36,7 @@ files:
38
36
  - lib/lightgbm/classifier.rb
39
37
  - lib/lightgbm/dataset.rb
40
38
  - lib/lightgbm/ffi.rb
39
+ - lib/lightgbm/inner_predictor.rb
41
40
  - lib/lightgbm/model.rb
42
41
  - lib/lightgbm/ranker.rb
43
42
  - lib/lightgbm/regressor.rb
@@ -53,7 +52,6 @@ homepage: https://github.com/ankane/lightgbm-ruby
53
52
  licenses:
54
53
  - MIT
55
54
  metadata: {}
56
- post_install_message:
57
55
  rdoc_options: []
58
56
  require_paths:
59
57
  - lib
@@ -61,15 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
61
59
  requirements:
62
60
  - - ">="
63
61
  - !ruby/object:Gem::Version
64
- version: '3'
62
+ version: '3.1'
65
63
  required_rubygems_version: !ruby/object:Gem::Requirement
66
64
  requirements:
67
65
  - - ">="
68
66
  - !ruby/object:Gem::Version
69
67
  version: '0'
70
68
  requirements: []
71
- rubygems_version: 3.5.11
72
- signing_key:
69
+ rubygems_version: 3.6.2
73
70
  specification_version: 4
74
71
  summary: High performance gradient boosting for Ruby
75
72
  test_files: []