lightgbm 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 260da517ea054bb9de47d8e1f1ee6ece133bfae733da24ab943e9387a2a99f10
4
- data.tar.gz: a9a9a94153f21a6d085ca5ce10a973b71dc81f1710d4b2afbab5c4caf3302d65
3
+ metadata.gz: 4fadfa7ea250cf7c48f076effb5ce8f5db3cf0c8ab87bb04f2033457a502721a
4
+ data.tar.gz: 3af4cac369a3c684bdb387036845eca04747c14c8e12c9b38625e5c38130de74
5
5
  SHA512:
6
- metadata.gz: a95afda36a019f80afdaeb3a55c2bfa4e8a93988f155e546f0f2b98be33ba58a295fb31cc1643c9cd95576d5fabc37a5ed28fb6a0d1b13cf3be56dab4d9d0ba0
7
- data.tar.gz: 77021a65a856a645c2715460a0285937483b46a889d18c07ce8cf097a8c9176595767c3b32d33a03e8ee6f5c785f20e8817e5ddb3e98629297e5973a8673c0a7
6
+ metadata.gz: 21fb7ae25e1f085cd3642bb02ce32f8378f5d6013f6e8504b86586c86dfaf5c29b12d83b10e3bfd747a3dbfc996eb8473c12313ca5e2f4302554b0a6c40261e3
7
+ data.tar.gz: 75d7b3cea373adedbe8a6cc7c9f0b47f473ac0e77126779efd2f5ab1899596f4a8a926e1d97b033e5cb6e43b0c0f3706e8e7eb9f280d29354ba18792e2f4078b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.4.0 (2025-01-05)
2
+
3
+ - Added support for different prediction types
4
+ - Added support for `pandas_categorical` to `predict` method
5
+ - Added support for hashes and Rover data frames to `predict` method
6
+ - Added support for hashes to `Dataset`
7
+ - Added `importance_type` option to `dump_model`, `model_to_string`, and `save_model` methods
8
+ - Changed `Dataset` to use column names for feature names with Rover and Daru
9
+ - Changed `predict` method to match feature names with Daru
10
+ - Dropped support for Ruby < 3.1
11
+
1
12
  ## 0.3.4 (2024-07-28)
2
13
 
3
14
  - Updated LightGBM to 4.5.0
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  The MIT License (MIT)
2
2
 
3
3
  Copyright (c) Microsoft Corporation
4
- Copyright (c) 2019-2023 Andrew Kane
4
+ Copyright (c) 2019-2025 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
data/lib/lightgbm/booster.rb CHANGED
@@ -1,20 +1,29 @@
1
1
  module LightGBM
2
2
  class Booster
3
- attr_accessor :best_iteration, :train_data_name
3
+ include Utils
4
+
5
+ attr_accessor :best_iteration, :train_data_name, :params
4
6
 
5
7
  def initialize(params: nil, train_set: nil, model_file: nil, model_str: nil)
6
- @handle = ::FFI::MemoryPointer.new(:pointer)
7
8
  if model_str
8
9
  model_from_string(model_str)
9
10
  elsif model_file
10
11
  out_num_iterations = ::FFI::MemoryPointer.new(:int)
11
- check_result FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, @handle)
12
+ create_handle do |handle|
13
+ safe_call FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, handle)
14
+ end
15
+ @pandas_categorical = load_pandas_categorical(file_name: model_file)
16
+ if params
17
+ warn "[lightgbm] Ignoring params argument, using parameters from model file."
18
+ end
19
+ @params = loaded_param
12
20
  else
13
21
  params ||= {}
14
22
  set_verbosity(params)
15
- check_result FFI.LGBM_BoosterCreate(train_set.handle_pointer, params_str(params), @handle)
23
+ create_handle do |handle|
24
+ safe_call FFI.LGBM_BoosterCreate(train_set.handle, params_str(params), handle)
25
+ end
16
26
  end
17
- ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))
18
27
 
19
28
  self.best_iteration = -1
20
29
 
@@ -23,28 +32,28 @@ module LightGBM
23
32
  end
24
33
 
25
34
  def add_valid(data, name)
26
- check_result FFI.LGBM_BoosterAddValidData(handle_pointer, data.handle_pointer)
35
+ safe_call FFI.LGBM_BoosterAddValidData(@handle, data.handle)
27
36
  @name_valid_sets << name
28
37
  self # consistent with Python API
29
38
  end
30
39
 
31
40
  def current_iteration
32
41
  out = ::FFI::MemoryPointer.new(:int)
33
- check_result FFI.LGBM_BoosterGetCurrentIteration(handle_pointer, out)
42
+ safe_call FFI.LGBM_BoosterGetCurrentIteration(@handle, out)
34
43
  out.read_int
35
44
  end
36
45
 
37
- def dump_model(num_iteration: nil, start_iteration: 0)
46
+ def dump_model(num_iteration: nil, start_iteration: 0, importance_type: "split")
38
47
  num_iteration ||= best_iteration
48
+ importance_type_int = feature_importance_type_mapper(importance_type)
39
49
  buffer_len = 1 << 20
40
50
  out_len = ::FFI::MemoryPointer.new(:int64)
41
51
  out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
42
- feature_importance_type = 0 # TODO add option
43
- check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, feature_importance_type, buffer_len, out_len, out_str)
44
- actual_len = read_int64(out_len)
52
+ safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
53
+ actual_len = out_len.read_int64
45
54
  if actual_len > buffer_len
46
55
  out_str = ::FFI::MemoryPointer.new(:char, actual_len)
47
- check_result FFI.LGBM_BoosterDumpModel(handle_pointer, start_iteration, num_iteration, feature_importance_type, actual_len, out_len, out_str)
56
+ safe_call FFI.LGBM_BoosterDumpModel(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
48
57
  end
49
58
  out_str.read_string
50
59
  end
@@ -60,19 +69,10 @@ module LightGBM
60
69
 
61
70
  def feature_importance(iteration: nil, importance_type: "split")
62
71
  iteration ||= best_iteration
63
- importance_type =
64
- case importance_type
65
- when "split"
66
- 0
67
- when "gain"
68
- 1
69
- else
70
- -1
71
- end
72
-
72
+ importance_type_int = feature_importance_type_mapper(importance_type)
73
73
  num_feature = self.num_feature
74
74
  out_result = ::FFI::MemoryPointer.new(:double, num_feature)
75
- check_result FFI.LGBM_BoosterFeatureImportance(handle_pointer, iteration, importance_type, out_result)
75
+ safe_call FFI.LGBM_BoosterFeatureImportance(@handle, iteration, importance_type_int, out_result)
76
76
  out_result.read_array_of_double(num_feature).map(&:to_i)
77
77
  end
78
78
 
@@ -84,13 +84,13 @@ module LightGBM
84
84
  out_strs = ::FFI::MemoryPointer.new(:pointer, num_feature)
85
85
  str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
86
86
  out_strs.write_array_of_pointer(str_ptrs)
87
- check_result FFI.LGBM_BoosterGetFeatureNames(handle_pointer, len, out_len, buffer_len, out_buffer_len, out_strs)
87
+ safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, buffer_len, out_buffer_len, out_strs)
88
88
 
89
89
  actual_len = out_buffer_len.read(:size_t)
90
90
  if actual_len > buffer_len
91
91
  str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
92
92
  out_strs.write_array_of_pointer(str_ptrs)
93
- check_result FFI.LGBM_BoosterGetFeatureNames(handle_pointer, len, out_len, actual_len, out_buffer_len, out_strs)
93
+ safe_call FFI.LGBM_BoosterGetFeatureNames(@handle, len, out_len, actual_len, out_buffer_len, out_strs)
94
94
  end
95
95
 
96
96
  str_ptrs[0, out_len.read(:size_t)].map(&:read_string)
@@ -98,130 +98,122 @@ module LightGBM
98
98
 
99
99
  def model_from_string(model_str)
100
100
  out_num_iterations = ::FFI::MemoryPointer.new(:int)
101
- check_result FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, @handle)
101
+ create_handle do |handle|
102
+ safe_call FFI.LGBM_BoosterLoadModelFromString(model_str, out_num_iterations, handle)
103
+ end
104
+ @pandas_categorical = load_pandas_categorical(model_str: model_str)
105
+ @params = loaded_param
106
+ @cached_feature_name = nil
102
107
  self
103
108
  end
104
109
 
105
- def model_to_string(num_iteration: nil, start_iteration: 0)
110
+ def model_to_string(num_iteration: nil, start_iteration: 0, importance_type: "split")
106
111
  num_iteration ||= best_iteration
112
+ importance_type_int = feature_importance_type_mapper(importance_type)
107
113
  buffer_len = 1 << 20
108
114
  out_len = ::FFI::MemoryPointer.new(:int64)
109
115
  out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
110
- feature_importance_type = 0 # TODO add option
111
- check_result FFI.LGBM_BoosterSaveModelToString(handle_pointer, start_iteration, num_iteration, feature_importance_type, buffer_len, out_len, out_str)
112
- actual_len = read_int64(out_len)
116
+ safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, buffer_len, out_len, out_str)
117
+ actual_len = out_len.read_int64
113
118
  if actual_len > buffer_len
114
119
  out_str = ::FFI::MemoryPointer.new(:char, actual_len)
115
- check_result FFI.LGBM_BoosterSaveModelToString(handle_pointer, start_iteration, num_iteration, feature_importance_type, actual_len, out_len, out_str)
120
+ safe_call FFI.LGBM_BoosterSaveModelToString(@handle, start_iteration, num_iteration, importance_type_int, actual_len, out_len, out_str)
116
121
  end
117
122
  out_str.read_string
118
123
  end
119
124
 
120
125
  def num_feature
121
126
  out = ::FFI::MemoryPointer.new(:int)
122
- check_result FFI.LGBM_BoosterGetNumFeature(handle_pointer, out)
127
+ safe_call FFI.LGBM_BoosterGetNumFeature(@handle, out)
123
128
  out.read_int
124
129
  end
125
130
  alias_method :num_features, :num_feature # legacy typo
126
131
 
127
132
  def num_model_per_iteration
128
133
  out = ::FFI::MemoryPointer.new(:int)
129
- check_result FFI.LGBM_BoosterNumModelPerIteration(handle_pointer, out)
134
+ safe_call FFI.LGBM_BoosterNumModelPerIteration(@handle, out)
130
135
  out.read_int
131
136
  end
132
137
 
133
138
  def num_trees
134
139
  out = ::FFI::MemoryPointer.new(:int)
135
- check_result FFI.LGBM_BoosterNumberOfTotalModel(handle_pointer, out)
140
+ safe_call FFI.LGBM_BoosterNumberOfTotalModel(@handle, out)
136
141
  out.read_int
137
142
  end
138
143
 
139
- # TODO support different prediction types
140
- def predict(input, start_iteration: nil, num_iteration: nil, **params)
141
- input =
142
- if daru?(input)
143
- input.map_rows(&:to_a)
144
+ def predict(data, start_iteration: 0, num_iteration: nil, raw_score: false, pred_leaf: false, pred_contrib: false, **kwargs)
145
+ predictor = InnerPredictor.from_booster(self, kwargs.transform_values(&:dup))
146
+ if num_iteration.nil?
147
+ if start_iteration <= 0
148
+ num_iteration = best_iteration
144
149
  else
145
- input.to_a
150
+ num_iteration = -1
146
151
  end
147
-
148
- singular = !input.first.is_a?(Array)
149
- input = [input] if singular
150
-
151
- start_iteration ||= 0
152
- num_iteration ||= best_iteration
153
- num_class ||= num_class()
154
-
155
- flat_input = input.flatten
156
- handle_missing(flat_input)
157
- data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
158
- data.write_array_of_double(flat_input)
159
-
160
- out_len = ::FFI::MemoryPointer.new(:int64)
161
- out_result = ::FFI::MemoryPointer.new(:double, num_class * input.count)
162
- check_result FFI.LGBM_BoosterPredictForMat(handle_pointer, data, 1, input.count, input.first.count, 1, 0, start_iteration, num_iteration, params_str(params), out_len, out_result)
163
- out = out_result.read_array_of_double(read_int64(out_len))
164
- out = out.each_slice(num_class).to_a if num_class > 1
165
-
166
- singular ? out.first : out
152
+ end
153
+ predictor.predict(
154
+ data,
155
+ start_iteration: start_iteration,
156
+ num_iteration: num_iteration,
157
+ raw_score: raw_score,
158
+ pred_leaf: pred_leaf,
159
+ pred_contrib: pred_contrib
160
+ )
167
161
  end
168
162
 
169
- def save_model(filename, num_iteration: nil, start_iteration: 0)
163
+ def save_model(filename, num_iteration: nil, start_iteration: 0, importance_type: "split")
170
164
  num_iteration ||= best_iteration
171
- feature_importance_type = 0 # TODO add
172
- check_result FFI.LGBM_BoosterSaveModel(handle_pointer, start_iteration, num_iteration, feature_importance_type, filename)
165
+ importance_type_int = feature_importance_type_mapper(importance_type)
166
+ safe_call FFI.LGBM_BoosterSaveModel(@handle, start_iteration, num_iteration, importance_type_int, filename)
173
167
  self # consistent with Python API
174
168
  end
175
169
 
176
170
  def update
177
171
  finished = ::FFI::MemoryPointer.new(:int)
178
- check_result FFI.LGBM_BoosterUpdateOneIter(handle_pointer, finished)
172
+ safe_call FFI.LGBM_BoosterUpdateOneIter(@handle, finished)
179
173
  finished.read_int == 1
180
174
  end
181
175
 
182
- def self.finalize(addr)
183
- # must use proc instead of stabby lambda
184
- proc { FFI.LGBM_BoosterFree(::FFI::Pointer.new(:pointer, addr)) }
185
- end
186
-
187
176
  private
188
177
 
189
- def handle_pointer
190
- @handle.read_pointer
178
+ def create_handle
179
+ ::FFI::MemoryPointer.new(:pointer) do |handle|
180
+ yield handle
181
+ @handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_BoosterFree))
182
+ end
191
183
  end
192
184
 
193
185
  def eval_counts
194
186
  out = ::FFI::MemoryPointer.new(:int)
195
- check_result FFI.LGBM_BoosterGetEvalCounts(handle_pointer, out)
187
+ safe_call FFI.LGBM_BoosterGetEvalCounts(@handle, out)
196
188
  out.read_int
197
189
  end
198
190
 
199
191
  def eval_names
200
- eval_counts ||= eval_counts()
192
+ eval_counts = self.eval_counts
201
193
  out_len = ::FFI::MemoryPointer.new(:int)
202
194
  out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
203
195
  out_strs = ::FFI::MemoryPointer.new(:pointer, eval_counts)
204
196
  buffer_len = 255
205
197
  str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
206
198
  out_strs.write_array_of_pointer(str_ptrs)
207
- check_result FFI.LGBM_BoosterGetEvalNames(handle_pointer, eval_counts, out_len, buffer_len, out_buffer_len, out_strs)
199
+ safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, buffer_len, out_buffer_len, out_strs)
208
200
 
209
201
  actual_len = out_buffer_len.read(:size_t)
210
202
  if actual_len > buffer_len
211
203
  str_ptrs = eval_counts.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
212
204
  out_strs.write_array_of_pointer(str_ptrs)
213
- check_result FFI.LGBM_BoosterGetEvalNames(handle_pointer, eval_counts, out_len, actual_len, out_buffer_len, out_strs)
205
+ safe_call FFI.LGBM_BoosterGetEvalNames(@handle, eval_counts, out_len, actual_len, out_buffer_len, out_strs)
214
206
  end
215
207
 
216
208
  str_ptrs.map(&:read_string)
217
209
  end
218
210
 
219
211
  def inner_eval(name, i)
220
- eval_names ||= eval_names()
212
+ eval_names = self.eval_names
221
213
 
222
214
  out_len = ::FFI::MemoryPointer.new(:int)
223
215
  out_results = ::FFI::MemoryPointer.new(:double, eval_names.count)
224
- check_result FFI.LGBM_BoosterGetEval(handle_pointer, i, out_len, out_results)
216
+ safe_call FFI.LGBM_BoosterGetEval(@handle, i, out_len, out_results)
225
217
  vals = out_results.read_array_of_double(out_len.read_int)
226
218
 
227
219
  eval_names.zip(vals).map do |eval_name, val|
@@ -232,15 +224,66 @@ module LightGBM
232
224
 
233
225
  def num_class
234
226
  out = ::FFI::MemoryPointer.new(:int)
235
- check_result FFI.LGBM_BoosterGetNumClasses(handle_pointer, out)
227
+ safe_call FFI.LGBM_BoosterGetNumClasses(@handle, out)
236
228
  out.read_int
237
229
  end
238
230
 
239
- # read_int64 not available on JRuby
240
- def read_int64(ptr)
241
- ptr.read_array_of_int64(1).first
231
+ def cached_feature_name
232
+ @cached_feature_name ||= feature_name
242
233
  end
243
234
 
244
- include Utils
235
+ def feature_importance_type_mapper(importance_type)
236
+ case importance_type
237
+ when "split"
238
+ FFI::C_API_FEATURE_IMPORTANCE_SPLIT
239
+ when "gain"
240
+ FFI::C_API_FEATURE_IMPORTANCE_GAIN
241
+ else
242
+ -1
243
+ end
244
+ end
245
+
246
+ def load_pandas_categorical(file_name: nil, model_str: nil)
247
+ pandas_key = "pandas_categorical:"
248
+ offset = -pandas_key.length
249
+ if !file_name.nil?
250
+ max_offset = -File.size(file_name)
251
+ lines = []
252
+ File.open(file_name, "rb") do |f|
253
+ loop do
254
+ offset = [offset, max_offset].max
255
+ f.seek(offset, IO::SEEK_END)
256
+ lines = f.readlines
257
+ if lines.length >= 2 || offset == max_offset
258
+ break
259
+ end
260
+ offset *= 2
261
+ end
262
+ end
263
+ last_line = lines[-1].strip
264
+ if !last_line.start_with?(pandas_key)
265
+ last_line = lines[-2].strip
266
+ end
267
+ elsif !model_str.nil?
268
+ idx = model_str[..offset].rindex("\n")
269
+ last_line = model_str[idx..].strip
270
+ end
271
+ if last_line.start_with?(pandas_key)
272
+ JSON.parse(last_line[pandas_key.length..])
273
+ end
274
+ end
275
+
276
+ def loaded_param
277
+ buffer_len = 1 << 20
278
+ out_len = ::FFI::MemoryPointer.new(:int64)
279
+ out_str = ::FFI::MemoryPointer.new(:char, buffer_len)
280
+ safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, buffer_len, out_len, out_str)
281
+ actual_len = out_len.read_int64
282
+ if actual_len > buffer_len
283
+ out_str = ::FFI::MemoryPointer.new(:char, actual_len)
284
+ safe_call FFI.LGBM_BoosterGetLoadedParam(@handle, actual_len, out_len, out_str)
285
+ end
286
+ JSON.parse(out_str.read_string)
287
+ end
245
288
  end
246
289
  end
data/lib/lightgbm/dataset.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  module LightGBM
2
2
  class Dataset
3
+ include Utils
4
+
3
5
  attr_reader :data, :params
4
6
 
5
- def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_names: nil)
7
+ def initialize(data, label: nil, weight: nil, group: nil, params: nil, reference: nil, used_indices: nil, categorical_feature: "auto", feature_name: nil, feature_names: nil)
6
8
  @data = data
7
9
  @label = label
8
10
  @weight = weight
@@ -11,7 +13,7 @@ module LightGBM
11
13
  @reference = reference
12
14
  @used_indices = used_indices
13
15
  @categorical_feature = categorical_feature
14
- @feature_names = feature_names
16
+ @feature_name = feature_name || feature_names || "auto"
15
17
 
16
18
  construct
17
19
  end
@@ -24,7 +26,7 @@ module LightGBM
24
26
  field("weight")
25
27
  end
26
28
 
27
- def feature_names
29
+ def feature_name
28
30
  # must preallocate space
29
31
  num_feature_names = ::FFI::MemoryPointer.new(:int)
30
32
  out_buffer_len = ::FFI::MemoryPointer.new(:size_t)
@@ -33,7 +35,7 @@ module LightGBM
33
35
  buffer_len = 255
34
36
  str_ptrs = len.times.map { ::FFI::MemoryPointer.new(:char, buffer_len) }
35
37
  out_strs.write_array_of_pointer(str_ptrs)
36
- check_result FFI.LGBM_DatasetGetFeatureNames(handle_pointer, len, num_feature_names, buffer_len, out_buffer_len, out_strs)
38
+ safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, len, num_feature_names, buffer_len, out_buffer_len, out_strs)
37
39
 
38
40
  num_features = num_feature_names.read_int
39
41
  actual_len = out_buffer_len.read(:size_t)
@@ -41,13 +43,14 @@ module LightGBM
41
43
  out_strs = ::FFI::MemoryPointer.new(:pointer, num_features) if num_features > len
42
44
  str_ptrs = num_features.times.map { ::FFI::MemoryPointer.new(:char, actual_len) }
43
45
  out_strs.write_array_of_pointer(str_ptrs)
44
- check_result FFI.LGBM_DatasetGetFeatureNames(handle_pointer, num_features, num_feature_names, actual_len, out_buffer_len, out_strs)
46
+ safe_call FFI.LGBM_DatasetGetFeatureNames(@handle, num_features, num_feature_names, actual_len, out_buffer_len, out_strs)
45
47
  end
46
48
 
47
49
  # should be the same, but get number of features
48
50
  # from most recent call (instead of num_features)
49
51
  str_ptrs[0, num_feature_names.read_int].map(&:read_string)
50
52
  end
53
+ alias_method :feature_names, :feature_name
51
54
 
52
55
  def label=(label)
53
56
  @label = label
@@ -64,12 +67,16 @@ module LightGBM
64
67
  set_field("group", group, type: :int32)
65
68
  end
66
69
 
67
- def feature_names=(feature_names)
70
+ def feature_name=(feature_names)
71
+ feature_names = feature_names.map(&:to_s)
68
72
  @feature_names = feature_names
69
73
  c_feature_names = ::FFI::MemoryPointer.new(:pointer, feature_names.size)
70
- c_feature_names.write_array_of_pointer(feature_names.map { |v| ::FFI::MemoryPointer.from_string(v) })
71
- check_result FFI.LGBM_DatasetSetFeatureNames(handle_pointer, c_feature_names, feature_names.size)
74
+ # keep reference to string pointers
75
+ str_ptrs = feature_names.map { |v| ::FFI::MemoryPointer.from_string(v) }
76
+ c_feature_names.write_array_of_pointer(str_ptrs)
77
+ safe_call FFI.LGBM_DatasetSetFeatureNames(@handle, c_feature_names, feature_names.size)
72
78
  end
79
+ alias_method :feature_names=, :feature_name=
73
80
 
74
81
  # TODO only update reference if not in chain
75
82
  def reference=(reference)
@@ -81,18 +88,18 @@ module LightGBM
81
88
 
82
89
  def num_data
83
90
  out = ::FFI::MemoryPointer.new(:int)
84
- check_result FFI.LGBM_DatasetGetNumData(handle_pointer, out)
91
+ safe_call FFI.LGBM_DatasetGetNumData(@handle, out)
85
92
  out.read_int
86
93
  end
87
94
 
88
95
  def num_feature
89
96
  out = ::FFI::MemoryPointer.new(:int)
90
- check_result FFI.LGBM_DatasetGetNumFeature(handle_pointer, out)
97
+ safe_call FFI.LGBM_DatasetGetNumFeature(@handle, out)
91
98
  out.read_int
92
99
  end
93
100
 
94
101
  def save_binary(filename)
95
- check_result FFI.LGBM_DatasetSaveBinary(handle_pointer, filename)
102
+ safe_call FFI.LGBM_DatasetSaveBinary(@handle, filename)
96
103
  end
97
104
 
98
105
  def subset(used_indices, params: nil)
@@ -105,13 +112,8 @@ module LightGBM
105
112
  )
106
113
  end
107
114
 
108
- def handle_pointer
109
- @handle.read_pointer
110
- end
111
-
112
- def self.finalize(addr)
113
- # must use proc instead of stabby lambda
114
- proc { FFI.LGBM_DatasetFree(::FFI::Pointer.new(:pointer, addr)) }
115
+ def handle
116
+ @handle
115
117
  end
116
118
 
117
119
  private
@@ -127,27 +129,45 @@ module LightGBM
127
129
  end
128
130
  set_verbosity(params)
129
131
 
130
- @handle = ::FFI::MemoryPointer.new(:pointer)
132
+ handle = ::FFI::MemoryPointer.new(:pointer)
131
133
  parameters = params_str(params)
132
- reference = @reference.handle_pointer if @reference
134
+ reference = @reference.handle if @reference
133
135
  if used_indices
134
136
  used_row_indices = ::FFI::MemoryPointer.new(:int32, used_indices.count)
135
137
  used_row_indices.write_array_of_int32(used_indices)
136
- check_result FFI.LGBM_DatasetGetSubset(reference, used_row_indices, used_indices.count, parameters, @handle)
138
+ safe_call FFI.LGBM_DatasetGetSubset(reference, used_row_indices, used_indices.count, parameters, handle)
137
139
  elsif data.is_a?(String)
138
- check_result FFI.LGBM_DatasetCreateFromFile(data, parameters, reference, @handle)
140
+ safe_call FFI.LGBM_DatasetCreateFromFile(data, parameters, reference, handle)
139
141
  else
140
142
  if matrix?(data)
141
143
  nrow = data.row_count
142
144
  ncol = data.column_count
143
145
  flat_data = data.to_a.flatten
144
146
  elsif daru?(data)
147
+ if @feature_name == "auto"
148
+ @feature_name = data.vectors.to_a
149
+ end
145
150
  nrow, ncol = data.shape
146
151
  flat_data = data.map_rows(&:to_a).flatten
147
- elsif numo?(data) || rover?(data)
148
- data = data.to_numo if rover?(data)
152
+ elsif numo?(data)
153
+ nrow, ncol = data.shape
154
+ elsif rover?(data)
155
+ if @feature_name == "auto"
156
+ @feature_name = data.keys
157
+ end
158
+ data = data.to_numo
149
159
  nrow, ncol = data.shape
160
+ elsif data.is_a?(Array) && data.first.is_a?(Hash)
161
+ keys = data.first.keys
162
+ if @feature_name == "auto"
163
+ @feature_name = keys
164
+ end
165
+ nrow = data.count
166
+ ncol = data.first.count
167
+ flat_data = data.flat_map { |v| v.fetch_values(*keys) }
150
168
  else
169
+ data = data.to_a
170
+ check_2d_array(data)
151
171
  nrow = data.count
152
172
  ncol = data.first.count
153
173
  flat_data = data.flatten
@@ -161,18 +181,22 @@ module LightGBM
161
181
  c_data.write_array_of_double(flat_data)
162
182
  end
163
183
 
164
- check_result FFI.LGBM_DatasetCreateFromMat(c_data, 1, nrow, ncol, 1, parameters, reference, @handle)
184
+ safe_call FFI.LGBM_DatasetCreateFromMat(c_data, FFI::C_API_DTYPE_FLOAT64, nrow, ncol, 1, parameters, reference, handle)
185
+ end
186
+ if used_indices
187
+ @handle = handle.read_pointer
188
+ else
189
+ @handle = ::FFI::AutoPointer.new(handle.read_pointer, FFI.method(:LGBM_DatasetFree))
165
190
  end
166
- ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i)) unless used_indices
167
191
 
168
192
  self.label = @label if @label
169
193
  self.weight = @weight if @weight
170
194
  self.group = @group if @group
171
- self.feature_names = @feature_names if @feature_names
195
+ self.feature_name = @feature_name if @feature_name && @feature_name != "auto"
172
196
  end
173
197
 
174
198
  def dump_text(filename)
175
- check_result FFI.LGBM_DatasetDumpText(handle_pointer, filename)
199
+ safe_call FFI.LGBM_DatasetDumpText(@handle, filename)
176
200
  end
177
201
 
178
202
  def field(field_name)
@@ -180,7 +204,7 @@ module LightGBM
180
204
  out_len = ::FFI::MemoryPointer.new(:int)
181
205
  out_ptr = ::FFI::MemoryPointer.new(:float, num_data)
182
206
  out_type = ::FFI::MemoryPointer.new(:int)
183
- check_result FFI.LGBM_DatasetGetField(handle_pointer, field_name, out_len, out_ptr, out_type)
207
+ safe_call FFI.LGBM_DatasetGetField(@handle, field_name, out_len, out_ptr, out_type)
184
208
  out_ptr.read_pointer.read_array_of_float(num_data)
185
209
  end
186
210
 
@@ -189,14 +213,12 @@ module LightGBM
189
213
  if type == :int32
190
214
  c_data = ::FFI::MemoryPointer.new(:int32, data.count)
191
215
  c_data.write_array_of_int32(data)
192
- check_result FFI.LGBM_DatasetSetField(handle_pointer, field_name, c_data, data.count, 2)
216
+ safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 2)
193
217
  else
194
218
  c_data = ::FFI::MemoryPointer.new(:float, data.count)
195
219
  c_data.write_array_of_float(data)
196
- check_result FFI.LGBM_DatasetSetField(handle_pointer, field_name, c_data, data.count, 0)
220
+ safe_call FFI.LGBM_DatasetSetField(@handle, field_name, c_data, data.count, 0)
197
221
  end
198
222
  end
199
-
200
- include Utils
201
223
  end
202
224
  end
data/lib/lightgbm/ffi.rb CHANGED
@@ -15,6 +15,19 @@ module LightGBM
15
15
  # https://github.com/microsoft/LightGBM/blob/master/include/LightGBM/c_api.h
16
16
  # keep same order
17
17
 
18
+ C_API_DTYPE_FLOAT32 = 0
19
+ C_API_DTYPE_FLOAT64 = 1
20
+ C_API_DTYPE_INT32 = 2
21
+ C_API_DTYPE_INT64 = 3
22
+
23
+ C_API_PREDICT_NORMAL = 0
24
+ C_API_PREDICT_RAW_SCORE = 1
25
+ C_API_PREDICT_LEAF_INDEX = 2
26
+ C_API_PREDICT_CONTRIB = 3
27
+
28
+ C_API_FEATURE_IMPORTANCE_SPLIT = 0
29
+ C_API_FEATURE_IMPORTANCE_GAIN = 1
30
+
18
31
  # error
19
32
  attach_function :LGBM_GetLastError, %i[], :string
20
33
 
@@ -36,6 +49,7 @@ module LightGBM
36
49
  attach_function :LGBM_BoosterCreate, %i[pointer string pointer], :int
37
50
  attach_function :LGBM_BoosterCreateFromModelfile, %i[string pointer pointer], :int
38
51
  attach_function :LGBM_BoosterLoadModelFromString, %i[string pointer pointer], :int
52
+ attach_function :LGBM_BoosterGetLoadedParam, %i[pointer int64 pointer pointer], :int
39
53
  attach_function :LGBM_BoosterFree, %i[pointer], :int
40
54
  attach_function :LGBM_BoosterAddValidData, %i[pointer pointer], :int
41
55
  attach_function :LGBM_BoosterGetNumClasses, %i[pointer pointer], :int
@@ -48,6 +62,7 @@ module LightGBM
48
62
  attach_function :LGBM_BoosterGetFeatureNames, %i[pointer int pointer size_t pointer pointer], :int
49
63
  attach_function :LGBM_BoosterGetNumFeature, %i[pointer pointer], :int
50
64
  attach_function :LGBM_BoosterGetEval, %i[pointer int pointer pointer], :int
65
+ attach_function :LGBM_BoosterCalcNumPredict, %i[pointer int int int int pointer], :int
51
66
  attach_function :LGBM_BoosterPredictForMat, %i[pointer pointer int int32 int32 int int int int string pointer pointer], :int
52
67
  attach_function :LGBM_BoosterSaveModel, %i[pointer int int int string], :int
53
68
  attach_function :LGBM_BoosterSaveModelToString, %i[pointer int int int int64 pointer pointer], :int
data/lib/lightgbm/inner_predictor.rb ADDED
@@ -0,0 +1,159 @@
1
+ module LightGBM
2
+ class InnerPredictor
3
+ include Utils
4
+
5
+ MAX_INT32 = (1 << 31) - 1
6
+
7
+ def initialize(booster, pred_parameter)
8
+ @handle = booster.instance_variable_get(:@handle)
9
+ @pandas_categorical = booster.instance_variable_get(:@pandas_categorical)
10
+ @pred_parameter = params_str(pred_parameter)
11
+
12
+ # keep booster for cached_feature_name
13
+ @booster = booster
14
+ end
15
+
16
+ def self.from_booster(booster, pred_parameter)
17
+ new(booster, pred_parameter)
18
+ end
19
+
20
+ def predict(data, start_iteration: 0, num_iteration: -1, raw_score: false, pred_leaf: false, pred_contrib: false)
21
+ if data.is_a?(Dataset)
22
+ raise TypeError, "Cannot use Dataset instance for prediction, please use raw data instead"
23
+ end
24
+
25
+ predict_type = FFI::C_API_PREDICT_NORMAL
26
+ if raw_score
27
+ predict_type = FFI::C_API_PREDICT_RAW_SCORE
28
+ end
29
+ if pred_leaf
30
+ predict_type = FFI::C_API_PREDICT_LEAF_INDEX
31
+ end
32
+ if pred_contrib
33
+ predict_type = FFI::C_API_PREDICT_CONTRIB
34
+ end
35
+
36
+ if daru?(data)
37
+ data = data[*cached_feature_name].map_rows(&:to_a)
38
+ singular = false
39
+ elsif data.is_a?(Hash) # sort feature.values to match the order of model.feature_name
40
+ data = [sorted_feature_values(data)]
41
+ singular = true
42
+ elsif data.is_a?(Array) && data.first.is_a?(Hash) # on multiple elems, if 1st is hash, assume they all are
43
+ data = data.map(&method(:sorted_feature_values))
44
+ singular = false
45
+ elsif rover?(data)
46
+ # TODO improve performance
47
+ data = data[cached_feature_name].to_numo.to_a
48
+ singular = false
49
+ else
50
+ data = data.to_a
51
+ singular = !data.first.is_a?(Array)
52
+ data = [data] if singular
53
+ check_2d_array(data)
54
+ data = data.map(&:dup) if @pandas_categorical&.any?
55
+ end
56
+
57
+ if @pandas_categorical&.any?
58
+ apply_pandas_categorical(
59
+ data,
60
+ @booster.params["categorical_feature"],
61
+ @pandas_categorical
62
+ )
63
+ end
64
+
65
+ preds, nrow =
66
+ pred_for_array(
67
+ data,
68
+ start_iteration,
69
+ num_iteration,
70
+ predict_type
71
+ )
72
+
73
+ if pred_leaf
74
+ preds = preds.map(&:to_i)
75
+ end
76
+
77
+ if preds.size != nrow
78
+ if preds.size % nrow == 0
79
+ preds = preds.each_slice(preds.size / nrow).to_a
80
+ else
81
+ raise Error, "Length of predict result (#{preds.size}) cannot be divide nrow (#{nrow})"
82
+ end
83
+ end
84
+
85
+ singular ? preds.first : preds
86
+ end
87
+
88
+ private
89
+
90
+ def pred_for_array(input, start_iteration, num_iteration, predict_type)
91
+ nrow = input.count
92
+ if nrow > MAX_INT32
93
+ raise Error, "Not supported"
94
+ end
95
+ inner_predict_array(
96
+ input,
97
+ start_iteration,
98
+ num_iteration,
99
+ predict_type
100
+ )
101
+ end
102
+
103
+ def inner_predict_array(input, start_iteration, num_iteration, predict_type)
104
+ n_preds =
105
+ num_preds(
106
+ start_iteration,
107
+ num_iteration,
108
+ input.count,
109
+ predict_type
110
+ )
111
+
112
+ flat_input = input.flatten
113
+ handle_missing(flat_input)
114
+ data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
115
+ data.write_array_of_double(flat_input)
116
+
117
+ out_num_preds = ::FFI::MemoryPointer.new(:int64)
118
+ out_result = ::FFI::MemoryPointer.new(:double, n_preds)
119
+ safe_call FFI.LGBM_BoosterPredictForMat(@handle, data, FFI::C_API_DTYPE_FLOAT64, input.count, input.first.count, 1, predict_type, start_iteration, num_iteration, @pred_parameter, out_num_preds, out_result)
120
+ if n_preds != out_num_preds.read_int64
121
+ raise Error, "Wrong length for predict results"
122
+ end
123
+ preds = out_result.read_array_of_double(out_num_preds.read_int64)
124
+ [preds, input.count]
125
+ end
126
+
127
+ def num_preds(start_iteration, num_iteration, nrow, predict_type)
128
+ out = ::FFI::MemoryPointer.new(:int64)
129
+ safe_call FFI.LGBM_BoosterCalcNumPredict(@handle, nrow, predict_type, start_iteration, num_iteration, out)
130
+ out.read_int64
131
+ end
132
+
133
+ def sorted_feature_values(input_hash)
134
+ input_hash.transform_keys(&:to_s).fetch_values(*cached_feature_name)
135
+ end
136
+
137
+ def cached_feature_name
138
+ @booster.send(:cached_feature_name)
139
+ end
140
+
141
+ def apply_pandas_categorical(data, categorical_feature, pandas_categorical)
142
+ (categorical_feature || []).each_with_index do |cf, i|
143
+ cat_codes = pandas_categorical[i].map.with_index.to_h
144
+ data.each do |r|
145
+ cat = r[cf]
146
+ unless cat.nil?
147
+ r[cf] =
148
+ cat_codes.fetch(cat) do
149
+ unless cat.is_a?(String)
150
+ raise ArgumentError, "expected categorical value"
151
+ end
152
+ nil
153
+ end
154
+ end
155
+ end
156
+ end
157
+ end
158
+ end
159
+ end
@@ -16,7 +16,7 @@ module LightGBM
16
16
  end
17
17
 
18
18
  def load_model(fname)
19
- @booster = Booster.new(params: @params, model_file: fname)
19
+ @booster = Booster.new(model_file: fname)
20
20
  end
21
21
 
22
22
  def best_iteration
@@ -2,8 +2,8 @@ module LightGBM
2
2
  module Utils
3
3
  private
4
4
 
5
- def check_result(err)
6
- raise LightGBM::Error, FFI.LGBM_GetLastError if err != 0
5
+ def safe_call(err)
6
+ raise Error, FFI.LGBM_GetLastError if err != 0
7
7
  end
8
8
 
9
9
  # remove spaces in keys and values to prevent injection
@@ -24,6 +24,13 @@ module LightGBM
24
24
  end
25
25
  end
26
26
 
27
+ def check_2d_array(data)
28
+ ncol = data.first&.size || 0
29
+ if !data.all? { |r| r.size == ncol }
30
+ raise ArgumentError, "Rows have different sizes"
31
+ end
32
+ end
33
+
27
34
  # for categorical, NaN and negative value are the same
28
35
  def handle_missing(data)
29
36
  data.map! { |v| v.nil? ? Float::NAN : v }
@@ -1,3 +1,3 @@
1
1
  module LightGBM
2
- VERSION = "0.3.4"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/lightgbm.rb CHANGED
@@ -1,10 +1,14 @@
1
1
  # dependencies
2
2
  require "ffi"
3
3
 
4
+ # stdlib
5
+ require "json"
6
+
4
7
  # modules
5
8
  require_relative "lightgbm/utils"
6
9
  require_relative "lightgbm/booster"
7
10
  require_relative "lightgbm/dataset"
11
+ require_relative "lightgbm/inner_predictor"
8
12
  require_relative "lightgbm/version"
9
13
 
10
14
  # scikit-learn API
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lightgbm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-07-28 00:00:00.000000000 Z
10
+ date: 2025-01-05 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: ffi
@@ -24,7 +23,6 @@ dependencies:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
25
  version: '0'
27
- description:
28
26
  email: andrew@ankane.org
29
27
  executables: []
30
28
  extensions: []
@@ -38,6 +36,7 @@ files:
38
36
  - lib/lightgbm/classifier.rb
39
37
  - lib/lightgbm/dataset.rb
40
38
  - lib/lightgbm/ffi.rb
39
+ - lib/lightgbm/inner_predictor.rb
41
40
  - lib/lightgbm/model.rb
42
41
  - lib/lightgbm/ranker.rb
43
42
  - lib/lightgbm/regressor.rb
@@ -53,7 +52,6 @@ homepage: https://github.com/ankane/lightgbm-ruby
53
52
  licenses:
54
53
  - MIT
55
54
  metadata: {}
56
- post_install_message:
57
55
  rdoc_options: []
58
56
  require_paths:
59
57
  - lib
@@ -61,15 +59,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
61
59
  requirements:
62
60
  - - ">="
63
61
  - !ruby/object:Gem::Version
64
- version: '3'
62
+ version: '3.1'
65
63
  required_rubygems_version: !ruby/object:Gem::Requirement
66
64
  requirements:
67
65
  - - ">="
68
66
  - !ruby/object:Gem::Version
69
67
  version: '0'
70
68
  requirements: []
71
- rubygems_version: 3.5.11
72
- signing_key:
69
+ rubygems_version: 3.6.2
73
70
  specification_version: 4
74
71
  summary: High performance gradient boosting for Ruby
75
72
  test_files: []