svmkit 0.6.1 → 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/HISTORY.md +4 -1
- data/lib/svmkit/linear_model/sgd_linear_estimator.rb +1 -1
- data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +1 -1
- data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +1 -1
- data/lib/svmkit/tree/decision_tree_classifier.rb +31 -34
- data/lib/svmkit/tree/decision_tree_regressor.rb +20 -21
- data/lib/svmkit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9655d7d990f37468c79de9713e55f74c6134b00c1cda9471832097c678cb6ded
|
4
|
+
data.tar.gz: fb294e6256d16272e80c2ade2f5b223dd792b14c7fd39caf1de8a9500a3ab55e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7386071fe57df51bd8223d4945dc67069769464892bd64b972b5c1ba26cd206b7b67d50e600f34d79a3bff9f19803c0fdae06dd92fdf8f6ef87f1d5e982cf2d
|
7
|
+
data.tar.gz: 473a1233e0109672b80b8bf17933366276b0a81fab3c75699fb1a07f92923d66cc3f064e0a2df36e493bf1554aa4c34d567d60607d1802e10ab949eafe1187d3
|
data/.rubocop.yml
CHANGED
data/HISTORY.md
CHANGED
@@ -1,7 +1,10 @@
|
|
1
|
+
# 0.6.2
|
2
|
+
- Refactor decision tree classes to improve performance.
|
3
|
+
|
1
4
|
# 0.6.1
|
2
5
|
- Add abstract class for linear estimator with stochastic gradient descent.
|
3
6
|
- Refactor linear estimators to use linear estimator abstract class.
|
4
|
-
- Refactor
|
7
|
+
- Refactor decision tree classes to avoid unneeded type conversion.
|
5
8
|
|
6
9
|
# 0.6.0
|
7
10
|
- Add class for Principal Component Analysis.
|
@@ -91,8 +91,9 @@ module SVMKit
|
|
91
91
|
n_samples, n_features = x.shape
|
92
92
|
@params[:max_features] = n_features if @params[:max_features].nil?
|
93
93
|
@params[:max_features] = [@params[:max_features], n_features].min
|
94
|
-
|
95
|
-
|
94
|
+
uniq_y = y.to_a.uniq.sort
|
95
|
+
@classes = Numo::Int32.asarray(uniq_y)
|
96
|
+
build_tree(x, y.map { |v| uniq_y.index(v) })
|
96
97
|
eval_importance(n_samples, n_features)
|
97
98
|
self
|
98
99
|
end
|
@@ -174,36 +175,35 @@ module SVMKit
|
|
174
175
|
def build_tree(x, y)
|
175
176
|
@n_leaves = 0
|
176
177
|
@leaf_labels = []
|
177
|
-
@tree = grow_node(0, x, y)
|
178
|
+
@tree = grow_node(0, x, y, impurity(y))
|
178
179
|
@leaf_labels = Numo::Int32[*@leaf_labels]
|
179
180
|
nil
|
180
181
|
end
|
181
182
|
|
182
|
-
def grow_node(depth, x, y)
|
183
|
-
|
183
|
+
def grow_node(depth, x, y, whole_impurity)
|
184
|
+
unless @params[:max_leaf_nodes].nil?
|
184
185
|
return nil if @n_leaves >= @params[:max_leaf_nodes]
|
185
186
|
end
|
186
187
|
|
187
188
|
n_samples, n_features = x.shape
|
188
|
-
if @params[:min_samples_leaf]
|
189
|
-
return nil if n_samples <= @params[:min_samples_leaf]
|
190
|
-
end
|
189
|
+
return nil if n_samples <= @params[:min_samples_leaf]
|
191
190
|
|
192
|
-
node = Node.new(depth: depth, impurity:
|
191
|
+
node = Node.new(depth: depth, impurity: whole_impurity, n_samples: n_samples)
|
193
192
|
|
194
193
|
return put_leaf(node, y) if y.to_a.uniq.size == 1
|
195
194
|
|
196
|
-
|
195
|
+
unless @params[:max_depth].nil?
|
197
196
|
return put_leaf(node, y) if depth == @params[:max_depth]
|
198
197
|
end
|
199
198
|
|
200
|
-
feature_id, threshold, left_ids, right_ids,
|
201
|
-
rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
|
202
|
-
|
203
|
-
return put_leaf(node, y) if
|
199
|
+
feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
|
200
|
+
rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
|
201
|
+
|
202
|
+
return put_leaf(node, y) if gain.nil? || gain.zero?
|
203
|
+
|
204
|
+
node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids], left_impurity)
|
205
|
+
node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids], right_impurity)
|
204
206
|
|
205
|
-
node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids])
|
206
|
-
node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids])
|
207
207
|
return put_leaf(node, y) if node.left.nil? && node.right.nil?
|
208
208
|
|
209
209
|
node.feature_id = feature_id
|
@@ -213,7 +213,7 @@ module SVMKit
|
|
213
213
|
end
|
214
214
|
|
215
215
|
def put_leaf(node, y)
|
216
|
-
node.probs =
|
216
|
+
node.probs = y.bincount(minlength: @classes.size) / node.n_samples.to_f
|
217
217
|
node.leaf = true
|
218
218
|
node.leaf_id = @n_leaves
|
219
219
|
@n_leaves += 1
|
@@ -225,27 +225,23 @@ module SVMKit
|
|
225
225
|
[*0...n].sample(@params[:max_features], random: @rng)
|
226
226
|
end
|
227
227
|
|
228
|
-
def best_split(features, labels)
|
228
|
+
def best_split(features, labels, whole_impurity)
|
229
|
+
n_samples = labels.size
|
229
230
|
features.to_a.uniq.sort.each_cons(2).map do |l, r|
|
230
231
|
threshold = 0.5 * (l + r)
|
231
|
-
left_ids
|
232
|
-
|
232
|
+
left_ids = features.le(threshold).where
|
233
|
+
right_ids = features.gt(threshold).where
|
234
|
+
left_impurity = impurity(labels[left_ids])
|
235
|
+
right_impurity = impurity(labels[right_ids])
|
236
|
+
gain = whole_impurity -
|
237
|
+
left_impurity * left_ids.size.fdiv(n_samples) -
|
238
|
+
right_impurity * right_ids.size.fdiv(n_samples)
|
239
|
+
[threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
|
233
240
|
end.max_by(&:last)
|
234
241
|
end
|
235
242
|
|
236
|
-
def splited_ids(features, threshold)
|
237
|
-
[features.le(threshold).where, features.gt(threshold).where]
|
238
|
-
end
|
239
|
-
|
240
|
-
def gain(labels, labels_left, labels_right)
|
241
|
-
prob_left = labels_left.size.fdiv(labels.size)
|
242
|
-
prob_right = labels_right.size.fdiv(labels.size)
|
243
|
-
impurity(labels) - prob_left * impurity(labels_left) - prob_right * impurity(labels_right)
|
244
|
-
end
|
245
|
-
|
246
243
|
def impurity(labels)
|
247
|
-
|
248
|
-
cls.size == 1 ? 0.0 : send(@criterion, Numo::DFloat[*(cls.map { |c| labels.eq(c).count_true.fdiv(labels.size) })])
|
244
|
+
send(@criterion, labels.bincount / labels.size.to_f)
|
249
245
|
end
|
250
246
|
|
251
247
|
def gini(posterior_probs)
|
@@ -253,7 +249,7 @@ module SVMKit
|
|
253
249
|
end
|
254
250
|
|
255
251
|
def entropy(posterior_probs)
|
256
|
-
-(posterior_probs * Numo::NMath.log(posterior_probs)).sum
|
252
|
+
-(posterior_probs * Numo::NMath.log(posterior_probs + 1)).sum
|
257
253
|
end
|
258
254
|
|
259
255
|
def eval_importance(n_samples, n_features)
|
@@ -269,7 +265,8 @@ module SVMKit
|
|
269
265
|
return nil if node.leaf
|
270
266
|
return nil if node.left.nil? || node.right.nil?
|
271
267
|
gain = node.n_samples * node.impurity -
|
272
|
-
node.left.n_samples * node.left.impurity -
|
268
|
+
node.left.n_samples * node.left.impurity -
|
269
|
+
node.right.n_samples * node.right.impurity
|
273
270
|
@feature_importances[node.feature_id] += gain
|
274
271
|
eval_importance_at_node(node.left)
|
275
272
|
eval_importance_at_node(node.right)
|
@@ -151,12 +151,12 @@ module SVMKit
|
|
151
151
|
def build_tree(x, y)
|
152
152
|
@n_leaves = 0
|
153
153
|
@leaf_values = []
|
154
|
-
@tree = grow_node(0, x, y)
|
154
|
+
@tree = grow_node(0, x, y, impurity(y))
|
155
155
|
@leaf_values = Numo::DFloat.cast(@leaf_values)
|
156
156
|
nil
|
157
157
|
end
|
158
158
|
|
159
|
-
def grow_node(depth, x, y)
|
159
|
+
def grow_node(depth, x, y, whole_impurity)
|
160
160
|
unless @params[:max_leaf_nodes].nil?
|
161
161
|
return nil if @n_leaves >= @params[:max_leaf_nodes]
|
162
162
|
end
|
@@ -164,7 +164,7 @@ module SVMKit
|
|
164
164
|
n_samples, n_features = x.shape
|
165
165
|
return nil if n_samples <= @params[:min_samples_leaf]
|
166
166
|
|
167
|
-
node = Node.new(depth: depth, impurity:
|
167
|
+
node = Node.new(depth: depth, impurity: whole_impurity, n_samples: n_samples)
|
168
168
|
|
169
169
|
return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
|
170
170
|
|
@@ -172,12 +172,14 @@ module SVMKit
|
|
172
172
|
return put_leaf(node, y) if depth == @params[:max_depth]
|
173
173
|
end
|
174
174
|
|
175
|
-
feature_id, threshold, left_ids, right_ids,
|
176
|
-
rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
|
177
|
-
|
175
|
+
feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
|
176
|
+
rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
|
177
|
+
|
178
|
+
return put_leaf(node, y) if gain.nil? || gain.zero?
|
179
|
+
|
180
|
+
node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_impurity)
|
181
|
+
node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_impurity)
|
178
182
|
|
179
|
-
node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true])
|
180
|
-
node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true])
|
181
183
|
return put_leaf(node, y) if node.left.nil? && node.right.nil?
|
182
184
|
|
183
185
|
node.feature_id = feature_id
|
@@ -199,24 +201,21 @@ module SVMKit
|
|
199
201
|
[*0...n].sample(@params[:max_features], random: @rng)
|
200
202
|
end
|
201
203
|
|
202
|
-
def best_split(features, values)
|
204
|
+
def best_split(features, values, whole_impurity)
|
205
|
+
n_samples = values.shape[0]
|
203
206
|
features.to_a.uniq.sort.each_cons(2).map do |l, r|
|
204
207
|
threshold = 0.5 * (l + r)
|
205
|
-
left_ids
|
206
|
-
|
208
|
+
left_ids = features.le(threshold).where
|
209
|
+
right_ids = features.gt(threshold).where
|
210
|
+
left_impurity = impurity(values[left_ids, true])
|
211
|
+
right_impurity = impurity(values[right_ids, true])
|
212
|
+
gain = whole_impurity -
|
213
|
+
left_impurity * left_ids.size.fdiv(n_samples) -
|
214
|
+
right_impurity * right_ids.size.fdiv(n_samples)
|
215
|
+
[threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
|
207
216
|
end.max_by(&:last)
|
208
217
|
end
|
209
218
|
|
210
|
-
def splited_ids(features, threshold)
|
211
|
-
[features.le(threshold).where, features.gt(threshold).where]
|
212
|
-
end
|
213
|
-
|
214
|
-
def gain(values, values_left, values_right)
|
215
|
-
prob_left = values_left.shape[0].fdiv(values.shape[0])
|
216
|
-
prob_right = values_right.shape[0].fdiv(values.shape[0])
|
217
|
-
impurity(values) - prob_left * impurity(values_left) - prob_right * impurity(values_right)
|
218
|
-
end
|
219
|
-
|
220
219
|
def impurity(values)
|
221
220
|
send(@criterion, values)
|
222
221
|
end
|
data/lib/svmkit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: svmkit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-09-
|
11
|
+
date: 2018-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|