svmkit 0.7.3 → 0.8.1

Files changed (78)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -9
  3. data/.rspec +1 -0
  4. data/.travis.yml +4 -12
  5. data/LICENSE.txt +1 -1
  6. data/README.md +11 -13
  7. data/lib/svmkit.rb +3 -66
  8. data/svmkit.gemspec +12 -7
  9. metadata +16 -81
  10. data/.coveralls.yml +0 -1
  11. data/.rubocop.yml +0 -47
  12. data/.rubocop_todo.yml +0 -58
  13. data/HISTORY.md +0 -168
  14. data/lib/svmkit/base/base_estimator.rb +0 -13
  15. data/lib/svmkit/base/classifier.rb +0 -34
  16. data/lib/svmkit/base/cluster_analyzer.rb +0 -29
  17. data/lib/svmkit/base/evaluator.rb +0 -13
  18. data/lib/svmkit/base/regressor.rb +0 -34
  19. data/lib/svmkit/base/splitter.rb +0 -17
  20. data/lib/svmkit/base/transformer.rb +0 -18
  21. data/lib/svmkit/clustering/dbscan.rb +0 -127
  22. data/lib/svmkit/clustering/k_means.rb +0 -140
  23. data/lib/svmkit/dataset.rb +0 -109
  24. data/lib/svmkit/decomposition/nmf.rb +0 -147
  25. data/lib/svmkit/decomposition/pca.rb +0 -150
  26. data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
  27. data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
  28. data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
  29. data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
  30. data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
  31. data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
  32. data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
  33. data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
  34. data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
  35. data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
  36. data/lib/svmkit/evaluation_measure/precision.rb +0 -51
  37. data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
  38. data/lib/svmkit/evaluation_measure/purity.rb +0 -41
  39. data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
  40. data/lib/svmkit/evaluation_measure/recall.rb +0 -51
  41. data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
  42. data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
  43. data/lib/svmkit/linear_model/lasso.rb +0 -138
  44. data/lib/svmkit/linear_model/linear_regression.rb +0 -112
  45. data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
  46. data/lib/svmkit/linear_model/ridge.rb +0 -112
  47. data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
  48. data/lib/svmkit/linear_model/svc.rb +0 -184
  49. data/lib/svmkit/linear_model/svr.rb +0 -123
  50. data/lib/svmkit/model_selection/cross_validation.rb +0 -121
  51. data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
  52. data/lib/svmkit/model_selection/k_fold.rb +0 -77
  53. data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
  54. data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
  55. data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
  56. data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
  57. data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
  58. data/lib/svmkit/optimizer/nadam.rb +0 -90
  59. data/lib/svmkit/optimizer/rmsprop.rb +0 -69
  60. data/lib/svmkit/optimizer/sgd.rb +0 -65
  61. data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
  62. data/lib/svmkit/pairwise_metric.rb +0 -91
  63. data/lib/svmkit/pipeline/pipeline.rb +0 -197
  64. data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
  65. data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
  66. data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
  67. data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
  68. data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
  69. data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
  70. data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
  71. data/lib/svmkit/probabilistic_output.rb +0 -112
  72. data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
  73. data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
  74. data/lib/svmkit/tree/node.rb +0 -70
  75. data/lib/svmkit/utils.rb +0 -22
  76. data/lib/svmkit/validation.rb +0 -79
  77. data/lib/svmkit/values.rb +0 -13
  78. data/lib/svmkit/version.rb +0 -7
data/lib/svmkit/tree/decision_tree_regressor.rb
@@ -1,251 +0,0 @@
- # frozen_string_literal: true
-
- require 'svmkit/validation'
- require 'svmkit/base/base_estimator'
- require 'svmkit/base/regressor'
- require 'svmkit/tree/node'
-
- module SVMKit
-   module Tree
-     # DecisionTreeRegressor is a class that implements decision tree for regression.
-     #
-     # @example
-     #   estimator =
-     #     SVMKit::Tree::DecisionTreeRegressor.new(
-     #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
-     #   estimator.fit(training_samples, training_values)
-     #   results = estimator.predict(testing_samples)
-     #
-     class DecisionTreeRegressor
-       include Base::BaseEstimator
-       include Base::Regressor
-       include Validation
-
-       # Return the importance for each feature.
-       # @return [Numo::DFloat] (size: n_features)
-       attr_reader :feature_importances
-
-       # Return the learned tree.
-       # @return [Node]
-       attr_reader :tree
-
-       # Return the random generator for random selection of feature index.
-       # @return [Random]
-       attr_reader :rng
-
-       # Return the values assigned to each leaf.
-       # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
-       attr_reader :leaf_values
-
-       # Create a new regressor with decision tree algorithm.
-       #
-       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
-       # @param max_depth [Integer] The maximum depth of the tree.
-       #   If nil is given, decision tree grows without concern for depth.
-       # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
-       #   If nil is given, number of leaves is not limited.
-       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
-       # @param max_features [Integer] The number of features to consider when searching optimal split point.
-       #   If nil is given, split process considers all features.
-       # @param random_seed [Integer] The seed value used to initialize the random generator.
-       #   It is used to randomly determine the order of features when deciding the splitting point.
-       def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
-                      random_seed: nil)
-         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                           max_features: max_features, random_seed: random_seed)
-         check_params_integer(min_samples_leaf: min_samples_leaf)
-         check_params_string(criterion: criterion)
-         check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                               min_samples_leaf: min_samples_leaf, max_features: max_features)
-         @params = {}
-         @params[:criterion] = criterion
-         @params[:max_depth] = max_depth
-         @params[:max_leaf_nodes] = max_leaf_nodes
-         @params[:min_samples_leaf] = min_samples_leaf
-         @params[:max_features] = max_features
-         @params[:random_seed] = random_seed
-         @params[:random_seed] ||= srand
-         @criterion = :mse
-         @criterion = :mae if @params[:criterion] == 'mae'
-         @tree = nil
-         @feature_importances = nil
-         @n_leaves = nil
-         @leaf_values = nil
-         @rng = Random.new(@params[:random_seed])
-       end
-
-       # Fit the model with given training data.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
-       # @return [DecisionTreeRegressor] The learned regressor itself.
-       def fit(x, y)
-         check_sample_array(x)
-         check_tvalue_array(y)
-         check_sample_tvalue_size(x, y)
-         single_target = y.shape[1].nil?
-         y = y.expand_dims(1) if single_target
-         n_samples, n_features = x.shape
-         @params[:max_features] = n_features if @params[:max_features].nil?
-         @params[:max_features] = [@params[:max_features], n_features].min
-         build_tree(x, y)
-         @leaf_values = @leaf_values[true] if single_target
-         eval_importance(n_samples, n_features)
-         self
-       end
-
-       # Predict values for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
-       def predict(x)
-         check_sample_array(x)
-         @leaf_values.shape[1].nil? ? @leaf_values[apply(x)] : @leaf_values[apply(x), true]
-       end
-
-       # Return the index of the leaf that each sample reached.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-       # @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
-       def apply(x)
-         check_sample_array(x)
-         Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about DecisionTreeRegressor
-       def marshal_dump
-         { params: @params,
-           criterion: @criterion,
-           tree: @tree,
-           feature_importances: @feature_importances,
-           leaf_values: @leaf_values,
-           rng: @rng }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @params = obj[:params]
-         @criterion = obj[:criterion]
-         @tree = obj[:tree]
-         @feature_importances = obj[:feature_importances]
-         @leaf_values = obj[:leaf_values]
-         @rng = obj[:rng]
-         nil
-       end
-
-       private
-
-       def apply_at_node(node, sample)
-         return node.leaf_id if node.leaf
-         return apply_at_node(node.left, sample) if node.right.nil?
-         return apply_at_node(node.right, sample) if node.left.nil?
-         if sample[node.feature_id] <= node.threshold
-           apply_at_node(node.left, sample)
-         else
-           apply_at_node(node.right, sample)
-         end
-       end
-
-       def build_tree(x, y)
-         @n_leaves = 0
-         @leaf_values = []
-         @tree = grow_node(0, x, y, impurity(y))
-         @leaf_values = Numo::DFloat.cast(@leaf_values)
-         nil
-       end
-
-       def grow_node(depth, x, y, whole_impurity)
-         unless @params[:max_leaf_nodes].nil?
-           return nil if @n_leaves >= @params[:max_leaf_nodes]
-         end
-
-         n_samples, n_features = x.shape
-         return nil if n_samples <= @params[:min_samples_leaf]
-
-         node = Node.new(depth: depth, impurity: whole_impurity, n_samples: n_samples)
-
-         return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
-
-         unless @params[:max_depth].nil?
-           return put_leaf(node, y) if depth == @params[:max_depth]
-         end
-
-         feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
-           rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
-
-         return put_leaf(node, y) if gain.nil? || gain.zero?
-
-         node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_impurity)
-         node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_impurity)
-
-         return put_leaf(node, y) if node.left.nil? && node.right.nil?
-
-         node.feature_id = feature_id
-         node.threshold = threshold
-         node.leaf = false
-         node
-       end
-
-       def put_leaf(node, values)
-         node.probs = nil
-         node.leaf = true
-         node.leaf_id = @n_leaves
-         @n_leaves += 1
-         @leaf_values.push(values.mean(0))
-         node
-       end
-
-       def rand_ids(n)
-         [*0...n].sample(@params[:max_features], random: @rng)
-       end
-
-       def best_split(features, values, whole_impurity)
-         n_samples = values.shape[0]
-         features.to_a.uniq.sort.each_cons(2).map do |l, r|
-           threshold = 0.5 * (l + r)
-           left_ids = features.le(threshold).where
-           right_ids = features.gt(threshold).where
-           left_impurity = impurity(values[left_ids, true])
-           right_impurity = impurity(values[right_ids, true])
-           gain = whole_impurity -
-                  left_impurity * left_ids.size.fdiv(n_samples) -
-                  right_impurity * right_ids.size.fdiv(n_samples)
-           [threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
-         end.max_by(&:last)
-       end
-
-       def impurity(values)
-         send(@criterion, values)
-       end
-
-       def mse(values)
-         ((values - values.mean(0))**2).mean
-       end
-
-       def mae(values)
-         (values - values.mean(0)).abs.mean
-       end
-
-       def eval_importance(n_samples, n_features)
-         @feature_importances = Numo::DFloat.zeros(n_features)
-         eval_importance_at_node(@tree)
-         @feature_importances /= n_samples
-         normalizer = @feature_importances.sum
-         @feature_importances /= normalizer if normalizer > 0.0
-         nil
-       end
-
-       def eval_importance_at_node(node)
-         return nil if node.leaf
-         return nil if node.left.nil? || node.right.nil?
-         gain = node.n_samples * node.impurity -
-                node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
-         @feature_importances[node.feature_id] += gain
-         eval_importance_at_node(node.left)
-         eval_importance_at_node(node.right)
-       end
-     end
-   end
- end
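
For reference, a minimal usage sketch of the removed regressor, mirroring the @example in its class documentation. It assumes svmkit <= 0.7.x and numo-narray are installed; the synthetic data and the call to score (provided by the included Base::Regressor mixin) are illustrative additions, not taken from the diff.

require 'numo/narray'
require 'svmkit'

# Toy single-output regression data: y is roughly 2 * x plus a little noise.
x = Numo::DFloat.new(100, 1).rand
y = 2.0 * x[true, 0] + 0.05 * Numo::DFloat.new(100).rand

estimator = SVMKit::Tree::DecisionTreeRegressor.new(
  max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
estimator.fit(x, y)

predicted = estimator.predict(x)                 # Numo::DFloat of shape [100]
puts estimator.feature_importances.to_a.inspect  # single feature, importance 1.0
puts estimator.score(x, y)                       # R^2 score; assumed to come from Base::Regressor
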
data/lib/svmkit/tree/node.rb
@@ -1,70 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   module Tree
-     # Node is a class that implements node used for construction of decision tree.
-     # This class is used for internal data structures.
-     class Node
-       # @!visibility private
-       attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
-
-       # Create a new node for decision tree.
-       #
-       # @param depth [Integer] The depth of the node in tree.
-       # @param impurity [Float] The impurity of the node.
-       # @param n_samples [Integer] The number of the samples in the node.
-       # @param probs [Float] The probability of the node.
-       # @param leaf [Boolean] The flag indicating whether the node is a leaf.
-       # @param leaf_id [Integer] The leaf index of the node.
-       # @param left [Node] The left node.
-       # @param right [Node] The right node.
-       # @param feature_id [Integer] The feature index used for evaluation.
-       # @param threshold [Float] The threshold value of the feature for splitting the node.
-       def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
-                      leaf: true, leaf_id: 0,
-                      left: nil, right: nil, feature_id: 0, threshold: 0.0)
-         @depth = depth
-         @impurity = impurity
-         @n_samples = n_samples
-         @probs = probs
-         @leaf = leaf
-         @leaf_id = leaf_id
-         @left = left
-         @right = right
-         @feature_id = feature_id
-         @threshold = threshold
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about Node
-       def marshal_dump
-         { depth: @depth,
-           impurity: @impurity,
-           n_samples: @n_samples,
-           probs: @probs,
-           leaf: @leaf,
-           leaf_id: @leaf_id,
-           left: @left,
-           right: @right,
-           feature_id: @feature_id,
-           threshold: @threshold }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @depth = obj[:depth]
-         @impurity = obj[:impurity]
-         @n_samples = obj[:n_samples]
-         @probs = obj[:probs]
-         @leaf = obj[:leaf]
-         @leaf_id = obj[:leaf_id]
-         @left = obj[:left]
-         @right = obj[:right]
-         @feature_id = obj[:feature_id]
-         @threshold = obj[:threshold]
-         nil
-       end
-     end
-   end
- end
data/lib/svmkit/utils.rb
@@ -1,22 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Utils
-     module_function
-
-     # @!visibility private
-     def choice_ids(size, probs, rng = nil)
-       rng ||= Random.new
-       Array.new(size) do
-         target = rng.rand
-         chosen = 0
-         probs.each_with_index do |p, idx|
-           break (chosen = idx) if target <= p
-           target -= p
-         end
-         chosen
-       end
-     end
-   end
- end
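
Utils.choice_ids draws `size` indices with probability proportional to the entries of `probs` (expected to sum to 1), by walking the cumulative mass until the random draw is exhausted. The module is an internal helper (marked @!visibility private), so the following is only a behavioral sketch, assuming svmkit <= 0.7.x is installed; the concrete numbers are illustrative.

require 'svmkit'

rng = Random.new(1)
probs = [0.1, 0.2, 0.7]   # weights over indices 0, 1, 2; should sum to 1.0
ids = SVMKit::Utils.choice_ids(1_000, probs, rng)

# Index 2 should be drawn roughly 70% of the time, index 1 about 20%, index 0 about 10%.
counts = ids.tally.sort.to_h   # Array#tally needs Ruby 2.7+
puts counts.inspect
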
data/lib/svmkit/validation.rb
@@ -1,79 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Validation
-     module_function
-
-     # @!visibility private
-     def check_sample_array(x)
-       raise TypeError, 'Expect class of sample matrix to be Numo::DFloat' unless x.is_a?(Numo::DFloat)
-       raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
-       nil
-     end
-
-     # @!visibility private
-     def check_label_array(y)
-       raise TypeError, 'Expect class of label vector to be Numo::Int32' unless y.is_a?(Numo::Int32)
-       raise ArgumentError, 'Expect label vector to be 1-D array' unless y.shape.size == 1
-       nil
-     end
-
-     # @!visibility private
-     def check_tvalue_array(y)
-       raise TypeError, 'Expect class of target value vector to be Numo::DFloat' unless y.is_a?(Numo::DFloat)
-       nil
-     end
-
-     # @!visibility private
-     def check_sample_label_size(x, y)
-       raise ArgumentError, 'Expect to have the same number of samples for sample matrix and label vector' unless x.shape[0] == y.shape[0]
-       nil
-     end
-
-     # @!visibility private
-     def check_sample_tvalue_size(x, y)
-       raise ArgumentError, 'Expect to have the same number of samples for sample matrix and target value vector' unless x.shape[0] == y.shape[0]
-       nil
-     end
-
-     # @!visibility private
-     def check_params_type(type, params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type}" unless v.is_a?(type) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_type_or_nil(type, params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type} or nil" unless v.is_a?(type) || v.is_a?(NilClass) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_float(params = {})
-       check_params_type(Float, params)
-     end
-
-     # @!visibility private
-     def check_params_integer(params = {})
-       check_params_type(Integer, params)
-     end
-
-     # @!visibility private
-     def check_params_string(params = {})
-       check_params_type(String, params)
-     end
-
-     # @!visibility private
-     def check_params_boolean(params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be Boolean" unless v.is_a?(FalseClass) || v.is_a?(TrueClass) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_positive(params = {})
-       params.reject { |_, v| v.nil? }.each { |k, v| raise ArgumentError, "Expect #{k} to be positive value" if v < 0 }
-       nil
-     end
-   end
- end
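
These checkers raise TypeError or ArgumentError on bad input and return nil otherwise; this is how the estimators in this gem guard their fit/predict arguments. A minimal sketch of the behavior, assuming svmkit <= 0.7.x and numo-narray are available; the sample data is illustrative.

require 'numo/narray'
require 'svmkit'

x = Numo::DFloat.new(5, 2).rand
y = Numo::DFloat.new(5).rand

SVMKit::Validation.check_sample_array(x)          # returns nil: x is a 2-D Numo::DFloat
SVMKit::Validation.check_tvalue_array(y)          # returns nil: y is a Numo::DFloat
SVMKit::Validation.check_sample_tvalue_size(x, y) # returns nil: both carry 5 samples

begin
  SVMKit::Validation.check_sample_array(Numo::Int32.new(5, 2).seq)
rescue TypeError => e
  puts e.message   # "Expect class of sample matrix to be Numo::DFloat"
end
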
data/lib/svmkit/values.rb
@@ -1,13 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Values
-     module_function
-
-     # @!visibility private
-     def int_max
-       @int_max ||= 2**([42].pack('i').size * 16 - 2) - 1
-     end
-   end
- end
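
Values.int_max memoizes the largest integer that MRI stores as an immediate Fixnum on typical 64-bit builds, presumably used as an upper bound when the gem derives random seeds (an assumption; its call sites are not shown in this diff). Worked out under the assumption that a native C int is 4 bytes:

require 'svmkit'

[42].pack('i').size      #=> 4: byte width of a native C int on common platforms
4 * 16 - 2               #=> 62
SVMKit::Values.int_max   #=> 4611686018427387903, i.e. 2**62 - 1
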
data/lib/svmkit/version.rb
@@ -1,7 +0,0 @@
- # frozen_string_literal: true
-
- # SVMKit is a machine learning library in Ruby.
- module SVMKit
-   # @!visibility private
-   VERSION = '0.7.3'.freeze
- end