svmkit 0.7.3 → 0.8.1

This diff shows the changes between package versions as published to their public registries and is provided for informational purposes only.
Files changed (78)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -9
  3. data/.rspec +1 -0
  4. data/.travis.yml +4 -12
  5. data/LICENSE.txt +1 -1
  6. data/README.md +11 -13
  7. data/lib/svmkit.rb +3 -66
  8. data/svmkit.gemspec +12 -7
  9. metadata +16 -81
  10. data/.coveralls.yml +0 -1
  11. data/.rubocop.yml +0 -47
  12. data/.rubocop_todo.yml +0 -58
  13. data/HISTORY.md +0 -168
  14. data/lib/svmkit/base/base_estimator.rb +0 -13
  15. data/lib/svmkit/base/classifier.rb +0 -34
  16. data/lib/svmkit/base/cluster_analyzer.rb +0 -29
  17. data/lib/svmkit/base/evaluator.rb +0 -13
  18. data/lib/svmkit/base/regressor.rb +0 -34
  19. data/lib/svmkit/base/splitter.rb +0 -17
  20. data/lib/svmkit/base/transformer.rb +0 -18
  21. data/lib/svmkit/clustering/dbscan.rb +0 -127
  22. data/lib/svmkit/clustering/k_means.rb +0 -140
  23. data/lib/svmkit/dataset.rb +0 -109
  24. data/lib/svmkit/decomposition/nmf.rb +0 -147
  25. data/lib/svmkit/decomposition/pca.rb +0 -150
  26. data/lib/svmkit/ensemble/ada_boost_classifier.rb +0 -198
  27. data/lib/svmkit/ensemble/ada_boost_regressor.rb +0 -180
  28. data/lib/svmkit/ensemble/random_forest_classifier.rb +0 -182
  29. data/lib/svmkit/ensemble/random_forest_regressor.rb +0 -143
  30. data/lib/svmkit/evaluation_measure/accuracy.rb +0 -30
  31. data/lib/svmkit/evaluation_measure/f_score.rb +0 -51
  32. data/lib/svmkit/evaluation_measure/log_loss.rb +0 -46
  33. data/lib/svmkit/evaluation_measure/mean_absolute_error.rb +0 -30
  34. data/lib/svmkit/evaluation_measure/mean_squared_error.rb +0 -30
  35. data/lib/svmkit/evaluation_measure/normalized_mutual_information.rb +0 -63
  36. data/lib/svmkit/evaluation_measure/precision.rb +0 -51
  37. data/lib/svmkit/evaluation_measure/precision_recall.rb +0 -91
  38. data/lib/svmkit/evaluation_measure/purity.rb +0 -41
  39. data/lib/svmkit/evaluation_measure/r2_score.rb +0 -44
  40. data/lib/svmkit/evaluation_measure/recall.rb +0 -51
  41. data/lib/svmkit/kernel_approximation/rbf.rb +0 -136
  42. data/lib/svmkit/kernel_machine/kernel_svc.rb +0 -194
  43. data/lib/svmkit/linear_model/lasso.rb +0 -138
  44. data/lib/svmkit/linear_model/linear_regression.rb +0 -112
  45. data/lib/svmkit/linear_model/logistic_regression.rb +0 -161
  46. data/lib/svmkit/linear_model/ridge.rb +0 -112
  47. data/lib/svmkit/linear_model/sgd_linear_estimator.rb +0 -89
  48. data/lib/svmkit/linear_model/svc.rb +0 -184
  49. data/lib/svmkit/linear_model/svr.rb +0 -123
  50. data/lib/svmkit/model_selection/cross_validation.rb +0 -121
  51. data/lib/svmkit/model_selection/grid_search_cv.rb +0 -247
  52. data/lib/svmkit/model_selection/k_fold.rb +0 -77
  53. data/lib/svmkit/model_selection/stratified_k_fold.rb +0 -95
  54. data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +0 -101
  55. data/lib/svmkit/naive_bayes/naive_bayes.rb +0 -316
  56. data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +0 -112
  57. data/lib/svmkit/nearest_neighbors/k_neighbors_regressor.rb +0 -94
  58. data/lib/svmkit/optimizer/nadam.rb +0 -90
  59. data/lib/svmkit/optimizer/rmsprop.rb +0 -69
  60. data/lib/svmkit/optimizer/sgd.rb +0 -65
  61. data/lib/svmkit/optimizer/yellow_fin.rb +0 -144
  62. data/lib/svmkit/pairwise_metric.rb +0 -91
  63. data/lib/svmkit/pipeline/pipeline.rb +0 -197
  64. data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +0 -262
  65. data/lib/svmkit/polynomial_model/factorization_machine_regressor.rb +0 -194
  66. data/lib/svmkit/preprocessing/l2_normalizer.rb +0 -63
  67. data/lib/svmkit/preprocessing/label_encoder.rb +0 -95
  68. data/lib/svmkit/preprocessing/min_max_scaler.rb +0 -93
  69. data/lib/svmkit/preprocessing/one_hot_encoder.rb +0 -99
  70. data/lib/svmkit/preprocessing/standard_scaler.rb +0 -87
  71. data/lib/svmkit/probabilistic_output.rb +0 -112
  72. data/lib/svmkit/tree/decision_tree_classifier.rb +0 -276
  73. data/lib/svmkit/tree/decision_tree_regressor.rb +0 -251
  74. data/lib/svmkit/tree/node.rb +0 -70
  75. data/lib/svmkit/utils.rb +0 -22
  76. data/lib/svmkit/validation.rb +0 -79
  77. data/lib/svmkit/values.rb +0 -13
  78. data/lib/svmkit/version.rb +0 -7
data/lib/svmkit/tree/decision_tree_regressor.rb
@@ -1,251 +0,0 @@
- # frozen_string_literal: true
-
- require 'svmkit/validation'
- require 'svmkit/base/base_estimator'
- require 'svmkit/base/regressor'
- require 'svmkit/tree/node'
-
- module SVMKit
-   module Tree
-     # DecisionTreeRegressor is a class that implements decision tree for regression.
-     #
-     # @example
-     #   estimator =
-     #     SVMKit::Tree::DecisionTreeRegressor.new(
-     #       max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
-     #   estimator.fit(training_samples, training_values)
-     #   results = estimator.predict(testing_samples)
-     #
-     class DecisionTreeRegressor
-       include Base::BaseEstimator
-       include Base::Regressor
-       include Validation
-
-       # Return the importance for each feature.
-       # @return [Numo::DFloat] (size: n_features)
-       attr_reader :feature_importances
-
-       # Return the learned tree.
-       # @return [Node]
-       attr_reader :tree
-
-       # Return the random generator for random selection of feature index.
-       # @return [Random]
-       attr_reader :rng
-
-       # Return the values assigned to each leaf.
-       # @return [Numo::DFloat] (shape: [n_leaves, n_outputs])
-       attr_reader :leaf_values
-
-       # Create a new regressor with decision tree algorithm.
-       #
-       # @param criterion [String] The function to evaluate the splitting point. Supported criteria are 'mae' and 'mse'.
-       # @param max_depth [Integer] The maximum depth of the tree.
-       #   If nil is given, decision tree grows without concern for depth.
-       # @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
-       #   If nil is given, number of leaves is not limited.
-       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
-       # @param max_features [Integer] The number of features to consider when searching optimal split point.
-       #   If nil is given, split process considers all features.
-       # @param random_seed [Integer] The seed value used to initialize the random generator.
-       #   It is used to randomly determine the order of features when deciding the splitting point.
-       def initialize(criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
-                      random_seed: nil)
-         check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                           max_features: max_features, random_seed: random_seed)
-         check_params_integer(min_samples_leaf: min_samples_leaf)
-         check_params_string(criterion: criterion)
-         check_params_positive(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                               min_samples_leaf: min_samples_leaf, max_features: max_features)
-         @params = {}
-         @params[:criterion] = criterion
-         @params[:max_depth] = max_depth
-         @params[:max_leaf_nodes] = max_leaf_nodes
-         @params[:min_samples_leaf] = min_samples_leaf
-         @params[:max_features] = max_features
-         @params[:random_seed] = random_seed
-         @params[:random_seed] ||= srand
-         @criterion = :mse
-         @criterion = :mae if @params[:criterion] == 'mae'
-         @tree = nil
-         @feature_importances = nil
-         @n_leaves = nil
-         @leaf_values = nil
-         @rng = Random.new(@params[:random_seed])
-       end
-
-       # Fit the model with given training data.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-       # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
-       # @return [DecisionTreeRegressor] The learned regressor itself.
-       def fit(x, y)
-         check_sample_array(x)
-         check_tvalue_array(y)
-         check_sample_tvalue_size(x, y)
-         single_target = y.shape[1].nil?
-         y = y.expand_dims(1) if single_target
-         n_samples, n_features = x.shape
-         @params[:max_features] = n_features if @params[:max_features].nil?
-         @params[:max_features] = [@params[:max_features], n_features].min
-         build_tree(x, y)
-         @leaf_values = @leaf_values[true] if single_target
-         eval_importance(n_samples, n_features)
-         self
-       end
-
-       # Predict values for samples.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-       # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
-       def predict(x)
-         check_sample_array(x)
-         @leaf_values.shape[1].nil? ? @leaf_values[apply(x)] : @leaf_values[apply(x), true]
-       end
-
-       # Return the index of the leaf that each sample reached.
-       #
-       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-       # @return [Numo::Int32] (shape: [n_samples]) Leaf index for each sample.
-       def apply(x)
-         check_sample_array(x)
-         Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about DecisionTreeRegressor
-       def marshal_dump
-         { params: @params,
-           criterion: @criterion,
-           tree: @tree,
-           feature_importances: @feature_importances,
-           leaf_values: @leaf_values,
-           rng: @rng }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @params = obj[:params]
-         @criterion = obj[:criterion]
-         @tree = obj[:tree]
-         @feature_importances = obj[:feature_importances]
-         @leaf_values = obj[:leaf_values]
-         @rng = obj[:rng]
-         nil
-       end
-
-       private
-
-       def apply_at_node(node, sample)
-         return node.leaf_id if node.leaf
-         return apply_at_node(node.left, sample) if node.right.nil?
-         return apply_at_node(node.right, sample) if node.left.nil?
-         if sample[node.feature_id] <= node.threshold
-           apply_at_node(node.left, sample)
-         else
-           apply_at_node(node.right, sample)
-         end
-       end
-
-       def build_tree(x, y)
-         @n_leaves = 0
-         @leaf_values = []
-         @tree = grow_node(0, x, y, impurity(y))
-         @leaf_values = Numo::DFloat.cast(@leaf_values)
-         nil
-       end
-
-       def grow_node(depth, x, y, whole_impurity)
-         unless @params[:max_leaf_nodes].nil?
-           return nil if @n_leaves >= @params[:max_leaf_nodes]
-         end
-
-         n_samples, n_features = x.shape
-         return nil if n_samples <= @params[:min_samples_leaf]
-
-         node = Node.new(depth: depth, impurity: whole_impurity, n_samples: n_samples)
-
-         return put_leaf(node, y) if (y - y.mean(0)).sum.abs.zero?
-
-         unless @params[:max_depth].nil?
-           return put_leaf(node, y) if depth == @params[:max_depth]
-         end
-
-         feature_id, threshold, left_ids, right_ids, left_impurity, right_impurity, gain =
-           rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y, whole_impurity)] }.max_by(&:last)
-
-         return put_leaf(node, y) if gain.nil? || gain.zero?
-
-         node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids, true], left_impurity)
-         node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids, true], right_impurity)
-
-         return put_leaf(node, y) if node.left.nil? && node.right.nil?
-
-         node.feature_id = feature_id
-         node.threshold = threshold
-         node.leaf = false
-         node
-       end
-
-       def put_leaf(node, values)
-         node.probs = nil
-         node.leaf = true
-         node.leaf_id = @n_leaves
-         @n_leaves += 1
-         @leaf_values.push(values.mean(0))
-         node
-       end
-
-       def rand_ids(n)
-         [*0...n].sample(@params[:max_features], random: @rng)
-       end
-
-       def best_split(features, values, whole_impurity)
-         n_samples = values.shape[0]
-         features.to_a.uniq.sort.each_cons(2).map do |l, r|
-           threshold = 0.5 * (l + r)
-           left_ids = features.le(threshold).where
-           right_ids = features.gt(threshold).where
-           left_impurity = impurity(values[left_ids, true])
-           right_impurity = impurity(values[right_ids, true])
-           gain = whole_impurity -
-                  left_impurity * left_ids.size.fdiv(n_samples) -
-                  right_impurity * right_ids.size.fdiv(n_samples)
-           [threshold, left_ids, right_ids, left_impurity, right_impurity, gain]
-         end.max_by(&:last)
-       end
-
-       def impurity(values)
-         send(@criterion, values)
-       end
-
-       def mse(values)
-         ((values - values.mean(0))**2).mean
-       end
-
-       def mae(values)
-         (values - values.mean(0)).abs.mean
-       end
-
-       def eval_importance(n_samples, n_features)
-         @feature_importances = Numo::DFloat.zeros(n_features)
-         eval_importance_at_node(@tree)
-         @feature_importances /= n_samples
-         normalizer = @feature_importances.sum
-         @feature_importances /= normalizer if normalizer > 0.0
-         nil
-       end
-
-       def eval_importance_at_node(node)
-         return nil if node.leaf
-         return nil if node.left.nil? || node.right.nil?
-         gain = node.n_samples * node.impurity -
-                node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
-         @feature_importances[node.feature_id] += gain
-         eval_importance_at_node(node.left)
-         eval_importance_at_node(node.right)
-       end
-     end
-   end
- end
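The removed `best_split` above enumerates midpoints between consecutive unique feature values and keeps the threshold that maximizes the impurity gain (variance reduction under the default `'mse'` criterion). The following is a minimal plain-Ruby sketch of that computation, using Arrays instead of `Numo::DFloat`; the helper names mirror the private methods above but the data is made up:

```ruby
# Variance of a list of target values (the 'mse' criterion above).
def mse(values)
  mean = values.sum.fdiv(values.size)
  values.sum { |v| (v - mean)**2 }.fdiv(values.size)
end

# Try every midpoint between consecutive unique feature values and
# return [threshold, gain] for the split with the largest impurity gain.
def best_split(features, values)
  n = values.size
  whole = mse(values)
  features.uniq.sort.each_cons(2).map do |l, r|
    threshold = 0.5 * (l + r)
    left  = values.each_index.select { |i| features[i] <= threshold }.map { |i| values[i] }
    right = values.each_index.select { |i| features[i] >  threshold }.map { |i| values[i] }
    gain  = whole - mse(left) * left.size.fdiv(n) - mse(right) * right.size.fdiv(n)
    [threshold, gain]
  end.max_by(&:last)
end

p best_split([1.0, 2.0, 3.0, 10.0], [1.0, 1.1, 0.9, 5.0])
# => threshold 6.5 with gain ~3.0: splitting between 3.0 and 10.0 reduces variance the most
```

In `grow_node`, the same search is repeated per feature (in a randomized order from `rand_ids`) and the best feature/threshold pair is kept.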
data/lib/svmkit/tree/node.rb
@@ -1,70 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   module Tree
-     # Node is a class that implements node used for construction of decision tree.
-     # This class is used for internal data structures.
-     class Node
-       # @!visibility private
-       attr_accessor :depth, :impurity, :n_samples, :probs, :leaf, :leaf_id, :left, :right, :feature_id, :threshold
-
-       # Create a new node for decision tree.
-       #
-       # @param depth [Integer] The depth of the node in tree.
-       # @param impurity [Float] The impurity of the node.
-       # @param n_samples [Integer] The number of the samples in the node.
-       # @param probs [Float] The probability of the node.
-       # @param leaf [Boolean] The flag indicating whether the node is a leaf.
-       # @param leaf_id [Integer] The leaf index of the node.
-       # @param left [Node] The left node.
-       # @param right [Node] The right node.
-       # @param feature_id [Integer] The feature index used for evaluation.
-       # @param threshold [Float] The threshold value of the feature for splitting the node.
-       def initialize(depth: 0, impurity: 0.0, n_samples: 0, probs: 0.0,
-                      leaf: true, leaf_id: 0,
-                      left: nil, right: nil, feature_id: 0, threshold: 0.0)
-         @depth = depth
-         @impurity = impurity
-         @n_samples = n_samples
-         @probs = probs
-         @leaf = leaf
-         @leaf_id = leaf_id
-         @left = left
-         @right = right
-         @feature_id = feature_id
-         @threshold = threshold
-       end
-
-       # Dump marshal data.
-       # @return [Hash] The marshal data about Node
-       def marshal_dump
-         { depth: @depth,
-           impurity: @impurity,
-           n_samples: @n_samples,
-           probs: @probs,
-           leaf: @leaf,
-           leaf_id: @leaf_id,
-           left: @left,
-           right: @right,
-           feature_id: @feature_id,
-           threshold: @threshold }
-       end
-
-       # Load marshal data.
-       # @return [nil]
-       def marshal_load(obj)
-         @depth = obj[:depth]
-         @impurity = obj[:impurity]
-         @n_samples = obj[:n_samples]
-         @probs = obj[:probs]
-         @leaf = obj[:leaf]
-         @leaf_id = obj[:leaf_id]
-         @left = obj[:left]
-         @right = obj[:right]
-         @feature_id = obj[:feature_id]
-         @threshold = obj[:threshold]
-         nil
-       end
-     end
-   end
- end
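Because `Node` defines `marshal_dump` and `marshal_load`, trees built from these nodes serialize with Ruby's `Marshal`, which is what the regressor's own dump/load hooks above rely on when they include `tree: @tree`. A small round-trip sketch against the 0.7.3 code shown above (the attribute values here are arbitrary, for illustration only):

```ruby
require 'svmkit' # assumes svmkit 0.7.3 is installed

# Build a tiny two-node tree by hand and round-trip it through Marshal,
# which invokes the marshal_dump / marshal_load hooks defined above.
leaf = SVMKit::Tree::Node.new(depth: 1, leaf: true, leaf_id: 0)
root = SVMKit::Tree::Node.new(leaf: false, feature_id: 2, threshold: 0.5, left: leaf)

copy = Marshal.load(Marshal.dump(root))
puts copy.threshold    # => 0.5
puts copy.left.leaf_id # => 0
```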
data/lib/svmkit/utils.rb
@@ -1,22 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Utils
-     module_function
-
-     # @!visibility private
-     def choice_ids(size, probs, rng = nil)
-       rng ||= Random.new
-       Array.new(size) do
-         target = rng.rand
-         chosen = 0
-         probs.each_with_index do |p, idx|
-           break (chosen = idx) if target <= p
-           target -= p
-         end
-         chosen
-       end
-     end
-   end
- end
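`choice_ids` draws `size` indices with probability proportional to `probs`: for each draw it subtracts entries from a uniform random number until the remainder falls at or below the current entry, which is weighted sampling provided the probabilities sum to roughly one. A hedged usage sketch against the 0.7.3 code shown above (the weights are made up):

```ruby
require 'svmkit' # assumes svmkit 0.7.3, where SVMKit::Utils still exists

# Sample 10,000 indices from the weights below; index 2 should be drawn
# roughly 60% of the time because its weight is 0.6.
probs = [0.1, 0.3, 0.6]
ids = SVMKit::Utils.choice_ids(10_000, probs, Random.new(1))
p ids.tally.sort.to_h # => approximately {0=>1000, 1=>3000, 2=>6000}
```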
data/lib/svmkit/validation.rb
@@ -1,79 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Validation
-     module_function
-
-     # @!visibility private
-     def check_sample_array(x)
-       raise TypeError, 'Expect class of sample matrix to be Numo::DFloat' unless x.is_a?(Numo::DFloat)
-       raise ArgumentError, 'Expect sample matrix to be 2-D array' unless x.shape.size == 2
-       nil
-     end
-
-     # @!visibility private
-     def check_label_array(y)
-       raise TypeError, 'Expect class of label vector to be Numo::Int32' unless y.is_a?(Numo::Int32)
-       raise ArgumentError, 'Expect label vector to be 1-D array' unless y.shape.size == 1
-       nil
-     end
-
-     # @!visibility private
-     def check_tvalue_array(y)
-       raise TypeError, 'Expect class of target value vector to be Numo::DFloat' unless y.is_a?(Numo::DFloat)
-       nil
-     end
-
-     # @!visibility private
-     def check_sample_label_size(x, y)
-       raise ArgumentError, 'Expect to have the same number of samples for sample matrix and label vector' unless x.shape[0] == y.shape[0]
-       nil
-     end
-
-     # @!visibility private
-     def check_sample_tvalue_size(x, y)
-       raise ArgumentError, 'Expect to have the same number of samples for sample matrix and target value vector' unless x.shape[0] == y.shape[0]
-       nil
-     end
-
-     # @!visibility private
-     def check_params_type(type, params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type}" unless v.is_a?(type) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_type_or_nil(type, params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be #{type} or nil" unless v.is_a?(type) || v.is_a?(NilClass) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_float(params = {})
-       check_params_type(Float, params)
-     end
-
-     # @!visibility private
-     def check_params_integer(params = {})
-       check_params_type(Integer, params)
-     end
-
-     # @!visibility private
-     def check_params_string(params = {})
-       check_params_type(String, params)
-     end
-
-     # @!visibility private
-     def check_params_boolean(params = {})
-       params.each { |k, v| raise TypeError, "Expect class of #{k} to be Boolean" unless v.is_a?(FalseClass) || v.is_a?(TrueClass) }
-       nil
-     end
-
-     # @!visibility private
-     def check_params_positive(params = {})
-       params.reject { |_, v| v.nil? }.each { |k, v| raise ArgumentError, "Expect #{k} to be positive value" if v < 0 }
-       nil
-     end
-   end
- end
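These are the helpers the estimators above mix in (`include Validation`) to type-check constructor arguments and input arrays before fitting. A short sketch of how they behave, again against the 0.7.3 code shown above:

```ruby
require 'svmkit' # assumes svmkit 0.7.3
include SVMKit::Validation

check_params_integer(min_samples_leaf: 1)               # passes: value is an Integer
check_params_positive(max_depth: 3, max_features: nil)  # passes: nil entries are skipped

begin
  check_params_positive(min_samples_leaf: -1)
rescue ArgumentError => e
  puts e.message # => Expect min_samples_leaf to be positive value
end
```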
data/lib/svmkit/values.rb
@@ -1,13 +0,0 @@
- # frozen_string_literal: true
-
- module SVMKit
-   # @!visibility private
-   module Values
-     module_function
-
-     # @!visibility private
-     def int_max
-       @int_max ||= 2**([42].pack('i').size * 16 - 2) - 1
-     end
-   end
- end
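`int_max` derives a platform-dependent maximum integer from the byte size of a C `int`: `[42].pack('i').size` is 4 on common platforms, so the memoized value is `2**62 - 1`, matching the old Fixnum maximum on 64-bit MRI. A quick check (output assumes a typical 64-bit build):

```ruby
require 'svmkit' # assumes svmkit 0.7.3

p [42].pack('i').size                 # => 4 (bytes in a C int on common platforms)
p SVMKit::Values.int_max              # => 4611686018427387903
p SVMKit::Values.int_max == 2**62 - 1 # => true
```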
data/lib/svmkit/version.rb
@@ -1,7 +0,0 @@
- # frozen_string_literal: true
-
- # SVMKit is a machine learning library in Ruby.
- module SVMKit
-   # @!visibility private
-   VERSION = '0.7.3'.freeze
- end