svmkit 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.coveralls.yml +1 -0
- data/HISTORY.md +7 -0
- data/README.md +2 -1
- data/lib/svmkit.rb +3 -0
- data/lib/svmkit/base/base_estimator.rb +1 -0
- data/lib/svmkit/base/classifier.rb +9 -3
- data/lib/svmkit/base/evaluator.rb +1 -0
- data/lib/svmkit/base/splitter.rb +1 -0
- data/lib/svmkit/base/transformer.rb +1 -0
- data/lib/svmkit/dataset.rb +2 -0
- data/lib/svmkit/ensemble/random_forest_classifier.rb +161 -0
- data/lib/svmkit/evaluation_measure/accuracy.rb +2 -0
- data/lib/svmkit/evaluation_measure/f_score.rb +2 -0
- data/lib/svmkit/evaluation_measure/precision.rb +2 -0
- data/lib/svmkit/evaluation_measure/precision_recall.rb +2 -0
- data/lib/svmkit/evaluation_measure/recall.rb +2 -0
- data/lib/svmkit/kernel_approximation/rbf.rb +2 -0
- data/lib/svmkit/kernel_machine/kernel_svc.rb +3 -3
- data/lib/svmkit/linear_model/logistic_regression.rb +2 -11
- data/lib/svmkit/linear_model/svc.rb +2 -11
- data/lib/svmkit/model_selection/cross_validation.rb +2 -0
- data/lib/svmkit/model_selection/k_fold.rb +2 -0
- data/lib/svmkit/model_selection/stratified_k_fold.rb +2 -0
- data/lib/svmkit/multiclass/one_vs_rest_classifier.rb +2 -11
- data/lib/svmkit/naive_bayes/naive_bayes.rb +5 -13
- data/lib/svmkit/nearest_neighbors/k_neighbors_classifier.rb +3 -12
- data/lib/svmkit/pairwise_metric.rb +2 -0
- data/lib/svmkit/polynomial_model/factorization_machine_classifier.rb +3 -12
- data/lib/svmkit/preprocessing/l2_normalizer.rb +2 -0
- data/lib/svmkit/preprocessing/min_max_scaler.rb +2 -0
- data/lib/svmkit/preprocessing/standard_scaler.rb +2 -0
- data/lib/svmkit/tree/decision_tree_classifier.rb +260 -0
- data/lib/svmkit/version.rb +4 -2
- data/svmkit.gemspec +2 -2
- metadata +9 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b36f6b299c47d1107d587aafeb7bb66531f1208c
|
4
|
+
data.tar.gz: 1bd382f3339b8fb08454493a45a2338020791b6c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0676f2e9b3ef4ac9786f10ca976721e73d2cd918a9c939900281e36267ab14a3413c3a719d9504415f527e5d6163d5640ea2af186e023c3980327cb7c476afba
|
7
|
+
data.tar.gz: e8dcf72f7d1641903a4625bb23399deabf9a19931a5c00bd2c5077b525a2d0361b194b8c43dee1b7a365b25eafdabb460e0a4ae21f17afd5b401379671379463
|
data/.coveralls.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
service_name: travis-ci
|
data/HISTORY.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
# 0.2.6
|
2
|
+
- Added class for Decision Tree classifier.
|
3
|
+
- Added class for Random Forest classifier.
|
4
|
+
- Fixed to use frozen string literal.
|
5
|
+
- Refactored marshal dump method on some classes.
|
6
|
+
- Introduced Coveralls to confirm test coverage.
|
7
|
+
|
1
8
|
# 0.2.5
|
2
9
|
- Added classes for Naive Bayes classifier.
|
3
10
|
- Fixed decision function method on Logistic Regression class.
|
data/README.md
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
# SVMKit
|
2
2
|
|
3
3
|
[![Build Status](https://travis-ci.org/yoshoku/SVMKit.svg?branch=master)](https://travis-ci.org/yoshoku/SVMKit)
|
4
|
+
[![Coverage Status](https://coveralls.io/repos/github/yoshoku/SVMKit/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/SVMKit?branch=master)
|
4
5
|
[![Gem Version](https://badge.fury.io/rb/svmkit.svg)](https://badge.fury.io/rb/svmkit)
|
5
6
|
[![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/SVMKit/blob/master/LICENSE.txt)
|
6
7
|
|
7
8
|
SVMKit is a machine learninig library in Ruby.
|
8
9
|
SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
|
9
10
|
SVMKit currently supports Linear / Kernel Support Vector Machine,
|
10
|
-
Logistic Regression, Factorization Machine, Naive Bayes,
|
11
|
+
Logistic Regression, Factorization Machine, Naive Bayes, Decision Tree, Random Forest,
|
11
12
|
K-nearest neighbor classifier, and cross-validation.
|
12
13
|
|
13
14
|
## Installation
|
data/lib/svmkit.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
|
2
3
|
require 'numo/narray'
|
3
4
|
|
@@ -17,6 +18,8 @@ require 'svmkit/polynomial_model/factorization_machine_classifier'
|
|
17
18
|
require 'svmkit/multiclass/one_vs_rest_classifier'
|
18
19
|
require 'svmkit/nearest_neighbors/k_neighbors_classifier'
|
19
20
|
require 'svmkit/naive_bayes/naive_bayes'
|
21
|
+
require 'svmkit/tree/decision_tree_classifier'
|
22
|
+
require 'svmkit/ensemble/random_forest_classifier'
|
20
23
|
require 'svmkit/preprocessing/l2_normalizer'
|
21
24
|
require 'svmkit/preprocessing/min_max_scaler'
|
22
25
|
require 'svmkit/preprocessing/standard_scaler'
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
|
2
3
|
module SVMKit
|
3
4
|
module Base
|
@@ -13,9 +14,14 @@ module SVMKit
|
|
13
14
|
raise NotImplementedError, "#{__method__} has to be implemented in #{self.class}."
|
14
15
|
end
|
15
16
|
|
16
|
-
#
|
17
|
-
|
18
|
-
|
17
|
+
# Claculate the mean accuracy of the given testing data.
|
18
|
+
#
|
19
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
20
|
+
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
21
|
+
# @return [Float] Mean accuracy
|
22
|
+
def score(x, y)
|
23
|
+
evaluator = SVMKit::EvaluationMeasure::Accuracy.new
|
24
|
+
evaluator.score(y, predict(x))
|
19
25
|
end
|
20
26
|
end
|
21
27
|
end
|
data/lib/svmkit/base/splitter.rb
CHANGED
data/lib/svmkit/dataset.rb
CHANGED
@@ -0,0 +1,161 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'svmkit/base/base_estimator'
|
4
|
+
require 'svmkit/base/classifier'
|
5
|
+
|
6
|
+
module SVMKit
|
7
|
+
# This module consists of the classes that implement ensemble-based methods.
|
8
|
+
module Ensemble
|
9
|
+
# RandomForestClassifier is a class that implements random forest for classification.
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
# estimator =
|
13
|
+
# SVMKit::Ensemble::RandomForestClassifier.new(
|
14
|
+
# n_estimators: 10, criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
15
|
+
# estimator.fit(training_samples, traininig_labels)
|
16
|
+
# results = estimator.predict(testing_samples)
|
17
|
+
#
|
18
|
+
class RandomForestClassifier
|
19
|
+
include Base::BaseEstimator
|
20
|
+
include Base::Classifier
|
21
|
+
|
22
|
+
# Return the set of estimators.
|
23
|
+
# @return [Array<DecisionTreeClassifier>]
|
24
|
+
attr_reader :estimators
|
25
|
+
|
26
|
+
# Return the class labels.
|
27
|
+
# @return [Numo::Int32] (size: n_classes)
|
28
|
+
attr_reader :classes
|
29
|
+
|
30
|
+
# Return the importance for each feature.
|
31
|
+
# @return [Numo::DFloat] (size: n_features)
|
32
|
+
attr_reader :feature_importances
|
33
|
+
|
34
|
+
# Return the random generator for performing random sampling in the Pegasos algorithm.
|
35
|
+
# @return [Random]
|
36
|
+
attr_reader :rng
|
37
|
+
|
38
|
+
# Create a new classifier with random forest.
|
39
|
+
#
|
40
|
+
# @param n_estimators [Integer] The numeber of decision trees for contructing random forest.
|
41
|
+
# @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
|
42
|
+
# @param max_depth [Integer] The maximum depth of the tree.
|
43
|
+
# If nil is given, decision tree grows without concern for depth.
|
44
|
+
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
45
|
+
# If nil is given, number of leaves is not limited.
|
46
|
+
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
47
|
+
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
48
|
+
# If nil is given, split process considers all features.
|
49
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
50
|
+
# It is used to randomly determine the order of features when deciding spliting point.
|
51
|
+
def initialize(n_estimators: 10, criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
|
52
|
+
max_features: nil, random_seed: nil)
|
53
|
+
@params = {}
|
54
|
+
@params[:n_estimators] = n_estimators
|
55
|
+
@params[:criterion] = criterion
|
56
|
+
@params[:max_depth] = max_depth
|
57
|
+
@params[:max_leaf_nodes] = max_leaf_nodes
|
58
|
+
@params[:min_samples_leaf] = min_samples_leaf
|
59
|
+
@params[:max_features] = max_features
|
60
|
+
@params[:random_seed] = random_seed
|
61
|
+
@params[:random_seed] ||= srand
|
62
|
+
@rng = Random.new(@params[:random_seed])
|
63
|
+
@estimators = nil
|
64
|
+
@classes = nil
|
65
|
+
@feature_importances = nil
|
66
|
+
end
|
67
|
+
|
68
|
+
# Fit the model with given training data.
|
69
|
+
#
|
70
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
71
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
72
|
+
# @return [RandomForestClassifier] The learned classifier itself.
|
73
|
+
def fit(x, y)
|
74
|
+
# Initialize some variables.
|
75
|
+
n_samples, n_features = x.shape
|
76
|
+
@params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
|
77
|
+
@params[:max_features] = [[1, @params[:max_features]].max, Math.sqrt(n_features).to_i].min
|
78
|
+
@classes = Numo::Int32.asarray(y.to_a.uniq.sort)
|
79
|
+
# Construct forest.
|
80
|
+
@estimators = Array.new(@params[:n_estimators]) do |_n|
|
81
|
+
tree = Tree::DecisionTreeClassifier.new(
|
82
|
+
criterion: @params[:criterion], max_depth: @params[:max_depth],
|
83
|
+
max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
|
84
|
+
max_features: @params[:max_features], random_seed: @params[:random_seed]
|
85
|
+
)
|
86
|
+
bootstrap_ids = Array.new(n_samples) { @rng.rand(0...n_samples) }
|
87
|
+
tree.fit(x[bootstrap_ids, true], y[bootstrap_ids])
|
88
|
+
end
|
89
|
+
# Calculate feature importances.
|
90
|
+
@feature_importances = Numo::DFloat.zeros(n_features)
|
91
|
+
@estimators.each { |tree| @feature_importances += tree.feature_importances }
|
92
|
+
@feature_importances /= @feature_importances.sum
|
93
|
+
self
|
94
|
+
end
|
95
|
+
|
96
|
+
# Predict class labels for samples.
|
97
|
+
#
|
98
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
99
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
100
|
+
def predict(x)
|
101
|
+
n_samples, = x.shape
|
102
|
+
n_classes = @classes.size
|
103
|
+
classes_arr = @classes.to_a
|
104
|
+
ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
|
105
|
+
@estimators.each do |tree|
|
106
|
+
predicted = tree.predict(x)
|
107
|
+
n_samples.times do |n|
|
108
|
+
class_id = classes_arr.index(predicted[n])
|
109
|
+
ballot_box[n, class_id] += 1.0 unless class_id.nil?
|
110
|
+
end
|
111
|
+
end
|
112
|
+
Numo::Int32[*Array.new(n_samples) { |n| @classes[ballot_box[n, true].max_index] }]
|
113
|
+
end
|
114
|
+
|
115
|
+
# Predict probability for samples.
|
116
|
+
#
|
117
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
|
118
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
119
|
+
def predict_proba(x)
|
120
|
+
n_samples, = x.shape
|
121
|
+
n_classes = @classes.size
|
122
|
+
classes_arr = @classes.to_a
|
123
|
+
ballot_box = Numo::DFloat.zeros(n_samples, n_classes)
|
124
|
+
@estimators.each do |tree|
|
125
|
+
probs = tree.predict_proba(x)
|
126
|
+
tree.classes.size.times do |n|
|
127
|
+
class_id = classes_arr.index(tree.classes[n])
|
128
|
+
ballot_box[true, class_id] += probs[true, n] unless class_id.nil?
|
129
|
+
end
|
130
|
+
end
|
131
|
+
(ballot_box.transpose / ballot_box.sum(axis: 1)).transpose
|
132
|
+
end
|
133
|
+
|
134
|
+
# Return the index of the leaf that each sample reached.
|
135
|
+
#
|
136
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
137
|
+
# @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
|
138
|
+
def apply(x)
|
139
|
+
Numo::Int32[*Array.new(@params[:n_estimators]) { |n| @estimators[n].apply(x) }].transpose
|
140
|
+
end
|
141
|
+
|
142
|
+
# Dump marshal data.
|
143
|
+
# @return [Hash] The marshal data about RandomForestClassifier
|
144
|
+
def marshal_dump
|
145
|
+
{ params: @params, estimators: @estimators, classes: @classes,
|
146
|
+
feature_importances: @feature_importances, rng: @rng }
|
147
|
+
end
|
148
|
+
|
149
|
+
# Load marshal data.
|
150
|
+
# @return [nil]
|
151
|
+
def marshal_load(obj)
|
152
|
+
@params = obj[:params]
|
153
|
+
@estimators = obj[:estimators]
|
154
|
+
@classes = obj[:classes]
|
155
|
+
@feature_importances = obj[:feature_importances]
|
156
|
+
@rng = obj[:rng]
|
157
|
+
nil
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'svmkit/base/base_estimator'
|
2
4
|
require 'svmkit/base/classifier'
|
3
5
|
|
@@ -97,9 +99,7 @@ module SVMKit
|
|
97
99
|
# @param y [Numo::Int32] (shape: [n_testing_samples]) True labels for testing data.
|
98
100
|
# @return [Float] Mean accuracy
|
99
101
|
def score(x, y)
|
100
|
-
|
101
|
-
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
102
|
-
n_hits / y.size.to_f
|
102
|
+
super
|
103
103
|
end
|
104
104
|
|
105
105
|
# Dump marshal data.
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'svmkit/base/base_estimator'
|
2
4
|
require 'svmkit/base/classifier'
|
3
5
|
|
@@ -135,17 +137,6 @@ module SVMKit
|
|
135
137
|
proba
|
136
138
|
end
|
137
139
|
|
138
|
-
# Claculate the mean accuracy of the given testing data.
|
139
|
-
#
|
140
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
141
|
-
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
142
|
-
# @return [Float] Mean accuracy
|
143
|
-
def score(x, y)
|
144
|
-
p = predict(x)
|
145
|
-
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
146
|
-
n_hits / y.size.to_f
|
147
|
-
end
|
148
|
-
|
149
140
|
# Dump marshal data.
|
150
141
|
# @return [Hash] The marshal data about LogisticRegression.
|
151
142
|
def marshal_dump
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'svmkit/base/base_estimator'
|
2
4
|
require 'svmkit/base/classifier'
|
3
5
|
|
@@ -118,17 +120,6 @@ module SVMKit
|
|
118
120
|
Numo::Int32.cast(decision_function(x).map { |v| v >= 0 ? 1 : -1 })
|
119
121
|
end
|
120
122
|
|
121
|
-
# Claculate the mean accuracy of the given testing data.
|
122
|
-
#
|
123
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
124
|
-
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
125
|
-
# @return [Float] Mean accuracy
|
126
|
-
def score(x, y)
|
127
|
-
p = predict(x)
|
128
|
-
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
129
|
-
n_hits / y.size.to_f
|
130
|
-
end
|
131
|
-
|
132
123
|
# Dump marshal data.
|
133
124
|
# @return [Hash] The marshal data about SVC.
|
134
125
|
def marshal_dump
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'svmkit/base/base_estimator.rb'
|
2
4
|
require 'svmkit/base/classifier.rb'
|
3
5
|
|
@@ -68,17 +70,6 @@ module SVMKit
|
|
68
70
|
Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
|
69
71
|
end
|
70
72
|
|
71
|
-
# Claculate the mean accuracy of the given testing data.
|
72
|
-
#
|
73
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
74
|
-
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
75
|
-
# @return [Float] Mean accuracy
|
76
|
-
def score(x, y)
|
77
|
-
p = predict(x)
|
78
|
-
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
79
|
-
n_hits / y.size.to_f
|
80
|
-
end
|
81
|
-
|
82
73
|
# Dump marshal data.
|
83
74
|
# @return [Hash] The marshal data about OneVsRestClassifier.
|
84
75
|
def marshal_dump
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'svmkit/base/base_estimator'
|
2
4
|
require 'svmkit/base/classifier'
|
3
5
|
|
@@ -36,16 +38,6 @@ module SVMKit
|
|
36
38
|
def predict_proba(x)
|
37
39
|
Numo::NMath.exp(predict_log_proba(x)).abs
|
38
40
|
end
|
39
|
-
|
40
|
-
# Claculate the mean accuracy of the given testing data.
|
41
|
-
#
|
42
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
43
|
-
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
44
|
-
# @return [Float] Mean accuracy
|
45
|
-
def score(x, y)
|
46
|
-
evaluator = SVMKit::EvaluationMeasure::Accuracy.new
|
47
|
-
evaluator.score(y, predict(x))
|
48
|
-
end
|
49
41
|
end
|
50
42
|
|
51
43
|
# GaussianNB is a class that implements Gaussian Naive Bayes classifier.
|
@@ -109,7 +101,7 @@ module SVMKit
|
|
109
101
|
#
|
110
102
|
# @return [Hash] The marshal data about GaussianNB.
|
111
103
|
def marshal_dump
|
112
|
-
{ params: params,
|
104
|
+
{ params: @params,
|
113
105
|
classes: @classes,
|
114
106
|
class_priors: @class_priors,
|
115
107
|
means: @means,
|
@@ -193,7 +185,7 @@ module SVMKit
|
|
193
185
|
#
|
194
186
|
# @return [Hash] The marshal data about MultinomialNB.
|
195
187
|
def marshal_dump
|
196
|
-
{ params: params,
|
188
|
+
{ params: @params,
|
197
189
|
classes: @classes,
|
198
190
|
class_priors: @class_priors,
|
199
191
|
feature_probs: @feature_probs }
|
@@ -283,7 +275,7 @@ module SVMKit
|
|
283
275
|
#
|
284
276
|
# @return [Hash] The marshal data about BernoulliNB.
|
285
277
|
def marshal_dump
|
286
|
-
{ params: params,
|
278
|
+
{ params: @params,
|
287
279
|
classes: @classes,
|
288
280
|
class_priors: @class_priors,
|
289
281
|
feature_probs: @feature_probs }
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'svmkit/base/base_estimator'
|
2
4
|
require 'svmkit/base/classifier'
|
3
5
|
|
@@ -79,21 +81,10 @@ module SVMKit
|
|
79
81
|
Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
|
80
82
|
end
|
81
83
|
|
82
|
-
# Claculate the mean accuracy of the given testing data.
|
83
|
-
#
|
84
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
85
|
-
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
86
|
-
# @return [Float] Mean accuracy
|
87
|
-
def score(x, y)
|
88
|
-
p = predict(x)
|
89
|
-
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
90
|
-
n_hits / y.size.to_f
|
91
|
-
end
|
92
|
-
|
93
84
|
# Dump marshal data.
|
94
85
|
# @return [Hash] The marshal data about KNeighborsClassifier.
|
95
86
|
def marshal_dump
|
96
|
-
{ params: params,
|
87
|
+
{ params: @params,
|
97
88
|
prototypes: @prototypes,
|
98
89
|
labels: @labels,
|
99
90
|
classes: @classes }
|
@@ -1,8 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'svmkit/base/base_estimator'
|
2
4
|
require 'svmkit/base/classifier'
|
3
5
|
|
4
6
|
module SVMKit
|
5
|
-
# This module consists of the classes that
|
7
|
+
# This module consists of the classes that implement polynomial models.
|
6
8
|
module PolynomialModel
|
7
9
|
# FactorizationMachineClassifier is a class that
|
8
10
|
# implements Fatorization Machine for binary classification
|
@@ -136,17 +138,6 @@ module SVMKit
|
|
136
138
|
proba
|
137
139
|
end
|
138
140
|
|
139
|
-
# Claculate the mean accuracy of the given testing data.
|
140
|
-
#
|
141
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
|
142
|
-
# @param y [Numo::Int32] (shape: [n_samples]) True labels for testing data.
|
143
|
-
# @return [Float] Mean accuracy
|
144
|
-
def score(x, y)
|
145
|
-
p = predict(x)
|
146
|
-
n_hits = (y.to_a.map.with_index { |l, n| l == p[n] ? 1 : 0 }).inject(:+)
|
147
|
-
n_hits / y.size.to_f
|
148
|
-
end
|
149
|
-
|
150
141
|
# Dump marshal data.
|
151
142
|
# @return [Hash] The marshal data about FactorizationMachineClassifier
|
152
143
|
def marshal_dump
|
@@ -0,0 +1,260 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'svmkit/base/base_estimator'
|
4
|
+
require 'svmkit/base/classifier'
|
5
|
+
require 'ostruct'
|
6
|
+
|
7
|
+
module SVMKit
|
8
|
+
# This module consists of the classes that implement tree models.
|
9
|
+
module Tree
|
10
|
+
# DecisionTreeClassifier is a class that implements decision tree for classification.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# estimator =
|
14
|
+
# SVMKit::Tree::DecisionTreeClassifier.new(
|
15
|
+
# criterion: 'gini', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
|
16
|
+
# estimator.fit(training_samples, traininig_labels)
|
17
|
+
# results = estimator.predict(testing_samples)
|
18
|
+
#
|
19
|
+
class DecisionTreeClassifier
|
20
|
+
include Base::BaseEstimator
|
21
|
+
include Base::Classifier
|
22
|
+
|
23
|
+
# Return the class labels.
|
24
|
+
# @return [Numo::Int32] (size: n_classes)
|
25
|
+
attr_reader :classes
|
26
|
+
|
27
|
+
# Return the importance for each feature.
|
28
|
+
# @return [Numo::DFloat] (size: n_features)
|
29
|
+
attr_reader :feature_importances
|
30
|
+
|
31
|
+
# Return the learned tree.
|
32
|
+
# @return [OpenStruct]
|
33
|
+
attr_reader :tree
|
34
|
+
|
35
|
+
# Return the random generator for performing random sampling in the Pegasos algorithm.
|
36
|
+
# @return [Random]
|
37
|
+
attr_reader :rng
|
38
|
+
|
39
|
+
# Return the labels assigned each leaf.
|
40
|
+
# @return [Numo::Int32] (size: n_leafs)
|
41
|
+
attr_reader :leaf_labels
|
42
|
+
|
43
|
+
# Create a new classifier with decision tree algorithm.
|
44
|
+
#
|
45
|
+
# @param criterion [String] The function to evalue spliting point. Supported criteria are 'gini' and 'entropy'.
|
46
|
+
# @param max_depth [Integer] The maximum depth of the tree.
|
47
|
+
# If nil is given, decision tree grows without concern for depth.
|
48
|
+
# @param max_leaf_nodes [Integer] The maximum number of leaves on decision tree.
|
49
|
+
# If nil is given, number of leaves is not limited.
|
50
|
+
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
|
51
|
+
# @param max_features [Integer] The number of features to consider when searching optimal split point.
|
52
|
+
# If nil is given, split process considers all features.
|
53
|
+
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
54
|
+
# It is used to randomly determine the order of features when deciding spliting point.
|
55
|
+
def initialize(criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil,
|
56
|
+
random_seed: nil)
|
57
|
+
@params = {}
|
58
|
+
@params[:criterion] = criterion
|
59
|
+
@params[:max_depth] = max_depth
|
60
|
+
@params[:max_leaf_nodes] = max_leaf_nodes
|
61
|
+
@params[:min_samples_leaf] = min_samples_leaf
|
62
|
+
@params[:max_features] = max_features
|
63
|
+
@params[:random_seed] = random_seed
|
64
|
+
@params[:random_seed] ||= srand
|
65
|
+
@rng = Random.new(@params[:random_seed])
|
66
|
+
@tree = nil
|
67
|
+
@classes = nil
|
68
|
+
@feature_importances = nil
|
69
|
+
@n_leaves = nil
|
70
|
+
@leaf_labels = nil
|
71
|
+
end
|
72
|
+
|
73
|
+
# Fit the model with given training data.
|
74
|
+
#
|
75
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
76
|
+
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
77
|
+
# @return [DecisionTreeClassifier] The learned classifier itself.
|
78
|
+
def fit(x, y)
|
79
|
+
n_samples, n_features = x.shape
|
80
|
+
@params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
|
81
|
+
@params[:max_features] = [[1, @params[:max_features]].max, n_features].min
|
82
|
+
@classes = Numo::Int32.asarray(y.to_a.uniq.sort)
|
83
|
+
build_tree(x, y)
|
84
|
+
eval_importance(n_samples, n_features)
|
85
|
+
self
|
86
|
+
end
|
87
|
+
|
88
|
+
# Predict class labels for samples.
|
89
|
+
#
|
90
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
91
|
+
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
|
92
|
+
def predict(x)
|
93
|
+
@leaf_labels[apply(x)]
|
94
|
+
end
|
95
|
+
|
96
|
+
# Predict probability for samples.
|
97
|
+
#
|
98
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
|
99
|
+
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
|
100
|
+
def predict_proba(x)
|
101
|
+
probs = Numo::DFloat[*(Array.new(x.shape[0]) { |n| predict_at_node(@tree, x[n, true]) })]
|
102
|
+
probs[true, @classes]
|
103
|
+
end
|
104
|
+
|
105
|
+
# Return the index of the leaf that each sample reached.
|
106
|
+
#
|
107
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
|
108
|
+
# @return [Numo::Int32] (shape: [n_samples]) Leaf index for sample.
|
109
|
+
def apply(x)
|
110
|
+
Numo::Int32[*(Array.new(x.shape[0]) { |n| apply_at_node(@tree, x[n, true]) })]
|
111
|
+
end
|
112
|
+
|
113
|
+
# Dump marshal data.
|
114
|
+
# @return [Hash] The marshal data about DecisionTreeClassifier
|
115
|
+
def marshal_dump
|
116
|
+
{ params: @params, classes: @classes, tree: @tree,
|
117
|
+
feature_importances: @feature_importances, leaf_labels: @leaf_labels,
|
118
|
+
rng: @rng }
|
119
|
+
end
|
120
|
+
|
121
|
+
# Load marshal data.
|
122
|
+
# @return [nil]
|
123
|
+
def marshal_load(obj)
|
124
|
+
@params = obj[:params]
|
125
|
+
@classes = obj[:classes]
|
126
|
+
@tree = obj[:tree]
|
127
|
+
@feature_importances = obj[:feature_importances]
|
128
|
+
@leaf_labels = obj[:leaf_labels]
|
129
|
+
@rng = obj[:rng]
|
130
|
+
nil
|
131
|
+
end
|
132
|
+
|
133
|
+
private
|
134
|
+
|
135
|
+
def predict_at_node(node, sample)
|
136
|
+
return node.probs if node.leaf
|
137
|
+
branch_at_node('predict', node, sample)
|
138
|
+
end
|
139
|
+
|
140
|
+
def apply_at_node(node, sample)
|
141
|
+
return node.leaf_id if node.leaf
|
142
|
+
branch_at_node('apply', node, sample)
|
143
|
+
end
|
144
|
+
|
145
|
+
def branch_at_node(action, node, sample)
|
146
|
+
return send("#{action}_at_node", node.left, sample) if node.right.nil?
|
147
|
+
return send("#{action}_at_node", node.right, sample) if node.left.nil?
|
148
|
+
if sample[node.feature_id] <= node.threshold
|
149
|
+
send("#{action}_at_node", node.left, sample)
|
150
|
+
else
|
151
|
+
send("#{action}_at_node", node.right, sample)
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
def build_tree(x, y)
|
156
|
+
@n_leaves = 0
|
157
|
+
@leaf_labels = []
|
158
|
+
@tree = grow_node(0, x, y)
|
159
|
+
@leaf_labels = Numo::Int32[*@leaf_labels]
|
160
|
+
nil
|
161
|
+
end
|
162
|
+
|
163
|
+
def grow_node(depth, x, y)
|
164
|
+
if @params[:max_leaf_nodes].is_a?(Integer)
|
165
|
+
return nil if @n_leaves >= @params[:max_leaf_nodes]
|
166
|
+
end
|
167
|
+
|
168
|
+
n_samples, n_features = x.shape
|
169
|
+
if @params[:min_samples_leaf].is_a?(Integer)
|
170
|
+
return nil if n_samples <= @params[:min_samples_leaf]
|
171
|
+
end
|
172
|
+
|
173
|
+
node = OpenStruct.new(depth: depth, impurity: impurity(y), n_samples: n_samples)
|
174
|
+
|
175
|
+
return put_leaf(node, y) if y.to_a.uniq.size == 1
|
176
|
+
|
177
|
+
if @params[:max_depth].is_a?(Integer)
|
178
|
+
return put_leaf(node, y) if depth == @params[:max_depth]
|
179
|
+
end
|
180
|
+
|
181
|
+
feature_id, threshold, left_ids, right_ids, max_gain =
|
182
|
+
rand_ids(n_features).map { |f_id| [f_id, *best_split(x[true, f_id], y)] }.max_by(&:last)
|
183
|
+
return put_leaf(node, y) if max_gain.nil?
|
184
|
+
return put_leaf(node, y) if max_gain.zero?
|
185
|
+
|
186
|
+
node.left = grow_node(depth + 1, x[left_ids, true], y[left_ids])
|
187
|
+
node.right = grow_node(depth + 1, x[right_ids, true], y[right_ids])
|
188
|
+
return put_leaf(node, y) if node.left.nil? && node.right.nil?
|
189
|
+
|
190
|
+
node.feature_id = feature_id
|
191
|
+
node.threshold = threshold
|
192
|
+
node.leaf = false
|
193
|
+
node
|
194
|
+
end
|
195
|
+
|
196
|
+
def put_leaf(node, y)
|
197
|
+
node.probs = y.bincount(minlength: @classes.max + 1) / node.n_samples.to_f
|
198
|
+
node.leaf = true
|
199
|
+
node.leaf_id = @n_leaves
|
200
|
+
@n_leaves += 1
|
201
|
+
@leaf_labels.push(node.probs.max_index)
|
202
|
+
node
|
203
|
+
end
|
204
|
+
|
205
|
+
def rand_ids(n)
|
206
|
+
[*0...n].sample(@params[:max_features], random: @rng)
|
207
|
+
end
|
208
|
+
|
209
|
+
def best_split(features, labels)
|
210
|
+
features.to_a.uniq.sort.each_cons(2).map do |l, r|
|
211
|
+
threshold = 0.5 * (l + r)
|
212
|
+
left_ids, right_ids = splited_ids(features, threshold)
|
213
|
+
[threshold, left_ids, right_ids, gain(labels, labels[left_ids], labels[right_ids])]
|
214
|
+
end.max_by(&:last)
|
215
|
+
end
|
216
|
+
|
217
|
+
def splited_ids(features, threshold)
|
218
|
+
[features.le(threshold).where.to_a, features.gt(threshold).where.to_a]
|
219
|
+
end
|
220
|
+
|
221
|
+
def gain(labels, labels_left, labels_right)
|
222
|
+
prob_left = labels_left.size / labels.size.to_f
|
223
|
+
prob_right = labels_right.size / labels.size.to_f
|
224
|
+
impurity(labels) - prob_left * impurity(labels_left) - prob_right * impurity(labels_right)
|
225
|
+
end
|
226
|
+
|
227
|
+
def impurity(labels)
|
228
|
+
posterior_probs = labels.to_a.uniq.sort.map { |c| labels.eq(c).count / labels.size.to_f }
|
229
|
+
@params[:criterion] == 'entropy' ? entropy(posterior_probs) : gini(posterior_probs)
|
230
|
+
end
|
231
|
+
|
232
|
+
def gini(posterior_probs)
|
233
|
+
1.0 - posterior_probs.map { |p| p**2 }.inject(:+)
|
234
|
+
end
|
235
|
+
|
236
|
+
def entropy(posterior_probs)
|
237
|
+
-posterior_probs.map { |p| p * Math.log(p) }.inject(:+)
|
238
|
+
end
|
239
|
+
|
240
|
+
def eval_importance(n_samples, n_features)
|
241
|
+
@feature_importances = Numo::DFloat.zeros(n_features)
|
242
|
+
eval_importance_at_node(@tree)
|
243
|
+
@feature_importances /= n_samples
|
244
|
+
normalizer = @feature_importances.sum
|
245
|
+
@feature_importances /= normalizer if normalizer > 0.0
|
246
|
+
nil
|
247
|
+
end
|
248
|
+
|
249
|
+
def eval_importance_at_node(node)
|
250
|
+
return nil if node.leaf
|
251
|
+
return nil if node.left.nil? || node.right.nil?
|
252
|
+
gain = node.n_samples * node.impurity -
|
253
|
+
node.left.n_samples * node.left.impurity - node.right.n_samples * node.right.impurity
|
254
|
+
@feature_importances[node.feature_id] += gain
|
255
|
+
eval_importance_at_node(node.left)
|
256
|
+
eval_importance_at_node(node.right)
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
260
|
+
end
|
data/lib/svmkit/version.rb
CHANGED
data/svmkit.gemspec
CHANGED
@@ -18,7 +18,7 @@ MSG
|
|
18
18
|
SVMKit is a machine learninig library in Ruby.
|
19
19
|
SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
|
20
20
|
SVMKit currently supports Linear / Kernel Support Vector Machine,
|
21
|
-
Logistic Regression, Factorization Machine, Naive Bayes,
|
21
|
+
Logistic Regression, Factorization Machine, Naive Bayes, Decision Tree, Random Forest,
|
22
22
|
K-nearest neighbor classifier, and cross-validation.
|
23
23
|
MSG
|
24
24
|
spec.homepage = 'https://github.com/yoshoku/svmkit'
|
@@ -38,7 +38,7 @@ MSG
|
|
38
38
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
39
39
|
spec.add_development_dependency 'rake', '~> 12.0'
|
40
40
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
41
|
-
spec.add_development_dependency '
|
41
|
+
spec.add_development_dependency 'coveralls', '~> 0.8'
|
42
42
|
|
43
43
|
spec.post_install_message = <<MSG
|
44
44
|
*************************************************************************
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: svmkit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-03-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -67,24 +67,24 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '3.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: coveralls
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0.
|
75
|
+
version: '0.8'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0.
|
82
|
+
version: '0.8'
|
83
83
|
description: |
|
84
84
|
SVMKit is a machine learninig library in Ruby.
|
85
85
|
SVMKit provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
|
86
86
|
SVMKit currently supports Linear / Kernel Support Vector Machine,
|
87
|
-
Logistic Regression, Factorization Machine, Naive Bayes,
|
87
|
+
Logistic Regression, Factorization Machine, Naive Bayes, Decision Tree, Random Forest,
|
88
88
|
K-nearest neighbor classifier, and cross-validation.
|
89
89
|
email:
|
90
90
|
- yoshoku@outlook.com
|
@@ -92,6 +92,7 @@ executables: []
|
|
92
92
|
extensions: []
|
93
93
|
extra_rdoc_files: []
|
94
94
|
files:
|
95
|
+
- ".coveralls.yml"
|
95
96
|
- ".gitignore"
|
96
97
|
- ".rspec"
|
97
98
|
- ".rubocop.yml"
|
@@ -112,6 +113,7 @@ files:
|
|
112
113
|
- lib/svmkit/base/splitter.rb
|
113
114
|
- lib/svmkit/base/transformer.rb
|
114
115
|
- lib/svmkit/dataset.rb
|
116
|
+
- lib/svmkit/ensemble/random_forest_classifier.rb
|
115
117
|
- lib/svmkit/evaluation_measure/accuracy.rb
|
116
118
|
- lib/svmkit/evaluation_measure/f_score.rb
|
117
119
|
- lib/svmkit/evaluation_measure/precision.rb
|
@@ -132,6 +134,7 @@ files:
|
|
132
134
|
- lib/svmkit/preprocessing/l2_normalizer.rb
|
133
135
|
- lib/svmkit/preprocessing/min_max_scaler.rb
|
134
136
|
- lib/svmkit/preprocessing/standard_scaler.rb
|
137
|
+
- lib/svmkit/tree/decision_tree_classifier.rb
|
135
138
|
- lib/svmkit/version.rb
|
136
139
|
- svmkit.gemspec
|
137
140
|
homepage: https://github.com/yoshoku/svmkit
|