rumale 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/build.yml +23 -0
- data/.rubocop.yml +15 -95
- data/CHANGELOG.md +28 -0
- data/Gemfile +4 -2
- data/README.md +5 -2
- data/lib/rumale.rb +3 -0
- data/lib/rumale/clustering/hdbscan.rb +2 -2
- data/lib/rumale/clustering/snn.rb +1 -1
- data/lib/rumale/dataset.rb +1 -1
- data/lib/rumale/decomposition/nmf.rb +2 -2
- data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
- data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
- data/lib/rumale/evaluation_measure/roc_auc.rb +3 -0
- data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
- data/lib/rumale/linear_model/base_sgd.rb +1 -1
- data/lib/rumale/linear_model/elastic_net.rb +2 -2
- data/lib/rumale/linear_model/lasso.rb +2 -2
- data/lib/rumale/linear_model/linear_regression.rb +2 -2
- data/lib/rumale/linear_model/logistic_regression.rb +123 -35
- data/lib/rumale/linear_model/ridge.rb +2 -2
- data/lib/rumale/linear_model/svc.rb +2 -2
- data/lib/rumale/linear_model/svr.rb +2 -2
- data/lib/rumale/manifold/tsne.rb +1 -1
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +13 -45
- data/lib/rumale/model_selection/group_k_fold.rb +93 -0
- data/lib/rumale/model_selection/group_shuffle_split.rb +115 -0
- data/lib/rumale/model_selection/shuffle_split.rb +4 -4
- data/lib/rumale/model_selection/stratified_k_fold.rb +1 -1
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +13 -9
- data/lib/rumale/model_selection/time_series_split.rb +91 -0
- data/lib/rumale/pipeline/pipeline.rb +1 -1
- data/lib/rumale/probabilistic_output.rb +1 -1
- data/lib/rumale/tree/base_decision_tree.rb +2 -9
- data/lib/rumale/tree/gradient_tree_regressor.rb +3 -10
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -0
- metadata +21 -4
- data/.coveralls.yml +0 -1
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/splitter'
|
4
|
+
|
5
|
+
module Rumale
|
6
|
+
module ModelSelection
|
7
|
+
# TimeSeriesSplit is a class that generates the set of data indices for time series cross-validation.
|
8
|
+
# It is assumed that the given dataset is already ordered by time.
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# cv = Rumale::ModelSelection::TimeSeriesSplit.new(n_splits: 5)
|
12
|
+
# x = Numo::DFloat.new(6, 2).rand
|
13
|
+
# cv.split(x, nil).each do |train_ids, test_ids|
|
14
|
+
# puts '---'
|
15
|
+
# pp train_ids
|
16
|
+
# pp test_ids
|
17
|
+
# end
|
18
|
+
#
|
19
|
+
# # ---
|
20
|
+
# # [0]
|
21
|
+
# # [1]
|
22
|
+
# # ---
|
23
|
+
# # [0, 1]
|
24
|
+
# # [2]
|
25
|
+
# # ---
|
26
|
+
# # [0, 1, 2]
|
27
|
+
# # [3]
|
28
|
+
# # ---
|
29
|
+
# # [0, 1, 2, 3]
|
30
|
+
# # [4]
|
31
|
+
# # ---
|
32
|
+
# # [0, 1, 2, 3, 4]
|
33
|
+
# # [5]
|
34
|
+
#
|
35
|
+
class TimeSeriesSplit
|
36
|
+
include Base::Splitter
|
37
|
+
|
38
|
+
# Return the number of splits.
|
39
|
+
# @return [Integer]
|
40
|
+
attr_reader :n_splits
|
41
|
+
|
42
|
+
# Return the maximum number of training samples in a split.
|
43
|
+
# @return [Integer/Nil]
|
44
|
+
attr_reader :max_train_size
|
45
|
+
|
46
|
+
# Create a new data splitter for time series cross-validation.
|
47
|
+
#
|
48
|
+
# @param n_splits [Integer] The number of splits.
|
49
|
+
# @param max_train_size [Integer/Nil] The maximum number of training samples in a split.
|
50
|
+
def initialize(n_splits: 5, max_train_size: nil)
|
51
|
+
check_params_numeric(n_splits: n_splits)
|
52
|
+
check_params_numeric_or_nil(max_train_size: max_train_size)
|
53
|
+
@n_splits = n_splits
|
54
|
+
@max_train_size = max_train_size
|
55
|
+
end
|
56
|
+
|
57
|
+
# Generate data indices for time series cross-validation.
|
58
|
+
#
|
59
|
+
# @overload split(x, y) -> Array
|
60
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features])
|
61
|
+
# The dataset to be used to generate data indices for time series cross-validation.
|
62
|
+
# It is expected that the data will be ordered by time information.
|
63
|
+
# @param y [Numo::Int32] (shape: [n_samples])
|
64
|
+
# This argument exists only to unify the interface with the K-fold methods; it is not used in this method.
|
65
|
+
# @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
|
66
|
+
def split(x, _y)
|
67
|
+
x = check_convert_sample_array(x)
|
68
|
+
|
69
|
+
n_samples = x.shape[0]
|
70
|
+
unless (@n_splits + 1).between?(2, n_samples)
|
71
|
+
raise ArgumentError,
|
72
|
+
'The number of folds (n_splits + 1) must be not less than 2 and not more than the number of samples.'
|
73
|
+
end
|
74
|
+
|
75
|
+
test_size = n_samples / (@n_splits + 1)
|
76
|
+
offset = test_size + n_samples % (@n_splits + 1)
|
77
|
+
|
78
|
+
Array.new(@n_splits) do |n|
|
79
|
+
start = offset * (n + 1)
|
80
|
+
train_ids = if !@max_train_size.nil? && @max_train_size < test_size
|
81
|
+
Array((start - @max_train_size)...start)
|
82
|
+
else
|
83
|
+
Array(0...start)
|
84
|
+
end
|
85
|
+
test_ids = Array(start...(start + test_size))
|
86
|
+
[train_ids, test_ids]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -140,7 +140,7 @@ module Rumale
|
|
140
140
|
def validate_steps(steps)
|
141
141
|
steps.keys[0...-1].each do |name|
|
142
142
|
transformer = steps[name]
|
143
|
-
next if transformer.nil? ||
|
143
|
+
next if transformer.nil? || (transformer.class.method_defined?(:fit) && transformer.class.method_defined?(:transform))
|
144
144
|
|
145
145
|
raise TypeError,
|
146
146
|
'Class of intermediate step in pipeline should be implemented fit and transform methods: ' \
|
@@ -75,17 +75,10 @@ module Rumale
|
|
75
75
|
node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
|
76
76
|
|
77
77
|
# terminate growing.
|
78
|
-
unless @params[:max_leaf_nodes].nil?
|
79
|
-
return nil if @n_leaves >= @params[:max_leaf_nodes]
|
80
|
-
end
|
81
|
-
|
78
|
+
return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
|
82
79
|
return nil if n_samples < @params[:min_samples_leaf]
|
83
80
|
return put_leaf(node, y) if n_samples == @params[:min_samples_leaf]
|
84
|
-
|
85
|
-
unless @params[:max_depth].nil?
|
86
|
-
return put_leaf(node, y) if depth == @params[:max_depth]
|
87
|
-
end
|
88
|
-
|
81
|
+
return put_leaf(node, y) if !@params[:max_depth].nil? && depth == @params[:max_depth]
|
89
82
|
return put_leaf(node, y) if stop_growing?(y)
|
90
83
|
|
91
84
|
# calculate optimal parameters.
|
@@ -138,7 +138,7 @@ module Rumale
|
|
138
138
|
nil
|
139
139
|
end
|
140
140
|
|
141
|
-
def grow_node(depth, x, y, g, h)
|
141
|
+
def grow_node(depth, x, y, g, h) # rubocop:disable Metrics/AbcSize
|
142
142
|
# initialize some variables.
|
143
143
|
sum_g = g.sum
|
144
144
|
sum_h = h.sum
|
@@ -146,17 +146,10 @@ module Rumale
|
|
146
146
|
node = Node.new(depth: depth, n_samples: n_samples)
|
147
147
|
|
148
148
|
# terminate growing.
|
149
|
-
unless @params[:max_leaf_nodes].nil?
|
150
|
-
return nil if @n_leaves >= @params[:max_leaf_nodes]
|
151
|
-
end
|
152
|
-
|
149
|
+
return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
|
153
150
|
return nil if n_samples < @params[:min_samples_leaf]
|
154
151
|
return put_leaf(node, sum_g, sum_h) if n_samples == @params[:min_samples_leaf]
|
155
|
-
|
156
|
-
unless @params[:max_depth].nil?
|
157
|
-
return put_leaf(node, sum_g, sum_h) if depth == @params[:max_depth]
|
158
|
-
end
|
159
|
-
|
152
|
+
return put_leaf(node, sum_g, sum_h) if !@params[:max_depth].nil? && depth == @params[:max_depth]
|
160
153
|
return put_leaf(node, sum_g, sum_h) if stop_growing?(y)
|
161
154
|
|
162
155
|
# calculate optimal parameters.
|
data/lib/rumale/version.rb
CHANGED
data/rumale.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rumale
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.20.0
|
4
|
+
version: 0.22.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-11-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 0.9.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: lbfgsb
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.3.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.3.0
|
27
41
|
description: |
|
28
42
|
Rumale is a machine learning library in Ruby.
|
29
43
|
Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
|
@@ -43,7 +57,7 @@ extensions:
|
|
43
57
|
- ext/rumale/extconf.rb
|
44
58
|
extra_rdoc_files: []
|
45
59
|
files:
|
46
|
-
- ".coveralls.yml"
|
60
|
+
- ".github/workflows/build.yml"
|
47
61
|
- ".gitignore"
|
48
62
|
- ".rspec"
|
49
63
|
- ".rubocop.yml"
|
@@ -135,10 +149,13 @@ files:
|
|
135
149
|
- lib/rumale/model_selection/cross_validation.rb
|
136
150
|
- lib/rumale/model_selection/function.rb
|
137
151
|
- lib/rumale/model_selection/grid_search_cv.rb
|
152
|
+
- lib/rumale/model_selection/group_k_fold.rb
|
153
|
+
- lib/rumale/model_selection/group_shuffle_split.rb
|
138
154
|
- lib/rumale/model_selection/k_fold.rb
|
139
155
|
- lib/rumale/model_selection/shuffle_split.rb
|
140
156
|
- lib/rumale/model_selection/stratified_k_fold.rb
|
141
157
|
- lib/rumale/model_selection/stratified_shuffle_split.rb
|
158
|
+
- lib/rumale/model_selection/time_series_split.rb
|
142
159
|
- lib/rumale/multiclass/one_vs_rest_classifier.rb
|
143
160
|
- lib/rumale/naive_bayes/base_naive_bayes.rb
|
144
161
|
- lib/rumale/naive_bayes/bernoulli_nb.rb
|
@@ -206,7 +223,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
206
223
|
- !ruby/object:Gem::Version
|
207
224
|
version: '0'
|
208
225
|
requirements: []
|
209
|
-
rubygems_version: 3.1.
|
226
|
+
rubygems_version: 3.1.4
|
210
227
|
signing_key:
|
211
228
|
specification_version: 4
|
212
229
|
summary: Rumale is a machine learning library in Ruby. Rumale provides machine learning
|
data/.coveralls.yml
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
service_name: travis-ci
|