rumale 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/build.yml +23 -0
  3. data/.rubocop.yml +15 -95
  4. data/CHANGELOG.md +28 -0
  5. data/Gemfile +4 -2
  6. data/README.md +5 -2
  7. data/lib/rumale.rb +3 -0
  8. data/lib/rumale/clustering/hdbscan.rb +2 -2
  9. data/lib/rumale/clustering/snn.rb +1 -1
  10. data/lib/rumale/dataset.rb +1 -1
  11. data/lib/rumale/decomposition/nmf.rb +2 -2
  12. data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
  13. data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
  14. data/lib/rumale/evaluation_measure/roc_auc.rb +3 -0
  15. data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
  16. data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
  17. data/lib/rumale/linear_model/base_sgd.rb +1 -1
  18. data/lib/rumale/linear_model/elastic_net.rb +2 -2
  19. data/lib/rumale/linear_model/lasso.rb +2 -2
  20. data/lib/rumale/linear_model/linear_regression.rb +2 -2
  21. data/lib/rumale/linear_model/logistic_regression.rb +123 -35
  22. data/lib/rumale/linear_model/ridge.rb +2 -2
  23. data/lib/rumale/linear_model/svc.rb +2 -2
  24. data/lib/rumale/linear_model/svr.rb +2 -2
  25. data/lib/rumale/manifold/tsne.rb +1 -1
  26. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +13 -45
  27. data/lib/rumale/model_selection/group_k_fold.rb +93 -0
  28. data/lib/rumale/model_selection/group_shuffle_split.rb +115 -0
  29. data/lib/rumale/model_selection/shuffle_split.rb +4 -4
  30. data/lib/rumale/model_selection/stratified_k_fold.rb +1 -1
  31. data/lib/rumale/model_selection/stratified_shuffle_split.rb +13 -9
  32. data/lib/rumale/model_selection/time_series_split.rb +91 -0
  33. data/lib/rumale/pipeline/pipeline.rb +1 -1
  34. data/lib/rumale/probabilistic_output.rb +1 -1
  35. data/lib/rumale/tree/base_decision_tree.rb +2 -9
  36. data/lib/rumale/tree/gradient_tree_regressor.rb +3 -10
  37. data/lib/rumale/version.rb +1 -1
  38. data/rumale.gemspec +1 -0
  39. metadata +21 -4
  40. data/.coveralls.yml +0 -1
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/splitter'
4
+
5
module Rumale
  module ModelSelection
    # TimeSeriesSplit is a class that generates the set of data indices for time series cross-validation.
    # It is assumed that the given dataset is already ordered by time information.
    #
    # Every split trains on samples that precede its test block, and the test blocks are
    # consecutive, equally sized, and end exactly at the last sample.
    #
    # @example
    #   cv = Rumale::ModelSelection::TimeSeriesSplit.new(n_splits: 5)
    #   x = Numo::DFloat.new(6, 2).rand
    #   cv.split(x, nil).each do |train_ids, test_ids|
    #     puts '---'
    #     pp train_ids
    #     pp test_ids
    #   end
    #
    #   # ---
    #   # [0]
    #   # [1]
    #   # ---
    #   # [0, 1]
    #   # [2]
    #   # ---
    #   # [0, 1, 2]
    #   # [3]
    #   # ---
    #   # [0, 1, 2, 3]
    #   # [4]
    #   # ---
    #   # [0, 1, 2, 3, 4]
    #   # [5]
    #
    class TimeSeriesSplit
      include Base::Splitter

      # Return the number of splits.
      # @return [Integer]
      attr_reader :n_splits

      # Return the maximum number of training samples in a split.
      # @return [Integer/Nil]
      attr_reader :max_train_size

      # Create a new data splitter for time series cross-validation.
      #
      # @param n_splits [Integer] The number of splits.
      # @param max_train_size [Integer/Nil] The maximum number of training samples in a split.
      #   If nil is given, every sample preceding the test block is used for training.
      def initialize(n_splits: 5, max_train_size: nil)
        check_params_numeric(n_splits: n_splits)
        check_params_numeric_or_nil(max_train_size: max_train_size)
        @n_splits = n_splits
        @max_train_size = max_train_size
      end

      # Generate data indices for time series cross-validation.
      #
      # @overload split(x, y) -> Array
      #   @param x [Numo::DFloat] (shape: [n_samples, n_features])
      #     The dataset to be used to generate data indices for time series cross-validation.
      #     It is expected that the data will be ordered by time information.
      #   @param y [Numo::Int32] (shape: [n_samples])
      #     This argument exists to unify the interface between the K-fold methods, it is not used in the method.
      #   @raise [ArgumentError] If n_splits + 1 is less than 2 or more than the number of samples.
      #   @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
      def split(x, _y)
        x = check_convert_sample_array(x)

        n_samples = x.shape[0]
        n_folds = @n_splits + 1
        unless n_folds.between?(2, n_samples)
          raise ArgumentError,
                'The number of folds (n_splits + 1) must be not less than 2 and not more than the number of samples.'
        end

        # The samples left over by the integer division are absorbed once, by the first training block,
        # so that the last test block ends exactly at the last sample.
        test_size, remainder = n_samples.divmod(n_folds)
        first_test_start = test_size + remainder

        Array.new(@n_splits) do |n|
          # Test blocks advance by test_size; multiplying the remainder-inflated offset instead
          # would push later blocks past the end of the dataset.
          test_start = first_test_start + (n * test_size)
          # The cap applies whenever more than max_train_size samples precede the test block.
          train_start = if @max_train_size.nil? || @max_train_size >= test_start
                          0
                        else
                          test_start - @max_train_size
                        end
          [Array(train_start...test_start), Array(test_start...(test_start + test_size))]
        end
      end
    end
  end
end
@@ -140,7 +140,7 @@ module Rumale
140
140
  def validate_steps(steps)
141
141
  steps.keys[0...-1].each do |name|
142
142
  transformer = steps[name]
143
- next if transformer.nil? || %i[fit transform].all? { |m| transformer.class.method_defined?(m) }
143
+ next if transformer.nil? || (transformer.class.method_defined?(:fit) && transformer.class.method_defined?(:transform))
144
144
 
145
145
  raise TypeError,
146
146
  'Class of intermediate step in pipeline should be implemented fit and transform methods: ' \
@@ -98,7 +98,7 @@ module Rumale
98
98
 
99
99
  def hessian_matrix(probs, df, sigma)
100
100
  sub = probs * (1 - probs)
101
- h11 = (df * df * sub).sum + sigma
101
+ h11 = (df**2 * sub).sum + sigma
102
102
  h22 = sub.sum + sigma
103
103
  h21 = (df * sub).sum
104
104
  Numo::DFloat[[h11, h21], [h21, h22]]
@@ -75,17 +75,10 @@ module Rumale
75
75
  node = Node.new(depth: depth, impurity: impurity, n_samples: n_samples)
76
76
 
77
77
  # terminate growing.
78
- unless @params[:max_leaf_nodes].nil?
79
- return nil if @n_leaves >= @params[:max_leaf_nodes]
80
- end
81
-
78
+ return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
82
79
  return nil if n_samples < @params[:min_samples_leaf]
83
80
  return put_leaf(node, y) if n_samples == @params[:min_samples_leaf]
84
-
85
- unless @params[:max_depth].nil?
86
- return put_leaf(node, y) if depth == @params[:max_depth]
87
- end
88
-
81
+ return put_leaf(node, y) if !@params[:max_depth].nil? && depth == @params[:max_depth]
89
82
  return put_leaf(node, y) if stop_growing?(y)
90
83
 
91
84
  # calculate optimal parameters.
@@ -138,7 +138,7 @@ module Rumale
138
138
  nil
139
139
  end
140
140
 
141
- def grow_node(depth, x, y, g, h)
141
+ def grow_node(depth, x, y, g, h) # rubocop:disable Metrics/AbcSize
142
142
  # initialize some variables.
143
143
  sum_g = g.sum
144
144
  sum_h = h.sum
@@ -146,17 +146,10 @@ module Rumale
146
146
  node = Node.new(depth: depth, n_samples: n_samples)
147
147
 
148
148
  # terminate growing.
149
- unless @params[:max_leaf_nodes].nil?
150
- return nil if @n_leaves >= @params[:max_leaf_nodes]
151
- end
152
-
149
+ return nil if !@params[:max_leaf_nodes].nil? && @n_leaves >= @params[:max_leaf_nodes]
153
150
  return nil if n_samples < @params[:min_samples_leaf]
154
151
  return put_leaf(node, sum_g, sum_h) if n_samples == @params[:min_samples_leaf]
155
-
156
- unless @params[:max_depth].nil?
157
- return put_leaf(node, sum_g, sum_h) if depth == @params[:max_depth]
158
- end
159
-
152
+ return put_leaf(node, sum_g, sum_h) if !@params[:max_depth].nil? && depth == @params[:max_depth]
160
153
  return put_leaf(node, sum_g, sum_h) if stop_growing?(y)
161
154
 
162
155
  # calculate optimal parameters.
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.20.0'
6
+ VERSION = '0.22.0'
7
7
  end
@@ -45,4 +45,5 @@ Gem::Specification.new do |spec|
45
45
  }
46
46
 
47
47
  spec.add_runtime_dependency 'numo-narray', '>= 0.9.1'
48
+ spec.add_runtime_dependency 'lbfgsb', '>=0.3.0'
48
49
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.20.0
4
+ version: 0.22.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-01 00:00:00.000000000 Z
11
+ date: 2020-11-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 0.9.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: lbfgsb
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.0
27
41
  description: |
28
42
  Rumale is a machine learning library in Ruby.
29
43
  Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
@@ -43,7 +57,7 @@ extensions:
43
57
  - ext/rumale/extconf.rb
44
58
  extra_rdoc_files: []
45
59
  files:
46
- - ".coveralls.yml"
60
+ - ".github/workflows/build.yml"
47
61
  - ".gitignore"
48
62
  - ".rspec"
49
63
  - ".rubocop.yml"
@@ -135,10 +149,13 @@ files:
135
149
  - lib/rumale/model_selection/cross_validation.rb
136
150
  - lib/rumale/model_selection/function.rb
137
151
  - lib/rumale/model_selection/grid_search_cv.rb
152
+ - lib/rumale/model_selection/group_k_fold.rb
153
+ - lib/rumale/model_selection/group_shuffle_split.rb
138
154
  - lib/rumale/model_selection/k_fold.rb
139
155
  - lib/rumale/model_selection/shuffle_split.rb
140
156
  - lib/rumale/model_selection/stratified_k_fold.rb
141
157
  - lib/rumale/model_selection/stratified_shuffle_split.rb
158
+ - lib/rumale/model_selection/time_series_split.rb
142
159
  - lib/rumale/multiclass/one_vs_rest_classifier.rb
143
160
  - lib/rumale/naive_bayes/base_naive_bayes.rb
144
161
  - lib/rumale/naive_bayes/bernoulli_nb.rb
@@ -206,7 +223,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
206
223
  - !ruby/object:Gem::Version
207
224
  version: '0'
208
225
  requirements: []
209
- rubygems_version: 3.1.2
226
+ rubygems_version: 3.1.4
210
227
  signing_key:
211
228
  specification_version: 4
212
229
  summary: Rumale is a machine learning library in Ruby. Rumale provides machine learning
@@ -1 +0,0 @@
1
- service_name: travis-ci