hybridforest 0.10.0 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fd85ad9a8081b634556371599f61557c0490c0694bdc24f8aa8d866d44ba0843
4
- data.tar.gz: f58382ac2737bce52c85cdf83e27f3099190a0f52be8c159780033194c183dd8
3
+ metadata.gz: 4311342fccd332cd6f98b2b6a30f32acc01f2f3090490e452e9bfe8981730f07
4
+ data.tar.gz: 2a4461a04ac9232d5506271ddbf8b9f9b49b7397a06c00aea894711c3ce0586f
5
5
  SHA512:
6
- metadata.gz: c6678ba9cf3ccf15970801879a4d66f6ef98c8cf80353fc4ce2e9808f79063c447d19e96be374223403fedc093ac67cdd6599140df391ca68d05c708725ba4fc
7
- data.tar.gz: 5c94ea5df0e6305719bf8ae9848a0a36d95d71fa51d7abe7bb09ed3f32720df98616bd3f2892e5c3467b647c9f6ebb7b89486e5cff771dde3141388d026fefe4
6
+ metadata.gz: 9708316bfa685c814afbffea2d2ea238dd8736ac929bd782a365d6ab65dbe2d1987df7a66750db3d9108071279a99cb1206f179163f1383494c0ebcda704fab6
7
+ data.tar.gz: efbd6db6210d02830734bb7e3411f8b9553ed8b6172df59f1180e801884057d34edd85a67cd6c63e6b09c8531cf434ccdf265e14f4d8d26e035b333a5805c48d
data/CHANGELOG.md CHANGED
@@ -34,4 +34,23 @@
34
34
 
35
35
  ## [0.10.0] - 2021-12-29
36
36
 
37
- - Refactor dataframe extensions
37
+ - Refactor dataframe extensions
38
+
39
+ ## [0.11.0] - 2021-12-29
40
+
41
+ - Randomize Utils.train_test_split
42
+ - Refactor Utils module
43
+
44
+ ## [0.12.0] - 2022-01-08
45
+
46
+ - Allow Utils.random_sample to be passed a dataframe or a dataframe-convertible object
47
+ - Allow Utils.random_sample's 'size' arg to equal the size of the initial dataframe if the strategy is sampling with replacement
48
+
49
+ ## [0.13.0] - 2022-01-09
50
+
51
+ - Refactor forest growers
52
+
53
+
54
+ ## [0.14.0] - 2022-01-09
55
+
56
+ - Refactor hybrid forest grower
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- hybridforest (0.9.0)
4
+ hybridforest (0.13.0)
5
5
  activesupport (~> 6.1)
6
6
  rake (~> 13.0)
7
7
  require_all
@@ -8,7 +8,7 @@ module HybridForest
8
8
  def grow_forest(instances, number_of_trees)
9
9
  forest = []
10
10
  number_of_trees.times do
11
- sample, _, _ = HybridForest::Utils.train_test_bootstrap_split(instances)
11
+ sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
12
12
  forest << HybridForest::Trees::CARTTree.new.fit(sample)
13
13
  end
14
14
  forest
@@ -11,8 +11,9 @@ module HybridForest
11
11
  def grow_forest(instances, number_of_trees)
12
12
  forest = []
13
13
  number_of_trees.times do
14
- in_of_bag, out_of_bag, out_of_bag_labels = HybridForest::Utils.train_test_bootstrap_split(instances)
15
- tree_results = grow_trees(TREE_TYPES, in_of_bag, out_of_bag, out_of_bag_labels)
14
+ iob_data, oob_data, oob_labels = HybridForest::Utils.train_test_bootstrap_split(instances)
15
+ trees = grow_trees(TREE_TYPES, iob_data)
16
+ tree_results = predict_evaluate_trees(trees, oob_data, oob_labels)
16
17
  best_tree = select_best_tree(tree_results)
17
18
  forest << best_tree
18
19
  end
@@ -21,25 +22,28 @@ module HybridForest
21
22
 
22
23
  private
23
24
 
24
- def fit_and_predict(tree_class, in_of_bag, out_of_bag, out_of_bag_labels)
25
- tree = tree_class.new.fit(in_of_bag)
26
- tree_predictions = tree.predict(out_of_bag)
27
- tree_accuracy = HybridForest::Utils.accuracy(tree_predictions, out_of_bag_labels)
28
- {tree: tree, oob_accuracy: tree_accuracy}
25
+ def grow_trees(tree_types, iob_data)
26
+ tree_types.collect do |tree_type|
27
+ tree_type.new.fit(iob_data)
28
+ end
29
+ end
30
+
31
+ def predict_evaluate_trees(trees, oob_data, oob_labels)
32
+ trees.collect do |tree|
33
+ predict_evaluate(tree, oob_data, oob_labels)
34
+ end
35
+ end
36
+
37
+ def predict_evaluate(tree, data, actual_labels)
38
+ predicted_labels = tree.predict(data)
39
+ accuracy = HybridForest::Utils.accuracy(predicted_labels, actual_labels)
40
+ {tree: tree, oob_accuracy: accuracy}
29
41
  end
30
42
 
31
43
  def select_best_tree(tree_results)
32
44
  best_result = tree_results.max_by(1) { |result| result[:oob_accuracy] }.first
33
45
  best_result[:tree]
34
46
  end
35
-
36
- def grow_trees(tree_types, in_of_bag, out_of_bag, out_of_bag_labels)
37
- tree_results = []
38
- tree_types.each do |tree_type|
39
- tree_results << fit_and_predict(tree_type, in_of_bag, out_of_bag, out_of_bag_labels)
40
- end
41
- tree_results
42
- end
43
47
  end
44
48
  end
45
49
  end
@@ -8,7 +8,7 @@ module HybridForest
8
8
  def grow_forest(instances, number_of_trees)
9
9
  forest = []
10
10
  number_of_trees.times do
11
- sample, _, _ = HybridForest::Utils.train_test_bootstrap_split(instances)
11
+ sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
12
12
  forest << HybridForest::Trees::ID3Tree.new.fit(sample)
13
13
  end
14
14
  forest
@@ -6,6 +6,7 @@ module HybridForest
6
6
  module Trees
7
7
  class RandomFeatureSubspace
8
8
  def select_features(all_features)
9
+ # TODO: Allow the subspace size to be configured.
9
10
  n = default_subspace_size(all_features.count)
10
11
  indices = Set.new
11
12
  until indices.size == n
@@ -12,17 +12,16 @@ module HybridForest
12
12
  # of independent features and an array of labels. Returns [+training_set+, +testing_set+, +testing_set_labels+]
13
13
  #
14
14
  def self.train_test_split(dataset, test_set_size = 0.20)
15
- # TODO: Shuffle and stratify samples
15
+ # TODO: Offer stratify param
16
16
  dataset = to_dataframe(dataset)
17
+ all_rows = (0...dataset.count).to_a
17
18
 
18
19
  test_set_count = (dataset.count * test_set_size).floor
19
- test_set_indices = 0..test_set_count
20
- test_set = dataset[test_set_indices]
21
- test_set_labels = test_set.class_labels
22
- test_set.except!(test_set.label)
20
+ test_set_rows = rand_uniq_nums(test_set_count, 0...dataset.count)
21
+ test_set = dataset[test_set_rows]
22
+ test_set, test_set_labels = test_set.disconnect_labels
23
23
 
24
- train_set_indices = test_set_count + 1...dataset.count
25
- train_set = dataset[train_set_indices]
24
+ train_set = dataset[all_rows - test_set_rows]
26
25
 
27
26
  [train_set, test_set, test_set_labels]
28
27
  end
@@ -37,20 +36,13 @@ module HybridForest
37
36
  dataset = to_dataframe(dataset)
38
37
  all_rows = (0...dataset.count).to_a
39
38
 
40
- train_set = Rover::DataFrame.new
41
- train_set_rows = []
42
- dataset.count.times do
43
- row = all_rows.sample
44
- train_set_rows << row
45
- train_set.concat(dataset[row])
46
- end
39
+ train_set_rows = rand_nums(dataset.count, 0...dataset.count)
40
+ train_set = dataset[train_set_rows]
47
41
 
48
42
  return train_test_split(dataset) if train_set_rows.sort == all_rows
49
43
 
50
- test_set_rows = all_rows - train_set_rows
51
- test_set = dataset[test_set_rows]
52
- test_set_labels = test_set.class_labels
53
- test_set.except!(test_set.label)
44
+ test_set = dataset[all_rows - train_set_rows]
45
+ test_set, test_set_labels = test_set.disconnect_labels
54
46
 
55
47
  [train_set, test_set, test_set_labels]
56
48
  end
@@ -86,18 +78,18 @@ module HybridForest
86
78
  # Draws a random sample of +size+ from +data+.
87
79
  #
88
80
  def self.random_sample(data:, size:, with_replacement: true)
89
- raise ArgumentError, "Invalid sample size" if size < 1 || size > data.count
81
+ data = to_dataframe(data)
90
82
 
91
- if with_replacement
92
- rows = size.times.collect { rand(0...data.count) }
93
- data[rows]
83
+ if size < 1 || (!with_replacement && size > data.count)
84
+ raise ArgumentError, "Invalid sample size"
85
+ end
86
+
87
+ rows = if with_replacement
88
+ rand_nums(size, 0...data.count)
94
89
  else
95
- rows = Set.new
96
- until rows.size == size
97
- rows << rand(0...data.count)
98
- end
99
- data[rows.to_a]
90
+ rand_uniq_nums(size, 0...data.count)
100
91
  end
92
+ data[rows]
101
93
  end
102
94
 
103
95
  # Outputs a report of common prediction metrics.
@@ -168,6 +160,12 @@ module HybridForest
168
160
  def class_labels
169
161
  self[label].to_a
170
162
  end
163
+
164
+ def disconnect_labels
165
+ labels = class_labels
166
+ except!(label)
167
+ [self, labels]
168
+ end
171
169
  end
172
170
  end
173
171
 
@@ -202,5 +200,23 @@ module HybridForest
202
200
  def false_label?(label)
203
201
  [false, 0].include? label
204
202
  end
203
+
204
+ ##
205
+ # Returns an array of +n+ random numbers in the exclusive +range+.
206
+ def rand_nums(n, range)
207
+ n.times.collect { rand(range) }
208
+ end
209
+
210
+ ##
211
+ # Returns an array of +n+ _unique_ random numbers in the exclusive +range+.
212
+ def rand_uniq_nums(n, range)
213
+ raise ArgumentError if n > range.size
214
+
215
+ nums = Set.new
216
+ until nums.size == n
217
+ nums << rand(range)
218
+ end
219
+ nums.to_a
220
+ end
205
221
  end
206
222
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HybridForest
4
- VERSION = "0.10.0"
4
+ VERSION = "0.14.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hybridforest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hi-tech-jazz
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-12-29 00:00:00.000000000 Z
11
+ date: 2022-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake