hybridforest 0.10.0 → 0.14.0

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fd85ad9a8081b634556371599f61557c0490c0694bdc24f8aa8d866d44ba0843
4
- data.tar.gz: f58382ac2737bce52c85cdf83e27f3099190a0f52be8c159780033194c183dd8
3
+ metadata.gz: 4311342fccd332cd6f98b2b6a30f32acc01f2f3090490e452e9bfe8981730f07
4
+ data.tar.gz: 2a4461a04ac9232d5506271ddbf8b9f9b49b7397a06c00aea894711c3ce0586f
5
5
  SHA512:
6
- metadata.gz: c6678ba9cf3ccf15970801879a4d66f6ef98c8cf80353fc4ce2e9808f79063c447d19e96be374223403fedc093ac67cdd6599140df391ca68d05c708725ba4fc
7
- data.tar.gz: 5c94ea5df0e6305719bf8ae9848a0a36d95d71fa51d7abe7bb09ed3f32720df98616bd3f2892e5c3467b647c9f6ebb7b89486e5cff771dde3141388d026fefe4
6
+ metadata.gz: 9708316bfa685c814afbffea2d2ea238dd8736ac929bd782a365d6ab65dbe2d1987df7a66750db3d9108071279a99cb1206f179163f1383494c0ebcda704fab6
7
+ data.tar.gz: efbd6db6210d02830734bb7e3411f8b9553ed8b6172df59f1180e801884057d34edd85a67cd6c63e6b09c8531cf434ccdf265e14f4d8d26e035b333a5805c48d
data/CHANGELOG.md CHANGED
@@ -34,4 +34,23 @@
34
34
 
35
35
  ## [0.10.0] - 2021-12-29
36
36
 
37
- - Refactor dataframe extensions
37
+ - Refactor dataframe extensions
38
+
39
+ ## [0.11.0] - 2021-12-29
40
+
41
+ - Randomize Utils.train_test_split
42
+ - Refactor Utils module
43
+
44
+ ## [0.12.0] - 2022-01-08
45
+
46
+ - Allow Utils.random_sample to be passed a dataframe or a dataframe convertible object
47
+ - Allow Utils.random_sample's 'size' arg to equal the size of the initial dataframe if the strategy is sampling with replacement
48
+
49
+ ## [0.13.0] - 2022-01-09
50
+
51
+ - Refactor forest growers
52
+
53
+
54
+ ## [0.14.0] - 2022-01-09
55
+
56
+ - Refactor hybrid forest grower
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- hybridforest (0.9.0)
4
+ hybridforest (0.13.0)
5
5
  activesupport (~> 6.1)
6
6
  rake (~> 13.0)
7
7
  require_all
@@ -8,7 +8,7 @@ module HybridForest
8
8
  def grow_forest(instances, number_of_trees)
9
9
  forest = []
10
10
  number_of_trees.times do
11
- sample, _, _ = HybridForest::Utils.train_test_bootstrap_split(instances)
11
+ sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
12
12
  forest << HybridForest::Trees::CARTTree.new.fit(sample)
13
13
  end
14
14
  forest
@@ -11,8 +11,9 @@ module HybridForest
11
11
  def grow_forest(instances, number_of_trees)
12
12
  forest = []
13
13
  number_of_trees.times do
14
- in_of_bag, out_of_bag, out_of_bag_labels = HybridForest::Utils.train_test_bootstrap_split(instances)
15
- tree_results = grow_trees(TREE_TYPES, in_of_bag, out_of_bag, out_of_bag_labels)
14
+ iob_data, oob_data, oob_labels = HybridForest::Utils.train_test_bootstrap_split(instances)
15
+ trees = grow_trees(TREE_TYPES, iob_data)
16
+ tree_results = predict_evaluate_trees(trees, oob_data, oob_labels)
16
17
  best_tree = select_best_tree(tree_results)
17
18
  forest << best_tree
18
19
  end
@@ -21,25 +22,28 @@ module HybridForest
21
22
 
22
23
  private
23
24
 
24
- def fit_and_predict(tree_class, in_of_bag, out_of_bag, out_of_bag_labels)
25
- tree = tree_class.new.fit(in_of_bag)
26
- tree_predictions = tree.predict(out_of_bag)
27
- tree_accuracy = HybridForest::Utils.accuracy(tree_predictions, out_of_bag_labels)
28
- {tree: tree, oob_accuracy: tree_accuracy}
25
+ def grow_trees(tree_types, iob_data)
26
+ tree_types.collect do |tree_type|
27
+ tree_type.new.fit(iob_data)
28
+ end
29
+ end
30
+
31
+ def predict_evaluate_trees(trees, oob_data, oob_labels)
32
+ trees.collect do |tree|
33
+ predict_evaluate(tree, oob_data, oob_labels)
34
+ end
35
+ end
36
+
37
+ def predict_evaluate(tree, data, actual_labels)
38
+ predicted_labels = tree.predict(data)
39
+ accuracy = HybridForest::Utils.accuracy(predicted_labels, actual_labels)
40
+ {tree: tree, oob_accuracy: accuracy}
29
41
  end
30
42
 
31
43
  def select_best_tree(tree_results)
32
44
  best_result = tree_results.max_by(1) { |result| result[:oob_accuracy] }.first
33
45
  best_result[:tree]
34
46
  end
35
-
36
- def grow_trees(tree_types, in_of_bag, out_of_bag, out_of_bag_labels)
37
- tree_results = []
38
- tree_types.each do |tree_type|
39
- tree_results << fit_and_predict(tree_type, in_of_bag, out_of_bag, out_of_bag_labels)
40
- end
41
- tree_results
42
- end
43
47
  end
44
48
  end
45
49
  end
@@ -8,7 +8,7 @@ module HybridForest
8
8
  def grow_forest(instances, number_of_trees)
9
9
  forest = []
10
10
  number_of_trees.times do
11
- sample, _, _ = HybridForest::Utils.train_test_bootstrap_split(instances)
11
+ sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
12
12
  forest << HybridForest::Trees::ID3Tree.new.fit(sample)
13
13
  end
14
14
  forest
@@ -6,6 +6,7 @@ module HybridForest
6
6
  module Trees
7
7
  class RandomFeatureSubspace
8
8
  def select_features(all_features)
9
+ # TODO: Allow the subspace size to be configured.
9
10
  n = default_subspace_size(all_features.count)
10
11
  indices = Set.new
11
12
  until indices.size == n
@@ -12,17 +12,16 @@ module HybridForest
12
12
  # of independent features and an array of labels. Returns [+training_set+, +testing_set+, +testing_set_labels+]
13
13
  #
14
14
  def self.train_test_split(dataset, test_set_size = 0.20)
15
- # TODO: Shuffle and stratify samples
15
+ # TODO: Offer stratify param
16
16
  dataset = to_dataframe(dataset)
17
+ all_rows = (0...dataset.count).to_a
17
18
 
18
19
  test_set_count = (dataset.count * test_set_size).floor
19
- test_set_indices = 0..test_set_count
20
- test_set = dataset[test_set_indices]
21
- test_set_labels = test_set.class_labels
22
- test_set.except!(test_set.label)
20
+ test_set_rows = rand_uniq_nums(test_set_count, 0...dataset.count)
21
+ test_set = dataset[test_set_rows]
22
+ test_set, test_set_labels = test_set.disconnect_labels
23
23
 
24
- train_set_indices = test_set_count + 1...dataset.count
25
- train_set = dataset[train_set_indices]
24
+ train_set = dataset[all_rows - test_set_rows]
26
25
 
27
26
  [train_set, test_set, test_set_labels]
28
27
  end
@@ -37,20 +36,13 @@ module HybridForest
37
36
  dataset = to_dataframe(dataset)
38
37
  all_rows = (0...dataset.count).to_a
39
38
 
40
- train_set = Rover::DataFrame.new
41
- train_set_rows = []
42
- dataset.count.times do
43
- row = all_rows.sample
44
- train_set_rows << row
45
- train_set.concat(dataset[row])
46
- end
39
+ train_set_rows = rand_nums(dataset.count, 0...dataset.count)
40
+ train_set = dataset[train_set_rows]
47
41
 
48
42
  return train_test_split(dataset) if train_set_rows.sort == all_rows
49
43
 
50
- test_set_rows = all_rows - train_set_rows
51
- test_set = dataset[test_set_rows]
52
- test_set_labels = test_set.class_labels
53
- test_set.except!(test_set.label)
44
+ test_set = dataset[all_rows - train_set_rows]
45
+ test_set, test_set_labels = test_set.disconnect_labels
54
46
 
55
47
  [train_set, test_set, test_set_labels]
56
48
  end
@@ -86,18 +78,18 @@ module HybridForest
86
78
  # Draws a random sample of +size+ from +data+.
87
79
  #
88
80
  def self.random_sample(data:, size:, with_replacement: true)
89
- raise ArgumentError, "Invalid sample size" if size < 1 || size > data.count
81
+ data = to_dataframe(data)
90
82
 
91
- if with_replacement
92
- rows = size.times.collect { rand(0...data.count) }
93
- data[rows]
83
+ if size < 1 || (!with_replacement && size > data.count)
84
+ raise ArgumentError, "Invalid sample size"
85
+ end
86
+
87
+ rows = if with_replacement
88
+ rand_nums(size, 0...data.count)
94
89
  else
95
- rows = Set.new
96
- until rows.size == size
97
- rows << rand(0...data.count)
98
- end
99
- data[rows.to_a]
90
+ rand_uniq_nums(size, 0...data.count)
100
91
  end
92
+ data[rows]
101
93
  end
102
94
 
103
95
  # Outputs a report of common prediction metrics.
@@ -168,6 +160,12 @@ module HybridForest
168
160
  def class_labels
169
161
  self[label].to_a
170
162
  end
163
+
164
+ def disconnect_labels
165
+ labels = class_labels
166
+ except!(label)
167
+ [self, labels]
168
+ end
171
169
  end
172
170
  end
173
171
 
@@ -202,5 +200,23 @@ module HybridForest
202
200
  def false_label?(label)
203
201
  [false, 0].include? label
204
202
  end
203
+
204
+ ##
205
+ # Returns an array of +n+ random numbers in the exclusive +range+.
206
+ def rand_nums(n, range)
207
+ n.times.collect { rand(range) }
208
+ end
209
+
210
+ ##
211
+ # Returns an array of +n+ _unique_ random numbers in the exclusive +range+.
212
+ def rand_uniq_nums(n, range)
213
+ raise ArgumentError if n > range.size
214
+
215
+ nums = Set.new
216
+ until nums.size == n
217
+ nums << rand(range)
218
+ end
219
+ nums.to_a
220
+ end
205
221
  end
206
222
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HybridForest
4
- VERSION = "0.10.0"
4
+ VERSION = "0.14.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hybridforest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hi-tech-jazz
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-12-29 00:00:00.000000000 Z
11
+ date: 2022-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake