hybridforest 0.10.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +20 -1
 - data/Gemfile.lock +1 -1
 - data/lib/hybridforest/forests/forest_growers/cart_grower.rb +1 -1
 - data/lib/hybridforest/forests/forest_growers/hybrid_grower.rb +19 -15
 - data/lib/hybridforest/forests/forest_growers/id3_grower.rb +1 -1
 - data/lib/hybridforest/trees/feature_selectors/random_feature_subspace.rb +1 -0
 - data/lib/hybridforest/utilities/utils.rb +43 -27
 - data/lib/hybridforest/version.rb +1 -1
 - metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 4311342fccd332cd6f98b2b6a30f32acc01f2f3090490e452e9bfe8981730f07
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 2a4461a04ac9232d5506271ddbf8b9f9b49b7397a06c00aea894711c3ce0586f
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 9708316bfa685c814afbffea2d2ea238dd8736ac929bd782a365d6ab65dbe2d1987df7a66750db3d9108071279a99cb1206f179163f1383494c0ebcda704fab6
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: efbd6db6210d02830734bb7e3411f8b9553ed8b6172df59f1180e801884057d34edd85a67cd6c63e6b09c8531cf434ccdf265e14f4d8d26e035b333a5805c48d
         
     | 
    
        data/CHANGELOG.md
    CHANGED
    
    | 
         @@ -34,4 +34,23 @@ 
     | 
|
| 
       34 
34 
     | 
    
         | 
| 
       35 
35 
     | 
    
         
             
            ## [0.10.0] - 2021-12-29
         
     | 
| 
       36 
36 
     | 
    
         | 
| 
       37 
     | 
    
         
            -
            - Refactor dataframe extensions
         
     | 
| 
      
 37 
     | 
    
         
            +
            - Refactor dataframe extensions
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
            ## [0.11.0] - 2021-12-29
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
            - Randomize Utils.train_test_split
         
     | 
| 
      
 42 
     | 
    
         
            +
            - Refactor Utils module
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
            ## [0.12.0] - 2022-01-08
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
            - Allow Utils.random_sample to be passed a dataframe or a dataframe convertible object
         
     | 
| 
      
 47 
     | 
    
         
            +
            - Allow Utils.random_sample's 'size' arg to equal the size of the initial dataframe if the strategy is sampling with replacement
         
     | 
| 
      
 48 
     | 
    
         
            +
             
     | 
| 
      
 49 
     | 
    
         
            +
            ## [0.13.0] - 2022-01-09
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
            - Refactor forest growers
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
            ## [0.14.0] - 2022-01-09
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
            - Refactor hybrid forest grower
         
     | 
    
        data/Gemfile.lock
    CHANGED
    
    
    
        data/lib/hybridforest/forests/forest_growers/cart_grower.rb
    CHANGED
    
    
| 
         @@ -8,7 +8,7 @@ module HybridForest 
     | 
|
| 
       8 
8 
     | 
    
         
             
                    def grow_forest(instances, number_of_trees)
         
     | 
| 
       9 
9 
     | 
    
         
             
                      forest = []
         
     | 
| 
       10 
10 
     | 
    
         
             
                      number_of_trees.times do
         
     | 
| 
       11 
     | 
    
         
            -
                        sample 
     | 
| 
      
 11 
     | 
    
         
            +
                        sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
         
     | 
| 
       12 
12 
     | 
    
         
             
                        forest << HybridForest::Trees::CARTTree.new.fit(sample)
         
     | 
| 
       13 
13 
     | 
    
         
             
                      end
         
     | 
| 
       14 
14 
     | 
    
         
             
                      forest
         
     | 
    
        data/lib/hybridforest/forests/forest_growers/hybrid_grower.rb
    CHANGED
    
    | 
         @@ -11,8 +11,9 @@ module HybridForest 
     | 
|
| 
       11 
11 
     | 
    
         
             
                    def grow_forest(instances, number_of_trees)
         
     | 
| 
       12 
12 
     | 
    
         
             
                      forest = []
         
     | 
| 
       13 
13 
     | 
    
         
             
                      number_of_trees.times do
         
     | 
| 
       14 
     | 
    
         
            -
                         
     | 
| 
       15 
     | 
    
         
            -
                         
     | 
| 
      
 14 
     | 
    
         
            +
                        iob_data, oob_data, oob_labels = HybridForest::Utils.train_test_bootstrap_split(instances)
         
     | 
| 
      
 15 
     | 
    
         
            +
                        trees = grow_trees(TREE_TYPES, iob_data)
         
     | 
| 
      
 16 
     | 
    
         
            +
                        tree_results = predict_evaluate_trees(trees, oob_data, oob_labels)
         
     | 
| 
       16 
17 
     | 
    
         
             
                        best_tree = select_best_tree(tree_results)
         
     | 
| 
       17 
18 
     | 
    
         
             
                        forest << best_tree
         
     | 
| 
       18 
19 
     | 
    
         
             
                      end
         
     | 
| 
         @@ -21,25 +22,28 @@ module HybridForest 
     | 
|
| 
       21 
22 
     | 
    
         | 
| 
       22 
23 
     | 
    
         
             
                    private
         
     | 
| 
       23 
24 
     | 
    
         | 
| 
       24 
     | 
    
         
            -
                    def  
     | 
| 
       25 
     | 
    
         
            -
                       
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
                       
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
      
 25 
     | 
    
         
            +
                    def grow_trees(tree_types, iob_data)
         
     | 
| 
      
 26 
     | 
    
         
            +
                      tree_types.collect do |tree_type|
         
     | 
| 
      
 27 
     | 
    
         
            +
                        tree_type.new.fit(iob_data)
         
     | 
| 
      
 28 
     | 
    
         
            +
                      end
         
     | 
| 
      
 29 
     | 
    
         
            +
                    end
         
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
                    def predict_evaluate_trees(trees, oob_data, oob_labels)
         
     | 
| 
      
 32 
     | 
    
         
            +
                      trees.collect do |tree|
         
     | 
| 
      
 33 
     | 
    
         
            +
                        predict_evaluate(tree, oob_data, oob_labels)
         
     | 
| 
      
 34 
     | 
    
         
            +
                      end
         
     | 
| 
      
 35 
     | 
    
         
            +
                    end
         
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
                    def predict_evaluate(tree, data, actual_labels)
         
     | 
| 
      
 38 
     | 
    
         
            +
                      predicted_labels = tree.predict(data)
         
     | 
| 
      
 39 
     | 
    
         
            +
                      accuracy = HybridForest::Utils.accuracy(predicted_labels, actual_labels)
         
     | 
| 
      
 40 
     | 
    
         
            +
                      {tree: tree, oob_accuracy: accuracy}
         
     | 
| 
       29 
41 
     | 
    
         
             
                    end
         
     | 
| 
       30 
42 
     | 
    
         | 
| 
       31 
43 
     | 
    
         
             
                    def select_best_tree(tree_results)
         
     | 
| 
       32 
44 
     | 
    
         
             
                      best_result = tree_results.max_by(1) { |result| result[:oob_accuracy] }.first
         
     | 
| 
       33 
45 
     | 
    
         
             
                      best_result[:tree]
         
     | 
| 
       34 
46 
     | 
    
         
             
                    end
         
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
                    def grow_trees(tree_types, in_of_bag, out_of_bag, out_of_bag_labels)
         
     | 
| 
       37 
     | 
    
         
            -
                      tree_results = []
         
     | 
| 
       38 
     | 
    
         
            -
                      tree_types.each do |tree_type|
         
     | 
| 
       39 
     | 
    
         
            -
                        tree_results << fit_and_predict(tree_type, in_of_bag, out_of_bag, out_of_bag_labels)
         
     | 
| 
       40 
     | 
    
         
            -
                      end
         
     | 
| 
       41 
     | 
    
         
            -
                      tree_results
         
     | 
| 
       42 
     | 
    
         
            -
                    end
         
     | 
| 
       43 
47 
     | 
    
         
             
                  end
         
     | 
| 
       44 
48 
     | 
    
         
             
                end
         
     | 
| 
       45 
49 
     | 
    
         
             
              end
         
     | 
    
        data/lib/hybridforest/forests/forest_growers/id3_grower.rb
    CHANGED
    
    | 
         @@ -8,7 +8,7 @@ module HybridForest 
     | 
|
| 
       8 
8 
     | 
    
         
             
                    def grow_forest(instances, number_of_trees)
         
     | 
| 
       9 
9 
     | 
    
         
             
                      forest = []
         
     | 
| 
       10 
10 
     | 
    
         
             
                      number_of_trees.times do
         
     | 
| 
       11 
     | 
    
         
            -
                        sample 
     | 
| 
      
 11 
     | 
    
         
            +
                        sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
         
     | 
| 
       12 
12 
     | 
    
         
             
                        forest << HybridForest::Trees::ID3Tree.new.fit(sample)
         
     | 
| 
       13 
13 
     | 
    
         
             
                      end
         
     | 
| 
       14 
14 
     | 
    
         
             
                      forest
         
     | 
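    
        data/lib/hybridforest/trees/feature_selectors/random_feature_subspace.rb
    CHANGED
    
    
    
        data/lib/hybridforest/utilities/utils.rb
    CHANGED
    
    | 
         @@ -12,17 +12,16 @@ module HybridForest 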
     | 
|
| 
       12 
12 
     | 
    
         
             
                # of independent features and an array of labels. Returns [+training_set+, +testing_set+, +testing_set_labels+]
         
     | 
| 
       13 
13 
     | 
    
         
             
                #
         
     | 
| 
       14 
14 
     | 
    
         
             
                def self.train_test_split(dataset, test_set_size = 0.20)
         
     | 
| 
       15 
     | 
    
         
            -
                  # TODO:  
     | 
| 
      
 15 
     | 
    
         
            +
                  # TODO: Offer stratify param
         
     | 
| 
       16 
16 
     | 
    
         
             
                  dataset = to_dataframe(dataset)
         
     | 
| 
      
 17 
     | 
    
         
            +
                  all_rows = (0...dataset.count).to_a
         
     | 
| 
       17 
18 
     | 
    
         | 
| 
       18 
19 
     | 
    
         
             
                  test_set_count = (dataset.count * test_set_size).floor
         
     | 
| 
       19 
     | 
    
         
            -
                   
     | 
| 
       20 
     | 
    
         
            -
                  test_set = dataset[ 
     | 
| 
       21 
     | 
    
         
            -
                  test_set_labels = test_set. 
     | 
| 
       22 
     | 
    
         
            -
                  test_set.except!(test_set.label)
         
     | 
| 
      
 20 
     | 
    
         
            +
                  test_set_rows = rand_uniq_nums(test_set_count, 0...dataset.count)
         
     | 
| 
      
 21 
     | 
    
         
            +
                  test_set = dataset[test_set_rows]
         
     | 
| 
      
 22 
     | 
    
         
            +
                  test_set, test_set_labels = test_set.disconnect_labels
         
     | 
| 
       23 
23 
     | 
    
         | 
| 
       24 
     | 
    
         
            -
                   
     | 
| 
       25 
     | 
    
         
            -
                  train_set = dataset[train_set_indices]
         
     | 
| 
      
 24 
     | 
    
         
            +
                  train_set = dataset[all_rows - test_set_rows]
         
     | 
| 
       26 
25 
     | 
    
         | 
| 
       27 
26 
     | 
    
         
             
                  [train_set, test_set, test_set_labels]
         
     | 
| 
       28 
27 
     | 
    
         
             
                end
         
     | 
| 
         @@ -37,20 +36,13 @@ module HybridForest 
     | 
|
| 
       37 
36 
     | 
    
         
             
                  dataset = to_dataframe(dataset)
         
     | 
| 
       38 
37 
     | 
    
         
             
                  all_rows = (0...dataset.count).to_a
         
     | 
| 
       39 
38 
     | 
    
         | 
| 
       40 
     | 
    
         
            -
                   
     | 
| 
       41 
     | 
    
         
            -
                   
     | 
| 
       42 
     | 
    
         
            -
                  dataset.count.times do
         
     | 
| 
       43 
     | 
    
         
            -
                    row = all_rows.sample
         
     | 
| 
       44 
     | 
    
         
            -
                    train_set_rows << row
         
     | 
| 
       45 
     | 
    
         
            -
                    train_set.concat(dataset[row])
         
     | 
| 
       46 
     | 
    
         
            -
                  end
         
     | 
| 
      
 39 
     | 
    
         
            +
                  train_set_rows = rand_nums(dataset.count, 0...dataset.count)
         
     | 
| 
      
 40 
     | 
    
         
            +
                  train_set = dataset[train_set_rows]
         
     | 
| 
       47 
41 
     | 
    
         | 
| 
       48 
42 
     | 
    
         
             
                  return train_test_split(dataset) if train_set_rows.sort == all_rows
         
     | 
| 
       49 
43 
     | 
    
         | 
| 
       50 
     | 
    
         
            -
                   
     | 
| 
       51 
     | 
    
         
            -
                  test_set =  
     | 
| 
       52 
     | 
    
         
            -
                  test_set_labels = test_set.class_labels
         
     | 
| 
       53 
     | 
    
         
            -
                  test_set.except!(test_set.label)
         
     | 
| 
      
 44 
     | 
    
         
            +
                  test_set = dataset[all_rows - train_set_rows]
         
     | 
| 
      
 45 
     | 
    
         
            +
                  test_set, test_set_labels = test_set.disconnect_labels
         
     | 
| 
       54 
46 
     | 
    
         | 
| 
       55 
47 
     | 
    
         
             
                  [train_set, test_set, test_set_labels]
         
     | 
| 
       56 
48 
     | 
    
         
             
                end
         
     | 
| 
         @@ -86,18 +78,18 @@ module HybridForest 
     | 
|
| 
       86 
78 
     | 
    
         
             
                # Draws a random sample of +size+ from +data+.
         
     | 
| 
       87 
79 
     | 
    
         
             
                #
         
     | 
| 
       88 
80 
     | 
    
         
             
                def self.random_sample(data:, size:, with_replacement: true)
         
     | 
| 
       89 
     | 
    
         
            -
                   
     | 
| 
      
 81 
     | 
    
         
            +
                  data = to_dataframe(data)
         
     | 
| 
       90 
82 
     | 
    
         | 
| 
       91 
     | 
    
         
            -
                  if with_replacement
         
     | 
| 
       92 
     | 
    
         
            -
                     
     | 
| 
       93 
     | 
    
         
            -
             
     | 
| 
      
 83 
     | 
    
         
            +
                  if size < 1 || (!with_replacement && size > data.count)
         
     | 
| 
      
 84 
     | 
    
         
            +
                    raise ArgumentError, "Invalid sample size"
         
     | 
| 
      
 85 
     | 
    
         
            +
                  end
         
     | 
| 
      
 86 
     | 
    
         
            +
             
     | 
| 
      
 87 
     | 
    
         
            +
                  rows = if with_replacement
         
     | 
| 
      
 88 
     | 
    
         
            +
                    rand_nums(size, 0...data.count)
         
     | 
| 
       94 
89 
     | 
    
         
             
                  else
         
     | 
| 
       95 
     | 
    
         
            -
                     
     | 
| 
       96 
     | 
    
         
            -
                    until rows.size == size
         
     | 
| 
       97 
     | 
    
         
            -
                      rows << rand(0...data.count)
         
     | 
| 
       98 
     | 
    
         
            -
                    end
         
     | 
| 
       99 
     | 
    
         
            -
                    data[rows.to_a]
         
     | 
| 
      
 90 
     | 
    
         
            +
                    rand_uniq_nums(size, 0...data.count)
         
     | 
| 
       100 
91 
     | 
    
         
             
                  end
         
     | 
| 
      
 92 
     | 
    
         
            +
                  data[rows]
         
     | 
| 
       101 
93 
     | 
    
         
             
                end
         
     | 
| 
       102 
94 
     | 
    
         | 
| 
       103 
95 
     | 
    
         
             
                # Outputs a report of common prediction metrics.
         
     | 
| 
         @@ -168,6 +160,12 @@ module HybridForest 
     | 
|
| 
       168 
160 
     | 
    
         
             
                    def class_labels
         
     | 
| 
       169 
161 
     | 
    
         
             
                      self[label].to_a
         
     | 
| 
       170 
162 
     | 
    
         
             
                    end
         
     | 
| 
      
 163 
     | 
    
         
            +
             
     | 
| 
      
 164 
     | 
    
         
            +
                    def disconnect_labels
         
     | 
| 
      
 165 
     | 
    
         
            +
                      labels = class_labels
         
     | 
| 
      
 166 
     | 
    
         
            +
                      except!(label)
         
     | 
| 
      
 167 
     | 
    
         
            +
                      [self, labels]
         
     | 
| 
      
 168 
     | 
    
         
            +
                    end
         
     | 
| 
       171 
169 
     | 
    
         
             
                  end
         
     | 
| 
       172 
170 
     | 
    
         
             
                end
         
     | 
| 
       173 
171 
     | 
    
         | 
| 
         @@ -202,5 +200,23 @@ module HybridForest 
     | 
|
| 
       202 
200 
     | 
    
         
             
                def false_label?(label)
         
     | 
| 
       203 
201 
     | 
    
         
             
                  [false, 0].include? label
         
     | 
| 
       204 
202 
     | 
    
         
             
                end
         
     | 
| 
      
 203 
     | 
    
         
            +
             
     | 
| 
      
 204 
     | 
    
         
            +
                ##
         
     | 
| 
      
 205 
     | 
    
         
            +
                # Returns an array of +n+ random numbers in the exclusive +range+.
         
     | 
| 
      
 206 
     | 
    
         
            +
                def rand_nums(n, range)
         
     | 
| 
      
 207 
     | 
    
         
            +
                  n.times.collect { rand(range) }
         
     | 
| 
      
 208 
     | 
    
         
            +
                end
         
     | 
| 
      
 209 
     | 
    
         
            +
             
     | 
| 
      
 210 
     | 
    
         
            +
                ##
         
     | 
| 
      
 211 
     | 
    
         
            +
                # Returns an array of +n+ _unique_ random numbers in the exclusive +range+.
         
     | 
| 
      
 212 
     | 
    
         
            +
                def rand_uniq_nums(n, range)
         
     | 
| 
      
 213 
     | 
    
         
            +
                  raise ArgumentError if n > range.size
         
     | 
| 
      
 214 
     | 
    
         
            +
             
     | 
| 
      
 215 
     | 
    
         
            +
                  nums = Set.new
         
     | 
| 
      
 216 
     | 
    
         
            +
                  until nums.size == n
         
     | 
| 
      
 217 
     | 
    
         
            +
                    nums << rand(range)
         
     | 
| 
      
 218 
     | 
    
         
            +
                  end
         
     | 
| 
      
 219 
     | 
    
         
            +
                  nums.to_a
         
     | 
| 
      
 220 
     | 
    
         
            +
                end
         
     | 
| 
       205 
221 
     | 
    
         
             
              end
         
     | 
| 
       206 
222 
     | 
    
         
             
            end
         
     | 
    
        data/lib/hybridforest/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: hybridforest
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.14.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - hi-tech-jazz
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2022-01-09 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: rake
         
     |