hybridforest 0.10.0 → 0.11.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fd85ad9a8081b634556371599f61557c0490c0694bdc24f8aa8d866d44ba0843
4
- data.tar.gz: f58382ac2737bce52c85cdf83e27f3099190a0f52be8c159780033194c183dd8
3
+ metadata.gz: b32e830c0e55cd23122b879bae7d42407085026f573d5565522ba64fe71a255e
4
+ data.tar.gz: 55c965cbe81e2f2a2e18ebecd21b1027577249f2aa6de4bb0e57b14ab6409f30
5
5
  SHA512:
6
- metadata.gz: c6678ba9cf3ccf15970801879a4d66f6ef98c8cf80353fc4ce2e9808f79063c447d19e96be374223403fedc093ac67cdd6599140df391ca68d05c708725ba4fc
7
- data.tar.gz: 5c94ea5df0e6305719bf8ae9848a0a36d95d71fa51d7abe7bb09ed3f32720df98616bd3f2892e5c3467b647c9f6ebb7b89486e5cff771dde3141388d026fefe4
6
+ metadata.gz: 7fd27d1de1bcc9f1776607475bc3e03731c8053dbd309171265d1eeb1abc1498965b62d251f1f731ea708ea3914372cba1843b73be1acedcb90d417e79cc54bf
7
+ data.tar.gz: 75a6a949ccce8e0369966814fdd9a40944a9163dacb82de7a30fad31d54844e5fdc396ea546dbfcad19258e249a60b452973ee3df4e9fe8f07da71d4e888c274
data/CHANGELOG.md CHANGED
@@ -34,4 +34,9 @@
34
34
 
35
35
  ## [0.10.0] - 2021-12-29
36
36
 
37
- - Refactor dataframe extensions
37
+ - Refactor dataframe extensions
38
+
39
+ ## [0.11.0] - 2021-12-29
40
+
41
+ - Randomize Utils.train_test_split
42
+ - Refactor Utils module
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- hybridforest (0.9.0)
4
+ hybridforest (0.10.0)
5
5
  activesupport (~> 6.1)
6
6
  rake (~> 13.0)
7
7
  require_all
@@ -6,6 +6,7 @@ module HybridForest
6
6
  module Trees
7
7
  class RandomFeatureSubspace
8
8
  def select_features(all_features)
9
+ # TODO: Allow the subspace size to be configured.
9
10
  n = default_subspace_size(all_features.count)
10
11
  indices = Set.new
11
12
  until indices.size == n
@@ -12,17 +12,16 @@ module HybridForest
12
12
  # of independent features and an array of labels. Returns [+training_set+, +testing_set+, +testing_set_labels+]
13
13
  #
14
14
  def self.train_test_split(dataset, test_set_size = 0.20)
15
- # TODO: Shuffle and stratify samples
15
+ # TODO: Offer stratify param
16
16
  dataset = to_dataframe(dataset)
17
+ all_rows = (0...dataset.count).to_a
17
18
 
18
19
  test_set_count = (dataset.count * test_set_size).floor
19
- test_set_indices = 0..test_set_count
20
- test_set = dataset[test_set_indices]
21
- test_set_labels = test_set.class_labels
22
- test_set.except!(test_set.label)
20
+ test_set_rows = rand_uniq_nums(test_set_count, 0...dataset.count)
21
+ test_set = dataset[test_set_rows]
22
+ test_set, test_set_labels = test_set.disconnect_labels
23
23
 
24
- train_set_indices = test_set_count + 1...dataset.count
25
- train_set = dataset[train_set_indices]
24
+ train_set = dataset[all_rows - test_set_rows]
26
25
 
27
26
  [train_set, test_set, test_set_labels]
28
27
  end
@@ -37,20 +36,13 @@ module HybridForest
37
36
  dataset = to_dataframe(dataset)
38
37
  all_rows = (0...dataset.count).to_a
39
38
 
40
- train_set = Rover::DataFrame.new
41
- train_set_rows = []
42
- dataset.count.times do
43
- row = all_rows.sample
44
- train_set_rows << row
45
- train_set.concat(dataset[row])
46
- end
39
+ train_set_rows = rand_nums(dataset.count, 0...dataset.count)
40
+ train_set = dataset[train_set_rows]
47
41
 
48
42
  return train_test_split(dataset) if train_set_rows.sort == all_rows
49
43
 
50
- test_set_rows = all_rows - train_set_rows
51
- test_set = dataset[test_set_rows]
52
- test_set_labels = test_set.class_labels
53
- test_set.except!(test_set.label)
44
+ test_set = dataset[all_rows - train_set_rows]
45
+ test_set, test_set_labels = test_set.disconnect_labels
54
46
 
55
47
  [train_set, test_set, test_set_labels]
56
48
  end
@@ -88,16 +80,12 @@ module HybridForest
88
80
  def self.random_sample(data:, size:, with_replacement: true)
89
81
  raise ArgumentError, "Invalid sample size" if size < 1 || size > data.count
90
82
 
91
- if with_replacement
92
- rows = size.times.collect { rand(0...data.count) }
93
- data[rows]
83
+ rows = if with_replacement
84
+ rand_nums(size, 0...data.count)
94
85
  else
95
- rows = Set.new
96
- until rows.size == size
97
- rows << rand(0...data.count)
98
- end
99
- data[rows.to_a]
86
+ rand_uniq_nums(size, 0...data.count)
100
87
  end
88
+ data[rows]
101
89
  end
102
90
 
103
91
  # Outputs a report of common prediction metrics.
@@ -168,6 +156,12 @@ module HybridForest
168
156
  def class_labels
169
157
  self[label].to_a
170
158
  end
159
+
160
+ def disconnect_labels
161
+ labels = class_labels
162
+ except!(label)
163
+ [self, labels]
164
+ end
171
165
  end
172
166
  end
173
167
 
@@ -202,5 +196,23 @@ module HybridForest
202
196
  def false_label?(label)
203
197
  [false, 0].include? label
204
198
  end
199
+
200
+ ##
201
+ # Returns an array of +n+ random numbers drawn from +range+ (callers pass an end-exclusive range); the same number may appear more than once.
202
+ def rand_nums(n, range)
203
+ n.times.collect { rand(range) }
204
+ end
205
+
206
+ ##
207
+ # Returns an array of +n+ _unique_ random numbers drawn from +range+ (callers pass an end-exclusive range). Raises ArgumentError if +n+ exceeds the size of +range+.
208
+ def rand_uniq_nums(n, range)
209
+ raise ArgumentError if n > range.size
210
+
211
+ nums = Set.new
212
+ until nums.size == n
213
+ nums << rand(range)
214
+ end
215
+ nums.to_a
216
+ end
205
217
  end
206
218
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HybridForest
4
- VERSION = "0.10.0"
4
+ VERSION = "0.11.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hybridforest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hi-tech-jazz
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-12-29 00:00:00.000000000 Z
11
+ date: 2021-12-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake