hybridforest 0.10.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fd85ad9a8081b634556371599f61557c0490c0694bdc24f8aa8d866d44ba0843
4
- data.tar.gz: f58382ac2737bce52c85cdf83e27f3099190a0f52be8c159780033194c183dd8
3
+ metadata.gz: b32e830c0e55cd23122b879bae7d42407085026f573d5565522ba64fe71a255e
4
+ data.tar.gz: 55c965cbe81e2f2a2e18ebecd21b1027577249f2aa6de4bb0e57b14ab6409f30
5
5
  SHA512:
6
- metadata.gz: c6678ba9cf3ccf15970801879a4d66f6ef98c8cf80353fc4ce2e9808f79063c447d19e96be374223403fedc093ac67cdd6599140df391ca68d05c708725ba4fc
7
- data.tar.gz: 5c94ea5df0e6305719bf8ae9848a0a36d95d71fa51d7abe7bb09ed3f32720df98616bd3f2892e5c3467b647c9f6ebb7b89486e5cff771dde3141388d026fefe4
6
+ metadata.gz: 7fd27d1de1bcc9f1776607475bc3e03731c8053dbd309171265d1eeb1abc1498965b62d251f1f731ea708ea3914372cba1843b73be1acedcb90d417e79cc54bf
7
+ data.tar.gz: 75a6a949ccce8e0369966814fdd9a40944a9163dacb82de7a30fad31d54844e5fdc396ea546dbfcad19258e249a60b452973ee3df4e9fe8f07da71d4e888c274
data/CHANGELOG.md CHANGED
@@ -34,4 +34,9 @@
34
34
 
35
35
  ## [0.10.0] - 2021-12-29
36
36
 
37
- - Refactor dataframe extensions
37
+ - Refactor dataframe extensions
38
+
39
+ ## [0.11.0] - 2021-12-29
40
+
41
+ - Randomize Utils.train_test_split
42
+ - Refactor Utils module
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- hybridforest (0.9.0)
4
+ hybridforest (0.10.0)
5
5
  activesupport (~> 6.1)
6
6
  rake (~> 13.0)
7
7
  require_all
@@ -6,6 +6,7 @@ module HybridForest
6
6
  module Trees
7
7
  class RandomFeatureSubspace
8
8
  def select_features(all_features)
9
+ # TODO: Allow the subspace size to be configured.
9
10
  n = default_subspace_size(all_features.count)
10
11
  indices = Set.new
11
12
  until indices.size == n
@@ -12,17 +12,16 @@ module HybridForest
12
12
  # of independent features and an array of labels. Returns [+training_set+, +testing_set+, +testing_set_labels+]
13
13
  #
14
14
  def self.train_test_split(dataset, test_set_size = 0.20)
15
- # TODO: Shuffle and stratify samples
15
+ # TODO: Offer an optional parameter for stratified sampling.
16
16
  dataset = to_dataframe(dataset)
17
+ all_rows = (0...dataset.count).to_a
17
18
 
18
19
  test_set_count = (dataset.count * test_set_size).floor
19
- test_set_indices = 0..test_set_count
20
- test_set = dataset[test_set_indices]
21
- test_set_labels = test_set.class_labels
22
- test_set.except!(test_set.label)
20
+ test_set_rows = rand_uniq_nums(test_set_count, 0...dataset.count)
21
+ test_set = dataset[test_set_rows]
22
+ test_set, test_set_labels = test_set.disconnect_labels
23
23
 
24
- train_set_indices = test_set_count + 1...dataset.count
25
- train_set = dataset[train_set_indices]
24
+ train_set = dataset[all_rows - test_set_rows]
26
25
 
27
26
  [train_set, test_set, test_set_labels]
28
27
  end
@@ -37,20 +36,13 @@ module HybridForest
37
36
  dataset = to_dataframe(dataset)
38
37
  all_rows = (0...dataset.count).to_a
39
38
 
40
- train_set = Rover::DataFrame.new
41
- train_set_rows = []
42
- dataset.count.times do
43
- row = all_rows.sample
44
- train_set_rows << row
45
- train_set.concat(dataset[row])
46
- end
39
+ train_set_rows = rand_nums(dataset.count, 0...dataset.count)
40
+ train_set = dataset[train_set_rows]
47
41
 
48
42
  return train_test_split(dataset) if train_set_rows.sort == all_rows
49
43
 
50
- test_set_rows = all_rows - train_set_rows
51
- test_set = dataset[test_set_rows]
52
- test_set_labels = test_set.class_labels
53
- test_set.except!(test_set.label)
44
+ test_set = dataset[all_rows - train_set_rows]
45
+ test_set, test_set_labels = test_set.disconnect_labels
54
46
 
55
47
  [train_set, test_set, test_set_labels]
56
48
  end
@@ -88,16 +80,12 @@ module HybridForest
88
80
  def self.random_sample(data:, size:, with_replacement: true)
89
81
  raise ArgumentError, "Invalid sample size" if size < 1 || size > data.count
90
82
 
91
- if with_replacement
92
- rows = size.times.collect { rand(0...data.count) }
93
- data[rows]
83
+ rows = if with_replacement
84
+ rand_nums(size, 0...data.count)
94
85
  else
95
- rows = Set.new
96
- until rows.size == size
97
- rows << rand(0...data.count)
98
- end
99
- data[rows.to_a]
86
+ rand_uniq_nums(size, 0...data.count)
100
87
  end
88
+ data[rows]
101
89
  end
102
90
 
103
91
  # Outputs a report of common prediction metrics.
@@ -168,6 +156,12 @@ module HybridForest
168
156
  def class_labels
169
157
  self[label].to_a
170
158
  end
159
+
160
+ def disconnect_labels
161
+ labels = class_labels
162
+ except!(label)
163
+ [self, labels]
164
+ end
171
165
  end
172
166
  end
173
167
 
@@ -202,5 +196,23 @@ module HybridForest
202
196
  def false_label?(label)
203
197
  [false, 0].include? label
204
198
  end
199
+
200
+ ##
201
+ # Returns an array of +n+ random numbers, each drawn independently from +range+ (duplicates are possible).
202
+ def rand_nums(n, range)
203
+ n.times.collect { rand(range) }
204
+ end
205
+
206
+ ##
207
+ # Returns an array of +n+ _unique_ random numbers drawn from +range+. Raises ArgumentError if +n+ exceeds the size of +range+.
208
+ def rand_uniq_nums(n, range)
209
+ raise ArgumentError if n > range.size
210
+
211
+ nums = Set.new
212
+ until nums.size == n
213
+ nums << rand(range)
214
+ end
215
+ nums.to_a
216
+ end
205
217
  end
206
218
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HybridForest
4
- VERSION = "0.10.0"
4
+ VERSION = "0.11.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hybridforest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hi-tech-jazz
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-12-29 00:00:00.000000000 Z
11
+ date: 2021-12-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake