hybridforest 0.9.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -1
- data/Gemfile.lock +1 -1
- data/lib/hybridforest/forests/forest_growers/cart_grower.rb +1 -1
- data/lib/hybridforest/forests/forest_growers/id3_grower.rb +1 -1
- data/lib/hybridforest/trees/feature_selectors/random_feature_subspace.rb +1 -0
- data/lib/hybridforest/utilities/utils.rb +47 -31
- data/lib/hybridforest/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa4457c9f58fde0edfa9fd8af0a4ac5ef23db54892f75a50a8f62ac1f8652631
|
4
|
+
data.tar.gz: a8f0afc4b58d067388f3c5986521609a5ede0816643f73235b8517265c6b72c7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9261b0cc8fa086e8b91f910f8b852f2369a1eac0e76ee4b01f1db3dddf9bb7f8f9947cbcaca2f81d6db6a9c734ed516a0848f29b7735661ea5be428b8e720392
|
7
|
+
data.tar.gz: 5d8a886b98ca5e735b4173fb51e114d313c28ef480bd8cae32cf339634e49384a44d4fdb53548afb82228bd368960ef002d55bbd1292dd8313706b2458bf543c
|
data/CHANGELOG.md
CHANGED
@@ -30,4 +30,22 @@
|
|
30
30
|
|
31
31
|
## [0.9.0] - 2021-12-28
|
32
32
|
|
33
|
-
- Update dependencies
|
33
|
+
- Update dependencies
|
34
|
+
|
35
|
+
## [0.10.0] - 2021-12-29
|
36
|
+
|
37
|
+
- Refactor dataframe extensions
|
38
|
+
|
39
|
+
## [0.11.0] - 2021-12-29
|
40
|
+
|
41
|
+
- Randomize Utils.train_test_split
|
42
|
+
- Refactor Utils module
|
43
|
+
|
44
|
+
## [0.12.0] - 2022-01-08
|
45
|
+
|
46
|
+
- Allow Utils.random_sample to be passed a dataframe or a dataframe convertible object
|
47
|
+
- Allow Utils.random_sample's 'size' arg to equal the size of the initial dataframe if the strategy is sampling with replacement
|
48
|
+
|
49
|
+
## [0.13.0] - 2022-01-09
|
50
|
+
|
51
|
+
- Refactor forest growers
|
data/Gemfile.lock
CHANGED
@@ -8,7 +8,7 @@ module HybridForest
|
|
8
8
|
def grow_forest(instances, number_of_trees)
|
9
9
|
forest = []
|
10
10
|
number_of_trees.times do
|
11
|
-
sample
|
11
|
+
sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
|
12
12
|
forest << HybridForest::Trees::CARTTree.new.fit(sample)
|
13
13
|
end
|
14
14
|
forest
|
@@ -8,7 +8,7 @@ module HybridForest
|
|
8
8
|
def grow_forest(instances, number_of_trees)
|
9
9
|
forest = []
|
10
10
|
number_of_trees.times do
|
11
|
-
sample
|
11
|
+
sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
|
12
12
|
forest << HybridForest::Trees::ID3Tree.new.fit(sample)
|
13
13
|
end
|
14
14
|
forest
|
@@ -12,17 +12,16 @@ module HybridForest
|
|
12
12
|
# of independent features and an array of labels. Returns [+training_set+, +testing_set+, +testing_set_labels+]
|
13
13
|
#
|
14
14
|
def self.train_test_split(dataset, test_set_size = 0.20)
|
15
|
-
# TODO:
|
15
|
+
# TODO: Offer stratify param
|
16
16
|
dataset = to_dataframe(dataset)
|
17
|
+
all_rows = (0...dataset.count).to_a
|
17
18
|
|
18
19
|
test_set_count = (dataset.count * test_set_size).floor
|
19
|
-
|
20
|
-
test_set = dataset[
|
21
|
-
test_set_labels = test_set.
|
22
|
-
test_set.except!(test_set.label)
|
20
|
+
test_set_rows = rand_uniq_nums(test_set_count, 0...dataset.count)
|
21
|
+
test_set = dataset[test_set_rows]
|
22
|
+
test_set, test_set_labels = test_set.disconnect_labels
|
23
23
|
|
24
|
-
|
25
|
-
train_set = dataset[train_set_indices]
|
24
|
+
train_set = dataset[all_rows - test_set_rows]
|
26
25
|
|
27
26
|
[train_set, test_set, test_set_labels]
|
28
27
|
end
|
@@ -37,20 +36,13 @@ module HybridForest
|
|
37
36
|
dataset = to_dataframe(dataset)
|
38
37
|
all_rows = (0...dataset.count).to_a
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
dataset.count.times do
|
43
|
-
row = all_rows.sample
|
44
|
-
train_set_rows << row
|
45
|
-
train_set.concat(dataset[row])
|
46
|
-
end
|
39
|
+
train_set_rows = rand_nums(dataset.count, 0...dataset.count)
|
40
|
+
train_set = dataset[train_set_rows]
|
47
41
|
|
48
42
|
return train_test_split(dataset) if train_set_rows.sort == all_rows
|
49
43
|
|
50
|
-
|
51
|
-
test_set =
|
52
|
-
test_set_labels = test_set.class_labels
|
53
|
-
test_set.except!(test_set.label)
|
44
|
+
test_set = dataset[all_rows - train_set_rows]
|
45
|
+
test_set, test_set_labels = test_set.disconnect_labels
|
54
46
|
|
55
47
|
[train_set, test_set, test_set_labels]
|
56
48
|
end
|
@@ -86,18 +78,18 @@ module HybridForest
|
|
86
78
|
# Draws a random sample of +size+ from +data+.
|
87
79
|
#
|
88
80
|
def self.random_sample(data:, size:, with_replacement: true)
|
89
|
-
|
81
|
+
data = to_dataframe(data)
|
90
82
|
|
91
|
-
if with_replacement
|
92
|
-
|
93
|
-
|
83
|
+
if size < 1 || (!with_replacement && size > data.count)
|
84
|
+
raise ArgumentError, "Invalid sample size"
|
85
|
+
end
|
86
|
+
|
87
|
+
rows = if with_replacement
|
88
|
+
rand_nums(size, 0...data.count)
|
94
89
|
else
|
95
|
-
|
96
|
-
until rows.size == size
|
97
|
-
rows << rand(0...data.count)
|
98
|
-
end
|
99
|
-
data[rows.to_a]
|
90
|
+
rand_uniq_nums(size, 0...data.count)
|
100
91
|
end
|
92
|
+
data[rows]
|
101
93
|
end
|
102
94
|
|
103
95
|
# Outputs a report of common prediction metrics.
|
@@ -113,7 +105,7 @@ module HybridForest
|
|
113
105
|
#
|
114
106
|
def self.accuracy(predicted, actual)
|
115
107
|
accurate = predicted.zip(actual).count { |p, a| equal_labels?(p, a) }
|
116
|
-
accurate.to_f / predicted.count
|
108
|
+
accurate.to_f / predicted.count
|
117
109
|
end
|
118
110
|
|
119
111
|
# Extensions to simplify common dataframe operations.
|
@@ -146,11 +138,11 @@ module HybridForest
|
|
146
138
|
end
|
147
139
|
|
148
140
|
def feature_count(without_label: true)
|
149
|
-
without_label ?
|
141
|
+
without_label ? features.count : names.count
|
150
142
|
end
|
151
143
|
|
152
144
|
def pure?
|
153
|
-
|
145
|
+
self[label].uniq.size == 1
|
154
146
|
end
|
155
147
|
|
156
148
|
def features
|
@@ -158,7 +150,7 @@ module HybridForest
|
|
158
150
|
end
|
159
151
|
|
160
152
|
def count_labels
|
161
|
-
|
153
|
+
self[label].tally
|
162
154
|
end
|
163
155
|
|
164
156
|
def label
|
@@ -168,6 +160,12 @@ module HybridForest
|
|
168
160
|
def class_labels
|
169
161
|
self[label].to_a
|
170
162
|
end
|
163
|
+
|
164
|
+
def disconnect_labels
|
165
|
+
labels = class_labels
|
166
|
+
except!(label)
|
167
|
+
[self, labels]
|
168
|
+
end
|
171
169
|
end
|
172
170
|
end
|
173
171
|
|
@@ -202,5 +200,23 @@ module HybridForest
|
|
202
200
|
def false_label?(label)
|
203
201
|
[false, 0].include? label
|
204
202
|
end
|
203
|
+
|
204
|
+
##
|
205
|
+
# Returns an array of +n+ random numbers in the exclusive +range+.
|
206
|
+
def rand_nums(n, range)
|
207
|
+
n.times.collect { rand(range) }
|
208
|
+
end
|
209
|
+
|
210
|
+
##
|
211
|
+
# Returns an array of +n+ _unique_ random numbers in the exclusive +range+.
|
212
|
+
def rand_uniq_nums(n, range)
|
213
|
+
raise ArgumentError if n > range.size
|
214
|
+
|
215
|
+
nums = Set.new
|
216
|
+
until nums.size == n
|
217
|
+
nums << rand(range)
|
218
|
+
end
|
219
|
+
nums.to_a
|
220
|
+
end
|
205
221
|
end
|
206
222
|
end
|
data/lib/hybridforest/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hybridforest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hi-tech-jazz
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|