hybridforest 0.9.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -1
- data/Gemfile.lock +1 -1
- data/lib/hybridforest/forests/forest_growers/cart_grower.rb +1 -1
- data/lib/hybridforest/forests/forest_growers/id3_grower.rb +1 -1
- data/lib/hybridforest/trees/feature_selectors/random_feature_subspace.rb +1 -0
- data/lib/hybridforest/utilities/utils.rb +47 -31
- data/lib/hybridforest/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa4457c9f58fde0edfa9fd8af0a4ac5ef23db54892f75a50a8f62ac1f8652631
|
4
|
+
data.tar.gz: a8f0afc4b58d067388f3c5986521609a5ede0816643f73235b8517265c6b72c7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9261b0cc8fa086e8b91f910f8b852f2369a1eac0e76ee4b01f1db3dddf9bb7f8f9947cbcaca2f81d6db6a9c734ed516a0848f29b7735661ea5be428b8e720392
|
7
|
+
data.tar.gz: 5d8a886b98ca5e735b4173fb51e114d313c28ef480bd8cae32cf339634e49384a44d4fdb53548afb82228bd368960ef002d55bbd1292dd8313706b2458bf543c
|
data/CHANGELOG.md
CHANGED
@@ -30,4 +30,22 @@
|
|
30
30
|
|
31
31
|
## [0.9.0] - 2021-12-28
|
32
32
|
|
33
|
-
- Update dependencies
|
33
|
+
- Update dependencies
|
34
|
+
|
35
|
+
## [0.10.0] - 2021-12-29
|
36
|
+
|
37
|
+
- Refactor dataframe extensions
|
38
|
+
|
39
|
+
## [0.11.0] - 2021-12-29
|
40
|
+
|
41
|
+
- Randomize Utils.train_test_split
|
42
|
+
- Refactor Utils module
|
43
|
+
|
44
|
+
## [0.12.0] - 2022-01-08
|
45
|
+
|
46
|
+
- Allow Utils.random_sample to be passed a dataframe or a dataframe convertible object
|
47
|
+
- Allow Utils.random_sample's 'size' arg to equal the size of the initial dataframe if the strategy is sampling with replacement
|
48
|
+
|
49
|
+
## [0.13.0] - 2022-01-09
|
50
|
+
|
51
|
+
- Refactor forest growers
|
data/Gemfile.lock
CHANGED
@@ -8,7 +8,7 @@ module HybridForest
|
|
8
8
|
def grow_forest(instances, number_of_trees)
|
9
9
|
forest = []
|
10
10
|
number_of_trees.times do
|
11
|
-
sample
|
11
|
+
sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
|
12
12
|
forest << HybridForest::Trees::CARTTree.new.fit(sample)
|
13
13
|
end
|
14
14
|
forest
|
@@ -8,7 +8,7 @@ module HybridForest
|
|
8
8
|
def grow_forest(instances, number_of_trees)
|
9
9
|
forest = []
|
10
10
|
number_of_trees.times do
|
11
|
-
sample
|
11
|
+
sample = HybridForest::Utils.random_sample(data: instances, size: instances.size)
|
12
12
|
forest << HybridForest::Trees::ID3Tree.new.fit(sample)
|
13
13
|
end
|
14
14
|
forest
|
@@ -12,17 +12,16 @@ module HybridForest
|
|
12
12
|
# of independent features and an array of labels. Returns [+training_set+, +testing_set+, +testing_set_labels+]
|
13
13
|
#
|
14
14
|
def self.train_test_split(dataset, test_set_size = 0.20)
|
15
|
-
# TODO:
|
15
|
+
# TODO: Offer stratify param
|
16
16
|
dataset = to_dataframe(dataset)
|
17
|
+
all_rows = (0...dataset.count).to_a
|
17
18
|
|
18
19
|
test_set_count = (dataset.count * test_set_size).floor
|
19
|
-
|
20
|
-
test_set = dataset[
|
21
|
-
test_set_labels = test_set.
|
22
|
-
test_set.except!(test_set.label)
|
20
|
+
test_set_rows = rand_uniq_nums(test_set_count, 0...dataset.count)
|
21
|
+
test_set = dataset[test_set_rows]
|
22
|
+
test_set, test_set_labels = test_set.disconnect_labels
|
23
23
|
|
24
|
-
|
25
|
-
train_set = dataset[train_set_indices]
|
24
|
+
train_set = dataset[all_rows - test_set_rows]
|
26
25
|
|
27
26
|
[train_set, test_set, test_set_labels]
|
28
27
|
end
|
@@ -37,20 +36,13 @@ module HybridForest
|
|
37
36
|
dataset = to_dataframe(dataset)
|
38
37
|
all_rows = (0...dataset.count).to_a
|
39
38
|
|
40
|
-
|
41
|
-
|
42
|
-
dataset.count.times do
|
43
|
-
row = all_rows.sample
|
44
|
-
train_set_rows << row
|
45
|
-
train_set.concat(dataset[row])
|
46
|
-
end
|
39
|
+
train_set_rows = rand_nums(dataset.count, 0...dataset.count)
|
40
|
+
train_set = dataset[train_set_rows]
|
47
41
|
|
48
42
|
return train_test_split(dataset) if train_set_rows.sort == all_rows
|
49
43
|
|
50
|
-
|
51
|
-
test_set =
|
52
|
-
test_set_labels = test_set.class_labels
|
53
|
-
test_set.except!(test_set.label)
|
44
|
+
test_set = dataset[all_rows - train_set_rows]
|
45
|
+
test_set, test_set_labels = test_set.disconnect_labels
|
54
46
|
|
55
47
|
[train_set, test_set, test_set_labels]
|
56
48
|
end
|
@@ -86,18 +78,18 @@ module HybridForest
|
|
86
78
|
# Draws a random sample of +size+ from +data+.
|
87
79
|
#
|
88
80
|
def self.random_sample(data:, size:, with_replacement: true)
|
89
|
-
|
81
|
+
data = to_dataframe(data)
|
90
82
|
|
91
|
-
if with_replacement
|
92
|
-
|
93
|
-
|
83
|
+
if size < 1 || (!with_replacement && size > data.count)
|
84
|
+
raise ArgumentError, "Invalid sample size"
|
85
|
+
end
|
86
|
+
|
87
|
+
rows = if with_replacement
|
88
|
+
rand_nums(size, 0...data.count)
|
94
89
|
else
|
95
|
-
|
96
|
-
until rows.size == size
|
97
|
-
rows << rand(0...data.count)
|
98
|
-
end
|
99
|
-
data[rows.to_a]
|
90
|
+
rand_uniq_nums(size, 0...data.count)
|
100
91
|
end
|
92
|
+
data[rows]
|
101
93
|
end
|
102
94
|
|
103
95
|
# Outputs a report of common prediction metrics.
|
@@ -113,7 +105,7 @@ module HybridForest
|
|
113
105
|
#
|
114
106
|
def self.accuracy(predicted, actual)
|
115
107
|
accurate = predicted.zip(actual).count { |p, a| equal_labels?(p, a) }
|
116
|
-
accurate.to_f / predicted.count
|
108
|
+
accurate.to_f / predicted.count
|
117
109
|
end
|
118
110
|
|
119
111
|
# Extensions to simplify common dataframe operations.
|
@@ -146,11 +138,11 @@ module HybridForest
|
|
146
138
|
end
|
147
139
|
|
148
140
|
def feature_count(without_label: true)
|
149
|
-
without_label ?
|
141
|
+
without_label ? features.count : names.count
|
150
142
|
end
|
151
143
|
|
152
144
|
def pure?
|
153
|
-
|
145
|
+
self[label].uniq.size == 1
|
154
146
|
end
|
155
147
|
|
156
148
|
def features
|
@@ -158,7 +150,7 @@ module HybridForest
|
|
158
150
|
end
|
159
151
|
|
160
152
|
def count_labels
|
161
|
-
|
153
|
+
self[label].tally
|
162
154
|
end
|
163
155
|
|
164
156
|
def label
|
@@ -168,6 +160,12 @@ module HybridForest
|
|
168
160
|
def class_labels
|
169
161
|
self[label].to_a
|
170
162
|
end
|
163
|
+
|
164
|
+
def disconnect_labels
|
165
|
+
labels = class_labels
|
166
|
+
except!(label)
|
167
|
+
[self, labels]
|
168
|
+
end
|
171
169
|
end
|
172
170
|
end
|
173
171
|
|
@@ -202,5 +200,23 @@ module HybridForest
|
|
202
200
|
def false_label?(label)
|
203
201
|
[false, 0].include? label
|
204
202
|
end
|
203
|
+
|
204
|
+
##
|
205
|
+
# Returns an array of +n+ random numbers in the exclusive +range+.
|
206
|
+
def rand_nums(n, range)
|
207
|
+
n.times.collect { rand(range) }
|
208
|
+
end
|
209
|
+
|
210
|
+
##
|
211
|
+
# Returns an array of +n+ _unique_ random numbers in the exclusive +range+.
|
212
|
+
def rand_uniq_nums(n, range)
|
213
|
+
raise ArgumentError if n > range.size
|
214
|
+
|
215
|
+
nums = Set.new
|
216
|
+
until nums.size == n
|
217
|
+
nums << rand(range)
|
218
|
+
end
|
219
|
+
nums.to_a
|
220
|
+
end
|
205
221
|
end
|
206
222
|
end
|
data/lib/hybridforest/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hybridforest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hi-tech-jazz
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|