adaboost 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/adaboost/adaboost.rb +18 -22
- data/lib/adaboost/contingency_table.rb +3 -3
- data/lib/adaboost/evaluator.rb +9 -9
- data/lib/adaboost/features_analyzer.rb +24 -24
- data/lib/adaboost/resampler.rb +5 -5
- data/lib/adaboost/weak_classifier.rb +4 -4
- data/lib/adaboost/weak_learner.rb +20 -20
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dbef461fb8ab7809de5e99ec234e85c002427bd5
+  data.tar.gz: 446d84856754769aeb49e4526de18f4c547d85de
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: de5b31c2367c9459d09d5456684b595eeb8e2fa2b8846ee80911027e66b7a45a779762b464f2bd06365315ba298365727f8cbe1a81f8ebbdb3be31ef98686725
+  data.tar.gz: 8b6157db326c5510329fd3b0b3531ffe7e4bb8fee5fbcd3e5b1b73e36ea17081a0fe809c71e8fa6f2f871fdb238b3edb060c3559c6e196e8194fd18ecf604861
data/lib/adaboost/adaboost.rb
CHANGED
@@ -4,40 +4,40 @@ module AdaBoost
 
     attr_reader :weak_classifiers, :y_index
 
-    def initialize
+    def initialize(number_of_classifiers, y_index)
       @weak_classifiers = []
-      @weak_learner = WeakLearner.new
+      @weak_learner = WeakLearner.new(y_index)
       @number_of_classifiers = number_of_classifiers
       @weights = []
       @y_index = y_index
     end
 
-    def train
+    def train(samples)
       if Config::OVER_SAMPLING_TRAINING_SET
-        resampler = Resampler.new
-        resampler.over_sample
+        resampler = Resampler.new(@y_index)
+        resampler.over_sample(samples)
       end
-      initialize_weights
-      0.upto
-      weak_classifier = @weak_learner.generate_weak_classifier
+      initialize_weights(samples)
+      0.upto(@number_of_classifiers - 1) do |i|
+        weak_classifier = @weak_learner.generate_weak_classifier(samples, @weights)
         weak_classifier.compute_alpha
-      update_weights
+        update_weights(weak_classifier, samples)
         @weak_classifiers << weak_classifier
         yield i, weak_classifier if block_given?
       end
     end
 
-    def classify
+    def classify(sample)
       score = 0.0
       @weak_classifiers.each do |weak_classifier|
-        score += weak_classifier.classify_with_alpha
+        score += weak_classifier.classify_with_alpha(sample)
       end
       score
     end
 
-    def self.build_from_model
+    def self.build_from_model(model, y_index = 0)
       classifiers = model.weak_classifiers
-      adaboost = AdaBoost.new
+      adaboost = AdaBoost.new(classifiers.size, y_index)
       classifiers.each do |classifier|
         adaboost.weak_classifiers << WeakClassifier.new(classifier.feature_number, classifier.split, classifier.alpha)
       end
@@ -46,12 +46,12 @@ module AdaBoost
 
     private
 
-    def initialize_weights
+    def initialize_weights(samples)
       samples_size = samples.size.to_f
       negative_weight = 1 / samples_size
       positive_weight = negative_weight
       if Config::INCORPORATE_COST_SENSITIVE_LEARNING
-        analyzer = FeaturesAnalyzer.new
+        analyzer = FeaturesAnalyzer.new(@y_index)
         distribution = analyzer.analyze(samples).distribution
         positive_rate = distribution.positive / samples_size
         negative_rate = distribution.negative / samples_size
@@ -61,19 +61,15 @@ module AdaBoost
       end
       samples.each_with_index do |sample, i|
         y = sample[@y_index]
-        if y == -1
-          @weights[i] = positive_weight
-        else
-          @weights[i] = negative_weight
-        end
+        @weights[i] = (y == -1) ? positive_weight : negative_weight
       end
     end
 
-    def update_weights
+    def update_weights(weak_classifier, samples)
       sum = 0.0
       samples.each_with_index do |sample, i|
         y = sample[@y_index]
-        @weights[i] *= Math.exp
+        @weights[i] *= Math.exp(-(weak_classifier.alpha) * weak_classifier.classify(sample) * y)
         sum += @weights[i]
       end
       @weights.each_with_index do |_, i|
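
The 0.0.3 release shipped these methods with their argument lists missing; 0.0.4 restores them. For orientation, a minimal usage sketch of the restored API, assuming the sample layout the diff implies (one array per sample, with the -1/+1 label stored at y_index) and illustrative values throughout:

    # Label lives at index 2, so each sample carries two features plus its label.
    samples = [
      [0.2, 1.5, -1],
      [0.9, 3.1,  1],
      [0.4, 2.7,  1],
      [0.1, 0.8, -1]
    ]

    booster = AdaBoost::AdaBoost.new(10, 2)   # 10 boosting rounds, y_index = 2
    booster.train(samples) do |i, weak_classifier|
      # optional per-round callback, as yielded in the hunk above
    end
    score = booster.classify([0.3, 2.0])      # sign of the score is the predicted class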
data/lib/adaboost/contingency_table.rb
CHANGED

@@ -22,7 +22,7 @@ module AdaBoost
       @table[1][0]
     end
 
-    def add_prediction
+    def add_prediction(y, h)
       @table[class_to_index(y)][class_to_index(h)] += 1
     end
 
@@ -175,8 +175,8 @@ module AdaBoost
       ]
     end
 
-    def class_to_index
-      k > 0 ? 1 : 0
+    def class_to_index(k)
+      (k > 0) ? 1 : 0
     end
   end
 end
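
The restored signatures make the table addressable by actual class y and predicted class h; class_to_index maps a positive label to index 1 and anything else to 0. An illustrative sketch:

    table = AdaBoost::ContingencyTable.new
    table.add_prediction(1, 1)    # true positive:  increments @table[1][1]
    table.add_prediction(-1, 1)   # false positive: increments @table[0][1]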
data/lib/adaboost/evaluator.rb
CHANGED
@@ -2,26 +2,26 @@ module AdaBoost
 
   class Evaluator
 
-    def initialize
+    def initialize(classifier)
       @classifier = classifier
       @threshold = Float::MAX
     end
 
-    def evaluate
+    def evaluate(test_set)
       contingency_table = ContingencyTable.new
       test_set.each do |sample|
         y = sample[@classifier.y_index]
-        if Config::USE_THRESHOLD_CLASSIFICATION
-          h = classify_using_threshold
+        h = if Config::USE_THRESHOLD_CLASSIFICATION
+          classify_using_threshold(sample)
         else
-          h = classify_normally
+          classify_normally(sample)
         end
-        contingency_table.add_prediction
+        contingency_table.add_prediction(y, h)
       end
       contingency_table
     end
 
-    def used_feature_numbers
+    def used_feature_numbers(unique = false)
       used_feature_numbers = []
       @classifier.weak_classifiers.each do |weak_classifier|
         used_feature_numbers << weak_classifier.feature_number
@@ -51,11 +51,11 @@ module AdaBoost
       @threshold
     end
 
-    def classify_normally
+    def classify_normally(sample)
       (@classifier.classify(sample) > 0) ? 1 : -1
     end
 
-    def classify_using_threshold
+    def classify_using_threshold(sample)
       score = 0.0
       @classifier.weak_classifiers.each do |weak_classifier|
         if sample[weak_classifier.feature_number] > weak_classifier.split
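
With the parameters back, the evaluator runs end to end. A sketch, assuming a trained classifier such as booster from the earlier sketch and a hypothetical test_set in the same sample layout; the effect of the new unique flag lies outside the shown hunks:

    evaluator = AdaBoost::Evaluator.new(booster)
    table = evaluator.evaluate(test_set)             # returns a ContingencyTable
    features = evaluator.used_feature_numbers(true)  # flag presumably de-duplicates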
data/lib/adaboost/features_analyzer.rb
CHANGED

@@ -7,45 +7,45 @@ module AdaBoost
 
   class FeaturesAnalyzer
 
-    def initialize
+    def initialize(y_index)
       @y_index = y_index
     end
 
-    def analyze
+    def analyze(samples)
 
       statistics = []
-      distribution = Distribution.new
+      distribution = Distribution.new(0, 0)
       number_of_samples = samples.size
 
       if number_of_samples < 1
-        raise ArgumentError.new
+        raise ArgumentError.new('At least one sample is needed to analyze.')
       end
       number_of_features = @y_index
       sample_size = samples[0].size
       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
-        raise ArgumentError.new
+        raise ArgumentError.new('At least 1 feature is needed to analyze.')
       end
-      0.upto
-      statistics << FeatureStatistic.new
+      0.upto(number_of_features - 1) do
+        statistics << FeatureStatistic.new(Float::MAX, -Float::MAX, 0, 0, 0, 0)
       end
       samples.each do |sample|
-        y = sample[@y_index]
-        if y == -1
-          distribution.negative += 1
-        else
-          distribution.positive += 1
-        end
-        0.upto
-        statistic = statistics[i]
-        feature_value = sample[i]
-        if feature_value < statistic.min
-          statistic.min = feature_value
-        end
-        if feature_value > statistic.max
-          statistic.max = feature_value
-        end
-        statistic.sum += feature_value
+        y = sample[@y_index]
+        if y == -1
+          distribution.negative += 1
+        else
+          distribution.positive += 1
+        end
+        0.upto(number_of_features - 1) do |i|
+          statistic = statistics[i]
+          feature_value = sample[i]
+          if feature_value < statistic.min
+            statistic.min = feature_value
+          end
+          if feature_value > statistic.max
+            statistic.max = feature_value
+          end
+          statistic.sum += feature_value
+        end
       end
       statistics.each do |statistic|
         statistic.avg = statistic.sum / number_of_samples.to_f
@@ -67,7 +67,7 @@ module AdaBoost
       analyze
     end
 
-    def relations
+    def relations(x, y, samples, statistics)
       sum = 0.0
       samples.each do |sample|
         x_value = sample[x].to_f
@@ -76,7 +76,7 @@ module AdaBoost
       end
       cov = sum / (samples.size - 1).to_f
       cor = cov / (statistics[x].std * statistics[y].std).to_f
-      VariableRelations.new
+      VariableRelations.new(x, y, cov, cor)
     end
   end
 end
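
analyze(samples) now takes the data explicitly: it counts the -1/+1 label distribution and tracks per-feature min, max, and sum in a single pass. A sketch reusing the sample layout from the earlier example, with accessors taken from the hunks above:

    analysis = AdaBoost::FeaturesAnalyzer.new(2).analyze(samples)
    analysis.distribution.positive   # count of +1 labels
    analysis.statistics[0].min       # per-feature statistics (min/max/sum/avg)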
data/lib/adaboost/resampler.rb
CHANGED
@@ -2,12 +2,12 @@ module AdaBoost
 
   class Resampler
 
-    def initialize
+    def initialize(y_index)
       @y_index = y_index
     end
 
-    def over_sample
-      distribution = distribution
+    def over_sample(samples)
+      distribution = distribution(samples)
       y0 = distribution.negative
       y1 = distribution.positive
       majority = y0 < y1 ? 1.0 : -1.0
@@ -25,8 +25,8 @@ module AdaBoost
 
     private
 
-    def distribution
-      analyzer = FeaturesAnalyzer.new
+    def distribution(instances)
+      analyzer = FeaturesAnalyzer.new(@y_index)
       analyzer.analyze(instances).distribution
     end
   end
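
over_sample(samples) receives the training set explicitly and works out the majority class from the label distribution. Since train in adaboost.rb calls it without using a return value, it presumably balances the array in place; a sketch:

    AdaBoost::Resampler.new(2).over_sample(samples)  # y_index = 2, mutates samples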
data/lib/adaboost/weak_classifier.rb
CHANGED

@@ -5,7 +5,7 @@ module AdaBoost
     attr_accessor :error
     attr_reader :feature_number, :split, :alpha
 
-    def initialize
+    def initialize(feature_number, split, alpha = 0.0, error = 0.0)
       @feature_number = feature_number
       @split = split
       @error = error
@@ -16,15 +16,15 @@ module AdaBoost
       @alpha = 0.5 * Math.log((1.0 - @error) / @error)
     end
 
-    def classify
+    def classify(sample)
       sample[@feature_number] > @split ? 1 : -1
     end
 
-    def classify_with_alpha
+    def classify_with_alpha(sample)
       return classify(sample) * @alpha
     end
 
-    def increase_error
+    def increase_error(amount)
       @error += amount
     end
   end
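
WeakClassifier is a decision stump: classify(sample) votes +1 when the selected feature exceeds the split and -1 otherwise, and compute_alpha sets alpha = 0.5 * ln((1 - error) / error), so lower-error stumps vote with more weight. A sketch with arbitrary numbers:

    stump = AdaBoost::WeakClassifier.new(1, 2.5)  # watch feature 1, split at 2.5
    stump.classify([0.0, 3.0])                    # => 1, since 3.0 > 2.5
    stump.increase_error(0.1)
    stump.compute_alpha
    stump.alpha                                   # 0.5 * Math.log(0.9 / 0.1), about 1.0986
    stump.classify_with_alpha([0.0, 3.0])         # => alpha * 1, about 1.0986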
data/lib/adaboost/weak_learner.rb
CHANGED

@@ -2,31 +2,31 @@ module AdaBoost
 
   class WeakLearner
 
-    def initialize
+    def initialize(y_index)
       @y_index = y_index
-      @analyzer = FeaturesAnalyzer.new
+      @analyzer = FeaturesAnalyzer.new(y_index)
       @classifiers_cache = []
     end
 
-    def features_satistics
+    def features_satistics(samples)
       @analyzer.analyze(samples).statistics
     end
 
-    def generate_weak_classifier
+    def generate_weak_classifier(samples, weights)
       number_of_samples = samples.size
       if number_of_samples < 1
-        raise ArgumentError.new
+        raise ArgumentError.new('At least one sample is needed to generate.')
       end
       number_of_features = @y_index
       sample_size = samples[0].size
       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
-        raise ArgumentError.new
+        raise ArgumentError.new('At least 1 feature is needed to generate.')
       end
       classifiers = []
       if Config::USE_RANDOM_WEAK_CLASSIFIERS
-        classifiers = generate_random_classifiers
+        classifiers = generate_random_classifiers(samples, number_of_features)
       else
-        classifiers = generate_all_possible_classifiers
+        classifiers = generate_all_possible_classifiers(samples, number_of_features)
       end
       best_index = -1
       best_error = Float::MAX
@@ -35,7 +35,7 @@ module AdaBoost
       samples.each_with_index do |sample, j|
         y = sample[@y_index]
         if classifier.classify(sample).to_f != y
-          classifier.increase_error
+          classifier.increase_error(weights[j])
         end
       end
       if classifier.error < best_error
@@ -45,33 +45,33 @@ module AdaBoost
       end
       best = classifiers[best_index]
       if !Config::USE_RANDOM_WEAK_CLASSIFIERS
-        classifiers.delete_at
+        classifiers.delete_at(best_index)
       end
       best
     end
 
     private
 
-    def generate_random_classifiers
+    def generate_random_classifiers(samples, number_of_features)
       classifiers = []
-      statistics = features_satistics
-      0.upto
-      feature_number = rand
-      info = statistics[feature_number]
-      split = rand * info.rng + info.min
-      classifiers << WeakClassifier.new
+      statistics = features_satistics(samples)
+      0.upto(Config::NUMBER_OF_RANDOM_CLASSIFIERS - 1) do
+        feature_number = rand(number_of_features)
+        info = statistics[feature_number]
+        split = rand * info.rng + info.min
+        classifiers << WeakClassifier.new(feature_number, split)
       end
       classifiers
     end
 
-    def generate_all_possible_classifiers
+    def generate_all_possible_classifiers(samples, number_of_features)
      if @classifiers_cache.size == 0
        matrix = []
-        0.upto
+        0.upto(number_of_features - 1) do
          matrix << []
        end
        samples.each do |sample|
-          0.upto
+          0.upto(number_of_features - 1) do |i|
            sample_value = sample[i]
            matrix[i] << sample_value
          end