adaboost 0.0.3 → 0.0.4
- checksums.yaml +4 -4
- data/lib/adaboost/adaboost.rb +18 -22
- data/lib/adaboost/contingency_table.rb +3 -3
- data/lib/adaboost/evaluator.rb +9 -9
- data/lib/adaboost/features_analyzer.rb +24 -24
- data/lib/adaboost/resampler.rb +5 -5
- data/lib/adaboost/weak_classifier.rb +4 -4
- data/lib/adaboost/weak_learner.rb +20 -20
- metadata +1 -1
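Every file below changes in the same way: the 0.0.3 release shipped with method definitions and call sites stripped of their parameters, and 0.0.4 restores them (and adds messages to the ArgumentError raises). A minimal usage sketch of the restored public API, assuming the classes are exposed under the AdaBoost module as the hunks suggest, that require 'adaboost' loads them, and that the gem's Config defaults need no extra setup; the toy data and y_index choice are hypothetical:

    require 'adaboost'

    # Toy training set: one feature column, then the label column (+1 / -1).
    # y_index = 1 points AdaBoost at the label column.
    samples = [
      [0.2, -1],
      [0.4, -1],
      [1.6,  1],
      [1.9,  1],
    ]

    # Restored 0.0.4 signature: AdaBoost.new(number_of_classifiers, y_index).
    booster = AdaBoost::AdaBoost.new(5, 1)
    booster.train(samples) do |i, weak_classifier|
      # train yields each weak classifier as it is added to the ensemble.
      puts "round #{i}: feature #{weak_classifier.feature_number}, split #{weak_classifier.split}"
    end

    # classify returns the alpha-weighted score; its sign is the predicted class.
    score = booster.classify([1.7, 0])  # label column is ignored at prediction time
    puts score > 0 ? 'positive' : 'negative'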
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dbef461fb8ab7809de5e99ec234e85c002427bd5
+  data.tar.gz: 446d84856754769aeb49e4526de18f4c547d85de
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: de5b31c2367c9459d09d5456684b595eeb8e2fa2b8846ee80911027e66b7a45a779762b464f2bd06365315ba298365727f8cbe1a81f8ebbdb3be31ef98686725
+  data.tar.gz: 8b6157db326c5510329fd3b0b3531ffe7e4bb8fee5fbcd3e5b1b73e36ea17081a0fe809c71e8fa6f2f871fdb238b3edb060c3559c6e196e8194fd18ecf604861
data/lib/adaboost/adaboost.rb
CHANGED
@@ -4,40 +4,40 @@ module AdaBoost
 
     attr_reader :weak_classifiers, :y_index
 
-    def initialize
+    def initialize(number_of_classifiers, y_index)
       @weak_classifiers = []
-      @weak_learner = WeakLearner.new
+      @weak_learner = WeakLearner.new(y_index)
       @number_of_classifiers = number_of_classifiers
       @weights = []
       @y_index = y_index
     end
 
-    def train
+    def train(samples)
       if Config::OVER_SAMPLING_TRAINING_SET
-        resampler = Resampler.new
-        resampler.over_sample
+        resampler = Resampler.new(@y_index)
+        resampler.over_sample(samples)
       end
-      initialize_weights
-      0.upto
-        weak_classifier = @weak_learner.generate_weak_classifier
+      initialize_weights(samples)
+      0.upto(@number_of_classifiers - 1) do |i|
+        weak_classifier = @weak_learner.generate_weak_classifier(samples, @weights)
         weak_classifier.compute_alpha
-        update_weights
+        update_weights(weak_classifier, samples)
         @weak_classifiers << weak_classifier
         yield i, weak_classifier if block_given?
       end
     end
 
-    def classify
+    def classify(sample)
       score = 0.0
       @weak_classifiers.each do |weak_classifier|
-        score += weak_classifier.classify_with_alpha
+        score += weak_classifier.classify_with_alpha(sample)
       end
       score
     end
 
-    def self.build_from_model
+    def self.build_from_model(model, y_index = 0)
       classifiers = model.weak_classifiers
-      adaboost = AdaBoost.new
+      adaboost = AdaBoost.new(classifiers.size, y_index)
       classifiers.each do |classifier|
         adaboost.weak_classifiers << WeakClassifier.new(classifier.feature_number, classifier.split, classifier.alpha)
       end
@@ -46,12 +46,12 @@ module AdaBoost
 
     private
 
-    def initialize_weights
+    def initialize_weights(samples)
       samples_size = samples.size.to_f
       negative_weight = 1 / samples_size
       positive_weight = negative_weight
       if Config::INCORPORATE_COST_SENSITIVE_LEARNING
-        analyzer = FeaturesAnalyzer.new
+        analyzer = FeaturesAnalyzer.new(@y_index)
         distribution = analyzer.analyze(samples).distribution
         positive_rate = distribution.positive / samples_size
         negative_rate = distribution.negative / samples_size
@@ -61,19 +61,15 @@ module AdaBoost
       end
       samples.each_with_index do |sample, i|
         y = sample[@y_index]
-        if y == -1
-          @weights[i] = positive_weight
-        else
-          @weights[i] = negative_weight
-        end
+        @weights[i] = (y == -1) ? positive_weight : negative_weight
       end
     end
 
-    def update_weights
+    def update_weights(weak_classifier, samples)
       sum = 0.0
       samples.each_with_index do |sample, i|
         y = sample[@y_index]
-        @weights[i] *= Math.exp
+        @weights[i] *= Math.exp(-(weak_classifier.alpha) * weak_classifier.classify(sample) * y)
         sum += @weights[i]
       end
       @weights.each_with_index do |_, i|
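The restored update_weights body is the standard AdaBoost re-weighting: each sample's weight is multiplied by exp(-alpha * h(x) * y), so samples the new stump misclassifies gain weight and the rest lose it, and the loop that follows (continued past this hunk) divides by sum to renormalize. A self-contained sketch of the same step on plain arrays; alpha, predictions and labels are hypothetical stand-ins for weak_classifier.alpha, weak_classifier.classify(sample) and sample[y_index]:

    alpha       = 0.5
    predictions = [1, -1,  1, 1]   # stump outputs h(x) for four samples
    labels      = [1, -1, -1, 1]   # true labels y; the third sample is misclassified
    weights     = Array.new(labels.size, 1.0 / labels.size)

    weights = weights.each_with_index.map do |w, i|
      w * Math.exp(-alpha * predictions[i] * labels[i])
    end
    sum = weights.sum
    weights.map! { |w| w / sum }   # normalization, as in the loop after this hunk

    p weights  # the misclassified third sample now carries the largest weight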
data/lib/adaboost/contingency_table.rb
CHANGED
@@ -22,7 +22,7 @@ module AdaBoost
       @table[1][0]
     end
 
-    def add_prediction
+    def add_prediction(y, h)
       @table[class_to_index(y)][class_to_index(h)] += 1
     end
 
@@ -175,8 +175,8 @@ module AdaBoost
       ]
     end
 
-    def class_to_index
-      k > 0 ? 1 : 0
+    def class_to_index(k)
+      (k > 0) ? 1 : 0
     end
   end
 end
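class_to_index(k) maps the gem's two label values onto the table's two indices: positive labels go to 1 and everything else (the -1 class) to 0, so add_prediction(y, h) increments the cell addressed by (actual, predicted). A small sketch of the resulting 2x2 layout, using a hypothetical plain-array table:

    # Hypothetical stand-in for the @table used by ContingencyTable.
    table = [[0, 0], [0, 0]]
    class_to_index = ->(k) { (k > 0) ? 1 : 0 }

    # (actual, predicted) pairs in the gem's {-1, +1} label convention.
    [[1, 1], [1, -1], [-1, -1], [-1, 1]].each do |y, h|
      table[class_to_index.call(y)][class_to_index.call(h)] += 1
    end

    # Row 0 holds actual -1, row 1 actual +1; columns hold the predictions.
    p table  # => [[1, 1], [1, 1]]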
data/lib/adaboost/evaluator.rb
CHANGED
@@ -2,26 +2,26 @@ module AdaBoost
 
   class Evaluator
 
-    def initialize
+    def initialize(classifier)
       @classifier = classifier
       @threshold = Float::MAX
     end
 
-    def evaluate
+    def evaluate(test_set)
       contingency_table = ContingencyTable.new
       test_set.each do |sample|
         y = sample[@classifier.y_index]
-        if Config::USE_THRESHOLD_CLASSIFICATION
-          h = classify_using_threshold
+        h = if Config::USE_THRESHOLD_CLASSIFICATION
+          classify_using_threshold(sample)
         else
-          h = classify_normally
+          classify_normally(sample)
         end
-        contingency_table.add_prediction
+        contingency_table.add_prediction(y, h)
       end
       contingency_table
     end
 
-    def used_feature_numbers
+    def used_feature_numbers(unique = false)
       used_feature_numbers = []
       @classifier.weak_classifiers.each do |weak_classifier|
         used_feature_numbers << weak_classifier.feature_number
@@ -51,11 +51,11 @@ module AdaBoost
       @threshold
     end
 
-    def classify_normally
+    def classify_normally(sample)
       @classifier.classify(sample > 0) ? 1 : -1
     end
 
-    def classify_using_threshold
+    def classify_using_threshold(sample)
       score = 0.0
       @classifier.weak_classifiers.each do |weak_classifier|
         if sample[weak_classifier.feature_number] > weak_classifier.split
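Evaluator ties the previous pieces together: it wraps a trained classifier, turns each test sample into a +1/-1 prediction (via the score threshold or, with classify_normally, the sign of the boosted score, depending on Config::USE_THRESHOLD_CLASSIFICATION), and tallies the outcomes into a ContingencyTable. A hedged sketch of the call sequence, reusing the hypothetical booster from the earlier sketch and assuming a test_set with the same column layout:

    # Restored 0.0.4 signatures: Evaluator.new(classifier), evaluate(test_set).
    test_set  = [[0.3, -1], [1.8, 1]]
    evaluator = AdaBoost::Evaluator.new(booster)
    table     = evaluator.evaluate(test_set)   # returns the filled ContingencyTable

    # Feature numbers the ensemble actually used, as listed by the diff's method.
    p evaluator.used_feature_numbers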
data/lib/adaboost/features_analyzer.rb
CHANGED
@@ -7,45 +7,45 @@ module AdaBoost
 
   class FeaturesAnalyzer
 
-    def initialize
+    def initialize(y_index)
       @y_index = y_index
     end
 
-    def analyze
+    def analyze(samples)
 
       statistics = []
-      distribution = Distribution.new
+      distribution = Distribution.new(0, 0)
       number_of_samples = samples.size
 
       if number_of_samples < 1
-        raise ArgumentError.new
+        raise ArgumentError.new('At least one sample is needed to analyze.')
       end
       number_of_features = @y_index
       sample_size = samples[0].size
       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
-        raise ArgumentError.new
+        raise ArgumentError.new('At least 1 feature is needed to analyze.')
       end
-      0.upto
-
+      0.upto(number_of_features - 1) do
+        statistics << FeatureStatistic.new(Float::MAX, -Float::MAX, 0, 0, 0, 0)
       end
       samples.each do |sample|
-
-
-
-
-
+        y = sample[@y_index]
+        if y == -1
+          distribution.negative += 1
+        else
+          distribution.positive += 1
+        end
+        0.upto(number_of_features - 1) do |i|
+          statistic = statistics[i]
+          feature_value = sample[i]
+          if feature_value < statistic.min
+            statistic.min = feature_value
          end
-
-
-        feature_value = sample[i]
-        if feature_value < statistic.min
-          statistic.min = feature_value
-        end
-        if feature_value > statistic.max
-          statistic.max = feature_value
-        end
-        statistic.sum += feature_value
+          if feature_value > statistic.max
+            statistic.max = feature_value
          end
+          statistic.sum += feature_value
+        end
       end
       statistics.each do |statistic|
         statistic.avg = statistic.sum / number_of_samples.to_f
@@ -67,7 +67,7 @@ module AdaBoost
       analyze
     end
 
-    def relations
+    def relations(x, y, samples, statistics)
       sum = 0.0
       samples.each do |sample|
         x_value = sample[x].to_f
@@ -76,7 +76,7 @@ module AdaBoost
       end
       cov = sum / (samples.size - 1).to_f
       cor = cov / (statistics[x].std * statistics[y].std).to_f
-      VariableRelations.new
+      VariableRelations.new(x, y, cov, cor)
     end
   end
 end
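FeaturesAnalyzer.new(y_index).analyze(samples) makes a single pass over the sample matrix, counting the -1/+1 class split into a Distribution and accumulating per-feature min, max, sum and then avg into FeatureStatistic records; the rest of the gem reads those through .distribution and .statistics. A hedged usage sketch with the same hypothetical toy data as above:

    analyzer = AdaBoost::FeaturesAnalyzer.new(1)   # label lives in column 1
    result   = analyzer.analyze(samples)

    puts "negatives: #{result.distribution.negative}, positives: #{result.distribution.positive}"
    result.statistics.each_with_index do |stat, i|
      puts "feature #{i}: min=#{stat.min} max=#{stat.max} avg=#{stat.avg}"
    end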
data/lib/adaboost/resampler.rb
CHANGED
@@ -2,12 +2,12 @@ module AdaBoost
 
   class Resampler
 
-    def initialize
+    def initialize(y_index)
       @y_index = y_index
     end
 
-    def over_sample
-      distribution = distribution
+    def over_sample(samples)
+      distribution = distribution(samples)
       y0 = distribution.negative
       y1 = distribution.positive
       majority = y0 < y1 ? 1.0 : -1.0
@@ -25,8 +25,8 @@ module AdaBoost
 
     private
 
-    def distribution
-      analyzer = FeaturesAnalyzer.new
+    def distribution(instances)
+      analyzer = FeaturesAnalyzer.new(@y_index)
       analyzer.analyze(instances).distribution
     end
   end
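Only the head of over_sample appears in this hunk; the call site in AdaBoost#train (resampler.over_sample(samples) with no assignment) suggests it balances the classes by appending duplicated minority-class rows to the passed array. Purely as an illustration of that idea, not the gem's code, a hypothetical balancing step:

    # Illustrative only: duplicate random minority rows until both classes match.
    y_index = 1
    negatives, positives = samples.partition { |s| s[y_index] == -1 }
    minority, majority = [negatives, positives].sort_by(&:size)
    minority << minority.sample.dup while minority.size < majority.size
    balanced = majority + minority
    p balanced.size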
data/lib/adaboost/weak_classifier.rb
CHANGED
@@ -5,7 +5,7 @@ module AdaBoost
     attr_accessor :error
     attr_reader :feature_number, :split, :alpha
 
-    def initialize
+    def initialize(feature_number, split, alpha = 0.0, error = 0.0)
       @feature_number = feature_number
       @split = split
       @error = error
@@ -16,15 +16,15 @@ module AdaBoost
       @alpha = 0.5 * Math.log((1.0 - @error) / @error)
     end
 
-    def classify
+    def classify(sample)
       sample[@feature_number] > @split ? 1 : -1
     end
 
-    def classify_with_alpha
+    def classify_with_alpha(sample)
       return classify(sample) * @alpha
     end
 
-    def increase_error
+    def increase_error(amount)
       @error += amount
     end
   end
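A WeakClassifier is a decision stump on a single feature: classify(sample) returns +1 when the value at feature_number exceeds split and -1 otherwise, compute_alpha turns the accumulated weighted error into the vote weight 0.5 * ln((1 - error) / error), and classify_with_alpha scales the stump's vote by that alpha. A quick sketch of the stump behaviour with hypothetical numbers:

    # Restored 0.0.4 constructor: WeakClassifier.new(feature_number, split, alpha, error).
    stump = AdaBoost::WeakClassifier.new(0, 1.0)

    p stump.classify([0.4, -1])   # => -1, feature 0 is below the split
    p stump.classify([1.6,  1])   # =>  1, feature 0 is above the split

    stump.increase_error(0.3)     # weighted error accumulated by WeakLearner
    stump.compute_alpha           # alpha = 0.5 * ln(0.7 / 0.3)
    p stump.classify_with_alpha([1.6, 1])  # the +1 vote scaled by alpha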
data/lib/adaboost/weak_learner.rb
CHANGED
@@ -2,31 +2,31 @@ module AdaBoost
 
   class WeakLearner
 
-    def initialize
+    def initialize(y_index)
       @y_index = y_index
-      @analyzer = FeaturesAnalyzer.new
+      @analyzer = FeaturesAnalyzer.new(y_index)
       @classifiers_cache = []
     end
 
-    def features_satistics
+    def features_satistics(samples)
       @analyzer.analyze(samples).statistics
     end
 
-    def generate_weak_classifier
+    def generate_weak_classifier(samples, weights)
      number_of_samples = samples.size
       if number_of_samples < 1
-        raise ArgumentError.new
+        raise ArgumentError.new('At least one sample is needed to generate.')
       end
       number_of_features = @y_index
       sample_size = samples[0].size
       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
-        raise ArgumentError.new
+        raise ArgumentError.new('At least 1 feature is needed to generate.')
       end
       classifiers = []
       if Config::USE_RANDOM_WEAK_CLASSIFIERS
-        classifiers = generate_random_classifiers
+        classifiers = generate_random_classifiers(samples, number_of_features)
       else
-        classifiers = generate_all_possible_classifiers
+        classifiers = generate_all_possible_classifiers(samples, number_of_features)
       end
       best_index = -1
       best_error = Float::MAX
@@ -35,7 +35,7 @@ module AdaBoost
       samples.each_with_index do |sample, j|
         y = sample[@y_index]
         if classifier.classify(sample).to_f != y
-          classifier.increase_error
+          classifier.increase_error(weights[j])
         end
       end
       if classifier.error < best_error
@@ -45,33 +45,33 @@ module AdaBoost
       end
       best = classifiers[best_index]
       if !Config::USE_RANDOM_WEAK_CLASSIFIERS
-        classifiers.delete_at
+        classifiers.delete_at(best_index)
       end
       best
     end
 
     private
 
-    def generate_random_classifiers
+    def generate_random_classifiers(samples, number_of_features)
       classifiers = []
-      statistics = features_satistics
-      0.upto
-
-
-
-
+      statistics = features_satistics(samples)
+      0.upto(Config::NUMBER_OF_RANDOM_CLASSIFIERS - 1) do
+        feature_number = rand(number_of_features)
+        info = statistics[feature_number]
+        split = rand * info.rng + info.min
+        classifiers << WeakClassifier.new(feature_number, split)
       end
       classifiers
     end
 
-    def generate_all_possible_classifiers
+    def generate_all_possible_classifiers(samples, number_of_features)
       if @classifiers_cache.size == 0
         matrix = []
-        0.upto
+        0.upto(number_of_features - 1) do
           matrix << []
         end
         samples.each do |sample|
-          0.upto
+          0.upto(number_of_features - 1) do |i|
             sample_value = sample[i]
             matrix[i] << sample_value
           end