adaboost 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2144b2d6abb1c701c8dbf955673a9a5ffd96e00e
4
+ data.tar.gz: 74dbd52e82039ca79e2dcfa3ef8c2b2ac20bd3e7
5
+ SHA512:
6
+ metadata.gz: 035ca3856d5343afde2f968f43cc5de165f0f69fb1f8284c87fe2ba4525599221eb711ffeced58346ddbfbf2b89f640a240fbf336c3c2e367d6e6f99826a932b
7
+ data.tar.gz: d6f6adc39fa2327ea37e986f759976d83c9c0efda519f429af0d9325927fffe9dc8c540b84556ba890434fd59674b550191dd7d1f8df8ad9e39759795e697be8
@@ -0,0 +1,74 @@
1
module AdaBoost

  # AdaBoost ensemble classifier: trains a sequence of weak classifiers,
  # re-weighting the training samples after each round so later learners
  # focus on previously misclassified samples.
  class AdaBoost

    attr_reader :weak_classifiers, :y_index

    # number_of_classifiers: number of boosting rounds to run.
    # y_index: index of the label (-1/+1) inside each sample array.
    def initialize number_of_classifiers, y_index
      @weak_classifiers = []
      @weak_learner = WeakLearner.new y_index
      @number_of_classifiers = number_of_classifiers
      @weights = []
      @y_index = y_index
    end

    # Assigns the initial sample weights. By default every sample gets 1/N;
    # with cost-sensitive learning enabled, each class's total weight is
    # balanced by the other class's rate so rare classes are not swamped.
    def initialize_weights samples
      samples_size = samples.size.to_f
      negative_weight = 1 / samples_size
      positive_weight = negative_weight
      if Config::INCORPORATE_COST_SENSITIVE_LEARNING
        analyzer = FeaturesAnalyzer.new @y_index
        distribution = analyzer.analyze(samples).distribution
        positive_rate = distribution.positive / samples_size
        negative_rate = distribution.negative / samples_size
        normalizing_constant = distribution.negative * positive_rate + distribution.positive * negative_rate
        positive_weight = positive_rate / normalizing_constant.to_f
        negative_weight = negative_rate / normalizing_constant.to_f
      end
      samples.each_with_index do |sample, i|
        y = sample[@y_index]
        # NOTE: negative samples intentionally receive positive_weight (and
        # vice versa): N_neg * positive_rate == N_pos * negative_rate, which
        # gives both classes equal total weight after normalization.
        if y == -1
          @weights[i] = positive_weight
        else
          @weights[i] = negative_weight
        end
      end
    end

    # Standard AdaBoost re-weighting: misclassified samples gain weight,
    # correctly classified ones lose it; weights are renormalized to sum 1.
    def update_weights weak_classifier, samples
      sum = 0.0
      samples.each_with_index do |sample, i|
        y = sample[@y_index]
        @weights[i] *= Math.exp -(weak_classifier.alpha) * weak_classifier.classify(sample) * y
        sum += @weights[i]
      end
      @weights.each_with_index do |_, i|
        @weights[i] /= sum
      end
    end

    # Trains the configured number of weak classifiers on +samples+.
    # (Stray debug `puts` statements removed.)
    def train samples
      if Config::OVER_SAMPLING_TRAINING_SET
        resampler = Resampler.new @y_index
        resampler.over_sample samples
      end
      initialize_weights samples
      0.upto @number_of_classifiers - 1 do
        weak_classifier = @weak_learner.generate_weak_classifier samples, @weights
        weak_classifier.compute_alpha
        update_weights weak_classifier, samples
        @weak_classifiers << weak_classifier
      end
    end

    # Weighted vote of all weak classifiers; the sign of the returned score
    # is the predicted class.
    def classify sample
      score = 0.0
      @weak_classifiers.each do |weak_classifier|
        score += weak_classifier.classify_with_alpha sample
      end
      score
    end
  end
end
@@ -0,0 +1,10 @@
1
module AdaBoost

  # Global tuning flags for the AdaBoost implementation.
  module Config
    # Number of candidate stumps sampled per round when random weak-classifier
    # generation is enabled (see USE_RANDOM_WEAK_CLASSIFIERS).
    NUMBER_OF_RANDOM_CLASSIFIERS = 100
    # When true, initial sample weights are balanced by class distribution
    # (see AdaBoost#initialize_weights).
    INCORPORATE_COST_SENSITIVE_LEARNING = true
    # When true, the minority class is over-sampled before training
    # (see Resampler#over_sample).
    OVER_SAMPLING_TRAINING_SET = false
    # Choose random split candidates instead of exhaustively enumerating
    # every observed feature value (see WeakLearner).
    USE_RANDOM_WEAK_CLASSIFIERS = false
    # Evaluator classifies against a threshold of half the alpha sum
    # instead of the sign of the raw ensemble score.
    USE_THRESHOLD_CLASSIFICATION = true
  end
end
@@ -0,0 +1,182 @@
1
module AdaBoost

  # 2x2 contingency (confusion) table for binary predictions, with the
  # standard derived rates. Rows index the true class, columns the predicted
  # class (0 = negative, 1 = positive).
  class ContingencyTable

    def initialize
      @table = [[0, 0], [0, 0]]
    end

    def true_positive
      @table[1][1]
    end

    def false_positive
      @table[0][1]
    end

    def true_negative
      @table[0][0]
    end

    def false_negative
      @table[1][0]
    end

    # Records one prediction: y is the true label, h the predicted label.
    def add_prediction y, h
      @table[class_to_index(y)][class_to_index(h)] += 1
    end

    def outcome_positive
      true_positive + false_positive
    end

    def outcome_negative
      true_negative + false_negative
    end

    def total_population
      @table[0][0] + @table[0][1] + @table[1][0] + @table[1][1]
    end

    def predicted_condition_positive
      true_positive + false_positive
    end

    def predicted_condition_negative
      false_negative + true_negative
    end

    def condition_positive
      true_positive + false_negative
    end

    def condition_negative
      false_positive + true_negative
    end

    def prevalence
      condition_positive / total_population.to_f
    end

    def true_positive_rate
      true_positive / condition_positive.to_f
    end

    def recall
      true_positive_rate
    end

    def sensitivity
      true_positive_rate
    end

    def false_positive_rate
      false_positive / condition_negative.to_f
    end

    def fall_out
      false_positive_rate
    end

    def false_negative_rate
      false_negative / condition_positive.to_f
    end

    def true_negative_rate
      true_negative / condition_negative.to_f
    end

    def specificity
      true_negative_rate
    end

    def accuracy
      (true_positive + true_negative) / total_population.to_f
    end

    def positive_predictive_value
      true_positive / outcome_positive.to_f
    end

    def precision
      positive_predictive_value
    end

    def false_discovery_rate
      false_positive / outcome_positive.to_f
    end

    def false_omission_rate
      false_negative / outcome_negative.to_f
    end

    def negative_predictive_value
      true_negative / outcome_negative.to_f
    end

    def positive_likelihood_ratio
      true_positive_rate / false_positive_rate.to_f
    end

    def negative_likelihood_ratio
      false_negative_rate / true_negative_rate.to_f
    end

    def diagnostic_odds_ratio
      positive_likelihood_ratio / negative_likelihood_ratio.to_f
    end

    # Human-readable report of all counts and derived rates.
    # BUG FIX: the accuracy line used '\' where '+' was intended, and the
    # positive likelihood ratio label escaped to '(LR)' instead of '(LR+)'.
    def to_s
      format = "\nTotal population: %d\t" \
        "\nCondition positive: %d\t" \
        "\nCondition negative: %d\t" \
        "\nPredicted Condition positive: %d\t" \
        "\nPredicted Condition negative: %d\t" \
        "\nTrue positive: %d\t" \
        "\nTrue negative: %d\t" \
        "\nFalse Negative: %d\t" \
        "\nFalse Positive: %d\t" \
        "\nPrevalence = Σ Condition positive / Σ Total population: %f\t" \
        "\nTrue positive rate (TPR) = Σ True positive / Σ Condition positive: %f\t" \
        "\nFalse positive rate (FPR) = Σ False positive / Σ Condition negative: %f\t" \
        "\nFalse negative rate (FNR) = Σ False negative / Σ Condition positive: %f\t" \
        "\nTrue negative rate (TNR) = Σ True negative / Σ Condition negative: %f\t" \
        "\nAccuracy (ACC) = (Σ True positive + Σ True negative) / Σ Total population: %f\t" \
        "\nPositive predictive value (PPV) = Σ True positive / Σ Test outcome positive: %f\t" \
        "\nFalse discovery rate (FDR) = Σ False positive / Σ Test outcome positive: %f\t" \
        "\nFalse omission rate (FOR) = Σ False negative / Σ Test outcome negative: %f\t" \
        "\nNegative predictive value (NPV) = Σ True negative / Σ Test outcome negative: %f\t" \
        "\nPositive likelihood ratio (LR+) = TPR / FPR: %f\t" \
        "\nNegative likelihood ratio (LR−) = FNR / TNR: %f\t" \
        "\nDiagnostic odds ratio (DOR) = LR+ / LR−: %f\t"
      format %
        [
          total_population,
          condition_positive,
          condition_negative,
          predicted_condition_positive,
          predicted_condition_negative,
          true_positive,
          true_negative,
          false_negative,
          false_positive,
          prevalence,
          true_positive_rate,
          false_positive_rate,
          false_negative_rate,
          true_negative_rate,
          accuracy,
          positive_predictive_value,
          false_discovery_rate,
          false_omission_rate,
          negative_predictive_value,
          positive_likelihood_ratio,
          negative_likelihood_ratio,
          diagnostic_odds_ratio
        ]
    end

    # Maps a label (any positive number) to index 1, everything else to 0.
    def class_to_index k
      k > 0 ? 1 : 0
    end
  end
end
@@ -0,0 +1,66 @@
1
module AdaBoost

  # Evaluates a trained AdaBoost classifier on a test set, producing a
  # ContingencyTable of predictions versus true labels.
  class Evaluator

    def initialize classifier
      @classifier = classifier
      # Float::MAX marks the threshold as not yet computed (lazy init).
      @threshold = Float::MAX
    end

    # Classifies every sample in +test_set+ and tallies the outcomes.
    # Returns the populated ContingencyTable.
    def evaluate test_set
      contingency_table = ContingencyTable.new
      test_set.each do |sample|
        y = sample[@classifier.y_index]
        if Config::USE_THRESHOLD_CLASSIFICATION
          h = classify_using_threshold sample
        else
          # BUG FIX: was `e.classify_normally sample` — `e` was undefined
          # and this branch raised NameError whenever threshold
          # classification was disabled.
          h = classify_normally sample
        end
        contingency_table.add_prediction y, h
      end
      contingency_table
    end

    # Lazily computed decision threshold: half the sum of all alphas,
    # i.e. the midpoint of the ensemble's maximum positive vote mass.
    def threshold
      if @threshold == Float::MAX
        @threshold = 0
        @classifier.weak_classifiers.each do |weak_classifier|
          @threshold += weak_classifier.alpha / 2.0
        end
      end
      @threshold
    end

    # Sums the alphas of stumps voting positive and compares the total
    # against #threshold instead of zero.
    def classify_using_threshold sample
      score = 0.0
      @classifier.weak_classifiers.each do |weak_classifier|
        if sample[weak_classifier.feature_number] > weak_classifier.split
          score += weak_classifier.alpha
        end
      end
      score > threshold ? 1 : -1
    end

    # Sign of the raw ensemble score.
    # BUG FIX: was `@classifier.classify(sample > 0) ? 1 : -1` — the
    # parenthesis was misplaced, so the comparison applied to the sample
    # Array (raising NoMethodError) instead of to the returned score.
    def classify_normally sample
      @classifier.classify(sample) > 0 ? 1 : -1
    end

    # Feature numbers used by the ensemble's stumps, optionally de-duplicated.
    def used_feature_numbers unique = false
      used_feature_numbers = []
      @classifier.weak_classifiers.each do |weak_classifier|
        used_feature_numbers << weak_classifier.feature_number
      end
      unique ? used_feature_numbers.uniq : used_feature_numbers
    end

    # Map of feature number => how many stumps use that feature.
    def feature_occurrences
      used_numbers = used_feature_numbers
      occurrences = {}
      used_numbers.each do |number|
        occurrences[number] = 0 if occurrences[number].nil?
        occurrences[number] += 1
      end
      occurrences
    end
  end
end
@@ -0,0 +1,82 @@
1
module AdaBoost

  # Lightweight value containers shared across the library.
  Analyze = Struct.new(:statistics, :distribution)
  Distribution = Struct.new(:negative, :positive)
  FeatureStatistic = Struct.new(:min, :max, :sum, :avg, :vrn, :std, :rng)
  VariableRelations = Struct.new(:x, :y, :cov, :cor)

  # Computes per-feature descriptive statistics and the class distribution
  # of a labeled sample set. Features occupy indexes 0...y_index; the
  # label (-1/+1) sits at y_index.
  class FeaturesAnalyzer

    def initialize y_index
      @y_index = y_index
    end

    # Returns an Analyze struct holding one FeatureStatistic per feature
    # (min, max, sum, mean, sample variance, standard deviation, range)
    # plus the negative/positive sample counts.
    # Raises ArgumentError for an empty sample set or too-narrow samples.
    def analyze samples
      total = samples.size
      raise ArgumentError.new 'At least one sample is needed to analyze.' if total < 1
      feature_count = @y_index
      width = samples[0].size
      if feature_count < 1 || width < 2 || width <= @y_index
        raise ArgumentError.new 'At least 1 feature is needed to analyze.'
      end
      stats = Array.new(feature_count) do
        FeatureStatistic.new(Float::MAX, -Float::MAX, 0, 0, 0, 0)
      end
      tally = Distribution.new 0, 0
      # First pass: class counts plus min / max / sum per feature.
      samples.each do |sample|
        if sample[@y_index] == -1
          tally.negative += 1
        else
          tally.positive += 1
        end
        stats.each_with_index do |stat, i|
          value = sample[i]
          stat.min = value if value < stat.min
          stat.max = value if value > stat.max
          stat.sum += value
        end
      end
      stats.each do |stat|
        stat.avg = stat.sum / total.to_f
        stat.rng = (stat.max - stat.min).abs
      end
      # Second pass: accumulate squared deviations for the variance.
      samples.each do |sample|
        stats.each_with_index do |stat, i|
          stat.vrn += (stat.avg - sample[i]) ** 2
        end
      end
      stats.each do |stat|
        stat.vrn /= (total - 1).to_f
        stat.std = Math.sqrt stat.vrn
      end
      Analyze.new stats, tally
    end

    # Covariance and Pearson correlation between feature columns x and y,
    # given the per-feature statistics from #analyze.
    def relations x, y, samples, statistics
      acc = 0.0
      samples.each do |sample|
        acc += (sample[x].to_f - statistics[x].avg) * (sample[y].to_f - statistics[y].avg)
      end
      cov = acc / (samples.size - 1).to_f
      cor = cov / (statistics[x].std * statistics[y].std).to_f
      VariableRelations.new x, y, cov, cor
    end
  end
end
@@ -0,0 +1,31 @@
1
module AdaBoost

  # Balances a training set by duplicating minority-class samples in place.
  class Resampler

    def initialize y_index
      @y_index = y_index
    end

    # Appends references to minority-class samples onto +samples+ until both
    # classes are equally represented.
    # BUG FIX: the original appended to +samples+ while iterating it with
    # #each, so the freshly appended copies were themselves visited and
    # re-duplicated — early minority samples could be copied many times while
    # later ones were never reached. We now snapshot the minority samples
    # first and cycle through them, spreading duplicates evenly.
    def over_sample samples
      dist = distribution samples
      difference = (dist.negative - dist.positive).abs
      return if difference <= 0
      # Majority label: positive when positives outnumber negatives.
      majority = dist.negative < dist.positive ? 1.0 : -1.0
      minority_samples = samples.reject { |sample| sample[@y_index] == majority }
      return if minority_samples.empty?
      difference.times do |i|
        samples << minority_samples[i % minority_samples.size]
      end
    end

    # Class distribution of +instances+, computed via FeaturesAnalyzer.
    def distribution instances
      analyzer = FeaturesAnalyzer.new @y_index
      analyzer.analyze(instances).distribution
    end
  end
end
@@ -0,0 +1,31 @@
1
module AdaBoost

  # A decision stump: predicts +1 when a single feature exceeds its split
  # value and -1 otherwise. Carries the weighted training error and the
  # alpha (vote weight) assigned during boosting.
  class WeakClassifier

    attr_accessor :error
    attr_reader :feature_number, :split, :alpha

    def initialize feature_number, split
      @feature_number = feature_number
      @split = split
      @error = 0.0
      @alpha = 0.0
    end

    # Vote weight derived from the weighted error: 0.5 * ln((1 - e) / e).
    def compute_alpha
      @alpha = 0.5 * Math.log((1.0 - @error) / @error)
    end

    # Stump decision: +1 strictly above the split, -1 at or below it.
    def classify sample
      if sample[@feature_number] > @split
        1
      else
        -1
      end
    end

    # Stump decision scaled by this stump's vote weight.
    def classify_with_alpha sample
      classify(sample) * @alpha
    end

    # Accumulates weighted error mass from one misclassified sample.
    def increase_error amount
      @error += amount
    end
  end
end
@@ -0,0 +1,87 @@
1
module AdaBoost

  # Produces candidate decision stumps (WeakClassifier) and selects, per
  # boosting round, the stump with the lowest weighted error.
  class WeakLearner

    def initialize y_index
      @y_index = y_index
      @analyzer = FeaturesAnalyzer.new y_index
      # Exhaustively enumerated candidate stumps, built once on first use
      # and reused across rounds (chosen stumps are removed as picked).
      @classifiers_cache = []
    end

    # Per-feature statistics for +samples+.
    # NOTE: the misspelled name is kept for backward compatibility; prefer
    # the #features_statistics alias in new code.
    def features_satistics samples
      @analyzer.analyze(samples).statistics
    end
    alias_method :features_statistics, :features_satistics

    # Returns the candidate stump with the smallest weighted error on
    # +samples+ under the current +weights+.
    # Raises ArgumentError when there are no samples or no features.
    def generate_weak_classifier samples, weights
      number_of_samples = samples.size
      if number_of_samples < 1
        raise ArgumentError.new 'At least one sample is needed to generate.'
      end
      number_of_features = @y_index
      sample_size = samples[0].size
      if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
        raise ArgumentError.new 'At least 1 feature is needed to generate.'
      end
      # (Dead `classifiers = []` initializer removed — the branch always
      # assigns the list.)
      classifiers =
        if Config::USE_RANDOM_WEAK_CLASSIFIERS
          generate_random_classifiers samples, number_of_features
        else
          generate_all_possible_classifiers samples, number_of_features
        end
      best_index = -1
      best_error = Float::MAX
      classifiers.each_with_index do |classifier, i|
        classifier.error = 0.0
        samples.each_with_index do |sample, j|
          y = sample[@y_index]
          if classifier.classify(sample) != y
            classifier.increase_error weights[j]
          end
        end
        if classifier.error < best_error
          best_error = classifier.error
          best_index = i
        end
      end
      best = classifiers[best_index]
      # Exhaustive mode reuses the cache, so remove the chosen stump to
      # keep it from being selected again in a later round.
      if !Config::USE_RANDOM_WEAK_CLASSIFIERS
        classifiers.delete_at best_index
      end
      best
    end

    # Random candidate stumps: a random feature with a split drawn
    # uniformly from that feature's observed value range.
    def generate_random_classifiers samples, number_of_features
      classifiers = []
      statistics = features_satistics samples
      0.upto Config::NUMBER_OF_RANDOM_CLASSIFIERS - 1 do
        feature_number = rand number_of_features
        info = statistics[feature_number]
        split = rand * info.rng + info.min
        classifiers << WeakClassifier.new(feature_number, split)
      end
      classifiers
    end

    # One candidate stump per unique observed value of every feature.
    # Built lazily on first call and cached for subsequent rounds.
    def generate_all_possible_classifiers samples, number_of_features
      if @classifiers_cache.empty?
        columns = Array.new(number_of_features) { [] }
        samples.each do |sample|
          0.upto number_of_features - 1 do |i|
            columns[i] << sample[i]
          end
        end
        columns.each_with_index do |column, i|
          column.uniq.each do |uniq_value|
            @classifiers_cache << WeakClassifier.new(i, uniq_value)
          end
        end
      end
      @classifiers_cache
    end
  end
end
data/lib/adaboost.rb ADDED
@@ -0,0 +1,11 @@
1
# Entry point for the adaboost gem: loads every component and declares the
# top-level namespace. The '.rb' extensions are omitted per convention;
# Kernel#require resolves and de-duplicates feature names identically
# with or without the suffix.
require 'adaboost/adaboost'
require 'adaboost/config'
require 'adaboost/contingency_table'
require 'adaboost/evaluator'
require 'adaboost/features_analyzer'
require 'adaboost/resampler'
require 'adaboost/weak_classifier'
require 'adaboost/weak_learner'

module AdaBoost
end
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: adaboost
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Dalmir da Silva
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-07-25 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: AdaBoost classifier!
14
+ email: dalmirdasilva@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/adaboost.rb
20
+ - lib/adaboost/adaboost.rb
21
+ - lib/adaboost/config.rb
22
+ - lib/adaboost/contingency_table.rb
23
+ - lib/adaboost/evaluator.rb
24
+ - lib/adaboost/features_analyzer.rb
25
+ - lib/adaboost/resampler.rb
26
+ - lib/adaboost/weak_classifier.rb
27
+ - lib/adaboost/weak_learner.rb
28
+ homepage: http://dalmirdasilva.com/adaboost-classifier
29
+ licenses:
30
+ - MIT
31
+ metadata: {}
32
+ post_install_message:
33
+ rdoc_options: []
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ requirements: []
47
+ rubyforge_project:
48
+ rubygems_version: 2.5.1
49
+ signing_key:
50
+ specification_version: 4
51
+ summary: AdaBoost classifier!
52
+ test_files: []