adaboost 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 2144b2d6abb1c701c8dbf955673a9a5ffd96e00e
+   data.tar.gz: 74dbd52e82039ca79e2dcfa3ef8c2b2ac20bd3e7
+ SHA512:
+   metadata.gz: 035ca3856d5343afde2f968f43cc5de165f0f69fb1f8284c87fe2ba4525599221eb711ffeced58346ddbfbf2b89f640a240fbf336c3c2e367d6e6f99826a932b
+   data.tar.gz: d6f6adc39fa2327ea37e986f759976d83c9c0efda519f429af0d9325927fffe9dc8c540b84556ba890434fd59674b550191dd7d1f8df8ad9e39759795e697be8
data/lib/adaboost/adaboost.rb ADDED
@@ -0,0 +1,74 @@
+ module AdaBoost
+
+   class AdaBoost
+
+     attr_reader :weak_classifiers, :y_index
+
+     def initialize number_of_classifiers, y_index
+       @weak_classifiers = []
+       @weak_learner = WeakLearner.new y_index
+       @number_of_classifiers = number_of_classifiers
+       @weights = []
+       @y_index = y_index
+     end
+
+     def initialize_weights samples
+       samples_size = samples.size.to_f
+       negative_weight = 1 / samples_size
+       positive_weight = negative_weight
+       if Config::INCORPORATE_COST_SENSITIVE_LEARNING
+         analyzer = FeaturesAnalyzer.new @y_index
+         distribution = analyzer.analyze(samples).distribution
+         positive_rate = distribution.positive / samples_size
+         negative_rate = distribution.negative / samples_size
+         normalizing_constant = distribution.negative * positive_rate + distribution.positive * negative_rate
+         positive_weight = positive_rate / normalizing_constant.to_f
+         negative_weight = negative_rate / normalizing_constant.to_f
+       end
+       # Each negative sample receives the positive-rate-based weight and vice
+       # versa, so the minority class gets the larger share of initial weight.
+       samples.each_with_index do |sample, i|
+         y = sample[@y_index]
+         if y == -1
+           @weights[i] = positive_weight
+         else
+           @weights[i] = negative_weight
+         end
+       end
+     end
+
+     def update_weights weak_classifier, samples
+       sum = 0.0
+       samples.each_with_index do |sample, i|
+         y = sample[@y_index]
+         @weights[i] *= Math.exp(-weak_classifier.alpha * weak_classifier.classify(sample) * y)
+         sum += @weights[i]
+       end
+       @weights.each_with_index do |_, i|
+         @weights[i] /= sum
+       end
+     end
+
+     def train samples
+       if Config::OVER_SAMPLING_TRAINING_SET
+         resampler = Resampler.new @y_index
+         resampler.over_sample samples
+       end
+       initialize_weights samples
+       0.upto @number_of_classifiers - 1 do
+         weak_classifier = @weak_learner.generate_weak_classifier samples, @weights
+         weak_classifier.compute_alpha
+         update_weights weak_classifier, samples
+         @weak_classifiers << weak_classifier
+       end
+     end
+
+     def classify sample
+       score = 0.0
+       @weak_classifiers.each do |weak_classifier|
+         score += weak_classifier.classify_with_alpha sample
+       end
+       score
+     end
+   end
+ end
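
For orientation, here is a minimal usage sketch of the class above. The sample layout and values are invented for illustration: each sample is a plain array of numeric features with a +1/-1 label stored at y_index (index 2 here), and the toy data is deliberately not separable by a single threshold so that no stump reaches zero weighted error.

    require 'adaboost'

    # Invented toy data: [feature_0, feature_1, label], label at index 2.
    samples = [
      [0.2, 2.0,  1],
      [0.5, 1.9, -1],
      [1.0, 0.9,  1],
      [1.2, 1.5, -1],
      [1.5, 0.8, -1],
      [2.0, 0.3,  1],
    ]

    # Boost 5 decision stumps; 2 is the label's index within each sample.
    booster = AdaBoost::AdaBoost.new 5, 2
    booster.train samples

    # classify returns the signed ensemble score sum(alpha_t * h_t(x));
    # its sign is the predicted class.
    score = booster.classify [1.8, 0.4, nil]
    puts score > 0 ? '+1' : '-1'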
data/lib/adaboost/config.rb ADDED
@@ -0,0 +1,10 @@
+ module AdaBoost
+
+   module Config
+     NUMBER_OF_RANDOM_CLASSIFIERS = 100
+     INCORPORATE_COST_SENSITIVE_LEARNING = true
+     OVER_SAMPLING_TRAINING_SET = false
+     USE_RANDOM_WEAK_CLASSIFIERS = false
+     USE_THRESHOLD_CLASSIFICATION = true
+   end
+ end
data/lib/adaboost/contingency_table.rb ADDED
@@ -0,0 +1,182 @@
+ module AdaBoost
+
+   class ContingencyTable
+
+     def initialize
+       @table = [[0, 0], [0, 0]]
+     end
+
+     def true_positive
+       @table[1][1]
+     end
+
+     def false_positive
+       @table[0][1]
+     end
+
+     def true_negative
+       @table[0][0]
+     end
+
+     def false_negative
+       @table[1][0]
+     end
+
+     def add_prediction y, h
+       @table[class_to_index(y)][class_to_index(h)] += 1
+     end
+
+     def outcome_positive
+       true_positive + false_positive
+     end
+
+     def outcome_negative
+       true_negative + false_negative
+     end
+
+     def total_population
+       @table[0][0] + @table[0][1] + @table[1][0] + @table[1][1]
+     end
+
+     def predicted_condition_positive
+       true_positive + false_positive
+     end
+
+     def predicted_condition_negative
+       false_negative + true_negative
+     end
+
+     def condition_positive
+       true_positive + false_negative
+     end
+
+     def condition_negative
+       false_positive + true_negative
+     end
+
+     def prevalence
+       condition_positive / total_population.to_f
+     end
+
+     def true_positive_rate
+       true_positive / condition_positive.to_f
+     end
+
+     def recall
+       true_positive_rate
+     end
+
+     def sensitivity
+       true_positive_rate
+     end
+
+     def false_positive_rate
+       false_positive / condition_negative.to_f
+     end
+
+     def fall_out
+       false_positive_rate
+     end
+
+     def false_negative_rate
+       false_negative / condition_positive.to_f
+     end
+
+     def true_negative_rate
+       true_negative / condition_negative.to_f
+     end
+
+     def specificity
+       true_negative_rate
+     end
+
+     def accuracy
+       (true_positive + true_negative) / total_population.to_f
+     end
+
+     def positive_predictive_value
+       true_positive / outcome_positive.to_f
+     end
+
+     def precision
+       positive_predictive_value
+     end
+
+     def false_discovery_rate
+       false_positive / outcome_positive.to_f
+     end
+
+     def false_omission_rate
+       false_negative / outcome_negative.to_f
+     end
+
+     def negative_predictive_value
+       true_negative / outcome_negative.to_f
+     end
+
+     def positive_likelihood_ratio
+       true_positive_rate / false_positive_rate.to_f
+     end
+
+     def negative_likelihood_ratio
+       false_negative_rate / true_negative_rate.to_f
+     end
+
+     def diagnostic_odds_ratio
+       positive_likelihood_ratio / negative_likelihood_ratio.to_f
+     end
+
+     def to_s
+       "\nTotal population: %d\t \
+       \nCondition positive: %d\t \
+       \nCondition negative: %d\t \
+       \nPredicted Condition positive: %d\t \
+       \nPredicted Condition negative: %d\t \
+       \nTrue positive: %d\t \
+       \nTrue negative: %d\t \
+       \nFalse Negative: %d\t \
+       \nFalse Positive: %d\t \
+       \nPrevalence = Σ Condition positive / Σ Total population: %f\t \
+       \nTrue positive rate (TPR) = Σ True positive / Σ Condition positive: %f\t \
+       \nFalse positive rate (FPR) = Σ False positive / Σ Condition negative: %f\t \
+       \nFalse negative rate (FNR) = Σ False negative / Σ Condition positive: %f\t \
+       \nTrue negative rate (TNR) = Σ True negative / Σ Condition negative: %f\t \
+       \nAccuracy (ACC) = Σ True positive + Σ True negative / Σ Total population: %f\t \
+       \nPositive predictive value (PPV) = Σ True positive / Σ Test outcome positive: %f\t \
+       \nFalse discovery rate (FDR) = Σ False positive / Σ Test outcome positive: %f\t \
+       \nFalse omission rate (FOR) = Σ False negative / Σ Test outcome negative: %f\t \
+       \nNegative predictive value (NPV) = Σ True negative / Σ Test outcome negative: %f\t \
+       \nPositive likelihood ratio (LR+) = TPR / FPR: %f\t \
+       \nNegative likelihood ratio (LR−) = FNR / TNR: %f\t \
+       \nDiagnostic odds ratio (DOR) = LR+ / LR−: %f\t" %
+       [
+         total_population,
+         condition_positive,
+         condition_negative,
+         predicted_condition_positive,
+         predicted_condition_negative,
+         true_positive,
+         true_negative,
+         false_negative,
+         false_positive,
+         prevalence,
+         true_positive_rate,
+         false_positive_rate,
+         false_negative_rate,
+         true_negative_rate,
+         accuracy,
+         positive_predictive_value,
+         false_discovery_rate,
+         false_omission_rate,
+         negative_predictive_value,
+         positive_likelihood_ratio,
+         negative_likelihood_ratio,
+         diagnostic_odds_ratio
+       ]
+     end
+
+     def class_to_index k
+       k > 0 ? 1 : 0
+     end
+   end
+ end
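
A small worked example of the table above, with invented predictions; class_to_index folds any y > 0 into row/column 1, so labels are expected in {-1, +1}.

    table = AdaBoost::ContingencyTable.new

    # Pairs of (true label y, predicted label h), invented for illustration.
    [[1, 1], [1, -1], [-1, -1], [-1, 1], [1, 1]].each do |y, h|
      table.add_prediction y, h
    end

    table.true_positive  # => 2
    table.false_negative # => 1
    table.precision      # => 0.666... (2 TP / 3 predicted positive)
    table.recall         # => 0.666... (2 TP / 3 condition positive)
    table.accuracy       # => 0.6     (3 correct / 5 total)
    puts table           # prints the full to_s metric report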
data/lib/adaboost/evaluator.rb ADDED
@@ -0,0 +1,66 @@
+ module AdaBoost
+
+   class Evaluator
+
+     def initialize classifier
+       @classifier = classifier
+       @threshold = Float::MAX
+     end
+
+     def evaluate test_set
+       contingency_table = ContingencyTable.new
+       test_set.each do |sample|
+         y = sample[@classifier.y_index]
+         if Config::USE_THRESHOLD_CLASSIFICATION
+           h = classify_using_threshold sample
+         else
+           h = classify_normally sample
+         end
+         contingency_table.add_prediction y, h
+       end
+       contingency_table
+     end
+
+     def threshold
+       if @threshold == Float::MAX
+         @threshold = 0
+         @classifier.weak_classifiers.each do |weak_classifier|
+           @threshold += weak_classifier.alpha / 2.0
+         end
+       end
+       @threshold
+     end
+
+     def classify_using_threshold sample
+       score = 0.0
+       @classifier.weak_classifiers.each do |weak_classifier|
+         if sample[weak_classifier.feature_number] > weak_classifier.split
+           score += weak_classifier.alpha
+         end
+       end
+       score > threshold ? 1 : -1
+     end
+
+     def classify_normally sample
+       @classifier.classify(sample) > 0 ? 1 : -1
+     end
+
+     def used_feature_numbers unique = false
+       used_feature_numbers = []
+       @classifier.weak_classifiers.each do |weak_classifier|
+         used_feature_numbers << weak_classifier.feature_number
+       end
+       unique ? used_feature_numbers.uniq : used_feature_numbers
+     end
+
+     def feature_occurrences
+       used_numbers = used_feature_numbers
+       occurrences = {}
+       used_numbers.each do |number|
+         occurrences[number] = 0 if occurrences[number].nil?
+         occurrences[number] += 1
+       end
+       occurrences
+     end
+   end
+ end
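
A sketch of running the evaluator, reusing the `booster` and toy `samples` from the first sketch; evaluating on the training data is for illustration only, a real evaluation would use a held-out set.

    evaluator = AdaBoost::Evaluator.new booster  # booster from the earlier sketch
    table = evaluator.evaluate samples           # returns a ContingencyTable
    puts table.accuracy

    # Which features the ensemble actually thresholds, and how often:
    p evaluator.used_feature_numbers true  # e.g. [0, 1]
    p evaluator.feature_occurrences        # e.g. {0 => 3, 1 => 2}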
data/lib/adaboost/features_analyzer.rb ADDED
@@ -0,0 +1,82 @@
+ module AdaBoost
+
+   Analyze = Struct.new(:statistics, :distribution)
+   Distribution = Struct.new(:negative, :positive)
+   FeatureStatistic = Struct.new(:min, :max, :sum, :avg, :vrn, :std, :rng)
+   VariableRelations = Struct.new(:x, :y, :cov, :cor)
+
+   class FeaturesAnalyzer
+
+     def initialize y_index
+       @y_index = y_index
+     end
+
+     def analyze samples
+
+       statistics = []
+       distribution = Distribution.new 0, 0
+       number_of_samples = samples.size
+
+       if number_of_samples < 1
+         raise ArgumentError.new 'At least one sample is needed to analyze.'
+       end
+       number_of_features = @y_index
+       sample_size = samples[0].size
+       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
+         raise ArgumentError.new 'At least 1 feature is needed to analyze.'
+       end
+       0.upto number_of_features - 1 do
+         statistics << FeatureStatistic.new(Float::MAX, -Float::MAX, 0, 0, 0, 0, 0)
+       end
+       samples.each do |sample|
+         y = sample[@y_index]
+         if y == -1
+           distribution.negative += 1
+         else
+           distribution.positive += 1
+         end
+         0.upto number_of_features - 1 do |i|
+           statistic = statistics[i]
+           feature_value = sample[i]
+           if feature_value < statistic.min
+             statistic.min = feature_value
+           end
+           if feature_value > statistic.max
+             statistic.max = feature_value
+           end
+           statistic.sum += feature_value
+         end
+       end
+       statistics.each do |statistic|
+         statistic.avg = statistic.sum / number_of_samples.to_f
+         statistic.rng = (statistic.max - statistic.min).abs
+       end
+       samples.each do |sample|
+         statistics.each_with_index do |statistic, i|
+           feature_value = sample[i]
+           statistic.vrn += (statistic.avg - feature_value) ** 2
+         end
+       end
+       statistics.each do |statistic|
+         statistic.vrn /= (number_of_samples - 1).to_f
+         statistic.std = Math.sqrt statistic.vrn
+       end
+       analyze = Analyze.new
+       analyze.statistics = statistics
+       analyze.distribution = distribution
+       analyze
+     end
+
+     def relations x, y, samples, statistics
+       sum = 0.0
+       samples.each do |sample|
+         x_value = sample[x].to_f
+         y_value = sample[y].to_f
+         sum += (x_value - statistics[x].avg) * (y_value - statistics[y].avg)
+       end
+       cov = sum / (samples.size - 1).to_f
+       cor = cov / (statistics[x].std * statistics[y].std).to_f
+       VariableRelations.new x, y, cov, cor
+     end
+   end
+ end
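
The analyzer can also be used standalone; a sketch with three invented samples (label at index 2, so two features are summarized):

    analyzer = AdaBoost::FeaturesAnalyzer.new 2
    result = analyzer.analyze [[0.2, 2.0, 1], [0.5, 1.9, -1], [1.0, 0.9, 1]]

    result.distribution.positive  # => 2
    result.distribution.negative  # => 1

    stat = result.statistics[0]   # min/max/sum/avg/vrn/std/rng for feature 0
    stat.min                      # => 0.2
    stat.avg                      # => 0.5666...
    stat.rng                      # => 0.8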
data/lib/adaboost/resampler.rb ADDED
@@ -0,0 +1,31 @@
+ module AdaBoost
+
+   class Resampler
+
+     def initialize y_index
+       @y_index = y_index
+     end
+
+     def over_sample samples
+       distribution = distribution samples
+       y0 = distribution.negative
+       y1 = distribution.positive
+       majority = y0 < y1 ? 1.0 : -1.0
+       difference = (y0 - y1).abs
+       samples.each do |sample|
+         if difference <= 0
+           break
+         end
+         if sample[@y_index] != majority
+           samples << sample
+           difference -= 1
+         end
+       end
+     end
+
+     def distribution instances
+       analyzer = FeaturesAnalyzer.new @y_index
+       analyzer.analyze(instances).distribution
+     end
+   end
+ end
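
over_sample mutates the array in place, appending (shallow) duplicates of minority-class rows until the classes balance. A sketch with an invented 3-vs-1 split, label at index 1:

    samples = [[0.1, 1], [0.2, -1], [0.3, -1], [0.4, -1]]
    AdaBoost::Resampler.new(1).over_sample samples
    samples.size  # => 6: the lone +1 row is appended twice to match the
                  #    three -1 rows (the copies reference the same array)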
data/lib/adaboost/weak_classifier.rb ADDED
@@ -0,0 +1,31 @@
+ module AdaBoost
+
+   class WeakClassifier
+
+     attr_accessor :error
+     attr_reader :feature_number, :split, :alpha
+
+     def initialize feature_number, split
+       @feature_number = feature_number
+       @split = split
+       @error = 0.0
+       @alpha = 0.0
+     end
+
+     def compute_alpha
+       @alpha = 0.5 * Math.log((1.0 - @error) / @error)
+     end
+
+     def classify sample
+       sample[@feature_number] > @split ? 1 : -1
+     end
+
+     def classify_with_alpha sample
+       classify(sample) * @alpha
+     end
+
+     def increase_error amount
+       @error += amount
+     end
+   end
+ end
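
compute_alpha is the standard AdaBoost vote weight α = ½·ln((1 − ε)/ε); a worked instance with an invented stump and error (note ε must lie strictly between 0 and 1 for the logarithm to be finite):

    stump = AdaBoost::WeakClassifier.new 0, 1.5  # threshold feature 0 at 1.5
    stump.increase_error 0.2                     # weighted error e = 0.2
    stump.compute_alpha                          # 0.5 * ln(0.8 / 0.2) ~= 0.693

    stump.classify [2.0, -1]             # => 1 (2.0 > 1.5)
    stump.classify_with_alpha [2.0, -1]  # => ~0.693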
data/lib/adaboost/weak_learner.rb ADDED
@@ -0,0 +1,87 @@
+ module AdaBoost
+
+   class WeakLearner
+
+     def initialize y_index
+       @y_index = y_index
+       @analyzer = FeaturesAnalyzer.new y_index
+       @classifiers_cache = []
+     end
+
+     def features_statistics samples
+       @analyzer.analyze(samples).statistics
+     end
+
+     def generate_weak_classifier samples, weights
+       number_of_samples = samples.size
+       if number_of_samples < 1
+         raise ArgumentError.new 'At least one sample is needed to generate.'
+       end
+       number_of_features = @y_index
+       sample_size = samples[0].size
+       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
+         raise ArgumentError.new 'At least 1 feature is needed to generate.'
+       end
+       classifiers = []
+       if Config::USE_RANDOM_WEAK_CLASSIFIERS
+         classifiers = generate_random_classifiers samples, number_of_features
+       else
+         classifiers = generate_all_possible_classifiers samples, number_of_features
+       end
+       best_index = -1
+       best_error = Float::MAX
+       classifiers.each_with_index do |classifier, i|
+         classifier.error = 0.0
+         samples.each_with_index do |sample, j|
+           y = sample[@y_index]
+           if classifier.classify(sample).to_f != y
+             classifier.increase_error weights[j]
+           end
+         end
+         if classifier.error < best_error
+           best_error = classifier.error
+           best_index = i
+         end
+       end
+       best = classifiers[best_index]
+       if !Config::USE_RANDOM_WEAK_CLASSIFIERS
+         classifiers.delete_at best_index
+       end
+       best
+     end
+
+     def generate_random_classifiers samples, number_of_features
+       classifiers = []
+       statistics = features_statistics samples
+       0.upto Config::NUMBER_OF_RANDOM_CLASSIFIERS - 1 do
+         feature_number = rand number_of_features
+         info = statistics[feature_number]
+         split = rand * info.rng + info.min
+         classifiers << WeakClassifier.new(feature_number, split)
+       end
+       classifiers
+     end
+
+     def generate_all_possible_classifiers samples, number_of_features
+       if @classifiers_cache.size == 0
+         matrix = []
+         0.upto number_of_features - 1 do
+           matrix << []
+         end
+         samples.each do |sample|
+           0.upto number_of_features - 1 do |i|
+             sample_value = sample[i]
+             matrix[i] << sample_value
+           end
+         end
+         matrix.each_with_index do |entry, i|
+           entry = entry.uniq
+           entry.each do |uniq_value|
+             @classifiers_cache << WeakClassifier.new(i, uniq_value)
+           end
+         end
+       end
+       @classifiers_cache
+     end
+   end
+ end
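
The learner can also be driven directly, outside AdaBoost#train; a sketch reusing the toy `samples` from the first example, with uniform weights:

    learner = AdaBoost::WeakLearner.new 2
    weights = Array.new(samples.size, 1.0 / samples.size)

    stump = learner.generate_weak_classifier samples, weights
    stump.feature_number  # index of the feature the best stump thresholds
    stump.split           # the chosen threshold
    stump.error           # its weighted training error under `weights`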
data/lib/adaboost.rb ADDED
@@ -0,0 +1,11 @@
+ require 'adaboost/adaboost.rb'
+ require 'adaboost/config.rb'
+ require 'adaboost/contingency_table.rb'
+ require 'adaboost/evaluator.rb'
+ require 'adaboost/features_analyzer.rb'
+ require 'adaboost/resampler.rb'
+ require 'adaboost/weak_classifier.rb'
+ require 'adaboost/weak_learner.rb'
+
+ module AdaBoost
+ end
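
This entry file only collects the component requires, so a single require (after installing the gem) brings in the whole namespace:

    # gem install adaboost
    require 'adaboost'

    AdaBoost::Config::NUMBER_OF_RANDOM_CLASSIFIERS  # => 100
    AdaBoost.constants.sort
    # => [:AdaBoost, :Analyze, :Config, :ContingencyTable, :Distribution, ...]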
metadata ADDED
@@ -0,0 +1,52 @@
+ --- !ruby/object:Gem::Specification
+ name: adaboost
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Dalmir da Silva
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-07-25 00:00:00.000000000 Z
+ dependencies: []
+ description: AdaBoost classifier!
+ email: dalmirdasilva@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/adaboost.rb
+ - lib/adaboost/adaboost.rb
+ - lib/adaboost/config.rb
+ - lib/adaboost/contingency_table.rb
+ - lib/adaboost/evaluator.rb
+ - lib/adaboost/features_analyzer.rb
+ - lib/adaboost/resampler.rb
+ - lib/adaboost/weak_classifier.rb
+ - lib/adaboost/weak_learner.rb
+ homepage: http://dalmirdasilva.com/adaboost-classifier
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.5.1
+ signing_key:
+ specification_version: 4
+ summary: AdaBoost classifier!
+ test_files: []