sapor 0.1b1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +7 -0
  2. data/Area Class Diagram.dia +0 -0
  3. data/Area Class Diagram.png +0 -0
  4. data/Class Diagram.dia +0 -0
  5. data/Class Diagram.png +0 -0
  6. data/Examples.md +361 -0
  7. data/LICENSE +674 -0
  8. data/README.md +70 -0
  9. data/Rakefile +18 -0
  10. data/Technical Documentation.md +14 -0
  11. data/bin/create_installation_package.sh +49 -0
  12. data/bin/install.sh +45 -0
  13. data/bin/sapor.rb +22 -0
  14. data/bin/sapor.sh +105 -0
  15. data/lib/sapor.rb +44 -0
  16. data/lib/sapor/binomials_cache.rb +45 -0
  17. data/lib/sapor/combinations_distribution.rb +180 -0
  18. data/lib/sapor/dichotomies.rb +98 -0
  19. data/lib/sapor/dichotomy.rb +138 -0
  20. data/lib/sapor/first_past_the_post.rb +78 -0
  21. data/lib/sapor/leveled_proportional.rb +64 -0
  22. data/lib/sapor/log4r_logger.rb +49 -0
  23. data/lib/sapor/log_facade.rb +40 -0
  24. data/lib/sapor/number_formatter.rb +45 -0
  25. data/lib/sapor/poll.rb +137 -0
  26. data/lib/sapor/polychotomy.rb +359 -0
  27. data/lib/sapor/proportional.rb +128 -0
  28. data/lib/sapor/pseudorandom_multirange_enumerator.rb +87 -0
  29. data/lib/sapor/regional_data/area.rb +80 -0
  30. data/lib/sapor/regional_data/catalonia-2012-2015.psv +100 -0
  31. data/lib/sapor/regional_data/catalonia-2012.psv +87 -0
  32. data/lib/sapor/regional_data/catalonia.rb +90 -0
  33. data/lib/sapor/regional_data/norway.rb +408 -0
  34. data/lib/sapor/regional_data/united_kingdom.rb +1075 -0
  35. data/lib/sapor/regional_data/utopia.rb +66 -0
  36. data/sapor.gemspec +35 -0
  37. data/spec/integration/area_spec.rb +28 -0
  38. data/spec/integration/poll_spec.rb +107 -0
  39. data/spec/integration/sample.poll +7 -0
  40. data/spec/spec_helper.rb +31 -0
  41. data/spec/unit/area_spec.rb +115 -0
  42. data/spec/unit/binomials_cache_spec.rb +34 -0
  43. data/spec/unit/catalonia_spec.rb +82 -0
  44. data/spec/unit/combinations_distribution_spec.rb +241 -0
  45. data/spec/unit/denominators_spec.rb +34 -0
  46. data/spec/unit/dichotomies_spec.rb +154 -0
  47. data/spec/unit/dichotomy_spec.rb +320 -0
  48. data/spec/unit/first_past_the_post_spec.rb +53 -0
  49. data/spec/unit/leveled_proportional_spec.rb +51 -0
  50. data/spec/unit/norway_spec.rb +47 -0
  51. data/spec/unit/number_formatter_spec.rb +173 -0
  52. data/spec/unit/poll_spec.rb +105 -0
  53. data/spec/unit/polychotomy_spec.rb +332 -0
  54. data/spec/unit/proportional_spec.rb +86 -0
  55. data/spec/unit/pseudorandom_multirange_enumerator_spec.rb +82 -0
  56. metadata +119 -0
@@ -0,0 +1,45 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Statistical Analysis of Polling Results (SAPoR)
4
+ # Copyright (C) 2014 Filip van Laenen <f.a.vanlaenen@ieee.org>
5
+ #
6
+ # This file is part of SAPoR.
7
+ #
8
+ # SAPoR is free software: you can redistribute it and/or modify it under the
9
+ # terms of the GNU General Public License as published by the Free Software
10
+ # Foundation, either version 3 of the License, or (at your option) any later
11
+ # version.
12
+ #
13
+ # SAPoR is distributed in the hope that it will be useful, but WITHOUT ANY
14
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16
+ #
17
+ # You can find a copy of the GNU General Public License in /doc/gpl.txt
18
+ #
19
+
20
+ module Sapor
21
+ #
22
+ # Module to format numbers.
23
+ #
24
+ module NumberFormatter
25
# Formats a fraction as a percentage string, choosing the number of decimals
# from the magnitude of the fraction.
#
# Values of at least 0.9995 or below 0.000005 get no decimals, values of at
# least 0.09995 get one, values of at least 0.009995 get two, and all
# remaining values get three.
#
# number - the fraction to format (1.0 is rendered as "100%").
#
# Returns the formatted String, ending in '%'.
def three_digits_percentage(number)
  decimals =
    if number >= 0.9995 || number < 0.000005
      0
    elsif number >= 0.09995
      1
    elsif number >= 0.009995
      2
    else
      3
    end
  format("%.#{decimals}f", number * 100) + '%'
end
36
+
37
# Formats a fraction as a percentage with one decimal, right-aligned so that
# the numeric part takes at least five characters before the '%' sign.
#
# number - the fraction to format (0.5 is rendered as " 50.0%").
#
# Returns the formatted String.
def six_char_percentage(number)
  format('%5.1f%%', number * 100)
end
40
+
41
# Renders an integer with a comma between each group of three digits.
#
# The digits are grouped from the right by reversing the decimal
# representation. Only runs of digits are grouped, so a leading minus sign
# is never treated as part of a group (-123 stays "-123").
#
# integer - the Integer to format.
#
# Returns the formatted String, e.g. "1,234,567".
def with_thousands_separator(integer)
  integer.to_s.reverse.gsub(/\d{3}(?=\d)/, '\&,').reverse
end
44
+ end
45
+ end
@@ -0,0 +1,137 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Statistical Analysis of Polling Results (SAPoR)
4
+ # Copyright (C) 2014 Filip van Laenen <f.a.vanlaenen@ieee.org>
5
+ #
6
+ # This file is part of SAPoR.
7
+ #
8
+ # SAPoR is free software: you can redistribute it and/or modify it under the
9
+ # terms of the GNU General Public License as published by the Free Software
10
+ # Foundation, either version 3 of the License, or (at your option) any later
11
+ # version.
12
+ #
13
+ # SAPoR is distributed in the hope that it will be useful, but WITHOUT ANY
14
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16
+ #
17
+ # You can find a copy of the GNU General Public License in /doc/gpl.txt
18
+ #
19
+
20
+ module Sapor
21
+ OTHER = 'Other'
22
+
23
+ #
24
+ # Represents a poll.
25
+ #
26
+ class Poll
27
+ include NumberFormatter
28
+ attr_reader :area, :logger
29
+
30
# Metadata key under which a poll declares its area code.
AREA_KEY = 'Area'
# Lookup table from area code to the corresponding area instance.
AREAS_MAP = {}
[Catalonia.instance, Norway.instance, UnitedKingdom.instance,
 Utopia.instance].each do |area|
  AREAS_MAP[area.area_code] = area
end
33
+
34
# Creates a poll from parsed metadata and raw results.
#
# metadata - Hash of metadata entries; the AREA_KEY entry is removed from
#            this hash and used to look up the poll's area.
# results  - Hash mapping each choice to its raw result, converted with
#            #interpret.
def initialize(metadata, results)
  @logger = LogFacade.create_logger
  area_code = metadata.delete(AREA_KEY)
  @area = lookup_area(area_code)
  @results = interpret(results)
end
39
+
40
# Runs the full analysis: first as a set of dichotomies, then as a
# polychotomy, and logs 'Done.' when both have finished.
#
# max_error - the error estimate at which each analysis stops refining
#             (default 0.001).
def analyze(max_error = 0.001)
  analyze_as_dichotomies(max_error)
  analyze_as_polychotomy(max_error)
  @logger.info('Done.')
end
45
+
46
# Delegates to the current analysis to obtain the confidence interval for a
# choice.
#
# choice - the choice to look up.
# level  - the confidence level (default 0.95).
#
# Returns whatever the analysis returns, or nil when no analysis exists yet.
def confidence_interval(choice, level = 0.95)
  return nil if @analysis.nil?

  @analysis.confidence_interval(choice, level)
end
49
+
50
# Delegates to the current analysis to obtain the most probable fraction for
# a choice.
#
# Returns whatever the analysis returns, or nil when no analysis exists yet.
def most_probable_fraction(choice)
  return nil if @analysis.nil?

  @analysis.most_probable_fraction(choice)
end
53
+
54
# Delegates to the current analysis to obtain the most probable value for a
# choice.
#
# Returns whatever the analysis returns, or nil when no analysis exists yet.
def most_probable_value(choice)
  return nil if @analysis.nil?

  @analysis.most_probable_value(choice)
end
57
+
58
# Returns the interpreted result stored for the given choice, or nil when
# the poll has no entry for it.
def result(choice)
  stored = @results[choice]
  stored
end
61
+
62
+ private
63
+
64
# Processes one line of a poll file.
#
# A line consisting of '==' switches the target hash to +results+. Blank
# lines are ignored, so that empty or trailing lines do not add a nil entry.
# Any other line is split on '=' and its first element is stored as key with
# its last element as value in +current+.
#
# Note: this is a class method, so the preceding `private` keyword does not
# apply to it.
#
# line    - the raw line, with or without trailing newline.
# current - the Hash that currently receives entries.
# results - the Hash to switch to when the separator line is met.
#
# Returns the Hash that should receive the next line.
def self.line_to_hash(line, current, results)
  content = line.chomp
  return results if content.eql?('==')
  return current if content.strip.empty?

  elements = content.split('=')
  current[elements.first] = elements.last
  current
end
73
+
74
# Splits the lines of a poll file into a metadata hash and a results hash.
#
# Every line is handed to line_to_hash together with the hash that currently
# receives entries; the hash it returns receives the next line.
#
# lines - an enumerable of raw lines.
#
# Returns a two-element Array: [metadata, results].
def self.as_hashes(lines)
  metadata = {}
  results = {}
  lines.inject(metadata) do |target, line|
    line_to_hash(line, target, results)
  end
  [metadata, results]
end
83
+
84
# Builds a poll from the lines of a poll file.
#
# lines - an enumerable of raw lines, parsed with as_hashes.
#
# Returns the new instance created from the parsed metadata and results.
def self.from_lines(lines)
  metadata, results = as_hashes(lines)
  new(metadata, results)
end
90
+
91
# Builds a poll from the contents of a file.
#
# The file is opened with the block form of File.open, so the handle is
# closed as soon as the lines have been read, even when parsing raises.
#
# filename - path of the poll file.
#
# Returns whatever from_lines returns for the lines of the file.
def self.from_file(filename)
  File.open(filename) { |file| from_lines(file) }
end
94
+
95
# Converts the raw result values to integers.
#
# results - Hash mapping each choice to a value responding to #to_i.
#
# Returns a new Hash with the same keys and the values converted with #to_i.
def interpret(results)
  results.each_with_object({}) do |(choice, raw_value), interpreted|
    interpreted[choice] = raw_value.to_i
  end
end
102
+
103
# Looks up the area registered in AREAS_MAP under the given area code.
#
# Returns the area, or nil when the code is not registered.
def lookup_area(area_code)
  area = AREAS_MAP[area_code]
  area
end
106
+
107
# Returns the population size reported by the poll's area.
def population_size
  area = @area
  area.population_size
end
110
+
111
# Returns the threshold reported by the poll's area.
def threshold
  area = @area
  area.threshold
end
114
+
115
# Refines the current analysis for as long as its error estimate is larger
# than +max_error+.
#
# After every refinement the analysis report, the new error estimate and
# the progress report are logged, in that order.
#
# A plain `while` is used on purpose: Kernel#loop would silently rescue a
# StopIteration raised from inside the analysis.
def analyze_until_convergence(max_error)
  while @analysis.error_estimate > max_error
    @analysis.refine
    @logger.info(@analysis.report)
    estimate = three_digits_percentage(@analysis.error_estimate)
    @logger.info("Error estimate: ε ≤ #{estimate}.")
    @logger.info(@analysis.progress_report)
  end
end
124
+
125
# Replaces the current analysis by a new Dichotomies analysis built from
# the results, the population size and the threshold, and refines it until
# its error estimate is no longer above +max_error+.
def analyze_as_dichotomies(max_error)
  @logger.info('Analyzing as a set of dichotomies...')
  size = population_size
  limit = threshold
  @analysis = Dichotomies.new(@results, size, limit)
  analyze_until_convergence(max_error)
end
130
+
131
# Replaces the current analysis by a new Polychotomy analysis, which is
# handed the results, the area, the analysis that was current so far and
# +max_error+, and refines it until its error estimate is no longer above
# +max_error+.
def analyze_as_polychotomy(max_error)
  @logger.info('Analyzing as a polychotomy...')
  previous_analysis = @analysis
  @analysis = Polychotomy.new(@results, @area, previous_analysis, max_error)
  analyze_until_convergence(max_error)
end
136
+ end
137
+ end
@@ -0,0 +1,359 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Statistical Analysis of Polling Results (SAPoR)
4
+ # Copyright (C) 2014 Filip van Laenen <f.a.vanlaenen@ieee.org>
5
+ #
6
+ # This file is part of SAPoR.
7
+ #
8
+ # SAPoR is free software: you can redistribute it and/or modify it under the
9
+ # terms of the GNU General Public License as published by the Free Software
10
+ # Foundation, either version 3 of the License, or (at your option) any later
11
+ # version.
12
+ #
13
+ # SAPoR is distributed in the hope that it will be useful, but WITHOUT ANY
14
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16
+ #
17
+ # You can find a copy of the GNU General Public License in /doc/gpl.txt
18
+ #
19
+
20
+ require 'prime'
21
+
22
+ module Sapor
23
+ #
24
+ # Represents a polychotomy.
25
+ #
26
+ class Polychotomy
27
+ include NumberFormatter
28
+
29
+ attr_reader :error_estimate, :no_of_data_points, :no_of_simulations
30
+
31
# Sets up a polychotomy analysis.
#
# results     - Hash mapping each choice to its result; its keys become the
#               choices of the analysis.
# area        - the area, queried here for its coalitions.
# dichotomies - the analysis from which the value ranges per choice are
#               extracted.
# max_error   - the maximum error, used to derive the confidence level of
#               the extracted ranges.
#
# All counters start at zero, every ordered pair of choices gets a
# comparison entry initialised with 0.to_lf, and the error estimate starts
# at 1.0.
def initialize(results, area, dichotomies, max_error)
  @results = results
  @area = area
  @choices = results.keys
  @coalitions = area.coalitions
  @ranges = extract_ranges_from_dichotomies(dichotomies, max_error)
  sizes_per_range = @ranges.values.map(&:size)
  @enum = PseudoRandomMultiRangeEnumerator.new(sizes_per_range).each
  @no_of_simulations = 0
  @no_of_data_points = 0
  @distributions = create_new_votes_distributions # TODO: Rename to @votes
  @seats = create_new_seats_distributions
  @comparisons = {}
  @choices.product(@choices).each do |a, b|
    @comparisons[a + '>' + b] = 0.to_lf
  end
  @combinations_sum = 0.to_lf
  @error_estimate = 1.0
end
52
+
53
# Returns the range of values stored for the given choice, or nil when no
# range is stored for it.
def range(choice)
  values = @ranges[choice]
  values
end
56
+
57
# Returns the size reported by the data point enumerator.
def space_size
  enumerator = @enum
  enumerator.size
end
60
+
61
# Returns the most probable value of the votes distribution stored under
# +key+, or nil as long as no simulation has been run.
def most_probable_value(key)
  return nil if @no_of_simulations == 0

  @distributions[key].most_probable_value
end
68
+
69
# Divides the most probable value of the distribution stored under +key+ by
# the population size of the area.
#
# key           - the key of the distribution to use.
# distributions - the Hash of distributions to look the key up in.
#
# Returns the fraction as a Float.
def calculate_most_probable_fraction(key, distributions)
  value = distributions[key].most_probable_value
  value.to_f / @area.population_size
end
72
+
73
# Returns the most probable fraction for +key+ based on the current votes
# distributions, or nil as long as no simulation has been run.
def most_probable_fraction(key)
  return nil if @no_of_simulations == 0

  calculate_most_probable_fraction(key, @distributions)
end
80
+
81
# Runs a new batch of simulations and merges it into the accumulated ones.
#
# Data points are tried until at least one data point has been tried
# overall, at least one new simulation has been done, and the batch holds at
# least as many simulations as were accumulated before. Unless this is the
# first batch, the error estimate is recalculated from the new votes
# distributions before they are merged.
#
# A plain `while` is used on purpose: Kernel#loop would silently rescue a
# StopIteration raised by the enumerator.
def refine
  fresh_votes = create_new_votes_distributions
  fresh_seats = create_new_seats_distributions
  added = 0
  while @no_of_data_points == 0 || added == 0 || added < @no_of_simulations
    added += try_next_data_point(fresh_votes, fresh_seats)
  end
  if @no_of_simulations != 0
    @error_estimate = calculate_error_estimate(fresh_votes)
  end
  @distributions = merge_distributions(@distributions, fresh_votes)
  @seats = merge_distributions(@seats, fresh_seats)
  @no_of_simulations += added
end
96
+
97
# Fetches the next data point and simulates it when its OTHER value is not
# negative. The data point counter is incremented in both cases.
#
# new_votes - votes distributions that receive the simulation.
# new_seats - seats distributions that receive the simulation.
#
# Returns 1 when the data point was simulated, 0 otherwise.
def try_next_data_point(new_votes, new_seats)
  candidate = next_data_point
  usable = candidate[OTHER] >= 0
  simulate(new_votes, new_seats, candidate) if usable
  @no_of_data_points += 1
  usable ? 1 : 0
end
108
+
109
# Builds the multi-line report of the analysis.
#
# The report has a title line, a header and one line per choice (in the
# order given by sort_choices_by_result), followed by a header and one line
# per coalition (in the order given by sort_coalitions_by_result). The label
# columns are at least six characters wide, and each choice line is handed
# the choice that follows it in the sorted order (nil for the last one).
#
# Returns the report as a String without trailing newline.
def report
  choice_width = (@choices.map(&:length) << 6).max
  seats_width = @area.no_of_seats.to_s.size
  ranked_choices = sort_choices_by_result
  choice_lines = ranked_choices.each_with_index.map do |choice, position|
    create_choice_report_line(choice, ranked_choices[position + 1],
                              choice_width, seats_width)
  end
  label_lengths = @coalitions.map { |coalition| coalition_label(coalition).length }
  coalition_width = (label_lengths << 6).max
  coalition_lines = sort_coalitions_by_result.map do |coalition|
    create_coalition_report_line(coalition, coalition_width, seats_width)
  end
  [
    'Most probable rounded fractions, fractions and 95% confidence intervals:',
    'Choice'.ljust(choice_width) + " Result MPRF MPF CI(95%) P(>↓) Seats",
    choice_lines.join("\n"),
    'Coalition'.ljust(coalition_width) + " Result MPRF MPF CI(95%) P(>50%) Seats P(>50%)",
    coalition_lines.join("\n")
  ].join("\n")
end
134
+
135
# Describes how much of the search space has been covered so far.
#
# The ratio between the search space size and the number of tried data
# points is computed as a Float, so that ratios of at most 10 really keep
# one decimal (an integer division would truncate 2.5 to 2 before
# rounding). Larger ratios are rounded to an integer and rendered with
# thousands separators.
#
# Returns a one-line String.
def progress_report
  ratio = space_size.to_f / @no_of_data_points
  ratio_text = if ratio > 10
                 with_thousands_separator(ratio.round)
               else
                 ratio.round(1)
               end
  "#{with_thousands_separator(@no_of_simulations)} simulations out of " \
    "#{with_thousands_separator(@no_of_data_points)} data" \
    " points, 1 / #{ratio_text} of search space size" \
    " (#{with_thousands_separator(space_size)})."
end
147
+
148
+ private
149
+
150
# Collects, for every choice except OTHER, the sorted confidence interval
# values reported by the dichotomies analysis.
#
# dichotomies - object answering confidence_interval_values(choice, level).
# max_error   - the maximum error; the confidence level used is
#               1 - max_error ** 2.
#
# Returns a Hash mapping each choice (except OTHER) to its sorted values.
def extract_ranges_from_dichotomies(dichotomies, max_error)
  level = 1 - (max_error**2)
  ranged_choices = @choices.reject { |choice| choice == OTHER }
  ranged_choices.each_with_object({}) do |choice, ranges|
    ranges[choice] = dichotomies.confidence_interval_values(choice, level).sort
  end
end
161
+
162
# Creates an empty seats distribution for every choice and every coalition.
#
# Each distribution is a new CombinationsDistribution in which every seat
# count from 0 up to and including the area's number of seats is
# initialised with 0.to_lf.
#
# Returns a Hash keyed by the choices followed by the coalitions.
def create_new_seats_distributions
  (@choices + @coalitions).each_with_object({}) do |key, distributions|
    distributions[key] = CombinationsDistribution.new
    (0..@area.no_of_seats).each do |seat_count|
      distributions[key][seat_count] = 0.to_lf
    end
  end
end
178
+
179
# Creates an empty votes distribution for every choice except OTHER and for
# every coalition.
#
# The distribution of a choice gets every value of the choice's range
# initialised with 0.to_lf; the distribution of a coalition starts without
# any value.
#
# Returns a Hash keyed by the choices (except OTHER) followed by the
# coalitions.
def create_new_votes_distributions
  distributions = {}
  @choices.each do |choice|
    next if choice == OTHER

    distribution = CombinationsDistribution.new
    @ranges[choice].each { |value| distribution[value] = 0.to_lf }
    distributions[choice] = distribution
  end
  @coalitions.each do |coalition|
    distributions[coalition] = CombinationsDistribution.new
  end
  distributions
end
194
+
195
# Fetches the next tuple of indexes from the enumerator and turns it into a
# data point.
#
# The index at position i selects a value from the i-th range, which is
# stored under the i-th range key. The OTHER entry is the population size
# minus the sum of all selected values.
#
# Returns a Hash mapping each choice to a value.
def next_data_point
  indexes = @enum.next
  choices = @ranges.keys
  candidate_values = @ranges.values
  data_point = {}
  indexes.each_with_index do |index, position|
    data_point[choices[position]] = candidate_values[position][index]
  end
  data_point[OTHER] = @area.population_size - data_point.values.inject(:+)
  data_point
end
204
+
205
# Adds one data point to the given votes and seats distributions.
#
# The weight of the data point is the product, over all its entries, of
# BinomialsCache.binomial(value, result of the choice). This weight is added
# to the running sum of combinations, to the votes distribution of every
# choice except OTHER, to every comparison "a>b" for which a has more votes
# than b, to the votes distribution of every coalition, and, using the seat
# projection of the area, to the seats distributions of all choices and
# coalitions. Choices missing from the data point or from the projection
# count as 0.
#
# votes      - votes distributions to update.
# seats      - seats distributions to update.
# data_point - Hash mapping each choice to a number of votes.
def simulate(votes, seats, data_point)
  weight = data_point.inject(1.to_lf) do |product, (choice, value)|
    product * BinomialsCache.binomial(value, @results[choice])
  end
  @combinations_sum += weight
  data_point.each_pair do |choice, value|
    next if choice == OTHER

    votes[choice][value] += weight
  end
  @choices.product(@choices).each do |a, b|
    @comparisons[a + '>' + b] += weight if data_point[a] > data_point[b]
  end
  # Value stored for a choice, or 0 when the hash has no entry for it.
  value_or_zero = lambda do |values, choice|
    values.key?(choice) ? values[choice] : 0
  end
  # Adds the weight to a bucket, creating the bucket when it is missing.
  add_weight = lambda do |distribution, bucket|
    if distribution[bucket].nil?
      distribution[bucket] = weight
    else
      distribution[bucket] += weight
    end
  end
  @coalitions.each do |coalition|
    coalition_votes = coalition.map { |choice| value_or_zero.call(data_point, choice) }.inject(:+)
    add_weight.call(votes[coalition], coalition_votes)
  end
  projection = @area.seats(data_point)
  @choices.each do |choice|
    seats[choice][value_or_zero.call(projection, choice)] += weight
  end
  @coalitions.each do |coalition|
    coalition_seats = coalition.map { |choice| value_or_zero.call(projection, choice) }.inject(:+)
    add_weight.call(seats[coalition], coalition_seats)
  end
end
246
+
247
# Estimates the error as the largest absolute difference, over all choices
# except OTHER, between the most probable fraction in the new simulations
# and the one in the accumulated distributions.
#
# new_simulations - the votes distributions of the latest batch.
#
# Returns the largest difference, or 0 when there is no choice to compare.
def calculate_error_estimate(new_simulations)
  compared_choices = @choices.reject { |choice| choice == OTHER }
  compared_choices.inject(0) do |largest, choice|
    fraction_new = calculate_most_probable_fraction(choice, new_simulations)
    fraction_old = calculate_most_probable_fraction(choice, @distributions)
    [largest, (fraction_new - fraction_old).abs].max
  end
end
259
+
260
# Adds up two sets of distributions key by key.
#
# Only the choices other than OTHER and the coalitions are merged; any other
# key present in the inputs is left out of the result.
#
# Returns a new Hash with the sum (using +) of both distributions per key.
def merge_distributions(distributions1, distributions2)
  merged_keys = @choices.reject { |choice| choice == OTHER } + @coalitions
  merged_keys.each_with_object({}) do |key, merged|
    merged[key] = distributions1[key] + distributions2[key]
  end
end
274
+
275
# Returns the choices except OTHER, ordered by descending result; choices
# with equal results are ordered by ascending name.
def sort_choices_by_result
  candidates = @choices.select { |choice| !(choice == OTHER) }
  candidates.sort do |left, right|
    by_result = result(right) <=> result(left)
    by_result == 0 ? left <=> right : by_result
  end
end
286
+
287
# Returns the label of a coalition: its members in sorted order, joined
# with ' + '.
def coalition_label(coalition)
  sorted_members = coalition.sort
  sorted_members.join(' + ')
end
290
+
291
# Returns the coalitions ordered by descending combined result; coalitions
# with equal results are ordered by ascending label.
#
# The combined result is taken from coalition_result. Looking a coalition up
# with result would always yield 0.0, because the results are keyed by
# individual choice, and the coalitions would then only be sorted by label.
def sort_coalitions_by_result
  @coalitions.sort do |a, b|
    comparison = coalition_result(b) <=> coalition_result(a)
    if comparison == 0
      coalition_label(a) <=> coalition_label(b)
    else
      comparison
    end
  end
end
301
+
302
# Returns the result of a choice as a fraction of the sum of all results.
# A choice without a stored result yields 0.0.
def result(choice)
  total = @results.values.inject(:+)
  @results[choice].to_f / total
end
305
+
306
# Returns the combined result of a coalition's members as a fraction of the
# sum of all results. Members without a stored result count as 0.0.
def coalition_result(coalition)
  member_results = coalition.map { |choice| @results[choice].to_f }
  member_results.inject(:+) / @results.values.inject(:+)
end
309
+
310
# Returns the share of the accumulated combinations in which +a+ had more
# votes than +b+.
#
# The quotient of the comparison entry "a>b" and the sum of combinations is
# converted to a plain number from its mantissa and decimal exponent.
def larger_than(a, b)
  key = a + '>' + b
  probability = @comparisons[key] / @combinations_sum
  probability.mantissa * (10**probability.exponent)
end
314
+
315
# Builds the report line for one choice.
#
# The line holds, in this order: the left-aligned choice, its result, its
# most probable rounded fraction, its most probable fraction, the 95%
# confidence interval of its votes as fractions of the population size, the
# probability that it is larger than +next_choice+ (a blank when there is
# no next choice), and the 95% confidence interval of its seats.
#
# choice           - the choice to report on.
# next_choice      - the choice ranked directly after it, or nil.
# max_choice_width - width of the choice column.
# max_seats_width  - width of each seats number.
#
# Returns the line as a String.
def create_choice_report_line(choice, next_choice, max_choice_width, max_seats_width)
  votes_interval = @distributions[choice].confidence_interval(0.95).map do |votes|
    votes.to_f / @area.population_size
  end
  seats_interval = @seats[choice].confidence_interval(0.95)
  [
    choice.ljust(max_choice_width), ' ',
    six_char_percentage(result(choice)), ' ',
    six_char_percentage(most_probable_rounded_fraction(choice)), ' ',
    six_char_percentage(most_probable_fraction(choice)), ' ',
    six_char_percentage(votes_interval.first), '–',
    six_char_percentage(votes_interval.last), ' ',
    (next_choice.nil? ? ' ' : six_char_percentage(larger_than(choice, next_choice))), ' ',
    (max_seats_width == 1 ? ' ' : ''),
    seats_interval.first.to_s.rjust(max_seats_width), '–',
    seats_interval.last.to_s.rjust(max_seats_width)
  ].join
end
330
+
331
# Builds the report line for one coalition.
#
# The line holds, in this order: the left-aligned coalition label, its
# combined result, its most probable rounded fraction, its most probable
# fraction, the 95% confidence interval of its votes as fractions of the
# population size, the threshold probability of its votes for 0.5 of the
# population size, the 95% confidence interval of its seats, and the value
# threshold probability of its seats for one seat more than half of the
# area's seats (integer division).
#
# coalition           - the coalition to report on.
# max_coalition_width - width of the coalition column.
# max_seats_width     - width of each seats number.
#
# Returns the line as a String.
def create_coalition_report_line(coalition, max_coalition_width, max_seats_width)
  votes_distribution = @distributions[coalition]
  raw_interval = votes_distribution.confidence_interval(0.95)
  votes_majority = votes_distribution.threshold_probability(0.5, @area.population_size)
  votes_interval = raw_interval.map { |votes| votes.to_f / @area.population_size }
  seats_interval = @seats[coalition].confidence_interval(0.95)
  seats_needed = 1 + @area.no_of_seats / 2
  seats_majority = @seats[coalition].value_threshold_probability(seats_needed)
  [
    coalition_label(coalition).ljust(max_coalition_width), ' ',
    six_char_percentage(coalition_result(coalition)), ' ',
    six_char_percentage(most_probable_rounded_fraction(coalition)), ' ',
    six_char_percentage(most_probable_fraction(coalition)), ' ',
    six_char_percentage(votes_interval.first), '–',
    six_char_percentage(votes_interval.last), ' ',
    six_char_percentage(votes_majority), ' ',
    (max_seats_width == 1 ? ' ' : ''),
    seats_interval.first.to_s.rjust(max_seats_width), '–',
    seats_interval.last.to_s.rjust(max_seats_width), ' ',
    six_char_percentage(seats_majority)
  ].join
end
350
+
351
# Returns the most probable rounded fraction of the votes distribution
# stored under +key+, computed by that distribution for the area's
# population size, or nil as long as no simulation has been run.
def most_probable_rounded_fraction(key)
  return nil if @no_of_simulations == 0

  @distributions[key].most_probable_rounded_fraction(@area.population_size)
end
358
+ end
359
+ end