sapor 0.1b1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +7 -0
  2. data/Area Class Diagram.dia +0 -0
  3. data/Area Class Diagram.png +0 -0
  4. data/Class Diagram.dia +0 -0
  5. data/Class Diagram.png +0 -0
  6. data/Examples.md +361 -0
  7. data/LICENSE +674 -0
  8. data/README.md +70 -0
  9. data/Rakefile +18 -0
  10. data/Technical Documentation.md +14 -0
  11. data/bin/create_installation_package.sh +49 -0
  12. data/bin/install.sh +45 -0
  13. data/bin/sapor.rb +22 -0
  14. data/bin/sapor.sh +105 -0
  15. data/lib/sapor.rb +44 -0
  16. data/lib/sapor/binomials_cache.rb +45 -0
  17. data/lib/sapor/combinations_distribution.rb +180 -0
  18. data/lib/sapor/dichotomies.rb +98 -0
  19. data/lib/sapor/dichotomy.rb +138 -0
  20. data/lib/sapor/first_past_the_post.rb +78 -0
  21. data/lib/sapor/leveled_proportional.rb +64 -0
  22. data/lib/sapor/log4r_logger.rb +49 -0
  23. data/lib/sapor/log_facade.rb +40 -0
  24. data/lib/sapor/number_formatter.rb +45 -0
  25. data/lib/sapor/poll.rb +137 -0
  26. data/lib/sapor/polychotomy.rb +359 -0
  27. data/lib/sapor/proportional.rb +128 -0
  28. data/lib/sapor/pseudorandom_multirange_enumerator.rb +87 -0
  29. data/lib/sapor/regional_data/area.rb +80 -0
  30. data/lib/sapor/regional_data/catalonia-2012-2015.psv +100 -0
  31. data/lib/sapor/regional_data/catalonia-2012.psv +87 -0
  32. data/lib/sapor/regional_data/catalonia.rb +90 -0
  33. data/lib/sapor/regional_data/norway.rb +408 -0
  34. data/lib/sapor/regional_data/united_kingdom.rb +1075 -0
  35. data/lib/sapor/regional_data/utopia.rb +66 -0
  36. data/sapor.gemspec +35 -0
  37. data/spec/integration/area_spec.rb +28 -0
  38. data/spec/integration/poll_spec.rb +107 -0
  39. data/spec/integration/sample.poll +7 -0
  40. data/spec/spec_helper.rb +31 -0
  41. data/spec/unit/area_spec.rb +115 -0
  42. data/spec/unit/binomials_cache_spec.rb +34 -0
  43. data/spec/unit/catalonia_spec.rb +82 -0
  44. data/spec/unit/combinations_distribution_spec.rb +241 -0
  45. data/spec/unit/denominators_spec.rb +34 -0
  46. data/spec/unit/dichotomies_spec.rb +154 -0
  47. data/spec/unit/dichotomy_spec.rb +320 -0
  48. data/spec/unit/first_past_the_post_spec.rb +53 -0
  49. data/spec/unit/leveled_proportional_spec.rb +51 -0
  50. data/spec/unit/norway_spec.rb +47 -0
  51. data/spec/unit/number_formatter_spec.rb +173 -0
  52. data/spec/unit/poll_spec.rb +105 -0
  53. data/spec/unit/polychotomy_spec.rb +332 -0
  54. data/spec/unit/proportional_spec.rb +86 -0
  55. data/spec/unit/pseudorandom_multirange_enumerator_spec.rb +82 -0
  56. metadata +119 -0
@@ -0,0 +1,45 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Statistical Analysis of Polling Results (SAPoR)
4
+ # Copyright (C) 2014 Filip van Laenen <f.a.vanlaenen@ieee.org>
5
+ #
6
+ # This file is part of SAPoR.
7
+ #
8
+ # SAPoR is free software: you can redistribute it and/or modify it under the
9
+ # terms of the GNU General Public License as published by the Free Software
10
+ # Foundation, either version 3 of the License, or (at your option) any later
11
+ # version.
12
+ #
13
+ # SAPoR is distributed in the hope that it will be useful, but WITHOUT ANY
14
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16
+ #
17
+ # You can find a copy of the GNU General Public License in /doc/gpl.txt
18
+ #
19
+
20
+ module Sapor
21
+ #
22
+ # Module to format numbers.
23
+ #
24
+ module NumberFormatter
25
# Formats a fraction as a percentage using a precision that depends on its
# magnitude: no decimals from 99.95 % upwards (and below 0.0005 %), one
# decimal from 9.995 %, two decimals from 0.9995 %, and three otherwise.
#
# number:: the fraction to format (1.0 corresponds to 100 %).
#
# Returns the formatted String, terminated by '%'.
def three_digits_percentage(number)
  decimals =
    if number >= 0.9995 || number < 0.000005
      0
    elsif number >= 0.09995
      1
    elsif number >= 0.009995
      2
    else
      3
    end
  format("%.#{decimals}f", number * 100) + '%'
end
36
+
37
# Formats a fraction as a percentage with one decimal, right-aligned in a
# five character wide numeric field followed by '%'.
#
# number:: the fraction to format (1.0 corresponds to 100 %).
def six_char_percentage(number)
  format('%5.1f%%', number * 100)
end
40
+
41
# Renders an integer with a comma between each group of three digits.
#
# Only digits are grouped, so a leading minus sign is never followed by a
# separator (-123456 becomes "-123,456").
#
# integer:: the Integer to format.
#
# Returns the formatted String.
def with_thousands_separator(integer)
  # Insert a comma after every digit that is followed by a whole number of
  # three-digit groups up to the end of the string.
  integer.to_s.gsub(/(\d)(?=(?:\d{3})+\z)/, '\1,')
end
44
+ end
45
+ end
@@ -0,0 +1,137 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Statistical Analysis of Polling Results (SAPoR)
4
+ # Copyright (C) 2014 Filip van Laenen <f.a.vanlaenen@ieee.org>
5
+ #
6
+ # This file is part of SAPoR.
7
+ #
8
+ # SAPoR is free software: you can redistribute it and/or modify it under the
9
+ # terms of the GNU General Public License as published by the Free Software
10
+ # Foundation, either version 3 of the License, or (at your option) any later
11
+ # version.
12
+ #
13
+ # SAPoR is distributed in the hope that it will be useful, but WITHOUT ANY
14
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16
+ #
17
+ # You can find a copy of the GNU General Public License in /doc/gpl.txt
18
+ #
19
+
20
+ module Sapor
21
+ OTHER = 'Other'
22
+
23
+ #
24
+ # Represents a poll.
25
+ #
26
+ class Poll
27
+ include NumberFormatter
28
+ attr_reader :area, :logger
29
+
30
# Metadata key under which a poll names its area.
AREA_KEY = 'Area'
# Supported areas, indexed by their area code.
AREAS_MAP = {}
[Catalonia.instance, Norway.instance, UnitedKingdom.instance,
 Utopia.instance].each do |area|
  AREAS_MAP[area.area_code] = area
end
33
+
34
# Creates a poll from its metadata and raw results.
#
# metadata:: Hash of metadata entries; the AREA_KEY entry is removed from
#            it and used to look up the poll's area.
# results::  Hash mapping each choice to its raw result; values are
#            converted with #to_i.
def initialize(metadata, results)
  @logger = LogFacade.create_logger
  area_code = metadata.delete(AREA_KEY)
  @area = lookup_area(area_code)
  @results = interpret(results)
end
39
+
40
# Runs the full analysis: first as a set of dichotomies, then as a
# polychotomy, each until its error estimate no longer exceeds max_error.
# Logs 'Done.' when both stages have finished.
#
# max_error:: the error estimate at which each stage stops refining.
def analyze(max_error = 0.001)
  # The polychotomy stage is constructed from the dichotomies analysis, so
  # the order of these two calls matters.
  analyze_as_dichotomies(max_error)
  analyze_as_polychotomy(max_error)

  @logger.info('Done.')
end
45
+
46
# Confidence interval for a choice as reported by the current analysis,
# or nil when no analysis is available yet.
#
# choice:: the choice to look up.
# level::  the confidence level, 0.95 by default.
def confidence_interval(choice, level = 0.95)
  return nil if @analysis.nil?

  @analysis.confidence_interval(choice, level)
end

# Most probable fraction for a choice as reported by the current analysis,
# or nil when no analysis is available yet.
def most_probable_fraction(choice)
  return nil if @analysis.nil?

  @analysis.most_probable_fraction(choice)
end

# Most probable value for a choice as reported by the current analysis,
# or nil when no analysis is available yet.
def most_probable_value(choice)
  return nil if @analysis.nil?

  @analysis.most_probable_value(choice)
end

# The interpreted (integer) poll result stored for a choice.
def result(choice)
  @results[choice]
end
61
+
62
+ private
63
+
64
# Processes a single line of a poll file.
#
# A line consisting of '==' switches the target hash to results. Any other
# line is split on '=' and stored in the current hash, using the first
# element as key and the last element as value. Lines that are blank or
# that contain nothing but '=' characters carry no entry and are ignored,
# so that they cannot introduce nil or whitespace keys.
#
# line::    the raw line, optionally terminated by a newline.
# current:: the hash currently being filled (mutated in place).
# results:: the hash to switch to when the separator line is met.
#
# Returns the hash that the next line should be stored in.
def self.line_to_hash(line, current, results)
  content = line.chomp
  return results if content.eql?('==')
  return current if content.strip.empty?

  elements = content.split('=')
  # A line such as "=" splits into no elements at all; skip it.
  return current if elements.empty?

  current[elements.first] = elements.last
  current
end
73
+
74
# Converts the lines of a poll file into two hashes.
#
# Lines are fed to line_to_hash one by one, starting with the metadata hash
# as target; line_to_hash decides which hash receives the following line.
#
# lines:: an enumerable of raw lines.
#
# Returns a two element Array: [metadata, results].
def self.as_hashes(lines)
  metadata = {}
  results = {}
  lines.inject(metadata) do |section, line|
    line_to_hash(line, section, results)
  end
  [metadata, results]
end
83
+
84
# Builds a poll from the lines of a poll file.
#
# lines:: an enumerable of raw lines, as accepted by as_hashes.
def self.from_lines(lines)
  metadata, results = as_hashes(lines)
  new(metadata, results)
end
90
+
91
# Builds a poll from a poll file.
#
# The file is opened with the block form of File.open, so the handle is
# closed as soon as the lines have been read, even if parsing raises.
#
# filename:: path of the poll file to read.
def self.from_file(filename)
  File.open(filename) { |file| from_lines(file) }
end
94
+
95
# Converts the raw result values into integers.
#
# results:: Hash mapping each choice to its raw result.
#
# Returns a new Hash with the same keys and every value converted with
# #to_i.
def interpret(results)
  results.each_with_object({}) do |(choice, raw_count), counts|
    counts[choice] = raw_count.to_i
  end
end
102
+
103
# Finds the area registered under the given area code in AREAS_MAP.
# Returns nil when the code is not registered.
def lookup_area(area_code)
  AREAS_MAP[area_code]
end

# Population size of the poll's area.
def population_size
  @area.population_size
end

# Threshold of the poll's area.
def threshold
  @area.threshold
end
114
+
115
# Refines the current analysis for as long as its error estimate exceeds
# max_error. After every refinement the analysis report, the error
# estimate and the progress report are logged, in that order.
#
# max_error:: the error estimate at which refinement stops.
def analyze_until_convergence(max_error)
  while @analysis.error_estimate > max_error
    @analysis.refine
    @logger.info(@analysis.report)
    estimate = three_digits_percentage(@analysis.error_estimate)
    @logger.info("Error estimate: ε ≤ #{estimate}.")
    @logger.info(@analysis.progress_report)
  end
end
124
+
125
# First analysis stage: replaces @analysis with a Dichotomies built from
# the results, the area's population size and its threshold, and refines
# it until the error estimate no longer exceeds max_error.
def analyze_as_dichotomies(max_error)
  @logger.info('Analyzing as a set of dichotomies...')
  dichotomies = Dichotomies.new(@results, population_size, threshold)
  @analysis = dichotomies
  analyze_until_convergence(max_error)
end

# Second analysis stage: replaces @analysis with a Polychotomy that is
# constructed from the previous analysis (the dichotomies when called from
# #analyze), and refines it until the error estimate no longer exceeds
# max_error.
def analyze_as_polychotomy(max_error)
  @logger.info('Analyzing as a polychotomy...')
  previous_analysis = @analysis
  @analysis = Polychotomy.new(@results, @area, previous_analysis, max_error)
  analyze_until_convergence(max_error)
end
136
+ end
137
+ end
@@ -0,0 +1,359 @@
1
+ # encoding: utf-8
2
+ #
3
+ # Statistical Analysis of Polling Results (SAPoR)
4
+ # Copyright (C) 2014 Filip van Laenen <f.a.vanlaenen@ieee.org>
5
+ #
6
+ # This file is part of SAPoR.
7
+ #
8
+ # SAPoR is free software: you can redistribute it and/or modify it under the
9
+ # terms of the GNU General Public License as published by the Free Software
10
+ # Foundation, either version 3 of the License, or (at your option) any later
11
+ # version.
12
+ #
13
+ # SAPoR is distributed in the hope that it will be useful, but WITHOUT ANY
14
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16
+ #
17
+ # You can find a copy of the GNU General Public License in /doc/gpl.txt
18
+ #
19
+
20
+ require 'prime'
21
+
22
+ module Sapor
23
+ #
24
+ # Represents a polychotomy.
25
+ #
26
+ class Polychotomy
27
+ include NumberFormatter
28
+
29
+ attr_reader :error_estimate, :no_of_data_points, :no_of_simulations
30
+
31
# Sets up a polychotomy analysis.
#
# results::     Hash mapping each choice to its poll result.
# area::        the area; provides the coalitions here and, in other
#               methods, population size, seat count and seat projection.
# dichotomies:: analysis queried for the value ranges of every choice
#               (see extract_ranges_from_dichotomies).
# max_error::   the maximum error, used to derive the confidence level of
#               those ranges.
def initialize(results, area, dichotomies, max_error)
  @results = results
  @area = area
  @choices = results.keys
  @coalitions = area.coalitions
  @ranges = extract_ranges_from_dichotomies(dichotomies, max_error)
  @enum = PseudoRandomMultiRangeEnumerator.new(@ranges.values.map(&:size)).each
  @no_of_simulations = 0
  @no_of_data_points = 0
  @distributions = create_new_votes_distributions # TODO: Rename to @votes
  @seats = create_new_seats_distributions
  # One accumulator per ordered pair of choices, keyed as "a>b".
  @comparisons = {}
  @choices.product(@choices).each do |a, b|
    @comparisons[a + '>' + b] = 0.to_lf
  end
  @combinations_sum = 0.to_lf
  # Starts at 1.0 and is only updated by #refine once earlier simulations
  # exist to compare against.
  @error_estimate = 1.0
end
52
+
53
# The sorted values considered for a choice, as extracted from the
# dichotomies analysis. Returns nil for choices without a range.
def range(choice)
  @ranges[choice]
end

# Size of the search space, as reported by the data point enumerator.
def space_size
  @enum.size
end
60
+
61
# Most probable value in the votes distribution stored under key, or nil
# as long as no simulation has been run.
def most_probable_value(key)
  return nil if @no_of_simulations == 0

  @distributions[key].most_probable_value
end
68
+
69
# Most probable value of the distribution stored under key in the given
# set of distributions, expressed as a fraction of the area's population
# size.
def calculate_most_probable_fraction(key, distributions)
  most_probable = distributions[key].most_probable_value
  most_probable.to_f / @area.population_size
end
72
+
73
# Most probable fraction for key based on the accumulated votes
# distributions, or nil as long as no simulation has been run.
def most_probable_fraction(key)
  return nil if @no_of_simulations == 0

  calculate_most_probable_fraction(key, @distributions)
end
80
+
81
# Runs one refinement round.
#
# Data points are tried until at least one new simulation has been run and
# the number of new simulations is no smaller than the number of
# simulations accumulated so far. Unless this is the first round, the
# error estimate is then recalculated from the new votes distributions,
# before the new votes and seats distributions are merged into the
# accumulated ones.
def refine
  fresh_votes = create_new_votes_distributions
  fresh_seats = create_new_seats_distributions
  fresh_count = 0
  # A plain conditional loop is used on purpose: Kernel#loop would silently
  # rescue a StopIteration raised by the data point enumerator.
  until @no_of_data_points != 0 && fresh_count != 0 &&
        fresh_count >= @no_of_simulations
    fresh_count += try_next_data_point(fresh_votes, fresh_seats)
  end
  # The estimate compares new against old, so it needs earlier simulations.
  @error_estimate = calculate_error_estimate(fresh_votes) if @no_of_simulations != 0
  @distributions = merge_distributions(@distributions, fresh_votes)
  @seats = merge_distributions(@seats, fresh_seats)
  @no_of_simulations += fresh_count
end
96
+
97
# Draws the next data point and simulates it when its value for OTHER is
# not negative. The data point counter is incremented either way.
#
# new_votes:: votes distributions to add the simulation to.
# new_seats:: seats distributions to add the simulation to.
#
# Returns 1 when the data point was simulated, 0 when it was discarded.
def try_next_data_point(new_votes, new_seats)
  candidate = next_data_point
  usable = candidate[OTHER] >= 0
  simulate(new_votes, new_seats, candidate) if usable
  @no_of_data_points += 1
  usable ? 1 : 0
end
108
+
109
# Builds the textual report of the analysis.
#
# The report consists of a title line, a header line and one line per
# choice (ordered by sort_choices_by_result), followed by a header line and
# one line per coalition (ordered by sort_coalitions_by_result). Label
# columns are at least six characters wide.
#
# Returns the report as a single String without a trailing newline.
def report
  choice_width = (@choices.map(&:length) << 6).max
  seats_width = @area.no_of_seats.to_s.size
  ranked_choices = sort_choices_by_result
  choice_lines = ranked_choices.each_with_index.map do |choice, index|
    # Each line also gets the next ranked choice (nil for the last one).
    create_choice_report_line(choice, ranked_choices[index + 1],
                              choice_width, seats_width)
  end
  coalition_width =
    (@coalitions.map { |coalition| coalition_label(coalition).length } << 6).max
  coalition_lines = sort_coalitions_by_result.map do |coalition|
    create_coalition_report_line(coalition, coalition_width, seats_width)
  end
  [
    'Most probable rounded fractions, fractions and 95% confidence intervals:',
    "#{'Choice'.ljust(choice_width)} Result MPRF MPF CI(95%) P(>↓) Seats",
    choice_lines.join("\n"),
    "#{'Coalition'.ljust(coalition_width)} Result MPRF MPF CI(95%) P(>50%) Seats P(>50%)",
    coalition_lines.join("\n")
  ].join("\n")
end
134
+
135
# Describes how far the search has progressed: the number of simulations,
# the number of data points tried, and the ratio between the search space
# size and the number of data points tried.
#
# NOTE(review): if space_size and @no_of_data_points are both Integers the
# division below truncates, which would make round(1) a no-op and raise
# ZeroDivisionError before the first data point — confirm the intended
# numeric types.
def progress_report
  total_space = space_size
  ratio = total_space / @no_of_data_points
  ratio_text = ratio > 10 ? with_thousands_separator(ratio.round) : ratio.round(1)
  "#{with_thousands_separator(@no_of_simulations)} simulations out of " \
    "#{with_thousands_separator(@no_of_data_points)} data points, " \
    "1 / #{ratio_text} of search space size " \
    "(#{with_thousands_separator(total_space)})."
end
147
+
148
+ private
149
+
150
# Collects, for every choice except OTHER, the sorted confidence interval
# values reported by the dichotomies analysis.
#
# dichotomies:: object answering confidence_interval_values(choice, level).
# max_error::   maximum error; the confidence level requested is
#               1 - max_error².
#
# Returns a Hash mapping each choice to its sorted values.
def extract_ranges_from_dichotomies(dichotomies, max_error)
  level = 1 - (max_error**2)
  named_choices = @choices.reject { |choice| choice == OTHER }
  named_choices.each_with_object({}) do |choice, ranges|
    ranges[choice] = dichotomies.confidence_interval_values(choice, level).sort
  end
end
161
+
162
# Creates an empty set of seats distributions: one CombinationsDistribution
# per choice and per coalition, each initialised to zero for every seat
# count from 0 up to and including the area's number of seats.
#
# Returns a Hash keyed by choice and by coalition.
def create_new_seats_distributions
  (@choices + @coalitions).each_with_object({}) do |key, distributions|
    distribution = CombinationsDistribution.new
    (0..@area.no_of_seats).each { |seats| distribution[seats] = 0.to_lf }
    distributions[key] = distribution
  end
end
178
+
179
# Creates an empty set of votes distributions.
#
# Every choice except OTHER gets a CombinationsDistribution initialised to
# zero for each value in its range; every coalition gets an empty
# CombinationsDistribution.
#
# Returns a Hash keyed by choice and by coalition.
def create_new_votes_distributions
  distributions = {}
  @choices.reject { |choice| choice == OTHER }.each do |choice|
    distribution = CombinationsDistribution.new
    @ranges[choice].each { |votes| distribution[votes] = 0.to_lf }
    distributions[choice] = distribution
  end
  @coalitions.each do |coalition|
    distributions[coalition] = CombinationsDistribution.new
  end
  distributions
end
194
+
195
# Fetches the next combination of indexes from the enumerator and turns it
# into a data point.
#
# The i-th index selects a value from the i-th range. The remainder of the
# area's population size is assigned to OTHER; it is negative when the
# selected values add up to more than the population size.
#
# Returns a Hash mapping each ranged choice, and OTHER, to a value.
def next_data_point
  indexes = @enum.next
  # Hoisted out of the loop: Hash#keys and Hash#values build a new array on
  # every call, and this method runs once per data point.
  choices = @ranges.keys
  candidates = @ranges.values
  data_point = {}
  indexes.each_with_index do |ix, i|
    data_point[choices[i]] = candidates[i][ix]
  end
  # The explicit start value keeps the sum at 0 when there are no ranges.
  data_point[OTHER] = @area.population_size - data_point.values.inject(0, :+)
  data_point
end
204
+
205
# Adds one data point to the given votes and seats distributions.
#
# The weight of the data point is the product, over all its entries, of
# BinomialsCache.binomial(value, @results[choice]). That weight is added to
# @combinations_sum, to the votes distribution of every choice except
# OTHER, to the comparison counter of every pair of choices where the first
# has the larger value, to the votes distribution of every coalition, and
# to the seats distributions of every choice and coalition according to
# the seat projection the area returns for the data point.
#
# votes::      votes distributions to update.
# seats::      seats distributions to update.
# data_point:: Hash mapping each choice to its value.
def simulate(votes, seats, data_point)
  combinations = data_point.inject(1.to_lf) do |product, (choice, value)|
    product * BinomialsCache.binomial(value, @results[choice])
  end
  @combinations_sum += combinations

  # Adds the weight under value, creating the entry when it is missing.
  accumulate = lambda do |distribution, value|
    if distribution[value].nil?
      distribution[value] = combinations
    else
      distribution[value] += combinations
    end
  end

  data_point.each do |choice, value|
    votes[choice][value] += combinations unless choice == OTHER
  end
  @choices.each do |a|
    @choices.each do |b|
      @comparisons[a + '>' + b] += combinations if data_point[a] > data_point[b]
    end
  end
  @coalitions.each do |coalition|
    # Members that are absent from the data point count as zero.
    coalition_votes = coalition.map do |choice|
      data_point.key?(choice) ? data_point[choice] : 0
    end.inject(:+)
    accumulate.call(votes[coalition], coalition_votes)
  end

  projection = @area.seats(data_point)
  @choices.each do |choice|
    # Choices that are absent from the projection get zero seats.
    won = projection.key?(choice) ? projection[choice] : 0
    seats[choice][won] += combinations
  end
  @coalitions.each do |coalition|
    coalition_seats = coalition.map do |choice|
      projection.key?(choice) ? projection[choice] : 0
    end.inject(:+)
    accumulate.call(seats[coalition], coalition_seats)
  end
end
246
+
247
# Estimates the error as the largest absolute difference, over all choices
# except OTHER, between the most probable fraction in the new simulations
# and the one in the accumulated distributions.
#
# new_simulations:: the votes distributions of the latest round.
#
# Returns the largest difference, or 0 when there are no such choices.
def calculate_error_estimate(new_simulations)
  named_choices = @choices.reject { |choice| choice == OTHER }
  named_choices.inject(0) do |worst, choice|
    fresh = calculate_most_probable_fraction(choice, new_simulations)
    known = calculate_most_probable_fraction(choice, @distributions)
    [worst, (fresh - known).abs].max
  end
end
259
+
260
# Merges two sets of distributions by adding, with +, the distributions
# stored under the same key. Only the choices other than OTHER and the
# coalitions are carried over.
#
# Returns a new Hash; neither argument is modified by this method itself.
def merge_distributions(distributions1, distributions2)
  keys = @choices.reject { |choice| choice == OTHER } + @coalitions
  keys.each_with_object({}) do |key, merged|
    merged[key] = distributions1[key] + distributions2[key]
  end
end
274
+
275
# Lists all choices except OTHER, ordered by descending result; choices
# with equal results are ordered by name.
def sort_choices_by_result
  named_choices = @choices.reject { |choice| choice == OTHER }
  # Negating the result turns the ascending sort into a descending one.
  named_choices.sort_by { |choice| [-result(choice), choice] }
end
286
+
287
# Label of a coalition: the names of its members in sorted order, joined
# with ' + '.
def coalition_label(coalition)
  members = coalition.sort
  members.join(' + ')
end
290
+
291
# Lists the coalitions ordered by descending combined result; coalitions
# with equal results are ordered by label.
#
# The combined result comes from coalition_result. Ranking with result
# would look the coalition itself up in @results, find nothing and treat
# every coalition as 0.0, leaving only the label order.
def sort_coalitions_by_result
  # Negating the result turns the ascending sort into a descending one.
  @coalitions.sort_by do |coalition|
    [-coalition_result(coalition), coalition_label(coalition)]
  end
end
301
+
302
# Poll result of a single choice as a fraction of the sum of all results.
# A choice without a result counts as 0.0.
def result(choice)
  total = @results.values.inject(:+)
  @results[choice].to_f / total
end

# Combined poll result of the members of a coalition as a fraction of the
# sum of all results. Members without a result count as 0.0.
def coalition_result(coalition)
  combined = coalition.map { |choice| @results[choice].to_f }.inject(:+)
  combined / @results.values.inject(:+)
end
309
+
310
# Probability that choice a gets a larger value than choice b: the weight
# accumulated for "a>b" divided by the total weight of all simulations.
#
# The quotient is converted to a regular number from its mantissa and its
# base 10 exponent.
def larger_than(a, b)
  ratio = @comparisons[a + '>' + b] / @combinations_sum
  scale = 10**ratio.exponent
  ratio.mantissa * scale
end
314
+
315
# Builds the report line for a single choice: label, poll result, most
# probable rounded fraction, most probable fraction, 95 % confidence
# interval of the votes fraction, probability of beating the next choice
# and 95 % confidence interval of the seats.
#
# choice::           the choice to report on.
# next_choice::      the choice ranked directly below it, or nil.
# max_choice_width:: width of the label column.
# max_seats_width::  width of each seats figure.
def create_choice_report_line(choice, next_choice, max_choice_width, max_seats_width)
  vote_bounds = @distributions[choice].confidence_interval(0.95).map do |votes|
    votes.to_f / @area.population_size
  end
  seat_bounds = @seats[choice].confidence_interval(0.95)
  [
    choice.ljust(max_choice_width), ' ',
    six_char_percentage(result(choice)), ' ',
    six_char_percentage(most_probable_rounded_fraction(choice)), ' ',
    six_char_percentage(most_probable_fraction(choice)), ' ',
    six_char_percentage(vote_bounds.first), '–',
    six_char_percentage(vote_bounds.last), ' ',
    # The last choice has nothing below it to be compared with.
    next_choice.nil? ? ' ' : six_char_percentage(larger_than(choice, next_choice)),
    ' ',
    max_seats_width == 1 ? ' ' : '',
    seat_bounds.first.to_s.rjust(max_seats_width), '–',
    seat_bounds.last.to_s.rjust(max_seats_width)
  ].join
end
330
+
331
# Builds the report line for a coalition: label, combined poll result, most
# probable rounded fraction, most probable fraction, 95 % confidence
# interval of the votes fraction, probability of the votes distribution
# passing the 0.5 threshold, 95 % confidence interval of the seats and
# probability of reaching a majority of the seats.
#
# coalition::           the coalition to report on.
# max_coalition_width:: width of the label column.
# max_seats_width::     width of each seats figure.
def create_coalition_report_line(coalition, max_coalition_width, max_seats_width)
  votes = @distributions[coalition]
  vote_values = votes.confidence_interval(0.95)
  votes_majority_probability = votes.threshold_probability(0.5, @area.population_size)
  vote_bounds = vote_values.map { |value| value.to_f / @area.population_size }
  seat_bounds = @seats[coalition].confidence_interval(0.95)
  # Smallest number of seats that is more than half of all seats.
  seats_needed = 1 + @area.no_of_seats / 2
  seats_majority_probability = @seats[coalition].value_threshold_probability(seats_needed)
  [
    coalition_label(coalition).ljust(max_coalition_width), ' ',
    six_char_percentage(coalition_result(coalition)), ' ',
    six_char_percentage(most_probable_rounded_fraction(coalition)), ' ',
    six_char_percentage(most_probable_fraction(coalition)), ' ',
    six_char_percentage(vote_bounds.first), '–',
    six_char_percentage(vote_bounds.last), ' ',
    six_char_percentage(votes_majority_probability), ' ',
    max_seats_width == 1 ? ' ' : '',
    seat_bounds.first.to_s.rjust(max_seats_width), '–',
    seat_bounds.last.to_s.rjust(max_seats_width), ' ',
    six_char_percentage(seats_majority_probability)
  ].join
end
350
+
351
# Most probable rounded fraction of the votes distribution stored under
# key, relative to the area's population size, or nil as long as no
# simulation has been run.
def most_probable_rounded_fraction(key)
  return nil if @no_of_simulations == 0

  @distributions[key].most_probable_rounded_fraction(@area.population_size)
end
358
+ end
359
+ end