modesty 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (130) hide show
  1. data/Gemfile +13 -0
  2. data/Gemfile.lock +18 -0
  3. data/LICENSE +21 -0
  4. data/README.md +121 -0
  5. data/Rakefile +29 -0
  6. data/VERSION +1 -0
  7. data/init.rb +1 -0
  8. data/lib/modesty.rb +26 -0
  9. data/lib/modesty/api.rb +14 -0
  10. data/lib/modesty/core_ext.rb +5 -0
  11. data/lib/modesty/core_ext/array.rb +21 -0
  12. data/lib/modesty/core_ext/fixnum.rb +5 -0
  13. data/lib/modesty/core_ext/hash.rb +39 -0
  14. data/lib/modesty/core_ext/string.rb +9 -0
  15. data/lib/modesty/core_ext/symbol.rb +33 -0
  16. data/lib/modesty/datastore.rb +51 -0
  17. data/lib/modesty/datastore/redis.rb +180 -0
  18. data/lib/modesty/experiment.rb +87 -0
  19. data/lib/modesty/experiment/base.rb +47 -0
  20. data/lib/modesty/experiment/builder.rb +48 -0
  21. data/lib/modesty/experiment/console.rb +4 -0
  22. data/lib/modesty/experiment/data.rb +75 -0
  23. data/lib/modesty/experiment/interface.rb +29 -0
  24. data/lib/modesty/experiment/significance.rb +376 -0
  25. data/lib/modesty/experiment/stats.rb +163 -0
  26. data/lib/modesty/frameworks/rails.rb +27 -0
  27. data/lib/modesty/identity.rb +32 -0
  28. data/lib/modesty/load.rb +80 -0
  29. data/lib/modesty/load/load_experiments.rb +14 -0
  30. data/lib/modesty/load/load_metrics.rb +17 -0
  31. data/lib/modesty/metric.rb +56 -0
  32. data/lib/modesty/metric/base.rb +38 -0
  33. data/lib/modesty/metric/builder.rb +23 -0
  34. data/lib/modesty/metric/data.rb +133 -0
  35. data/modesty.gemspec +192 -0
  36. data/spec/core_ext_spec.rb +17 -0
  37. data/spec/experiment_spec.rb +239 -0
  38. data/spec/identity_spec.rb +161 -0
  39. data/spec/load_spec.rb +87 -0
  40. data/spec/metric_spec.rb +176 -0
  41. data/spec/rails_spec.rb +48 -0
  42. data/spec/redis_spec.rb +29 -0
  43. data/spec/significance_spec.rb +147 -0
  44. data/spec/spec.opts +1 -0
  45. data/test/myapp/config/modesty.yml +9 -0
  46. data/test/myapp/modesty/experiments/cookbook.rb +4 -0
  47. data/test/myapp/modesty/metrics/kitchen_metrics.rb +9 -0
  48. data/test/myapp/modesty/metrics/stove/burner_metrics.rb +2 -0
  49. data/vendor/.piston.yml +8 -0
  50. data/vendor/mock_redis/.gitignore +2 -0
  51. data/vendor/mock_redis/README +8 -0
  52. data/vendor/mock_redis/lib/mock_redis.rb +10 -0
  53. data/vendor/mock_redis/lib/mock_redis/hash.rb +61 -0
  54. data/vendor/mock_redis/lib/mock_redis/list.rb +6 -0
  55. data/vendor/mock_redis/lib/mock_redis/misc.rb +69 -0
  56. data/vendor/mock_redis/lib/mock_redis/set.rb +108 -0
  57. data/vendor/mock_redis/lib/mock_redis/string.rb +32 -0
  58. data/vendor/redis-rb/.gitignore +8 -0
  59. data/vendor/redis-rb/LICENSE +20 -0
  60. data/vendor/redis-rb/README.markdown +129 -0
  61. data/vendor/redis-rb/Rakefile +155 -0
  62. data/vendor/redis-rb/benchmarking/logging.rb +62 -0
  63. data/vendor/redis-rb/benchmarking/pipeline.rb +51 -0
  64. data/vendor/redis-rb/benchmarking/speed.rb +21 -0
  65. data/vendor/redis-rb/benchmarking/suite.rb +24 -0
  66. data/vendor/redis-rb/benchmarking/thread_safety.rb +38 -0
  67. data/vendor/redis-rb/benchmarking/worker.rb +71 -0
  68. data/vendor/redis-rb/examples/basic.rb +15 -0
  69. data/vendor/redis-rb/examples/dist_redis.rb +43 -0
  70. data/vendor/redis-rb/examples/incr-decr.rb +17 -0
  71. data/vendor/redis-rb/examples/list.rb +26 -0
  72. data/vendor/redis-rb/examples/pubsub.rb +31 -0
  73. data/vendor/redis-rb/examples/sets.rb +36 -0
  74. data/vendor/redis-rb/examples/unicorn/config.ru +3 -0
  75. data/vendor/redis-rb/examples/unicorn/unicorn.rb +20 -0
  76. data/vendor/redis-rb/lib/redis.rb +676 -0
  77. data/vendor/redis-rb/lib/redis/client.rb +201 -0
  78. data/vendor/redis-rb/lib/redis/compat.rb +21 -0
  79. data/vendor/redis-rb/lib/redis/connection.rb +134 -0
  80. data/vendor/redis-rb/lib/redis/distributed.rb +526 -0
  81. data/vendor/redis-rb/lib/redis/hash_ring.rb +131 -0
  82. data/vendor/redis-rb/lib/redis/pipeline.rb +13 -0
  83. data/vendor/redis-rb/lib/redis/subscribe.rb +79 -0
  84. data/vendor/redis-rb/redis.gemspec +29 -0
  85. data/vendor/redis-rb/test/commands_on_hashes_test.rb +46 -0
  86. data/vendor/redis-rb/test/commands_on_lists_test.rb +50 -0
  87. data/vendor/redis-rb/test/commands_on_sets_test.rb +78 -0
  88. data/vendor/redis-rb/test/commands_on_sorted_sets_test.rb +109 -0
  89. data/vendor/redis-rb/test/commands_on_strings_test.rb +70 -0
  90. data/vendor/redis-rb/test/commands_on_value_types_test.rb +88 -0
  91. data/vendor/redis-rb/test/connection_handling_test.rb +87 -0
  92. data/vendor/redis-rb/test/db/.gitignore +1 -0
  93. data/vendor/redis-rb/test/distributd_key_tags_test.rb +53 -0
  94. data/vendor/redis-rb/test/distributed_blocking_commands_test.rb +54 -0
  95. data/vendor/redis-rb/test/distributed_commands_on_hashes_test.rb +12 -0
  96. data/vendor/redis-rb/test/distributed_commands_on_lists_test.rb +18 -0
  97. data/vendor/redis-rb/test/distributed_commands_on_sets_test.rb +85 -0
  98. data/vendor/redis-rb/test/distributed_commands_on_strings_test.rb +50 -0
  99. data/vendor/redis-rb/test/distributed_commands_on_value_types_test.rb +73 -0
  100. data/vendor/redis-rb/test/distributed_commands_requiring_clustering_test.rb +141 -0
  101. data/vendor/redis-rb/test/distributed_connection_handling_test.rb +25 -0
  102. data/vendor/redis-rb/test/distributed_internals_test.rb +18 -0
  103. data/vendor/redis-rb/test/distributed_persistence_control_commands_test.rb +24 -0
  104. data/vendor/redis-rb/test/distributed_publish_subscribe_test.rb +90 -0
  105. data/vendor/redis-rb/test/distributed_remote_server_control_commands_test.rb +31 -0
  106. data/vendor/redis-rb/test/distributed_sorting_test.rb +21 -0
  107. data/vendor/redis-rb/test/distributed_test.rb +60 -0
  108. data/vendor/redis-rb/test/distributed_transactions_test.rb +34 -0
  109. data/vendor/redis-rb/test/encoding_test.rb +16 -0
  110. data/vendor/redis-rb/test/helper.rb +86 -0
  111. data/vendor/redis-rb/test/internals_test.rb +27 -0
  112. data/vendor/redis-rb/test/lint/hashes.rb +90 -0
  113. data/vendor/redis-rb/test/lint/internals.rb +53 -0
  114. data/vendor/redis-rb/test/lint/lists.rb +93 -0
  115. data/vendor/redis-rb/test/lint/sets.rb +66 -0
  116. data/vendor/redis-rb/test/lint/sorted_sets.rb +132 -0
  117. data/vendor/redis-rb/test/lint/strings.rb +98 -0
  118. data/vendor/redis-rb/test/lint/value_types.rb +84 -0
  119. data/vendor/redis-rb/test/persistence_control_commands_test.rb +22 -0
  120. data/vendor/redis-rb/test/pipelining_commands_test.rb +78 -0
  121. data/vendor/redis-rb/test/publish_subscribe_test.rb +151 -0
  122. data/vendor/redis-rb/test/redis_mock.rb +64 -0
  123. data/vendor/redis-rb/test/remote_server_control_commands_test.rb +56 -0
  124. data/vendor/redis-rb/test/sorting_test.rb +44 -0
  125. data/vendor/redis-rb/test/test.conf +8 -0
  126. data/vendor/redis-rb/test/thread_safety_test.rb +34 -0
  127. data/vendor/redis-rb/test/transactions_test.rb +91 -0
  128. data/vendor/redis-rb/test/unknown_commands_test.rb +14 -0
  129. data/vendor/redis-rb/test/url_param_test.rb +52 -0
  130. metadata +277 -0
@@ -0,0 +1,87 @@
1
module Modesty
  class Experiment
    # Raised by the experiment registry below: unknown experiment lookups
    # and duplicate registrations.
    class Error < StandardError; end
  end

  # Experiment registry and lookup helpers. Mixed into Modesty::API at the
  # bottom of this file.
  module ExperimentMethods
    # Registry of experiments keyed by slug.
    #
    # Reading a key that is not present raises Experiment::Error instead of
    # returning nil (Hash#include? does not trigger the default block).
    def experiments
      @experiments ||= Hash.new do |_hash, key|
        raise Experiment::Error, <<-msg.squish
          Unrecognized experiment #{key.inspect}.
        msg
      end
    end

    # Registers +exp+ under +exp.slug+.
    #
    # Raises Experiment::Error when the slug is already registered.
    def add_experiment(exp)
      if experiments.include?(exp.slug)
        raise Experiment::Error, <<-msg
          Experiment #{exp.slug.inspect} already defined!
        msg
      end
      experiments[exp.slug] = exp
    end

    # Builds an experiment for +slug+, optionally configuring it through an
    # Experiment::Builder yielded to the block, creates one sub-metric per
    # (metric, alternative) pair, registers the experiment and returns it.
    def new_experiment(slug, &block)
      exp = Experiment.new(slug)
      yield Experiment::Builder.new(exp) if block

      exp.metrics.each do |metric|
        metric.experiments << exp
        exp.alternatives.each do |alt|
          Modesty.new_metric(
            metric.slug / exp.slug / alt,
            :parent => metric, :experiment => exp
          )
        end
      end

      add_experiment(exp)
      exp
    end

    # Picks the identity to act on. The first key present wins, in the
    # order :identity, :for, :on (even when its value is nil); with none of
    # them present, falls back to Modesty.identity.
    def decide_identity(options)
      [:identity, :for, :on].each do |key|
        return options[key] if options.include?(key)
      end
      Modesty.identity
    end

    # Looks up experiment +sym+, yields an Experiment::Interface bound to
    # the chosen identity (inside with_identity), and returns the
    # interface's last_value.
    #
    # Raises Experiment::Error for an unknown experiment.
    def experiment(sym, options={}, &blk)
      exp = experiments[sym]
      identity = decide_identity(options)
      interface = Experiment::Interface.new(exp, identity)

      with_identity(identity) { yield interface }

      interface.last_value
    end

    # True when the chosen identity is in the given alternative.
    #
    # +sym+ has the form :"experiment/slug/alternative": the last
    # "/"-separated segment is the alternative, the rest is the experiment
    # slug.
    def group?(sym, options={})
      id = decide_identity(options)

      segments = sym.to_s.split(/\//)
      alt = segments.pop.to_sym
      exp = experiments[segments.join('/').to_sym]

      # Ask for the group of the identity chosen above. Experiment#group?
      # takes no identity and always uses the global one, which silently
      # ignored :identity / :for / :on.
      exp.group(id) == alt
    end

    # Returns the alternative the chosen identity belongs to in experiment
    # +sym+.
    def group(sym, options={})
      id = decide_identity(options)
      exp = experiments[sym]
      exp ? exp.group(id) : :control
    end
  end

  class API
    include ExperimentMethods
  end
end
81
+
82
+ require 'modesty/experiment/base'
83
+ require 'modesty/experiment/builder'
84
+ require 'modesty/experiment/data'
85
+ require 'modesty/experiment/interface'
86
+ require 'modesty/experiment/significance'
87
+ require 'modesty/experiment/stats'
@@ -0,0 +1,47 @@
1
module Modesty
  class Experiment
    # Attributes that can be set on an experiment by name
    # (Experiment::Builder consults this list).
    ATTRIBUTES = [
      :description,
    ]

    attr_reader(*ATTRIBUTES)
    attr_reader :slug
    # NOTE: no attr_reader for :metrics; #metrics below is the reader. The
    # previous attr_reader was overridden by it and only produced a
    # method-redefinition warning.

    # slug - identifier the experiment is stored under.
    def initialize(slug)
      @slug = slug
    end

    def inspect
      "#<Modesty::Experiment[ #{self.slug.inspect} ]>"
    end

    # Returns the identity context recorded for a metric (given as a Metric
    # or as its slug); :user when none was recorded.
    def identity_for(sym)
      sym = sym.slug if sym.is_a? Metric
      metric_contexts[sym]
    end

    # Hash of metric slug => identity context, defaulting to :user.
    def metric_contexts
      @metric_contexts ||= Hash.new(:user)
    end

    # Alternatives of this experiment; [:control, :experiment] unless set.
    def alternatives
      @alternatives ||= [:control, :experiment]
    end

    # Without an argument: the array of metrics attached to the experiment.
    #
    # With +alt+: a Hash of metric slug => the result of
    # `metric / (slug / alt)` for every attached metric.
    #
    # Raises Error when +alt+ is not one of #alternatives.
    def metrics(alt=nil)
      @metrics ||= []
      return @metrics unless alt

      unless alternatives.include?(alt)
        raise Error, <<-msg.squish
          Unrecognized alternative #{alt.inspect} for #{self.inspect}.
          Available alternatives: #{self.alternatives.inspect}
        msg
      end

      Hash[@metrics.map { |m| [m.slug, m / (self.slug / alt)] }]
    end
  end
end
@@ -0,0 +1,48 @@
1
module Modesty
  class Experiment
    # Configuration DSL wrapped around a single Experiment instance.
    class Builder
      # exp - the Experiment being configured.
      def initialize(exp)
        @exp = exp
      end

      # `name value` (or `name v1, v2, ...`) for a name listed in
      # Experiment::ATTRIBUTES stores the value(s) in the experiment's
      # instance variable of that name. Anything else is forwarded to the
      # experiment as a zero-argument call.
      def method_missing(name, *args)
        if Experiment::ATTRIBUTES.include?(name) && args.count > 0
          val = (args.count == 1) ? args[0] : args
          @exp.instance_variable_set("@#{name}", val)
        else
          @exp.send(name)
        end
      end

      # Keeps respond_to? / method consistent with method_missing above.
      # `send` can reach private methods, hence include_all = true.
      def respond_to_missing?(name, include_private = false)
        Experiment::ATTRIBUTES.include?(name) ||
          @exp.respond_to?(name, true) ||
          super
      end

      # Sets the experiment's alternatives; :control is prepended when it is
      # not already in the list.
      def alternatives(*alts)
        alts.unshift :control unless alts.include? :control
        @exp.instance_variable_set("@alternatives", alts)
      end

      # Replaces the experiment's metric list with the metrics registered
      # under the given slugs.
      #
      # Raises Modesty::NoMetricError when a lookup yields nil/false.
      def metrics(*args)
        metrics = args.map do |s|
          Modesty.metrics[s] || raise(
            Modesty::NoMetricError,
            # Message fixed: the old text ended with an unmatched quote.
            "Undefined metric #{s.inspect} in experiment #{@exp}"
          )
        end
        @exp.instance_variable_set("@metrics", metrics)
      end

      # Appends a single metric to the experiment.
      #
      # options - :as => context name, stored (as a Symbol) in the
      #           experiment's metric_contexts under +sym+.
      #
      # Raises Modesty::NoMetricError when the metric lookup yields
      # nil/false, and a RuntimeError when unknown options are left over.
      def metric(sym, options={})
        @exp.metrics << (Modesty.metrics[sym] || raise(
          Modesty::NoMetricError,
          "Undefined metric #{sym.inspect} in experiment #{@exp}"
        ))

        if as = options.delete(:as)
          @exp.metric_contexts[sym] = as.to_sym
        end

        unless options.empty?
          raise <<-msg.squish
            unrecognized options
            #{options.keys.inspect}
          msg
        end
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
# Reopens Modesty::Experiment without adding anything. By position this 4-line hunk matches
# lib/modesty/experiment/console.rb in the file list above -- presumably a placeholder (TODO confirm).
+ module Modesty
2
+ class Experiment
3
+ end
4
+ end
@@ -0,0 +1,75 @@
1
module Modesty
  class Experiment
    # Datastore-backed view of this experiment, built once from the
    # ExperimentData class nested under the class of Modesty.data.
    def data
      @data ||= begin
        store_class = Modesty.data.class
        store_class::ExperimentData.new(self)
      end
    end

    # Records that an identity is in alternative +alt+.
    #
    # options - :for => identity; without it Modesty.identity is used.
    #
    # Raises Experiment::Error for an unknown alternative and IdentityError
    # when no identity is available. A Datastore::ConnectionError is passed
    # to Modesty.handle_error and +alt+ is returned instead.
    def chooses(alt, options={})
      unless alternatives.include?(alt)
        raise Experiment::Error, <<-msg.squish
          Unknown alternative #{alt.inspect}
        msg
      end

      id = options.fetch(:for) { Modesty.identity }

      unless id
        raise IdentityError, <<-msg.squish
          Experiment#chooses doesn't work for guests.
          Either identify globally or pass in :for => id
        msg
      end

      data.register!(alt, id)
    rescue Datastore::ConnectionError => e
      Modesty.handle_error(e)
      alt
    end

    # Alternative for +id+; a falsy id always gets :control.
    def group(id=Modesty.identity)
      id ? fetch_or_generate_group(id) : :control
    end

    # usage: `e.group?(:experiment)`
    # Compares against the group of the default identity.
    def group?(alt)
      group == alt
    end

    # Number of users (optionally within +alt+). Uses the datastore's own
    # num_users when it has one, otherwise counts #users.
    def num_users(alt=nil)
      store = data
      return store.num_users(alt) if store.respond_to?(:num_users)
      users(alt).count
    end

    # Users of the experiment as reported by the datastore.
    def users(alt=nil)
      data.users(alt)
    end

    private

    # Reads the cached alternative from the datastore; nil (after
    # Modesty.handle_error) when the connection fails.
    def fetch_group(identity)
      data.get_cached_alternative(identity)
    rescue Datastore::ConnectionError => e
      Modesty.handle_error(e)
      nil
    end

    # The method with the fallbacks: use the cached alternative, otherwise
    # generate one. The extra rescue also covers a ConnectionError escaping
    # fetch_group.
    def fetch_or_generate_group(id=Modesty.identity)
      cached = begin
        fetch_group(id)
      rescue Datastore::ConnectionError
        nil
      end
      cached || generate_group(id)
    end

    # Derives an alternative from the hash of slug + identity and stores it
    # via #chooses.
    # NOTE(review): String#hash is presumably seeded per process on modern
    # Rubies, so the pick may only be stable within one process -- confirm.
    def generate_group(identity)
      index = "#{@slug}#{identity}".hash % alternatives.count
      picked = alternatives[index]
      chooses(picked, :for => identity)
      picked
    end
  end
end
@@ -0,0 +1,29 @@
1
module Modesty
  class Experiment
    # the thing yielded when you say `Modesty.experiment :foo do |e| ...`
    class Interface
      # Value of the most recent block run by #group (nil until one ran).
      attr_reader :last_value

      # exp      - the experiment being run.
      # identity - identity whose alternative is resolved once, here.
      def initialize(exp, identity)
        @exp = exp
        @alt = exp.group(identity)
      end

      # Without a block: returns the alternative resolved for this
      # interface's identity.
      #
      # With a block: runs it (and remembers its value) only when +gr+ is
      # that alternative; in every case returns the last remembered value.
      #
      # Uses the alternative resolved in #initialize rather than asking the
      # experiment again: a fresh `@exp.group` falls back to the global
      # identity, which can differ from the identity this interface was
      # built for (and disagrees with #group? below).
      def group(gr=nil)
        return @alt unless block_given?

        @last_value = yield if gr && @alt == gr
        @last_value
      end

      # True when +alt+ is the alternative resolved for this identity.
      def group?(alt)
        alt == @alt
      end
    end
  end
end
@@ -0,0 +1,376 @@
1
+ module Modesty
2
+ class Experiment
3
+ class Significance
4
+
5
# Chi-square lookup table for 1 through 8 degrees of freedom, laid out as
#   degrees_of_freedom => { threshold => value reported once the statistic
#                           exceeds that threshold }
# If we are going to use more degrees of freedom than this we should
# actually have a flat file with the table that we can parse.
CHI_SQUARE_TABLE = {
  1 => { 2.7055 => 0.10, 3.8415 => 0.05, 5.0239 => 0.025, 6.6349 => 0.01, 7.8794 => 0.005 },
  2 => { 4.6052 => 0.10, 5.9915 => 0.05, 7.3778 => 0.025, 9.2104 => 0.01, 10.5965 => 0.005 },
  3 => { 6.2514 => 0.10, 7.8147 => 0.05, 9.3484 => 0.025, 11.349 => 0.01, 12.8381 => 0.005 },
  4 => { 7.7794 => 0.10, 9.4877 => 0.05, 11.1433 => 0.025, 13.2767 => 0.01, 14.860 => 0.005 },
  5 => { 9.236 => 0.10, 11.070 => 0.05, 12.833 => 0.025, 15.086 => 0.01, 16.750 => 0.005 },
  6 => { 10.645 => 0.10, 12.592 => 0.05, 14.449 => 0.025, 16.812 => 0.01, 18.548 => 0.005 },
  7 => { 12.017 => 0.10, 14.067 => 0.05, 16.013 => 0.025, 18.475 => 0.01, 20.278 => 0.005 },
  8 => { 13.362 => 0.10, 15.507 => 0.05, 17.535 => 0.025, 20.090 => 0.01, 21.955 => 0.005 }
}
66
+
67
# Looks up a chi-square result in CHI_SQUARE_TABLE.
#
# args - the rows of the table, one array per row (all the same length).
#
# Degrees of freedom are (rows - 1) * (columns - 1). Returns the table
# value of the largest threshold the statistic exceeds, or nil when it
# exceeds none.
#
# Raises RuntimeError for more than 8 degrees of freedom.
def self.significance(*args)
  df = (args.size - 1) * (args[0].size - 1)
  raise "Currently unimplemented: More than 8 degrees of freedom" if df > 8

  statistic = self.chi_square(args)
  thresholds = CHI_SQUARE_TABLE[df]
  exceeded = thresholds.keys.sort.select { |critical| statistic > critical }
  exceeded.empty? ? nil : thresholds[exceeded.last]
end
79
+
80
# Pools all the histograms into one: returns a hash holding, for every
# value seen in any distribution, the sum of its frequencies. Keys are in
# no particular order and missing keys read as 0.
def self.pool_distributions(distributions)
  distributions.inject(Hash.new(0)) do |pooled, (_name, histogram)|
    histogram.each { |value, count| pooled[value] += count }
    pooled
  end
end
91
+
92
# Take a histogram ({ value => frequency }) and turn it into a flat array
# in which every value appears exactly `frequency` times.
#
# Frequencies are coerced with to_i, like size_total_mean_and_stdev does.
# (The previous `(0..value).each` loop emitted one copy too many of every
# value.)
def self.squash_distribution(distribution)
  flattened = []
  distribution.each do |value, frequency|
    frequency.to_i.times { flattened << value }
  end
  flattened
end
102
+
103
# Picks two non-overlapping random samples of num_elements items each from
# 'array' (the array itself is not modified).
#
# num_elements is truncated to an Integer first: permutation_test passes a
# Float here, and slicing with Float range bounds truncates each bound
# separately, which could hand back samples of different lengths.
#
# Raises RuntimeError when the array cannot supply both samples.
def self.bi_sample_array(array, num_elements)
  count = num_elements.to_i
  raise "We don't have that many elements" unless count * 2 <= array.size

  shuffled = array.shuffle
  [shuffled[0...count], shuffled[count...(2 * count)]]
end
109
+
110
# Returns a copy of the table with totals added: every row gains its row
# sum as a last column, and a final row of column sums (including the grand
# total) is appended.
#
#   add_sums([[1, 2], [3, 4]])  # => [[1, 2, 3], [3, 4, 7], [4, 6, 10]]
#
# Neither +rows+ nor the arrays inside it are modified. The previous
# version pushed the sums onto the caller's arrays, so analysing the same
# data twice compounded the totals.
#
# Raises RuntimeError when the rows differ in length.
def self.add_sums(rows)
  width = rows.empty? ? 0 : rows.first.size

  with_totals = rows.map do |row|
    raise "Unequal sized rows!" if row.size != width
    row + [row.sum]
  end

  column_totals = Array.new(width + 1, 0)
  with_totals.each do |row|
    column_totals = column_totals.zip(row).map { |acc, cell| acc + cell }
  end

  with_totals + [column_totals]
end
123
+
124
# Chi-square statistic of a table of observed counts.
#
# rows - array of equally sized rows.
#
# The table is first extended with totals by add_sums. For every cell the
# expected count is (row total * column total / grand total) and the cell
# contributes (expected - observed)**2 / expected to the result. The totals
# row and column are walked as well.
def self.chi_square(rows)
  table = self.add_sums(rows)
  last_row = table.size - 1
  last_col = table[0].size - 1

  statistic = 0
  table.each_with_index do |row, i|
    row.each_with_index do |observed, j|
      expected = table[i][last_col].to_f * table[last_row][j].to_f /
                 table[last_row][last_col].to_f
      statistic += ((expected - observed)**2) / expected
    end
  end
  statistic
end
139
+
140
+
141
# Summary statistics of a histogram.
#
# distribution - enumerable of [value, frequency] pairs (e.g. a Hash); both
#                members are converted with to_i.
#
# Returns [size, total, mean, std_dev] where size is the sum of the
# frequencies, total the frequency-weighted sum of the values, and std_dev
# the square root of the frequency-weighted mean squared deviation from
# the mean (divided by size, not size - 1).
def self.size_total_mean_and_stdev(distribution)
  pairs = distribution.map { |pair| [pair[0].to_i, pair[1].to_i] }

  total = pairs.inject(0) { |acc, (value, freq)| acc + value * freq }
  size = pairs.inject(0) { |acc, (_value, freq)| acc + freq }
  mean = total.to_f / size

  squared_error = pairs.inject(0) do |acc, (value, freq)|
    acc + freq * ((value - mean)**2)
  end
  std_dev = (squared_error.to_f / size) ** (0.5)

  [size, total, mean, std_dev]
end
160
+
161
+
162
# assume infinite df. Numbers here are huge
# threshold => value reported once the t statistic exceeds the threshold.
SIGNIFICANCE_VALUES = {1.282 => 0.10, 1.645 => 0.05, 1.960 => 0.025,
                       2.326 => 0.01, 2.576 => 0.005}

# Maps a column index of SIGNIFICANCE_VALUES_FOR_V to the value reported
# for that column (1 - the column's percentage in the header row below).
# There is one entry per column; indexes 8..10 used to be missing, so the
# largest t values looked up nil.
LOOKUP_SIGNIFICANCE_TABLE = {
  0  => 0.25,
  1  => 0.20,
  2  => 0.15,
  3  => 0.10,
  4  => 0.05,
  5  => 0.025,
  6  => 0.01,
  7  => 0.005,
  8  => 0.0025,
  9  => 0.001,
  10 => 0.0005,
}

# Taken from Wikipedia's page on Student's T distribution.
# Keyed by V; key 0 holds the same thresholds as SIGNIFICANCE_VALUES (the
# "infinite df" values).
SIGNIFICANCE_VALUES_FOR_V = {
  # V        75%    80%    85%    90%    95%    97.5%  99%    99.5%  99.75% 99.9%  99.95%
  1   => [ 1.000, 1.376, 1.963, 3.078, 6.314, 12.71, 31.82, 63.66, 127.3, 318.3, 636.6 ],
  2   => [ 0.816, 1.061, 1.386, 1.886, 2.920, 4.303, 6.965, 9.925, 14.09, 22.33, 31.60 ],
  3   => [ 0.765, 0.978, 1.250, 1.638, 2.353, 3.182, 4.541, 5.841, 7.453, 10.21, 12.92 ],
  4   => [ 0.741, 0.941, 1.190, 1.533, 2.132, 2.776, 3.747, 4.604, 5.598, 7.173, 8.610 ],
  5   => [ 0.727, 0.920, 1.156, 1.476, 2.015, 2.571, 3.365, 4.032, 4.773, 5.893, 6.869 ],
  6   => [ 0.718, 0.906, 1.134, 1.440, 1.943, 2.447, 3.143, 3.707, 4.317, 5.208, 5.959 ],
  7   => [ 0.711, 0.896, 1.119, 1.415, 1.895, 2.365, 2.998, 3.499, 4.029, 4.785, 5.408 ],
  8   => [ 0.706, 0.889, 1.108, 1.397, 1.860, 2.306, 2.896, 3.355, 3.833, 4.501, 5.041 ],
  9   => [ 0.703, 0.883, 1.100, 1.383, 1.833, 2.262, 2.821, 3.250, 3.690, 4.297, 4.781 ],
  10  => [ 0.700, 0.879, 1.093, 1.372, 1.812, 2.228, 2.764, 3.169, 3.581, 4.144, 4.587 ],
  11  => [ 0.697, 0.876, 1.088, 1.363, 1.796, 2.201, 2.718, 3.106, 3.497, 4.025, 4.437 ],
  12  => [ 0.695, 0.873, 1.083, 1.356, 1.782, 2.179, 2.681, 3.055, 3.428, 3.930, 4.318 ],
  13  => [ 0.694, 0.870, 1.079, 1.350, 1.771, 2.160, 2.650, 3.012, 3.372, 3.852, 4.221 ],
  14  => [ 0.692, 0.868, 1.076, 1.345, 1.761, 2.145, 2.624, 2.977, 3.326, 3.787, 4.140 ],
  15  => [ 0.691, 0.866, 1.074, 1.341, 1.753, 2.131, 2.602, 2.947, 3.286, 3.733, 4.073 ],
  16  => [ 0.690, 0.865, 1.071, 1.337, 1.746, 2.120, 2.583, 2.921, 3.252, 3.686, 4.015 ],
  17  => [ 0.689, 0.863, 1.069, 1.333, 1.740, 2.110, 2.567, 2.898, 3.222, 3.646, 3.965 ],
  18  => [ 0.688, 0.862, 1.067, 1.330, 1.734, 2.101, 2.552, 2.878, 3.197, 3.610, 3.922 ],
  19  => [ 0.688, 0.861, 1.066, 1.328, 1.729, 2.093, 2.539, 2.861, 3.174, 3.579, 3.883 ],
  20  => [ 0.687, 0.860, 1.064, 1.325, 1.725, 2.086, 2.528, 2.845, 3.153, 3.552, 3.850 ],
  21  => [ 0.686, 0.859, 1.063, 1.323, 1.721, 2.080, 2.518, 2.831, 3.135, 3.527, 3.819 ],
  22  => [ 0.686, 0.858, 1.061, 1.321, 1.717, 2.074, 2.508, 2.819, 3.119, 3.505, 3.792 ],
  23  => [ 0.685, 0.858, 1.060, 1.319, 1.714, 2.069, 2.500, 2.807, 3.104, 3.485, 3.767 ],
  24  => [ 0.685, 0.857, 1.059, 1.318, 1.711, 2.064, 2.492, 2.797, 3.091, 3.467, 3.745 ],
  25  => [ 0.684, 0.856, 1.058, 1.316, 1.708, 2.060, 2.485, 2.787, 3.078, 3.450, 3.725 ],
  26  => [ 0.684, 0.856, 1.058, 1.315, 1.706, 2.056, 2.479, 2.779, 3.067, 3.435, 3.707 ],
  27  => [ 0.684, 0.855, 1.057, 1.314, 1.703, 2.052, 2.473, 2.771, 3.057, 3.421, 3.690 ],
  28  => [ 0.683, 0.855, 1.056, 1.313, 1.701, 2.048, 2.467, 2.763, 3.047, 3.408, 3.674 ],
  29  => [ 0.683, 0.854, 1.055, 1.311, 1.699, 2.045, 2.462, 2.756, 3.038, 3.396, 3.659 ],
  30  => [ 0.683, 0.854, 1.055, 1.310, 1.697, 2.042, 2.457, 2.750, 3.030, 3.385, 3.646 ],
  40  => [ 0.681, 0.851, 1.050, 1.303, 1.684, 2.021, 2.423, 2.704, 2.971, 3.307, 3.551 ],
  50  => [ 0.679, 0.849, 1.047, 1.299, 1.676, 2.009, 2.403, 2.678, 2.937, 3.261, 3.496 ],
  60  => [ 0.679, 0.848, 1.045, 1.296, 1.671, 2.000, 2.390, 2.660, 2.915, 3.232, 3.460 ],
  80  => [ 0.678, 0.846, 1.043, 1.292, 1.664, 1.990, 2.374, 2.639, 2.887, 3.195, 3.416 ],
  100 => [ 0.677, 0.845, 1.042, 1.290, 1.660, 1.984, 2.364, 2.626, 2.871, 3.174, 3.390 ],
  120 => [ 0.677, 0.845, 1.041, 1.289, 1.658, 1.980, 2.358, 2.617, 2.860, 3.160, 3.373 ],
  0   => [ 0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326, 2.576, 2.807, 3.090, 3.291 ],
}

# Calculate the p_value for a given t,v from the student's t distribution.
#
# t_val - the t statistic.
# v_val - key into SIGNIFICANCE_VALUES_FOR_V (default 0).
#
# Returns the LOOKUP_SIGNIFICANCE_TABLE entry for the largest threshold in
# the row that t_val exceeds; nil when there is no row for v_val or t_val
# exceeds no threshold.
#
# Defined on the class because welch_t_test calls it from a class method.
# The former instance-method version could not be reached from there and
# also read an undefined local (`v`), so it raised on every call.
def self.calculate_p_value(t_val, v_val=0)
  thresholds = SIGNIFICANCE_VALUES_FOR_V[v_val]
  return nil unless thresholds

  # find the largest value that t_val is greater than
  column = thresholds.sort.rindex { |threshold| t_val > threshold }

  # return the p_value that corresponds to it
  column && LOOKUP_SIGNIFICANCE_TABLE[column]
end
239
+
240
# Computes summary statistics for each named histogram.
#
# distributions - hash of {name => histogram}
#
# Returns a hash of name => {:size, :total, :mean, :sdev}, filled from
# size_total_mean_and_stdev.
def self.calculate_histogram_stats(distributions)
  stats = {}
  distributions.each do |name, histogram|
    size, tot, mean, sdev = self.size_total_mean_and_stdev(histogram)
    stats[name] = { :size => size, :total => tot, :mean => mean, :sdev => sdev }
  end
  stats
end
250
+
251
+
252
# Pooled-variance Student's t test between two distributions.
#
# distributions - hash of {name => histogram}
#
# Assumptions:
#   * the significance check is made against V = infinity
#     (SIGNIFICANCE_VALUES)
#   * the standard deviations of both distributions are treated as equal
#
#   t = (mean1 - mean2) / (pooled_sdev * sqrt(1/n1 + 1/n2))
#
#   pooled_sdev = sqrt( ((n1 - 1) * sdev1**2 + (n2 - 1) * sdev2**2)
#                       / (n1 + n2 - 2) )
#
# Returns the per-distribution stats merged with :significant => the
# SIGNIFICANCE_VALUES entry of the largest threshold |t| exceeds (nil when
# none). With anything other than exactly two distributions only the stats
# are returned (more than two would need ANOVA).
def self.dist_significance(distributions)
  stats = self.calculate_histogram_stats(distributions)
  # for now can only test for significance pairwise
  return stats if distributions.keys.size != 2

  groups = stats.values
  weighted_variance = groups.map { |g| (g[:size] - 1) * (g[:sdev] ** 2) }.sum
  total_size = groups.map { |g| g[:size] }.sum
  pooled_sdev = (weighted_variance / (total_size - 2)) ** 0.5

  inverse_sizes = groups.map { |g| 1.0 / g[:size] }.sum
  t_val = (groups.first[:mean] - groups.last[:mean]) /
          (pooled_sdev * (inverse_sizes ** 0.5))
  t_val = t_val.abs

  current_sig = nil
  SIGNIFICANCE_VALUES.keys.sort.each do |threshold|
    current_sig = SIGNIFICANCE_VALUES[threshold] if t_val > threshold
  end

  stats.merge(:significant => current_sig)
end
292
+
293
# Student's t test for unequal sizes / unequal variances (Welch).
#
# distributions - hash of {name => histogram}; the first and the last
#                 entry are compared.
#
#   t = (mean1 - mean2) / sqrt(sdev1**2 / n1 + sdev2**2 / n2)
#
# The degrees of freedom given by the Welch-Satterthwaite approximation,
#
#   v = (sdev1**2/n1 + sdev2**2/n2)**2
#       / (sdev1**4 / (n1**2 * (n1-1)) + sdev2**4 / (n2**2 * (n2-1)))
#
# are not computed: the lookup is done with v = 0, on the assumption that
# infinite degrees of freedom are acceptable.
#
# Returns the per-distribution stats merged with :significant => the
# result of calculate_p_value for |t|.
def self.welch_t_test(distributions)
  stats = self.calculate_histogram_stats(distributions)
  groups = stats.values

  variance_terms = groups.map { |g| (g[:sdev]**2) / g[:size] }
  standard_error = variance_terms.sum ** 0.5
  t_val = ((groups.first[:mean] - groups.last[:mean]) / standard_error).abs

  stats.merge(:significant => calculate_p_value(t_val, 0))
end
330
+
331
# Permutation test on the difference of means.
#
# ideal: build all possible permutations of sample_size from distributions
# and then compare them to each other. Since there is no 're-usage' of any
# item, twice the sample size must fit into the pooled population.
#
# distributions          - hash of {name => histogram}; the first and last
#                          entry provide the observed difference of means.
# num_permutations       - number of random sample pairs to draw.
# sample_size_percentage - fraction of the pooled population used for each
#                          of the two samples.
#
# Returns 1.0 minus the fraction of sampled mean differences that are not
# greater than the observed one.
#
# Fixes over the previous version: exactly num_permutations pairs are drawn
# (the old inclusive range drew one more, so the result could drop below
# 0), and the sample size is floored to an Integer before sampling.
def self.permutation_test(distributions, num_permutations=1000,
                          sample_size_percentage=0.2)
  stats = self.calculate_histogram_stats(distributions)
  # Data comes in histogram form: pool all histograms, then flatten the
  # pooled histogram into an array holding every observation.
  pooled_dist = self.pool_distributions(distributions)
  pooled_flat = self.squash_distribution(pooled_dist)
  sample_size = (pooled_flat.count * sample_size_percentage).floor

  # Let's try with ruby's random number generator for a while.
  mean_differences = []
  num_permutations.times do
    first, second = self.bi_sample_array(pooled_flat, sample_size)
    mean_differences << (first.mean - second.mean).abs
  end
  mean_differences = mean_differences.sort

  dist_mean_diff = (stats.values.first[:mean] - stats.values.last[:mean]).abs

  # Position of the first sampled difference greater than the observed
  # one; the array size when there is none.
  fit_index = mean_differences.index { |diff| dist_mean_diff < diff } ||
              mean_differences.count

  fp = fit_index.to_f / num_permutations.to_f

  # We want to return 1 - the possibility, I guess.
  1.0 - fp
end
374
+ end
375
+ end
376
+ end