modesty 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. data/Gemfile +13 -0
  2. data/Gemfile.lock +18 -0
  3. data/LICENSE +21 -0
  4. data/README.md +121 -0
  5. data/Rakefile +29 -0
  6. data/VERSION +1 -0
  7. data/init.rb +1 -0
  8. data/lib/modesty.rb +26 -0
  9. data/lib/modesty/api.rb +14 -0
  10. data/lib/modesty/core_ext.rb +5 -0
  11. data/lib/modesty/core_ext/array.rb +21 -0
  12. data/lib/modesty/core_ext/fixnum.rb +5 -0
  13. data/lib/modesty/core_ext/hash.rb +39 -0
  14. data/lib/modesty/core_ext/string.rb +9 -0
  15. data/lib/modesty/core_ext/symbol.rb +33 -0
  16. data/lib/modesty/datastore.rb +51 -0
  17. data/lib/modesty/datastore/redis.rb +180 -0
  18. data/lib/modesty/experiment.rb +87 -0
  19. data/lib/modesty/experiment/base.rb +47 -0
  20. data/lib/modesty/experiment/builder.rb +48 -0
  21. data/lib/modesty/experiment/console.rb +4 -0
  22. data/lib/modesty/experiment/data.rb +75 -0
  23. data/lib/modesty/experiment/interface.rb +29 -0
  24. data/lib/modesty/experiment/significance.rb +376 -0
  25. data/lib/modesty/experiment/stats.rb +163 -0
  26. data/lib/modesty/frameworks/rails.rb +27 -0
  27. data/lib/modesty/identity.rb +32 -0
  28. data/lib/modesty/load.rb +80 -0
  29. data/lib/modesty/load/load_experiments.rb +14 -0
  30. data/lib/modesty/load/load_metrics.rb +17 -0
  31. data/lib/modesty/metric.rb +56 -0
  32. data/lib/modesty/metric/base.rb +38 -0
  33. data/lib/modesty/metric/builder.rb +23 -0
  34. data/lib/modesty/metric/data.rb +133 -0
  35. data/modesty.gemspec +192 -0
  36. data/spec/core_ext_spec.rb +17 -0
  37. data/spec/experiment_spec.rb +239 -0
  38. data/spec/identity_spec.rb +161 -0
  39. data/spec/load_spec.rb +87 -0
  40. data/spec/metric_spec.rb +176 -0
  41. data/spec/rails_spec.rb +48 -0
  42. data/spec/redis_spec.rb +29 -0
  43. data/spec/significance_spec.rb +147 -0
  44. data/spec/spec.opts +1 -0
  45. data/test/myapp/config/modesty.yml +9 -0
  46. data/test/myapp/modesty/experiments/cookbook.rb +4 -0
  47. data/test/myapp/modesty/metrics/kitchen_metrics.rb +9 -0
  48. data/test/myapp/modesty/metrics/stove/burner_metrics.rb +2 -0
  49. data/vendor/.piston.yml +8 -0
  50. data/vendor/mock_redis/.gitignore +2 -0
  51. data/vendor/mock_redis/README +8 -0
  52. data/vendor/mock_redis/lib/mock_redis.rb +10 -0
  53. data/vendor/mock_redis/lib/mock_redis/hash.rb +61 -0
  54. data/vendor/mock_redis/lib/mock_redis/list.rb +6 -0
  55. data/vendor/mock_redis/lib/mock_redis/misc.rb +69 -0
  56. data/vendor/mock_redis/lib/mock_redis/set.rb +108 -0
  57. data/vendor/mock_redis/lib/mock_redis/string.rb +32 -0
  58. data/vendor/redis-rb/.gitignore +8 -0
  59. data/vendor/redis-rb/LICENSE +20 -0
  60. data/vendor/redis-rb/README.markdown +129 -0
  61. data/vendor/redis-rb/Rakefile +155 -0
  62. data/vendor/redis-rb/benchmarking/logging.rb +62 -0
  63. data/vendor/redis-rb/benchmarking/pipeline.rb +51 -0
  64. data/vendor/redis-rb/benchmarking/speed.rb +21 -0
  65. data/vendor/redis-rb/benchmarking/suite.rb +24 -0
  66. data/vendor/redis-rb/benchmarking/thread_safety.rb +38 -0
  67. data/vendor/redis-rb/benchmarking/worker.rb +71 -0
  68. data/vendor/redis-rb/examples/basic.rb +15 -0
  69. data/vendor/redis-rb/examples/dist_redis.rb +43 -0
  70. data/vendor/redis-rb/examples/incr-decr.rb +17 -0
  71. data/vendor/redis-rb/examples/list.rb +26 -0
  72. data/vendor/redis-rb/examples/pubsub.rb +31 -0
  73. data/vendor/redis-rb/examples/sets.rb +36 -0
  74. data/vendor/redis-rb/examples/unicorn/config.ru +3 -0
  75. data/vendor/redis-rb/examples/unicorn/unicorn.rb +20 -0
  76. data/vendor/redis-rb/lib/redis.rb +676 -0
  77. data/vendor/redis-rb/lib/redis/client.rb +201 -0
  78. data/vendor/redis-rb/lib/redis/compat.rb +21 -0
  79. data/vendor/redis-rb/lib/redis/connection.rb +134 -0
  80. data/vendor/redis-rb/lib/redis/distributed.rb +526 -0
  81. data/vendor/redis-rb/lib/redis/hash_ring.rb +131 -0
  82. data/vendor/redis-rb/lib/redis/pipeline.rb +13 -0
  83. data/vendor/redis-rb/lib/redis/subscribe.rb +79 -0
  84. data/vendor/redis-rb/redis.gemspec +29 -0
  85. data/vendor/redis-rb/test/commands_on_hashes_test.rb +46 -0
  86. data/vendor/redis-rb/test/commands_on_lists_test.rb +50 -0
  87. data/vendor/redis-rb/test/commands_on_sets_test.rb +78 -0
  88. data/vendor/redis-rb/test/commands_on_sorted_sets_test.rb +109 -0
  89. data/vendor/redis-rb/test/commands_on_strings_test.rb +70 -0
  90. data/vendor/redis-rb/test/commands_on_value_types_test.rb +88 -0
  91. data/vendor/redis-rb/test/connection_handling_test.rb +87 -0
  92. data/vendor/redis-rb/test/db/.gitignore +1 -0
  93. data/vendor/redis-rb/test/distributd_key_tags_test.rb +53 -0
  94. data/vendor/redis-rb/test/distributed_blocking_commands_test.rb +54 -0
  95. data/vendor/redis-rb/test/distributed_commands_on_hashes_test.rb +12 -0
  96. data/vendor/redis-rb/test/distributed_commands_on_lists_test.rb +18 -0
  97. data/vendor/redis-rb/test/distributed_commands_on_sets_test.rb +85 -0
  98. data/vendor/redis-rb/test/distributed_commands_on_strings_test.rb +50 -0
  99. data/vendor/redis-rb/test/distributed_commands_on_value_types_test.rb +73 -0
  100. data/vendor/redis-rb/test/distributed_commands_requiring_clustering_test.rb +141 -0
  101. data/vendor/redis-rb/test/distributed_connection_handling_test.rb +25 -0
  102. data/vendor/redis-rb/test/distributed_internals_test.rb +18 -0
  103. data/vendor/redis-rb/test/distributed_persistence_control_commands_test.rb +24 -0
  104. data/vendor/redis-rb/test/distributed_publish_subscribe_test.rb +90 -0
  105. data/vendor/redis-rb/test/distributed_remote_server_control_commands_test.rb +31 -0
  106. data/vendor/redis-rb/test/distributed_sorting_test.rb +21 -0
  107. data/vendor/redis-rb/test/distributed_test.rb +60 -0
  108. data/vendor/redis-rb/test/distributed_transactions_test.rb +34 -0
  109. data/vendor/redis-rb/test/encoding_test.rb +16 -0
  110. data/vendor/redis-rb/test/helper.rb +86 -0
  111. data/vendor/redis-rb/test/internals_test.rb +27 -0
  112. data/vendor/redis-rb/test/lint/hashes.rb +90 -0
  113. data/vendor/redis-rb/test/lint/internals.rb +53 -0
  114. data/vendor/redis-rb/test/lint/lists.rb +93 -0
  115. data/vendor/redis-rb/test/lint/sets.rb +66 -0
  116. data/vendor/redis-rb/test/lint/sorted_sets.rb +132 -0
  117. data/vendor/redis-rb/test/lint/strings.rb +98 -0
  118. data/vendor/redis-rb/test/lint/value_types.rb +84 -0
  119. data/vendor/redis-rb/test/persistence_control_commands_test.rb +22 -0
  120. data/vendor/redis-rb/test/pipelining_commands_test.rb +78 -0
  121. data/vendor/redis-rb/test/publish_subscribe_test.rb +151 -0
  122. data/vendor/redis-rb/test/redis_mock.rb +64 -0
  123. data/vendor/redis-rb/test/remote_server_control_commands_test.rb +56 -0
  124. data/vendor/redis-rb/test/sorting_test.rb +44 -0
  125. data/vendor/redis-rb/test/test.conf +8 -0
  126. data/vendor/redis-rb/test/thread_safety_test.rb +34 -0
  127. data/vendor/redis-rb/test/transactions_test.rb +91 -0
  128. data/vendor/redis-rb/test/unknown_commands_test.rb +14 -0
  129. data/vendor/redis-rb/test/url_param_test.rb +52 -0
  130. metadata +277 -0
@@ -0,0 +1,87 @@
1
+ module Modesty
2
# An A/B experiment. Only the error type is declared in this spot.
class Experiment
  # Raised for experiment-level problems such as unknown or duplicate slugs.
  Error = Class.new(StandardError)
end
5
+
6
+ module ExperimentMethods
7
# Registry of defined experiments, keyed by slug.
#
# Built lazily on first use. Reading a slug that was never stored raises
# Experiment::Error instead of returning nil.
def experiments
  return @experiments if @experiments

  @experiments = Hash.new do |_registry, slug|
    raise Experiment::Error, <<-msg.squish
      Unrecognized experiment #{slug.inspect}.
    msg
  end
end
14
+
15
# Registers an experiment under its slug.
#
# exp - object responding to #slug.
#
# Returns exp.
# Raises Experiment::Error when an experiment with that slug is already
# registered.
def add_experiment(exp)
  if self.experiments.include?(exp.slug)
    # squish keeps the heredoc's indentation and trailing newline out of
    # the message, like the other errors raised from this module.
    raise Experiment::Error, <<-msg.squish
      Experiment #{exp.slug.inspect} already defined!
    msg
  end

  self.experiments[exp.slug] = exp
end
21
+
22
# Creates, configures and registers an experiment.
#
# slug  - identifier handed to Experiment.new.
# block - optional; receives an Experiment::Builder wrapping the new
#         experiment.
#
# After the block ran, every metric of the experiment gets the experiment
# appended to its #experiments list, and one child metric is created per
# alternative through Modesty.new_metric.
#
# Returns the new experiment.
def new_experiment(slug, &block)
  created = Experiment.new(slug)
  yield Experiment::Builder.new(created) if block

  created.metrics.each do |metric|
    metric.experiments << created
    created.alternatives.each do |alternative|
      child_slug = metric.slug/created.slug/alternative
      Modesty.new_metric(child_slug, :parent => metric, :experiment => created)
    end
  end

  add_experiment(created)
  created
end
34
+
35
# Picks the identity a call should run against.
#
# options - Hash that may carry the identity under :identity, :for or :on.
#           The keys are checked in that order and the first one present
#           wins, even when its value is nil.
#
# Returns the chosen option value, or Modesty.identity when none of the
# three keys is present.
def decide_identity(options)
  explicit_key = [:identity, :for, :on].find { |key| options.include?(key) }
  explicit_key ? options[explicit_key] : Modesty.identity
end
46
+
47
# Runs a block against an experiment on behalf of an identity.
#
# sym     - slug looked up in #experiments.
# options - Hash passed to decide_identity to pick the identity.
#
# Yields an Experiment::Interface built for the experiment and identity;
# the block runs inside with_identity for that identity.
#
# Returns the interface's last_value once the block has finished.
def experiment(sym, options={}, &blk)
  chosen = self.experiments[sym]
  who = decide_identity(options)
  interface = Experiment::Interface.new(chosen, who)

  self.with_identity(who) { yield interface }

  interface.last_value
end
59
+
60
# Tells whether an identity falls into a given alternative of an experiment.
#
# sym     - Symbol or String of the form "experiment/slug/alternative": the
#           last path segment names the alternative, everything before it
#           is the experiment slug.
# options - Hash passed to decide_identity (:identity, :for or :on).
#
# Returns true when the group chosen for the identity equals the
# alternative. The experiment is looked up through #experiments.
def group?(sym, options={})
  id = decide_identity(options)

  parts = sym.to_s.split('/')
  alt = parts.pop.to_sym
  exp = self.experiments[parts.join('/').to_sym]

  # Compare against the group of the identity resolved above. Delegating to
  # exp.group?(alt) dropped that identity, so :for / :on / :identity had no
  # effect on the answer.
  exp.group(id) == alt
end
69
+
70
# Returns the alternative an identity belongs to in an experiment.
#
# sym     - experiment slug looked up in #experiments.
# options - Hash passed to decide_identity to pick the identity.
#
# Returns exp.group(identity), or :control when the lookup yields nothing.
# NOTE(review): if #experiments raises on unknown slugs, the :control
# fallback is never reached - confirm which behaviour is intended.
def group(sym, options={})
  id = decide_identity(options)
  exp = self.experiments[sym]
  return :control unless exp

  exp.group(id)
end
75
+ end
76
+
77
# Mixes the experiment helpers of ExperimentMethods into API.
class API
  include(ExperimentMethods)
end
80
+ end
81
+
82
+ require 'modesty/experiment/base'
83
+ require 'modesty/experiment/builder'
84
+ require 'modesty/experiment/data'
85
+ require 'modesty/experiment/interface'
86
+ require 'modesty/experiment/significance'
87
+ require 'modesty/experiment/stats'
@@ -0,0 +1,47 @@
1
+ module Modesty
2
+ class Experiment
3
+
4
# slug - identifier of this experiment; exposed through #slug.
def initialize(slug)
  @slug = slug
end
7
+
8
# Short, slug-based representation used in logs and error messages.
def inspect
  '#<Modesty::Experiment[ ' + self.slug.inspect + ' ]>'
end
11
+
12
# Names of the plain attributes of an experiment; each one gets a reader.
# Frozen so the list cannot be changed by accident at runtime.
ATTRIBUTES = [
  :description
].freeze
15
+
16
# Looks up the identity context registered for a metric.
#
# sym - a metric slug, or a Metric (its #slug is used).
#
# Returns the entry of #metric_contexts for that slug.
def identity_for(sym)
  key = sym.is_a?(Metric) ? sym.slug : sym
  self.metric_contexts[key]
end
20
+
21
# Readers for the plain attributes and the slug. No reader is generated for
# :metrics: this class defines #metrics explicitly, which would replace a
# generated reader anyway (and trigger a redefinition warning).
attr_reader(*ATTRIBUTES)
attr_reader :slug
24
+
25
# Per-metric identity contexts, keyed by metric slug.
# Metrics without an explicit entry read as :user.
def metric_contexts
  @metric_contexts = Hash.new(:user) unless @metric_contexts
  @metric_contexts
end
28
+
29
# The experiment's alternatives; [:control, :experiment] until something
# else is assigned to @alternatives.
def alternatives
  @alternatives = [:control, :experiment] unless @alternatives
  @alternatives
end
32
+
33
# Metrics tracked by this experiment.
#
# alt - optional alternative.
#
# Without alt, returns the Array of metrics (created empty on first use).
# With alt, returns a Hash of metric slug => metric / (slug / alt), i.e.
# whatever the metric's #/ yields for this experiment and alternative.
#
# Raises Error when alt is not one of #alternatives.
def metrics(alt=nil)
  @metrics ||= []
  return @metrics unless alt

  unless self.alternatives.include?(alt)
    raise Error, <<-msg.squish
      Unrecognized alternative #{alt.inspect} for #{self.inspect}.
      Available alternatives: #{self.alternatives.inspect}
    msg
  end

  pairs = @metrics.map { |metric| [metric.slug, metric / (self.slug / alt)] }
  Hash[pairs]
end
45
+
46
+ end
47
+ end
@@ -0,0 +1,48 @@
1
+ module Modesty
2
+ class Experiment
3
+ class Builder
4
# Fallback for the builder DSL.
#
# name - Symbol name of the missing method.
# args - arguments given to the call.
#
# When name is one of Experiment::ATTRIBUTES and at least one argument is
# given, the value is stored on the experiment (a single argument as-is,
# several as an Array). Any other call is forwarded to the experiment
# without arguments.
def method_missing(name, *args)
  if Experiment::ATTRIBUTES.include?(name) && args.count > 0
    value = (args.count == 1) ? args.first : args
    @exp.instance_variable_set("@#{name}", value)
  else
    @exp.send(name)
  end
end

# Keeps respond_to? / method in step with method_missing: the builder
# answers to every attribute name and to whatever the wrapped experiment
# answers to.
def respond_to_missing?(name, include_private = false)
  Experiment::ATTRIBUTES.include?(name) ||
    (@exp ? @exp.respond_to?(name, include_private) : false) ||
    super
end
12
+
13
# exp - the experiment this builder configures; kept in @exp.
def initialize(exp)
  @exp = exp
end
16
+
17
# Sets the experiment's alternatives.
#
# alts - alternative names. :control is put in front when it is not among
#        them.
#
# Returns the list stored in the experiment's @alternatives.
def alternatives(*alts)
  alts = [:control] + alts unless alts.include?(:control)
  @exp.instance_variable_set("@alternatives", alts)
end
21
+
22
# Replaces the experiment's metrics.
#
# args - metric slugs, each looked up in Modesty.metrics.
#
# Returns the Array of metrics stored in the experiment's @metrics.
# Raises Modesty::NoMetricError when a lookup yields nothing.
def metrics(*args)
  resolved = args.map do |slug|
    # Message mirrors the one raised by #metric; the previous text carried
    # stray, unbalanced quotes.
    Modesty.metrics[slug] || raise(
      Modesty::NoMetricError,
      "Undefined metric #{slug.inspect} in experiment #{@exp}"
    )
  end
  @exp.instance_variable_set("@metrics", resolved)
end
31
+
32
# Adds one metric to the experiment.
#
# sym     - metric slug, looked up in Modesty.metrics.
# options - Hash; :as names the identity context stored for the metric in
#           the experiment's metric_contexts (converted with to_sym).
#
# Returns nil.
# Raises Modesty::NoMetricError when the lookup yields nothing, and a
# RuntimeError when options holds keys other than :as.
def metric(sym, options={})
  found = Modesty.metrics[sym] || raise(
    Modesty::NoMetricError,
    "Undefined metric #{sym.inspect} in experiment #{@exp}"
  )

  # Work on a copy so the caller's hash keeps its :as entry.
  remaining = options.dup
  as = remaining.delete(:as)

  # Validate before touching the experiment, so a rejected call leaves it
  # unchanged.
  raise <<-msg.squish unless remaining.empty?
    unrecognized options
    #{remaining.keys.inspect}
  msg

  @exp.metrics << found
  @exp.metric_contexts[sym] = as.to_sym if as
  nil
end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,4 @@
1
+ module Modesty
2
+ class Experiment
3
+ end
4
+ end
@@ -0,0 +1,75 @@
1
+ module Modesty
2
+ class Experiment
3
# Storage adapter for this experiment: the ExperimentData class nested in
# the class of Modesty.data, instantiated once for this experiment.
def data
  return @data if @data

  store_class = Modesty.data.class
  @data = store_class::ExperimentData.new(self)
end
6
+
7
# Records that an identity was put into an alternative.
#
# alt     - one of #alternatives.
# options - Hash; :for overrides the identity (default: Modesty.identity).
#
# Returns what data.register! returns. When the datastore connection
# fails, the error is passed to Modesty.handle_error and alt is returned.
#
# Raises Experiment::Error for an unknown alternative and IdentityError
# when there is no identity to register.
def chooses(alt, options={})
  begin
    unless self.alternatives.include?(alt)
      raise Experiment::Error, <<-msg.squish
        Unknown alternative #{alt.inspect}
      msg
    end

    id = options.fetch(:for) { Modesty.identity }

    unless id
      raise IdentityError, <<-msg.squish
        Experiment#chooses doesn't work for guests.
        Either identify globally or pass in :for => id
      msg
    end

    self.data.register!(alt, id)
  rescue Datastore::ConnectionError => e
    Modesty.handle_error(e)
    alt
  end
end
24
+
25
# Alternative for an identity.
#
# id - identity to look up (default: Modesty.identity).
#
# Returns :control when there is no identity, otherwise the result of
# fetch_or_generate_group.
def group(id=Modesty.identity)
  id ? fetch_or_generate_group(id) : :control
end
29
+
30
+ # usage: `e.group?(:experiment)`
31
# True when #group, called without arguments, equals alt.
def group?(alt)
  current = self.group
  current == alt
end
34
+
35
# Number of users recorded for the experiment.
#
# alt - optional alternative, passed through unchanged.
#
# Uses data.num_users when the store offers it; otherwise counts what
# #users returns.
def num_users(alt=nil)
  store = self.data
  return store.num_users(alt) if store.respond_to?(:num_users)

  self.users(alt).count
end
42
+
43
# Users recorded by the datastore for this experiment.
#
# alt - optional alternative, passed through to data.users unchanged.
def users(alt=nil)
  store = self.data
  store.users(alt)
end
46
+
47
+ private
48
+ # used to fetch the cached alternative from redis
49
# Reads the alternative cached for an identity from the datastore.
#
# Returns the cached value; when the connection fails, the error is passed
# to Modesty.handle_error and nil is returned.
def fetch_group(identity)
  begin
    self.data.get_cached_alternative(identity)
  rescue Datastore::ConnectionError => e
    Modesty.handle_error(e)
    nil
  end
end
55
+
56
+ # this is the method with the fallbacks - fetch it from redis or create it.
57
# Cached alternative for an identity, or a freshly generated one.
#
# id - identity (default: Modesty.identity).
#
# A Datastore::ConnectionError raised while fetching counts as "nothing
# cached".
def fetch_or_generate_group(id=Modesty.identity)
  cached = begin
    fetch_group(id)
  rescue Datastore::ConnectionError
    nil
  end

  cached || generate_group(id)
end
64
+
65
# Zlib.crc32 gives the same bucket for the same input in every process.
require 'zlib'

# Picks the alternative for an identity and records the choice through
# #chooses.
#
# The alternative is derived from a CRC32 of the slug followed by the
# identity. String#hash is seeded per process on current Rubies, so using
# it here could hand the same identity different alternatives in different
# processes whenever the recorded choice cannot be read back.
#
# Returns the chosen alternative.
def generate_group(identity)
  options = self.alternatives
  alternative = options[Zlib.crc32("#{@slug}#{identity}") % options.count]
  self.chooses(alternative, :for => identity)
  alternative
end
73
+
74
+ end
75
+ end
@@ -0,0 +1,29 @@
1
+ module Modesty
2
+ class Experiment
3
+ # the thing yielded when you say `Modesty.experiment :foo do |e| ...`
4
# Object yielded to experiment blocks. It resolves the identity's
# alternative once, at construction, and answers every later question from
# that value.
class Interface
  # exp      - object responding to group(identity).
  # identity - identity whose alternative this interface represents.
  def initialize(exp, identity)
    @exp = exp
    @alt = exp.group(identity)
  end

  # Result of the most recent block that #group ran (nil before that).
  attr_reader :last_value

  # Without a block: returns the alternative of this interface's identity.
  #
  # With a block: runs it only when gr is the identity's alternative and
  # remembers its result. Returns last_value in either case.
  #
  # The alternative resolved in #initialize is used rather than asking the
  # experiment again without an identity, so the answer always belongs to
  # the identity this interface was built for.
  def group(gr=nil)
    return @alt unless block_given?

    @last_value = yield if gr && @alt == gr
    @last_value
  end

  # True when alt is the alternative of this interface's identity.
  def group?(alt)
    alt == @alt
  end
end
28
+ end
29
+ end
@@ -0,0 +1,376 @@
1
+ module Modesty
2
+ class Experiment
3
+ class Significance
4
+
5
+ #this is the table for up to 8 degrees of freedom. If we are going to use
6
+ #more than this we should actually have a flat file with the table that we
7
+ #can parse.
8
# Chi-square critical values for 1 to 8 degrees of freedom.
# Layout: degrees of freedom => { critical value => significance level }.
# Every row lists its critical values for the levels 0.10, 0.05, 0.025,
# 0.01 and 0.005, in that order.
CHI_SQUARE_TABLE = {
  1 => [2.7055, 3.8415, 5.0239, 6.6349, 7.8794],
  2 => [4.6052, 5.9915, 7.3778, 9.2104, 10.5965],
  3 => [6.2514, 7.8147, 9.3484, 11.349, 12.8381],
  4 => [7.7794, 9.4877, 11.1433, 13.2767, 14.860],
  5 => [9.236, 11.070, 12.833, 15.086, 16.750],
  6 => [10.645, 12.592, 14.449, 16.812, 18.548],
  7 => [12.017, 14.067, 16.013, 18.475, 20.278],
  8 => [13.362, 15.507, 17.535, 20.090, 21.955]
}.inject({}) do |table, (df, critical_values)|
  table[df] = Hash[critical_values.zip([0.10, 0.05, 0.025, 0.01, 0.005])]
  table
end
66
+
67
# Chi-square significance of a table of counts.
#
# args - the rows of the table, one Array of counts per row.
#
# Returns the smallest significance level in CHI_SQUARE_TABLE whose
# critical value the statistic exceeds, or nil when it exceeds none.
#
# Raises ArgumentError when the table is smaller than 2x2 (there would be
# no degrees of freedom, and no table row to consult), and a RuntimeError
# for more than 8 degrees of freedom.
def self.significance(*args)
  if args.size < 2 || args[0].size < 2
    raise ArgumentError, "Need at least a 2x2 table of counts to test for significance"
  end

  df = (args.size - 1) * (args[0].size - 1)
  raise "Currently unimplemented: More than 8 degrees of freedom" if df > 8

  statistic = self.chi_square(args)
  thresholds = CHI_SQUARE_TABLE[df]
  exceeded = thresholds.keys.select { |critical| statistic > critical }.max
  exceeded && thresholds[exceeded]
end
79
+
80
+ # return an hash with all the values from the distributions in it, not
81
+ # necessarily sorted. Basically, pool all the histograms.
82
# Merges several histograms into one.
#
# distributions - enumerable of (name, histogram) pairs, where a histogram
#                 maps a value to its frequency.
#
# Returns a Hash (default 0) with the frequencies summed per value.
def self.pool_distributions(distributions)
  distributions.each_with_object(Hash.new(0)) do |(_name, frequency_map), pooled|
    frequency_map.each { |value, frequency| pooled[value] += frequency }
  end
end
91
+
92
+ # Take a histogram and turn it into an array
93
# Expands a histogram into a flat Array of values.
#
# distribution - Hash of value => frequency.
#
# Returns an Array holding each value exactly `frequency` times. (Iterating
# 0..frequency would add one extra copy of every value and make values with
# a frequency of 0 show up once.)
def self.squash_distribution(distribution)
  flattened = []
  distribution.each do |value, frequency|
    frequency.times { flattened << value }
  end
  flattened
end
102
+
103
+ # pick two unique samples from 'array' of size num_elements
104
# Draws two non-overlapping random samples from an Array.
#
# array        - pool to draw from.
# num_elements - size of each sample.
#
# Returns [first_sample, second_sample], two consecutive slices of a
# shuffled copy of array.
# Raises a RuntimeError when array has fewer than 2 * num_elements entries.
def self.bi_sample_array(array, num_elements)
  raise "We don't have that many elements" unless num_elements*2 <= array.size

  shuffled = array.shuffle
  first_sample = shuffled[0...num_elements]
  second_sample = shuffled[num_elements...2*num_elements]
  [first_sample, second_sample]
end
109
+
110
# Adds row and column totals to a table of counts.
#
# rows - Array of equally sized Arrays of numbers.
#
# Returns a new table: every row followed by its total, plus a final row
# of column totals (its last cell is the grand total). The rows passed in
# are left untouched, so the same data can be analysed more than once. An
# empty table yields [[0]].
#
# Raises a RuntimeError when the rows differ in size.
def self.add_sums(rows)
  width = rows.empty? ? 0 : rows.first.size

  with_totals = rows.map do |row|
    raise "Unequal sized rows!" if row.size != width
    row + [row.inject(0) { |sum, cell| sum + cell }]
  end

  column_totals = with_totals.inject([0] * (width + 1)) do |totals, row|
    totals.zip(row).map { |total, cell| total + cell }
  end

  with_totals.push(column_totals)
end
123
+
124
# Chi-square statistic of a table of counts.
#
# rows - table of counts; totals are appended through add_sums first.
#
# For every cell of the totalled table the expected count is
# row total * column total / grand total, and the statistic is the sum of
# (expected - observed)**2 / expected over all cells.
def self.chi_square(rows)
  table = self.add_sums(rows)
  last_row = table.size - 1
  last_col = table[0].size - 1

  statistic = 0
  (0..last_row).each do |i|
    (0..last_col).each do |j|
      expected = table[i][last_col].to_f * table[last_row][j].to_f /
                 table[last_row][last_col].to_f
      statistic += ((expected - table[i][j])**2) / expected
    end
  end
  statistic
end
139
+
140
+
141
# Summary statistics of a histogram.
#
# distribution - enumerable of (value, frequency) pairs; both are
#                converted with to_i.
#
# Returns [size, total, mean, std_dev]: the number of observations, their
# sum, their mean and their standard deviation (dividing by size).
def self.size_total_mean_and_stdev(distribution)
  pairs = distribution.map { |pair| [pair[0].to_i, pair[1].to_i] }

  size = pairs.inject(0) { |acc, (_value, freq)| acc + freq }
  total = pairs.inject(0) { |acc, (value, freq)| acc + value * freq }
  mean = total.to_f / size

  squared_error = pairs.inject(0) do |acc, (value, freq)|
    acc + freq * ((value - mean)**2)
  end
  std_dev = (squared_error.to_f / size) ** (0.5)

  [size, total, mean, std_dev]
end
160
+
161
+
162
+ #assume infinite df. Numbers here are huge
163
# Critical t value => significance level.
SIGNIFICANCE_VALUES = {
  1.282 => 0.10,
  1.645 => 0.05,
  1.960 => 0.025,
  2.326 => 0.01,
  2.576 => 0.005
}
165
+
166
+ # Let's also have a table of significant values based on degrees of freedom
167
+ # and see if we can look up data in it
168
+ #
169
# Column index of a SIGNIFICANCE_VALUES_FOR_V row => one-sided significance
# level of that column (75% => 0.25 ... 99.95% => 0.0005).
#
# There is one entry for each of the 11 columns of the t-table. Indexes
# 8 to 10 used to be missing, so the strongest results looked up nil and
# were reported as not significant.
LOOKUP_SIGNIFICANCE_TABLE = {
  0 => 0.25,
  1 => 0.20,
  2 => 0.15,
  3 => 0.10,
  4 => 0.05,
  5 => 0.025,
  6 => 0.01,
  7 => 0.005,
  8 => 0.0025,
  9 => 0.001,
  10 => 0.0005,
}
179
+
180
+ # Taken from Wikipedia's page on Student's T distribution
181
# Critical values of Student's t distribution.
# Layout: degrees of freedom (V) => critical values, one per confidence
# column listed in the header line below, ascending within each row.
# The row stored under key 0 comes last in the source table; it is the row
# used when no degrees of freedom are given (presumably the
# infinite-degrees-of-freedom limit - confirm).
SIGNIFICANCE_VALUES_FOR_V = {
  # V      75%    80%    85%    90%    95%    97.5%  99%    99.5%  99.75% 99.9%  99.95%
  1   => [1.000, 1.376, 1.963, 3.078, 6.314, 12.71, 31.82, 63.66, 127.3, 318.3, 636.6],
  2   => [0.816, 1.061, 1.386, 1.886, 2.920, 4.303, 6.965, 9.925, 14.09, 22.33, 31.60],
  3   => [0.765, 0.978, 1.250, 1.638, 2.353, 3.182, 4.541, 5.841, 7.453, 10.21, 12.92],
  4   => [0.741, 0.941, 1.190, 1.533, 2.132, 2.776, 3.747, 4.604, 5.598, 7.173, 8.610],
  5   => [0.727, 0.920, 1.156, 1.476, 2.015, 2.571, 3.365, 4.032, 4.773, 5.893, 6.869],
  6   => [0.718, 0.906, 1.134, 1.440, 1.943, 2.447, 3.143, 3.707, 4.317, 5.208, 5.959],
  7   => [0.711, 0.896, 1.119, 1.415, 1.895, 2.365, 2.998, 3.499, 4.029, 4.785, 5.408],
  8   => [0.706, 0.889, 1.108, 1.397, 1.860, 2.306, 2.896, 3.355, 3.833, 4.501, 5.041],
  9   => [0.703, 0.883, 1.100, 1.383, 1.833, 2.262, 2.821, 3.250, 3.690, 4.297, 4.781],
  10  => [0.700, 0.879, 1.093, 1.372, 1.812, 2.228, 2.764, 3.169, 3.581, 4.144, 4.587],
  11  => [0.697, 0.876, 1.088, 1.363, 1.796, 2.201, 2.718, 3.106, 3.497, 4.025, 4.437],
  12  => [0.695, 0.873, 1.083, 1.356, 1.782, 2.179, 2.681, 3.055, 3.428, 3.930, 4.318],
  13  => [0.694, 0.870, 1.079, 1.350, 1.771, 2.160, 2.650, 3.012, 3.372, 3.852, 4.221],
  14  => [0.692, 0.868, 1.076, 1.345, 1.761, 2.145, 2.624, 2.977, 3.326, 3.787, 4.140],
  15  => [0.691, 0.866, 1.074, 1.341, 1.753, 2.131, 2.602, 2.947, 3.286, 3.733, 4.073],
  16  => [0.690, 0.865, 1.071, 1.337, 1.746, 2.120, 2.583, 2.921, 3.252, 3.686, 4.015],
  17  => [0.689, 0.863, 1.069, 1.333, 1.740, 2.110, 2.567, 2.898, 3.222, 3.646, 3.965],
  18  => [0.688, 0.862, 1.067, 1.330, 1.734, 2.101, 2.552, 2.878, 3.197, 3.610, 3.922],
  19  => [0.688, 0.861, 1.066, 1.328, 1.729, 2.093, 2.539, 2.861, 3.174, 3.579, 3.883],
  20  => [0.687, 0.860, 1.064, 1.325, 1.725, 2.086, 2.528, 2.845, 3.153, 3.552, 3.850],
  21  => [0.686, 0.859, 1.063, 1.323, 1.721, 2.080, 2.518, 2.831, 3.135, 3.527, 3.819],
  22  => [0.686, 0.858, 1.061, 1.321, 1.717, 2.074, 2.508, 2.819, 3.119, 3.505, 3.792],
  23  => [0.685, 0.858, 1.060, 1.319, 1.714, 2.069, 2.500, 2.807, 3.104, 3.485, 3.767],
  24  => [0.685, 0.857, 1.059, 1.318, 1.711, 2.064, 2.492, 2.797, 3.091, 3.467, 3.745],
  25  => [0.684, 0.856, 1.058, 1.316, 1.708, 2.060, 2.485, 2.787, 3.078, 3.450, 3.725],
  26  => [0.684, 0.856, 1.058, 1.315, 1.706, 2.056, 2.479, 2.779, 3.067, 3.435, 3.707],
  27  => [0.684, 0.855, 1.057, 1.314, 1.703, 2.052, 2.473, 2.771, 3.057, 3.421, 3.690],
  28  => [0.683, 0.855, 1.056, 1.313, 1.701, 2.048, 2.467, 2.763, 3.047, 3.408, 3.674],
  29  => [0.683, 0.854, 1.055, 1.311, 1.699, 2.045, 2.462, 2.756, 3.038, 3.396, 3.659],
  30  => [0.683, 0.854, 1.055, 1.310, 1.697, 2.042, 2.457, 2.750, 3.030, 3.385, 3.646],
  40  => [0.681, 0.851, 1.050, 1.303, 1.684, 2.021, 2.423, 2.704, 2.971, 3.307, 3.551],
  50  => [0.679, 0.849, 1.047, 1.299, 1.676, 2.009, 2.403, 2.678, 2.937, 3.261, 3.496],
  60  => [0.679, 0.848, 1.045, 1.296, 1.671, 2.000, 2.390, 2.660, 2.915, 3.232, 3.460],
  80  => [0.678, 0.846, 1.043, 1.292, 1.664, 1.990, 2.374, 2.639, 2.887, 3.195, 3.416],
  100 => [0.677, 0.845, 1.042, 1.290, 1.660, 1.984, 2.364, 2.626, 2.871, 3.174, 3.390],
  120 => [0.677, 0.845, 1.041, 1.289, 1.658, 1.980, 2.358, 2.617, 2.860, 3.160, 3.373],
  0   => [0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326, 2.576, 2.807, 3.090, 3.291]
}
221
+
222
+ # Calculate the p_value for a given t,v from the student's t distribution
223
# Looks up the significance level of a t value.
#
# t_val - t statistic (the callers pass its absolute value).
# v_val - degrees of freedom; must be a key of SIGNIFICANCE_VALUES_FOR_V
#         (default 0).
#
# Returns the LOOKUP_SIGNIFICANCE_TABLE entry of the largest critical value
# that t_val exceeds; nil when v_val has no row or t_val exceeds nothing.
#
# Defined on the class because the class-level tests call it without an
# instance. The row is selected with v_val; the name `v` that was used for
# the lookup before is not defined anywhere.
def self.calculate_p_value(t_val, v_val=0)
  critical_values = SIGNIFICANCE_VALUES_FOR_V[v_val]
  return nil unless critical_values

  # Position of the largest critical value below t_val in the sorted row.
  index = critical_values.sort.rindex { |critical| t_val > critical }
  index && LOOKUP_SIGNIFICANCE_TABLE[index]
end

# Instance-level entry point; same contract as the class method.
def calculate_p_value(t_val, v_val=0)
  self.class.calculate_p_value(t_val, v_val)
end
239
+
240
# Summary statistics for several histograms.
#
# distributions - Hash of name => histogram.
#
# Returns a Hash of name => { :size, :total, :mean, :sdev }, filled from
# size_total_mean_and_stdev.
def self.calculate_histogram_stats(distributions)
  distributions.each_with_object({}) do |(name, histogram), stats|
    size, total, mean, sdev = self.size_total_mean_and_stdev(histogram)
    stats[name] = {
      :size => size,
      :total => total,
      :mean => mean,
      :sdev => sdev
    }
  end
end
250
+
251
+
252
+ # [okay] my initial comments on the following function:
253
+ # assumptions:
254
+ # * Does a significance check against V = infinity
255
+ # * assumes stddev for both distributions are equal.
256
# Pooled-variance Student's t test on two histograms.
#
# distributions - Hash of name => histogram.
#
# Returns the per-distribution stats. For exactly two distributions a
# :significant entry is merged in: the smallest level in
# SIGNIFICANCE_VALUES whose critical value |t| exceeds, or nil. For any
# other number of distributions only the stats are returned.
#
#   t = (mean1 - mean2) / (pooled_sdev * sqrt(1/n1 + 1/n2))
#   pooled_sdev = sqrt(((n1 - 1) * sdev1**2 + (n2 - 1) * sdev2**2) / (n1 + n2 - 2))
def self.dist_significance(distributions)
  stats = self.calculate_histogram_stats(distributions)
  # Only pairwise comparison is supported.
  return stats if distributions.keys.size != 2

  groups = stats.values
  weighted_variance = groups.map { |g| (g[:size] - 1) * (g[:sdev] ** 2) }.sum
  pooled_sdev = (weighted_variance / (groups.map { |g| g[:size] }.sum - 2)) ** 0.5
  size_term = groups.map { |g| 1.0 / g[:size] }.sum ** 0.5
  t_val = ((groups.first[:mean] - groups.last[:mean]) / (pooled_sdev * size_term)).abs

  exceeded = SIGNIFICANCE_VALUES.keys.select { |critical| t_val > critical }.max
  stats.merge(:significant => exceeded && SIGNIFICANCE_VALUES[exceeded])
end
292
+
293
# Welch's t test (unequal sizes / unequal variances) on two histograms.
#
# distributions - Hash of name => histogram.
#
# Returns the per-distribution stats. For exactly two distributions a
# :significant entry is merged in, looked up with calculate_p_value for
# |t| and the default row (key 0). For any other number of distributions
# only the stats are returned, as dist_significance does: the formula
# below compares exactly two means.
#
#   t = (mean1 - mean2) / sqrt(sdev1**2 / n1 + sdev2**2 / n2)
#
# The Welch-Satterthwaite degrees of freedom are not computed.
def self.welch_t_test(distributions)
  stats = self.calculate_histogram_stats(distributions)
  return stats if distributions.keys.size != 2

  groups = stats.values
  standard_error = groups.map { |g| (g[:sdev]**2) / g[:size] }.sum ** 0.5
  t_val = ((groups.first[:mean] - groups.last[:mean]) / standard_error).abs

  stats.merge(:significant => calculate_p_value(t_val, 0))
end
330
+
331
+ # ideal: build all possible permutations of sample_size from distributions
332
+ # and then compare them to each other. since there is no 're-usage' of any
333
+ # item, sample_size <= total size of population
334
+ # num_permutations is the number of times to do this.
335
# Permutation test on the difference of means of two histograms.
#
# distributions          - Hash of name => histogram.
# num_permutations       - number of random sample pairs to draw.
# sample_size_percentage - size of each sample as a fraction of the pooled
#                          data.
#
# The histograms are pooled and flattened. num_permutations pairs of
# disjoint samples are drawn from the pool and the absolute difference of
# their means is recorded. The observed difference (first vs. last
# distribution) is then ranked among them.
#
# Returns 1.0 minus the fraction of sampled differences that are not
# greater than the observed one, i.e. a value between 0.0 and 1.0.
def self.permutation_test(distributions, num_permutations=1000,
                          sample_size_percentage=0.2)
  stats = self.calculate_histogram_stats(distributions)
  pooled_dist = self.pool_distributions(distributions)
  pooled_flat = self.squash_distribution(pooled_dist)
  # The sample size is used for slicing, so it has to be a whole number.
  sample_size = (pooled_flat.count * sample_size_percentage).floor

  # Exactly num_permutations draws: the result is divided by that number
  # below, and one extra draw could push the fraction above 1.
  mean_differences = Array.new(num_permutations) do
    first_sample, second_sample = self.bi_sample_array(pooled_flat, sample_size)
    (first_sample.mean - second_sample.mean).abs
  end

  dist_mean_diff = (stats.values.first[:mean] - stats.values.last[:mean]).abs

  # Number of sampled differences that do not exceed the observed one.
  fit_index = mean_differences.count { |difference| !(dist_mean_diff < difference) }

  fp = fit_index.to_f / num_permutations.to_f
  1.0 - fp
end
374
+ end
375
+ end
376
+ end