modesty 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +13 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +21 -0
- data/README.md +121 -0
- data/Rakefile +29 -0
- data/VERSION +1 -0
- data/init.rb +1 -0
- data/lib/modesty.rb +26 -0
- data/lib/modesty/api.rb +14 -0
- data/lib/modesty/core_ext.rb +5 -0
- data/lib/modesty/core_ext/array.rb +21 -0
- data/lib/modesty/core_ext/fixnum.rb +5 -0
- data/lib/modesty/core_ext/hash.rb +39 -0
- data/lib/modesty/core_ext/string.rb +9 -0
- data/lib/modesty/core_ext/symbol.rb +33 -0
- data/lib/modesty/datastore.rb +51 -0
- data/lib/modesty/datastore/redis.rb +180 -0
- data/lib/modesty/experiment.rb +87 -0
- data/lib/modesty/experiment/base.rb +47 -0
- data/lib/modesty/experiment/builder.rb +48 -0
- data/lib/modesty/experiment/console.rb +4 -0
- data/lib/modesty/experiment/data.rb +75 -0
- data/lib/modesty/experiment/interface.rb +29 -0
- data/lib/modesty/experiment/significance.rb +376 -0
- data/lib/modesty/experiment/stats.rb +163 -0
- data/lib/modesty/frameworks/rails.rb +27 -0
- data/lib/modesty/identity.rb +32 -0
- data/lib/modesty/load.rb +80 -0
- data/lib/modesty/load/load_experiments.rb +14 -0
- data/lib/modesty/load/load_metrics.rb +17 -0
- data/lib/modesty/metric.rb +56 -0
- data/lib/modesty/metric/base.rb +38 -0
- data/lib/modesty/metric/builder.rb +23 -0
- data/lib/modesty/metric/data.rb +133 -0
- data/modesty.gemspec +192 -0
- data/spec/core_ext_spec.rb +17 -0
- data/spec/experiment_spec.rb +239 -0
- data/spec/identity_spec.rb +161 -0
- data/spec/load_spec.rb +87 -0
- data/spec/metric_spec.rb +176 -0
- data/spec/rails_spec.rb +48 -0
- data/spec/redis_spec.rb +29 -0
- data/spec/significance_spec.rb +147 -0
- data/spec/spec.opts +1 -0
- data/test/myapp/config/modesty.yml +9 -0
- data/test/myapp/modesty/experiments/cookbook.rb +4 -0
- data/test/myapp/modesty/metrics/kitchen_metrics.rb +9 -0
- data/test/myapp/modesty/metrics/stove/burner_metrics.rb +2 -0
- data/vendor/.piston.yml +8 -0
- data/vendor/mock_redis/.gitignore +2 -0
- data/vendor/mock_redis/README +8 -0
- data/vendor/mock_redis/lib/mock_redis.rb +10 -0
- data/vendor/mock_redis/lib/mock_redis/hash.rb +61 -0
- data/vendor/mock_redis/lib/mock_redis/list.rb +6 -0
- data/vendor/mock_redis/lib/mock_redis/misc.rb +69 -0
- data/vendor/mock_redis/lib/mock_redis/set.rb +108 -0
- data/vendor/mock_redis/lib/mock_redis/string.rb +32 -0
- data/vendor/redis-rb/.gitignore +8 -0
- data/vendor/redis-rb/LICENSE +20 -0
- data/vendor/redis-rb/README.markdown +129 -0
- data/vendor/redis-rb/Rakefile +155 -0
- data/vendor/redis-rb/benchmarking/logging.rb +62 -0
- data/vendor/redis-rb/benchmarking/pipeline.rb +51 -0
- data/vendor/redis-rb/benchmarking/speed.rb +21 -0
- data/vendor/redis-rb/benchmarking/suite.rb +24 -0
- data/vendor/redis-rb/benchmarking/thread_safety.rb +38 -0
- data/vendor/redis-rb/benchmarking/worker.rb +71 -0
- data/vendor/redis-rb/examples/basic.rb +15 -0
- data/vendor/redis-rb/examples/dist_redis.rb +43 -0
- data/vendor/redis-rb/examples/incr-decr.rb +17 -0
- data/vendor/redis-rb/examples/list.rb +26 -0
- data/vendor/redis-rb/examples/pubsub.rb +31 -0
- data/vendor/redis-rb/examples/sets.rb +36 -0
- data/vendor/redis-rb/examples/unicorn/config.ru +3 -0
- data/vendor/redis-rb/examples/unicorn/unicorn.rb +20 -0
- data/vendor/redis-rb/lib/redis.rb +676 -0
- data/vendor/redis-rb/lib/redis/client.rb +201 -0
- data/vendor/redis-rb/lib/redis/compat.rb +21 -0
- data/vendor/redis-rb/lib/redis/connection.rb +134 -0
- data/vendor/redis-rb/lib/redis/distributed.rb +526 -0
- data/vendor/redis-rb/lib/redis/hash_ring.rb +131 -0
- data/vendor/redis-rb/lib/redis/pipeline.rb +13 -0
- data/vendor/redis-rb/lib/redis/subscribe.rb +79 -0
- data/vendor/redis-rb/redis.gemspec +29 -0
- data/vendor/redis-rb/test/commands_on_hashes_test.rb +46 -0
- data/vendor/redis-rb/test/commands_on_lists_test.rb +50 -0
- data/vendor/redis-rb/test/commands_on_sets_test.rb +78 -0
- data/vendor/redis-rb/test/commands_on_sorted_sets_test.rb +109 -0
- data/vendor/redis-rb/test/commands_on_strings_test.rb +70 -0
- data/vendor/redis-rb/test/commands_on_value_types_test.rb +88 -0
- data/vendor/redis-rb/test/connection_handling_test.rb +87 -0
- data/vendor/redis-rb/test/db/.gitignore +1 -0
- data/vendor/redis-rb/test/distributd_key_tags_test.rb +53 -0
- data/vendor/redis-rb/test/distributed_blocking_commands_test.rb +54 -0
- data/vendor/redis-rb/test/distributed_commands_on_hashes_test.rb +12 -0
- data/vendor/redis-rb/test/distributed_commands_on_lists_test.rb +18 -0
- data/vendor/redis-rb/test/distributed_commands_on_sets_test.rb +85 -0
- data/vendor/redis-rb/test/distributed_commands_on_strings_test.rb +50 -0
- data/vendor/redis-rb/test/distributed_commands_on_value_types_test.rb +73 -0
- data/vendor/redis-rb/test/distributed_commands_requiring_clustering_test.rb +141 -0
- data/vendor/redis-rb/test/distributed_connection_handling_test.rb +25 -0
- data/vendor/redis-rb/test/distributed_internals_test.rb +18 -0
- data/vendor/redis-rb/test/distributed_persistence_control_commands_test.rb +24 -0
- data/vendor/redis-rb/test/distributed_publish_subscribe_test.rb +90 -0
- data/vendor/redis-rb/test/distributed_remote_server_control_commands_test.rb +31 -0
- data/vendor/redis-rb/test/distributed_sorting_test.rb +21 -0
- data/vendor/redis-rb/test/distributed_test.rb +60 -0
- data/vendor/redis-rb/test/distributed_transactions_test.rb +34 -0
- data/vendor/redis-rb/test/encoding_test.rb +16 -0
- data/vendor/redis-rb/test/helper.rb +86 -0
- data/vendor/redis-rb/test/internals_test.rb +27 -0
- data/vendor/redis-rb/test/lint/hashes.rb +90 -0
- data/vendor/redis-rb/test/lint/internals.rb +53 -0
- data/vendor/redis-rb/test/lint/lists.rb +93 -0
- data/vendor/redis-rb/test/lint/sets.rb +66 -0
- data/vendor/redis-rb/test/lint/sorted_sets.rb +132 -0
- data/vendor/redis-rb/test/lint/strings.rb +98 -0
- data/vendor/redis-rb/test/lint/value_types.rb +84 -0
- data/vendor/redis-rb/test/persistence_control_commands_test.rb +22 -0
- data/vendor/redis-rb/test/pipelining_commands_test.rb +78 -0
- data/vendor/redis-rb/test/publish_subscribe_test.rb +151 -0
- data/vendor/redis-rb/test/redis_mock.rb +64 -0
- data/vendor/redis-rb/test/remote_server_control_commands_test.rb +56 -0
- data/vendor/redis-rb/test/sorting_test.rb +44 -0
- data/vendor/redis-rb/test/test.conf +8 -0
- data/vendor/redis-rb/test/thread_safety_test.rb +34 -0
- data/vendor/redis-rb/test/transactions_test.rb +91 -0
- data/vendor/redis-rb/test/unknown_commands_test.rb +14 -0
- data/vendor/redis-rb/test/url_param_test.rb +52 -0
- metadata +277 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
module Modesty
|
|
2
|
+
class Experiment
|
|
3
|
+
class Error < StandardError; end
|
|
4
|
+
end
|
|
5
|
+
|
|
6
|
+
# Mixin providing the experiment registry plus the helpers used to define
# experiments and to ask which alternative an identity belongs to.
module ExperimentMethods
  # Registry of defined experiments keyed by slug. Reading a slug that was
  # never registered raises Experiment::Error instead of returning nil.
  def experiments
    @experiments ||= Hash.new do |h, k|
      raise Experiment::Error, <<-msg.squish
        Unrecognized experiment #{k.inspect}.
      msg
    end
  end

  # Registers +exp+ under its slug.
  # Raises Experiment::Error when the slug is already taken.
  def add_experiment(exp)
    # squish keeps this message single-line, like every other error raised here
    raise Experiment::Error, <<-msg.squish if self.experiments.include? exp.slug
      Experiment #{exp.slug.inspect} already defined!
    msg
    self.experiments[exp.slug] = exp
  end

  # Builds an experiment for +slug+, yields an Experiment::Builder for it when
  # a block is given, creates one sub-metric per (metric, alternative) pair,
  # registers the experiment and returns it.
  def new_experiment(slug, &block)
    exp = Experiment.new(slug)
    yield Experiment::Builder.new(exp) if block
    exp.metrics.each do |m|
      m.experiments << exp
      exp.alternatives.each do |a|
        Modesty.new_metric(m.slug/exp.slug/a, :parent => m, :experiment => exp)
      end
    end
    add_experiment(exp)
    exp
  end

  # Picks the identity to act on: the first of :identity, :for, :on present in
  # +options+ (even when its value is nil), otherwise Modesty.identity.
  def decide_identity(options)
    if options.include? :identity
      options[:identity]
    elsif options.include? :for
      options[:for]
    elsif options.include? :on
      options[:on]
    else
      Modesty.identity
    end
  end

  # Yields an Experiment::Interface for experiment +sym+ while the chosen
  # identity is active, and returns the interface's last_value.
  def experiment(sym, options={}, &blk)
    exp = self.experiments[sym]

    identity = decide_identity(options)

    interface = Experiment::Interface.new(exp, identity)
    self.with_identity identity do
      yield interface
    end

    interface.last_value
  end

  # True when the chosen identity is in the given alternative.
  # +sym+ has the form :"experiment/slug/alternative"; the last path segment
  # is the alternative, everything before it is the experiment slug.
  def group?(sym, options={})
    id = decide_identity(options)

    path = sym.to_s.split('/')
    alt = path.pop.to_sym
    exp = self.experiments[path.join('/').to_sym]
    # Compare against the group of the identity chosen above; Experiment#group?
    # would silently fall back to the global identity and ignore +options+.
    exp.group(id) == alt
  end

  # Returns the alternative the chosen identity belongs to in experiment +sym+.
  def group(sym, options={})
    id = decide_identity(options)
    exp = self.experiments[sym]
    exp ? exp.group(id) : :control
  end
end
|
|
76
|
+
|
|
77
|
+
class API
|
|
78
|
+
include ExperimentMethods
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
require 'modesty/experiment/base'
|
|
83
|
+
require 'modesty/experiment/builder'
|
|
84
|
+
require 'modesty/experiment/data'
|
|
85
|
+
require 'modesty/experiment/interface'
|
|
86
|
+
require 'modesty/experiment/significance'
|
|
87
|
+
require 'modesty/experiment/stats'
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module Modesty
|
|
2
|
+
# Core definition of an experiment: its slug, optional attributes, the
# alternatives it offers and the metrics it tracks.
class Experiment
  # Optional attributes; Experiment::Builder consults this list when deciding
  # whether a DSL call sets an instance variable.
  ATTRIBUTES = [
    :description,
  ]

  attr_reader(*ATTRIBUTES)
  attr_reader :slug
  # NOTE: no attr_reader for :metrics -- #metrics below is the reader, and a
  # generated reader would only be redefined (with a warning) by it.

  def initialize(slug)
    @slug = slug
  end

  def inspect
    "#<Modesty::Experiment[ #{self.slug.inspect} ]>"
  end

  # Identity context registered for a metric (given as a Metric or its slug).
  # Falls back to :user, the default of #metric_contexts.
  def identity_for(sym)
    sym = sym.slug if sym.is_a? Metric
    self.metric_contexts[sym]
  end

  # Map of metric slug => identity context; unknown slugs read as :user.
  def metric_contexts
    @metric_contexts ||= Hash.new(:user)
  end

  # Alternatives of this experiment; [:control, :experiment] unless set.
  def alternatives
    @alternatives ||= [:control, :experiment]
  end

  # Without an argument: the array of tracked metrics (created on demand).
  # With an alternative: a hash of metric slug => sub-metric for that
  # alternative. Raises Error when +alt+ is not one of #alternatives.
  def metrics(alt=nil)
    @metrics ||= []
    return @metrics unless alt
    raise Error, <<-msg.squish unless self.alternatives.include? alt
      Unrecognized alternative #{alt.inspect} for #{self.inspect}.
      Available alternatives: #{self.alternatives.inspect}
    msg

    Hash[@metrics.map do |m|
      [m.slug, m / (self.slug / alt)]
    end]
  end
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
module Modesty
|
|
2
|
+
class Experiment
|
|
3
|
+
# DSL object yielded while an experiment is being defined. Calls either set
# state on the wrapped experiment or are delegated to it.
class Builder
  def initialize(exp)
    @exp = exp
  end

  # Attribute names listed in Experiment::ATTRIBUTES act as setters when
  # called with arguments (several arguments are stored as an array).
  # Everything else is delegated to the experiment, with its arguments and
  # block passed along so delegated methods that take parameters work.
  def method_missing(name, *args, &block)
    if Experiment::ATTRIBUTES.include?(name) && args.count > 0
      val = (args.count == 1) ? args[0] : args
      @exp.instance_variable_set("@#{name}", val)
    else
      @exp.send(name, *args, &block)
    end
  end

  # Keeps respond_to? truthful for everything method_missing handles.
  def respond_to_missing?(name, include_private = false)
    Experiment::ATTRIBUTES.include?(name) ||
      @exp.respond_to?(name, include_private) ||
      super
  end

  # Sets the experiment's alternatives; :control is prepended when missing.
  def alternatives(*alts)
    alts.unshift :control unless alts.include? :control
    @exp.instance_variable_set("@alternatives", alts)
  end

  # Replaces the experiment's metrics with the ones named by +args+.
  # Raises Modesty::NoMetricError for a slug that Modesty.metrics lacks.
  def metrics(*args)
    metrics = args.map do |s|
      Modesty.metrics[s] || raise(
        Modesty::NoMetricError,
        "Undefined metric #{s.inspect} in experiment #{@exp.inspect}"
      )
    end
    @exp.instance_variable_set("@metrics", metrics)
  end

  # Adds one metric to the experiment. The only recognised option is :as,
  # which records the identity context for that metric. Returns nil.
  # Raises Modesty::NoMetricError for an unknown metric and RuntimeError for
  # unrecognised options; in both cases the experiment is left untouched.
  def metric(sym, options={})
    found = Modesty.metrics[sym] || raise(
      Modesty::NoMetricError,
      "Undefined metric #{sym.inspect} in experiment #{@exp.inspect}"
    )

    remaining = options.dup # never consume the caller's hash
    as = remaining.delete(:as)

    raise <<-msg.squish unless remaining.empty?
      unrecognized options
      #{remaining.keys.inspect}
    msg

    @exp.metrics << found
    @exp.metric_contexts[sym] = as.to_sym if as
    nil
  end
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
module Modesty
|
|
2
|
+
# Zlib.crc32 gives generate_group a hash that is identical in every process.
require 'zlib'

# Datastore-backed behaviour of an experiment: recording and looking up which
# alternative each identity was assigned to.
class Experiment
  # Datastore adapter for this experiment, built from the ExperimentData
  # class nested under the class of Modesty.data.
  def data
    @data ||= (Modesty.data.class)::ExperimentData.new(self)
  end

  # Records that an identity is in alternative +alt+. The identity is
  # options[:for] when that key is present, otherwise Modesty.identity.
  # Raises Experiment::Error for an unknown alternative and IdentityError
  # when there is no identity. A Datastore::ConnectionError is passed to
  # Modesty.handle_error and +alt+ is returned instead.
  def chooses(alt, options={})
    raise Experiment::Error, <<-msg.squish unless self.alternatives.include? alt
      Unknown alternative #{alt.inspect}
    msg

    id = options.include?(:for) ? options[:for] : Modesty.identity

    raise IdentityError, <<-msg.squish unless id
      Experiment#chooses doesn't work for guests.
      Either identify globally or pass in :for => id
    msg

    self.data.register!(alt, id)
  rescue Datastore::ConnectionError => e
    Modesty.handle_error(e)
    alt
  end

  # Alternative for +id+; identities that are nil/false always get :control.
  def group(id=Modesty.identity)
    return :control unless id
    fetch_or_generate_group(id)
  end

  # usage: `e.group?(:experiment)`
  def group?(alt)
    self.group == alt
  end

  # Number of users in +alt+; uses the datastore's own counter when it
  # offers one, otherwise counts the result of #users.
  def num_users(alt=nil)
    if self.data.respond_to? :num_users
      self.data.num_users(alt)
    else
      self.users(alt).count
    end
  end

  def users(alt=nil)
    self.data.users(alt)
  end

  private
  # used to fetch the cached alternative from the datastore; connection
  # errors are reported through Modesty.handle_error and read as "no value"
  def fetch_group(identity)
    self.data.get_cached_alternative(identity)
  rescue Datastore::ConnectionError => e
    Modesty.handle_error(e)
    nil
  end

  # this is the method with the fallbacks - fetch it from the datastore or
  # create it. The rescue covers a ConnectionError that still escapes
  # fetch_group (e.g. if Modesty.handle_error re-raises it).
  def fetch_or_generate_group(id=Modesty.identity)
    begin
      fetch_group(id)
    rescue Datastore::ConnectionError
      nil
    end || generate_group(id)
  end

  # generates an alternative and stores it via #chooses.
  # The bucket comes from a CRC32 of slug + identity rather than String#hash:
  # String#hash is seeded per process, so it would hand the same identity
  # different alternatives in different processes whenever the cached value
  # cannot be read or written.
  def generate_group(identity)
    bucket = Zlib.crc32("#{@slug}#{identity}") % self.alternatives.count
    alternative = self.alternatives[bucket]
    self.chooses(alternative, :for => identity)
    alternative
  end

end
|
|
75
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module Modesty
|
|
2
|
+
class Experiment
|
|
3
|
+
# the thing yielded when you say `Modesty.experiment :foo do |e| ...`
|
|
4
|
+
# Object handed to the block of `Modesty.experiment`. The alternative is
# resolved once, for the identity given to the constructor, and every query
# below answers from that resolved value.
class Interface
  def initialize(exp, identity)
    @exp = exp
    @alt = exp.group(identity)
  end

  # Value returned by the most recent block that #group actually ran.
  attr_reader :last_value

  # With a block: runs it and remembers its value when +gr+ is this
  # interface's alternative; otherwise returns the remembered value unchanged.
  # Without a block: returns the alternative.
  #
  # Uses the alternative resolved in #initialize (like #group?) instead of
  # asking the experiment again, which would consult the global identity
  # rather than the one this interface was built for.
  def group(gr=nil)
    if block_given?
      if gr && @alt == gr
        @last_value = yield
      else
        @last_value
      end
    else
      @alt
    end
  end

  def group?(alt)
    alt == @alt
  end
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
module Modesty
|
|
2
|
+
class Experiment
|
|
3
|
+
class Significance
|
|
4
|
+
|
|
5
|
+
#this is the table for up to 8 degrees of freedom. If we are going to use
|
|
6
|
+
#more than this we should actually have a flat file with the table that we
|
|
7
|
+
#can parse.
|
|
8
|
+
CHI_SQUARE_TABLE = {
|
|
9
|
+
1 => {
|
|
10
|
+
2.7055 => 0.10,
|
|
11
|
+
3.8415 => 0.05,
|
|
12
|
+
5.0239 => 0.025,
|
|
13
|
+
6.6349 => 0.01,
|
|
14
|
+
7.8794 => 0.005
|
|
15
|
+
},
|
|
16
|
+
2 => {
|
|
17
|
+
4.6052 => 0.10,
|
|
18
|
+
5.9915 => 0.05,
|
|
19
|
+
7.3778 => 0.025,
|
|
20
|
+
9.2104 => 0.01,
|
|
21
|
+
10.5965 => 0.005
|
|
22
|
+
},
|
|
23
|
+
3 => {
|
|
24
|
+
6.2514 => 0.10,
|
|
25
|
+
7.8147 => 0.05,
|
|
26
|
+
9.3484 => 0.025,
|
|
27
|
+
11.349 => 0.01,
|
|
28
|
+
12.8381 => 0.005
|
|
29
|
+
},
|
|
30
|
+
4 => {
|
|
31
|
+
7.7794 => 0.10,
|
|
32
|
+
9.4877 => 0.05,
|
|
33
|
+
11.1433 => 0.025,
|
|
34
|
+
13.2767 => 0.01,
|
|
35
|
+
14.860 => 0.005
|
|
36
|
+
},
|
|
37
|
+
5 => {
|
|
38
|
+
9.236 => 0.10,
|
|
39
|
+
11.070 => 0.05,
|
|
40
|
+
12.833 => 0.025,
|
|
41
|
+
15.086 => 0.01,
|
|
42
|
+
16.750 => 0.005
|
|
43
|
+
},
|
|
44
|
+
6 => {
|
|
45
|
+
10.645 => 0.10,
|
|
46
|
+
12.592 => 0.05,
|
|
47
|
+
14.449 => 0.025,
|
|
48
|
+
16.812 => 0.01,
|
|
49
|
+
18.548 => 0.005
|
|
50
|
+
},
|
|
51
|
+
7 => {
|
|
52
|
+
12.017 => 0.10,
|
|
53
|
+
14.067 => 0.05,
|
|
54
|
+
16.013 => 0.025,
|
|
55
|
+
18.475 => 0.01,
|
|
56
|
+
20.278 => 0.005
|
|
57
|
+
},
|
|
58
|
+
8 => {
|
|
59
|
+
13.362 => 0.10,
|
|
60
|
+
15.507 => 0.05,
|
|
61
|
+
17.535 => 0.025,
|
|
62
|
+
20.090 => 0.01,
|
|
63
|
+
21.955 => 0.005
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
# Chi-square significance of a contingency table given as one array per row.
# Returns the table value stored for the largest critical value that the
# statistic exceeds, or nil when it exceeds none.
# Raises RuntimeError above 8 degrees of freedom.
def self.significance(*args)
  df = (args.size - 1) * (args[0].size - 1)
  raise "Currently unimplemented: More than 8 degrees of freedom" if df > 8

  statistic = self.chi_square(args)
  critical_values = CHI_SQUARE_TABLE[df]
  # ascending order, so the last critical value exceeded is the largest one
  largest_exceeded = critical_values.keys.sort.select { |critical| statistic > critical }.last
  largest_exceeded.nil? ? nil : critical_values[largest_exceeded]
end
|
|
79
|
+
|
|
80
|
+
# return a hash with all the values from the distributions in it, not
|
|
81
|
+
# necessarily sorted. Basically, pool all the histograms.
|
|
82
|
+
# Merges several {name => histogram} entries into a single histogram by
# adding up the frequencies recorded for each value. The returned hash
# reads 0 for values it has never seen.
def self.pool_distributions(distributions)
  distributions.inject(Hash.new(0)) do |pooled, (_name, frequency_map)|
    frequency_map.each { |value, frequency| pooled[value] += frequency }
    pooled
  end
end
|
|
91
|
+
|
|
92
|
+
# Take a histogram and turn it into an array
|
|
93
|
+
# Take a histogram ({value => frequency}) and turn it into a flat array in
# which every value appears exactly `frequency` times.
# Frequencies are read with to_i, as in size_total_mean_and_stdev.
def self.squash_distribution(distribution)
  a_flattened_histogram = []
  distribution.each do |key, value|
    # `times`, not (0..value): the inclusive range emitted one copy too many
    value.to_i.times { a_flattened_histogram << key }
  end
  a_flattened_histogram
end
|
|
102
|
+
|
|
103
|
+
# pick two unique samples from 'array' of size num_elements
|
|
104
|
+
# Draws two non-overlapping random samples of num_elements items each from
# +array+ and returns them as [first_sample, second_sample].
# Raises RuntimeError when the array cannot supply 2 * num_elements items.
def self.bi_sample_array(array, num_elements)
  if num_elements * 2 > array.size
    raise "We don't have that many elements"
  end

  shuffled = array.shuffle
  first_sample = shuffled[0...num_elements]
  second_sample = shuffled[num_elements...2 * num_elements]
  [first_sample, second_sample]
end
|
|
109
|
+
|
|
110
|
+
# Returns a copy of the table +rows+ with a total appended to every row and
# a final row of column totals (whose last cell is the grand total).
# The rows passed in are left untouched, so the same data can be analysed
# more than once. An empty table yields [[0]].
# Raises RuntimeError when the rows differ in length.
def self.add_sums(rows)
  width = rows.empty? ? 0 : rows.first.size
  with_totals = rows.map do |row|
    raise "Unequal sized rows!" if row.size != width
    row + [row.sum] # new array; the caller's row is not modified
  end
  # one total per column, including the column of row totals
  column_totals = (0..width).map do |column|
    with_totals.map { |row| row[column] }.sum
  end
  with_totals.push column_totals
end
|
|
123
|
+
|
|
124
|
+
# Chi-square statistic of the contingency table +rows+.
# The table is first extended with row/column totals by add_sums; every cell
# of the extended table then contributes (expected - observed)^2 / expected,
# where expected = row total * column total / grand total.
def self.chi_square(rows)
  table = self.add_sums(rows)

  last_row = table.size - 1
  width = table[0].size
  last_col = width - 1

  statistic = 0
  table.each_index do |i|
    width.times do |j|
      expected = table[i][last_col].to_f * table[last_row][j].to_f /
                 table[last_row][last_col].to_f
      statistic += ((expected - table[i][j])**2) / expected
    end
  end
  statistic
end
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Summarises a histogram of [value, frequency] pairs (both read with to_i).
# Returns [size, total, mean, std_dev]: the number of observations, their
# sum, their mean, and the square root of the mean squared deviation.
def self.size_total_mean_and_stdev(distribution)
  observations = distribution.map { |pair| [pair[0].to_i, pair[1].to_i] }

  size = 0
  total = 0
  observations.each do |value, freq|
    total += value * freq
    size += freq
  end
  mean = total.to_f / size

  squared_deviation = observations.inject(0) do |acc, (value, freq)|
    acc + freq * ((value - mean)**2)
  end
  std_dev = (squared_deviation.to_f / size) ** (0.5)

  [size, total, mean, std_dev]
end
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
#assume infinite df. Numbers here are huge
|
|
163
|
+
SIGNIFICANCE_VALUES = {1.282 => 0.10, 1.645 => 0.05, 1.960 => 0.025,
|
|
164
|
+
2.326 => 0.01, 2.576 => 0.005}
|
|
165
|
+
|
|
166
|
+
# Let's also have a table of significant values based on degrees of freedom
|
|
167
|
+
# and see if we can look up data in it
|
|
168
|
+
#
|
|
169
|
+
# Maps a column index of SIGNIFICANCE_VALUES_FOR_V to the one-sided p-value
# of that column (1 - the column's percentile). All eleven columns are
# covered, so a t value beyond the 99.5% column still resolves to a p-value.
LOOKUP_SIGNIFICANCE_TABLE = {
  0 => 0.25,
  1 => 0.20,
  2 => 0.15,
  3 => 0.10,
  4 => 0.05,
  5 => 0.025,
  6 => 0.01,
  7 => 0.005,
  8 => 0.0025,
  9 => 0.001,
  10 => 0.0005,
}
|
|
179
|
+
|
|
180
|
+
# Taken from Wikipedia's page on Student's T distribution
|
|
181
|
+
SIGNIFICANCE_VALUES_FOR_V = {
|
|
182
|
+
# V 75% 80% 85% 90% 95% 97.5% 99% 99.5% 99.75% 99.9% 99.95%
|
|
183
|
+
1 => [ 1.000, 1.376, 1.963, 3.078, 6.314, 12.71, 31.82, 63.66, 127.3, 318.3, 636.6 ],
|
|
184
|
+
2 => [ 0.816, 1.061, 1.386, 1.886, 2.920, 4.303, 6.965, 9.925, 14.09, 22.33, 31.60 ],
|
|
185
|
+
3 => [ 0.765, 0.978, 1.250, 1.638, 2.353, 3.182, 4.541, 5.841, 7.453, 10.21, 12.92 ],
|
|
186
|
+
4 => [ 0.741, 0.941, 1.190, 1.533, 2.132, 2.776, 3.747, 4.604, 5.598, 7.173, 8.610 ],
|
|
187
|
+
5 => [ 0.727, 0.920, 1.156, 1.476, 2.015, 2.571, 3.365, 4.032, 4.773, 5.893, 6.869 ],
|
|
188
|
+
6 => [ 0.718, 0.906, 1.134, 1.440, 1.943, 2.447, 3.143, 3.707, 4.317, 5.208, 5.959 ],
|
|
189
|
+
7 => [ 0.711, 0.896, 1.119, 1.415, 1.895, 2.365, 2.998, 3.499, 4.029, 4.785, 5.408 ],
|
|
190
|
+
8 => [ 0.706, 0.889, 1.108, 1.397, 1.860, 2.306, 2.896, 3.355, 3.833, 4.501, 5.041 ],
|
|
191
|
+
9 => [ 0.703, 0.883, 1.100, 1.383, 1.833, 2.262, 2.821, 3.250, 3.690, 4.297, 4.781 ],
|
|
192
|
+
10 => [ 0.700, 0.879, 1.093, 1.372, 1.812, 2.228, 2.764, 3.169, 3.581, 4.144, 4.587 ],
|
|
193
|
+
11 => [ 0.697, 0.876, 1.088, 1.363, 1.796, 2.201, 2.718, 3.106, 3.497, 4.025, 4.437 ],
|
|
194
|
+
12 => [ 0.695, 0.873, 1.083, 1.356, 1.782, 2.179, 2.681, 3.055, 3.428, 3.930, 4.318 ],
|
|
195
|
+
13 => [ 0.694, 0.870, 1.079, 1.350, 1.771, 2.160, 2.650, 3.012, 3.372, 3.852, 4.221 ],
|
|
196
|
+
14 => [ 0.692, 0.868, 1.076, 1.345, 1.761, 2.145, 2.624, 2.977, 3.326, 3.787, 4.140 ],
|
|
197
|
+
15 => [ 0.691, 0.866, 1.074, 1.341, 1.753, 2.131, 2.602, 2.947, 3.286, 3.733, 4.073 ],
|
|
198
|
+
16 => [ 0.690, 0.865, 1.071, 1.337, 1.746, 2.120, 2.583, 2.921, 3.252, 3.686, 4.015 ],
|
|
199
|
+
17 => [ 0.689, 0.863, 1.069, 1.333, 1.740, 2.110, 2.567, 2.898, 3.222, 3.646, 3.965 ],
|
|
200
|
+
18 => [ 0.688, 0.862, 1.067, 1.330, 1.734, 2.101, 2.552, 2.878, 3.197, 3.610, 3.922 ],
|
|
201
|
+
19 => [ 0.688, 0.861, 1.066, 1.328, 1.729, 2.093, 2.539, 2.861, 3.174, 3.579, 3.883 ],
|
|
202
|
+
20 => [ 0.687, 0.860, 1.064, 1.325, 1.725, 2.086, 2.528, 2.845, 3.153, 3.552, 3.850 ],
|
|
203
|
+
21 => [ 0.686, 0.859, 1.063, 1.323, 1.721, 2.080, 2.518, 2.831, 3.135, 3.527, 3.819 ],
|
|
204
|
+
22 => [ 0.686, 0.858, 1.061, 1.321, 1.717, 2.074, 2.508, 2.819, 3.119, 3.505, 3.792 ],
|
|
205
|
+
23 => [ 0.685, 0.858, 1.060, 1.319, 1.714, 2.069, 2.500, 2.807, 3.104, 3.485, 3.767 ],
|
|
206
|
+
24 => [ 0.685, 0.857, 1.059, 1.318, 1.711, 2.064, 2.492, 2.797, 3.091, 3.467, 3.745 ],
|
|
207
|
+
25 => [ 0.684, 0.856, 1.058, 1.316, 1.708, 2.060, 2.485, 2.787, 3.078, 3.450, 3.725 ],
|
|
208
|
+
26 => [ 0.684, 0.856, 1.058, 1.315, 1.706, 2.056, 2.479, 2.779, 3.067, 3.435, 3.707 ],
|
|
209
|
+
27 => [ 0.684, 0.855, 1.057, 1.314, 1.703, 2.052, 2.473, 2.771, 3.057, 3.421, 3.690 ],
|
|
210
|
+
28 => [ 0.683, 0.855, 1.056, 1.313, 1.701, 2.048, 2.467, 2.763, 3.047, 3.408, 3.674 ],
|
|
211
|
+
29 => [ 0.683, 0.854, 1.055, 1.311, 1.699, 2.045, 2.462, 2.756, 3.038, 3.396, 3.659 ],
|
|
212
|
+
30 => [ 0.683, 0.854, 1.055, 1.310, 1.697, 2.042, 2.457, 2.750, 3.030, 3.385, 3.646 ],
|
|
213
|
+
40 => [ 0.681, 0.851, 1.050, 1.303, 1.684, 2.021, 2.423, 2.704, 2.971, 3.307, 3.551 ],
|
|
214
|
+
50 => [ 0.679, 0.849, 1.047, 1.299, 1.676, 2.009, 2.403, 2.678, 2.937, 3.261, 3.496 ],
|
|
215
|
+
60 => [ 0.679, 0.848, 1.045, 1.296, 1.671, 2.000, 2.390, 2.660, 2.915, 3.232, 3.460 ],
|
|
216
|
+
80 => [ 0.678, 0.846, 1.043, 1.292, 1.664, 1.990, 2.374, 2.639, 2.887, 3.195, 3.416 ],
|
|
217
|
+
100 => [ 0.677, 0.845, 1.042, 1.290, 1.660, 1.984, 2.364, 2.626, 2.871, 3.174, 3.390 ],
|
|
218
|
+
120 => [ 0.677, 0.845, 1.041, 1.289, 1.658, 1.980, 2.358, 2.617, 2.860, 3.160, 3.373 ],
|
|
219
|
+
0 => [ 0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326, 2.576, 2.807, 3.090, 3.291 ],
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
# Calculate the p_value for a given t,v from the student's t distribution
|
|
223
|
+
# Looks up the p-value for +t_val+ in the row of SIGNIFICANCE_VALUES_FOR_V
# selected by +v_val+ (degrees of freedom; the default 0 selects the row
# stored under key 0).
# Returns nil when there is no row for +v_val+ or when +t_val+ does not
# exceed the row's smallest critical value. When +t_val+ exceeds a column
# that LOOKUP_SIGNIFICANCE_TABLE has no entry for, the table's last entry is
# returned, so extreme t values never read as "not significant".
#
# Defined on the class because welch_t_test calls it from class scope.
def self.calculate_p_value(t_val, v_val=0)
  critical_values = SIGNIFICANCE_VALUES_FOR_V[v_val]
  return nil unless critical_values

  critical_values = critical_values.sort
  # index of the largest critical value that t_val is greater than
  column = critical_values.rindex { |critical| t_val > critical }
  return nil unless column

  column = [column, LOOKUP_SIGNIFICANCE_TABLE.keys.max].min
  LOOKUP_SIGNIFICANCE_TABLE[column]
end

# Instance-level entry point, kept for callers holding a Significance object.
def calculate_p_value(t_val, v_val=0)
  self.class.calculate_p_value(t_val, v_val)
end
|
|
239
|
+
|
|
240
|
+
# Builds {name => {:size, :total, :mean, :sdev}} for a hash of
# {name => histogram}, using size_total_mean_and_stdev for each histogram.
def self.calculate_histogram_stats(distributions)
  stats = {}
  distributions.each do |name, histogram|
    size, tot, mean, sdev = self.size_total_mean_and_stdev(histogram)
    stats[name] = {:size => size, :total => tot,
                   :mean => mean, :sdev => sdev}
  end
  stats
end
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# [okay] my initial comments on the following function:
|
|
253
|
+
# assumptions:
|
|
254
|
+
# * Does a significance check against V = infinity
|
|
255
|
+
# * assumes stddev for both distributions are equal.
|
|
256
|
+
# Pooled-variance Student's t test between two distributions.
# +distributions+ is a hash of {name => histogram}. Returns the per-name
# stats; for exactly two distributions the result also carries
# :significant => the SIGNIFICANCE_VALUES entry of the largest critical value
# that |t| exceeds (nil when it exceeds none).
def self.dist_significance(distributions)
  stats = self.calculate_histogram_stats(distributions)
  # only pairwise comparison is supported; more groups would need ANOVA
  return stats unless distributions.keys.size == 2

  groups = stats.values

  # pooled_sdev = sqrt( ((n1 - 1) * sdev1**2 + (n2 - 1) * sdev2**2) / (n1 + n2 - 2) )
  weighted_variance = groups.map { |g| (g[:size] - 1) * (g[:sdev] ** 2) }.sum
  weighted_variance /= (groups.map { |g| g[:size] }.sum - 2)
  pooled_sdev = weighted_variance ** 0.5

  # t = (mean1 - mean2) / (pooled_sdev * sqrt(1/n1 + 1/n2))
  size_factor = groups.map { |g| 1.0 / g[:size] }.sum ** 0.5
  t_val = ((groups.first[:mean] - groups.last[:mean]) /
           (pooled_sdev * size_factor)).abs

  largest_exceeded = SIGNIFICANCE_VALUES.keys.sort.select { |critical| t_val > critical }.last
  current_sig = largest_exceeded.nil? ? nil : SIGNIFICANCE_VALUES[largest_exceeded]
  stats.merge(:significant => current_sig)
end
|
|
292
|
+
|
|
293
|
+
# Welch's t test (unequal sizes / unequal variances) between two
# distributions given as a hash of {name => histogram}.
#
#   t = (mean1 - mean2) / sqrt(sdev1**2 / n1 + sdev2**2 / n2)
#
# The p-value is looked up with calculate_p_value(t, 0), i.e. in the table
# row stored under key 0, instead of computing Welch-Satterthwaite degrees of
# freedom.
#
# Returns the per-name stats; for exactly two distributions the result also
# carries :significant => p-value (or nil). As in dist_significance, any
# other number of distributions returns the plain stats, because the formula
# above is only meaningful for a pair.
def self.welch_t_test(distributions)
  stats = self.calculate_histogram_stats(distributions)
  return stats if distributions.keys.size != 2

  denom = stats.values.map {|hash| (hash[:sdev]**2) / hash[:size] }.sum
  denom = denom ** 0.5 # Take the square root
  t_val = (stats.values.first[:mean] - stats.values.last[:mean]) / denom
  t_val = t_val.abs

  current_sig = calculate_p_value(t_val, 0)
  stats.merge(:significant => current_sig)
end
|
|
330
|
+
|
|
331
|
+
# ideal: build all possible permutations of sample_size from distributions
|
|
332
|
+
# and then compare them to each other. since there is no 're-usage' of any
|
|
333
|
+
# item, sample_size <= total size of population
|
|
334
|
+
# num_permutations is the number of times to do this.
|
|
335
|
+
# Permutation test on the difference of means.
#
# The histograms in +distributions+ ({name => histogram}) are pooled and
# flattened; +num_permutations+ times, two disjoint random samples are drawn
# from the pool and the absolute difference of their means is recorded. Each
# sample holds sample_size_percentage of the pooled values, rounded down.
#
# Returns 1.0 minus the fraction of recorded differences that are not larger
# than the observed difference between the first and last distribution's
# means, i.e. a value between 0.0 and 1.0.
#
# NOTE(review): the sample means rely on Array#mean, which is not core Ruby --
# presumably supplied by this gem's Array extension; confirm.
def self.permutation_test(distributions, num_permutations=1000,
                          sample_size_percentage=0.2)
  stats = self.calculate_histogram_stats(distributions)
  # the data arrives as histograms; flatten the pooled histogram into an array
  pooled_dist = self.pool_distributions(distributions)
  pooled_flat = self.squash_distribution(pooled_dist)
  # whole number of elements per sample
  sample_size = (pooled_flat.count * sample_size_percentage).floor

  # exactly num_permutations draws, so the fraction below cannot exceed 1.0
  mean_differences = Array.new(num_permutations) do
    first_sample, second_sample = self.bi_sample_array(pooled_flat, sample_size)
    (first_sample.mean - second_sample.mean).abs
  end
  mean_differences = mean_differences.sort

  dist_mean_diff = (stats.values.first[:mean] - stats.values.last[:mean]).abs

  # position of the first simulated difference larger than the observed one,
  # or the number of differences when none is larger
  fit_index = mean_differences.index { |diff| dist_mean_diff < diff } ||
              mean_differences.count

  1.0 - fit_index.to_f / num_permutations.to_f
end
|
|
374
|
+
end
|
|
375
|
+
end
|
|
376
|
+
end
|