modesty 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +13 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +21 -0
- data/README.md +121 -0
- data/Rakefile +29 -0
- data/VERSION +1 -0
- data/init.rb +1 -0
- data/lib/modesty.rb +26 -0
- data/lib/modesty/api.rb +14 -0
- data/lib/modesty/core_ext.rb +5 -0
- data/lib/modesty/core_ext/array.rb +21 -0
- data/lib/modesty/core_ext/fixnum.rb +5 -0
- data/lib/modesty/core_ext/hash.rb +39 -0
- data/lib/modesty/core_ext/string.rb +9 -0
- data/lib/modesty/core_ext/symbol.rb +33 -0
- data/lib/modesty/datastore.rb +51 -0
- data/lib/modesty/datastore/redis.rb +180 -0
- data/lib/modesty/experiment.rb +87 -0
- data/lib/modesty/experiment/base.rb +47 -0
- data/lib/modesty/experiment/builder.rb +48 -0
- data/lib/modesty/experiment/console.rb +4 -0
- data/lib/modesty/experiment/data.rb +75 -0
- data/lib/modesty/experiment/interface.rb +29 -0
- data/lib/modesty/experiment/significance.rb +376 -0
- data/lib/modesty/experiment/stats.rb +163 -0
- data/lib/modesty/frameworks/rails.rb +27 -0
- data/lib/modesty/identity.rb +32 -0
- data/lib/modesty/load.rb +80 -0
- data/lib/modesty/load/load_experiments.rb +14 -0
- data/lib/modesty/load/load_metrics.rb +17 -0
- data/lib/modesty/metric.rb +56 -0
- data/lib/modesty/metric/base.rb +38 -0
- data/lib/modesty/metric/builder.rb +23 -0
- data/lib/modesty/metric/data.rb +133 -0
- data/modesty.gemspec +192 -0
- data/spec/core_ext_spec.rb +17 -0
- data/spec/experiment_spec.rb +239 -0
- data/spec/identity_spec.rb +161 -0
- data/spec/load_spec.rb +87 -0
- data/spec/metric_spec.rb +176 -0
- data/spec/rails_spec.rb +48 -0
- data/spec/redis_spec.rb +29 -0
- data/spec/significance_spec.rb +147 -0
- data/spec/spec.opts +1 -0
- data/test/myapp/config/modesty.yml +9 -0
- data/test/myapp/modesty/experiments/cookbook.rb +4 -0
- data/test/myapp/modesty/metrics/kitchen_metrics.rb +9 -0
- data/test/myapp/modesty/metrics/stove/burner_metrics.rb +2 -0
- data/vendor/.piston.yml +8 -0
- data/vendor/mock_redis/.gitignore +2 -0
- data/vendor/mock_redis/README +8 -0
- data/vendor/mock_redis/lib/mock_redis.rb +10 -0
- data/vendor/mock_redis/lib/mock_redis/hash.rb +61 -0
- data/vendor/mock_redis/lib/mock_redis/list.rb +6 -0
- data/vendor/mock_redis/lib/mock_redis/misc.rb +69 -0
- data/vendor/mock_redis/lib/mock_redis/set.rb +108 -0
- data/vendor/mock_redis/lib/mock_redis/string.rb +32 -0
- data/vendor/redis-rb/.gitignore +8 -0
- data/vendor/redis-rb/LICENSE +20 -0
- data/vendor/redis-rb/README.markdown +129 -0
- data/vendor/redis-rb/Rakefile +155 -0
- data/vendor/redis-rb/benchmarking/logging.rb +62 -0
- data/vendor/redis-rb/benchmarking/pipeline.rb +51 -0
- data/vendor/redis-rb/benchmarking/speed.rb +21 -0
- data/vendor/redis-rb/benchmarking/suite.rb +24 -0
- data/vendor/redis-rb/benchmarking/thread_safety.rb +38 -0
- data/vendor/redis-rb/benchmarking/worker.rb +71 -0
- data/vendor/redis-rb/examples/basic.rb +15 -0
- data/vendor/redis-rb/examples/dist_redis.rb +43 -0
- data/vendor/redis-rb/examples/incr-decr.rb +17 -0
- data/vendor/redis-rb/examples/list.rb +26 -0
- data/vendor/redis-rb/examples/pubsub.rb +31 -0
- data/vendor/redis-rb/examples/sets.rb +36 -0
- data/vendor/redis-rb/examples/unicorn/config.ru +3 -0
- data/vendor/redis-rb/examples/unicorn/unicorn.rb +20 -0
- data/vendor/redis-rb/lib/redis.rb +676 -0
- data/vendor/redis-rb/lib/redis/client.rb +201 -0
- data/vendor/redis-rb/lib/redis/compat.rb +21 -0
- data/vendor/redis-rb/lib/redis/connection.rb +134 -0
- data/vendor/redis-rb/lib/redis/distributed.rb +526 -0
- data/vendor/redis-rb/lib/redis/hash_ring.rb +131 -0
- data/vendor/redis-rb/lib/redis/pipeline.rb +13 -0
- data/vendor/redis-rb/lib/redis/subscribe.rb +79 -0
- data/vendor/redis-rb/redis.gemspec +29 -0
- data/vendor/redis-rb/test/commands_on_hashes_test.rb +46 -0
- data/vendor/redis-rb/test/commands_on_lists_test.rb +50 -0
- data/vendor/redis-rb/test/commands_on_sets_test.rb +78 -0
- data/vendor/redis-rb/test/commands_on_sorted_sets_test.rb +109 -0
- data/vendor/redis-rb/test/commands_on_strings_test.rb +70 -0
- data/vendor/redis-rb/test/commands_on_value_types_test.rb +88 -0
- data/vendor/redis-rb/test/connection_handling_test.rb +87 -0
- data/vendor/redis-rb/test/db/.gitignore +1 -0
- data/vendor/redis-rb/test/distributd_key_tags_test.rb +53 -0
- data/vendor/redis-rb/test/distributed_blocking_commands_test.rb +54 -0
- data/vendor/redis-rb/test/distributed_commands_on_hashes_test.rb +12 -0
- data/vendor/redis-rb/test/distributed_commands_on_lists_test.rb +18 -0
- data/vendor/redis-rb/test/distributed_commands_on_sets_test.rb +85 -0
- data/vendor/redis-rb/test/distributed_commands_on_strings_test.rb +50 -0
- data/vendor/redis-rb/test/distributed_commands_on_value_types_test.rb +73 -0
- data/vendor/redis-rb/test/distributed_commands_requiring_clustering_test.rb +141 -0
- data/vendor/redis-rb/test/distributed_connection_handling_test.rb +25 -0
- data/vendor/redis-rb/test/distributed_internals_test.rb +18 -0
- data/vendor/redis-rb/test/distributed_persistence_control_commands_test.rb +24 -0
- data/vendor/redis-rb/test/distributed_publish_subscribe_test.rb +90 -0
- data/vendor/redis-rb/test/distributed_remote_server_control_commands_test.rb +31 -0
- data/vendor/redis-rb/test/distributed_sorting_test.rb +21 -0
- data/vendor/redis-rb/test/distributed_test.rb +60 -0
- data/vendor/redis-rb/test/distributed_transactions_test.rb +34 -0
- data/vendor/redis-rb/test/encoding_test.rb +16 -0
- data/vendor/redis-rb/test/helper.rb +86 -0
- data/vendor/redis-rb/test/internals_test.rb +27 -0
- data/vendor/redis-rb/test/lint/hashes.rb +90 -0
- data/vendor/redis-rb/test/lint/internals.rb +53 -0
- data/vendor/redis-rb/test/lint/lists.rb +93 -0
- data/vendor/redis-rb/test/lint/sets.rb +66 -0
- data/vendor/redis-rb/test/lint/sorted_sets.rb +132 -0
- data/vendor/redis-rb/test/lint/strings.rb +98 -0
- data/vendor/redis-rb/test/lint/value_types.rb +84 -0
- data/vendor/redis-rb/test/persistence_control_commands_test.rb +22 -0
- data/vendor/redis-rb/test/pipelining_commands_test.rb +78 -0
- data/vendor/redis-rb/test/publish_subscribe_test.rb +151 -0
- data/vendor/redis-rb/test/redis_mock.rb +64 -0
- data/vendor/redis-rb/test/remote_server_control_commands_test.rb +56 -0
- data/vendor/redis-rb/test/sorting_test.rb +44 -0
- data/vendor/redis-rb/test/test.conf +8 -0
- data/vendor/redis-rb/test/thread_safety_test.rb +34 -0
- data/vendor/redis-rb/test/transactions_test.rb +91 -0
- data/vendor/redis-rb/test/unknown_commands_test.rb +14 -0
- data/vendor/redis-rb/test/url_param_test.rb +52 -0
- metadata +277 -0
@@ -0,0 +1,87 @@
|
|
1
|
+
module Modesty
  class Experiment
    # Raised for unknown or duplicate experiments.
    class Error < StandardError; end
  end

  # Experiment registry and lookup helpers; mixed into Modesty::API below.
  module ExperimentMethods
    # Registry of experiments keyed by slug.
    #
    # Reading an unknown slug with `[]` raises Experiment::Error instead of
    # returning nil (`include?` does not trigger the default block).
    def experiments
      @experiments ||= Hash.new do |_registry, slug|
        raise Experiment::Error, <<-msg.squish
          Unrecognized experiment #{slug.inspect}.
        msg
      end
    end

    # Registers +exp+ under its slug.
    #
    # Raises Experiment::Error when the slug is already taken. Returns +exp+.
    def add_experiment(exp)
      if experiments.include?(exp.slug)
        # squish keeps the message on one line, like the other errors here.
        raise Experiment::Error, <<-msg.squish
          Experiment #{exp.slug.inspect} already defined!
        msg
      end
      experiments[exp.slug] = exp
    end

    # Builds an experiment for +slug+, yields a builder for it when a block is
    # given, creates one sub-metric per (metric, alternative) pair and then
    # registers the experiment. Returns the experiment.
    #
    # NOTE(review): the duplicate-slug check only happens in add_experiment,
    # i.e. after the metrics have already been touched — confirm that is intended.
    def new_experiment(slug, &block)
      exp = Experiment.new(slug)
      yield Experiment::Builder.new(exp) if block

      exp.metrics.each do |metric|
        metric.experiments << exp
        exp.alternatives.each do |alternative|
          Modesty.new_metric(
            metric.slug/exp.slug/alternative,
            :parent => metric,
            :experiment => exp
          )
        end
      end

      add_experiment(exp)
      exp
    end

    # Picks the identity to use from +options+.
    #
    # The first key present among :identity, :for and :on wins (even when its
    # value is nil); otherwise the global Modesty.identity is used.
    def decide_identity(options)
      key = [:identity, :for, :on].find { |candidate| options.include?(candidate) }
      key ? options[key] : Modesty.identity
    end

    # Runs the experiment named +sym+ for the decided identity.
    #
    # Yields an Experiment::Interface inside with_identity and returns the
    # interface's last_value.
    def experiment(sym, options={}, &blk)
      exp = experiments[sym]
      identity = decide_identity(options)
      interface = Experiment::Interface.new(exp, identity)

      with_identity(identity) { yield interface }

      interface.last_value
    end

    # True when the decided identity is in the given group.
    #
    # +sym+ has the form "experiment/slug/alternative": the last path segment
    # is the alternative, everything before it is the experiment slug.
    def group?(sym, options={})
      id = decide_identity(options)

      segments = sym.to_s.split(/\//)
      alt = segments.pop.to_sym
      exp = experiments[segments.join('/').to_sym]

      # Compare against the group of the decided identity; calling
      # exp.group?(alt) here would silently fall back to the global identity.
      exp.group(id) == alt
    end

    # Returns the group of the decided identity in experiment +sym+.
    def group(sym, options={})
      id = decide_identity(options)
      exp = experiments[sym]
      # The registry raises for unknown slugs, so :control is only returned
      # when a nil/false value was stored under +sym+.
      exp ? exp.group(id) : :control
    end
  end

  class API
    include ExperimentMethods
  end
end
|
81
|
+
|
82
|
+
require 'modesty/experiment/base'
|
83
|
+
require 'modesty/experiment/builder'
|
84
|
+
require 'modesty/experiment/data'
|
85
|
+
require 'modesty/experiment/interface'
|
86
|
+
require 'modesty/experiment/significance'
|
87
|
+
require 'modesty/experiment/stats'
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Modesty
  # Core state of an experiment: slug, alternatives, tracked metrics and the
  # identity context used for each metric.
  class Experiment
    # Names exposed as plain readers on the experiment.
    ATTRIBUTES = [
      :description,
    ]

    attr_reader(*ATTRIBUTES)
    attr_reader :slug
    # No attr_reader for :metrics — #metrics below is the only reader, and
    # declaring both just triggers a "method redefined" warning.

    def initialize(slug)
      @slug = slug
    end

    def inspect
      "#<Modesty::Experiment[ #{self.slug.inspect} ]>"
    end

    # Identity context registered for a metric (given as a slug or a Metric).
    # Falls back to :user, the default of #metric_contexts.
    def identity_for(sym)
      sym = sym.slug if sym.is_a? Metric
      metric_contexts[sym]
    end

    # Hash of metric slug => identity context; unknown slugs read as :user.
    def metric_contexts
      @metric_contexts ||= Hash.new(:user)
    end

    # Alternatives of this experiment; [:control, :experiment] unless set.
    def alternatives
      @alternatives ||= [:control, :experiment]
    end

    # Without +alt+: the (mutable) array of tracked metrics.
    # With +alt+: a hash of metric slug => `metric / (slug / alt)` for every
    # tracked metric.
    #
    # Raises Error when +alt+ is not one of #alternatives.
    def metrics(alt=nil)
      @metrics ||= []
      return @metrics unless alt

      unless alternatives.include?(alt)
        raise Error, <<-msg.squish
          Unrecognized alternative #{alt.inspect} for #{self.inspect}.
          Available alternatives: #{self.alternatives.inspect}
        msg
      end

      Hash[@metrics.map { |metric| [metric.slug, metric/(self.slug/alt)] }]
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Modesty
  class Experiment
    # DSL object that configures a single experiment by writing its instance
    # variables directly.
    class Builder
      def initialize(exp)
        @exp = exp
      end

      # `name value` (or `name v1, v2, ...`) sets one of Experiment::ATTRIBUTES
      # on the experiment; a single argument is stored as-is, several as an
      # array. Anything else is sent to the experiment without arguments.
      def method_missing(name, *args)
        if Experiment::ATTRIBUTES.include?(name) && args.count > 0
          val = (args.count == 1) ? args[0] : args
          @exp.instance_variable_set("@#{name}", val)
        else
          @exp.send(name)
        end
      end

      # Keeps respond_to? in line with what method_missing accepts.
      def respond_to_missing?(name, include_private = false)
        Experiment::ATTRIBUTES.include?(name) ||
          @exp.respond_to?(name, include_private) ||
          super
      end

      # Sets the experiment's alternatives; :control is prepended when absent.
      def alternatives(*alts)
        alts.unshift :control unless alts.include? :control
        @exp.instance_variable_set("@alternatives", alts)
      end

      # Replaces the experiment's metrics with the ones named by +args+.
      #
      # Raises Modesty::NoMetricError when a slug has no entry in Modesty.metrics.
      def metrics(*args)
        found = args.map do |slug|
          Modesty.metrics[slug] || raise(
            Modesty::NoMetricError,
            "Undefined metric #{slug.inspect} in experiment #{@exp.inspect}"
          )
        end
        @exp.instance_variable_set("@metrics", found)
      end

      # Adds one metric to the experiment.
      #
      # options:
      #   :as - identity context to record for this metric (stored as a symbol)
      #
      # Raises Modesty::NoMetricError for an unknown metric and RuntimeError
      # for any option other than :as.
      def metric(sym, options={})
        @exp.metrics << (Modesty.metrics[sym] || raise(
          Modesty::NoMetricError,
          "Undefined metric #{sym.inspect} in experiment #{@exp.inspect}"
        ))

        # Work on a copy so the caller's hash is not modified.
        remaining = options.dup
        as = remaining.delete(:as)
        @exp.metric_contexts[sym] = as.to_sym if as

        raise <<-msg.squish unless remaining.empty?
          unrecognized options
          #{remaining.keys.inspect}
        msg
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'zlib'

module Modesty
  # Group assignment and datastore access for an experiment.
  class Experiment
    # Datastore-specific data object: the ExperimentData class nested under
    # the class of Modesty.data, built once per experiment.
    def data
      @data ||= (Modesty.data.class)::ExperimentData.new(self)
    end

    # Records that an identity is in alternative +alt+.
    #
    # options:
    #   :for - identity to register; defaults to Modesty.identity
    #
    # Raises Experiment::Error for an unknown alternative and IdentityError
    # when there is no identity. A Datastore::ConnectionError is passed to
    # Modesty.handle_error and +alt+ is returned instead.
    def chooses(alt, options={})
      unless alternatives.include?(alt)
        raise Experiment::Error, <<-msg.squish
          Unknown alternative #{alt.inspect}
        msg
      end

      id = options.include?(:for) ? options[:for] : Modesty.identity

      unless id
        raise IdentityError, <<-msg.squish
          Experiment#chooses doesn't work for guests.
          Either identify globally or pass in :for => id
        msg
      end

      data.register!(alt, id)
    rescue Datastore::ConnectionError => e
      Modesty.handle_error(e)
      alt
    end

    # Alternative for +id+; :control when there is no identity.
    def group(id=Modesty.identity)
      return :control unless id
      fetch_or_generate_group(id)
    end

    # usage: `e.group?(:experiment)`
    # Checks the group of the global identity.
    def group?(alt)
      group == alt
    end

    # Number of users in +alt+; uses the data object's own num_users when it
    # has one, otherwise counts #users.
    def num_users(alt=nil)
      if data.respond_to?(:num_users)
        data.num_users(alt)
      else
        users(alt).count
      end
    end

    def users(alt=nil)
      data.users(alt)
    end

    private

    # Fetches the cached alternative from the datastore; connection errors
    # are reported through Modesty.handle_error and read as "not cached".
    def fetch_group(identity)
      data.get_cached_alternative(identity)
    rescue Datastore::ConnectionError => e
      Modesty.handle_error(e)
      nil
    end

    # The method with the fallbacks: use the cached alternative or create one.
    # fetch_group already turns connection errors into nil.
    def fetch_or_generate_group(id=Modesty.identity)
      fetch_group(id) || generate_group(id)
    end

    # Picks an alternative for +identity+ and stores it via #chooses.
    #
    # The bucket comes from a CRC32 of slug + identity rather than String#hash:
    # String#hash is seeded per Ruby process, so it would hand the same
    # identity different alternatives in different processes whenever the
    # cached value cannot be read.
    def generate_group(identity)
      bucket = Zlib.crc32("#{@slug}#{identity}") % alternatives.count
      alternative = alternatives[bucket]
      chooses(alternative, :for => identity)
      alternative
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Modesty
  class Experiment
    # the thing yielded when you say `Modesty.experiment :foo do |e| ...`
    class Interface
      # Value of the most recent block run by #group.
      attr_reader :last_value

      # Resolves the alternative for +identity+ once, up front.
      def initialize(exp, identity)
        @exp = exp
        @alt = exp.group(identity)
      end

      # Without a block: the alternative resolved for this interface.
      # With a block: runs it only when +gr+ is that alternative and remembers
      # its value; always returns the last remembered value.
      #
      # Uses the alternative resolved in #initialize (as #group? does) instead
      # of asking the experiment again with the global identity.
      def group(gr=nil)
        return @alt unless block_given?

        @last_value = yield if gr && @alt == gr
        @last_value
      end

      def group?(alt)
        alt == @alt
      end
    end
  end
end
|
@@ -0,0 +1,376 @@
|
|
1
|
+
module Modesty
|
2
|
+
class Experiment
|
3
|
+
class Significance
|
4
|
+
|
5
|
+
#this is the table for up to 8 degrees of freedom. If we are going to use
|
6
|
+
#more than this we should actually have a flat file with the table that we
|
7
|
+
#can parse.
|
8
|
+
CHI_SQUARE_TABLE = {
|
9
|
+
1 => {
|
10
|
+
2.7055 => 0.10,
|
11
|
+
3.8415 => 0.05,
|
12
|
+
5.0239 => 0.025,
|
13
|
+
6.6349 => 0.01,
|
14
|
+
7.8794 => 0.005
|
15
|
+
},
|
16
|
+
2 => {
|
17
|
+
4.6052 => 0.10,
|
18
|
+
5.9915 => 0.05,
|
19
|
+
7.3778 => 0.025,
|
20
|
+
9.2104 => 0.01,
|
21
|
+
10.5965 => 0.005
|
22
|
+
},
|
23
|
+
3 => {
|
24
|
+
6.2514 => 0.10,
|
25
|
+
7.8147 => 0.05,
|
26
|
+
9.3484 => 0.025,
|
27
|
+
11.349 => 0.01,
|
28
|
+
12.8381 => 0.005
|
29
|
+
},
|
30
|
+
4 => {
|
31
|
+
7.7794 => 0.10,
|
32
|
+
9.4877 => 0.05,
|
33
|
+
11.1433 => 0.025,
|
34
|
+
13.2767 => 0.01,
|
35
|
+
14.860 => 0.005
|
36
|
+
},
|
37
|
+
5 => {
|
38
|
+
9.236 => 0.10,
|
39
|
+
11.070 => 0.05,
|
40
|
+
12.833 => 0.025,
|
41
|
+
15.086 => 0.01,
|
42
|
+
16.750 => 0.005
|
43
|
+
},
|
44
|
+
6 => {
|
45
|
+
10.645 => 0.10,
|
46
|
+
12.592 => 0.05,
|
47
|
+
14.449 => 0.025,
|
48
|
+
16.812 => 0.01,
|
49
|
+
18.548 => 0.005
|
50
|
+
},
|
51
|
+
7 => {
|
52
|
+
12.017 => 0.10,
|
53
|
+
14.067 => 0.05,
|
54
|
+
16.013 => 0.025,
|
55
|
+
18.475 => 0.01,
|
56
|
+
20.278 => 0.005
|
57
|
+
},
|
58
|
+
8 => {
|
59
|
+
13.362 => 0.10,
|
60
|
+
15.507 => 0.05,
|
61
|
+
17.535 => 0.025,
|
62
|
+
20.090 => 0.01,
|
63
|
+
21.955 => 0.005
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
# Chi-square significance of a table of observed counts.
#
# Each argument is one row of the table. Returns the level stored in
# CHI_SQUARE_TABLE for the largest critical value the statistic exceeds, or
# nil when it exceeds none — or when the table has fewer than one degree of
# freedom (no rows, a single row or a single column).
#
# Raises RuntimeError for more than 8 degrees of freedom.
def self.significance(*args)
  return nil if args.empty?

  df = (args.size - 1) * (args[0].size - 1)
  raise "Currently unimplemented: More than 8 degrees of freedom" if df > 8

  critical_values = CHI_SQUARE_TABLE[df]
  return nil unless critical_values

  # Hand over copies so the caller's rows are never modified.
  statistic = self.chi_square(args.map(&:dup))

  exceeded = critical_values.keys.select { |critical| statistic > critical }
  exceeded.empty? ? nil : critical_values[exceeded.max]
end
|
79
|
+
|
80
|
+
# Pools all histograms into one: returns a hash of value => summed frequency
# over every distribution (not necessarily sorted). Missing keys read as 0.
def self.pool_distributions(distributions)
  distributions.each_with_object(Hash.new(0)) do |(_name, histogram), pooled|
    histogram.each { |value, frequency| pooled[value] += frequency }
  end
end
|
91
|
+
|
92
|
+
# Take a histogram (value => frequency) and turn it into a flat array in which
# every value appears exactly `frequency` times.
#
# Frequencies are read with to_i; zero or negative frequencies add nothing.
def self.squash_distribution(distribution)
  distribution.flat_map do |value, frequency|
    [value] * [frequency.to_i, 0].max
  end
end
|
102
|
+
|
103
|
+
# Pick two non-overlapping random samples of num_elements each from 'array'.
#
# num_elements is truncated to an integer first, so a fractional size (e.g. a
# percentage of the population) yields two samples of the same length.
#
# Raises RuntimeError when the array cannot supply both samples.
def self.bi_sample_array(array, num_elements)
  per_sample = num_elements.to_i
  raise "We don't have that many elements" unless per_sample * 2 <= array.size

  shuffled = array.shuffle
  [shuffled[0...per_sample], shuffled[per_sample...(2 * per_sample)]]
end
|
109
|
+
|
110
|
+
# Returns a copy of +rows+ with a total appended to every row and a final row
# of column totals (whose last cell is the grand total).
#
# The input is left untouched. An empty table yields [[0]].
#
# Raises RuntimeError when the rows differ in length.
def self.add_sums(rows)
  width = rows.empty? ? 0 : rows.first.size

  with_totals = rows.map do |row|
    raise "Unequal sized rows!" if row.size != width
    row + [row.inject(0) { |sum, cell| sum + cell }]
  end

  column_totals = (0..width).map do |column|
    with_totals.inject(0) { |sum, row| sum + row[column] }
  end

  with_totals + [column_totals]
end
|
123
|
+
|
124
|
+
# Chi-square statistic for a table of observed counts.
#
# The table is first extended with row/column totals by add_sums; every cell of
# the extended table (totals included) then contributes
# (expected - observed)^2 / expected, where
# expected = row total * column total / grand total.
def self.chi_square(rows)
  table = self.add_sums(rows)
  totals_row = table.last
  grand_total = totals_row.last.to_f

  statistic = 0
  table.each do |row|
    row_total = row.last.to_f
    row.each_with_index do |observed, column|
      expected = row_total * totals_row[column].to_f / grand_total
      statistic += ((expected - observed)**2) / expected
    end
  end
  statistic
end
|
139
|
+
|
140
|
+
|
141
|
+
# Summary statistics of a histogram given as (value, frequency) pairs.
#
# Values and frequencies are read with to_i. Returns
# [size, total, mean, std_dev], where std_dev divides the squared error by
# size (not size - 1).
def self.size_total_mean_and_stdev(distribution)
  pairs = distribution.map { |value, frequency| [value.to_i, frequency.to_i] }

  total = pairs.inject(0) { |sum, (value, frequency)| sum + value * frequency }
  size = pairs.inject(0) { |sum, (_value, frequency)| sum + frequency }
  mean = total.to_f / size

  squared_error = pairs.inject(0) do |sum, (value, frequency)|
    sum + frequency * ((value - mean)**2)
  end

  [size, total, mean, (squared_error.to_f / size)**(0.5)]
end
|
160
|
+
|
161
|
+
|
162
|
+
#assume infinite df. Numbers here are huge
|
163
|
+
SIGNIFICANCE_VALUES = {1.282 => 0.10, 1.645 => 0.05, 1.960 => 0.025,
|
164
|
+
2.326 => 0.01, 2.576 => 0.005}
|
165
|
+
|
166
|
+
# Let's also have a table of significant values based on degrees of freedom
|
167
|
+
# and see if we can look up data in it
|
168
|
+
#
|
169
|
+
LOOKUP_SIGNIFICANCE_TABLE = {
|
170
|
+
0 => 0.25,
|
171
|
+
1 => 0.20,
|
172
|
+
2 => 0.15,
|
173
|
+
3 => 0.10,
|
174
|
+
4 => 0.05,
|
175
|
+
5 => 0.025,
|
176
|
+
6 => 0.01,
|
177
|
+
7 => 0.005,
|
178
|
+
}
|
179
|
+
|
180
|
+
# Taken from Wikipedia's page on Student's T distribution
|
181
|
+
SIGNIFICANCE_VALUES_FOR_V = {
|
182
|
+
# V 75% 80% 85% 90% 95% 97.5% 99% 99.5% 99.75% 99.9% 99.95%
|
183
|
+
1 => [ 1.000, 1.376, 1.963, 3.078, 6.314, 12.71, 31.82, 63.66, 127.3, 318.3, 636.6 ],
|
184
|
+
2 => [ 0.816, 1.061, 1.386, 1.886, 2.920, 4.303, 6.965, 9.925, 14.09, 22.33, 31.60 ],
|
185
|
+
3 => [ 0.765, 0.978, 1.250, 1.638, 2.353, 3.182, 4.541, 5.841, 7.453, 10.21, 12.92 ],
|
186
|
+
4 => [ 0.741, 0.941, 1.190, 1.533, 2.132, 2.776, 3.747, 4.604, 5.598, 7.173, 8.610 ],
|
187
|
+
5 => [ 0.727, 0.920, 1.156, 1.476, 2.015, 2.571, 3.365, 4.032, 4.773, 5.893, 6.869 ],
|
188
|
+
6 => [ 0.718, 0.906, 1.134, 1.440, 1.943, 2.447, 3.143, 3.707, 4.317, 5.208, 5.959 ],
|
189
|
+
7 => [ 0.711, 0.896, 1.119, 1.415, 1.895, 2.365, 2.998, 3.499, 4.029, 4.785, 5.408 ],
|
190
|
+
8 => [ 0.706, 0.889, 1.108, 1.397, 1.860, 2.306, 2.896, 3.355, 3.833, 4.501, 5.041 ],
|
191
|
+
9 => [ 0.703, 0.883, 1.100, 1.383, 1.833, 2.262, 2.821, 3.250, 3.690, 4.297, 4.781 ],
|
192
|
+
10 => [ 0.700, 0.879, 1.093, 1.372, 1.812, 2.228, 2.764, 3.169, 3.581, 4.144, 4.587 ],
|
193
|
+
11 => [ 0.697, 0.876, 1.088, 1.363, 1.796, 2.201, 2.718, 3.106, 3.497, 4.025, 4.437 ],
|
194
|
+
12 => [ 0.695, 0.873, 1.083, 1.356, 1.782, 2.179, 2.681, 3.055, 3.428, 3.930, 4.318 ],
|
195
|
+
13 => [ 0.694, 0.870, 1.079, 1.350, 1.771, 2.160, 2.650, 3.012, 3.372, 3.852, 4.221 ],
|
196
|
+
14 => [ 0.692, 0.868, 1.076, 1.345, 1.761, 2.145, 2.624, 2.977, 3.326, 3.787, 4.140 ],
|
197
|
+
15 => [ 0.691, 0.866, 1.074, 1.341, 1.753, 2.131, 2.602, 2.947, 3.286, 3.733, 4.073 ],
|
198
|
+
16 => [ 0.690, 0.865, 1.071, 1.337, 1.746, 2.120, 2.583, 2.921, 3.252, 3.686, 4.015 ],
|
199
|
+
17 => [ 0.689, 0.863, 1.069, 1.333, 1.740, 2.110, 2.567, 2.898, 3.222, 3.646, 3.965 ],
|
200
|
+
18 => [ 0.688, 0.862, 1.067, 1.330, 1.734, 2.101, 2.552, 2.878, 3.197, 3.610, 3.922 ],
|
201
|
+
19 => [ 0.688, 0.861, 1.066, 1.328, 1.729, 2.093, 2.539, 2.861, 3.174, 3.579, 3.883 ],
|
202
|
+
20 => [ 0.687, 0.860, 1.064, 1.325, 1.725, 2.086, 2.528, 2.845, 3.153, 3.552, 3.850 ],
|
203
|
+
21 => [ 0.686, 0.859, 1.063, 1.323, 1.721, 2.080, 2.518, 2.831, 3.135, 3.527, 3.819 ],
|
204
|
+
22 => [ 0.686, 0.858, 1.061, 1.321, 1.717, 2.074, 2.508, 2.819, 3.119, 3.505, 3.792 ],
|
205
|
+
23 => [ 0.685, 0.858, 1.060, 1.319, 1.714, 2.069, 2.500, 2.807, 3.104, 3.485, 3.767 ],
|
206
|
+
24 => [ 0.685, 0.857, 1.059, 1.318, 1.711, 2.064, 2.492, 2.797, 3.091, 3.467, 3.745 ],
|
207
|
+
25 => [ 0.684, 0.856, 1.058, 1.316, 1.708, 2.060, 2.485, 2.787, 3.078, 3.450, 3.725 ],
|
208
|
+
26 => [ 0.684, 0.856, 1.058, 1.315, 1.706, 2.056, 2.479, 2.779, 3.067, 3.435, 3.707 ],
|
209
|
+
27 => [ 0.684, 0.855, 1.057, 1.314, 1.703, 2.052, 2.473, 2.771, 3.057, 3.421, 3.690 ],
|
210
|
+
28 => [ 0.683, 0.855, 1.056, 1.313, 1.701, 2.048, 2.467, 2.763, 3.047, 3.408, 3.674 ],
|
211
|
+
29 => [ 0.683, 0.854, 1.055, 1.311, 1.699, 2.045, 2.462, 2.756, 3.038, 3.396, 3.659 ],
|
212
|
+
30 => [ 0.683, 0.854, 1.055, 1.310, 1.697, 2.042, 2.457, 2.750, 3.030, 3.385, 3.646 ],
|
213
|
+
40 => [ 0.681, 0.851, 1.050, 1.303, 1.684, 2.021, 2.423, 2.704, 2.971, 3.307, 3.551 ],
|
214
|
+
50 => [ 0.679, 0.849, 1.047, 1.299, 1.676, 2.009, 2.403, 2.678, 2.937, 3.261, 3.496 ],
|
215
|
+
60 => [ 0.679, 0.848, 1.045, 1.296, 1.671, 2.000, 2.390, 2.660, 2.915, 3.232, 3.460 ],
|
216
|
+
80 => [ 0.678, 0.846, 1.043, 1.292, 1.664, 1.990, 2.374, 2.639, 2.887, 3.195, 3.416 ],
|
217
|
+
100 => [ 0.677, 0.845, 1.042, 1.290, 1.660, 1.984, 2.364, 2.626, 2.871, 3.174, 3.390 ],
|
218
|
+
120 => [ 0.677, 0.845, 1.041, 1.289, 1.658, 1.980, 2.358, 2.617, 2.860, 3.160, 3.373 ],
|
219
|
+
0 => [ 0.674, 0.842, 1.036, 1.282, 1.645, 1.960, 2.326, 2.576, 2.807, 3.090, 3.291 ],
|
220
|
+
}
|
221
|
+
|
222
|
+
# Calculate the p_value for a given t,v from the student's t distribution.
#
# t_val - the t statistic
# v_val - key into SIGNIFICANCE_VALUES_FOR_V (default 0)
#
# Finds the largest critical value in the row for v_val that t_val exceeds and
# returns the LOOKUP_SIGNIFICANCE_TABLE entry for that column. Columns beyond
# the end of the lookup table report its last entry, so a very large t is never
# reported as "no significance".
#
# Returns nil when v_val has no row or t_val exceeds no critical value.
def self.calculate_p_value(t_val, v_val=0)
  critical_values = SIGNIFICANCE_VALUES_FOR_V[v_val]
  return nil unless critical_values

  exceeded = critical_values.count { |critical| t_val > critical }
  return nil if exceeded.zero?

  LOOKUP_SIGNIFICANCE_TABLE.fetch(exceeded - 1) do
    LOOKUP_SIGNIFICANCE_TABLE[LOOKUP_SIGNIFICANCE_TABLE.keys.max]
  end
end

# Instance-level entry point, kept for callers holding a Significance object.
def calculate_p_value(t_val, v_val=0)
  self.class.calculate_p_value(t_val, v_val)
end
|
239
|
+
|
240
|
+
# distributions should be hash of {name => histogram }
#
# Returns {name => {:size, :total, :mean, :sdev}} using
# size_total_mean_and_stdev for each histogram.
def self.calculate_histogram_stats(distributions)
  distributions.each_with_object({}) do |(name, histogram), stats|
    size, total, mean, sdev = self.size_total_mean_and_stdev(histogram)
    stats[name] = {:size => size, :total => total, :mean => mean, :sdev => sdev}
  end
end
|
250
|
+
|
251
|
+
|
252
|
+
# Pooled-variance Student's t test on two histograms.
#
# Assumptions:
#   * significance is looked up in SIGNIFICANCE_VALUES (V = infinity)
#   * both distributions have the same standard deviation
#
# distributions should be hash of {name => histogram }. With anything other
# than exactly two distributions only the per-distribution stats are returned
# (more than two would need ANOVA). Otherwise the stats are returned with
# :significant added: the level of the largest critical value |t| exceeds, or
# nil.
#
#   t = (mean1 - mean2) / (pooled_sdev * sqrt(1/n1 + 1/n2))
#   pooled_sdev = sqrt(((n1 - 1) * sdev1**2 + (n2 - 1) * sdev2**2) / (n1 + n2 - 2))
def self.dist_significance(distributions)
  stats = self.calculate_histogram_stats(distributions)
  return stats unless distributions.keys.size == 2

  groups = stats.values
  weighted_variance = groups.map { |g| (g[:size] - 1) * (g[:sdev] ** 2) }.sum
  degrees = groups.map { |g| g[:size] }.sum - 2
  pooled_sdev = (weighted_variance / degrees) ** 0.5

  inverse_sizes = groups.map { |g| 1.0 / g[:size] }.sum
  t_val = ((groups.first[:mean] - groups.last[:mean]) /
           (pooled_sdev * (inverse_sizes ** 0.5))).abs

  exceeded = SIGNIFICANCE_VALUES.keys.select { |critical| t_val > critical }
  level = exceeded.empty? ? nil : SIGNIFICANCE_VALUES[exceeded.max]
  stats.merge(:significant => level)
end
|
292
|
+
|
293
|
+
# Student's t test for two populations with assumed unequal size and
# unequal variance (Welch).
#
# distributions should be hash of {name => histogram }. With anything other
# than exactly two distributions only the per-distribution stats are returned,
# as a pairwise test is all this computes. Otherwise the stats are returned
# with :significant set to the result of calculate_p_value.
#
#   t = (mean1 - mean2) / sqrt(sdev1**2 / n1 + sdev2**2 / n2)
#
# The Welch-Satterthwaite degrees of freedom would be
#
#   v = (sdev1**2 / n1 + sdev2**2 / n2)**2 /
#       (sdev1**4 / (n1**2 * (n1 - 1)) + sdev2**4 / (n2**2 * (n2 - 1)))
#
# but the lookup is currently done with v = 0.
# NOTE(review): the original comments suggest 0 stands for infinite degrees of
# freedom — confirm against the lookup table.
def self.welch_t_test(distributions)
  stats = self.calculate_histogram_stats(distributions)
  return stats unless stats.size == 2

  groups = stats.values
  standard_error = groups.map { |g| (g[:sdev]**2) / g[:size] }.sum ** 0.5
  t_val = ((groups.first[:mean] - groups.last[:mean]) / standard_error).abs

  stats.merge(:significant => calculate_p_value(t_val, 0))
end
|
330
|
+
|
331
|
+
# Permutation test on the difference of means.
#
# Ideal: build all possible permutations of sample_size from distributions and
# compare them to each other. Instead, num_permutations times, two
# non-overlapping random samples are drawn from the pooled population and the
# absolute difference of their means is recorded. Since no item is reused,
# 2 * sample_size must not exceed the pooled population.
#
# distributions          - hash of {name => histogram}
# num_permutations       - number of random sample pairs to draw
# sample_size_percentage - size of each sample as a fraction of the pooled
#                          population (rounded down to a whole number)
#
# Returns the fraction of sampled mean differences that are strictly greater
# than the observed difference between the first and last distribution, a
# value between 0.0 and 1.0.
def self.permutation_test(distributions, num_permutations=1000,
                          sample_size_percentage=0.2)
  stats = self.calculate_histogram_stats(distributions)

  # The data comes in histogram form: pool it, then flatten it into an array.
  pooled_dist = self.pool_distributions(distributions)
  pooled_flat = self.squash_distribution(pooled_dist)
  sample_size = (pooled_flat.count * sample_size_percentage).floor

  # Exactly num_permutations draws, so the ratio below stays within 0..1.
  mean_differences = Array.new(num_permutations) do
    first, second = self.bi_sample_array(pooled_flat, sample_size)
    (first.mean - second.mean).abs
  end

  dist_mean_diff = (stats.values.first[:mean] - stats.values.last[:mean]).abs

  # Number of sampled differences that do not exceed the observed one.
  fit_index = mean_differences.count { |difference| difference <= dist_mean_diff }

  1.0 - fit_index.to_f / num_permutations.to_f
end
|
374
|
+
end
|
375
|
+
end
|
376
|
+
end
|