benchmark-lab 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +29 -0
- data/.pullreview.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +674 -0
- data/README.md +103 -0
- data/Rakefile +10 -0
- data/benchmark-lab.gemspec +26 -0
- data/lib/benchmark/lab.rb +147 -0
- data/lib/benchmark/lab/descriptive_statistics.rb +63 -0
- data/lib/benchmark/lab/mann_whitney_u_test.rb +114 -0
- data/lib/benchmark/lab/version.rb +5 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/unit/descriptive_statistics_spec.rb +45 -0
- data/spec/unit/experiment_spec.rb +101 -0
- data/spec/unit/mann_whitney_u_test_spec.rb +32 -0
- metadata +134 -0
data/README.md
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
[](https://www.pullreview.com/github/toch/benchmark-lab/reviews/master)
|
2
|
+
|
3
|
+
# Benchmark Lab
|
4
|
+
|
5
|
+
Run Real Experiment and Calculate Non-Parametric Statistics.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Install it yourself as:
|
10
|
+
|
11
|
+
$ gem install benchmark-lab
|
12
|
+
|
13
|
+
## Usage
|
14
|
+
|
15
|
+
There are two ways to use it:
|
16
|
+
1. classic: as Benchmark.bm does
|
17
|
+
2. iterative: collects and measures separately, stores into different JSON
|
18
|
+
files, then puts everything together and ranks them
|
19
|
+
|
20
|
+
### Classic Usage
|
21
|
+
|
22
|
+
```Ruby
|
23
|
+
require 'benchmark/lab'
|
24
|
+
|
25
|
+
n = 5_000_000
|
26
|
+
cases = {
|
27
|
+
'for:' => proc { for i in 1..n; a = "1"; end },
|
28
|
+
'times:' => proc { n.times do ; a = "1"; end },
|
29
|
+
'upto:' => proc { 1.upto(n) do ; a = "1"; end }
|
30
|
+
}
|
31
|
+
|
32
|
+
# How many times do you run the function
|
33
|
+
# 20 is a good minimum number
|
34
|
+
nbr_of_samples = 20
|
35
|
+
|
36
|
+
Benchmark.experiment(nbr_of_samples) do |x|
|
37
|
+
cases.each { |label, blk| x.report(label, &blk) }
|
38
|
+
end
|
39
|
+
```
|
40
|
+
|
41
|
+
The output looks like the following:
|
42
|
+
```
|
43
|
+
user system total real
|
44
|
+
for: [0.77,0.77,0.78] [0.00,0.00,0.00] [0.77,0.77,0.78] [0.77,0.77,0.78]
|
45
|
+
times: [0.74,0.74,0.74] [0.00,0.00,0.00] [0.74,0.74,0.74] [0.74,0.74,0.74]
|
46
|
+
upto: [0.75,0.75,0.75] [0.00,0.00,0.00] [0.75,0.75,0.75] [0.75,0.75,0.75]
|
47
|
+
The best "times:" is significantly (95%) better (total time).
|
48
|
+
```
|
49
|
+
|
50
|
+
### Iterative Usage
|
51
|
+
|
52
|
+
```Ruby
|
53
|
+
require 'benchmark/lab'
|
54
|
+
|
55
|
+
n = 5_000_000
|
56
|
+
|
57
|
+
# How many times do you run the function
|
58
|
+
# 20 is a good minimum number
|
59
|
+
nbr_of_samples = 20
|
60
|
+
|
61
|
+
jsons = []
|
62
|
+
|
63
|
+
jsons << Benchmark.observe_and_summarize(nbr_of_samples) do |x|
|
64
|
+
x.report('for') { for i in 1..n; a = "1"; end }
|
65
|
+
end
|
66
|
+
|
67
|
+
jsons << Benchmark.observe_and_summarize(nbr_of_samples) do |x|
|
68
|
+
x.report('times') { n.times do ; a = "1"; end }
|
69
|
+
end
|
70
|
+
|
71
|
+
jsons << Benchmark.observe_and_summarize(nbr_of_samples) do |x|
|
72
|
+
x.report('upto') { 1.upto(n) do ; a = "1"; end }
|
73
|
+
end
|
74
|
+
|
75
|
+
best, is_h0_rejected = Benchmark.aggregate_and_rank(jsons.map { |json| JSON.parse(json) })
|
76
|
+
|
77
|
+
puts best
|
78
|
+
puts is_h0_rejected
|
79
|
+
```
|
80
|
+
|
81
|
+
The output looks like the following:
|
82
|
+
```
|
83
|
+
{"name"=>"total", "sample"=>[0.6899999999999977, 0.6899999999999977, 0.6899999999999977, 0.6899999999999977, 0.6900000000000013, 0.6900000000000048, 0.6900000000000048, 0.6999999999999957, 0.6999999999999957, 0.6999999999999957, 0.6999999999999957, 0.6999999999999957, 0.6999999999999993, 0.6999999999999993, 0.7000000000000028, 0.7000000000000028, 0.7000000000000028, 0.7000000000000028, 0.7000000000000028, 0.7000000000000028], "sample_size"=>20, "minimum"=>0.6899999999999977, "maximum"=>0.7000000000000028, "first_quartile"=>0.690000000000003, "third_quartile"=>0.7000000000000028, "median"=>0.6999999999999957, "interquartile_range"=>0.009999999999999787, "label"=>"upto"}
|
84
|
+
true
|
85
|
+
```
|
86
|
+
|
87
|
+
## Ideas
|
88
|
+
|
89
|
+
* compare two different implementations of the same function
|
90
|
+
1. get the stats, then compare
|
91
|
+
2. use git (commit, branch)
|
92
|
+
3. use tests to check no performance regression at the same time
|
93
|
+
4. annotate the tests you want to check
|
94
|
+
* decide the sample size automatically (based on the power you want to reach)
|
95
|
+
* explain correctly why we should do that
|
96
|
+
|
97
|
+
## Contributing
|
98
|
+
|
99
|
+
1. Fork it ( https://github.com/toch/benchmark-lab/fork )
|
100
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
101
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
102
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
103
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'benchmark/lab/version'

Gem::Specification.new do |spec|
  spec.name          = 'benchmark-lab'
  spec.version       = Benchmark::Experiment::VERSION
  spec.authors       = ['Christophe Philemotte']
  spec.email         = ['christophe.philemotte@8thcolor.com']
  spec.summary       = %q{Run Real Experiment and Calculate Non-Parametric Statistics.}
  spec.description   = %q{Run Real Experiment and Calculate Non-Parametric Statistics.}
  spec.homepage      = 'https://github.com/toch/benchmark-lab'
  # 'GPLv3' is not a recognized SPDX identifier and makes RubyGems warn on
  # build; 'GPL-3.0' matches the bundled LICENSE.txt (GNU GPL v3).
  spec.license       = 'GPL-3.0'

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.6'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'minitest', '4.5.0'
  spec.add_development_dependency 'turn', '~> 0.9'
  spec.add_runtime_dependency 'distribution'
end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
require 'benchmark/lab/descriptive_statistics'
|
3
|
+
require 'benchmark/lab/mann_whitney_u_test'
|
4
|
+
require 'benchmark/lab/version'
|
5
|
+
|
6
|
+
require 'json'
|
7
|
+
|
8
|
+
module Benchmark
|
9
|
+
# Ordered collection of observations (Benchmark::Tms measurements) gathered
# for a single reported benchmark case.
class Sample
  include Enumerable

  def initialize
    @observations = []
  end

  # Appends one observation.
  def <<(observation)
    @observations << observation
  end

  # Yields each observation in insertion order. Without a block, returns an
  # Enumerator (the original's `else yield` branch could only ever raise
  # LocalJumpError, since `block_given?` is false exactly when no block was
  # passed).
  def each(&block)
    return to_enum(:each) unless block_given?

    @observations.each(&block)
  end
end
|
30
|
+
|
31
|
+
# Monkey-patches Benchmark::Job so each registered case also carries a
# Sample (raw measurements) and an Array that will hold one
# DescriptiveStatistics per measured time.
class Job
  alias old_item item

  # Registers a case exactly as the stock Benchmark::Job#item does, then
  # extends the stored tuple to [label, block, Sample, stats_array].
  # Returns self so calls can be chained, matching the stdlib behavior.
  def item(label = '', &blk)
    old_item(label, &blk)
    @list.last << Sample.new
    @list.last << []
    self
  end

  # Runs every registered case sample_size times, collecting a
  # Benchmark::Tms into its Sample per run, then summarizes each measured
  # time (utime/stime/total/real) into a DescriptiveStatistics appended to
  # the case's stats array.
  def observe_and_summarize(sample_size)
    @list.each do |label, item, sample, stats|
      sample_size.times.each do
        sample << Benchmark.measure(label, &item)
      end

      Experiment::MEASURED_TIMES.keys.each do |time_name|
        # sample.map(&time_name) extracts e.g. #utime from every Tms.
        stats << Benchmark::Experiment::DescriptiveStatistics.new(sample.map(&time_name), time_name)
      end
    end
  end

  # Keep the familiar Benchmark reporting API name.
  alias report item
end
|
54
|
+
|
55
|
+
module Experiment
|
56
|
+
# Maps Benchmark::Tms accessor names to the column headers shown in reports.
# Frozen: this is shared, read-only configuration.
MEASURED_TIMES =
  {
    utime: 'user',
    stime: 'system',
    total: 'total',
    real: 'real'
  }.freeze
|
63
|
+
|
64
|
+
# Builds a Job from the given block, measures every reported case
# sample_size times, and returns the per-label descriptive statistics
# serialized as a JSON string.
def observe_and_summarize(sample_size, &blk)
  job = Job.new(0)
  blk.call(job)
  job.observe_and_summarize(sample_size)
  job.list
     .map { |label, _item, _sample, stats| [label, stats] }
     .to_h
     .to_json
end
|
71
|
+
|
72
|
+
# Merges several JSON-parsed run summaries (Hash of label => stats) into one
# hash and ranks them. Returns [best_stat_hash, h0_rejected], or nil when
# jsons is empty.
#
# NOTE: Enumerable#inject yields (accumulator, element); the original block
# named them backwards (|elem, hsh|) and merged the accumulator INTO each
# element, silently letting earlier runs win on duplicate labels. With the
# conventional fold below, later runs win — labels are expected to be unique
# across runs, so in practice the result is unchanged.
def aggregate_and_rank(jsons)
  return if jsons.empty?
  all_stats = jsons.inject({}) { |acc, stats| acc.merge(stats) }
  rank(all_stats)
end
|
77
|
+
|
78
|
+
# Classic entry point (mirrors Benchmark.bm): measures every case declared
# in the block, prints the statistics table, then reports whether the
# fastest case beats the runner-up at the 95% significance level.
# Returns the parsed per-label statistics hash.
def experiment(sample_size, &blk)
  all_stats = JSON.parse(observe_and_summarize(sample_size, &blk))
  print_stats(all_stats)

  best, significant = rank(all_stats)

  puts "The best \"#{best['label']}\" is #{significant ? '' : 'not '}significantly (95%) better (total time)."

  all_stats
end
|
88
|
+
|
89
|
+
# Ranks all cases by their median 'total' time (ascending) and runs a
# Mann-Whitney U test between the two fastest samples.
#
# all_stats - Hash of label => Array of JSON-parsed stat hashes.
# alpha     - significance level for the test (default 0.05).
#
# Returns [best_stat_hash, h0_rejected]. With a single case there is
# nothing to compare, so h0_rejected stays true.
#
# NOTE(review): mutates the stat hashes in place by adding a 'label' key.
def rank(all_stats, alpha = 0.05)
  ranked = all_stats.map do |label, stats|
    # Keep only the 'total' time summary and tag it with its case label.
    total = stats.select{ |stat| stat['name'] == 'total' }.first
    total['label'] = label
    total
  end.sort_by { |stat| stat['median'] }
  is_h0_rejected = true
  if all_stats.size > 1
    # Compare the two fastest raw samples; reject H0 when p-value < alpha.
    z = Benchmark::Experiment::MannWhitneyUTest::calculate_z(ranked.first['sample'], ranked[1]['sample'])
    p_value = Benchmark::Experiment::MannWhitneyUTest::calculate_probability_z(z)
    is_h0_rejected = Benchmark::Experiment::MannWhitneyUTest::is_null_hypothesis_rejected?(p_value, alpha)
  end

  return ranked.first, is_h0_rejected
end
|
104
|
+
|
105
|
+
# TODO: empty placeholder — no implementation yet. The iterative workflow
# currently goes through observe_and_summarize + aggregate_and_rank.
def iterative_experiment
end
|
107
|
+
|
108
|
+
private
|
109
|
+
|
110
|
+
# Prints an aligned table: one row per case, one column per measured time,
# each cell formatted as [first_quartile,median,third_quartile].
def print_stats(all_stats)
  width = label_width(all_stats)

  lines = []
  spacing = [0] * MEASURED_TIMES.size
  tab = ' ' * 4

  all_stats.each do |label, stats|
    line = ''
    line << label.ljust(width)

    stats.each_with_index do |stat, index|
      value = "#{tab}[#{'%.2f' % stat['first_quartile']},#{'%.2f' % stat['median']},#{'%.2f' % stat['third_quartile']}]"
      # Track the widest cell per column so the headers line up underneath.
      # (.max replaces the original's roundabout [a, b].minmax.last.)
      spacing[index] = [spacing[index], value.length].max
      line << value
    end
    line << "\n"
    lines << line
  end

  # Header row: blank label column, then one padded header per time column.
  print ''.ljust(width)
  MEASURED_TIMES.values.each_with_index do |head, index|
    print "#{tab}#{head}".ljust(spacing[index])
  end
  print "\n"

  lines.each { |line| print line }
end
|
138
|
+
|
139
|
+
# Width of the widest case label, used to left-align the report's first
# column. Returns nil for an empty all_stats (as Enumerable#max does on []).
def label_width(all_stats)
  all_stats.map { |label, _| label.to_s.length }.max
end
|
143
|
+
|
144
|
+
end
|
145
|
+
|
146
|
+
extend Benchmark::Experiment
|
147
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Benchmark
  module Experiment
    # Non-parametric summary of a benchmark sample: minimum/maximum, median,
    # Tukey-hinge quartiles and interquartile range.
    class DescriptiveStatistics
      # sample - Array of Comparable observations; must not be empty.
      # name   - identifier of the measured time (e.g. 'total').
      #
      # Raises ArgumentError on an empty sample. (The original carried a
      # "raise exception if empty sample" TODO comment and instead failed
      # later with an obscure NoMethodError inside the median computation.)
      def initialize(sample, name = '')
        raise ArgumentError, 'sample must not be empty' if sample.empty?

        @name = name
        @sample = sample.sort
        @minimum, @maximum = @sample.minmax
        @median = calculate_median_of(@sample)
        @first_quartile = calculate_first_quartile_of(@sample)
        @third_quartile = calculate_third_quartile_of(@sample)
      end

      attr_reader :name, :sample, :minimum, :maximum, :first_quartile, :third_quartile, :median

      def sample_size
        sample.size
      end

      def interquartile_range
        @third_quartile - @first_quartile
      end

      # Serializes the full summary. The options argument is accepted for
      # JSON-generator compatibility but not used.
      def to_json(options = {})
        {
          'name' => name,
          'sample' => sample,
          'sample_size' => sample_size,
          'minimum' => minimum,
          'maximum' => maximum,
          'first_quartile' => first_quartile,
          'third_quartile' => third_quartile,
          'median' => median,
          'interquartile_range' => interquartile_range
        }.to_json
      end

      private

      # https://en.wikipedia.org/wiki/Median
      # data must already be sorted (initialize sorts @sample).
      def calculate_median_of(data)
        return data[data.size / 2] if data.size.odd?

        (data[(data.size - 1) / 2] + data[data.size / 2]) / 2.0
      end

      # Tukey's hinges: for odd sizes the lower half includes the median.
      # http://mathworld.wolfram.com/Quartile.html
      # https://en.wikipedia.org/wiki/Quartile
      def calculate_first_quartile_of(data)
        return calculate_median_of(data[0..(data.size / 2)]) if data.size.odd?

        calculate_median_of(data[0..((data.size - 1) / 2)])
      end

      # The upper-half slice is identical for odd and even sizes, so the
      # original's odd/even branch (two identical arms) has been collapsed.
      def calculate_third_quartile_of(data)
        calculate_median_of(data[(data.size / 2)..-1])
      end
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'distribution'
|
2
|
+
|
3
|
+
module Benchmark
  module Experiment
    # Two-sided Mann-Whitney U test (Wilcoxon rank-sum) using the normal
    # approximation, with a tie-corrected standard deviation when the
    # pooled sample contains ties.
    module MannWhitneyUTest
      # Returns [U_x, U_y] for the two samples; U_x + U_y == n_x * n_y.
      def self.calculate_U(x, y)
        ranked = concatenate_and_label(x, y)

        rank!(ranked)

        adjust_ties!(ranked)

        r_x = rank_sum(ranked, :x)
        r_y = rank_sum(ranked, :y)

        n_x = x.size
        n_y = y.size

        u_x = calculate_u_sample(n_x, n_y, r_x)
        u_y = calculate_u_sample(n_y, n_x, r_y)

        [u_x, u_y]
      end

      # z statistic of the smaller U under the normal approximation.
      def self.calculate_z(x, y)
        n_x = x.size.to_f
        n_y = y.size.to_f
        n = n_x + n_y
        n_xy = n_x * n_y

        u = calculate_U(x, y).minmax.first.to_f

        t = ties?(x, y)

        mu_u = n_xy / 2.0

        if !t.first
          sigma_u = Math::sqrt(n_xy * (n + 1.0) / 12.0)
        else
          # Tie-corrected variance:
          #   sigma^2 = (n_x*n_y / (n*(n-1))) * ((n^3 - n)/12 - T)
          # with T = sum over tie groups of (t^3 - t)/12.
          # The original used n*(n+1) in the denominator, which is not the
          # standard formula (see Mann-Whitney U, normal approximation).
          sigma_u = Math::sqrt(n_xy / (n * (n - 1)) * ((n**3 - n) / 12.0 - t.last))
        end

        (u - mu_u) / sigma_u
      end

      # p-value of z under the standard normal; doubled when two_sided.
      def self.calculate_probability_z(z, two_sided = true)
        prob = (1.0 - Distribution::Normal.cdf(z.abs))
        prob *= 2.0 if two_sided
        prob
      end

      # True when the p-value falls below the chosen significance level.
      def self.is_null_hypothesis_rejected?(pvalue, significance_level)
        pvalue < significance_level
      end

      # NOTE(review): `private` has no effect on `def self.` methods — the
      # helpers below are technically public. Left as-is for backward
      # compatibility; making them truly private would require
      # private_class_method or a class << self block.
      private

      # Returns [ties_found?, T] where T = sum((t^3 - t) / 12) over every
      # group of tied values in the pooled sample.
      def self.ties?(x, y)
        all = x + y
        ties = all.group_by { |e| e }.reject { |_, v| v.size < 2 }

        found_ties = ties.size > 0
        [
          found_ties,
          # BUG FIX: the original folded over the Hash pairs, so each `v`
          # was a [key, group] pair and v.size was always 2 — wrong for any
          # tie group of 3 or more. Fold over the groups themselves.
          ties.values.inject(0) { |sum, group| sum + (group.size**3 - group.size) / 12.0 }
        ]
      end

      # Tags every observation with its sample of origin: [value, :x|:y].
      def self.concatenate_and_label(x, y)
        ranked = []

        ranked += x.map { |e| [e, :x] }
        ranked += y.map { |e| [e, :y] }
      end

      # Sorts the pooled pairs and appends a 1-based rank to each.
      def self.rank!(ranked)
        ranked.sort!

        ranked.inject(1) do |rank, elem|
          elem << rank
          rank + 1
        end
      end

      # Sum of ranks belonging to the given sample label (:x or :y).
      def self.rank_sum(ranked, label)
        ranked
          .select { |elem| elem[1] == label }
          .inject(0) { |rank_sum, elem| rank_sum + elem.last }
      end

      # Replaces the rank of each tied value with its group's mean rank.
      def self.adjust_ties!(ranked)
        ties = {}

        ranked
          .group_by { |e| e.first }
          .reject { |_, v| v.size < 2 }
          .each do |score, data|
            ties[score] = data.inject(0) do |sum, elem|
              sum + elem.last
            end / data.size.to_f
          end

        ranked.map! do |elem|
          elem[-1] = ties[elem.first] if ties.keys.include? elem.first
          elem
        end if ties.keys.size > 0
      end

      # U for one sample: U = n*n_other + n*(n+1)/2 - R, R its rank sum.
      def self.calculate_u_sample(n, n_other, r)
        n * n_other + n * (n + 1) / 2.0 - r
      end
    end
  end
end
|