ruby-statistics 1.0.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/README.md +11 -0
- data/lib/math.rb +1 -1
- data/lib/statistics/distribution/beta.rb +1 -0
- data/lib/statistics/distribution/normal.rb +40 -0
- data/lib/statistics/distribution/t_student.rb +32 -0
- data/lib/statistics/distribution/weibull.rb +20 -0
- data/lib/statistics/statistical_test/chi_squared_test.rb +42 -0
- data/lib/statistics/statistical_test/t_test.rb +22 -0
- data/lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb +95 -0
- data/lib/statistics/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87876f9613fc4472a574b392096f81a18e804da0
|
4
|
+
data.tar.gz: 52316d0cc82b0a2e89b2409ff6f13593f5e4d73c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4a113a9384ff5d4cce963217c97a194ecf142a389f2f5c1f712ec92cef2ef498b656809cdc528dbf5b178104d39058e630bfd69c401858c33ca1b056cfe545ae
|
7
|
+
data.tar.gz: 078fb2c0bae3eb54f357b3bb614692289dea75ef8c93e264b64ffcb29567a48a5f39f9bb7e01cc56dd0ea8d3f4dfc82563749575ef25653236c6e9268e85f6c7
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
# Ruby Statistics
|
2
2
|
|
3
|
+
![](https://travis-ci.org/estebanz01/ruby-statistics.svg?branch=master)
|
4
|
+
|
3
5
|
A basic ruby gem that implements some statistical methods, functions and concepts to be used in any ruby environment without depending on any mathematical software like `R`, `Matlab`, `Octave` or similar.
|
4
6
|
|
7
|
+
Unit tests run under the following Ruby versions:
|
8
|
+
* Ruby 2.2.
|
9
|
+
* Ruby 2.3.1.
|
10
|
+
* Ruby 2.4.0.
|
11
|
+
* Ruby 2.5.0.
|
12
|
+
|
5
13
|
We got the inspiration from the folks at [JStat](https://github.com/jstat/jstat) and some interesting lectures about [Keystroke dynamics](http://www.biometric-solutions.com/keystroke-dynamics.html).
|
6
14
|
|
7
15
|
Some logic and algorithms are extractions or adaptations from other authors, which are referenced in the comments.
|
@@ -43,6 +51,9 @@ poisson = Distribution::Poisson.new(l) # Using Distribution alias.
|
|
43
51
|
normal = Statistics::Distribution::StandardNormal.new # Using all namespaces.
|
44
52
|
```
|
45
53
|
|
54
|
+
## Documentation
|
55
|
+
You can find more detailed documentation of all available distributions, tests and functions in the [Documentation Index](https://github.com/estebanz01/ruby-statistics/wiki/Documentation-Index).
|
56
|
+
|
46
57
|
## Development
|
47
58
|
|
48
59
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/math.rb
CHANGED
@@ -45,7 +45,7 @@ module Math
|
|
45
45
|
|
46
46
|
def self.lower_incomplete_gamma_function(s, x)
|
47
47
|
# The greater the iterations, the better. That's why we are iterating 10_000 * x times
|
48
|
-
self.simpson_rule(0, x, (10_000 * x).round) do |t|
|
48
|
+
self.simpson_rule(0, x, (10_000 * x.round).round) do |t|
|
49
49
|
(t ** (s - 1)) * Math.exp(-t)
|
50
50
|
end
|
51
51
|
end
|
@@ -25,6 +25,46 @@ module Statistics
|
|
25
25
|
|
26
26
|
(left_up/(left_down) * right)
|
27
27
|
end
|
28
|
+
|
29
|
+
## Marsaglia polar method implementation for random gaussian (normal) number generation.
# References:
# https://en.wikipedia.org/wiki/Marsaglia_polar_method
# https://math.stackexchange.com/questions/69245/transform-uniform-distribution-to-normal-distribution-using-lindeberg-l%C3%A9vy-clt
# https://www.projectrhea.org/rhea/index.php/The_principles_for_how_to_generate_random_samples_from_a_Gaussian_distribution
#
# elements - How many random numbers to generate (default: 1).
# seed     - Seed for the pseudo random generator (default: a fresh seed).
#            The same seed always produces the same sample.
#
# Returns a single number when `elements` is 1, otherwise an Array with
# `elements` numbers. Each value is built from `mean` and `standard_deviation`
# of the receiver.
def random(elements: 1, seed: Random.new_seed)
  # A private generator yields the same stream as `srand(seed)` followed by
  # `rand`, but leaves the process-wide Kernel#rand sequence untouched.
  generator = Random.new(seed)

  results = elements.times.map do
    x = r = nil

    # Find an (x, y) point strictly inside the unit circle, excluding the
    # origin (r == 0 would make the logarithm below undefined).
    loop do
      x = 2.0 * generator.rand - 1.0
      y = 2.0 * generator.rand - 1.0

      r = (x ** 2) + (y ** 2)

      break if r > 0.0 && r < 1.0
    end

    # Project the random point to the required random distance and turn it
    # into a gaussian value with the mean and spread of this distribution.
    mean + x * Math.sqrt(-2.0 * Math.log(r) / r) * standard_deviation
  end

  elements == 1 ? results.first : results
end
|
28
68
|
end
|
29
69
|
|
30
70
|
class StandardNormal < Normal
|
@@ -45,6 +45,38 @@ module Statistics
|
|
45
45
|
degrees_of_freedom/(degrees_of_freedom - 2.0)
|
46
46
|
end
|
47
47
|
end
|
48
|
+
|
49
|
+
# Draws random samples that follow this Student's t distribution.
#
# Implementation of Bailey's polar method, the Student's t counterpart of the
# Marsaglia polar method used for gaussian numbers:
#   Bailey, R. W. (1994). "Polar Generation of Random Variates with the
#   t-Distribution". Mathematics of Computation 62 (206): 779-781.
#
# For a point (u, z) drawn uniformly inside the unit circle and
# w = u^2 + z^2, the value u * sqrt(v * (w^(-2/v) - 1) / w) follows a
# Student's t distribution with v degrees of freedom.
#
# elements - How many random numbers to generate (default: 1).
# seed     - Seed for the pseudo random generator (default: a fresh seed).
#            The same seed always produces the same sample.
#
# Returns a single number when `elements` is 1, otherwise an Array with
# `elements` numbers.
def random(elements: 1, seed: Random.new_seed)
  # Private generator: reproducible per seed without reseeding Kernel#rand.
  generator = Random.new(seed)
  v = degrees_of_freedom.to_f

  results = elements.times.map do
    u = w = nil

    # Find a point strictly inside the unit circle, excluding the origin
    # (w == 0 would divide by zero below).
    loop do
      u = 2.0 * generator.rand - 1.0
      z = 2.0 * generator.rand - 1.0

      w = (u ** 2) + (z ** 2)

      break if w > 0.0 && w < 1.0
    end

    # u / sqrt(w) gives the direction, v * (w^(-2/v) - 1) the squared radius.
    u * Math.sqrt(v * ((w ** (-2.0 / v)) - 1.0) / w)
  end

  elements == 1 ? results.first : results
end
|
48
80
|
end
|
49
81
|
end
|
50
82
|
end
|
@@ -41,6 +41,26 @@ module Statistics
|
|
41
41
|
|
42
42
|
(scale ** 2) * (left - right)
|
43
43
|
end
|
44
|
+
|
45
|
+
# Using the inverse CDF function, also called quantile, we can calculate
# a random sample that follows a weibull distribution.
#
# Assuming the standard parameterisation F(x) = 1 - exp(-(x / scale) ** shape),
# the quantile of a uniform value u in [0, 1) is
#   scale * (-ln(1 - u)) ** (1 / shape)
#
# Reference noted by the original author:
# http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
#
# elements - How many random numbers to generate (default: 1).
# seed     - Seed for the pseudo random generator (default: a fresh seed).
#            The same seed always produces the same sample.
#
# Returns a single number when `elements` is 1, otherwise an Array with
# `elements` numbers.
def random(elements: 1, seed: Random.new_seed)
  # Private generator: reproducible per seed without reseeding Kernel#rand.
  generator = Random.new(seed)

  # Float exponent on purpose: with an Integer shape, 1/shape would be
  # truncated to 0 and every sample would collapse to 1.
  exponent = 1.0 / shape

  results = elements.times.map do
    scale * ((-Math.log(1.0 - generator.rand)) ** exponent)
  end

  elements == 1 ? results.first : results
end
|
44
64
|
end
|
45
65
|
end
|
46
66
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Statistics
  module StatisticalTest
    # Pearson's chi-squared goodness of fit test.
    class ChiSquaredTest
      # Computes the chi-squared statistic of a set of observations.
      #
      # expected - Either a Numeric (the same expected count is used for every
      #            observation) or a collection with one expected count per
      #            observation, in the same order as `observed`.
      # observed - Array with the observed counts.
      #
      # Returns a two element Array: [statistic, degrees of freedom], where the
      # degrees of freedom are `observed.size - 1`.
      # Raises ArgumentError when `expected` is a collection whose size differs
      # from `observed`.
      def self.chi_statistic(expected, observed)
        # If the expected is a number, we assume that all expected observations
        # have the same probability to occur, hence we expect to see the same
        # number of expected observations per each observed value.
        statistic = if expected.is_a? Numeric
                      observed.reduce(0) do |memo, observed_value|
                        memo + ((observed_value - expected) ** 2) / expected.to_f
                      end
                    else
                      # A size mismatch would either ignore observations while still
                      # counting them as degrees of freedom, or fail on a nil value.
                      unless expected.count == observed.size
                        raise ArgumentError, 'expected and observed must have the same number of elements'
                      end

                      expected.each_with_index.reduce(0) do |memo, (expected_value, index)|
                        memo + ((observed[index] - expected_value) ** 2) / expected_value.to_f
                      end
                    end

        [statistic, observed.size - 1]
      end

      # Runs the goodness of fit test.
      #
      # alpha    - Significance level, e.g. 0.05.
      # expected - See .chi_statistic.
      # observed - See .chi_statistic.
      #
      # Returns a Hash with :probability (chi-squared CDF at the statistic),
      # :p_value (1 - probability), :alpha, :null (true when the null
      # hypothesis is kept), :alternative (true when it is rejected) and
      # :confidence_level (1 - alpha).
      def self.goodness_of_fit(alpha, expected, observed)
        chi_score, df = *self.chi_statistic(expected, observed) # Splat array result

        return if chi_score.nil? || df.nil?

        probability = Distribution::ChiSquared.new(df).cumulative_function(chi_score)
        p_value = 1 - probability

        # According to https://stats.stackexchange.com/questions/29158/do-you-reject-the-null-hypothesis-when-p-alpha-or-p-leq-alpha
        # We can assume that if p_value <= alpha, we can safely reject the null hypothesis, ie. accept the alternative hypothesis.
        { probability: probability,
          p_value: p_value,
          alpha: alpha,
          null: alpha < p_value,
          alternative: p_value <= alpha,
          confidence_level: 1 - alpha }
      end
    end
  end
end
|
@@ -41,6 +41,28 @@ module Statistics
|
|
41
41
|
alternative: p_value <= alpha,
|
42
42
|
confidence_level: 1 - alpha }
|
43
43
|
end
|
44
|
+
|
45
|
+
# Paired (dependent samples) t-test: tests whether the mean of the pairwise
# differences between both groups is zero.
#
# alpha       - Significance level, e.g. 0.05.
# tails       - :two_tail for a two-sided test; any other value runs the
#               one-sided test on the upper tail.
# left_group  - Array with the first measurement of each pair.
# right_group - Array with the second measurement of each pair, in the same
#               order and with the same size as `left_group`.
#
# Returns a Hash with :probability (Student's t CDF at the t score), :p_value,
# :alpha, :null (true when the null hypothesis is kept), :alternative (true
# when it is rejected) and :confidence_level (1 - alpha).
#
# NOTE(review): relies on Array#mean and Array#standard_deviation, which are
# not core Ruby - presumably provided elsewhere in this gem.
def self.paired_test(alpha, tails, left_group, right_group)
  # Handy snippet grabbed from https://stackoverflow.com/questions/2682411/ruby-sum-corresponding-members-of-two-or-more-arrays
  differences = [left_group, right_group].transpose.map { |left, right| left - right }

  degrees_of_freedom = differences.size - 1
  standard_error = differences.standard_deviation / Math.sqrt(differences.size)

  t_score = differences.mean / standard_error.to_f

  probability = Distribution::TStudent.new(degrees_of_freedom).cumulative_function(t_score)

  p_value = if tails == :two_tail
              # The two-sided p-value must not depend on the sign of the t score:
              # take the smaller tail so a negative score cannot yield p > 1.
              2 * [probability, 1 - probability].min
            else
              1 - probability
            end

  { probability: probability,
    p_value: p_value,
    alpha: alpha,
    null: alpha < p_value,
    alternative: p_value <= alpha,
    confidence_level: 1 - alpha }
end
|
44
66
|
end
|
45
67
|
end
|
46
68
|
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Statistics
  module StatisticalTest
    # Wilcoxon rank-sum test (also known as Mann-Whitney U test) using the
    # normal approximation of the U statistic.
    class WilcoxonRankSumTest
      # Ranks all elements in ascending order (first rank is 1).
      #
      # elements - Array of mutually comparable values.
      #
      # Returns a Hash keyed by value: { x => { counter: c, rank: r } }, where
      # `c` is how many times the value appears and `r` is the SUM of the
      # positions it occupies, so the tie-corrected rank is `r / c`.
      # Values that are equal under `==` share one entry even when their
      # classes differ (1 and 1.0); the key is the first one found after sorting.
      def rank(elements)
        ranked_elements = {}
        current_key = nil

        elements.sort.each_with_index do |element, index|
          position = index + 1

          # After sorting, equal values are adjacent, so comparing against the
          # most recent key is enough. `==` (instead of Hash#eql? semantics)
          # makes 1 and 1.0 count as a tie.
          if ranked_elements.key?(current_key) && current_key == element
            # This allow us to solve the ties easily when performing the rank summation per group
            entry = ranked_elements[current_key]
            entry[:counter] += 1
            entry[:rank] += position
          else
            current_key = element
            ranked_elements[current_key] = { counter: 1, rank: position }
          end
        end

        # ranked_elements = { x => { counter: 1, rank: y } }
        ranked_elements
      end

      # Steps to perform the calculation are based on http://www.mit.edu/~6.s085/notes/lecture5.pdf
      #
      # alpha     - Significance level, e.g. 0.05.
      # tails     - :two_tail doubles the one-sided p-value; any other value
      #             keeps the one-sided p-value.
      # group_one - Array with the observations of the first group.
      # group_two - Array with the observations of the second group.
      #
      # Returns a Hash with :probability (standard normal CDF at |z|), :u,
      # :z, :p_value, :alpha, :null (true when the null hypothesis is kept),
      # :alternative (true when it is rejected) and :confidence_level.
      def perform(alpha, tails, group_one, group_two)
        # Size for each group
        n1, n2 = group_one.size, group_two.size

        # Rank all data
        total_ranks = rank(group_one + group_two)

        # sum rankings per group
        r1 = ranked_sum_for(total_ranks, group_one)
        r2 = ranked_sum_for(total_ranks, group_two)

        # calculate U statistic
        u1 = (n1 * (n1 + 1) / 2.0) - r1
        u2 = (n2 * (n2 + 1) / 2.0) - r2

        u_statistic = [u1.abs, u2.abs].min

        median_u = (n1 * n2) / 2.0

        ties = total_ranks.values.select { |element| element[:counter] > 1 }

        std_u = if ties.empty?
                  Math.sqrt((n1 * n2 * (n1 + n2 + 1)) / 12.0)
                else
                  corrected_sigma(ties, n1, n2)
                end

        z = (u_statistic - median_u) / std_u

        # Most literature are not very specific about the normal distribution to be used.
        # We ran multiple tests with a Normal(median_u, std_u) and Normal(0, 1) and we found
        # the latter to be more aligned with the results.
        probability = Distribution::StandardNormal.new.cumulative_function(z.abs)
        p_value = 1 - probability
        p_value *= 2 if tails == :two_tail

        { probability: probability,
          u: u_statistic,
          z: z,
          p_value: p_value,
          alpha: alpha,
          null: alpha < p_value,
          alternative: p_value <= alpha,
          confidence_level: 1 - alpha }
      end

      # Standard deviation of U corrected for ties.
      # Formula extracted from http://www.statstutor.ac.uk/resources/uploaded/mannwhitney.pdf
      #
      # ties            - Rank entries ({ counter:, rank: }) with counter > 1.
      # total_group_one - Size of the first group.
      # total_group_two - Size of the second group.
      private def corrected_sigma(ties, total_group_one, total_group_two)
        n = total_group_one + total_group_two

        rank_sum = ties.reduce(0) do |memo, t|
          memo + ((t[:counter] ** 3) - t[:counter]) / 12.0
        end

        left = (total_group_one * total_group_two) / (n * (n - 1)).to_f
        right = (((n ** 3) - n) / 12.0) - rank_sum

        Math.sqrt(left * right)
      end

      # Sums the tie-corrected ranks of every element of `group`.
      #
      # total - Hash returned by #rank for the combined data.
      # group - Array with the observations of one group.
      private def ranked_sum_for(total, group)
        group.reduce(0) do |memo, element|
          # Fall back to an `==` search for values stored under an equal key of
          # a different class (e.g. looking up 1.0 when the key is 1).
          entry = total.fetch(element) { total.find { |key, _| key == element }.last }

          memo + entry[:rank] / entry[:counter].to_f
        end
      end
    end

    # Both test are the same. To keep the selected name, we just alias the class
    # with the implementation.
    MannWhitneyU = WilcoxonRankSumTest
  end
end
|
data/lib/statistics/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-statistics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- esteban zapata
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -130,8 +130,10 @@ files:
|
|
130
130
|
- lib/statistics/distribution/uniform.rb
|
131
131
|
- lib/statistics/distribution/weibull.rb
|
132
132
|
- lib/statistics/statistical_test.rb
|
133
|
+
- lib/statistics/statistical_test/chi_squared_test.rb
|
133
134
|
- lib/statistics/statistical_test/f_test.rb
|
134
135
|
- lib/statistics/statistical_test/t_test.rb
|
136
|
+
- lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb
|
135
137
|
- lib/statistics/version.rb
|
136
138
|
- ruby-statistics.gemspec
|
137
139
|
homepage: https://github.com/estebanz01/ruby-statistics
|
@@ -154,7 +156,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
156
|
version: '0'
|
155
157
|
requirements: []
|
156
158
|
rubyforge_project:
|
157
|
-
rubygems_version: 2.
|
159
|
+
rubygems_version: 2.5.1
|
158
160
|
signing_key:
|
159
161
|
specification_version: 4
|
160
162
|
summary: A ruby gem for som specific statistics. Inspired by the jStat js library.
|