ruby-statistics 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/README.md +11 -0
- data/lib/math.rb +1 -1
- data/lib/statistics/distribution/beta.rb +1 -0
- data/lib/statistics/distribution/normal.rb +40 -0
- data/lib/statistics/distribution/t_student.rb +32 -0
- data/lib/statistics/distribution/weibull.rb +20 -0
- data/lib/statistics/statistical_test/chi_squared_test.rb +42 -0
- data/lib/statistics/statistical_test/t_test.rb +22 -0
- data/lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb +95 -0
- data/lib/statistics/version.rb +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87876f9613fc4472a574b392096f81a18e804da0
|
4
|
+
data.tar.gz: 52316d0cc82b0a2e89b2409ff6f13593f5e4d73c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4a113a9384ff5d4cce963217c97a194ecf142a389f2f5c1f712ec92cef2ef498b656809cdc528dbf5b178104d39058e630bfd69c401858c33ca1b056cfe545ae
|
7
|
+
data.tar.gz: 078fb2c0bae3eb54f357b3bb614692289dea75ef8c93e264b64ffcb29567a48a5f39f9bb7e01cc56dd0ea8d3f4dfc82563749575ef25653236c6e9268e85f6c7
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
# Ruby Statistics
|
2
2
|
|
3
|
+

|
4
|
+
|
3
5
|
A basic ruby gem that implements some statistical methods, functions and concepts to be used in any ruby environment without depending on any mathematical software like `R`, `Matlab`, `Octave` or similar.
|
4
6
|
|
7
|
+
Unit tests run under the following Ruby versions:
|
8
|
+
* Ruby 2.2.
|
9
|
+
* Ruby 2.3.1.
|
10
|
+
* Ruby 2.4.0.
|
11
|
+
* Ruby 2.5.0.
|
12
|
+
|
5
13
|
We got the inspiration from the folks at [JStat](https://github.com/jstat/jstat) and some interesting lectures about [Keystroke dynamics](http://www.biometric-solutions.com/keystroke-dynamics.html).
|
6
14
|
|
7
15
|
Some logic and algorithms are extractions or adaptations from other authors, which are referenced in the comments.
|
@@ -43,6 +51,9 @@ poisson = Distribution::Poisson.new(l) # Using Distribution alias.
|
|
43
51
|
normal = Statistics::Distribution::StandardNormal.new # Using all namespaces.
|
44
52
|
```
|
45
53
|
|
54
|
+
## Documentation
|
55
|
+
You can find more detailed documentation of all available distributions, tests and functions in the [Documentation Index](https://github.com/estebanz01/ruby-statistics/wiki/Documentation-Index).
|
56
|
+
|
46
57
|
## Development
|
47
58
|
|
48
59
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/math.rb
CHANGED
@@ -45,7 +45,7 @@ module Math
|
|
45
45
|
|
46
46
|
def self.lower_incomplete_gamma_function(s, x)
|
47
47
|
# The greater the iterations, the better. That's why we are iterating 10_000 * x times
|
48
|
-
self.simpson_rule(0, x, (10_000 * x).round) do |t|
|
48
|
+
self.simpson_rule(0, x, (10_000 * x.round).round) do |t|
|
49
49
|
(t ** (s - 1)) * Math.exp(-t)
|
50
50
|
end
|
51
51
|
end
|
data/lib/statistics/distribution/normal.rb
CHANGED
@@ -25,6 +25,46 @@ module Statistics
|
|
25
25
|
|
26
26
|
(left_up/(left_down) * right)
|
27
27
|
end
|
28
|
+
|
29
|
+
## Marsaglia polar method implementation for random gaussian (normal) number generation.
|
30
|
+
# References:
|
31
|
+
# https://en.wikipedia.org/wiki/Marsaglia_polar_method
|
32
|
+
# https://math.stackexchange.com/questions/69245/transform-uniform-distribution-to-normal-distribution-using-lindeberg-l%C3%A9vy-clt
|
33
|
+
# https://www.projectrhea.org/rhea/index.php/The_principles_for_how_to_generate_random_samples_from_a_Gaussian_distribution
|
34
|
+
|
35
|
+
def random(elements: 1, seed: Random.new_seed)
|
36
|
+
results = []
|
37
|
+
|
38
|
+
# Setup seed
|
39
|
+
srand(seed)
|
40
|
+
|
41
|
+
# Number of random numbers to be generated.
|
42
|
+
elements.times do
|
43
|
+
x, y, r = 0.0, 0.0, 0.0
|
44
|
+
|
45
|
+
# Find an (x, y) point strictly inside the unit circle, excluding the origin (0 < x^2 + y^2 < 1).
|
46
|
+
loop do
|
47
|
+
x = 2.0 * rand - 1.0
|
48
|
+
y = 2.0 * rand - 1.0
|
49
|
+
|
50
|
+
r = (x ** 2) + (y ** 2)
|
51
|
+
|
52
|
+
break unless r >= 1.0 || r == 0
|
53
|
+
end
|
54
|
+
|
55
|
+
# Project the random point to the required random distance
|
56
|
+
r = Math.sqrt(-2.0 * Math.log(r) / r)
|
57
|
+
|
58
|
+
# Transform the random distance to a gaussian value and append it to the results array
|
59
|
+
results << mean + x * r * standard_deviation
|
60
|
+
end
|
61
|
+
|
62
|
+
if elements == 1
|
63
|
+
results.first
|
64
|
+
else
|
65
|
+
results
|
66
|
+
end
|
67
|
+
end
|
28
68
|
end
|
29
69
|
|
30
70
|
class StandardNormal < Normal
|
data/lib/statistics/distribution/t_student.rb
CHANGED
@@ -45,6 +45,38 @@ module Statistics
|
|
45
45
|
degrees_of_freedom/(degrees_of_freedom - 2.0)
|
46
46
|
end
|
47
47
|
end
|
48
|
+
|
49
|
+
# Quantile function extracted from http://www.jennessent.com/arcview/idf.htm
|
50
|
+
# TODO: Make it truly Student's T sample.
|
51
|
+
def random(elements: 1, seed: Random.new_seed)
|
52
|
+
warn 'This is an alpha version code. The generated sample is similar to an uniform distribution'
|
53
|
+
srand(seed)
|
54
|
+
|
55
|
+
v = degrees_of_freedom
|
56
|
+
results = []
|
57
|
+
|
58
|
+
# Because the Quantile function of a student-t distribution is between (-Infinity, y)
|
59
|
+
# we set up a small threshold in order to properly compute the integral
|
60
|
+
threshold = 10_000.0e-12
|
61
|
+
|
62
|
+
elements.times do
|
63
|
+
y = rand
|
64
|
+
results << Math.simpson_rule(threshold, y, 10_000) do |t|
|
65
|
+
up = Math.gamma((v+1)/2.0)
|
66
|
+
down = Math.sqrt(Math::PI * v) * Math.gamma(v/2.0)
|
67
|
+
right = (1 + ((y ** 2)/v.to_f)) ** ((v+1)/2.0)
|
68
|
+
left = up/down.to_f
|
69
|
+
|
70
|
+
left * right
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
if elements == 1
|
75
|
+
results.first
|
76
|
+
else
|
77
|
+
results
|
78
|
+
end
|
79
|
+
end
|
48
80
|
end
|
49
81
|
end
|
50
82
|
end
|
data/lib/statistics/distribution/weibull.rb
CHANGED
@@ -41,6 +41,26 @@ module Statistics
|
|
41
41
|
|
42
42
|
(scale ** 2) * (left - right)
|
43
43
|
end
|
44
|
+
|
45
|
+
# Using the inverse CDF function, also called quantile, we can calculate
|
46
|
+
# a random sample that follows a weibull distribution.
|
47
|
+
#
|
48
|
+
# Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
|
49
|
+
def random(elements: 1, seed: Random.new_seed)
|
50
|
+
results = []
|
51
|
+
|
52
|
+
srand(seed)
|
53
|
+
|
54
|
+
elements.times do
|
55
|
+
results << ((-1/scale) * Math.log(1 - rand)) ** (1/shape)
|
56
|
+
end
|
57
|
+
|
58
|
+
if elements == 1
|
59
|
+
results.first
|
60
|
+
else
|
61
|
+
results
|
62
|
+
end
|
63
|
+
end
|
44
64
|
end
|
45
65
|
end
|
46
66
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Statistics
|
2
|
+
module StatisticalTest
|
3
|
+
class ChiSquaredTest
|
4
|
+
def self.chi_statistic(expected, observed)
|
5
|
+
# If the expected is a number, we assume that all expected observations
|
6
|
+
# have the same probability to occur, hence we expect to see the same number
|
7
|
+
# of expected observations per each observed value
|
8
|
+
statistic = if expected.is_a? Numeric
|
9
|
+
observed.reduce(0) do |memo, observed_value|
|
10
|
+
up = (observed_value - expected) ** 2
|
11
|
+
memo += (up/expected.to_f)
|
12
|
+
end
|
13
|
+
else
|
14
|
+
expected.each_with_index.reduce(0) do |memo, (expected_value, index)|
|
15
|
+
up = (observed[index] - expected_value) ** 2
|
16
|
+
memo += (up/expected_value.to_f)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
[statistic, observed.size - 1]
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.goodness_of_fit(alpha, expected, observed)
|
24
|
+
chi_score, df = *self.chi_statistic(expected, observed) # Splat array result
|
25
|
+
|
26
|
+
return if chi_score.nil? || df.nil?
|
27
|
+
|
28
|
+
probability = Distribution::ChiSquared.new(df).cumulative_function(chi_score)
|
29
|
+
p_value = 1 - probability
|
30
|
+
|
31
|
+
# According to https://stats.stackexchange.com/questions/29158/do-you-reject-the-null-hypothesis-when-p-alpha-or-p-leq-alpha
|
32
|
+
# We can assume that if p_value <= alpha, we can safely reject the null hypothesis, i.e. accept the alternative hypothesis.
|
33
|
+
{ probability: probability,
|
34
|
+
p_value: p_value,
|
35
|
+
alpha: alpha,
|
36
|
+
null: alpha < p_value,
|
37
|
+
alternative: p_value <= alpha,
|
38
|
+
confidence_level: 1 - alpha }
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
data/lib/statistics/statistical_test/t_test.rb
CHANGED
@@ -41,6 +41,28 @@ module Statistics
|
|
41
41
|
alternative: p_value <= alpha,
|
42
42
|
confidence_level: 1 - alpha }
|
43
43
|
end
|
44
|
+
|
45
|
+
def self.paired_test(alpha, tails, left_group, right_group)
|
46
|
+
# Handy snippet grabbed from https://stackoverflow.com/questions/2682411/ruby-sum-corresponding-members-of-two-or-more-arrays
|
47
|
+
differences = [left_group, right_group].transpose.map { |value| value.reduce(:-) }
|
48
|
+
|
49
|
+
degrees_of_freedom = differences.size - 1
|
50
|
+
down = differences.standard_deviation/Math.sqrt(differences.size)
|
51
|
+
|
52
|
+
t_score = (differences.mean - 0)/down.to_f
|
53
|
+
|
54
|
+
probability = Distribution::TStudent.new(degrees_of_freedom).cumulative_function(t_score)
|
55
|
+
|
56
|
+
p_value = 1 - probability
|
57
|
+
p_value *= 2 if tails == :two_tail
|
58
|
+
|
59
|
+
{ probability: probability,
|
60
|
+
p_value: p_value,
|
61
|
+
alpha: alpha,
|
62
|
+
null: alpha < p_value,
|
63
|
+
alternative: p_value <= alpha,
|
64
|
+
confidence_level: 1 - alpha }
|
65
|
+
end
|
44
66
|
end
|
45
67
|
end
|
46
68
|
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Statistics
|
2
|
+
module StatisticalTest
|
3
|
+
class WilcoxonRankSumTest
|
4
|
+
def rank(elements)
|
5
|
+
ranked_elements = {}
|
6
|
+
|
7
|
+
elements.sort.each_with_index do |element, index|
|
8
|
+
if ranked_elements.fetch(element, false)
|
9
|
+
# This allows us to resolve ties easily when performing the rank summation per group
|
10
|
+
ranked_elements[element][:counter] += 1
|
11
|
+
ranked_elements[element][:rank] += (index + 1)
|
12
|
+
else
|
13
|
+
ranked_elements[element] = { counter: 1, rank: (index + 1) }
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
# ranked_elements = { x => { counter: 1, rank: y } }
|
18
|
+
ranked_elements
|
19
|
+
end
|
20
|
+
|
21
|
+
# Steps to perform the calculation are based on http://www.mit.edu/~6.s085/notes/lecture5.pdf
|
22
|
+
def perform(alpha, tails, group_one, group_two)
|
23
|
+
# Size for each group
|
24
|
+
n1, n2 = group_one.size, group_two.size
|
25
|
+
|
26
|
+
# Rank all data
|
27
|
+
total_ranks = rank(group_one + group_two)
|
28
|
+
|
29
|
+
# sum rankings per group
|
30
|
+
r1 = ranked_sum_for(total_ranks, group_one)
|
31
|
+
r2 = ranked_sum_for(total_ranks, group_two)
|
32
|
+
|
33
|
+
# calculate U statistic
|
34
|
+
u1 = (n1 * (n1 + 1)/2.0) - r1
|
35
|
+
u2 = (n2 * (n2 + 1)/2.0 ) - r2
|
36
|
+
|
37
|
+
u_statistic = [u1.abs, u2.abs].min
|
38
|
+
|
39
|
+
median_u = (n1 * n2)/2.0
|
40
|
+
|
41
|
+
ties = total_ranks.values.select { |element| element[:counter] > 1 }
|
42
|
+
|
43
|
+
std_u = if ties.size > 0
|
44
|
+
corrected_sigma(ties, n1, n2)
|
45
|
+
else
|
46
|
+
Math.sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
|
47
|
+
end
|
48
|
+
|
49
|
+
z = (u_statistic - median_u)/std_u
|
50
|
+
|
51
|
+
# Most of the literature is not very specific about the normal distribution to be used.
|
52
|
+
# We ran multiple tests with a Normal(median_u, std_u) and Normal(0, 1) and we found
|
53
|
+
# the latter to be more aligned with the results.
|
54
|
+
probability = Distribution::StandardNormal.new.cumulative_function(z.abs)
|
55
|
+
p_value = 1 - probability
|
56
|
+
p_value *= 2 if tails == :two_tail
|
57
|
+
|
58
|
+
{ probability: probability,
|
59
|
+
u: u_statistic,
|
60
|
+
z: z,
|
61
|
+
p_value: p_value,
|
62
|
+
alpha: alpha,
|
63
|
+
null: alpha < p_value,
|
64
|
+
alternative: p_value <= alpha,
|
65
|
+
confidence_level: 1 - alpha }
|
66
|
+
end
|
67
|
+
|
68
|
+
# Formula extracted from http://www.statstutor.ac.uk/resources/uploaded/mannwhitney.pdf
|
69
|
+
private def corrected_sigma(ties, total_group_one, total_group_two)
|
70
|
+
n = total_group_one + total_group_two
|
71
|
+
|
72
|
+
rank_sum = ties.reduce(0) do |memo, t|
|
73
|
+
memo += ((t[:counter] ** 3) - t[:counter])/12.0
|
74
|
+
end
|
75
|
+
|
76
|
+
left = (total_group_one * total_group_two)/(n * (n - 1)).to_f
|
77
|
+
right = (((n ** 3) - n)/12.0) - rank_sum
|
78
|
+
|
79
|
+
Math.sqrt(left * right)
|
80
|
+
end
|
81
|
+
|
82
|
+
private def ranked_sum_for(total, group)
|
83
|
+
# sum rankings per group
|
84
|
+
group.reduce(0) do |memo, element|
|
85
|
+
rank_of_element = total[element][:rank] / total[element][:counter].to_f
|
86
|
+
memo += rank_of_element
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# Both tests are the same. To keep the selected name, we just alias the class
|
92
|
+
# with the implementation.
|
93
|
+
MannWhitneyU = WilcoxonRankSumTest
|
94
|
+
end
|
95
|
+
end
|
data/lib/statistics/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-statistics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- esteban zapata
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -130,8 +130,10 @@ files:
|
|
130
130
|
- lib/statistics/distribution/uniform.rb
|
131
131
|
- lib/statistics/distribution/weibull.rb
|
132
132
|
- lib/statistics/statistical_test.rb
|
133
|
+
- lib/statistics/statistical_test/chi_squared_test.rb
|
133
134
|
- lib/statistics/statistical_test/f_test.rb
|
134
135
|
- lib/statistics/statistical_test/t_test.rb
|
136
|
+
- lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb
|
135
137
|
- lib/statistics/version.rb
|
136
138
|
- ruby-statistics.gemspec
|
137
139
|
homepage: https://github.com/estebanz01/ruby-statistics
|
@@ -154,7 +156,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
156
|
version: '0'
|
155
157
|
requirements: []
|
156
158
|
rubyforge_project:
|
157
|
-
rubygems_version: 2.
|
159
|
+
rubygems_version: 2.5.1
|
158
160
|
signing_key:
|
159
161
|
specification_version: 4
|
160
162
|
summary: A ruby gem for som specific statistics. Inspired by the jStat js library.
|