ruby-statistics 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7e8b5d9c7b99a8d2bd81f0976ff883558053fbd1
4
- data.tar.gz: 886fa6a14b3620dcb81be856da2ba7d5539d0d2b
3
+ metadata.gz: 87876f9613fc4472a574b392096f81a18e804da0
4
+ data.tar.gz: 52316d0cc82b0a2e89b2409ff6f13593f5e4d73c
5
5
  SHA512:
6
- metadata.gz: 726ac86977b1354ddf1e82fd7d7794b1e14717f9132e2cc3544a28fcc130d76a8ac839f8a5e0280a98dc4e0a9620889f999a434c7349cfead7a8d5bf6c580f05
7
- data.tar.gz: 605336ac006a155924d1fea0252577c5cab88d8375be119856dc80d64ed3cd4780af8136f496ec211a671af1d48d76de55e77f0a8ff2315e0421e2806284fe14
6
+ metadata.gz: 4a113a9384ff5d4cce963217c97a194ecf142a389f2f5c1f712ec92cef2ef498b656809cdc528dbf5b178104d39058e630bfd69c401858c33ca1b056cfe545ae
7
+ data.tar.gz: 078fb2c0bae3eb54f357b3bb614692289dea75ef8c93e264b64ffcb29567a48a5f39f9bb7e01cc56dd0ea8d3f4dfc82563749575ef25653236c6e9268e85f6c7
data/.travis.yml CHANGED
@@ -3,4 +3,6 @@ language: ruby
3
3
  rvm:
4
4
  - 2.2
5
5
  - 2.3.1
6
+ - 2.4.0
7
+ - 2.5.0
6
8
  before_install: gem install bundler
data/README.md CHANGED
@@ -1,7 +1,15 @@
1
1
  # Ruby Statistics
2
2
 
3
+ ![](https://travis-ci.org/estebanz01/ruby-statistics.svg?branch=master)
4
+
3
5
  A basic ruby gem that implements some statistical methods, functions and concepts to be used in any ruby environment without depending on any mathematical software like `R`, `Matlab`, `Octave` or similar.
4
6
 
7
+ Unit tests run under the following Ruby versions:
8
+ * Ruby 2.2.
9
+ * Ruby 2.3.1.
10
+ * Ruby 2.4.0.
11
+ * Ruby 2.5.0.
12
+
5
13
  We got the inspiration from the folks at [JStat](https://github.com/jstat/jstat) and some interesting lectures about [Keystroke dynamics](http://www.biometric-solutions.com/keystroke-dynamics.html).
6
14
 
7
15
  Some logic and algorithms are extractions or adaptations from other authors, which are referenced in the comments.
@@ -43,6 +51,9 @@ poisson = Distribution::Poisson.new(l) # Using Distribution alias.
43
51
  normal = Statistics::Distribution::StandardNormal.new # Using all namespaces.
44
52
  ```
45
53
 
54
+ ## Documentation
55
+ You can find more detailed documentation of all available distributions, tests and functions in the [Documentation Index](https://github.com/estebanz01/ruby-statistics/wiki/Documentation-Index).
56
+
46
57
  ## Development
47
58
 
48
59
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/math.rb CHANGED
@@ -45,7 +45,7 @@ module Math
45
45
 
46
46
  def self.lower_incomplete_gamma_function(s, x)
47
47
  # The greater the iterations, the better. That's why we are iterating 10_000 * x times
48
- self.simpson_rule(0, x, (10_000 * x).round) do |t|
48
+ self.simpson_rule(0, x, (10_000 * x.round).round) do |t|
49
49
  (t ** (s - 1)) * Math.exp(-t)
50
50
  end
51
51
  end
@@ -28,6 +28,7 @@ module Statistics
28
28
  end
29
29
 
30
30
  def mean
31
+ return if alpha + beta == 0
31
32
  alpha / (alpha + beta)
32
33
  end
33
34
  end
@@ -25,6 +25,46 @@ module Statistics
25
25
 
26
26
  (left_up/(left_down) * right)
27
27
  end
28
+
29
+ ## Marsaglia polar method implementation for random gaussian (normal) number generation.
30
+ # References:
31
+ # https://en.wikipedia.org/wiki/Marsaglia_polar_method
32
+ # https://math.stackexchange.com/questions/69245/transform-uniform-distribution-to-normal-distribution-using-lindeberg-l%C3%A9vy-clt
33
+ # https://www.projectrhea.org/rhea/index.php/The_principles_for_how_to_generate_random_samples_from_a_Gaussian_distribution
34
+
35
+ def random(elements: 1, seed: Random.new_seed)
36
+ results = []
37
+
38
+ # Setup seed
39
+ srand(seed)
40
+
41
+ # Number of random numbers to be generated.
42
+ elements.times do
43
+ x, y, r = 0.0, 0.0, 0.0
44
+
45
+ # Find an (x, y) point inside the unit circle, i.e. with 0 < x^2 + y^2 < 1.
46
+ loop do
47
+ x = 2.0 * rand - 1.0
48
+ y = 2.0 * rand - 1.0
49
+
50
+ r = (x ** 2) + (y ** 2)
51
+
52
+ break unless r >= 1.0 || r == 0
53
+ end
54
+
55
+ # Project the random point to the required random distance
56
+ r = Math.sqrt(-2.0 * Math.log(r) / r)
57
+
58
+ # Transform the random distance to a gaussian value and append it to the results array
59
+ results << mean + x * r * standard_deviation
60
+ end
61
+
62
+ if elements == 1
63
+ results.first
64
+ else
65
+ results
66
+ end
67
+ end
28
68
  end
29
69
 
30
70
  class StandardNormal < Normal
@@ -45,6 +45,38 @@ module Statistics
45
45
  degrees_of_freedom/(degrees_of_freedom - 2.0)
46
46
  end
47
47
  end
48
+
49
+ # Quantile function extracted from http://www.jennessent.com/arcview/idf.htm
50
+ # TODO: Make it truly Student's T sample.
51
+ def random(elements: 1, seed: Random.new_seed)
52
+ warn 'This is an alpha version code. The generated sample is similar to an uniform distribution'
53
+ srand(seed)
54
+
55
+ v = degrees_of_freedom
56
+ results = []
57
+
58
+ # Because the Quantile function of a student-t distribution is between (-Infinity, y)
59
+ # we set up a small threshold in order to properly compute the integral
60
+ threshold = 10_000.0e-12
61
+
62
+ elements.times do
63
+ y = rand
64
+ results << Math.simpson_rule(threshold, y, 10_000) do |t|
65
+ up = Math.gamma((v+1)/2.0)
66
+ down = Math.sqrt(Math::PI * v) * Math.gamma(v/2.0)
67
+ right = (1 + ((y ** 2)/v.to_f)) ** ((v+1)/2.0)
68
+ left = up/down.to_f
69
+
70
+ left * right
71
+ end
72
+ end
73
+
74
+ if elements == 1
75
+ results.first
76
+ else
77
+ results
78
+ end
79
+ end
48
80
  end
49
81
  end
50
82
  end
@@ -41,6 +41,26 @@ module Statistics
41
41
 
42
42
  (scale ** 2) * (left - right)
43
43
  end
44
+
45
+ # Using the inverse CDF function, also called quantile, we can calculate
46
+ # a random sample that follows a weibull distribution.
47
+ #
48
+ # Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
49
+ def random(elements: 1, seed: Random.new_seed)
50
+ results = []
51
+
52
+ srand(seed)
53
+
54
+ elements.times do
55
+ results << ((-1/scale) * Math.log(1 - rand)) ** (1/shape)
56
+ end
57
+
58
+ if elements == 1
59
+ results.first
60
+ else
61
+ results
62
+ end
63
+ end
44
64
  end
45
65
  end
46
66
  end
@@ -0,0 +1,42 @@
1
+ module Statistics
2
+ module StatisticalTest
3
+ class ChiSquaredTest
4
+ def self.chi_statistic(expected, observed)
5
+ # If the expected is a number, we assume that all expected observations
6
+ # have the same probability to occur, hence we expect to see the same number
7
+ # of expected observations per each observed value
8
+ statistic = if expected.is_a? Numeric
9
+ observed.reduce(0) do |memo, observed_value|
10
+ up = (observed_value - expected) ** 2
11
+ memo += (up/expected.to_f)
12
+ end
13
+ else
14
+ expected.each_with_index.reduce(0) do |memo, (expected_value, index)|
15
+ up = (observed[index] - expected_value) ** 2
16
+ memo += (up/expected_value.to_f)
17
+ end
18
+ end
19
+
20
+ [statistic, observed.size - 1]
21
+ end
22
+
23
+ def self.goodness_of_fit(alpha, expected, observed)
24
+ chi_score, df = *self.chi_statistic(expected, observed) # Splat array result
25
+
26
+ return if chi_score.nil? || df.nil?
27
+
28
+ probability = Distribution::ChiSquared.new(df).cumulative_function(chi_score)
29
+ p_value = 1 - probability
30
+
31
+ # According to https://stats.stackexchange.com/questions/29158/do-you-reject-the-null-hypothesis-when-p-alpha-or-p-leq-alpha
32
+ # We can assume that if p_value <= alpha, we can safely reject the null hypothesis, i.e. accept the alternative hypothesis.
33
+ { probability: probability,
34
+ p_value: p_value,
35
+ alpha: alpha,
36
+ null: alpha < p_value,
37
+ alternative: p_value <= alpha,
38
+ confidence_level: 1 - alpha }
39
+ end
40
+ end
41
+ end
42
+ end
@@ -41,6 +41,28 @@ module Statistics
41
41
  alternative: p_value <= alpha,
42
42
  confidence_level: 1 - alpha }
43
43
  end
44
+
45
+ def self.paired_test(alpha, tails, left_group, right_group)
46
+ # Handy snippet grabbed from https://stackoverflow.com/questions/2682411/ruby-sum-corresponding-members-of-two-or-more-arrays
47
+ differences = [left_group, right_group].transpose.map { |value| value.reduce(:-) }
48
+
49
+ degrees_of_freedom = differences.size - 1
50
+ down = differences.standard_deviation/Math.sqrt(differences.size)
51
+
52
+ t_score = (differences.mean - 0)/down.to_f
53
+
54
+ probability = Distribution::TStudent.new(degrees_of_freedom).cumulative_function(t_score)
55
+
56
+ p_value = 1 - probability
57
+ p_value *= 2 if tails == :two_tail
58
+
59
+ { probability: probability,
60
+ p_value: p_value,
61
+ alpha: alpha,
62
+ null: alpha < p_value,
63
+ alternative: p_value <= alpha,
64
+ confidence_level: 1 - alpha }
65
+ end
44
66
  end
45
67
  end
46
68
  end
@@ -0,0 +1,95 @@
1
+ module Statistics
2
+ module StatisticalTest
3
+ class WilcoxonRankSumTest
4
+ def rank(elements)
5
+ ranked_elements = {}
6
+
7
+ elements.sort.each_with_index do |element, index|
8
+ if ranked_elements.fetch(element, false)
9
+ # This allows us to resolve ties easily when performing the rank summation per group
10
+ ranked_elements[element][:counter] += 1
11
+ ranked_elements[element][:rank] += (index + 1)
12
+ else
13
+ ranked_elements[element] = { counter: 1, rank: (index + 1) }
14
+ end
15
+ end
16
+
17
+ # ranked_elements = { x => { counter: 1, rank: y } }
18
+ ranked_elements
19
+ end
20
+
21
+ # Steps to perform the calculation are based on http://www.mit.edu/~6.s085/notes/lecture5.pdf
22
+ def perform(alpha, tails, group_one, group_two)
23
+ # Size for each group
24
+ n1, n2 = group_one.size, group_two.size
25
+
26
+ # Rank all data
27
+ total_ranks = rank(group_one + group_two)
28
+
29
+ # sum rankings per group
30
+ r1 = ranked_sum_for(total_ranks, group_one)
31
+ r2 = ranked_sum_for(total_ranks, group_two)
32
+
33
+ # calculate U statistic
34
+ u1 = (n1 * (n1 + 1)/2.0) - r1
35
+ u2 = (n2 * (n2 + 1)/2.0 ) - r2
36
+
37
+ u_statistic = [u1.abs, u2.abs].min
38
+
39
+ median_u = (n1 * n2)/2.0
40
+
41
+ ties = total_ranks.values.select { |element| element[:counter] > 1 }
42
+
43
+ std_u = if ties.size > 0
44
+ corrected_sigma(ties, n1, n2)
45
+ else
46
+ Math.sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
47
+ end
48
+
49
+ z = (u_statistic - median_u)/std_u
50
+
51
+ # Most of the literature is not very specific about the normal distribution to be used.
52
+ # We ran multiple tests with a Normal(median_u, std_u) and Normal(0, 1) and we found
53
+ # the latter to be more aligned with the results.
54
+ probability = Distribution::StandardNormal.new.cumulative_function(z.abs)
55
+ p_value = 1 - probability
56
+ p_value *= 2 if tails == :two_tail
57
+
58
+ { probability: probability,
59
+ u: u_statistic,
60
+ z: z,
61
+ p_value: p_value,
62
+ alpha: alpha,
63
+ null: alpha < p_value,
64
+ alternative: p_value <= alpha,
65
+ confidence_level: 1 - alpha }
66
+ end
67
+
68
+ # Formula extracted from http://www.statstutor.ac.uk/resources/uploaded/mannwhitney.pdf
69
+ private def corrected_sigma(ties, total_group_one, total_group_two)
70
+ n = total_group_one + total_group_two
71
+
72
+ rank_sum = ties.reduce(0) do |memo, t|
73
+ memo += ((t[:counter] ** 3) - t[:counter])/12.0
74
+ end
75
+
76
+ left = (total_group_one * total_group_two)/(n * (n - 1)).to_f
77
+ right = (((n ** 3) - n)/12.0) - rank_sum
78
+
79
+ Math.sqrt(left * right)
80
+ end
81
+
82
+ private def ranked_sum_for(total, group)
83
+ # sum rankings per group
84
+ group.reduce(0) do |memo, element|
85
+ rank_of_element = total[element][:rank] / total[element][:counter].to_f
86
+ memo += rank_of_element
87
+ end
88
+ end
89
+ end
90
+
91
+ # Both tests are the same. To keep the selected name, we just alias the class
92
+ # with the implementation.
93
+ MannWhitneyU = WilcoxonRankSumTest
94
+ end
95
+ end
@@ -1,3 +1,3 @@
1
1
  module Statistics
2
- VERSION = "1.0.2"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-statistics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - esteban zapata
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-10-17 00:00:00.000000000 Z
11
+ date: 2018-01-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -130,8 +130,10 @@ files:
130
130
  - lib/statistics/distribution/uniform.rb
131
131
  - lib/statistics/distribution/weibull.rb
132
132
  - lib/statistics/statistical_test.rb
133
+ - lib/statistics/statistical_test/chi_squared_test.rb
133
134
  - lib/statistics/statistical_test/f_test.rb
134
135
  - lib/statistics/statistical_test/t_test.rb
136
+ - lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb
135
137
  - lib/statistics/version.rb
136
138
  - ruby-statistics.gemspec
137
139
  homepage: https://github.com/estebanz01/ruby-statistics
@@ -154,7 +156,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
156
  version: '0'
155
157
  requirements: []
156
158
  rubyforge_project:
157
- rubygems_version: 2.6.14
159
+ rubygems_version: 2.5.1
158
160
  signing_key:
159
161
  specification_version: 4
160
162
  summary: A ruby gem for some specific statistics. Inspired by the jStat js library.