ruby-statistics 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7e8b5d9c7b99a8d2bd81f0976ff883558053fbd1
4
- data.tar.gz: 886fa6a14b3620dcb81be856da2ba7d5539d0d2b
3
+ metadata.gz: 87876f9613fc4472a574b392096f81a18e804da0
4
+ data.tar.gz: 52316d0cc82b0a2e89b2409ff6f13593f5e4d73c
5
5
  SHA512:
6
- metadata.gz: 726ac86977b1354ddf1e82fd7d7794b1e14717f9132e2cc3544a28fcc130d76a8ac839f8a5e0280a98dc4e0a9620889f999a434c7349cfead7a8d5bf6c580f05
7
- data.tar.gz: 605336ac006a155924d1fea0252577c5cab88d8375be119856dc80d64ed3cd4780af8136f496ec211a671af1d48d76de55e77f0a8ff2315e0421e2806284fe14
6
+ metadata.gz: 4a113a9384ff5d4cce963217c97a194ecf142a389f2f5c1f712ec92cef2ef498b656809cdc528dbf5b178104d39058e630bfd69c401858c33ca1b056cfe545ae
7
+ data.tar.gz: 078fb2c0bae3eb54f357b3bb614692289dea75ef8c93e264b64ffcb29567a48a5f39f9bb7e01cc56dd0ea8d3f4dfc82563749575ef25653236c6e9268e85f6c7
data/.travis.yml CHANGED
@@ -3,4 +3,6 @@ language: ruby
3
3
  rvm:
4
4
  - 2.2
5
5
  - 2.3.1
6
+ - 2.4.0
7
+ - 2.5.0
6
8
  before_install: gem install bundler
data/README.md CHANGED
@@ -1,7 +1,15 @@
1
1
  # Ruby Statistics
2
2
 
3
+ ![](https://travis-ci.org/estebanz01/ruby-statistics.svg?branch=master)
4
+
3
5
  A basic ruby gem that implements some statistical methods, functions and concepts to be used in any ruby environment without depending on any mathematical software like `R`, `Matlab`, `Octave` or similar.
4
6
 
7
+ Unit tests run under the following Ruby versions:
8
+ * Ruby 2.2.
9
+ * Ruby 2.3.1.
10
+ * Ruby 2.4.0.
11
+ * Ruby 2.5.0.
12
+
5
13
  We got the inspiration from the folks at [JStat](https://github.com/jstat/jstat) and some interesting lectures about [Keystroke dynamics](http://www.biometric-solutions.com/keystroke-dynamics.html).
6
14
 
7
15
  Some logic and algorithms are extractions or adaptations from other authors, which are referenced in the comments.
@@ -43,6 +51,9 @@ poisson = Distribution::Poisson.new(l) # Using Distribution alias.
43
51
  normal = Statistics::Distribution::StandardNormal.new # Using all namespaces.
44
52
  ```
45
53
 
54
+ ## Documentation
55
+ You can find more detailed documentation of all available distributions, tests and functions in the [Documentation Index](https://github.com/estebanz01/ruby-statistics/wiki/Documentation-Index).
56
+
46
57
  ## Development
47
58
 
48
59
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/math.rb CHANGED
@@ -45,7 +45,7 @@ module Math
45
45
 
46
46
  def self.lower_incomplete_gamma_function(s, x)
47
47
  # The greater the iterations, the better. That's why we are iterating 10_000 * x times
48
- self.simpson_rule(0, x, (10_000 * x).round) do |t|
48
+ self.simpson_rule(0, x, (10_000 * x.round).round) do |t|
49
49
  (t ** (s - 1)) * Math.exp(-t)
50
50
  end
51
51
  end
@@ -28,6 +28,7 @@ module Statistics
28
28
  end
29
29
 
30
30
  def mean
31
+ return if alpha + beta == 0
31
32
  alpha / (alpha + beta)
32
33
  end
33
34
  end
@@ -25,6 +25,46 @@ module Statistics
25
25
 
26
26
  (left_up/(left_down) * right)
27
27
  end
28
+
29
+ ## Marsaglia polar method implementation for random gaussian (normal) number generation.
30
+ # References:
31
+ # https://en.wikipedia.org/wiki/Marsaglia_polar_method
32
+ # https://math.stackexchange.com/questions/69245/transform-uniform-distribution-to-normal-distribution-using-lindeberg-l%C3%A9vy-clt
33
+ # https://www.projectrhea.org/rhea/index.php/The_principles_for_how_to_generate_random_samples_from_a_Gaussian_distribution
34
+
35
+ def random(elements: 1, seed: Random.new_seed)
36
+ results = []
37
+
38
+ # Setup seed
39
+ srand(seed)
40
+
41
+ # Number of random numbers to be generated.
42
+ elements.times do
43
+ x, y, r = 0.0, 0.0, 0.0
44
+
45
+ # Find a point (x, y) strictly inside the unit circle (0 < x^2 + y^2 < 1).
46
+ loop do
47
+ x = 2.0 * rand - 1.0
48
+ y = 2.0 * rand - 1.0
49
+
50
+ r = (x ** 2) + (y ** 2)
51
+
52
+ break unless r >= 1.0 || r == 0
53
+ end
54
+
55
+ # Project the random point to the required random distance
56
+ r = Math.sqrt(-2.0 * Math.log(r) / r)
57
+
58
+ # Transform the random distance to a gaussian value and append it to the results array
59
+ results << mean + x * r * standard_deviation
60
+ end
61
+
62
+ if elements == 1
63
+ results.first
64
+ else
65
+ results
66
+ end
67
+ end
28
68
  end
29
69
 
30
70
  class StandardNormal < Normal
@@ -45,6 +45,38 @@ module Statistics
45
45
  degrees_of_freedom/(degrees_of_freedom - 2.0)
46
46
  end
47
47
  end
48
+
49
+ # Quantile function extracted from http://www.jennessent.com/arcview/idf.htm
50
+ # TODO: Make it truly Student's T sample.
51
+ def random(elements: 1, seed: Random.new_seed)
52
+ warn 'This is an alpha version code. The generated sample is similar to an uniform distribution'
53
+ srand(seed)
54
+
55
+ v = degrees_of_freedom
56
+ results = []
57
+
58
+ # Because the Quantile function of a student-t distribution is between (-Infinity, y)
59
+ # we set up a small threshold in order to properly compute the integral
60
+ threshold = 10_000.0e-12
61
+
62
+ elements.times do
63
+ y = rand
64
+ results << Math.simpson_rule(threshold, y, 10_000) do |t|
65
+ up = Math.gamma((v+1)/2.0)
66
+ down = Math.sqrt(Math::PI * v) * Math.gamma(v/2.0)
67
+ right = (1 + ((y ** 2)/v.to_f)) ** ((v+1)/2.0)
68
+ left = up/down.to_f
69
+
70
+ left * right
71
+ end
72
+ end
73
+
74
+ if elements == 1
75
+ results.first
76
+ else
77
+ results
78
+ end
79
+ end
48
80
  end
49
81
  end
50
82
  end
@@ -41,6 +41,26 @@ module Statistics
41
41
 
42
42
  (scale ** 2) * (left - right)
43
43
  end
44
+
45
+ # Using the inverse CDF function, also called quantile, we can calculate
46
+ # a random sample that follows a weibull distribution.
47
+ #
48
+ # Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
49
+ def random(elements: 1, seed: Random.new_seed)
50
+ results = []
51
+
52
+ srand(seed)
53
+
54
+ elements.times do
55
+ results << ((-1/scale) * Math.log(1 - rand)) ** (1/shape)
56
+ end
57
+
58
+ if elements == 1
59
+ results.first
60
+ else
61
+ results
62
+ end
63
+ end
44
64
  end
45
65
  end
46
66
  end
@@ -0,0 +1,42 @@
1
+ module Statistics
2
+ module StatisticalTest
3
+ class ChiSquaredTest
4
+ def self.chi_statistic(expected, observed)
5
+ # If the expected is a number, we assume that all expected observations
6
+ # have the same probability to occur, hence we expect to see the same number
7
+ # of expected observations per each observed value
8
+ statistic = if expected.is_a? Numeric
9
+ observed.reduce(0) do |memo, observed_value|
10
+ up = (observed_value - expected) ** 2
11
+ memo += (up/expected.to_f)
12
+ end
13
+ else
14
+ expected.each_with_index.reduce(0) do |memo, (expected_value, index)|
15
+ up = (observed[index] - expected_value) ** 2
16
+ memo += (up/expected_value.to_f)
17
+ end
18
+ end
19
+
20
+ [statistic, observed.size - 1]
21
+ end
22
+
23
+ def self.goodness_of_fit(alpha, expected, observed)
24
+ chi_score, df = *self.chi_statistic(expected, observed) # Splat array result
25
+
26
+ return if chi_score.nil? || df.nil?
27
+
28
+ probability = Distribution::ChiSquared.new(df).cumulative_function(chi_score)
29
+ p_value = 1 - probability
30
+
31
+ # According to https://stats.stackexchange.com/questions/29158/do-you-reject-the-null-hypothesis-when-p-alpha-or-p-leq-alpha
32
+ # We can assume that if p_value <= alpha, we can safely reject the null hypothesis, ie. accept the alternative hypothesis.
33
+ { probability: probability,
34
+ p_value: p_value,
35
+ alpha: alpha,
36
+ null: alpha < p_value,
37
+ alternative: p_value <= alpha,
38
+ confidence_level: 1 - alpha }
39
+ end
40
+ end
41
+ end
42
+ end
@@ -41,6 +41,28 @@ module Statistics
41
41
  alternative: p_value <= alpha,
42
42
  confidence_level: 1 - alpha }
43
43
  end
44
+
45
+ def self.paired_test(alpha, tails, left_group, right_group)
46
+ # Handy snippet grabbed from https://stackoverflow.com/questions/2682411/ruby-sum-corresponding-members-of-two-or-more-arrays
47
+ differences = [left_group, right_group].transpose.map { |value| value.reduce(:-) }
48
+
49
+ degrees_of_freedom = differences.size - 1
50
+ down = differences.standard_deviation/Math.sqrt(differences.size)
51
+
52
+ t_score = (differences.mean - 0)/down.to_f
53
+
54
+ probability = Distribution::TStudent.new(degrees_of_freedom).cumulative_function(t_score)
55
+
56
+ p_value = 1 - probability
57
+ p_value *= 2 if tails == :two_tail
58
+
59
+ { probability: probability,
60
+ p_value: p_value,
61
+ alpha: alpha,
62
+ null: alpha < p_value,
63
+ alternative: p_value <= alpha,
64
+ confidence_level: 1 - alpha }
65
+ end
44
66
  end
45
67
  end
46
68
  end
@@ -0,0 +1,95 @@
1
+ module Statistics
2
+ module StatisticalTest
3
+ class WilcoxonRankSumTest
4
+ def rank(elements)
5
+ ranked_elements = {}
6
+
7
+ elements.sort.each_with_index do |element, index|
8
+ if ranked_elements.fetch(element, false)
9
+ # This allows us to resolve ties easily when performing the rank summation per group
10
+ ranked_elements[element][:counter] += 1
11
+ ranked_elements[element][:rank] += (index + 1)
12
+ else
13
+ ranked_elements[element] = { counter: 1, rank: (index + 1) }
14
+ end
15
+ end
16
+
17
+ # ranked_elements = { x => { counter: 1, rank: y } }
18
+ ranked_elements
19
+ end
20
+
21
+ # Steps to perform the calculation are based on http://www.mit.edu/~6.s085/notes/lecture5.pdf
22
+ def perform(alpha, tails, group_one, group_two)
23
+ # Size for each group
24
+ n1, n2 = group_one.size, group_two.size
25
+
26
+ # Rank all data
27
+ total_ranks = rank(group_one + group_two)
28
+
29
+ # sum rankings per group
30
+ r1 = ranked_sum_for(total_ranks, group_one)
31
+ r2 = ranked_sum_for(total_ranks, group_two)
32
+
33
+ # calculate U statistic
34
+ u1 = (n1 * (n1 + 1)/2.0) - r1
35
+ u2 = (n2 * (n2 + 1)/2.0 ) - r2
36
+
37
+ u_statistic = [u1.abs, u2.abs].min
38
+
39
+ median_u = (n1 * n2)/2.0
40
+
41
+ ties = total_ranks.values.select { |element| element[:counter] > 1 }
42
+
43
+ std_u = if ties.size > 0
44
+ corrected_sigma(ties, n1, n2)
45
+ else
46
+ Math.sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
47
+ end
48
+
49
+ z = (u_statistic - median_u)/std_u
50
+
51
+ # Most of the literature is not very specific about which normal distribution should be used.
52
+ # We ran multiple tests with a Normal(median_u, std_u) and Normal(0, 1) and we found
53
+ # the latter to be more aligned with the results.
54
+ probability = Distribution::StandardNormal.new.cumulative_function(z.abs)
55
+ p_value = 1 - probability
56
+ p_value *= 2 if tails == :two_tail
57
+
58
+ { probability: probability,
59
+ u: u_statistic,
60
+ z: z,
61
+ p_value: p_value,
62
+ alpha: alpha,
63
+ null: alpha < p_value,
64
+ alternative: p_value <= alpha,
65
+ confidence_level: 1 - alpha }
66
+ end
67
+
68
+ # Formula extracted from http://www.statstutor.ac.uk/resources/uploaded/mannwhitney.pdf
69
+ private def corrected_sigma(ties, total_group_one, total_group_two)
70
+ n = total_group_one + total_group_two
71
+
72
+ rank_sum = ties.reduce(0) do |memo, t|
73
+ memo += ((t[:counter] ** 3) - t[:counter])/12.0
74
+ end
75
+
76
+ left = (total_group_one * total_group_two)/(n * (n - 1)).to_f
77
+ right = (((n ** 3) - n)/12.0) - rank_sum
78
+
79
+ Math.sqrt(left * right)
80
+ end
81
+
82
+ private def ranked_sum_for(total, group)
83
+ # sum rankings per group
84
+ group.reduce(0) do |memo, element|
85
+ rank_of_element = total[element][:rank] / total[element][:counter].to_f
86
+ memo += rank_of_element
87
+ end
88
+ end
89
+ end
90
+
91
+ # Both tests are the same. To keep the selected name, we just alias the class
92
+ # with the implementation.
93
+ MannWhitneyU = WilcoxonRankSumTest
94
+ end
95
+ end
@@ -1,3 +1,3 @@
1
1
  module Statistics
2
- VERSION = "1.0.2"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-statistics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - esteban zapata
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-10-17 00:00:00.000000000 Z
11
+ date: 2018-01-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -130,8 +130,10 @@ files:
130
130
  - lib/statistics/distribution/uniform.rb
131
131
  - lib/statistics/distribution/weibull.rb
132
132
  - lib/statistics/statistical_test.rb
133
+ - lib/statistics/statistical_test/chi_squared_test.rb
133
134
  - lib/statistics/statistical_test/f_test.rb
134
135
  - lib/statistics/statistical_test/t_test.rb
136
+ - lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb
135
137
  - lib/statistics/version.rb
136
138
  - ruby-statistics.gemspec
137
139
  homepage: https://github.com/estebanz01/ruby-statistics
@@ -154,7 +156,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
156
  version: '0'
155
157
  requirements: []
156
158
  rubyforge_project:
157
- rubygems_version: 2.6.14
159
+ rubygems_version: 2.5.1
158
160
  signing_key:
159
161
  specification_version: 4
160
162
  summary: A ruby gem for som specific statistics. Inspired by the jStat js library.