RubyGems - ruby-statistics - Versions diffs - 1.0.2 → 2.0.0 - Mend

ruby-statistics 1.0.2 → 2.0.0

Files changed (13) hide show

checksums.yaml +4 -4
data/.travis.yml +2 -0
data/README.md +11 -0
data/lib/math.rb +1 -1
data/lib/statistics/distribution/beta.rb +1 -0
data/lib/statistics/distribution/normal.rb +40 -0
data/lib/statistics/distribution/t_student.rb +32 -0
data/lib/statistics/distribution/weibull.rb +20 -0
data/lib/statistics/statistical_test/chi_squared_test.rb +42 -0
data/lib/statistics/statistical_test/t_test.rb +22 -0
data/lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb +95 -0
data/lib/statistics/version.rb +1 -1
metadata +5 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7e8b5d9c7b99a8d2bd81f0976ff883558053fbd1
-  data.tar.gz: 886fa6a14b3620dcb81be856da2ba7d5539d0d2b
+  metadata.gz: 87876f9613fc4472a574b392096f81a18e804da0
+  data.tar.gz: 52316d0cc82b0a2e89b2409ff6f13593f5e4d73c
 SHA512:
-  metadata.gz: 726ac86977b1354ddf1e82fd7d7794b1e14717f9132e2cc3544a28fcc130d76a8ac839f8a5e0280a98dc4e0a9620889f999a434c7349cfead7a8d5bf6c580f05
-  data.tar.gz: 605336ac006a155924d1fea0252577c5cab88d8375be119856dc80d64ed3cd4780af8136f496ec211a671af1d48d76de55e77f0a8ff2315e0421e2806284fe14
+  metadata.gz: 4a113a9384ff5d4cce963217c97a194ecf142a389f2f5c1f712ec92cef2ef498b656809cdc528dbf5b178104d39058e630bfd69c401858c33ca1b056cfe545ae
+  data.tar.gz: 078fb2c0bae3eb54f357b3bb614692289dea75ef8c93e264b64ffcb29567a48a5f39f9bb7e01cc56dd0ea8d3f4dfc82563749575ef25653236c6e9268e85f6c7

data/.travis.yml CHANGED Viewed

@@ -3,4 +3,6 @@ language: ruby
 rvm:
   - 2.2
   - 2.3.1
+  - 2.4.0
+  - 2.5.0
 before_install: gem install bundler

data/README.md CHANGED Viewed

@@ -1,7 +1,15 @@
 # Ruby Statistics
+![](https://travis-ci.org/estebanz01/ruby-statistics.svg?branch=master)
 A basic ruby gem that implements some statistical methods, functions and concepts to be used in any ruby environment without depending on any mathematical software like `R`, `Matlab`, `Octave` or similar.
+Unit tests run under the following Ruby versions:
+* Ruby 2.2.
+* Ruby 2.3.1.
+* Ruby 2.4.0.
+* Ruby 2.5.0.
 We got the inspiration from the folks at [JStat](https://github.com/jstat/jstat) and some interesting lectures about [Keystroke dynamics](http://www.biometric-solutions.com/keystroke-dynamics.html).
 Some logic and algorithms are extractions or adaptations from other authors, which are referenced in the comments.
@@ -43,6 +51,9 @@ poisson = Distribution::Poisson.new(l) # Using Distribution alias.
 normal = Statistics::Distribution::StandardNormal.new # Using all namespaces.
 ```
+## Documentation
+You can find more detailed documentation of all available distributions, tests and functions in the [Documentation Index](https://github.com/estebanz01/ruby-statistics/wiki/Documentation-Index).
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

data/lib/math.rb CHANGED Viewed

@@ -45,7 +45,7 @@ module Math
   def self.lower_incomplete_gamma_function(s, x)
     # The greater the iterations, the better. That's why we are iterating 10_000 * x times
-    self.simpson_rule(0, x, (10_000 * x).round) do |t|
+    self.simpson_rule(0, x, (10_000 * x.round).round) do |t|
       (t ** (s - 1)) * Math.exp(-t)
     end
   end

data/lib/statistics/distribution/beta.rb CHANGED Viewed

@@ -28,6 +28,7 @@ module Statistics
       end
       def mean
+        return if alpha + beta == 0
         alpha / (alpha + beta)
       end
     end

data/lib/statistics/distribution/normal.rb CHANGED Viewed

@@ -25,6 +25,46 @@ module Statistics
         (left_up/(left_down) * right)
       end
+      ## Marsaglia polar method implementation for random gaussian (normal) number generation.
+      # References:
+      # https://en.wikipedia.org/wiki/Marsaglia_polar_method
+      # https://math.stackexchange.com/questions/69245/transform-uniform-distribution-to-normal-distribution-using-lindeberg-l%C3%A9vy-clt
+      # https://www.projectrhea.org/rhea/index.php/The_principles_for_how_to_generate_random_samples_from_a_Gaussian_distribution
+      def random(elements: 1, seed: Random.new_seed)
+        results = []
+        # Setup seed
+        srand(seed)
+        # Number of random numbers to be generated.
+        elements.times do
+          x, y, r = 0.0, 0.0, 0.0
+          # Find an (x, y) point strictly inside the unit circle (0 < x^2 + y^2 < 1).
+          loop do
+            x = 2.0 * rand - 1.0
+            y = 2.0 * rand - 1.0
+            r = (x ** 2) + (y ** 2)
+            break unless r >= 1.0 || r == 0
+          end
+          # Project the random point to the required random distance
+          r = Math.sqrt(-2.0 * Math.log(r) / r)
+          # Transform the random distance to a gaussian value and append it to the results array
+          results << mean + x * r * standard_deviation
+        end
+        if elements == 1
+          results.first
+        else
+          results
+        end
+      end
     end
     class StandardNormal < Normal

data/lib/statistics/distribution/t_student.rb CHANGED Viewed

@@ -45,6 +45,38 @@ module Statistics
           degrees_of_freedom/(degrees_of_freedom - 2.0)
         end
       end
+      # Quantile function extracted from http://www.jennessent.com/arcview/idf.htm
+      # TODO: Make it truly Student's T sample.
+      def random(elements: 1, seed: Random.new_seed)
+        warn 'This is an alpha version code. The generated sample is similar to an uniform distribution'
+        srand(seed)
+        v = degrees_of_freedom
+        results = []
+        # Because the quantile function of a Student's t distribution is between (-Infinity, y),
+        # we set up a small threshold in order to properly compute the integral.
+        threshold = 10_000.0e-12
+        elements.times do
+          y = rand
+          results << Math.simpson_rule(threshold, y, 10_000) do |t|
+            up = Math.gamma((v+1)/2.0)
+            down = Math.sqrt(Math::PI * v) * Math.gamma(v/2.0)
+            right = (1 + ((y ** 2)/v.to_f)) ** ((v+1)/2.0)
+            left = up/down.to_f
+            left * right
+          end
+        end
+        if elements == 1
+          results.first
+        else
+          results
+        end
+      end
     end
   end
 end

data/lib/statistics/distribution/weibull.rb CHANGED Viewed

@@ -41,6 +41,26 @@ module Statistics
         (scale ** 2) * (left - right)
       end
+      # Using the inverse CDF function, also called quantile, we can calculate
+      # a random sample that follows a weibull distribution.
+      #
+      # Formula extracted from http://www.stat.yale.edu/Courses/1997-98/101/chigf.htm
+      def random(elements: 1, seed: Random.new_seed)
+        results = []
+        srand(seed)
+        elements.times do
+          results << ((-1/scale) * Math.log(1 - rand)) ** (1/shape)
+        end
+        if elements == 1
+          results.first
+        else
+          results
+        end
+      end
     end
   end
 end

data/lib/statistics/statistical_test/chi_squared_test.rb ADDED Viewed

@@ -0,0 +1,42 @@
+module Statistics
+  module StatisticalTest
+    class ChiSquaredTest
+      def self.chi_statistic(expected, observed)
+        # If the expected value is a number, we assume that all expected observations
+        # have the same probability of occurring, hence we expect to see the same number
+        # of expected observations for each observed value.
+        statistic = if expected.is_a? Numeric
+                      observed.reduce(0) do |memo, observed_value|
+                        up = (observed_value - expected) ** 2
+                        memo += (up/expected.to_f)
+                      end
+                    else
+                      expected.each_with_index.reduce(0) do |memo, (expected_value, index)|
+                        up = (observed[index] - expected_value) ** 2
+                        memo += (up/expected_value.to_f)
+                      end
+                    end
+          [statistic, observed.size - 1]
+      end
+      def self.goodness_of_fit(alpha, expected, observed)
+        chi_score, df = *self.chi_statistic(expected, observed) # Splat array result
+        return if chi_score.nil? || df.nil?
+        probability = Distribution::ChiSquared.new(df).cumulative_function(chi_score)
+        p_value = 1 - probability
+        # According to https://stats.stackexchange.com/questions/29158/do-you-reject-the-null-hypothesis-when-p-alpha-or-p-leq-alpha
+        # We can assume that if p_value <= alpha, we can safely reject the null hypothesis, i.e., accept the alternative hypothesis.
+        { probability: probability,
+          p_value: p_value,
+          alpha: alpha,
+          null: alpha < p_value,
+          alternative: p_value <= alpha,
+          confidence_level: 1 - alpha }
+      end
+    end
+  end
+end

data/lib/statistics/statistical_test/t_test.rb CHANGED Viewed

@@ -41,6 +41,28 @@ module Statistics
           alternative: p_value <= alpha,
           confidence_level: 1 - alpha }
       end
+      def self.paired_test(alpha, tails, left_group, right_group)
+        # Handy snippet grabbed from https://stackoverflow.com/questions/2682411/ruby-sum-corresponding-members-of-two-or-more-arrays
+        differences = [left_group, right_group].transpose.map { |value| value.reduce(:-) }
+        degrees_of_freedom = differences.size - 1
+        down = differences.standard_deviation/Math.sqrt(differences.size)
+        t_score = (differences.mean - 0)/down.to_f
+        probability = Distribution::TStudent.new(degrees_of_freedom).cumulative_function(t_score)
+        p_value = 1 - probability
+        p_value *= 2 if tails == :two_tail
+        { probability: probability,
+          p_value: p_value,
+          alpha: alpha,
+          null: alpha < p_value,
+          alternative: p_value <= alpha,
+          confidence_level: 1 - alpha }
+      end
     end
   end
 end

data/lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb ADDED Viewed

@@ -0,0 +1,95 @@
+module Statistics
+  module StatisticalTest
+    class WilcoxonRankSumTest
+      def rank(elements)
+        ranked_elements = {}
+        elements.sort.each_with_index do |element, index|
+          if ranked_elements.fetch(element, false)
+            # This allows us to resolve ties easily when performing the rank summation per group
+            ranked_elements[element][:counter] += 1
+            ranked_elements[element][:rank] += (index + 1)
+          else
+            ranked_elements[element] = { counter: 1, rank: (index + 1) }
+          end
+        end
+        # ranked_elements = { x => { counter: 1, rank: y } }
+        ranked_elements
+      end
+      # Steps to perform the calculation are based on http://www.mit.edu/~6.s085/notes/lecture5.pdf
+      def perform(alpha, tails, group_one, group_two)
+        # Size for each group
+        n1, n2 = group_one.size, group_two.size
+        # Rank all data
+        total_ranks = rank(group_one + group_two)
+        # sum rankings per group
+        r1 = ranked_sum_for(total_ranks, group_one)
+        r2 = ranked_sum_for(total_ranks, group_two)
+        # calculate U statistic
+        u1 = (n1 * (n1 + 1)/2.0) - r1
+        u2 = (n2 * (n2 + 1)/2.0 ) - r2
+        u_statistic = [u1.abs, u2.abs].min
+        median_u = (n1 * n2)/2.0
+        ties = total_ranks.values.select { |element| element[:counter] > 1 }
+        std_u = if ties.size > 0
+                  corrected_sigma(ties, n1, n2)
+                else
+                  Math.sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
+                end
+        z = (u_statistic - median_u)/std_u
+        # Most of the literature is not very specific about the normal distribution to be used.
+        # We ran multiple tests with a Normal(median_u, std_u) and Normal(0, 1) and we found
+        # the latter to be more aligned with the results.
+        probability = Distribution::StandardNormal.new.cumulative_function(z.abs)
+        p_value = 1 - probability
+        p_value *= 2 if tails == :two_tail
+        { probability: probability,
+          u: u_statistic,
+          z: z,
+          p_value: p_value,
+          alpha: alpha,
+          null: alpha < p_value,
+          alternative: p_value <= alpha,
+          confidence_level: 1 - alpha }
+      end
+      # Formula extracted from http://www.statstutor.ac.uk/resources/uploaded/mannwhitney.pdf
+      private def corrected_sigma(ties, total_group_one, total_group_two)
+        n = total_group_one + total_group_two
+        rank_sum = ties.reduce(0) do |memo, t|
+                    memo += ((t[:counter] ** 3) - t[:counter])/12.0
+                  end
+        left = (total_group_one * total_group_two)/(n * (n - 1)).to_f
+        right = (((n ** 3) - n)/12.0) - rank_sum
+        Math.sqrt(left * right)
+      end
+      private def ranked_sum_for(total, group)
+        # sum rankings per group
+        group.reduce(0) do |memo, element|
+          rank_of_element = total[element][:rank] / total[element][:counter].to_f
+          memo += rank_of_element
+        end
+      end
+    end
+    # Both tests are the same. To keep the selected name, we just alias the class
+    # with the implementation.
+    MannWhitneyU = WilcoxonRankSumTest
+  end
+end

data/lib/statistics/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Statistics
-  VERSION = "1.0.2"
+  VERSION = "2.0.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-statistics
 version: !ruby/object:Gem::Version
-  version: 1.0.2
+  version: 2.0.0
 platform: ruby
 authors:
 - esteban zapata
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-10-17 00:00:00.000000000 Z
+date: 2018-01-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -130,8 +130,10 @@ files:
 - lib/statistics/distribution/uniform.rb
 - lib/statistics/distribution/weibull.rb
 - lib/statistics/statistical_test.rb
+- lib/statistics/statistical_test/chi_squared_test.rb
 - lib/statistics/statistical_test/f_test.rb
 - lib/statistics/statistical_test/t_test.rb
+- lib/statistics/statistical_test/wilcoxon_rank_sum_test.rb
 - lib/statistics/version.rb
 - ruby-statistics.gemspec
 homepage: https://github.com/estebanz01/ruby-statistics
@@ -154,7 +156,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.14
+rubygems_version: 2.5.1
 signing_key:
 specification_version: 4
 summary: A ruby gem for som specific statistics. Inspired by the jStat js library.