RubyGems - statsample - Versions diffs - 1.0.1 → 1.1.0 - Mend

statsample 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

data/.gemtest +0 -0
data/History.txt +14 -0
data/Manifest.txt +4 -0
data/README.txt +49 -13
data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
data/lib/statsample.rb +1 -23
data/lib/statsample/analysis.rb +49 -28
data/lib/statsample/analysis/suite.rb +18 -5
data/lib/statsample/analysis/suitereportbuilder.rb +9 -3
data/lib/statsample/anova.rb +2 -0
data/lib/statsample/anova/contrast.rb +79 -0
data/lib/statsample/anova/oneway.rb +39 -5
data/lib/statsample/converter/csv.rb +2 -5
data/lib/statsample/converters.rb +1 -0
data/lib/statsample/dataset.rb +31 -1
data/lib/statsample/graph/histogram.rb +1 -1
data/lib/statsample/regression/multiple/baseengine.rb +5 -0
data/lib/statsample/reliability/multiscaleanalysis.rb +3 -1
data/lib/statsample/reliability/scaleanalysis.rb +3 -4
data/lib/statsample/shorthand.rb +41 -1
data/lib/statsample/test.rb +10 -0
data/lib/statsample/test/kolmogorovsmirnov.rb +61 -0
data/lib/statsample/test/t.rb +92 -9
data/lib/statsample/vector.rb +143 -10
data/po/es/statsample.mo +0 -0
data/po/es/statsample.po +109 -110
data/po/statsample.pot +108 -60
data/test/helpers_tests.rb +1 -0
data/test/test_analysis.rb +70 -11
data/test/test_anova_contrast.rb +36 -0
data/test/test_anovawithvectors.rb +8 -0
data/test/test_dataset.rb +12 -0
data/test/test_factor_pa.rb +1 -3
data/test/test_test_kolmogorovsmirnov.rb +34 -0
data/test/test_test_t.rb +16 -0
data/test/test_vector.rb +40 -2
metadata +44 -118
data.tar.gz.sig +0 -0
metadata.gz.sig +0 -0

data/lib/statsample/anova/oneway.rb CHANGED Viewed

@@ -35,10 +35,10 @@ module Statsample
                       :name_denominator=>_("Explained variance"),
                       :name_numerator=>_("Unexplained variance")}
         @opts=opts_default.merge(opts)
-        opts_default.keys.each {|k|
-          send("#{k}=", @opts[k])
+        opts.keys.each {|k|
+          send("#{k}=", @opts[k]) if self.respond_to? "#{k}="
         }
-        @f_object=Statsample::Test::F.new(@ms_num,@ms_den,@df_num,@df_den)
+        @f_object=Statsample::Test::F.new(@ms_num, @ms_den, @df_num,@df_den)
       end
       # F value
       def f
@@ -62,6 +62,7 @@ module Statsample
       end
     end
     # One Way Anova with vectors
     # Example:
     #   v1=[2,3,4,5,6].to_scale
@@ -80,6 +81,11 @@ module Statsample
       attr_accessor :summary_levene
       # Show on summary descriptives for vectors
       attr_accessor :summary_descriptives
+      # Show on summary of contrasts
+      attr_accessor :summary_contrasts
+      # Array with stored contrasts
+      attr_reader :contrasts
       def initialize(*args)
         if args[0].is_a? Array
           @vectors=args.shift
@@ -92,11 +98,31 @@ module Statsample
                       :name_numerator=>_("Between Groups"),
                       :name_denominator=>_("Within Groups"),
                       :summary_descriptives=>false,
-                      :summary_levene=>true}
+                      :summary_levene=>true,
+                      :summary_contrasts=>true
+        }
         @opts=opts_default.merge(opts).merge(:ss_num=>ssbg, :ss_den=>sswg, :df_num=>df_bg, :df_den=>df_wg)
+        @contrasts=[]
         super(@opts)
       end
-      alias  :sst :ss_total
+      alias :sst :ss_total
+      alias :msb :ms_num
+      alias :msw :ms_den
+      # Generates and store a contrast.
+      # Options should be provided as a hash
+      # [:c]=>contrast vector
+      # [:c1 - :c2]=>index for automatic construction of contrast
+      # [:name]=>contrast name
+      def contrast(opts=Hash.new)
+        name=opts[:name] || _("Contrast for %s") % @name
+        opts=opts.merge({:vectors=>@vectors, :name=>name})
+        c=Statsample::Anova::Contrast.new(opts)
+        @contrasts.push(c)
+        c
+      end
       def levene
         Statsample::Test.levene(@vectors, :name=>_("Test of Homogeneity of variances (Levene)"))
       end
@@ -140,10 +166,18 @@ module Statsample
               end
             end
           end
           if summary_levene
             s.parse_element(levene)
           end
           report_building_table(s)
+          if summary_contrasts and @contrasts.size>0
+            @contrasts.each do |c|
+              s.parse_element(c)
+            end
+          end
         end
       end
     end

data/lib/statsample/converter/csv.rb CHANGED Viewed

@@ -12,16 +12,13 @@ module Statsample
       #
       # USE:
       #     ds=Statsample::CSV.read("test_csv.csv")
-      def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
+      def read(filename, empty=[''],ignore_lines=0,csv_opts=Hash.new)
         first_row=true
         fields=[]
         fields_data={}
         ds=nil
         line_number=0
-        opts={}
-        opts[:col_sep]=fs unless fs.nil?
-        opts[:row_sep]=rs unless rs.nil?
-        csv=CSV_klass.open(filename,'r',opts)
+        csv=CSV_klass.open(filename,'rb', csv_opts)
         csv.each do |row|
           line_number+=1
           if(line_number<=ignore_lines)

data/lib/statsample/converters.rb CHANGED Viewed

@@ -184,6 +184,7 @@ module Statsample
       #
       def read(filename, opts=Hash.new)
         require 'spreadsheet'
+        raise "options should be Hash" unless opts.is_a? Hash
         opts_default={
           :worksheet_id=>0,
           :ignore_lines=>0,

data/lib/statsample/dataset.rb CHANGED Viewed

@@ -119,6 +119,33 @@ module Statsample
     def has_missing_data?
       @vectors.any? {|k,v| v.has_missing_data?}
     end
+    # Return a nested hash using fields as keys and
+    # an array constructed of hashes with other values.
+    # If block provided, is used to provide the
+    # values, with parameters +row+ of dataset,
+    # +current+ last hash on hierarchy and
+    # +name+ of the key to include
+    def nest(*tree_keys,&block)
+      tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
+      out=Hash.new
+      each do |row|
+        current=out
+        # Create tree
+        tree_keys[0,tree_keys.size-1].each do |f|
+          root=row[f]
+          current[root]||=Hash.new
+          current=current[root]
+        end
+        name=row[tree_keys.last]
+        if !block
+          current[name]||=Array.new
+          current[name].push(row.delete_if{|key,value| tree_keys.include? key})
+        else
+          current[name]=block.call(row, current,name)
+        end
+      end
+      out
+    end
     # Creates a new dataset. A dataset is a set of ordered named vectors
     # of the same size.
     #
@@ -170,6 +197,7 @@ module Statsample
       else
         ds=dup fields_to_include
       end
+      ds.name= self.name
       ds
     end
     #
@@ -192,7 +220,9 @@ module Statsample
         vectors[f]=@vectors[f].dup
         fields.push(f)
       }
-      Dataset.new(vectors,fields)
+      ds=Dataset.new(vectors,fields)
+      ds.name= self.name
+      ds
     end

data/lib/statsample/graph/histogram.rb CHANGED Viewed

@@ -44,7 +44,7 @@ module Statsample
       # Add a line showing normal distribution
       attr_accessor :line_normal_distribution
       # data could be a vector or a histogram
-      def initialize(data,opts=Hash.new)
+      def initialize(data, opts=Hash.new)
         prov_name=(data.respond_to?(:name)) ? data.name : ""
         opts_default={
           :name=>_("Histograma (%s)") % prov_name,

data/lib/statsample/regression/multiple/baseengine.rb CHANGED Viewed

@@ -79,6 +79,11 @@ module Statsample
         def sst
           raise "You should implement this"
         end
+        # R^2 Adjusted.
+        # Estimate Population R^2 usign Ezequiel formula.
+        # Always lower than sample R^2
+        # == Reference:
+        # * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
         def r2_adjusted
           r2-((1-r2)*@predictors_n).quo(df_e)
         end

data/lib/statsample/reliability/multiscaleanalysis.rb CHANGED Viewed

@@ -110,6 +110,8 @@ module Statsample
         opts||=pca_options
         Statsample::Factor::PCA.new(correlation_matrix, opts)
       end
+      # Retrieve Velicer's MAP
+      # using all scales.
       def map(opts=nil)
         opts||=map_options
         Statsample::Factor::MAP.new(correlation_matrix, opts)
@@ -141,7 +143,7 @@ module Statsample
       def report_building(b) # :nodoc:
         b.section(:name=>name) do |s|
           s.section(:name=>_("Reliability analysis of scales")) do |s2|
-            @scales.each_pair do |k,scale|
+            @scales.each_pair do |k, scale|
               s2.parse_element(scale)
             end
           end

data/lib/statsample/reliability/scaleanalysis.rb CHANGED Viewed

@@ -22,11 +22,10 @@ module Statsample
         @ods=ds
         @ds=ds.dup_only_valid(ds.fields - @dumped)
+        @ds.name=ds.name
         @k=@ds.fields.size
         @total=@ds.vector_sum
         @o_total=@dumped.size > 0 ? @ods.vector_sum : nil
         @vector_mean=@ds.vector_mean
@@ -165,7 +164,7 @@ module Statsample
                   t.row(["#{@ods[f].name}(#{f})", "%0.5f" % @ods[f].mean])
                 end
               end
-              s.parse_element(Statsample::Graph::Histogram.new(@o_total)) if @summary_histogram
+              s.parse_element(Statsample::Graph::Histogram.new(@o_total, :name=>"Histogram (complete data) for %s" % @name)) if @summary_histogram
             end
           end
@@ -229,7 +228,7 @@ module Statsample
               t.row row
             end # end each
           end # table
-          s.parse_element(Statsample::Graph::Histogram.new(@total)) if @summary_histogram
+          s.parse_element(Statsample::Graph::Histogram.new(@total, :name=>"Histogram (valid data) for %s" % @name)) if @summary_histogram
         end # section
       end # def
     end # class

data/lib/statsample/shorthand.rb CHANGED Viewed

@@ -1,9 +1,36 @@
+class Object
+  # Shorthand for Statsample::Analysis.store(*args,&block)
+  def ss_analysis(*args,&block)
+    Statsample::Analysis.store(*args,&block)
+  end
+end
 module Statsample
   # Module which provide shorthands for many methods.
   module Shorthand
     ###
     # :section: R like methods
     ###
+    def read_with_cache(klass, filename,opts=Hash.new, cache=true)
+      file_ds=filename+".ds"
+      if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
+        ds=Statsample.load(file_ds)
+      else
+        ds=klass.read(filename)
+        ds.save(file_ds) if cache
+      end
+      ds
+    end
+    # Import an Excel file. Cache result by default
+    def read_excel(filename, opts=Hash.new, cache=true)
+      read_with_cache(Statsample::Excel, filename, opts, cache)
+    end
+    # Import an CSV file. Cache result by default
+    def read_csv
+      read_with_cache(Statsample::CSV, filename, opts, cache)
+    end
     # Retrieve names (fields) from dataset
     def names(ds)
@@ -19,7 +46,7 @@ module Statsample
     end
     # Create a Statsample::Vector
     # Analog to R's c
-    def c(*args)
+    def vector(*args)
       Statsample::Vector[*args]
     end
     # Random generation for the normal distribution
@@ -77,5 +104,18 @@ module Statsample
     def dominance_analysis_bootstrap(*args)
       Statsample::DominanceAnalysis::Bootstrap.new(*args)
     end
+    def scale_analysis(*args)
+      Statsample::Reliability::ScaleAnalysis.new(*args)
+    end
+    def skill_scale_analysis(*args)
+      Statsample::Reliability::SkillScaleAnalysis.new(*args)
+    end
+    def multiscale_analysis(*args,&block)
+      Statsample::Reliability::MultiScaleAnalysis.new(*args,&block)
+    end
+    def test_u(*args)
+      Statsample::Test::UMannWhitney.new(*args)
+    end
+    module_function :test_u, :rnorm
   end
 end

data/lib/statsample/test.rb CHANGED Viewed

@@ -8,6 +8,7 @@ module Statsample
     autoload(:F, 'statsample/test/f')
     autoload(:ChiSquare, 'statsample/test/chisquare')
     autoload(:BartlettSphericity, 'statsample/test/bartlettsphericity')
+    autoload(:KolmogorovSmirnov, 'statsample/test/kolmogorovsmirnov')
     # Returns probability of getting a value lower or higher
     # than sample, using cdf and number of tails.
@@ -29,6 +30,15 @@ module Statsample
           2*cdf
       end
     end
+    # Get critical t to create confidence interval
+    def t_critical(confidence_level, df)
+      -Distribution::T.p_value((1-confidence_level) / 2.0, df)
+    end
+    # Get critical z to create confidence interval
+    def z_critical(confidence_level)
+      -Distribution::Z.p_value((1-confidence_level) / 2.0)
+    end
     extend self
     # Calculate chi square for two Matrix
     class << self

data/lib/statsample/test/kolmogorovsmirnov.rb ADDED Viewed

@@ -0,0 +1,61 @@
+module Statsample
+  module Test
+    # == Kolmogorov-Smirnov's test of equality of distributions.
+    class KolmogorovSmirnov
+      attr_reader :d
+      include Statsample::Test
+      include Summarizable
+      # Creates a new Kolmogorov-Smirnov test
+      # d1 should have each method
+      # d2 could be a Distribution class, with a cdf method,
+      # a vector or a lambda
+      def initialize(d1,d2)
+        raise "First argument should have each method" unless d1.respond_to? :each
+        @d1=make_cdf(d1)
+        if d2.respond_to? :cdf or d2.is_a? Proc
+          @d2=d2
+        elsif d2.respond_to? :each
+          @d2=make_cdf(d2)
+        else
+           raise "Second argument should respond to cdf or each"
+         end
+         calculate
+       end
+       def calculate
+         d=0
+        @d1.each {|x|
+        v1=@d1.cdf(x);
+        v2=@d2.is_a?(Proc) ? @d2.call(x) : @d2.cdf(x)
+        d=(v1-v2).to_f.abs if (v1-v2).abs>d
+        }
+        @d=d
+       end
+      # Make a wrapper EmpiricDistribution to any method which implements
+      # each
+      # On Statsample::Vector, only uses #valid_data
+      def make_cdf(v)
+        v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v)
+      end
+      class EmpiricDistribution
+        def initialize(data)
+          @min=data.min
+          @max=data.max
+          @data=data.sort
+          @n=data.size
+        end
+        def each
+          @data.each {|x|
+            yield x
+          }
+        end
+        def cdf(x)
+          return 0 if x<@min
+          return 1 if x>=@max
+          v=@data.index{|v1| v1>=x}
+          v.nil? ? 0 : (v+(x==@data[v]? 1 : 0)).quo(@n)
+        end
+      end
+    end
+  end
+end

data/lib/statsample/test/t.rb CHANGED Viewed

@@ -1,6 +1,12 @@
 module Statsample
   module Test
-    module T
+    # A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported
+    class T
       class << self
         include Math
         # Test the null hypothesis that the population mean is equal to a specified value u, one uses the statistic.
@@ -10,7 +16,7 @@ module Statsample
         # * <tt>s</tt>: sample/differences standard deviation
         # * <tt>n</tt>: sample size
         def one_sample(x,u,s,n)
-          (x-u).quo(s.quo(Math::sqrt(n)))
+          (x-u)*Math::sqrt(n).quo(s)
         end
         # Test if means of two samples are different.
         # * <tt>x1</tt>: sample 1 mean
@@ -50,6 +56,73 @@ module Statsample
           num.quo(den)
         end
       end
+      include Statsample::Test
+      include Summarizable
+      attr_reader :standard_error, :estimate, :df
+      # Tails for p-value (:both, :left or :right). Default :both
+      attr_accessor :tails
+      # Name of F analysis
+      attr_accessor :name
+      attr_accessor :confidence_level
+      attr_reader :t
+      attr_accessor :estimate_name, :standard_error_name
+      # Creates a generic t test. Use OneSample or TwoSamplesIndependent
+      # classes for better summaries.
+      # Parameters:
+      # * estimate: estimate
+      # * standard_error: standard error of estimate
+      # * df: degrees of freedom
+      def initialize(estimate, standard_error, df, opts=Hash.new)
+        @estimate=estimate
+        @standard_error=standard_error
+        @df=df
+        @t = @estimate / @standard_error.to_f
+        opts_default={  :tails=>:both,
+                        :name=>_("T Test"),
+                        :estimate_name=>_("Estimate"),
+                        :standard_error_name=>_("Std.Err.of Estimate"),
+        :confidence_level=>0.95}
+        @opts = opts_default.merge(opts)
+        @opts.keys.each {|k|
+          send("#{k}=", @opts[k]) if respond_to? k
+        }
+      end
+      alias :se :standard_error
+      def to_f
+        t
+      end
+      # probability
+      def probability
+        p_using_cdf(Distribution::T.cdf(t, df),  tails)
+      end
+      def confidence_interval(cl=nil)
+          cl||=confidence_level
+          t_crit = t_critical(cl, df)
+          [estimate - se*t_crit, estimate + se*t_crit]
+      end
+      alias :ci :confidence_interval
+      def report_building(builder) #:nodoc:
+        builder.section(:name=>@name) do |section|
+          section.text _("%s: %0.4f | %s: %0.4f") % [@estimate_name, @estimate, @standard_error_name, se]
+          report_building_t(section)
+        end
+      end
+      def report_building_t(s)
+        df_f=@df.is_a?(Integer) ? "%d" : "%0.4f"
+        s.text _("t(%d) = %0.4f, p=%0.4f (%s tails)") % [df, t,probability, tails]
+        s.text _("CI(%d%%): %0.4f - %0.4f") % [confidence_level*100, ci[0],ci[1]]
+      end
       # One Sample t-test
       # == Usage
       #   a=1000.times.map {rand(100)}.to_scale
@@ -91,22 +164,32 @@ module Statsample
           @name=@opts[:name]
           @u=@opts[:u]
           @tails=@opts[:tails]
+          @confidence_level=@opts[:confidence_level] || 0.95
           @df= @vector.n_valid-1
           @t=nil
         end
+        def t_object
+          T.new(@vector.mean-u, @vector.se, @vector.n_valid-1, opts)
+        end
         def t
-          T.one_sample(@vector.mean, @u, @vector.sd, @vector.n_valid)
+          t_object.t
         end
         def probability
-          p_using_cdf(Distribution::T.cdf(t, @df), tails)
+          t_object.probability
+        end
+        def standard_error
+          t_object.standard_error
+        end
+        alias :se :standard_error
+        def confidence_interval(cl=nil)
+          t_object.confidence_interval(cl)
         end
+        alias :ci :confidence_interval
         def report_building(b) # :nodoc:
           b.section(:name=>@name) {|s|
-            s.text "Sample mean: #{@vector.mean}"
-            s.text "Population mean:#{u}"
-            s.text "Tails: #{tails}"
-            s.text sprintf("t = %0.4f, p=%0.4f, d.f=%d", t, probability, df)
+            s.text _("Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f") % [@vector.mean, @vector.sd, se]
+            s.text _("Population mean: %0.4f") % u if u!=0
+            t_object.report_building_t(s)
           }
         end
       end