statsample 0.3.4 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/History.txt +8 -0
  2. data/Manifest.txt +20 -2
  3. data/data/crime.txt +47 -0
  4. data/data/test_binomial.csv +201 -0
  5. data/demo/distribution_t.rb +2 -2
  6. data/demo/regression.rb +2 -1
  7. data/lib/distribution.rb +8 -0
  8. data/lib/distribution/chisquare.rb +24 -0
  9. data/lib/distribution/f.rb +25 -0
  10. data/lib/distribution/normal.rb +25 -0
  11. data/lib/distribution/t.rb +22 -0
  12. data/lib/matrix_extension.rb +78 -0
  13. data/lib/statistics2.rb +531 -0
  14. data/lib/statsample.rb +12 -9
  15. data/lib/statsample/anova.rb +1 -5
  16. data/lib/statsample/bivariate.rb +24 -20
  17. data/lib/statsample/combination.rb +14 -4
  18. data/lib/statsample/converters.rb +17 -1
  19. data/lib/statsample/dataset.rb +66 -10
  20. data/lib/statsample/dominanceanalysis/bootstrap.rb +1 -3
  21. data/lib/statsample/graph/gdchart.rb +2 -3
  22. data/lib/statsample/graph/svggraph.rb +8 -4
  23. data/lib/statsample/mle.rb +137 -0
  24. data/lib/statsample/mle/logit.rb +95 -0
  25. data/lib/statsample/mle/normal.rb +83 -0
  26. data/lib/statsample/mle/probit.rb +93 -0
  27. data/lib/statsample/regression.rb +3 -1
  28. data/lib/statsample/regression/binomial.rb +65 -0
  29. data/lib/statsample/regression/binomial/logit.rb +13 -0
  30. data/lib/statsample/regression/binomial/probit.rb +13 -0
  31. data/lib/statsample/regression/multiple.rb +61 -58
  32. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  33. data/lib/statsample/srs.rb +5 -5
  34. data/lib/statsample/vector.rb +129 -59
  35. data/test/test_anova.rb +0 -5
  36. data/test/test_dataset.rb +13 -1
  37. data/test/test_distribution.rb +57 -0
  38. data/test/test_gsl.rb +22 -0
  39. data/test/test_logit.rb +22 -0
  40. data/test/test_mle.rb +140 -0
  41. data/test/test_r.rb +9 -0
  42. data/test/test_regression.rb +12 -4
  43. data/test/test_srs.rb +0 -4
  44. data/test/test_stata.rb +11 -0
  45. data/test/test_statistics.rb +0 -15
  46. data/test/test_vector.rb +11 -0
  47. metadata +28 -4
  48. data/lib/statsample/chidistribution.rb +0 -39
  49. data/lib/statsample/regression/logit.rb +0 -35
data/lib/statsample.rb CHANGED
@@ -21,9 +21,8 @@
21
21
  $:.unshift(File.dirname(__FILE__))
22
22
  $:.unshift(File.expand_path(File.dirname(__FILE__)+"/../ext"))
23
23
 
24
- require 'delegate'
25
24
  require 'matrix'
26
-
25
+ require 'distribution'
27
26
 
28
27
  class Numeric
29
28
  def square ; self * self ; end
@@ -44,6 +43,7 @@ def create_test(*args,&proc)
44
43
  fields=args
45
44
  [description, fields, Proc.new]
46
45
  end
46
+ #--
47
47
  # Test extensions
48
48
  begin
49
49
  require 'gettext'
@@ -59,7 +59,7 @@ begin
59
59
  end
60
60
  end
61
61
  end
62
-
62
+
63
63
  begin
64
64
  require 'rbgsl'
65
65
  HAS_GSL=true
@@ -72,7 +72,7 @@ end
72
72
  rescue LoadError
73
73
  HAS_ALGIB=false
74
74
  end
75
- #
75
+ # ++
76
76
  # Modules for statistical analysis
77
77
  # See first:
78
78
  # * Converter : several modules to import and export data
@@ -80,12 +80,14 @@ end
80
80
  # * Dataset: An union of vectors.
81
81
  #
82
82
  module Statsample
83
- VERSION = '0.3.4'
83
+
84
+ VERSION = '0.4.0'
84
85
  SPLIT_TOKEN = ","
85
86
  autoload(:Database, 'statsample/converters')
86
87
  autoload(:Anova, 'statsample/anova')
87
88
  autoload(:Combination, 'statsample/combination')
88
89
  autoload(:CSV, 'statsample/converters')
90
+ autoload(:PlainText, 'statsample/converters')
89
91
  autoload(:Excel, 'statsample/converters')
90
92
  autoload(:GGobi, 'statsample/converters')
91
93
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
@@ -99,8 +101,7 @@ module Statsample
99
101
  autoload(:Multivariate, 'statsample/multivariate')
100
102
  autoload(:Multiset, 'statsample/multiset')
101
103
  autoload(:StratifiedSample, 'statsample/multiset')
102
-
103
-
104
+ autoload(:MLE, 'statsample/mle')
104
105
  autoload(:Regression, 'statsample/regression')
105
106
  autoload(:Test, 'statsample/test')
106
107
  def self.load(filename)
@@ -240,16 +241,18 @@ module Statsample
240
241
  end
241
242
  end
242
243
 
243
- module STATSAMPLE__
244
+ module STATSAMPLE__ #:nodoc:
244
245
  end
245
246
 
246
247
  end
247
248
 
248
249
 
250
+
251
+ #--
249
252
  begin
250
253
  require 'statsamplert'
251
254
  rescue LoadError
252
- module Statsample
255
+ module Statsample
253
256
  OPTIMIZED=false
254
257
  end
255
258
  end
@@ -63,11 +63,7 @@ module Statsample
63
63
  end
64
64
  # Significance of Fisher
65
65
  def significance
66
- if HAS_GSL
67
- GSL::Cdf.fdist_Q(f,df_bg,df_wg)
68
- else
69
- raise "Need Ruby/GSL"
70
- end
66
+ 1.0-Distribution::F.cdf(f,df_bg,df_wg)
71
67
  end
72
68
  end
73
69
  end
@@ -20,8 +20,8 @@ module Statsample
20
20
  }
21
21
  sum
22
22
  end
23
- # Covariance. The denominator is n-1
24
- def covariance_slow(v1a,v2a)
23
+
24
+ def covariance_slow(v1a,v2a) # :nodoc:
25
25
  t=0
26
26
  m1=v1a.mean
27
27
  m2=v1a.mean
@@ -40,8 +40,8 @@ module Statsample
40
40
  pearson_slow(v1a,v2a)
41
41
  end
42
42
  end
43
- #:nodoc:
44
- def pearson_slow(v1a,v2a)
43
+ def pearson_slow(v1a,v2a) # :nodoc:
44
+
45
45
  v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
46
46
  t=0
47
47
  siz=v1s.size
@@ -62,7 +62,7 @@ module Statsample
62
62
  # Retrieves the value for t test for a pearson correlation
63
63
  # giving r and vector size
64
64
  def t_r(r,size)
65
- r*Math::sqrt(((size)-2).to_f / (1 - r**2))
65
+ r * Math::sqrt(((size)-2).to_f / (1 - r**2))
66
66
  end
67
67
  # Retrieves the probability value (a la SPSS)
68
68
  # for a given t, size and number of tails.
@@ -71,7 +71,7 @@ module Statsample
71
71
  # * :right, :positive or 1 : for r > 0
72
72
  # * :left, :negative : for r < 0
73
73
 
74
- def prop_pearson(t,size, tails=:both)
74
+ def prop_pearson(t, size, tails=:both)
75
75
  tails=:both if tails==2
76
76
  tails=:right if tails==1 or tails==:positive
77
77
  tails=:left if tails==:negative
@@ -82,16 +82,12 @@ module Statsample
82
82
  else
83
83
  1
84
84
  end
85
- if HAS_GSL
86
- t=-t if t>0 and (tails==:both)
87
- cdf=GSL::Cdf::tdist_P(t,size-2)
88
- if(tails==:right)
89
- 1.0-(cdf*n_tails)
90
- else
91
- cdf*n_tails
92
- end
85
+ t=-t if t>0 and (tails==:both)
86
+ cdf=Distribution::T.cdf(t, size-2)
87
+ if(tails==:right)
88
+ 1.0-(cdf*n_tails)
93
89
  else
94
- raise "Needs ruby-gsl"
90
+ cdf*n_tails
95
91
  end
96
92
  end
97
93
  # Returns residual score after delete variance
@@ -110,6 +106,8 @@ module Statsample
110
106
  }
111
107
  nv.to_vector(:scale)
112
108
  end
109
 + # Correlation between v1 and v2, controlling the effect of
110
+ # control on both.
113
111
  def partial_correlation(v1,v2,control)
114
112
  v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
115
113
  rv1v2=pearson(v1a,v2a)
@@ -119,7 +117,9 @@ module Statsample
119
117
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
120
118
 
121
119
  end
122
- # Covariance matrix
120
+ # Covariance matrix.
121
+ # Order of rows and columns depends on Dataset#fields order
122
+
123
123
  def covariance_matrix(ds)
124
124
  ds.collect_matrix do |row,col|
125
125
  if (ds[row].type!=:scale or ds[col].type!=:scale)
@@ -130,7 +130,8 @@ module Statsample
130
130
  end
131
131
  end
132
132
 
133
- # The classic correlation matrix for all fields of a dataset
133
+ # Correlation matrix.
134
+ # Order of rows and columns depends on Dataset#fields order
134
135
 
135
136
  def correlation_matrix(ds)
136
137
  ds.collect_matrix {|row,col|
@@ -154,16 +155,19 @@ module Statsample
154
155
  end
155
156
  }
156
157
  end
157
- def correlation_probability_matrix(ds)
158
+ # Matrix of correlation probability
159
+ # Order of rows and columns depends on Dataset#fields order
160
+
161
+ def correlation_probability_matrix(ds, tails=:both)
158
162
  rows=ds.fields.collect{|row|
159
163
  ds.fields.collect{|col|
160
164
  v1a,v2a=Statsample.only_valid(ds[row],ds[col])
161
- (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size)
165
+ (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
162
166
  }
163
167
  }
164
168
  Matrix.rows(rows)
165
169
  end
166
- # Calculate Spearman correlation coefficient between 2 vectors
170
+ # Spearman ranked correlation coefficient between 2 vectors
167
171
  def spearman(v1,v2)
168
172
  v1a,v2a=Statsample.only_valid(v1,v2)
169
173
  v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
@@ -1,12 +1,22 @@
1
1
  module Statsample
2
2
  # Combination class systematically generates all combinations of n elements, taken r at a time.
3
- # Use GSL::Combination is available for extra speed
3
+ # With rbgsl, GSL::Combination is available for extra speed
4
4
  # Source: http://snippets.dzone.com/posts/show/4666
5
5
  # Use:
6
6
  # comb=Statsample::Combination.new(3,5)
7
- # comb.each{|c|
8
- # p c
9
- # }
7
+ # => #<Statsample::Combination:0x7f6323804e08 @n=5, @d=#<Statsample::Combination::CombinationGsl:0x7f63237ff7f0 @n=5, @k=3, @c=GSL::Combination>, @k=3>
8
+ # comb.each{|c| p c }
9
+ # [0, 1, 2]
10
+ # [0, 1, 3]
11
+ # [0, 1, 4]
12
+ # [0, 2, 3]
13
+ # [0, 2, 4]
14
+ # [0, 3, 4]
15
+ # [1, 2, 3]
16
+ # [1, 2, 4]
17
+ # [1, 3, 4]
18
+ # [2, 3, 4]
19
+ #
10
20
  class Combination
11
21
  attr_reader :d
12
22
  def initialize(k,n,only_ruby=false)
@@ -117,6 +117,21 @@ module Statsample
117
117
 
118
118
  end
119
119
  end
120
+ class PlainText < SpreadsheetBase
121
+ class << self
122
+ def read(filename, fields)
123
+ ds=Statsample::Dataset.new(fields)
124
+ fp=File.open(filename,"r")
125
+ fp.each_line do |line|
126
+ row=process_row(line.strip.split(/\s+/),[""])
127
+ ds.add_case_array(row)
128
+ end
129
+ convert_to_scale(ds,fields)
130
+ ds.update_valid_data
131
+ ds
132
+ end
133
+ end
134
+ end
120
135
  class Excel < SpreadsheetBase
121
136
  class << self
122
137
  def write(dataset,filename)
@@ -157,7 +172,7 @@ module Statsample
157
172
  }
158
173
  line_number+=1
159
174
  if(line_number<=ignore_lines)
160
- #puts "Skip line"
175
+ #puts "Skip line #{line_number}:#{row.to_s}"
161
176
  next
162
177
  end
163
178
  # This should be fixed.
@@ -235,6 +250,7 @@ module Statsample
235
250
  # USE:
236
251
  # Statsample::CSV.write(ds,"test_csv.csv")
237
252
  def write(dataset,filename, convert_comma=false,*opts)
253
+ require 'csv'
238
254
  writer=::CSV.open(filename,'w',*opts)
239
255
  writer << dataset.fields
240
256
  dataset.each_array{|row|
@@ -36,11 +36,26 @@ module Statsample
36
36
  include Writable
37
37
  attr_reader :vectors, :fields, :cases, :i
38
38
  attr_accessor :labels
39
- # To create a dataset
40
- # * Dataset.new()
41
- # * Dataset.new(%w{v1 v2 v3})
42
- # * Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
43
- # * Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
39
+ # Creates a new dataset. A dataset is a set of ordered named vectors
40
+ # of the same size.
41
+ #
42
+ # [vectors] With an array, creates a set of empty vectors named as
43
+ # values on the array. With a hash, each Vector is assigned as
44
+ # a variable of the Dataset named as its key
45
 + # [fields] Array of names for vectors. Is only used to set the
46
 + # order of variables. If empty, vectors keys in alphabetic order are
47
+ # used as fields
48
+ # [labels] Hash to set names for fields.
49
+ #
50
+ #
51
+ # Dataset.new()
52
+ # Dataset.new(%w{v1 v2 v3})
53
+ # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
54
+ # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
55
+ #
56
+ # The fast way to create a dataset uses Hash#to_dataset, with
57
+ # fields and labels as arguments
58
+ # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
44
59
  #
45
60
  def initialize(vectors={}, fields=[], labels={})
46
61
  if vectors.instance_of? Array
@@ -296,7 +311,7 @@ module Statsample
296
311
  }
297
312
  end
298
313
  if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
299
- def case_as_hash(c)
314
+ def case_as_hash(c) # :nodoc:
300
315
  Statsample::STATSAMPLE__.case_as_hash(self,c)
301
316
  end
302
317
  else
@@ -306,7 +321,7 @@ module Statsample
306
321
  end
307
322
 
308
323
  if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
309
- def case_as_array(c)
324
+ def case_as_array(c) # :nodoc:
310
325
  Statsample::STATSAMPLE__.case_as_array(self,c)
311
326
  end
312
327
  else
@@ -314,16 +329,16 @@ module Statsample
314
329
  _case_as_array(c)
315
330
  end
316
331
  end
317
- def _case_as_hash(c)
332
+ def _case_as_hash(c) # :nodoc:
318
333
  @fields.inject({}) {|a,x|
319
334
  a[x]=@vectors[x][c]
320
335
  a
321
336
  }
322
337
  end
323
- def _case_as_array(c)
338
+ def _case_as_array(c) # :nodoc:
324
339
  @fields.collect {|x| @vectors[x][c]}
325
340
  end
326
-
341
+ # Returns each case as a hash
327
342
  def each
328
343
  begin
329
344
  @i=0
@@ -337,6 +352,7 @@ module Statsample
337
352
  raise DatasetException.new(self,e)
338
353
  end
339
354
  end
355
+ # Returns each case as index and hash
340
356
  def each_with_index
341
357
  begin
342
358
  @i=0
@@ -350,6 +366,7 @@ module Statsample
350
366
  raise DatasetException.new(self,e)
351
367
  end
352
368
  end
369
+ # Returns each case as an array
353
370
  def each_array
354
371
  @cases.times {|i|
355
372
  @i=i
@@ -495,6 +512,40 @@ module Statsample
495
512
  ms
496
513
 
497
514
  end
515
+ # Returns a vector, based on a string with a calculation based
516
+ # on vector
517
+ # The calculation will be eval'ed, so you can put any variable
518
 + # or expression valid in Ruby
519
+ # For example:
520
 + # a=[1,2].to_vector(:scale)
521
 + # b=[3,4].to_vector(:scale)
522
+ # ds={'a'=>a,'b'=>b}.to_dataset
523
 + # ds.compute("a+b")
524
+ # => Vector [4,6]
525
+ def compute(text)
526
+ @fields.each{|f|
527
+ if @vectors[f].type=:scale
528
+ text.gsub!(f,"row['#{f}'].to_f")
529
+ else
530
+ text.gsub!(f,"row['#{f}']")
531
+
532
+ end
533
+
534
+ }
535
+ collect_with_index {|i,row|
536
+ invalid=false
537
+ @fields.each{|f|
538
+ if @vectors[f].data_with_nils[i].nil?
539
+ invalid=true
540
+ end
541
+ }
542
+ if invalid
543
+ nil
544
+ else
545
+ eval(text)
546
+ end
547
+ }
548
+ end
498
549
  # Test each row with one or more tests
499
550
  # each test is a Proc with the form
500
551
  # Proc.new {|row| row['age']>0}
@@ -540,5 +591,10 @@ module Statsample
540
591
  }
541
592
  out
542
593
  end
594
+ def as_r
595
+ require 'rsruby/dataframe'
596
+ r=RSRuby.instance
597
+
598
+ end
543
599
  end
544
600
  end
@@ -69,10 +69,8 @@ class DominanceAnalysis
69
69
  out.extend report_type
70
70
  out.add _("Summary for Bootstrap Dominance Analysis of %s on %s\n") % [@fields.join(", "), @y_var]
71
71
  out.add _("Sample size: %d\n") % @n_samples
72
- if HAS_GSL
73
- t=GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2),@n_samples - 1)
72
+ t=Distribution::T.p_value(1-((1-alfa) / 2),@n_samples - 1)
74
73
  out.add "t:#{t}\n"
75
- end
76
74
  out.add "Linear Regression Engine: #{@lr_class.name}"
77
75
  out.nl
78
76
  table=ReportTable.new
@@ -17,7 +17,7 @@ module Statsample
17
17
  end
18
18
  end
19
19
  end
20
- class Nominal
20
+ class Vector
21
21
  # Creates a barchart using ruby-gdchart
22
22
  def gdchart_frequencies(file, width=300, height=150, chart_type=GDChart::BAR, options={})
23
23
  labels,data=[],[]
@@ -28,9 +28,8 @@ module Statsample
28
28
  options['ext_color']=[0xFF3399,0xFF9933,0xFFEE33,0x33FF33, 0x9966FF]
29
29
  Statsample::Util.chart_gdchart(file,width,height,chart_type, labels,options,1,data)
30
30
  end
31
- end
32
- class Scale < Ordinal
33
31
  def gdchart_histogram(bins,file, width=300, height=150, chart_type=GDChart::BAR, options={})
32
+ check_type :scale
34
33
  labels=[]
35
34
  h=histogram(bins)
36
35
  data=[]
@@ -27,6 +27,7 @@ module Statsample
27
27
  }
28
28
  end
29
29
  def svggraph_histogram(bins, options={})
30
+ check_type :scale
30
31
  options={:graph_title=>"Histogram", :show_graph_title=>true,:show_normal=>true, :mean=>self.mean, :sigma=>sdp }.merge! options
31
32
  graph = Statsample::Graph::SvgHistogram.new(options)
32
33
  graph.histogram=histogram(bins)
@@ -35,6 +36,7 @@ module Statsample
35
36
  # Returns a Run-Sequence Plot
36
37
  # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/runseqpl.htm
37
38
  def svggraph_runsequence_plot(options={})
39
+ check_type :scale
38
40
  options={:graph_title=>"Run-Sequence Plot", :show_graph_title=>true, :scale_x_integers => true, :add_popups=>true }.merge! options
39
41
  vx=(1..@data.size).to_a.to_vector(:scale)
40
42
  vy=@data.to_vector(:scale)
@@ -45,6 +47,7 @@ module Statsample
45
47
  graph
46
48
  end
47
49
  def svggraph_boxplot(options={})
50
+ check_type :scale
48
51
  options={:graph_title=>"Boxplot", :fields=>['vector'], :show_graph_title=>true}.merge! options
49
52
  vx=@data.to_a.to_vector(:scale)
50
53
  graph = Statsample::Graph::SvgBoxplot.new(options)
@@ -53,6 +56,7 @@ module Statsample
53
56
  end
54
57
 
55
58
  def svggraph_lag_plot(options={})
59
+ check_type :scale
56
60
  options={:graph_title=>"Lag Plot", :show_graph_title=>true}.merge! options
57
61
  vx=@data[0...(@data.size-1)].to_vector(:scale)
58
62
  vy=@data[1...@data.size].to_vector(:scale)
@@ -66,12 +70,12 @@ module Statsample
66
70
  # Returns a Normal Probability Plot
67
71
  # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm
68
72
  def svggraph_normalprobability_plot(options={})
69
- extend Statsample::Util
70
-
71
- options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
73
+ extend Statsample::Util
74
+ check_type :scale
75
+ options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
72
76
  n=@data.size
73
77
  vx=(1..@data.size).to_a.collect{|i|
74
- GSL::Cdf.gaussian_Pinv(normal_order_statistic_medians(i,n))
78
+ Distribution::Normal.p_value(normal_order_statistic_medians(i,n))
75
79
  }.to_vector(:scale)
76
80
  vy=@data.sort.to_vector(:scale)
77
81
  ds={'normal_order_statistics_medians'=>vx, 'ordered_response'=>vy}.to_dataset