RubyGems - statsample - Versions diffs - 0.3.3 → 0.3.4 - Mend

statsample 0.3.3 → 0.3.4

Files changed (24) hide show

data/History.txt +7 -1
data/Manifest.txt +3 -0
data/demo/benchmark.rb +7 -5
data/demo/regression.rb +30 -4
data/lib/statsample.rb +23 -15
data/lib/statsample/bivariate.rb +28 -9
data/lib/statsample/combination.rb +103 -0
data/lib/statsample/converters.rb +16 -1
data/lib/statsample/dataset.rb +29 -11
data/lib/statsample/dominanceanalysis.rb +15 -11
data/lib/statsample/dominanceanalysis/bootstrap.rb +9 -7
data/lib/statsample/graph/svggraph.rb +1 -3
data/lib/statsample/regression.rb +1 -0
data/lib/statsample/regression/logit.rb +35 -0
data/lib/statsample/regression/multiple.rb +21 -2
data/lib/statsample/regression/multiple/alglibengine.rb +3 -1
data/lib/statsample/vector.rb +168 -183
data/test/test_combination.rb +42 -0
data/test/test_csv.rb +1 -1
data/test/test_dataset.rb +5 -0
data/test/test_statistics.rb +19 -2
data/test/test_svg_graph.rb +5 -2
data/test/test_vector.rb +6 -1
metadata +6 -2

data/History.txt CHANGED Viewed

@@ -1,8 +1,14 @@
+=== 0.3.4 / 2009-08-21
+* Works with statsample-optimization 2.0.0
+* Vector doesn't use delegation. All methods are part of Vector
+* Added Combination. Generates all combinations of n elements taken r at a time
+* Bivariate#prop_pearson can now use :both, :left, :right, :positive or :negative as its tails parameter
+* Added LICENSE.txt
 === 0.3.3 / 2009-08-11
 * Added i18n support. For now, only a Spanish translation is available
 * Bug fix: Tests now load libraries from the ../lib path
 * Excel and CSV importers automatically change the type of a vector to Scale when all data are numbers or nil values
-*
 === 0.3.2 / 2009-08-04

data/Manifest.txt CHANGED Viewed

@@ -25,6 +25,7 @@ lib/statsample/anova.rb
 lib/statsample/bivariate.rb
 lib/statsample/chidistribution.rb
 lib/statsample/codification.rb
+lib/statsample/combination.rb
 lib/statsample/converters.rb
 lib/statsample/crosstab.rb
 lib/statsample/dataset.rb
@@ -38,6 +39,7 @@ lib/statsample/graph/svgscatterplot.rb
 lib/statsample/htmlreport.rb
 lib/statsample/multiset.rb
 lib/statsample/regression.rb
+lib/statsample/regression/logit.rb
 lib/statsample/regression/multiple.rb
 lib/statsample/regression/multiple/alglibengine.rb
 lib/statsample/regression/multiple/gslengine.rb
@@ -54,6 +56,7 @@ setup.rb
 test/_test_chart.rb
 test/test_anova.rb
 test/test_codification.rb
+test/test_combination.rb
 test/test_crosstab.rb
 test/test_csv.csv
 test/test_csv.rb

data/demo/benchmark.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-require File.dirname(__FILE__)+'/../lib/statsample.rb'
+$:.unshift(File.dirname(__FILE__)+'/../lib/')
+require 'statsample'
 require 'benchmark'
 v=(0..10000).collect{|n|
 	r=rand(100)
@@ -37,14 +38,15 @@ ds=Statsample::Dataset.new({'a'=>a.to_vector(:scale),'b'=>b.to_vector(:scale), '
  if (true)
      Benchmark.bm(7) do |x|
-         x.report("Alglib coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; end }
-         x.report("GslEngine coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs; end }
+         x.report("Alglib coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs;          lr=nil;end }
+         x.report("GslEngine coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs;lr=nil; end }
      end
  end
  if(true)
      Benchmark.bm(7) do |x|
-         x.report("Alglib process")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([1,2]); end }
-         x.report("GslEngine process")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([1,2]); end }
+         x.report("Alglib process")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
+         x.report("GslEngine process")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
     end
  end

data/demo/regression.rb CHANGED Viewed

@@ -2,22 +2,36 @@ require File.dirname(__FILE__)+'/../lib/statsample'
 require 'benchmark'
 tests=300
 include Statsample
-r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
 ds=Dataset.new(%w{a b c d y})
     ds['a'].type=:scale
     ds['b'].type=:scale
     ds['c'].type=:scale
     ds['d'].type=:scale
     ds['y'].type=:scale
+if HAS_GSL
+r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
     tests.times {
     a=r.ugaussian
-    b=r.ugaussian
+    b=a*2+r.ugaussian
     c=r.ugaussian
     d=r.ugaussian
     y=a*70+b*30+c*5+r.ugaussian*5
     ds.add_case_array([a,b,c,d,y])
 }
+else
+    tests.times {
+        a=1-rand()*2.0
+    b=1-rand()*2.0
+    c=1-rand()*2.0
+    d=1-rand()*2.0
+    y=a*70+b*30+c*5+(1-rand()*2.0)*5
+    ds.add_case_array([a,b,c,d,y])
+}
+end
 ds.update_valid_data
 if !File.exists? "regression.dab"
@@ -26,15 +40,27 @@ else
     da=Statsample.load("regression.dab")
 end
 times=1
+if(true)
 Benchmark.bm(7) do |x|
+    if HAS_GSL
     x.report("GslEngine:") {
         da.lr_class=Regression::Multiple::GslEngine
         da.bootstrap(times)
     }
+    end
+    if(false)
+    if HAS_ALGIB
     x.report("AlglibEngine:") {
         da.lr_class=Regression::Multiple::AlglibEngine
         da.bootstrap(times)
     }
+    end
+    x.report("RubyEngine:") {
+        da.lr_class=Regression::Multiple::RubyEngine
+        da.bootstrap(times)
+    }
+    end
+end
 end
 puts da.summary
@@ -45,11 +71,11 @@ lr=Regression::Multiple.listwise(ds,"y")
 hr=HtmlReport.new("Regression")
 hr.add_summary("Regression",lr.summary(HtmlSummary))
 hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
+hr.add_correlation_matrix(ds)
 hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
 da.fields.each{|f|
- hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
+# hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
 }
 hr.save("Regression Dominance.html")

data/lib/statsample.rb CHANGED Viewed

@@ -72,16 +72,6 @@ end
     rescue LoadError
         HAS_ALGIB=false
     end
-    begin
-        require 'statsample/optimization'
-    rescue LoadError
-        module Statsample
-            OPTIMIZED=false
-        end
-    end
 #
 # Modules for statistical analysis
 # See first:
@@ -90,10 +80,11 @@ end
 # * Dataset: A union of vectors.
 #
 module Statsample
-    VERSION = '0.3.3'
+    VERSION = '0.3.4'
     SPLIT_TOKEN = ","
 	autoload(:Database, 'statsample/converters')
     autoload(:Anova, 'statsample/anova')
+	autoload(:Combination, 'statsample/combination')
 	autoload(:CSV, 'statsample/converters')
 	autoload(:Excel, 'statsample/converters')
 	autoload(:GGobi, 'statsample/converters')
@@ -113,10 +104,15 @@ module Statsample
 	autoload(:Regression, 'statsample/regression')
 	autoload(:Test, 'statsample/test')
     def self.load(filename)
-        fp=File.open(filename,"r")
-        o=Marshal.load(fp)
-        fp.close
+        if File.exists? filename
+            o=false
+            File.open(filename,"r") {|fp|
+                o=Marshal.load(fp)
+            }
         o
+        else
+            false
+        end
     end
 	module Util
@@ -243,9 +239,21 @@ module Statsample
             out
         end
     end
+    module STATSAMPLE__
+    end
+end
+begin
+    require 'statsamplert'
+rescue LoadError
+    module Statsample
+        OPTIMIZED=false
+    end
 end
 require 'statsample/vector'
 require 'statsample/dataset'
 require 'statsample/crosstab'

data/lib/statsample/bivariate.rb CHANGED Viewed

@@ -65,15 +65,34 @@ module Statsample
                 r*Math::sqrt(((size)-2).to_f / (1 - r**2))
             end
             # Retrieves the probability value (a la SPSS)
-            # for a given t, size and number of tails
-            def prop_pearson(t,size, tails=2)
-		if HAS_GSL
-                t=-t if t>0
-                cdf=GSL::Cdf::tdist_P(t,(size)-2)
-                cdf*tails
-		else
-			raise "Needs ruby-gsl"
-		end
+            # for a given t, size and number of tails.
+            # The tails parameter accepts:
+            # * :both  or 2  : for r!=0
+            # * :right, :positive or 1  : for r > 0
+            # * :left, :negative        : for r < 0
+            def prop_pearson(t,size, tails=:both)
+                tails=:both if tails==2
+                tails=:right if tails==1 or tails==:positive
+                tails=:left if tails==:negative
+                n_tails=case tails
+                when :both
+                    2
+                else
+                    1
+                end
+                if HAS_GSL
+                        t=-t if t>0 and (tails==:both)
+                        cdf=GSL::Cdf::tdist_P(t,size-2)
+                        if(tails==:right)
+                            1.0-(cdf*n_tails)
+                        else
+                            cdf*n_tails
+                        end
+                else
+                raise "Needs ruby-gsl"
+                end
             end
             # Returns residual score after delete variance
             # from another variable

data/lib/statsample/combination.rb ADDED Viewed

@@ -0,0 +1,103 @@
+module Statsample
+    # Combination class systematically generates all combinations of n elements, taken r at a time.
+    # Uses GSL::Combination if available, for extra speed
+    # Source: http://snippets.dzone.com/posts/show/4666
+    # Use:
+    #  comb=Statsample::Combination.new(3,5)
+    #  comb.each{|c|
+    #     p c
+    #  }
+    class Combination
+        attr_reader :d
+        def initialize(k,n,only_ruby=false)
+            @k=k
+            @n=n
+            if HAS_GSL and !only_ruby
+                @d=CombinationGsl.new(@k,@n)
+            else
+                @d=CombinationRuby.new(@k,@n)
+            end
+        end
+        def each
+            reset
+            while a=next_value
+                yield a
+            end
+        end
+        def reset
+            @d.reset
+        end
+        def next_value
+            @d.next_value
+        end
+        class CombinationRuby
+        attr_reader :data
+        def initialize(k,n)
+            raise "k<=n" if k>n
+            @k=k
+            @n=n
+            reset
+        end
+        def reset
+            @data=[]
+            (0...@k).each {|i|
+                @data[i] = i;
+            }
+        end
+        def each
+            reset
+            while a=next_value
+                yield a
+            end
+        end
+        def next_value
+            return false if !@data
+            old_comb=@data.dup
+            i = @k - 1;
+            @data[i]+=1
+            while ((i >= 0) and (@data[i] >= @n - @k + 1 + i)) do
+                i-=1;
+                @data[i]+=1;
+            end
+            if (@data[0] > @n - @k) # Combination (n-k, n-k+1, ..., n) reached */
+                @data=false # No more combinations can be generated
+            else
+                # comb now looks like (..., x, n, n, n, ..., n).
+                # Turn it into (..., x, x + 1, x + 2, ...)
+                i = i+1
+                (i...@k).each{ |i1|
+                    @data[i1] = @data[i1 - 1] + 1
+                }
+            end
+            return old_comb
+        end
+    end
+    class CombinationGsl
+        def initialize(k,n)
+            require 'gsl'
+            raise "k<=n" if k>n
+            @k=k
+            @n=n
+            reset
+        end
+        def reset
+            @c= ::GSL::Combination.calloc(@n, @k);
+        end
+        def next_value
+            return false if !@c
+            data=@c.data.to_a
+            if @c.next != GSL::SUCCESS
+                @c=false
+            end
+            return data
+        end
+        def each
+            reset
+            begin
+                yield @c.data.to_a
+            end while @c.next == GSL::SUCCESS
+        end
+    end
+end
+end

data/lib/statsample/converters.rb CHANGED Viewed

@@ -148,6 +148,13 @@ module Statsample
                 book = Spreadsheet.open filename
                 sheet= book.worksheet worksheet_id
                 sheet.each do |row|
+                    begin
+                        dates=[]
+                        row.formats.each_index{|i|
+                            if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
+                                dates.push(i)
+                            end
+                        }
                     line_number+=1
                     if(line_number<=ignore_lines)
                         #puts "Skip line"
@@ -155,9 +162,13 @@ module Statsample
                     end
                     # This should be fixed.
                     # If we have a Formula, it should be resolved first
+                    i=-1
                     row.collect!{|c|
+                        i+=1
                         if c.is_a? Spreadsheet::Formula
-                            nil
+                            c.value
+                        elsif dates.include? i and !c.nil? and c.is_a? Numeric
+                            row.date(i)
                         else
                             c
                         end
@@ -173,6 +184,10 @@ module Statsample
                         }
                         ds.add_case(rowa,false)
                     end
+                    rescue => e
+                        error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
+                        raise
+                    end
                 end
                 convert_to_scale(ds,fields)
                 ds.update_valid_data

data/lib/statsample/dataset.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Statsample
             @exp=e
         end
         def to_s
-            m="Error:"+@exp.message+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
+            m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
             m+="\nRow: #{@i}" unless @i.nil?
             m
         end
@@ -158,7 +158,7 @@ module Statsample
         end
         # Fast version of add case
         # Can only add one case and no error check is performed
-        # You SHOULD use update_valid_data at the the of insertion cycle
+        # You SHOULD use update_valid_data at the end of insertion cycle
         def add_case_array(v)
             v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
         end
@@ -295,17 +295,35 @@ module Statsample
                     yield k,@vectors[k]
                 }
             end
-        if !Statsample::OPTIMIZED
-            def case_as_hash(c)
-                @fields.inject({}) {|a,x|
-                        a[x]=@vectors[x][c]
-                        a
-            }
+            if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
+                def case_as_hash(c)
+                    Statsample::STATSAMPLE__.case_as_hash(self,c)
+                end
+            else
+                def case_as_hash(c)
+                    _case_as_hash(c)
+                end
+            end
+            if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
+                def case_as_array(c)
+                    Statsample::STATSAMPLE__.case_as_array(self,c)
+                end
+            else
+                def case_as_array(c)
+                    _case_as_array(c)
+                end
             end
-            def case_as_array(c)
-                @fields.collect {|x| @vectors[x][c]}
-            end
+        def _case_as_hash(c)
+            @fields.inject({}) {|a,x|
+                a[x]=@vectors[x][c]
+                a
+            }
+        end
+        def _case_as_array(c)
+            @fields.collect {|x| @vectors[x][c]}
         end
         def each
             begin
                 @i=0