RubyGems - statsample - Versions diffs - 0.3.3 → 0.3.4 - Mend

statsample 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

data/History.txt +7 -1
data/Manifest.txt +3 -0
data/demo/benchmark.rb +7 -5
data/demo/regression.rb +30 -4
data/lib/statsample.rb +23 -15
data/lib/statsample/bivariate.rb +28 -9
data/lib/statsample/combination.rb +103 -0
data/lib/statsample/converters.rb +16 -1
data/lib/statsample/dataset.rb +29 -11
data/lib/statsample/dominanceanalysis.rb +15 -11
data/lib/statsample/dominanceanalysis/bootstrap.rb +9 -7
data/lib/statsample/graph/svggraph.rb +1 -3
data/lib/statsample/regression.rb +1 -0
data/lib/statsample/regression/logit.rb +35 -0
data/lib/statsample/regression/multiple.rb +21 -2
data/lib/statsample/regression/multiple/alglibengine.rb +3 -1
data/lib/statsample/vector.rb +168 -183
data/test/test_combination.rb +42 -0
data/test/test_csv.rb +1 -1
data/test/test_dataset.rb +5 -0
data/test/test_statistics.rb +19 -2
data/test/test_svg_graph.rb +5 -2
data/test/test_vector.rb +6 -1
metadata +6 -2

data/History.txt CHANGED Viewed

@@ -1,8 +1,14 @@
+=== 0.3.4 / 2009-08-21
+* Works with statsample-optimization 2.0.0
+* Vector no longer uses delegation. All methods are part of Vector
+* Added Combination. Generates all combinations of n elements taken r at a time
+* Bivariate#prop_pearson now accepts :both, :left, :right, :positive or :negative as its tails parameter
+* Added LICENSE.txt
 === 0.3.3 / 2009-08-11
 * Added i18n support. For now, only spanish translation available
 * Bug fix: Test now load libraries on ../lib path
 * Excel and CSV importers automatically modify type of vector to Scale when all data are numbers or nils values
-*
 === 0.3.2 / 2009-08-04

data/Manifest.txt CHANGED Viewed

@@ -25,6 +25,7 @@ lib/statsample/anova.rb
 lib/statsample/bivariate.rb
 lib/statsample/chidistribution.rb
 lib/statsample/codification.rb
+lib/statsample/combination.rb
 lib/statsample/converters.rb
 lib/statsample/crosstab.rb
 lib/statsample/dataset.rb
@@ -38,6 +39,7 @@ lib/statsample/graph/svgscatterplot.rb
 lib/statsample/htmlreport.rb
 lib/statsample/multiset.rb
 lib/statsample/regression.rb
+lib/statsample/regression/logit.rb
 lib/statsample/regression/multiple.rb
 lib/statsample/regression/multiple/alglibengine.rb
 lib/statsample/regression/multiple/gslengine.rb
@@ -54,6 +56,7 @@ setup.rb
 test/_test_chart.rb
 test/test_anova.rb
 test/test_codification.rb
+test/test_combination.rb
 test/test_crosstab.rb
 test/test_csv.csv
 test/test_csv.rb

data/demo/benchmark.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-require File.dirname(__FILE__)+'/../lib/statsample.rb'
+$:.unshift(File.dirname(__FILE__)+'/../lib/')
+require 'statsample'
 require 'benchmark'
 v=(0..10000).collect{|n|
 	r=rand(100)
@@ -37,14 +38,15 @@ ds=Statsample::Dataset.new({'a'=>a.to_vector(:scale),'b'=>b.to_vector(:scale), '
  if (true)
      Benchmark.bm(7) do |x|
-         x.report("Alglib coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; end }
-         x.report("GslEngine coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs; end }
+         x.report("Alglib coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs;          lr=nil;end }
+         x.report("GslEngine coeffs")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs;lr=nil; end }
      end
  end
  if(true)
      Benchmark.bm(7) do |x|
-         x.report("Alglib process")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([1,2]); end }
-         x.report("GslEngine process")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([1,2]); end }
+         x.report("Alglib process")   { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
+         x.report("GslEngine process")   { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
     end
  end

data/demo/regression.rb CHANGED Viewed

@@ -2,22 +2,36 @@ require File.dirname(__FILE__)+'/../lib/statsample'
 require 'benchmark'
 tests=300
 include Statsample
-r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
 ds=Dataset.new(%w{a b c d y})
     ds['a'].type=:scale
     ds['b'].type=:scale
     ds['c'].type=:scale
     ds['d'].type=:scale
     ds['y'].type=:scale
+if HAS_GSL
+r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
     tests.times {
     a=r.ugaussian
-    b=r.ugaussian
+    b=a*2+r.ugaussian
     c=r.ugaussian
     d=r.ugaussian
     y=a*70+b*30+c*5+r.ugaussian*5
     ds.add_case_array([a,b,c,d,y])
 }
+else
+    tests.times {
+        a=1-rand()*2.0
+    b=1-rand()*2.0
+    c=1-rand()*2.0
+    d=1-rand()*2.0
+    y=a*70+b*30+c*5+(1-rand()*2.0)*5
+    ds.add_case_array([a,b,c,d,y])
+}
+end
 ds.update_valid_data
 if !File.exists? "regression.dab"
@@ -26,15 +40,27 @@ else
     da=Statsample.load("regression.dab")
 end
 times=1
+if(true)
 Benchmark.bm(7) do |x|
+    if HAS_GSL
     x.report("GslEngine:") {
         da.lr_class=Regression::Multiple::GslEngine
         da.bootstrap(times)
     }
+    end
+    if(false)
+    if HAS_ALGIB
     x.report("AlglibEngine:") {
         da.lr_class=Regression::Multiple::AlglibEngine
         da.bootstrap(times)
     }
+    end
+    x.report("RubyEngine:") {
+        da.lr_class=Regression::Multiple::RubyEngine
+        da.bootstrap(times)
+    }
+    end
+end
 end
 puts da.summary
@@ -45,11 +71,11 @@ lr=Regression::Multiple.listwise(ds,"y")
 hr=HtmlReport.new("Regression")
 hr.add_summary("Regression",lr.summary(HtmlSummary))
 hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
+hr.add_correlation_matrix(ds)
 hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
 da.fields.each{|f|
- hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
+# hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
 }
 hr.save("Regression Dominance.html")

data/lib/statsample.rb CHANGED Viewed

@@ -72,16 +72,6 @@ end
     rescue LoadError
         HAS_ALGIB=false
     end
-    begin
-        require 'statsample/optimization'
-    rescue LoadError
-        module Statsample
-            OPTIMIZED=false
-        end
-    end
 #
 # Modules for statistical analysis
 # See first:
@@ -90,10 +80,11 @@ end
 # * Dataset: An union of vectors.
 #
 module Statsample
-    VERSION = '0.3.3'
+    VERSION = '0.3.4'
     SPLIT_TOKEN = ","
 	autoload(:Database, 'statsample/converters')
     autoload(:Anova, 'statsample/anova')
+	autoload(:Combination, 'statsample/combination')
 	autoload(:CSV, 'statsample/converters')
 	autoload(:Excel, 'statsample/converters')
 	autoload(:GGobi, 'statsample/converters')
@@ -113,10 +104,15 @@ module Statsample
 	autoload(:Regression, 'statsample/regression')
 	autoload(:Test, 'statsample/test')
     def self.load(filename)
-        fp=File.open(filename,"r")
-        o=Marshal.load(fp)
-        fp.close
+        if File.exists? filename
+            o=false
+            File.open(filename,"r") {|fp|
+                o=Marshal.load(fp)
+            }
         o
+        else
+            false
+        end
     end
 	module Util
@@ -243,9 +239,21 @@ module Statsample
             out
         end
     end
+    module STATSAMPLE__
+    end
+end
+begin
+    require 'statsamplert'
+rescue LoadError
+    module Statsample
+        OPTIMIZED=false
+    end
 end
 require 'statsample/vector'
 require 'statsample/dataset'
 require 'statsample/crosstab'

data/lib/statsample/bivariate.rb CHANGED Viewed

@@ -65,15 +65,34 @@ module Statsample
                 r*Math::sqrt(((size)-2).to_f / (1 - r**2))
             end
             # Retrieves the probability value (a la SPSS)
-            # for a given t, size and number of tails
-            def prop_pearson(t,size, tails=2)
-		if HAS_GSL
-                t=-t if t>0
-                cdf=GSL::Cdf::tdist_P(t,(size)-2)
-                cdf*tails
-		else
-			raise "Needs ruby-gsl"
-		end
+            # for a given t, size and number of tails.
+            # The tails parameter can be one of:
+            # * :both  or 2  : for r!=0
+            # * :right, :positive or 1  : for r > 0
+            # * :left, :negative        : for r < 0
+            def prop_pearson(t,size, tails=:both)
+                tails=:both if tails==2
+                tails=:right if tails==1 or tails==:positive
+                tails=:left if tails==:negative
+                n_tails=case tails
+                when :both
+                    2
+                else
+                    1
+                end
+                if HAS_GSL
+                        t=-t if t>0 and (tails==:both)
+                        cdf=GSL::Cdf::tdist_P(t,size-2)
+                        if(tails==:right)
+                            1.0-(cdf*n_tails)
+                        else
+                            cdf*n_tails
+                        end
+                else
+                raise "Needs ruby-gsl"
+                end
             end
             # Returns residual score after delete variance
             # from another variable

data/lib/statsample/combination.rb ADDED Viewed

@@ -0,0 +1,103 @@
+module Statsample
+    # Combination class systematically generates all combinations of n elements, taken r at a time.
+    # Uses GSL::Combination, if available, for extra speed
+    # Source: http://snippets.dzone.com/posts/show/4666
+    # Use:
+    #  comb=Statsample::Combination.new(3,5)
+    #  comb.each{|c|
+    #     p c
+    #  }
+    class Combination
+        attr_reader :d
+        def initialize(k,n,only_ruby=false)
+            @k=k
+            @n=n
+            if HAS_GSL and !only_ruby
+                @d=CombinationGsl.new(@k,@n)
+            else
+                @d=CombinationRuby.new(@k,@n)
+            end
+        end
+        def each
+            reset
+            while a=next_value
+                yield a
+            end
+        end
+        def reset
+            @d.reset
+        end
+        def next_value
+            @d.next_value
+        end
+        class CombinationRuby
+        attr_reader :data
+        def initialize(k,n)
+            raise "k<=n" if k>n
+            @k=k
+            @n=n
+            reset
+        end
+        def reset
+            @data=[]
+            (0...@k).each {|i|
+                @data[i] = i;
+            }
+        end
+        def each
+            reset
+            while a=next_value
+                yield a
+            end
+        end
+        def next_value
+            return false if !@data
+            old_comb=@data.dup
+            i = @k - 1;
+            @data[i]+=1
+            while ((i >= 0) and (@data[i] >= @n - @k + 1 + i)) do
+                i-=1;
+                @data[i]+=1;
+            end
+            if (@data[0] > @n - @k) # Combination (n-k, n-k+1, ..., n) reached */
+                @data=false # No more combinations can be generated
+            else
+                # comb now looks like (..., x, n, n, n, ..., n).
+                # Turn it into (..., x, x + 1, x + 2, ...)
+                i = i+1
+                (i...@k).each{ |i1|
+                    @data[i1] = @data[i1 - 1] + 1
+                }
+            end
+            return old_comb
+        end
+    end
+    class CombinationGsl
+        def initialize(k,n)
+            require 'gsl'
+            raise "k<=n" if k>n
+            @k=k
+            @n=n
+            reset
+        end
+        def reset
+            @c= ::GSL::Combination.calloc(@n, @k);
+        end
+        def next_value
+            return false if !@c
+            data=@c.data.to_a
+            if @c.next != GSL::SUCCESS
+                @c=false
+            end
+            return data
+        end
+        def each
+            reset
+            begin
+                yield @c.data.to_a
+            end while @c.next == GSL::SUCCESS
+        end
+    end
+end
+end

data/lib/statsample/converters.rb CHANGED Viewed

@@ -148,6 +148,13 @@ module Statsample
                 book = Spreadsheet.open filename
                 sheet= book.worksheet worksheet_id
                 sheet.each do |row|
+                    begin
+                        dates=[]
+                        row.formats.each_index{|i|
+                            if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
+                                dates.push(i)
+                            end
+                        }
                     line_number+=1
                     if(line_number<=ignore_lines)
                         #puts "Skip line"
@@ -155,9 +162,13 @@ module Statsample
                     end
                     # This should be fixed.
                     # If we have a Formula, it should be resolved first
+                    i=-1
                     row.collect!{|c|
+                        i+=1
                         if c.is_a? Spreadsheet::Formula
-                            nil
+                            c.value
+                        elsif dates.include? i and !c.nil? and c.is_a? Numeric
+                            row.date(i)
                         else
                             c
                         end
@@ -173,6 +184,10 @@ module Statsample
                         }
                         ds.add_case(rowa,false)
                     end
+                    rescue => e
+                        error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
+                        raise
+                    end
                 end
                 convert_to_scale(ds,fields)
                 ds.update_valid_data

data/lib/statsample/dataset.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Statsample
             @exp=e
         end
         def to_s
-            m="Error:"+@exp.message+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
+            m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
             m+="\nRow: #{@i}" unless @i.nil?
             m
         end
@@ -158,7 +158,7 @@ module Statsample
         end
         # Fast version of add case
         # Can only add one case, and no error check is performed
-        # You SHOULD use update_valid_data at the the of insertion cycle
+        # You SHOULD use update_valid_data at the end of insertion cycle
         def add_case_array(v)
             v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
         end
@@ -295,17 +295,35 @@ module Statsample
                     yield k,@vectors[k]
                 }
             end
-        if !Statsample::OPTIMIZED
-            def case_as_hash(c)
-                @fields.inject({}) {|a,x|
-                        a[x]=@vectors[x][c]
-                        a
-            }
+            if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
+                def case_as_hash(c)
+                    Statsample::STATSAMPLE__.case_as_hash(self,c)
+                end
+            else
+                def case_as_hash(c)
+                    _case_as_hash(c)
+                end
+            end
+            if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
+                def case_as_array(c)
+                    Statsample::STATSAMPLE__.case_as_array(self,c)
+                end
+            else
+                def case_as_array(c)
+                    _case_as_array(c)
+                end
             end
-            def case_as_array(c)
-                @fields.collect {|x| @vectors[x][c]}
-            end
+        def _case_as_hash(c)
+            @fields.inject({}) {|a,x|
+                a[x]=@vectors[x][c]
+                a
+            }
+        end
+        def _case_as_array(c)
+            @fields.collect {|x| @vectors[x][c]}
         end
         def each
             begin
                 @i=0