statsample 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +3 -1
- data/lib/statsample.rb +175 -179
- data/lib/statsample/codification.rb +1 -1
- data/lib/statsample/converter/csv18.rb +56 -0
- data/lib/statsample/converter/csv19.rb +60 -0
- data/lib/statsample/converters.rb +26 -75
- data/lib/statsample/dataset.rb +38 -29
- data/lib/statsample/dominanceanalysis.rb +6 -6
- data/lib/statsample/graph/gdchart.rb +2 -1
- data/lib/statsample/graph/svggraph.rb +10 -9
- data/lib/statsample/multiset.rb +3 -3
- data/lib/statsample/regression/multiple.rb +43 -271
- data/lib/statsample/regression/multiple/baseengine.rb +235 -0
- data/lib/statsample/regression/multiple/gslengine.rb +2 -2
- data/lib/statsample/vector.rb +754 -736
- data/test/test_csv.rb +3 -4
- data/test/test_dataset.rb +22 -3
- data/test/test_distribution.rb +4 -3
- data/test/test_ggobi.rb +2 -2
- data/test/test_regression.rb +11 -2
- data/test/test_svg_graph.rb +0 -1
- data/test/test_vector.rb +50 -5
- data/test/test_xls.rb +2 -4
- metadata +5 -3
- data/test/_test_chart.rb +0 -58
| @@ -1,284 +1,56 @@ | |
| 1 | 
            +
            require 'statsample/regression/multiple/baseengine'
         | 
| 1 2 | 
             
            module Statsample
         | 
| 2 | 
            -
            module Regression
         | 
| 3 | 
            -
             | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
                    
         | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
                def self.listwise(ds,y_var)
         | 
| 22 | 
            -
                    if HAS_ALGIB
         | 
| 3 | 
            +
              module Regression
         | 
| 4 | 
            +
                # Module for Linear Multiple Regression Analysis
         | 
| 5 | 
            +
                # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise, or instantiate the engines directly
         | 
| 6 | 
            +
                # Example.
         | 
| 7 | 
            +
                #
         | 
| 8 | 
            +
                #  require 'statsample'
         | 
| 9 | 
            +
                #  @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
         | 
| 10 | 
            +
                #  @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
         | 
| 11 | 
            +
                #  @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
         | 
| 12 | 
            +
                #  @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
         | 
| 13 | 
            +
                #  ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
         | 
| 14 | 
            +
                #  lr=Statsample::Regression::Multiple.listwise(ds,'y')        
         | 
| 15 | 
            +
                #  #<Statsample::Regression::Multiple::AlglibEngine:0x7f21912e4758 @ds_valid=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @lr=#<Alglib::LinearRegression:0x7f21912df118 @model=#<Alglib_ext::LinearModel:0x7f21912df708>, @ivars=3, @cases=10, @report=#<Alglib_ext::LrReport:0x7f21912df168>>, @y_var="y", @ds=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @fields=["a", "b", "c"], @lr_s=nil, @dep_columns=[[1, 3, 2, 4, 3, 5, 4, 6, 5, 7], [3, 3, 4, 4, 5, 5, 6, 6, 4, 4], [11, 22, 30, 40, 50, 65, 78, 79, 99, 100]], @ds_indep=#<Statsample::Dataset:69891073180060 @fields=[a,b,c] labels={"a"=>nil, "b"=>nil, "c"=>nil} cases=10, @dy=Vector(type:scale, n:10)[3,4,5,6,7,8,9,10,20,30]>
         | 
| 16 | 
            +
                module Multiple
         | 
| 17 | 
            +
                    # Creates an object for listwise regression. 
         | 
| 18 | 
            +
                    # Alglib is faster, so it is preferred over GSL
         | 
| 19 | 
            +
                    #   lr=Statsample::Regression::Multiple.listwise(ds,'y')
         | 
| 20 | 
            +
                    def self.listwise(ds,y_var)
         | 
| 21 | 
            +
                      if HAS_ALGIB
         | 
| 23 22 | 
             
                        AlglibEngine.new(ds,y_var)
         | 
| 24 | 
            -
             | 
| 23 | 
            +
                      elsif HAS_GSL
         | 
| 25 24 | 
             
                        GslEngine.new(ds,y_var)
         | 
| 26 | 
            -
             | 
| 25 | 
            +
                      else
         | 
| 27 26 | 
             
                        ds2=ds.dup_only_valid
         | 
| 28 27 | 
             
                        RubyEngine.new(ds2,y_var)
         | 
| 28 | 
            +
                      end
         | 
| 29 29 | 
             
                    end
         | 
| 30 | 
            -
                end
         | 
| 31 | 
            -
                
         | 
| 32 | 
            -
                # Creates an object for pairwise regression
         | 
| 33 | 
            -
                # For now, always retrieves a RubyEngine
         | 
| 34 | 
            -
                #    lr=Statsample::Regression::Multiple.listwise(ds,'y')
         | 
| 35 | 
            -
                def self.pairwise(ds,y_var)
         | 
| 36 | 
            -
                    RubyEngine.new(ds,y_var)
         | 
| 37 | 
            -
                end
         | 
| 38 | 
            -
                def self.listwise_by_exp(ds,exp)
         | 
| 39 | 
            -
                end
         | 
| 40 | 
            -
                # Returns a dataset and name of criteria using a expression.
         | 
| 41 | 
            -
                # All nominal vectors are replaced by dummy coding
         | 
| 42 | 
            -
                # and interactions are calculated
         | 
| 43 | 
            -
                
         | 
| 44 | 
            -
                def self.ds_by_exp(ds,exp)
         | 
| 45 | 
            -
                    raise "Not implemented"
         | 
| 46 | 
            -
                    parts=exp.split(/[\+=]/)
         | 
| 47 | 
            -
                    dependent=parts.pop
         | 
| 48 | 
            -
                    ds_out=[]
         | 
| 49 | 
            -
                    parts.each{|p|
         | 
| 50 | 
            -
                        
         | 
| 51 | 
            -
                    }
         | 
| 52 | 
            -
                end
         | 
| 53 | 
            -
                # Base class for Multiple Regression Engines
         | 
| 54 | 
            -
                class BaseEngine
         | 
| 55 | 
            -
                def initialize(ds,y_var)
         | 
| 56 | 
            -
                    @ds=ds
         | 
| 57 | 
            -
                    @y_var=y_var
         | 
| 58 | 
            -
                    @r2=nil
         | 
| 59 | 
            -
                end
         | 
| 60 | 
            -
                
         | 
| 61 | 
            -
                # Retrieves a vector with predicted values for y
         | 
| 62 | 
            -
                def predicted
         | 
| 63 | 
            -
                    (0...@ds.cases).collect { |i|
         | 
| 64 | 
            -
                        invalid=false
         | 
| 65 | 
            -
                        vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
         | 
| 66 | 
            -
                        if invalid
         | 
| 67 | 
            -
                            nil
         | 
| 68 | 
            -
                        else
         | 
| 69 | 
            -
                            process(vect)
         | 
| 70 | 
            -
                        end
         | 
| 71 | 
            -
                    }.to_vector(:scale)
         | 
| 72 | 
            -
                end
         | 
| 73 | 
            -
                # Retrieves a vector with standarized values for y
         | 
| 74 | 
            -
                def standarized_predicted
         | 
| 75 | 
            -
                    predicted.standarized
         | 
| 76 | 
            -
                end
         | 
| 77 | 
            -
                # Retrieves a vector with residuals values for y
         | 
| 78 | 
            -
                def residuals
         | 
| 79 | 
            -
                    (0...@ds.cases).collect{|i|
         | 
| 80 | 
            -
                        invalid=false
         | 
| 81 | 
            -
                        vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
         | 
| 82 | 
            -
                        if invalid or @ds[@y_var][i].nil?
         | 
| 83 | 
            -
                            nil
         | 
| 84 | 
            -
                        else
         | 
| 85 | 
            -
                         @ds[@y_var][i] - process(vect)
         | 
| 86 | 
            -
                     end
         | 
| 87 | 
            -
                    }.to_vector(:scale)
         | 
| 88 | 
            -
                end
         | 
| 89 | 
            -
                # R Multiple
         | 
| 90 | 
            -
                def r
         | 
| 91 | 
            -
                    raise "You should implement this"
         | 
| 92 | 
            -
                end
         | 
| 93 | 
            -
                # Sum of squares Total
         | 
| 94 | 
            -
                def sst
         | 
| 95 | 
            -
                    raise "You should implement this"
         | 
| 96 | 
            -
                end
         | 
| 97 | 
            -
                # Sum of squares (regression)
         | 
| 98 | 
            -
                def ssr
         | 
| 99 | 
            -
                    r2*sst
         | 
| 100 | 
            -
                end
         | 
| 101 | 
            -
                # Sum of squares (Error)
         | 
| 102 | 
            -
                def sse
         | 
| 103 | 
            -
                    sst - ssr
         | 
| 104 | 
            -
                end            
         | 
| 105 | 
            -
                # T values for coeffs
         | 
| 106 | 
            -
                def coeffs_t
         | 
| 107 | 
            -
                    out={}
         | 
| 108 | 
            -
                    se=coeffs_se
         | 
| 109 | 
            -
                    coeffs.each{|k,v|
         | 
| 110 | 
            -
                        out[k]=v / se[k] 
         | 
| 111 | 
            -
                    }
         | 
| 112 | 
            -
                    out
         | 
| 113 | 
            -
                end
         | 
| 114 | 
            -
                # Mean square Regression
         | 
| 115 | 
            -
                def msr
         | 
| 116 | 
            -
                    ssr.quo(df_r)
         | 
| 117 | 
            -
                end
         | 
| 118 | 
            -
                # Mean Square Error
         | 
| 119 | 
            -
                def mse
         | 
| 120 | 
            -
                    sse.quo(df_e)
         | 
| 121 | 
            -
                end            
         | 
| 122 | 
            -
                # Degrees of freedom for regression
         | 
| 123 | 
            -
                def df_r
         | 
| 124 | 
            -
                    @dep_columns.size
         | 
| 125 | 
            -
                end
         | 
| 126 | 
            -
                # Degrees of freedom for error
         | 
| 127 | 
            -
                def df_e
         | 
| 128 | 
            -
                    @ds_valid.cases-@dep_columns.size-1
         | 
| 129 | 
            -
                end
         | 
| 130 | 
            -
                # Fisher for Anova
         | 
| 131 | 
            -
                def f
         | 
| 132 | 
            -
                    (ssr.quo(df_r)).quo(sse.quo(df_e))
         | 
| 133 | 
            -
                end
         | 
| 134 | 
            -
                # Significance of Fisher
         | 
| 135 | 
            -
                def significance
         | 
| 136 | 
            -
                    1.0-Distribution::F.cdf(f,df_r,df_e)
         | 
| 137 | 
            -
                end
         | 
| 138 | 
            -
                    # Tolerance for a given variable
         | 
| 139 | 
            -
                    # http://talkstats.com/showthread.php?t=5056
         | 
| 140 | 
            -
                    def tolerance(var)
         | 
| 141 | 
            -
                        ds=assign_names(@dep_columns)
         | 
| 142 | 
            -
                        ds.each{|k,v|
         | 
| 143 | 
            -
                            ds[k]=v.to_vector(:scale)
         | 
| 144 | 
            -
                        }
         | 
| 145 | 
            -
                        lr=Multiple.listwise(ds.to_dataset,var)
         | 
| 146 | 
            -
                        1-lr.r2
         | 
| 147 | 
            -
                    end
         | 
| 148 | 
            -
                    # Tolerances for each coefficient
         | 
| 149 | 
            -
                    def coeffs_tolerances
         | 
| 150 | 
            -
                        @fields.inject({}) {|a,f|
         | 
| 151 | 
            -
                            a[f]=tolerance(f);
         | 
| 152 | 
            -
                            a
         | 
| 153 | 
            -
                        }
         | 
| 154 | 
            -
                    end
         | 
| 155 | 
            -
                    # Standard Error for coefficients
         | 
| 156 | 
            -
                    def coeffs_se
         | 
| 157 | 
            -
                        out={}
         | 
| 158 | 
            -
                        mse=sse.quo(df_e)
         | 
| 159 | 
            -
                        coeffs.each {|k,v|
         | 
| 160 | 
            -
                            out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares*tolerance(k)))
         | 
| 161 | 
            -
                        }
         | 
| 162 | 
            -
                        out
         | 
| 163 | 
            -
                    end
         | 
| 164 | 
            -
                    # Estimated Variance-Covariance Matrix
         | 
| 165 | 
            -
                    # Used for calculation of se of constant 
         | 
| 166 | 
            -
                    def estimated_variance_covariance_matrix
         | 
| 167 | 
            -
                        mse_p=mse
         | 
| 168 | 
            -
                        columns=[]
         | 
| 169 | 
            -
                        @ds_valid.each_vector{|k,v|
         | 
| 170 | 
            -
                            columns.push(v.data) unless k==@y_var
         | 
| 171 | 
            -
                        }
         | 
| 172 | 
            -
                        columns.unshift([1.0]*@ds_valid.cases)
         | 
| 173 | 
            -
                        x=Matrix.columns(columns)
         | 
| 174 | 
            -
                        matrix=((x.t*x)).inverse * mse
         | 
| 175 | 
            -
                        matrix.collect {|i|
         | 
| 176 | 
            -
                            Math::sqrt(i) if i>0
         | 
| 177 | 
            -
                        }
         | 
| 178 | 
            -
                    end
         | 
| 179 | 
            -
                    # T for constant
         | 
| 180 | 
            -
                    def constant_t
         | 
| 181 | 
            -
                        constant.to_f/constant_se 
         | 
| 182 | 
            -
                    end
         | 
| 183 | 
            -
                    # Standard error for constant
         | 
| 184 | 
            -
                    def constant_se
         | 
| 185 | 
            -
                        estimated_variance_covariance_matrix[0,0]
         | 
| 186 | 
            -
                    end
         | 
| 187 | 
            -
                    # Retrieves a summary for Regression
         | 
| 188 | 
            -
                    def summary(report_type=ConsoleSummary)
         | 
| 189 | 
            -
                    c=coeffs
         | 
| 190 | 
            -
                    out=""
         | 
| 191 | 
            -
                    out.extend report_type
         | 
| 192 | 
            -
                    out.add <<HEREDOC
         | 
| 193 | 
            -
            Summary for regression of #{@fields.join(',')} over #{@y_var}
         | 
| 194 | 
            -
            *************************************************************
         | 
| 195 | 
            -
            Engine: #{self.class}
         | 
| 196 | 
            -
            Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
         | 
| 197 | 
            -
            r=#{sprintf("%0.3f",r)}
         | 
| 198 | 
            -
            r2=#{sprintf("%0.3f",r2)}
         | 
| 199 | 
            -
            Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
         | 
| 200 | 
            -
            HEREDOC
         | 
| 201 | 
            -
             | 
| 202 | 
            -
                    out.add_line
         | 
| 203 | 
            -
                    out.add "ANOVA TABLE"
         | 
| 204 | 
            -
                    
         | 
| 205 | 
            -
                    t=Statsample::ReportTable.new(%w{source ss df ms f s})
         | 
| 206 | 
            -
                    t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
         | 
| 207 | 
            -
                    t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
         | 
| 208 | 
            -
                    
         | 
| 209 | 
            -
                    t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
         | 
| 210 30 |  | 
| 211 | 
            -
                     | 
| 212 | 
            -
                    
         | 
| 213 | 
            -
                     | 
| 214 | 
            -
                     | 
| 215 | 
            -
             | 
| 216 | 
            -
                    cse=coeffs_se
         | 
| 217 | 
            -
                    t=Statsample::ReportTable.new(%w{coeff b beta se t})
         | 
| 218 | 
            -
                    t.add_row(["Constant", sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
         | 
| 219 | 
            -
                    @fields.each{|f|
         | 
| 220 | 
            -
                    t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
         | 
| 221 | 
            -
                    }
         | 
| 222 | 
            -
                    out.parse_table(t)
         | 
| 223 | 
            -
                    
         | 
| 224 | 
            -
                    rescue
         | 
| 31 | 
            +
                    # Creates an object for pairwise regression
         | 
| 32 | 
            +
                    # For now, always retrieves a RubyEngine
         | 
| 33 | 
            +
                    #    lr=Statsample::Regression::Multiple.pairwise(ds,'y')
         | 
| 34 | 
            +
                    def self.pairwise(ds,y_var)
         | 
| 35 | 
            +
                      RubyEngine.new(ds,y_var)
         | 
| 225 36 | 
             
                    end
         | 
| 226 | 
            -
             | 
| 37 | 
            +
                    def self.listwise_by_exp(ds,exp)
         | 
| 38 | 
            +
                      raise "Not implemented yet"
         | 
| 227 39 | 
             
                    end
         | 
| 228 | 
            -
                     | 
| 229 | 
            -
             | 
| 230 | 
            -
             | 
| 231 | 
            -
             | 
| 232 | 
            -
             | 
| 233 | 
            -
             | 
| 40 | 
            +
                    # Returns a dataset and name of criteria using an expression.
         | 
| 41 | 
            +
                    # All nominal vectors are replaced by dummy coding
         | 
| 42 | 
            +
                    # and interactions are calculated
         | 
| 43 | 
            +
                    
         | 
| 44 | 
            +
                    def self.ds_by_exp(ds,exp)
         | 
| 45 | 
            +
                      raise "Not implemented"
         | 
| 46 | 
            +
                      parts=exp.split(/[\+=]/)
         | 
| 47 | 
            +
                      dependent=parts.pop
         | 
| 48 | 
            +
                      ds_out=[]
         | 
| 49 | 
            +
                      parts.each{|p|
         | 
| 50 | 
            +
                      
         | 
| 51 | 
            +
                      }
         | 
| 234 52 | 
             
                    end
         | 
| 235 | 
            -
             | 
| 236 53 |  | 
| 237 | 
            -
                # Deprecated
         | 
| 238 | 
            -
                # Sum of squares of error (manual calculation)
         | 
| 239 | 
            -
                # using the predicted value minus the y_i value
         | 
| 240 | 
            -
                def sse_manual
         | 
| 241 | 
            -
                    pr=predicted
         | 
| 242 | 
            -
                    cases=0
         | 
| 243 | 
            -
                    sse=(0...@ds.cases).inject(0) {|a,i|
         | 
| 244 | 
            -
                        if !@dy.data_with_nils[i].nil? and !pr[i].nil?
         | 
| 245 | 
            -
                            cases+=1
         | 
| 246 | 
            -
                            a+((pr[i]-@dy[i])**2)
         | 
| 247 | 
            -
                        else
         | 
| 248 | 
            -
                            a
         | 
| 249 | 
            -
                        end
         | 
| 250 | 
            -
                    }
         | 
| 251 | 
            -
                    sse*(min_n_valid-1.0).quo(cases-1)
         | 
| 252 | 
            -
                end
         | 
| 253 | 
            -
                # Sum of squares of regression
         | 
| 254 | 
            -
                # using the predicted value minus y mean
         | 
| 255 | 
            -
                def ssr_direct
         | 
| 256 | 
            -
                    mean=@dy.mean
         | 
| 257 | 
            -
                    cases=0
         | 
| 258 | 
            -
                    ssr=(0...@ds.cases).inject(0) {|a,i|
         | 
| 259 | 
            -
                        invalid=false
         | 
| 260 | 
            -
                        v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
         | 
| 261 | 
            -
                        if !invalid
         | 
| 262 | 
            -
                            cases+=1
         | 
| 263 | 
            -
                            a+((process(v)-mean)**2)
         | 
| 264 | 
            -
                        else
         | 
| 265 | 
            -
                            a
         | 
| 266 | 
            -
                        end
         | 
| 267 | 
            -
                    }
         | 
| 268 | 
            -
                    ssr
         | 
| 269 | 
            -
                end
         | 
| 270 | 
            -
                def sse_direct
         | 
| 271 | 
            -
                    sst-ssr
         | 
| 272 54 | 
             
                end
         | 
| 273 | 
            -
             | 
| 274 | 
            -
                    c=coeffs
         | 
| 275 | 
            -
                    total=constant
         | 
| 276 | 
            -
                    @fields.each_index{|i|
         | 
| 277 | 
            -
                    total+=c[@fields[i]]*v[i]
         | 
| 278 | 
            -
                    }
         | 
| 279 | 
            -
                    total
         | 
| 280 | 
            -
                end
         | 
| 281 | 
            -
            end
         | 
| 282 | 
            -
            end
         | 
| 283 | 
            -
            end
         | 
| 55 | 
            +
              end
         | 
| 284 56 | 
             
            end
         | 
| @@ -0,0 +1,235 @@ | |
| 1 | 
            +
            module Statsample
         | 
| 2 | 
            +
              module Regression
         | 
| 3 | 
            +
                module Multiple
         | 
| 4 | 
            +
                  # Base class for Multiple Regression Engines
         | 
| 5 | 
            +
                  class BaseEngine
         | 
| 6 | 
            +
                    def initialize(ds,y_var)
         | 
| 7 | 
            +
                    @ds=ds
         | 
| 8 | 
            +
                    @y_var=y_var
         | 
| 9 | 
            +
                    @r2=nil
         | 
| 10 | 
            +
                    end
         | 
| 11 | 
            +
                    
         | 
| 12 | 
            +
                    # Retrieves a vector with predicted values for y
         | 
| 13 | 
            +
                    def predicted
         | 
| 14 | 
            +
                    (0...@ds.cases).collect { |i|
         | 
| 15 | 
            +
                    invalid=false
         | 
| 16 | 
            +
                    vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
         | 
| 17 | 
            +
                    if invalid
         | 
| 18 | 
            +
                    nil
         | 
| 19 | 
            +
                    else
         | 
| 20 | 
            +
                    process(vect)
         | 
| 21 | 
            +
                    end
         | 
| 22 | 
            +
                    }.to_vector(:scale)
         | 
| 23 | 
            +
                    end
         | 
| 24 | 
            +
                    # Retrieves a vector with standarized values for y
         | 
| 25 | 
            +
                    def standarized_predicted
         | 
| 26 | 
            +
                    predicted.standarized
         | 
| 27 | 
            +
                    end
         | 
| 28 | 
            +
                    # Retrieves a vector with residual values for y
         | 
| 29 | 
            +
                    def residuals
         | 
| 30 | 
            +
                    (0...@ds.cases).collect{|i|
         | 
| 31 | 
            +
                    invalid=false
         | 
| 32 | 
            +
                    vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
         | 
| 33 | 
            +
                    if invalid or @ds[@y_var][i].nil?
         | 
| 34 | 
            +
                    nil
         | 
| 35 | 
            +
                    else
         | 
| 36 | 
            +
                    @ds[@y_var][i] - process(vect)
         | 
| 37 | 
            +
                    end
         | 
| 38 | 
            +
                    }.to_vector(:scale)
         | 
| 39 | 
            +
                    end
         | 
| 40 | 
            +
                    # R Multiple
         | 
| 41 | 
            +
                    def r
         | 
| 42 | 
            +
                    raise "You should implement this"
         | 
| 43 | 
            +
                    end
         | 
| 44 | 
            +
                    # Sum of squares Total
         | 
| 45 | 
            +
                    def sst
         | 
| 46 | 
            +
                    raise "You should implement this"
         | 
| 47 | 
            +
                    end
         | 
| 48 | 
            +
                    # Sum of squares (regression)
         | 
| 49 | 
            +
                    def ssr
         | 
| 50 | 
            +
                    r2*sst
         | 
| 51 | 
            +
                    end
         | 
| 52 | 
            +
                    # Sum of squares (Error)
         | 
| 53 | 
            +
                    def sse
         | 
| 54 | 
            +
                    sst - ssr
         | 
| 55 | 
            +
                    end            
         | 
| 56 | 
            +
                    # T values for coeffs
         | 
| 57 | 
            +
                    def coeffs_t
         | 
| 58 | 
            +
                    out={}
         | 
| 59 | 
            +
                    se=coeffs_se
         | 
| 60 | 
            +
                    coeffs.each{|k,v|
         | 
| 61 | 
            +
                    out[k]=v / se[k] 
         | 
| 62 | 
            +
                    }
         | 
| 63 | 
            +
                    out
         | 
| 64 | 
            +
                    end
         | 
| 65 | 
            +
                    # Mean square Regression
         | 
| 66 | 
            +
                    def msr
         | 
| 67 | 
            +
                    ssr.quo(df_r)
         | 
| 68 | 
            +
                    end
         | 
| 69 | 
            +
                    # Mean Square Error
         | 
| 70 | 
            +
                    def mse
         | 
| 71 | 
            +
                    sse.quo(df_e)
         | 
| 72 | 
            +
                    end            
         | 
| 73 | 
            +
                    # Degrees of freedom for regression
         | 
| 74 | 
            +
                    def df_r
         | 
| 75 | 
            +
                    @dep_columns.size
         | 
| 76 | 
            +
                    end
         | 
| 77 | 
            +
                    # Degrees of freedom for error
         | 
| 78 | 
            +
                    def df_e
         | 
| 79 | 
            +
                    @ds_valid.cases-@dep_columns.size-1
         | 
| 80 | 
            +
                    end
         | 
| 81 | 
            +
                    # Fisher for Anova
         | 
| 82 | 
            +
                    def f
         | 
| 83 | 
            +
                    (ssr.quo(df_r)).quo(sse.quo(df_e))
         | 
| 84 | 
            +
                    end
         | 
| 85 | 
            +
                    # Significance of the F statistic: one minus the F
                    # cumulative distribution evaluated at f with (df_r, df_e)
                    # degrees of freedom.
                    def significance
                      cumulative = Distribution::F.cdf(f, df_r, df_e)
                      1.0 - cumulative
                    end
         | 
| 89 | 
            +
                    # Tolerance for a given variable: 1 - r2 of the listwise
                    # regression fitted over the columns in @dep_columns, with
                    # +var+ passed to Multiple.listwise as the second argument
                    # (presumably the dependent variable of that auxiliary
                    # regression -- confirm against Multiple.listwise).
                    # http://talkstats.com/showthread.php?t=5056
                    def tolerance(var)
                      scaled = {}
                      assign_names(@dep_columns).each_pair do |name, column|
                        scaled[name] = column.to_vector(:scale)
                      end
                      1 - Multiple.listwise(scaled.to_dataset, var).r2
                    end
         | 
| 99 | 
            +
                    # Tolerances for each coefficient, as a hash keyed by
                    # every name in @fields.
                    def coeffs_tolerances
                      tolerances = {}
                      @fields.each { |field| tolerances[field] = tolerance(field) }
                      tolerances
                    end
         | 
| 106 | 
            +
                    # Standard error for each coefficient, as a hash keyed
                    # like coeffs:
                    #   sqrt( (sse/df_e) / (sum_of_squares(k) * tolerance(k)) )
                    # The sum of squares is read from @ds[k].
                    def coeffs_se
                      error_ms = sse.quo(df_e)
                      coeffs.inject({}) do |errors, (name, _value)|
                        denominator = @ds[name].sum_of_squares * tolerance(name)
                        errors[name] = Math::sqrt(error_ms / denominator)
                        errors
                      end
                    end
         | 
| 115 | 
            +
                    # Estimated Variance-Covariance Matrix.
                    # Used for calculation of se of constant.
                    #
                    # Builds the design matrix X from a leading column of 1.0
                    # plus the data of every vector in @ds_valid other than
                    # @y_var, then computes mse * (X'X)^-1.
                    #
                    # Note that the returned Matrix does not hold the raw
                    # (co)variances: each strictly positive entry is replaced
                    # by its square root and every other entry becomes nil.
                    def estimated_variance_covariance_matrix
                      # Evaluate mse once and reuse it below.
                      mse_p = mse
                      columns = []
                      @ds_valid.each_vector do |name, vector|
                        columns.push(vector.data) unless name == @y_var
                      end
                      # Intercept column goes first, so entry [0,0] belongs to the constant.
                      columns.unshift([1.0] * @ds_valid.cases)
                      x = Matrix.columns(columns)
                      matrix = (x.t * x).inverse * mse_p
                      matrix.collect { |entry| Math::sqrt(entry) if entry > 0 }
                    end
         | 
| 130 | 
            +
                    # t value for the constant: the constant (as a Float)
                    # divided by its standard error.
                    def constant_t
                      numerator = constant.to_f
                      numerator / constant_se
                    end
         | 
| 134 | 
            +
                    # Standard error for the constant: entry [0,0] of the
                    # matrix returned by estimated_variance_covariance_matrix.
                    def constant_se
                      root_matrix = estimated_variance_covariance_matrix
                      root_matrix[0, 0]
                    end
         | 
| 138 | 
            +
                    # Retrieves a summary for Regression
                    #
                    # Builds the report into an empty String extended with
                    # +report_type+, which must therefore provide the add,
                    # add_line and parse_table methods called below.
                    # The report contains, in order: a header (fields, y
                    # variable, engine class, case counts, r, r2 and the
                    # fitted equation), an ANOVA table and a coefficient table.
                    # Returns the extended String.
         | 
| 139 | 
            +
                    def summary(report_type=ConsoleSummary)
         | 
| 140 | 
            +
                    c=coeffs
         | 
| 141 | 
            +
                    out=""
         | 
| 142 | 
            +
                    out.extend report_type
         | 
| 143 | 
            +
                    out.add <<HEREDOC
         | 
| 144 | 
            +
                    Summary for regression of #{@fields.join(',')} over #{@y_var}
         | 
| 145 | 
            +
                    *************************************************************
         | 
| 146 | 
            +
                    Engine: #{self.class}
         | 
| 147 | 
            +
                    Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
         | 
| 148 | 
            +
                    r=#{sprintf("%0.3f",r)}
         | 
| 149 | 
            +
                    r2=#{sprintf("%0.3f",r2)}
         | 
| 150 | 
            +
                    Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
         | 
| 151 | 
            +
            HEREDOC
         | 
| 152 | 
            +
                    
         | 
| 153 | 
            +
                    out.add_line
         | 
| 154 | 
            +
                    out.add "ANOVA TABLE"
         | 
| 155 | 
            +
                    
         | 
| 156 | 
            +
                    t=Statsample::ReportTable.new(%w{source ss df ms f s})
         | 
| 157 | 
            +
                    t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
         | 
| 158 | 
            +
                    t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
         | 
| 159 | 
            +
                    
         | 
| 160 | 
            +
                    t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
         | 
| 161 | 
            +
                    
         | 
| 162 | 
            +
                    out.parse_table(t)
         | 
| 163 | 
            +
                    
         | 
| 164 | 
            +
                    # The coefficient table is best-effort: see the bare rescue below.
                    begin
         | 
| 165 | 
            +
                    out.add "Beta coefficientes"
         | 
| 166 | 
            +
                    sc=standarized_coeffs
         | 
| 167 | 
            +
                    cse=coeffs_se
         | 
| 168 | 
            +
                    t=Statsample::ReportTable.new(%w{coeff b beta se t})
         | 
| 169 | 
            +
                    t.add_row(["Constant", sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
         | 
| 170 | 
            +
                    @fields.each{|f|
         | 
| 171 | 
            +
                    t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
         | 
| 172 | 
            +
                    }
         | 
| 173 | 
            +
                    out.parse_table(t)
         | 
| 174 | 
            +
                    
         | 
| 175 | 
            +
                    # NOTE(review): any StandardError raised above is silently discarded;
                    # the heading has already been added, so it is left without a table.
                    rescue
         | 
| 176 | 
            +
                    end
         | 
| 177 | 
            +
                    out
         | 
| 178 | 
            +
                    end
         | 
| 179 | 
            +
                    # Pairs each name in @fields with the element of +c+ at
                    # the same index and returns the result as a hash.
                    def assign_names(c)
                      named = {}
                      @fields.each_with_index do |field, index|
                        named[field] = c[index]
                      end
                      named
                    end
         | 
| 186 | 
            +
                    
         | 
| 187 | 
            +
                    
         | 
| 188 | 
            +
                    # Deprecated
                    # Sum of squares of error (manual calculation): adds up
                    # (predicted - observed)**2 over every case where both the
                    # observed y and the predicted value are non-nil, then
                    # rescales the total by (min_n_valid - 1) / (cases used - 1).
                    def sse_manual
                      fitted = predicted
                      used = 0
                      squared_error = 0
                      @ds.cases.times do |i|
                        next if @dy.data_with_nils[i].nil? || fitted[i].nil?
                        used += 1
                        squared_error += (fitted[i] - @dy[i])**2
                      end
                      squared_error * (min_n_valid - 1.0).quo(used - 1)
                    end
         | 
| 204 | 
            +
                    # Sum of squares of regression
                    # using the predicted value minus y mean.
                    #
                    # For every case index, the values of all @dep_columns are
                    # gathered; cases with a nil in any column are skipped, the
                    # rest contribute (process(values) - mean of @dy)**2.
                    def ssr_direct
                      mean = @dy.mean
                      (0...@ds.cases).inject(0) do |total, i|
                        values = @dep_columns.collect { |column| column[i] }
                        if values.any? { |value| value.nil? }
                          # Incomplete case: leave the running total untouched.
                          total
                        else
                          total + ((process(values) - mean)**2)
                        end
                      end
                    end
         | 
| 221 | 
            +
                    # Sum of squares of error obtained by subtraction:
                    # total sum of squares (sst) minus regression sum of squares (ssr).
                    def sse_direct
                      total_ss = sst
                      total_ss - ssr
                    end
         | 
| 224 | 
            +
                    # Value of the fitted equation for one case: the constant
                    # plus, for each name in @fields, its coefficient times the
                    # element of +v+ at the same index.
                    def process(v)
                      weights = coeffs
                      (0...@fields.size).inject(constant) do |sum, index|
                        sum + weights[@fields[index]] * v[index]
                      end
                    end
         | 
| 232 | 
            +
                  end
         | 
| 233 | 
            +
                end
         | 
| 234 | 
            +
              end
         | 
| 235 | 
            +
            end
         |