statsample 0.6.5 → 0.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/Manifest.txt +6 -0
- data/README.txt +30 -12
- data/Rakefile +91 -0
- data/demo/levene.rb +9 -0
- data/demo/multiple_regression.rb +1 -7
- data/demo/polychoric.rb +1 -0
- data/demo/principal_axis.rb +8 -0
- data/lib/distribution/f.rb +22 -22
- data/lib/spss.rb +99 -99
- data/lib/statsample/bivariate/polychoric.rb +32 -22
- data/lib/statsample/bivariate/tetrachoric.rb +212 -207
- data/lib/statsample/bivariate.rb +6 -6
- data/lib/statsample/codification.rb +65 -65
- data/lib/statsample/combination.rb +60 -59
- data/lib/statsample/converter/csv19.rb +12 -12
- data/lib/statsample/converters.rb +1 -1
- data/lib/statsample/dataset.rb +93 -36
- data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
- data/lib/statsample/dominanceanalysis.rb +5 -6
- data/lib/statsample/factor/pca.rb +41 -11
- data/lib/statsample/factor/principalaxis.rb +105 -29
- data/lib/statsample/factor/rotation.rb +20 -3
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/graph/gdchart.rb +13 -13
- data/lib/statsample/graph/svggraph.rb +166 -167
- data/lib/statsample/matrix.rb +22 -12
- data/lib/statsample/mle/logit.rb +3 -2
- data/lib/statsample/mle/probit.rb +7 -5
- data/lib/statsample/mle.rb +4 -2
- data/lib/statsample/multiset.rb +125 -124
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression/binomial/logit.rb +4 -3
- data/lib/statsample/regression/binomial/probit.rb +2 -1
- data/lib/statsample/regression/binomial.rb +62 -81
- data/lib/statsample/regression/multiple/baseengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
- data/lib/statsample/regression/multiple.rb +15 -42
- data/lib/statsample/regression/simple.rb +93 -78
- data/lib/statsample/regression.rb +74 -2
- data/lib/statsample/reliability.rb +117 -120
- data/lib/statsample/srs.rb +156 -153
- data/lib/statsample/test/levene.rb +90 -0
- data/lib/statsample/test/umannwhitney.rb +25 -9
- data/lib/statsample/test.rb +2 -0
- data/lib/statsample/vector.rb +388 -413
- data/lib/statsample.rb +74 -30
- data/po/es/statsample.mo +0 -0
- data/test/test_bivariate.rb +5 -4
- data/test/test_combination.rb +1 -1
- data/test/test_dataset.rb +2 -2
- data/test/test_factor.rb +53 -6
- data/test/test_gsl.rb +1 -1
- data/test/test_mle.rb +1 -1
- data/test/test_regression.rb +18 -33
- data/test/test_statistics.rb +15 -33
- data/test/test_stest.rb +35 -0
- data/test/test_svg_graph.rb +2 -2
- data/test/test_vector.rb +331 -333
- metadata +38 -11
    
        data/lib/statsample/dataset.rb
    CHANGED
    
    | @@ -1,22 +1,23 @@ | |
| 1 1 | 
             
            require 'statsample/vector'
         | 
| 2 2 |  | 
| 3 3 | 
             
            class Hash
         | 
| 4 | 
            +
              # Creates a Statsample::Dataset based on a Hash 
         | 
| 4 5 | 
             
              def to_dataset(*args)
         | 
| 5 6 | 
             
                Statsample::Dataset.new(self,*args)
         | 
| 6 7 | 
             
              end
         | 
| 7 8 | 
             
            end
         | 
| 8 9 |  | 
| 9 10 | 
             
            class Array
         | 
| 10 | 
            -
              def prefix(s)
         | 
| 11 | 
            +
              def prefix(s) # :nodoc:
         | 
| 11 12 | 
             
                self.collect{|c| s+c.to_s }
         | 
| 12 13 | 
             
              end
         | 
| 13 | 
            -
              def suffix(s)
         | 
| 14 | 
            +
              def suffix(s) # :nodoc:
         | 
| 14 15 | 
             
                self.collect{|c| c.to_s+s }
         | 
| 15 16 | 
             
              end
         | 
| 16 17 | 
             
            end
         | 
| 17 18 |  | 
| 18 19 | 
             
            module Statsample
         | 
| 19 | 
            -
              class DatasetException < RuntimeError
         | 
| 20 | 
            +
              class DatasetException < RuntimeError # :nodoc:
         | 
| 20 21 | 
             
                attr_reader :ds,:exp
         | 
| 21 22 | 
             
                def initialize(ds,e)
         | 
| 22 23 | 
             
                  @ds=ds
         | 
| @@ -28,15 +29,49 @@ module Statsample | |
| 28 29 | 
             
                  m
         | 
| 29 30 | 
             
                end
         | 
| 30 31 | 
             
              end
         | 
| 32 | 
            +
              # Set of cases with values for one or more variables, 
         | 
| 33 | 
            +
              # analog to a dataframe on R or a standard data file of SPSS.
         | 
| 34 | 
            +
              # Every vector has a <tt>#field</tt> name, which represents it. By default,
         | 
| 35 | 
            +
              # the vectors are ordered by their field names, but you can change
         | 
| 36 | 
            +
              # the fields order manually.
         | 
| 37 | 
            +
              # The Dataset works as a Hash, where keys are field names
         | 
| 38 | 
            +
              # and values are Statsample::Vector  
         | 
| 39 | 
            +
              # 
         | 
| 40 | 
            +
              # 
         | 
| 41 | 
            +
              # ==Usage
         | 
| 42 | 
            +
              # Create an empty dataset
         | 
| 43 | 
            +
              #   Dataset.new()
         | 
| 44 | 
            +
              # Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>
         | 
| 45 | 
            +
              #   Dataset.new(%w{v1 v2 v3})
         | 
| 46 | 
            +
              # Create a dataset with two vectors
         | 
| 47 | 
            +
              #   Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
         | 
| 48 | 
            +
              # Create a dataset with two given vectors (v1 and v2), with vectors on inverted order
         | 
| 49 | 
            +
              #   Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
         | 
| 50 | 
            +
              #
         | 
| 51 | 
            +
              # The fast way to create a dataset uses Hash#to_dataset, with
         | 
| 52 | 
            +
              # field order  as arguments
         | 
| 53 | 
            +
              #   v1 = [1,2,3].to_scale
         | 
| 54 | 
            +
              #   v2 = [1,2,3].to_scale
         | 
| 55 | 
            +
              #   ds = {'v1'=>v1, 'v2'=>v2}.to_dataset(%w{v2 v1})
         | 
| 56 | 
            +
              
         | 
| 31 57 | 
             
              class Dataset
         | 
| 32 58 | 
             
                include Writable
         | 
| 33 | 
            -
                 | 
| 59 | 
            +
                # Hash of Statsample::Vector
         | 
| 60 | 
            +
                attr_reader :vectors
         | 
| 61 | 
            +
                # Ordered names of vectors
         | 
| 62 | 
            +
                attr_reader :fields
         | 
| 63 | 
            +
                # Number of cases
         | 
| 64 | 
            +
                attr_reader :cases
         | 
| 65 | 
            +
                # Location of pointer on enumerations methods (like #each)
         | 
| 66 | 
            +
                attr_reader :i
         | 
| 67 | 
            +
                # Deprecated: Label of vectors
         | 
| 34 68 | 
             
                attr_accessor :labels
         | 
| 35 69 |  | 
| 36 70 | 
             
                # Generates a new dataset, using three vectors
         | 
| 37 71 | 
             
                # - Rows
         | 
| 38 72 | 
             
                # - Columns
         | 
| 39 73 | 
             
                # - Values
         | 
| 74 | 
            +
                #
         | 
| 40 75 | 
             
                # For example, you have these values
         | 
| 41 76 | 
             
                #
         | 
| 42 77 | 
             
                #   x   y   v
         | 
| @@ -88,16 +123,7 @@ module Statsample | |
| 88 123 | 
             
                # order of variables. If empty, vector keys in alphabetical order are
         | 
| 89 124 | 
             
                # used as fields
         | 
| 90 125 | 
             
                # [labels]  Hash to set names for fields.
         | 
| 91 | 
            -
             | 
| 92 | 
            -
                #
         | 
| 93 | 
            -
                #   Dataset.new()
         | 
| 94 | 
            -
                #   Dataset.new(%w{v1 v2 v3})
         | 
| 95 | 
            -
                #   Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
         | 
| 96 | 
            -
                #   Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
         | 
| 97 | 
            -
                #
         | 
| 98 | 
            -
                # The fast way to create a dataset uses Hash#to_dataset, with
         | 
| 99 | 
            -
                # fields and labels as arguments
         | 
| 100 | 
            -
                #   ds = {'v1'=>[1,2,3].to_vector}.to_dataset
         | 
| 126 | 
            +
             | 
| 101 127 | 
             
                #
         | 
| 102 128 | 
             
                def initialize(vectors={}, fields=[], labels={})
         | 
| 103 129 | 
             
                  if vectors.instance_of? Array
         | 
| @@ -120,7 +146,8 @@ module Statsample | |
| 120 146 | 
             
                  end
         | 
| 121 147 | 
             
                  matrix
         | 
| 122 148 | 
             
                end
         | 
| 123 | 
            -
                 | 
| 149 | 
            +
                # Retrieves the label for a vector, given its field name.
         | 
| 150 | 
            +
                def label(v_id) 
         | 
| 124 151 | 
             
                  raise "Vector #{v} doesn't exists" unless @fields.include? v_id
         | 
| 125 152 | 
             
                  @labels[v_id].nil? ? v_id : @labels[v_id]
         | 
| 126 153 | 
             
                end
         | 
| @@ -233,12 +260,20 @@ module Statsample | |
| 233 260 | 
             
                  ds_boot.update_valid_data
         | 
| 234 261 | 
             
                  ds_boot
         | 
| 235 262 | 
             
                end
         | 
| 236 | 
            -
                # Fast version of  | 
| 263 | 
            +
                # Fast version of #add_case.
         | 
| 237 264 | 
             
                # Can only add one case, and no error check is performed
         | 
| 238 | 
            -
                # You SHOULD use update_valid_data at the end of insertion cycle
         | 
| 265 | 
            +
                # You SHOULD use #update_valid_data at the end of insertion cycle
         | 
| 239 266 | 
             
                def add_case_array(v)
         | 
| 240 267 | 
             
                  v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
         | 
| 241 268 | 
             
                end
         | 
| 269 | 
            +
                # Insert a case, using:
         | 
| 270 | 
            +
                # * Array: size equal to number of vectors and values in the same order as fields
         | 
| 271 | 
            +
                # * Hash: keys equal to fields
         | 
| 272 | 
            +
                # If uvd is false, #update_valid_data is not executed after 
         | 
| 273 | 
            +
                # inserting a case. This is very useful if you want to increase the 
         | 
| 274 | 
            +
                # performance on inserting many cases, 
         | 
| 275 | 
            +
                # because #update_valid_data performs check on vectors and on the dataset
         | 
| 276 | 
            +
                
         | 
| 242 277 | 
             
                def add_case(v,uvd=true)
         | 
| 243 278 | 
             
                  case v
         | 
| 244 279 | 
             
                  when Array
         | 
| @@ -258,14 +293,18 @@ module Statsample | |
| 258 293 | 
             
                    update_valid_data
         | 
| 259 294 | 
             
                  end
         | 
| 260 295 | 
             
                end
         | 
| 296 | 
            +
                # Check vectors and fields after inserting data. Use only 
         | 
| 297 | 
            +
                # after #add_case_array or #add_case with the second parameter set to false
         | 
| 261 298 | 
             
                def update_valid_data
         | 
| 262 299 | 
             
                  @fields.each{|f| @vectors[f].set_valid_data}
         | 
| 263 300 | 
             
                  check_length
         | 
| 264 301 | 
             
                end
         | 
| 302 | 
            +
                # Delete a vector
         | 
| 265 303 | 
             
                def delete_vector(name)
         | 
| 266 304 | 
             
                  @fields.delete(name)
         | 
| 267 305 | 
             
                  @vectors.delete(name)
         | 
| 268 306 | 
             
                end
         | 
| 307 | 
            +
                
         | 
| 269 308 | 
             
                def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
         | 
| 270 309 | 
             
                  split=@vectors[name].split_by_separator(sep)
         | 
| 271 310 | 
             
                  i=1
         | 
| @@ -294,7 +333,7 @@ module Statsample | |
| 294 333 | 
             
            		def vector_sum(fields=nil)
         | 
| 295 334 | 
             
            			a=[]
         | 
| 296 335 | 
             
            			fields||=@fields
         | 
| 297 | 
            -
            			collect_with_index do |i | 
| 336 | 
            +
            			collect_with_index do |row, i|
         | 
| 298 337 | 
             
            				if(fields.find{|f| !@vectors[f].data_with_nils[i]})
         | 
| 299 338 | 
             
            					nil
         | 
| 300 339 | 
             
            				else
         | 
| @@ -302,16 +341,17 @@ module Statsample | |
| 302 341 | 
             
            				end
         | 
| 303 342 | 
             
                  end
         | 
| 304 343 | 
             
            		end
         | 
| 344 | 
            +
                # Check if #fields attribute is correct, after inserting or deleting vectors
         | 
| 305 345 | 
             
                def check_fields(fields)
         | 
| 306 346 | 
             
                  fields||=@fields
         | 
| 307 347 | 
             
                  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
         | 
| 308 348 | 
             
                  fields
         | 
| 309 349 | 
             
                end
         | 
| 350 | 
            +
                
         | 
| 310 351 | 
             
                # Returns a vector with the numbers of missing values for a case
         | 
| 311 | 
            -
             | 
| 312 352 | 
             
                def vector_missing_values(fields=nil)
         | 
| 313 353 | 
             
                  fields=check_fields(fields)
         | 
| 314 | 
            -
                  collect_with_index do |i | 
| 354 | 
            +
                  collect_with_index do |row, i|
         | 
| 315 355 | 
             
                    fields.inject(0) {|a,v|
         | 
| 316 356 | 
             
                      a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
         | 
| 317 357 | 
             
                    }
         | 
| @@ -319,9 +359,8 @@ module Statsample | |
| 319 359 | 
             
                end
         | 
| 320 360 | 
             
                def vector_count_characters(fields=nil)
         | 
| 321 361 | 
             
                  fields=check_fields(fields)
         | 
| 322 | 
            -
                  collect_with_index do |i | 
| 362 | 
            +
                  collect_with_index do |row, i|
         | 
| 323 363 | 
             
                    fields.inject(0){|a,v|
         | 
| 324 | 
            -
             | 
| 325 364 | 
             
                      a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
         | 
| 326 365 | 
             
                    }
         | 
| 327 366 | 
             
                  end
         | 
| @@ -353,7 +392,8 @@ module Statsample | |
| 353 392 | 
             
                  end
         | 
| 354 393 | 
             
                  a.to_vector(:scale)
         | 
| 355 394 | 
             
                end
         | 
| 356 | 
            -
                 | 
| 395 | 
            +
                # Check vectors for type and size.
         | 
| 396 | 
            +
                def check_length # :nodoc:
         | 
| 357 397 | 
             
                  size=nil
         | 
| 358 398 | 
             
                  @vectors.each do |k,v|
         | 
| 359 399 | 
             
                    raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
         | 
| @@ -368,16 +408,19 @@ module Statsample | |
| 368 408 | 
             
                  end
         | 
| 369 409 | 
             
                  @cases=size
         | 
| 370 410 | 
             
                end
         | 
| 371 | 
            -
                 | 
| 372 | 
            -
             | 
| 411 | 
            +
                # Retrieves each vector as [key, vector]
         | 
| 412 | 
            +
                def each_vector # :yield: |key, vector|
         | 
| 413 | 
            +
                  @fields.each{|k| yield k, @vectors[k]}
         | 
| 373 414 | 
             
                end
         | 
| 415 | 
            +
                
         | 
| 374 416 | 
             
                if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
         | 
| 375 417 | 
             
                  def case_as_hash(c) # :nodoc:
         | 
| 376 418 | 
             
                    Statsample::STATSAMPLE__.case_as_hash(self,c)
         | 
| 377 419 | 
             
                  end
         | 
| 378 420 | 
             
                else
         | 
| 379 | 
            -
                   | 
| 380 | 
            -
             | 
| 421 | 
            +
                  # Retrieves case i as a hash
         | 
| 422 | 
            +
                  def case_as_hash(i)
         | 
| 423 | 
            +
                    _case_as_hash(i)
         | 
| 381 424 | 
             
                  end
         | 
| 382 425 | 
             
                end
         | 
| 383 426 |  | 
| @@ -386,8 +429,9 @@ module Statsample | |
| 386 429 | 
             
                    Statsample::STATSAMPLE__.case_as_array(self,c)
         | 
| 387 430 | 
             
                  end
         | 
| 388 431 | 
             
                else
         | 
| 389 | 
            -
                   | 
| 390 | 
            -
             | 
| 432 | 
            +
                  # Retrieves case i as an array, ordered on #fields order
         | 
| 433 | 
            +
                  def case_as_array(i)
         | 
| 434 | 
            +
                    _case_as_array(i)
         | 
| 391 435 | 
             
                  end
         | 
| 392 436 | 
             
                end
         | 
| 393 437 | 
             
                def _case_as_hash(c) # :nodoc:
         | 
| @@ -396,6 +440,7 @@ module Statsample | |
| 396 440 | 
             
                def _case_as_array(c) # :nodoc:
         | 
| 397 441 | 
             
                  @fields.collect {|x| @vectors[x][c]}
         | 
| 398 442 | 
             
                end
         | 
| 443 | 
            +
                
         | 
| 399 444 | 
             
                # Returns each case as a hash
         | 
| 400 445 | 
             
                def each
         | 
| 401 446 | 
             
                  begin
         | 
| @@ -411,7 +456,7 @@ module Statsample | |
| 411 456 | 
             
                  end
         | 
| 412 457 | 
             
                end
         | 
| 413 458 | 
             
                # Returns each case as hash and index
         | 
| 414 | 
            -
                def each_with_index
         | 
| 459 | 
            +
                def each_with_index # :yield: |case, i|
         | 
| 415 460 | 
             
                  begin
         | 
| 416 461 | 
             
                    @i=0
         | 
| 417 462 | 
             
                    @cases.times{|i|
         | 
| @@ -447,6 +492,7 @@ module Statsample | |
| 447 492 | 
             
                  }
         | 
| 448 493 | 
             
                  @i=nil
         | 
| 449 494 | 
             
                end
         | 
| 495 | 
            +
                # Set fields order. If you omit one or more vectors,  
         | 
| 450 496 | 
             
                def fields=(f)
         | 
| 451 497 | 
             
                  @fields=f
         | 
| 452 498 | 
             
                  check_order
         | 
| @@ -470,6 +516,8 @@ module Statsample | |
| 470 516 | 
             
                    raise ArgumentError, "You need a String or a Range"
         | 
| 471 517 | 
             
                  end
         | 
| 472 518 | 
             
                end
         | 
| 519 | 
            +
                # Retrieves a Statsample::Vector, based on the result
         | 
| 520 | 
            +
                # of calculation performed on each case.
         | 
| 473 521 | 
             
                def collect(type=:scale)
         | 
| 474 522 | 
             
                  data=[]
         | 
| 475 523 | 
             
                  each {|row|
         | 
| @@ -477,10 +525,11 @@ module Statsample | |
| 477 525 | 
             
                  }
         | 
| 478 526 | 
             
                  Statsample::Vector.new(data,type)
         | 
| 479 527 | 
             
                end
         | 
| 528 | 
            +
                # Same as #collect, but giving case index as second parameter on yield.
         | 
| 480 529 | 
             
                def collect_with_index(type=:scale)
         | 
| 481 530 | 
             
                  data=[]
         | 
| 482 531 | 
             
                  each_with_index {|row, i|
         | 
| 483 | 
            -
                    data.push(yield(i | 
| 532 | 
            +
                    data.push(yield(row, i))
         | 
| 484 533 | 
             
                  }
         | 
| 485 534 | 
             
                  Statsample::Vector.new(data,type)
         | 
| 486 535 | 
             
                end
         | 
| @@ -504,6 +553,8 @@ module Statsample | |
| 504 553 | 
             
                    raise ArgumentError,"Should pass a Statsample::Vector"
         | 
| 505 554 | 
             
                  end
         | 
| 506 555 | 
             
                end
         | 
| 556 | 
            +
                # Return data as a matrix. Columns are ordered by #fields and
         | 
| 557 | 
            +
                # rows by order of insertion
         | 
| 507 558 | 
             
                def to_matrix
         | 
| 508 559 | 
             
                  rows=[]
         | 
| 509 560 | 
             
                  self.each_array{|c|
         | 
| @@ -511,7 +562,8 @@ module Statsample | |
| 511 562 | 
             
                  }
         | 
| 512 563 | 
             
                  Matrix.rows(rows)
         | 
| 513 564 | 
             
                end
         | 
| 514 | 
            -
                 | 
| 565 | 
            +
                
         | 
| 566 | 
            +
                if Statsample.has_gsl?
         | 
| 515 567 | 
             
                  def to_matrix_gsl
         | 
| 516 568 | 
             
                  rows=[]
         | 
| 517 569 | 
             
                  self.each_array{|c|
         | 
| @@ -520,15 +572,17 @@ module Statsample | |
| 520 572 | 
             
                  GSL::Matrix.alloc(*rows)
         | 
| 521 573 | 
             
                  end
         | 
| 522 574 | 
             
                end
         | 
| 523 | 
            -
            		 | 
| 575 | 
            +
            		
         | 
| 576 | 
            +
                def to_multiset_by_split(*fields)
         | 
| 524 577 | 
             
            			require 'statsample/multiset'
         | 
| 525 578 | 
             
            			if fields.size==1
         | 
| 526 579 | 
             
            				to_multiset_by_split_one_field(fields[0])
         | 
| 527 580 | 
             
            			else
         | 
| 528 581 | 
             
            				to_multiset_by_split_multiple_fields(*fields)
         | 
| 529 582 | 
             
            			end
         | 
| 530 | 
            -
             | 
| 531 | 
            -
                 | 
| 583 | 
            +
                end
         | 
| 584 | 
            +
                
         | 
| 585 | 
            +
                # Create a new dataset with all cases for which the block returns true
         | 
| 532 586 | 
             
                def filter
         | 
| 533 587 | 
             
                  ds=self.dup_empty
         | 
| 534 588 | 
             
                  each {|c|
         | 
| @@ -537,6 +591,7 @@ module Statsample | |
| 537 591 | 
             
                  ds.update_valid_data
         | 
| 538 592 | 
             
                  ds
         | 
| 539 593 | 
             
                end
         | 
| 594 | 
            +
                
         | 
| 540 595 | 
             
            		# creates a new vector with the data of a given field which the block returns true
         | 
| 541 596 | 
             
            		def filter_field(field)
         | 
| 542 597 | 
             
            			a=[]
         | 
| @@ -545,6 +600,7 @@ module Statsample | |
| 545 600 | 
             
            			}
         | 
| 546 601 | 
             
            			a.to_vector(@vectors[field].type)
         | 
| 547 602 | 
             
            		end
         | 
| 603 | 
            +
                
         | 
| 548 604 | 
             
                def to_multiset_by_split_one_field(field)
         | 
| 549 605 | 
             
                  raise ArgumentError,"Should use a correct field name" if !@fields.include? field
         | 
| 550 606 | 
             
                  factors=@vectors[field].factors
         | 
| @@ -604,7 +660,7 @@ module Statsample | |
| 604 660 | 
             
                      text.gsub!(f,"row['#{f}']")
         | 
| 605 661 | 
             
                    end
         | 
| 606 662 | 
             
                  }
         | 
| 607 | 
            -
                  collect_with_index {|i | 
| 663 | 
            +
                  collect_with_index {|row, i|
         | 
| 608 664 | 
             
                    invalid=false
         | 
| 609 665 | 
             
                    @fields.each{|f|
         | 
| 610 666 | 
             
                      if @vectors[f].data_with_nils[i].nil?
         | 
| @@ -653,6 +709,7 @@ module Statsample | |
| 653 709 | 
             
                end
         | 
| 654 710 | 
             
                # Creates a new dataset for one to many relations
         | 
| 655 711 | 
             
                # on a dataset, based on pattern of field names.
         | 
| 712 | 
            +
                # 
         | 
| 656 713 | 
             
                # for example, you have a survey for number of children
         | 
| 657 714 | 
             
                # with this structure:
         | 
| 658 715 | 
             
                #   id, name, child_name_1, child_age_1, child_name_2, child_age_2
         | 
| @@ -1,8 +1,70 @@ | |
| 1 1 | 
             
            module Statsample
         | 
| 2 2 | 
             
              class DominanceAnalysis
         | 
| 3 | 
            +
                # == Goal
         | 
| 3 4 | 
             
                # Generates Bootstrap sample to identify the replicability of a Dominance Analysis. See Azen & Budescu (2003) for more information.
         | 
| 4 | 
            -
                # | 
| 5 | 
            -
                #  | 
| 5 | 
            +
                #
         | 
| 6 | 
            +
                # == Usage
         | 
| 7 | 
            +
                # 
         | 
| 8 | 
            +
                #  require 'statsample'
         | 
| 9 | 
            +
                #  a=100.times.collect {rand}.to_scale
         | 
| 10 | 
            +
                #  b=100.times.collect {rand}.to_scale
         | 
| 11 | 
            +
                #  c=100.times.collect {rand}.to_scale
         | 
| 12 | 
            +
                #  d=100.times.collect {rand}.to_scale
         | 
| 13 | 
            +
                #  ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
         | 
| 14 | 
            +
                #  ds['y']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
         | 
| 15 | 
            +
                #  dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, 'y', :debug=>true)
         | 
| 16 | 
            +
                #  dab.bootstrap(100,nil)
         | 
| 17 | 
            +
                #  puts dab.summary
         | 
| 18 | 
            +
                # <strong>Output</strong>
         | 
| 19 | 
            +
                #   Sample size: 100
         | 
| 20 | 
            +
                #  t: 1.98421693632958
         | 
| 21 | 
            +
                #  
         | 
| 22 | 
            +
                #  Linear Regression Engine: Statsample::Regression::Multiple::MatrixEngine
         | 
| 23 | 
            +
                #  Table: Bootstrap report
         | 
| 24 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 25 | 
            +
                #  | pairs                 | sD  | Dij    | SE(Dij) | Pij   | Pji   | Pno   | Reproducibility |
         | 
| 26 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 27 | 
            +
                #  | Complete dominance    |
         | 
| 28 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 29 | 
            +
                #  | a - b                 | 1.0 | 0.6150 | 0.454   | 0.550 | 0.320 | 0.130 | 0.550           |
         | 
| 30 | 
            +
                #  | a - c                 | 1.0 | 0.9550 | 0.175   | 0.930 | 0.020 | 0.050 | 0.930           |
         | 
| 31 | 
            +
                #  | a - d                 | 1.0 | 0.9750 | 0.131   | 0.960 | 0.010 | 0.030 | 0.960           |
         | 
| 32 | 
            +
                #  | b - c                 | 1.0 | 0.8800 | 0.276   | 0.820 | 0.060 | 0.120 | 0.820           |
         | 
| 33 | 
            +
                #  | b - d                 | 1.0 | 0.9250 | 0.193   | 0.860 | 0.010 | 0.130 | 0.860           |
         | 
| 34 | 
            +
                #  | c - d                 | 0.5 | 0.5950 | 0.346   | 0.350 | 0.160 | 0.490 | 0.490           |
         | 
| 35 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 36 | 
            +
                #  | Conditional dominance |
         | 
| 37 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 38 | 
            +
                #  | a - b                 | 1.0 | 0.6300 | 0.458   | 0.580 | 0.320 | 0.100 | 0.580           |
         | 
| 39 | 
            +
                #  | a - c                 | 1.0 | 0.9700 | 0.156   | 0.960 | 0.020 | 0.020 | 0.960           |
         | 
| 40 | 
            +
                #  | a - d                 | 1.0 | 0.9800 | 0.121   | 0.970 | 0.010 | 0.020 | 0.970           |
         | 
| 41 | 
            +
                #  | b - c                 | 1.0 | 0.8850 | 0.283   | 0.840 | 0.070 | 0.090 | 0.840           |
         | 
| 42 | 
            +
                #  | b - d                 | 1.0 | 0.9500 | 0.181   | 0.920 | 0.020 | 0.060 | 0.920           |
         | 
| 43 | 
            +
                #  | c - d                 | 0.5 | 0.5800 | 0.360   | 0.350 | 0.190 | 0.460 | 0.460           |
         | 
| 44 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 45 | 
            +
                #  | General Dominance     |
         | 
| 46 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 47 | 
            +
                #  | a - b                 | 1.0 | 0.6500 | 0.479   | 0.650 | 0.350 | 0.000 | 0.650           |
         | 
| 48 | 
            +
                #  | a - c                 | 1.0 | 0.9800 | 0.141   | 0.980 | 0.020 | 0.000 | 0.980           |
         | 
| 49 | 
            +
                #  | a - d                 | 1.0 | 0.9900 | 0.100   | 0.990 | 0.010 | 0.000 | 0.990           |
         | 
| 50 | 
            +
                #  | b - c                 | 1.0 | 0.9000 | 0.302   | 0.900 | 0.100 | 0.000 | 0.900           |
         | 
| 51 | 
            +
                #  | b - d                 | 1.0 | 0.9700 | 0.171   | 0.970 | 0.030 | 0.000 | 0.970           |
         | 
| 52 | 
            +
                #  | c - d                 | 1.0 | 0.5600 | 0.499   | 0.560 | 0.440 | 0.000 | 0.560           |
         | 
| 53 | 
            +
                #  --------------------------------------------------------------------------------------------
         | 
| 54 | 
            +
                #  
         | 
| 55 | 
            +
                #  Table: General averages
         | 
| 56 | 
            +
                #  ---------------------------------------
         | 
| 57 | 
            +
                #  | var | mean  | se    | p.5   | p.95  |
         | 
| 58 | 
            +
                #  ---------------------------------------
         | 
| 59 | 
            +
                #  | a   | 0.133 | 0.049 | 0.062 | 0.218 |
         | 
| 60 | 
            +
                #  | b   | 0.106 | 0.048 | 0.029 | 0.199 |
         | 
| 61 | 
            +
                #  | c   | 0.035 | 0.032 | 0.002 | 0.106 |
         | 
| 62 | 
            +
                #  | d   | 0.023 | 0.019 | 0.002 | 0.062 |
         | 
| 63 | 
            +
                #  ---------------------------------------
         | 
| 64 | 
            +
                #
         | 
| 65 | 
            +
                # == References:
         | 
| 66 | 
            +
                #
         | 
| 67 | 
            +
                # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
         | 
| 6 68 | 
             
                class Bootstrap
         | 
| 7 69 | 
             
                  include GetText
         | 
| 8 70 | 
             
                  include Writable
         | 
| @@ -27,12 +89,13 @@ module Statsample | |
| 27 89 | 
             
                  attr_accessor :alpha
         | 
| 28 90 | 
             
                  # Debug?
         | 
| 29 91 | 
             
                  attr_accessor :debug
         | 
| 92 | 
            +
                  # Default level of confidence for t calculation
         | 
| 93 | 
            +
                  ALPHA=0.95
         | 
| 30 94 | 
             
                  # Create a new Dominance Analysis Bootstrap Object
         | 
| 31 95 | 
             
                  # 
         | 
| 32 96 | 
             
                  # * ds: A Dataset object
         | 
| 33 97 | 
             
                  # * y_var: Name of dependent variable
         | 
| 34 98 | 
             
                  # * opts: Any other attribute of the class 
         | 
| 35 | 
            -
                  ALPHA=0.95
         | 
| 36 99 | 
             
                  def initialize(ds,y_var, opts=Hash.new)
         | 
| 37 100 | 
             
                    @ds=ds
         | 
| 38 101 | 
             
                    @y_var=y_var
         | 
| @@ -1,13 +1,12 @@ | |
| 1 1 | 
             
            require 'statsample/dominanceanalysis/bootstrap'
         | 
| 2 2 | 
             
            module Statsample
         | 
| 3 | 
            -
              # Dominance Analysis is a procedure based on an examination of the R | 
| 3 | 
            +
              # Dominance Analysis is a procedure based on an examination of the R<sup>2</sup> values
         | 
| 4 4 | 
             
              # for all possible subset models, to identify the relevance of one or more 
         | 
| 5 5 | 
             
              # predictors in the prediction of the criterion.
         | 
| 6 6 | 
             
              #
         | 
| 7 | 
            -
              #
         | 
| 8 7 | 
             
              # See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
         | 
| 9 8 | 
             
              #
         | 
| 10 | 
            -
              #  | 
| 9 | 
            +
              # == Use
         | 
| 11 10 | 
             
              #
         | 
| 12 11 | 
             
              #  a=1000.times.collect {rand}.to_scale
         | 
| 13 12 | 
             
              #  b=1000.times.collect {rand}.to_scale
         | 
| @@ -17,7 +16,7 @@ module Statsample | |
| 17 16 | 
             
              #  da=Statsample::DominanceAnalysis.new(ds,'y')
         | 
| 18 17 | 
             
              #  puts da.summary
         | 
| 19 18 | 
             
              # 
         | 
| 20 | 
            -
              # Output:
         | 
| 19 | 
            +
              # === Output:
         | 
| 21 20 | 
             
              #
         | 
| 22 21 | 
             
              #  Report: Report 2010-02-08 19:10:11 -0300
         | 
| 23 22 | 
             
              #  Table: Dominance Analysis result
         | 
| @@ -51,12 +50,12 @@ module Statsample | |
| 51 50 | 
             
              #  | a - c | 1.0   | 1.0         | 1.0     |
         | 
| 52 51 | 
             
              #  | b - c | 1.0   | 1.0         | 1.0     |
         | 
| 53 52 | 
             
              #  -----------------------------------------
         | 
| 54 | 
            -
             | 
| 55 53 | 
             
              #
         | 
| 56 54 | 
             
              # == References:
         | 
| 57 55 | 
             
              # * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
         | 
| 58 56 | 
             
              # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
         | 
| 59 57 | 
             
              # * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
         | 
| 58 | 
            +
              #
         | 
| 60 59 | 
             
              class DominanceAnalysis
         | 
| 61 60 | 
             
                include GetText
         | 
| 62 61 | 
             
                bindtextdomain("statsample")
         | 
| @@ -366,7 +365,7 @@ module Statsample | |
| 366 365 | 
             
                  generator.parse_element(t)
         | 
| 367 366 | 
             
                  generator.add_html("</div>")
         | 
| 368 367 | 
             
                end
         | 
| 369 | 
            -
                class ModelData
         | 
| 368 | 
            +
                class ModelData # :nodoc:
         | 
| 370 369 | 
             
                  attr_reader :contributions
         | 
| 371 370 | 
             
                  def initialize(independent, data, da)
         | 
| 372 371 | 
             
                    @independent=independent
         | 
| @@ -1,21 +1,42 @@ | |
| 1 1 | 
             
            module Statsample
         | 
| 2 2 | 
             
            module Factor
         | 
| 3 | 
            -
              # Principal Component Analysis of a  | 
| 4 | 
            -
              #  | 
| 5 | 
            -
              # | 
| 6 | 
            -
              # | 
| 3 | 
            +
              # Principal Component Analysis (PCA) of a 
         | 
| 4 | 
            +
              # covariance or correlation matrix. 
         | 
| 5 | 
            +
              #
         | 
| 6 | 
            +
              # For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
         | 
| 7 | 
            +
              # 
         | 
| 8 | 
            +
              # == Usage:
         | 
| 9 | 
            +
              #   require 'statsample'
         | 
| 7 10 | 
             
              #   a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
         | 
| 8 11 | 
             
              #   b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
         | 
| 9 12 | 
             
              #   ds={'a'=>a,'b'=>b}.to_dataset
         | 
| 10 13 | 
             
              #   cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
         | 
| 11 14 | 
             
              #   pca=Statsample::Factor::PCA.new(cor_matrix)
         | 
| 12 | 
            -
              #    | 
| 15 | 
            +
              #   pca.m
         | 
| 16 | 
            +
              #   => 1
         | 
| 17 | 
            +
              #   pca.eigenvalues
         | 
| 18 | 
            +
              #   => [1.92592927269225, 0.0740707273077545]
         | 
| 19 | 
            +
              #   pca.component_matrix
         | 
| 20 | 
            +
              #   => GSL::Matrix
         | 
| 21 | 
            +
              #   [  9.813e-01 
         | 
| 22 | 
            +
              #     9.813e-01 ]
         | 
| 23 | 
            +
              #   pca.communalities
         | 
| 24 | 
            +
              #   => [0.962964636346122, 0.962964636346122]
         | 
| 25 | 
            +
              #
         | 
| 26 | 
            +
              # == References:
         | 
| 27 | 
            +
              #
         | 
| 28 | 
            +
              # * SPSS manual
         | 
| 29 | 
            +
              # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf 
         | 
| 30 | 
            +
              # 
         | 
| 13 31 | 
             
              class PCA
         | 
| 14 | 
            -
                 | 
| 32 | 
            +
                # Name of analysis
         | 
| 33 | 
            +
                attr_accessor :name
         | 
| 34 | 
            +
                # Number of factors. Set by default to the number of factors
         | 
| 35 | 
            +
                # with eigenvalues > 1
         | 
| 36 | 
            +
                attr_accessor :m
         | 
| 15 37 | 
             
                include GetText
         | 
| 16 38 | 
             
                bindtextdomain("statsample")
         | 
| 17 39 |  | 
| 18 | 
            -
                
         | 
| 19 40 | 
             
                def initialize(matrix ,opts=Hash.new)
         | 
| 20 41 | 
             
                  if matrix.respond_to? :to_gsl
         | 
| 21 42 | 
             
                    matrix=matrix.to_gsl
         | 
| @@ -42,6 +63,7 @@ module Factor | |
| 42 63 | 
             
                  }
         | 
| 43 64 | 
             
                  @ds=h.to_dataset
         | 
| 44 65 | 
             
                end
         | 
| 66 | 
            +
                
         | 
| 45 67 | 
             
                # Feature vector for m factors
         | 
| 46 68 | 
             
                def feature_vector(m=nil)
         | 
| 47 69 | 
             
                  m||=@m
         | 
| @@ -69,10 +91,10 @@ module Factor | |
| 69 91 | 
             
                    gammas.push(Math::sqrt(@eigenpairs[i][0]))
         | 
| 70 92 | 
             
                  }
         | 
| 71 93 | 
             
                  gamma_m=GSL::Matrix.diagonal(gammas)
         | 
| 72 | 
            -
                  omega_m*(gamma_m)
         | 
| 94 | 
            +
                  (omega_m*(gamma_m)).to_matrix
         | 
| 73 95 | 
             
                end
         | 
| 74 | 
            -
                #  | 
| 75 | 
            -
                def  | 
| 96 | 
            +
                # Communalities for all variables given m factors
         | 
| 97 | 
            +
                def communalities(m=nil)
         | 
| 76 98 | 
             
                  m||=@m
         | 
| 77 99 | 
             
                  h=[]
         | 
| 78 100 | 
             
                  @n_variables.times do |i|
         | 
| @@ -84,9 +106,11 @@ module Factor | |
| 84 106 | 
             
                  end
         | 
| 85 107 | 
             
                  h
         | 
| 86 108 | 
             
                end
         | 
| 109 | 
            +
                # Array with eigenvalues
         | 
| 87 110 | 
             
                def eigenvalues
         | 
| 88 111 | 
             
                  @eigenpairs.collect {|c| c[0] }
         | 
| 89 112 | 
             
                end
         | 
| 113 | 
            +
                
         | 
| 90 114 | 
             
                def calculate_eigenpairs
         | 
| 91 115 | 
             
                  eigval, eigvec= GSL::Eigen.symmv(@matrix)
         | 
| 92 116 | 
             
                  @eigenpairs={}
         | 
| @@ -95,13 +119,18 @@ module Factor | |
| 95 119 | 
             
                  }
         | 
| 96 120 | 
             
                  @eigenpairs=@eigenpairs.sort.reverse
         | 
| 97 121 | 
             
                end
         | 
| 122 | 
            +
                def summary
         | 
| 123 | 
            +
                  rp=ReportBuilder.new()
         | 
| 124 | 
            +
                  rp.add(self)
         | 
| 125 | 
            +
                  rp.to_text
         | 
| 126 | 
            +
                end
         | 
| 98 127 | 
             
                def to_reportbuilder(generator) # :nodoc:
         | 
| 99 128 | 
             
                  anchor=generator.add_toc_entry(_("PCA: ")+name)
         | 
| 100 129 | 
             
                  generator.add_html "<div class='pca'>"+_("PCA")+" #{@name}<a name='#{anchor}'></a>"
         | 
| 101 130 |  | 
| 102 131 | 
             
                  generator.add_text "Number of factors: #{m}"
         | 
| 103 132 | 
             
                  t=ReportBuilder::Table.new(:name=>_("Communalities"), :header=>["Variable","Initial","Extraction"])
         | 
| 104 | 
            -
                   | 
| 133 | 
            +
                  communalities(m).each_with_index {|com,i|
         | 
| 105 134 | 
             
                    t.add_row([i, 1.0, sprintf("%0.3f", com)])
         | 
| 106 135 | 
             
                  }
         | 
| 107 136 | 
             
                  generator.parse_element(t)
         | 
| @@ -122,6 +151,7 @@ module Factor | |
| 122 151 | 
             
                  generator.parse_element(t)
         | 
| 123 152 | 
             
                  generator.add_html("</div>")
         | 
| 124 153 | 
             
                end
         | 
| 154 | 
            +
                private :calculate_eigenpairs, :create_centered_ds
         | 
| 125 155 | 
             
              end
         | 
| 126 156 | 
             
            end
         | 
| 127 157 | 
             
            end
         |